diff --git a/README.md b/README.md index 853ddc3..f469c40 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![PyPI - Version](https://img.shields.io/pypi/v/pygnss-tec)](https://pypi.org/project/pygnss-tec/) ![Supported Python Versions](https://img.shields.io/badge/python-%3E%3D3.10-blue) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/5ec7c48b66f04ebb8a3ce1ea7c03ed64)](https://app.codacy.com/gh/Eureka-0/pygnss-tec/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) ![License](https://img.shields.io/badge/license-MIT-blue) [![Test](https://github.com/Eureka-0/pygnss-tec/actions/workflows/test.yml/badge.svg)](https://github.com/Eureka-0/pygnss-tec/actions/workflows/test.yml) @@ -11,12 +12,11 @@ PyGNSS-TEC is a high-performance Python package leveraging Rust acceleration, de ## Features -- **RINEX File Reading**: Efficient reading and parsing of RINEX GNSS observation files using [rinex crate](https://crates.io/crates/rinex) (see [benchmarks](#benchmarks) for details). +- **RINEX File Reading**: Efficient reading and parsing of RINEX GNSS observation files using [rinex crate](https://crates.io/crates/rinex) (see [benchmarks](#benchmarks-on-m2-pro-12-core-cpu) for details). - **Multiple File Formats**: Support for RINEX versions 2.x and 3.x., as well as Hatanaka compressed files (e.g., .Z, .crx, .crx.gz). -- **TEC Calculation**: Efficiently compute TEC from dual-frequency GNSS observations using [polars](https://pola.rs/) DataFrames and lazy evaluation (see [benchmarks](#benchmarks) for details). - +- **TEC Calculation**: Efficiently compute TEC from dual-frequency GNSS observations using [polars](https://pola.rs/) DataFrames and lazy evaluation (see [benchmarks](#benchmarks-on-m2-pro-12-core-cpu) for details). - **Multi-GNSS Support**: Process observations from multiple GNSS constellations (see [Overview](#overview) for constellation support). - **Open-Source**: Fully open-source under the MIT License, encouraging community contributions and collaboration. @@ -188,7 +188,7 @@ tec_lf = gt.calc_tec_from_df(lf, header, "./data/bias/CAS0OPSRAP_20240100000_01D #### From parquet file -Reading RINEX files is time-consuming, accounting for at least 80% of the total processing time. Thus, if you need to perform TEC calculation multiple times on the same RINEX files (e.g., when tuning configuration), it is recommended to save the parsed LazyFrame to a parquet file after the first read, and then use `calc_tec_from_parquet` for subsequent TEC calculations: +Reading RINEX files is time-consuming, accounting for at least 90% of the total calculation time. Thus, if you need to perform TEC calculation multiple times on the same RINEX files (e.g., when tuning configuration), it is recommended to save the parsed LazyFrame to a parquet file after the first read, and then use `calc_tec_from_parquet` for subsequent TEC calculations: ```python header, lf = gt.read_rinex_obs( @@ -222,12 +222,22 @@ print(gt.TECConfig()) # min_elevation=30.0, # min_snr=30.0, # c1_codes={ -# 'C': ['C2I', 'C2D', 'C2X', 'C1I', 'C1D', 'C1X', 'C2W', 'C1C'], -# 'G': ['C1W', 'C1C', 'C1X'] +# '2': { +# 'G': ['C1'] +# }, +# '3': { +# 'C': ['C2I', 'C2D', 'C2X', 'C1I', 'C1D', 'C1X', 'C2W', 'C1C'], +# 'G': ['C1W', 'C1C', 'C1X'] +# }, # }, # c2_codes={ -# 'C': ['C6I', 'C6D', 'C6X', 'C7I', 'C7D', 'C7X', 'C5I', 'C5D', 'C5X'], -# 'G': ['C2W', 'C2C', 'C2X', 'C5W', 'C5C', 'C5X'] +# '2': { +# 'G': ['C2', 'C5'] +# }, +# '3': { +# 'C': ['C6I', 'C6D', 'C6X', 'C7I', 'C7D', 'C7X', 'C5I', 'C5D', 'C5X'], +# 'G': ['C2W', 'C2C', 'C2X', 'C5W', 'C5C', 'C5X'] +# }, # }, # rx_bias='external', # mapping_function='slm', @@ -240,10 +250,21 @@ The meaning of each parameter is as follows: - `ipp_height`: The assumed height of the ionospheric pierce point (IPP) in kilometers. - `min_elevation`: The minimum satellite elevation angle (in degrees) for observations to be considered in the TEC calculation. - `min_snr`: The minimum signal-to-noise ratio (in dB-Hz) for observations to be considered in the TEC calculation. -- `c1_codes`: A dictionary specifying the preferred observation codes for the first frequency (C1) for each constellation. The codes are prioritized in the order they are listed, with the first available code being used. This parameter supports setting for partial constellations (e.g., `c1_codes={'C': [...]} ` to only set for Beidou, and use default for others). -- `c2_codes`: A dictionary specifying the preferred observation codes for the second frequency (C2) for each constellation, similar to `c1_codes`. +- `c1_codes`: A dictionary specifying the preferred observation codes for the first frequency (C1) for each RINEX version and constellation. The codes are prioritized in the order they are listed, with the first available code being used. This parameter supports partial setting (e.g., `c1_codes={'3': {'C': [...]} }` to only set for Beidou in RINEX version 3, and use default for others). +- `c2_codes`: A dictionary specifying the preferred observation codes for the second frequency (C2) for each RINEX version and constellation, similar to `c1_codes`. - `rx_bias`: Specifies how to handle receiver bias. It can be set to 'external' to use an external DCB file for correction, 'mstd' to use the minimum standard deviation method for estimation, 'lsq' to use least squares estimation, or `None` to skip receiver bias correction. Note that the receiver bias estimation is only applicable after the satellite bias has been corrected using an external DCB file (e.g., from IGS). If no external DCB file is provided, this parameter will be ignored. The 'mstd' and 'lsq' methods are for stations that are not included in the external DCB file. - `mapping_function`: The mapping function to use for converting slant TEC to vertical TEC. It can be set to 'slm' for the Single Layer Model or 'mslm' for the Modified Single Layer Model. - `retain_intermediate`: Names of intermediate columns to retain in the output DataFrame. It can be set to `None` to discard all intermediate columns, 'all' to retain all intermediate columns, or a list of column names to keep specific ones. -## Benchmarks +## Benchmarks (on M2 Pro 12-Core CPU) + +| Task | Time (s) | +|:----------------------------------------------------------|-----------:| +| Read RINEX v2 (3.65 MB) | 0.1362 | +| Read RINEX v3 (14.02 MB) | 0.7397 | +| Read RINEX v3 (6.05 MB Hatanaka-compressed) | 1.2468 | +| Read RINEX v3 (2.34 MB Hatanaka-compressed) | 0.5653 | +| Calculate TEC from RINEX v2 (3.65 MB) | 0.1457 | +| Calculate TEC from RINEX v3 (14.02 MB) | 0.7532 | +| Calculate TEC from RINEX v3 (6.05 MB Hatanaka-compressed) | 1.3067 | +| Calculate TEC from RINEX v3 (2.34 MB Hatanaka-compressed) | 0.5908 | diff --git a/benchmarks/bench.py b/benchmarks/bench.py new file mode 100644 index 0000000..c400dd9 --- /dev/null +++ b/benchmarks/bench.py @@ -0,0 +1,116 @@ +import os +from pathlib import Path +from time import perf_counter + +import pandas as pd + +import gnss_tec as gt + +# Constants +OBS_V2 = "data/rinex_obs_v2/dgar0100.24o.gz" +NAV_V2 = "data/rinex_nav_v2/brdc0100.24n.gz" +OBS_V3 = "data/rinex_obs_v3/CIBG00IDN_R_20240100000_01D_30S_MO.rnx.gz" +OBS_V3_CRX = "data/rinex_obs_v3/CIBG00IDN_R_20240100000_01D_30S_MO.crx.gz" +OBS_V3_CRX_SMALL = "data/rinex_obs_v3/BELE00BRA_R_20240100000_01D_30S_MO.crx.gz" +NAV_V3 = "data/rinex_nav_v3/BRDC00IGS_R_20240100000_01D_MN.rnx.gz" +BIAS = "data/bias/CAS0OPSRAP_20240100000_01D_01D_DCB.BIA.gz" +OUTPUT_FILE = "benchmarks/benchmark.md" + + +def benchmark_read_rinex(obs_path: str, nav_path: str) -> tuple[str, float]: + """Benchmark reading RINEX files.""" + start_time = perf_counter() + header, df = gt.read_rinex_obs(obs_path, nav_path) + df.collect() + end_time = perf_counter() + + if header.version.startswith("2"): + version = "2" + elif header.version.startswith("3"): + version = "3" + else: + raise ValueError(f"Unknown RINEX version: {header.version}") + + file_size = Path(obs_path).stat().st_size / (1024 * 1024) # in MB + + if "crx" in obs_path.lower(): + hatanaka = " Hatanaka-compressed" + else: + hatanaka = "" + + return ( + f"Read RINEX v{version} ({file_size:.2f} MB{hatanaka})", + end_time - start_time, + ) + + +def benchmark_calc_tec( + obs_path: str, nav_path: str, bias_path: str +) -> tuple[str, float]: + """Benchmark TEC calculation from RINEX files.""" + start_time = perf_counter() + header, lf = gt.read_rinex_obs(obs_path, nav_path) + df = gt.calc_tec_from_df(lf, header, bias_path) + df.collect() + end_time = perf_counter() + + if header.version.startswith("2"): + version = "2" + elif header.version.startswith("3"): + version = "3" + else: + raise ValueError(f"Unknown RINEX version: {header.version}") + + file_size = Path(obs_path).stat().st_size / (1024 * 1024) # in MB + + if "crx" in obs_path.lower(): + hatanaka = " Hatanaka-compressed" + else: + hatanaka = "" + + return ( + f"Calculate TEC from RINEX v{version} ({file_size:.2f} MB{hatanaka})", + end_time - start_time, + ) + + +def main(): + """Main function to run benchmarks and write results.""" + benchmarks = [ + benchmark_read_rinex(OBS_V2, NAV_V2), + benchmark_read_rinex(OBS_V3, NAV_V3), + benchmark_read_rinex(OBS_V3_CRX, NAV_V3), + benchmark_read_rinex(OBS_V3_CRX_SMALL, NAV_V3), + benchmark_calc_tec(OBS_V2, NAV_V2, BIAS), + benchmark_calc_tec(OBS_V3, NAV_V3, BIAS), + benchmark_calc_tec(OBS_V3_CRX, NAV_V3, BIAS), + benchmark_calc_tec(OBS_V3_CRX_SMALL, NAV_V3, BIAS), + ] + + # Create a DataFrame from the benchmark results + results_df = pd.DataFrame( + { + "Task": [b[0] for b in benchmarks], + "Time (s)": [f"{b[1]:.4f}" for b in benchmarks], + } + ) + + # Convert DataFrame to Markdown table + markdown_table = results_df.to_markdown(index=False) + + # Get system information + cpu_count = os.cpu_count() + system_info = f"M2 Pro {cpu_count}-Core CPU" + + # Write the Markdown table to the output file + with Path(OUTPUT_FILE).open("w") as f: + f.write(f"# Benchmark Results (on {system_info})\n\n") + f.write(markdown_table) + f.write("\n") + + print(f"Benchmark results written to {OUTPUT_FILE}") + print(markdown_table) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark.md b/benchmarks/benchmark.md new file mode 100644 index 0000000..bfabe96 --- /dev/null +++ b/benchmarks/benchmark.md @@ -0,0 +1,12 @@ +# Benchmark Results (on M2 Pro 12-Core CPU) + +| Task | Time (s) | +|:----------------------------------------------------------|-----------:| +| Read RINEX v2 (3.65 MB) | 0.1362 | +| Read RINEX v3 (14.02 MB) | 0.7397 | +| Read RINEX v3 (6.05 MB Hatanaka-compressed) | 1.2468 | +| Read RINEX v3 (2.34 MB Hatanaka-compressed) | 0.5653 | +| Calculate TEC from RINEX v2 (3.65 MB) | 0.1457 | +| Calculate TEC from RINEX v3 (14.02 MB) | 0.7532 | +| Calculate TEC from RINEX v3 (6.05 MB Hatanaka-compressed) | 1.3067 | +| Calculate TEC from RINEX v3 (2.34 MB Hatanaka-compressed) | 0.5908 | diff --git a/data/bias/CAS0OPSRAP_20240110000_01D_01D_DCB.BIA.gz b/data/bias/CAS0OPSRAP_20240110000_01D_01D_DCB.BIA.gz deleted file mode 100644 index 92e8aa9..0000000 Binary files a/data/bias/CAS0OPSRAP_20240110000_01D_01D_DCB.BIA.gz and /dev/null differ diff --git a/data/bias/GFZ0OPSRAP_20240110000_01D_01D_DCB.BIA.gz b/data/bias/GFZ0OPSRAP_20240110000_01D_01D_DCB.BIA.gz deleted file mode 100644 index 3a45011..0000000 Binary files a/data/bias/GFZ0OPSRAP_20240110000_01D_01D_DCB.BIA.gz and /dev/null differ diff --git a/gnss_tec/tec/bias.py b/gnss_tec/tec/bias.py index fd32982..0740a4f 100644 --- a/gnss_tec/tec/bias.py +++ b/gnss_tec/tec/bias.py @@ -136,12 +136,13 @@ def estimate_rx_bias( Returns: pl.LazyFrame: LazyFrame with receiver bias estimates added, in TECU. """ - if method == "mstd": - estimate_func = _mstd_rx_bias - elif method == "lsq": - estimate_func = _lsq_rx_bias - else: - raise ValueError(f"Unknown bias correction method: {method}") + match method: + case "mstd": + estimate_func = _mstd_rx_bias + case "lsq": + estimate_func = _lsq_rx_bias + case _: + raise ValueError(f"Unknown bias correction method: {method}") lf = df.lazy() if downsample: @@ -158,7 +159,7 @@ def estimate_rx_bias( bias_lf = bias_lf.group_by( "date", "station", "constellation", "C1_code", "C2_code" ).agg( - pl.struct("time", "stec_dcb_corrected", "mf", "ipp_lat", "ipp_lon", "rx_lat") + pl.struct("time", "stec", "tx_bias", "mf", "ipp_lat", "ipp_lon", "rx_lat") .map_batches(estimate_func, return_dtype=pl.Float64, returns_scalar=True) .alias("rx_bias") ) @@ -168,7 +169,6 @@ def estimate_rx_bias( pl.col("time").dt.date().alias("date"), pl.col("prn").cat.slice(0, 1).alias("constellation"), ) - .drop("rx_bias") .join( bias_lf, on=["date", "station", "constellation", "C1_code", "C2_code"], @@ -197,7 +197,7 @@ def _mstd_rx_bias(s: pl.Series) -> float: def mean_std(bias: float) -> float: corrected = df.with_columns( - (pl.col("stec_dcb_corrected").sub(bias) / pl.col("mf")).alias("vtec") + (pl.col("stec").sub(pl.col("tx_bias") + bias) / pl.col("mf")).alias("vtec") ).with_columns(pl.col("vtec").std().over("time").mean().alias("mean_std")) return corrected.get_column("mean_std").item(0) @@ -246,6 +246,11 @@ def _lsq_rx_bias(s: pl.Series) -> float: .fill_null(np.nan) .to_numpy() ) - b = df.get_column("stec_dcb_corrected").fill_null(np.nan).to_numpy() + b = ( + df.select(pl.col("stec") - pl.col("tx_bias")) + .get_column("stec") + .fill_null(np.nan) + .to_numpy() + ) result = lsq_linear(A, b) return result.x[0] diff --git a/gnss_tec/tec/constants.py b/gnss_tec/tec/constants.py index d8d2fb9..16c6744 100644 --- a/gnss_tec/tec/constants.py +++ b/gnss_tec/tec/constants.py @@ -12,7 +12,7 @@ Re = 6378.137e3 """Earth radius in meters.""" -SUPPORTED_RINEX_VERSIONS = ["3"] +SUPPORTED_RINEX_VERSIONS = ["2", "3"] """Supported RINEX major versions.""" SUPPORTED_CONSTELLATIONS = {"C": "BDS", "G": "GPS"} @@ -137,6 +137,35 @@ def c2_priority(self, version: Literal["2", "3"]) -> Mapping[str, int]: priority[f"{const}_{code}"] = i return priority + @staticmethod + def validate_codes(codes, default) -> dict[str, dict[str, list[str]]]: + if not codes: + return default + + codes = dict(codes) + unknown = codes.keys() - SUPPORTED_RINEX_VERSIONS + if unknown: + raise ValueError( + f"Invalid RINEX versions in codes: {unknown}. " + f"Allowed versions are {SUPPORTED_RINEX_VERSIONS}." + ) + + validated_codes = {} + for ver in SUPPORTED_RINEX_VERSIONS: + if ver not in codes: + validated_codes[ver] = default[ver] + continue + + allowed = set(SUPPORTED_CONSTELLATIONS.keys()) + invalid = codes[ver].keys() - allowed + if invalid: + raise ValueError( + f"Invalid constellations in codes: {invalid}. " + f"Allowed constellations are {allowed}." + ) + validated_codes[ver] = default[ver] | dict(codes[ver]) + return validated_codes + def __post_init__(self): # Validate constellations allowed = set(SUPPORTED_CONSTELLATIONS.keys()) @@ -150,29 +179,12 @@ def __post_init__(self): ) # Set default codes if not provided - if not self.c1_codes: - object.__setattr__(self, "c1_codes", DEFAULT_C1_CODES) - else: - user_codes = dict(self.c1_codes) - unknown = user_codes.keys() - allowed - if unknown: - raise ValueError( - f"Invalid constellations in c1_codes: {unknown}. " - f"Allowed constellations are {allowed}." - ) - object.__setattr__(self, "c1_codes", DEFAULT_C1_CODES | user_codes) - - if not self.c2_codes: - object.__setattr__(self, "c2_codes", DEFAULT_C2_CODES) - else: - user_codes = dict(self.c2_codes) - unknown = user_codes.keys() - allowed - if unknown: - raise ValueError( - f"Invalid constellations in c2_codes: {unknown}. " - f"Allowed constellations are {allowed}." - ) - object.__setattr__(self, "c2_codes", DEFAULT_C2_CODES | user_codes) + object.__setattr__( + self, "c1_codes", self.validate_codes(self.c1_codes, DEFAULT_C1_CODES) + ) + object.__setattr__( + self, "c2_codes", self.validate_codes(self.c2_codes, DEFAULT_C2_CODES) + ) @dataclass(frozen=True) diff --git a/gnss_tec/tec/tec_calculation.py b/gnss_tec/tec/tec_calculation.py index 05a3774..3a38131 100644 --- a/gnss_tec/tec/tec_calculation.py +++ b/gnss_tec/tec/tec_calculation.py @@ -78,8 +78,6 @@ def _coalesce_observations( rx_bias_lf, on=["station", "constellation", "C1_code", "C2_code", "date"], ) - else: - codes_lf = codes_lf.with_columns(pl.lit(None).alias("rx_bias")) # ---- 3. Keep only the highest priority codes for C1 and C2. ---- codes_lf = ( @@ -416,24 +414,25 @@ def calc_tec_from_df( lf = lf.with_columns( # Convert biases from ns to TECU pl.col("tx_bias").mul(tecu_per_ns) - ).with_columns( - # sTEC corrected for satellite DCB biases, in TECU - pl.col("stec").sub(pl.col("tx_bias")).alias("stec_dcb_corrected") ) - if config.rx_bias != "external" and config.rx_bias is not None: - lf = estimate_rx_bias( - lf, method=config.rx_bias, downsample=sampling_interval < 10 - ) - else: + if config.rx_bias is None: + lf = lf.with_columns(pl.lit(None).alias("rx_bias")) + elif config.rx_bias == "external": lf = lf.with_columns( # Convert biases from ns to TECU pl.col("rx_bias").mul(tecu_per_ns) ) + else: + lf = estimate_rx_bias( + lf, method=config.rx_bias, downsample=sampling_interval < 10 + ) lf = lf.with_columns( # sTEC corrected for DCB biases, in TECU - pl.col("stec_dcb_corrected").sub(pl.col("rx_bias").fill_null(0)) + pl.col("stec") + .sub(pl.col("tx_bias") + pl.col("rx_bias").fill_null(0)) + .alias("stec_dcb_corrected") ).with_columns( # vTEC, in TECU (pl.col("stec_dcb_corrected") / pl.col("mf")).alias("vtec"), diff --git a/pyproject.toml b/pyproject.toml index 406de47..7cf5f52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,4 +47,5 @@ dev = [ "pytest>=8.4.2", "ruff>=0.14.6", "scipy-stubs>=1.15.3.0", + "tabulate>=0.9.0", ] diff --git a/uv.lock b/uv.lock index 4d51ce3..42d8f08 100644 --- a/uv.lock +++ b/uv.lock @@ -1494,6 +1494,7 @@ dev = [ { name = "ruff" }, { name = "scipy-stubs", version = "1.15.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy-stubs", version = "1.16.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "tabulate" }, ] [package.metadata] @@ -1514,6 +1515,7 @@ dev = [ { name = "pytest", specifier = ">=8.4.2" }, { name = "ruff", specifier = ">=0.14.6" }, { name = "scipy-stubs", specifier = ">=1.15.3.0" }, + { name = "tabulate", specifier = ">=0.9.0" }, ] [[package]] @@ -1996,6 +1998,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tomli" version = "2.3.0"