diff --git a/.gitignore b/.gitignore index 5471529b..f49e3a69 100755 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ ENV/ # IDE settings .vscode/ .idea/ + +# UV +uv.lock diff --git a/cdm_reader_mapper/mdf_reader/properties.py b/cdm_reader_mapper/mdf_reader/properties.py index 5289d9ae..f6f491be 100755 --- a/cdm_reader_mapper/mdf_reader/properties.py +++ b/cdm_reader_mapper/mdf_reader/properties.py @@ -2,6 +2,8 @@ from __future__ import annotations +import polars as pl + from ..properties import numeric_types, object_types, supported_data_models # noqa _base = "cdm_reader_mapper.mdf_reader" @@ -16,22 +18,23 @@ "craid": ("drifter_measurements", "JULD"), } -pandas_dtypes = {} +polars_dtypes = {} for dtype in object_types: - pandas_dtypes[dtype] = "object" -pandas_dtypes.update({x: x for x in numeric_types}) -pandas_dtypes["datetime"] = "datetime" + polars_dtypes[dtype] = pl.String +polars_dtypes.update({x: x for x in numeric_types}) +polars_dtypes[pl.Datetime] = pl.Datetime -pandas_int = "Int64" +polars_int = pl.Int64 # ....and how they are managed data_type_conversion_args = {} for dtype in numeric_types: data_type_conversion_args[dtype] = ["scale", "offset"] -data_type_conversion_args["str"] = ["disable_white_strip"] -data_type_conversion_args["object"] = ["disable_white_strip"] -data_type_conversion_args["key"] = ["disable_white_strip"] -data_type_conversion_args["datetime"] = ["datetime_format"] +data_type_conversion_args[pl.Utf8] = ["disable_white_strip"] +data_type_conversion_args[pl.String] = ["disable_white_strip"] +data_type_conversion_args[pl.Categorical] = ["disable_white_strip"] +data_type_conversion_args[pl.Object] = ["disable_white_strip"] +data_type_conversion_args[pl.Datetime] = ["datetime_format"] # Misc ------------------------------------------------------------------------ dummy_level = "_SECTION_" diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py index 9a0ef14e..0a514b74 100755 --- a/cdm_reader_mapper/mdf_reader/reader.py +++ b/cdm_reader_mapper/mdf_reader/reader.py @@ -6,9 +6,11 @@ import csv import logging import os -from io import StringIO as StringIO +from io import StringIO import pandas as pd +import polars as pl +import xarray as xr from cdm_reader_mapper.common.json_dict import open_json_file from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy @@ -45,24 +47,26 @@ def _convert_and_decode( converter_kwargs, decoder_dict, ): + """DOCUMENTATION.""" for section in converter_dict.keys(): if section not in df.columns: continue if section in decoder_dict.keys(): decoded = decoder_dict[section](df[section]) - decoded.index = df[section].index - df[section] = decoded + df = df.with_columns(decoded.alias(section)) converted = converter_dict[section]( df[section], **converter_kwargs[section] ) - converted.index = df[section].index - df[section] = converted + # converted.index = df[section].index + df = df.with_columns(converted.alias(section)) return df - def _validate(self, df): + def _validate(self, df, mask) -> pl.DataFrame: + """DOCUMENTATION.""" return validate( data=df, + mask=mask, imodel=self.imodel, ext_table_path=self.ext_table_path, schema=self.schema, @@ -115,78 +119,31 @@ def convert_and_decode_entries( if decode is not True: decoder_dict = {} - if isinstance(data, pd.DataFrame): - data = self._convert_and_decode( - data, - converter_dict, - converter_kwargs, - decoder_dict, - ) - else: - data_buffer = StringIO() - TextParser = make_copy(data) - for i, df_ in enumerate(TextParser): - df = self._convert_and_decode( - df_, - converter_dict, - converter_kwargs, - decoder_dict, - ) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) + data = self._convert_and_decode( + data, + converter_dict, + converter_kwargs, + decoder_dict, + ) return data - def validate_entries(self, data, validate): + def validate_entries(self, data, mask, validate) -> pl.DataFrame: """Validate data entries by using a pre-defined data model. Fill attribute `valid` with boolean mask. """ if validate is not True: - mask = pd.DataFrame() - elif isinstance(data, pd.DataFrame): - mask = self._validate(data) + mask = pl.DataFrame() + elif isinstance(data, pl.DataFrame): + mask = self._validate(data, mask) else: - data_buffer = StringIO() - TextParser_ = make_copy(data) - for i, df_ in enumerate(TextParser_): - mask_ = self._validate(df_) - mask_.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - ) - data_buffer.seek(0) - mask = pd.read_csv( - data_buffer, - names=df_.columns, - chunksize=self.chunksize, - ) + raise TypeError("Unknown data type") return mask def remove_boolean_values(self, data): """DOCUMENTATION""" + if isinstance(data, pl.DataFrame): + return data if isinstance(data, pd.DataFrame): data = data.map(remove_boolean_values) dtype = adjust_dtype(self.dtypes, data) @@ -273,6 +230,7 @@ def read( self.chunksize = chunksize self.skiprows = skiprows + self.format = properties.open_file.get(self.imodel, "text") # 2. READ AND VALIDATE DATA logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}") @@ -287,26 +245,71 @@ def read( # a list with a single dataframe or a pd.io.parsers.TextFileReader logging.info("Getting data string from source...") self.configurations = self.get_configurations(read_sections_list, sections) - data = self.open_data( - read_sections_list, - sections, - # INFO: Set default as "pandas" to account for custom schema - open_with=properties.open_file.get(self.imodel, "pandas"), + + TextParser = self.open_data( chunksize=chunksize, + format=self.format, ) # 2.3. Extract, read and validate data in same loop - logging.info("Extracting and reading sections") - data = self.convert_and_decode_entries( - data, - convert=convert, - decode=decode, - ) - mask = self.validate_entries(data, validate) + if isinstance(TextParser, (pd.DataFrame, xr.Dataset)): + data, mask = self._read_loop( + TextParser, read_sections_list, sections, decode, convert, validate + ) + else: + data_buffer = StringIO() + mask_buffer = StringIO() + for df_ in TextParser: + df, mask_ = self._read_loop( + df_, read_sections_list, sections, decode, convert, validate + ) + df.to_csv( + data_buffer, + header=False, + mode="a", + encoding="utf-8", + index=False, + quoting=csv.QUOTE_NONE, + sep=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + ) + mask_.to_csv( + mask_buffer, + header=False, + mode="a", + encoding="utf-8", + index=False, + quoting=csv.QUOTE_NONE, + sep=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + ) + data_buffer.seek(0) + data = pd.read_csv( + data_buffer, + names=self.columns, + chunksize=self.chunksize, + dtype=object, + parse_dates=self.parse_dates, + delimiter=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + ) + mask_buffer.seek(0) + mask = pd.read_csv( + mask_buffer, + names=self.columns, + chunksize=self.chunksize, + dtype=object, + parse_dates=self.parse_dates, + delimiter=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + ) # 3. Create output DataBundle object - logging.info("Creata output DataBundle object") - data = self.remove_boolean_values(data) + logging.info("Create output DataBundle object") return DataBundle( data=data, columns=self.columns, @@ -317,6 +320,29 @@ def read( imodel=self.imodel, ) + def _read_loop( + self, TextParser, order, valid, decode, convert, validate + ) -> tuple[pd.DataFrame, pd.DataFrame]: + logging.info("Extracting and reading sections") + data, mask = self._read_sections(TextParser, order, valid, format=self.format) + + logging.info("Decoding and converting entries") + data = self.convert_and_decode_entries( + data, + convert=convert, + decode=decode, + ) + + logging.info("Extracting and reading sections") + mask = self.validate_entries(data, mask, validate) + + renames = {c: tuple(c.split(":")) for c in data.columns} + + return ( + data.to_pandas().rename(columns=renames), + mask.to_pandas().rename(columns=renames), + ) + def read_mdf( source, diff --git a/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/cdm_reader_mapper/mdf_reader/schemas/schemas.py index 1a0d115c..2afe668a 100755 --- a/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -13,30 +13,64 @@ import os from pathlib import Path +import polars as pl + from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts from .. import properties +def _get_type_map(): + return { + "float": pl.Float64, + "float64": pl.Float64, + "float32": pl.Float32, + "int": pl.Int64, + "int64": pl.Int64, + "int32": pl.Int32, + "int16": pl.Int16, + "int8": pl.Int8, + "uint": pl.UInt64, + "uint64": pl.UInt64, + "uint32": pl.UInt32, + "uint16": pl.UInt16, + "uint8": pl.UInt8, + "str": pl.String, + "object": pl.String, + "key": pl.String, + "date": pl.Datetime, + "datetime": pl.Datetime, + "time": pl.Datetime, + } + + def convert_dtype_to_default(dtype, section, element): """Convert data type to defaults (int, float).""" if dtype is None: - return - elif dtype == "float": - return dtype - elif dtype == "int": - return properties.pandas_int - elif "float" in dtype.lower(): + return # pl.String + dtype_map = _get_type_map() + dtype = dtype.lower() + + polars_dtype = dtype_map.get(dtype) + if polars_dtype is not None: + return polars_dtype + + if "float" in dtype: + logging.warning( + f"Set column type of ({section}, {element}) from deprecated {dtype} to float 32." + ) + return pl.Float32 + elif "uint" in dtype: logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to float." + f"Set column type of ({section}, {element}) from deprecated {dtype} to uint 32." ) - return "float" - elif "int" in dtype.lower(): + return pl.UInt32 + elif "int" in dtype: logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to int." + f"Set column type of ({section}, {element}) from deprecated {dtype} to int 32." ) - return properties.pandas_int - return dtype + return pl.Int32 + return pl.String def _read_schema(schema): @@ -154,7 +188,7 @@ def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None): else: imodel = imodel.split("_") if imodel[0] not in properties.supported_data_models: - logging.error("Input data model " f"{imodel[0]}" " not supported") + logging.error(f"Input data model {imodel[0]} not supported") return schema_files = collect_json_files(*imodel, base=f"{properties._base}.schemas") @@ -209,7 +243,7 @@ def clean_schema(columns, schema): def get_index(idx, lst, section): if len(lst) == 1: return idx - return (section, idx) + return ":".join([section, idx]) flat_schema = dict() for section in schema.get("sections"): diff --git a/cdm_reader_mapper/mdf_reader/utils/configurator.py b/cdm_reader_mapper/mdf_reader/utils/configurator.py index 652c02bd..effe3fb3 100755 --- a/cdm_reader_mapper/mdf_reader/utils/configurator.py +++ b/cdm_reader_mapper/mdf_reader/utils/configurator.py @@ -3,11 +3,11 @@ from __future__ import annotations import ast -import csv -import logging -import numpy as np import pandas as pd +import polars as pl +import polars.selectors as cs +import xarray as xr from .. import properties from . import converters, decoders @@ -19,7 +19,7 @@ class Configurator: def __init__( self, - df=pd.DataFrame(), + df: pd.DataFrame | pl.DataFrame | xr.Dataset = pl.DataFrame(), schema=None, order=None, valid=None, @@ -38,7 +38,7 @@ def _get_index(self, section, order): if len(self.orders) == 1: return section else: - return (order, section) + return ":".join([order, section]) def _get_ignore(self, section_dict): ignore = section_dict.get("ignore") @@ -47,7 +47,7 @@ def _get_ignore(self, section_dict): return ignore def _get_dtype(self): - return properties.pandas_dtypes.get(self.sections_dict.get("column_type")) + return properties.polars_dtypes.get(self.sections_dict.get("column_type")) def _get_converter(self): return converters.get(self.sections_dict.get("column_type")) @@ -137,92 +137,148 @@ def get_configuration(self): }, } - def open_pandas(self): - """Open TextParser to pd.DataSeries.""" - return self.df.apply(lambda x: self._read_line(x[0]), axis=1) - - def _read_line(self, line: str): - i = j = 0 - data_dict = {} - for order in self.orders: - header = self.schema["sections"][order]["header"] - - disable_read = header.get("disable_read") - if disable_read is True: - data_dict[order] = line[i : properties.MAX_FULL_REPORT_WIDTH] - continue + def open_text(self): + """Open TextParser to a polars.DataFrame""" + if isinstance(self.df, pd.DataFrame): + self.df = pl.from_pandas(self.df) + if not isinstance(self.df, pl.DataFrame): + raise TypeError(f"Cannot open with polars for {type(self.df) = }") + self.df = self.df + mask_df = pl.DataFrame() + for section in self.orders: + header = self.schema["sections"][section]["header"] sentinal = header.get("sentinal") - bad_sentinal = sentinal is not None and not self._validate_sentinal( - i, line, sentinal - ) section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH) - sections = self.schema["sections"][order]["elements"] + # Get data associated with current section + if sentinal is not None: + self.df = self.df.with_columns( + [ + ( + pl.when(pl.col("full_str").str.starts_with(sentinal)) + .then(pl.col("full_str").str.head(section_length)) + .otherwise(pl.lit(None)) + .alias(section) + ), + ( + pl.when(pl.col("full_str").str.starts_with(sentinal)) + .then(pl.col("full_str").str.tail(-section_length)) + .otherwise(pl.col("full_str")) + .alias("full_str") + ), + ] + ) + else: + # Sentinal is None, the section is always present + self.df = self.df.with_columns( + [ + pl.col("full_str").str.head(section_length).alias(section), + pl.col("full_str").str.tail(-section_length).alias("full_str"), + ] + ) + + # Used for validation + section_missing = self.df.get_column(section).is_null() + + # Don't read fields + disable_read = header.get("disable_read", False) + if disable_read is True: + continue + + fields = self.schema["sections"][section]["elements"] field_layout = header.get("field_layout") delimiter = header.get("delimiter") + + # Handle delimited section if delimiter is not None: delimiter_format = header.get("format") if delimiter_format == "delimited": # Read as CSV - field_names = sections.keys() - fields = list(csv.reader([line[i:]], delimiter=delimiter))[0] - for field_name, field in zip(field_names, fields): - index = self._get_index(field_name, order) - data_dict[index] = field.strip() - i += len(field) - j = i + field_names = fields.keys() + field_names = [self._get_index(x, section) for x in field_names] + n_fields = len(field_names) + self.df = self.df.with_columns( + pl.col(section) + .str.splitn(delimiter, n_fields) + .struct.rename_fields(field_names) + .struct.unnest() + ) + for field in field_names: + self.df = self.df.with_columns( + pl.col(field).str.strip_chars(" ").name.keep() + ) + mask_df = mask_df.with_columns( + ( + section_missing + | self.df.get_column(field).is_not_null() + ).alias(field) + ) + self.df = self.df.drop([section]) + continue elif field_layout != "fixed_width": - logging.error( - f"Delimiter for {order} is set to {delimiter}. Please specify either format or field_layout in your header schema {header}." + raise ValueError( + f"Delimiter for {section} is set to {delimiter}. " + + f"Please specify either format or field_layout in your header schema {header}." ) - return - k = i + section_length - for section, section_dict in sections.items(): - missing = True - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore(section_dict) - na_value = section_dict.get("missing_value") - field_length = section_dict.get( + # Loop through fixed-width fields + for field, field_dict in fields.items(): + index = self._get_index(field, section) + ignore = (section not in self.valid) or self._get_ignore(field_dict) + field_length = field_dict.get( "field_length", properties.MAX_FULL_REPORT_WIDTH ) + na_value = field_dict.get("missing_value") - j = (i + field_length) if not bad_sentinal else i - if j > k: - missing = False - j = k - - if ignore is not True: - value = line[i:j] - - if not value.strip(): - value = True - if value == na_value: - value = True - - if i == j and missing is True: - value = False + if ignore: + # Move to next field + self.df = self.df.with_columns( + pl.col(section).str.tail(-field_length).name.keep(), + ) + if delimiter is not None: + self.df = self.df.with_columns( + pl.col(section).str.strip_prefix(delimiter).name.keep() + ) + continue - data_dict[index] = value + missing_map = {"": None} + if na_value is not None: + missing_map[na_value] = None + + self.df = self.df.with_columns( + [ + # If section not present in a row, then both these are null + ( + pl.col(section) + .str.head(field_length) + .str.strip_chars(" ") + .replace(missing_map) + .alias(index) + ), + pl.col(section).str.tail(-field_length).name.keep(), + ] + ) + mask_df = mask_df.with_columns( + (section_missing | self.df.get_column(index).is_not_null()).alias( + index + ) + ) + if delimiter is not None: + self.df = self.df.with_columns( + pl.col(section).str.strip_prefix(delimiter).name.keep() + ) - if delimiter is not None and line[j : j + len(delimiter)] == delimiter: - j += len(delimiter) - i = j + self.df = self.df.drop([section]) - return pd.Series(data_dict) + return self.df.drop("full_str"), mask_df def open_netcdf(self): - """Open netCDF to pd.Series.""" - - def replace_empty_strings(series): - if series.dtype == "object": - series = series.str.decode("utf-8") - series = series.str.strip() - series = series.map(lambda x: True if x == "" else x) - return series + """Open netCDF to polars.DataFrame.""" + if not isinstance(self.df, xr.Dataset): + raise TypeError(f"Cannot open with netCDF for {type(self.df) = }") missing_values = [] attrs = {} @@ -249,15 +305,28 @@ def replace_empty_strings(series): elif section in self.df.dims: renames[section] = index elif section in self.df.attrs: - attrs[index] = self.df.attrs[index] + # Initialise a constant column + attrs[index] = pl.lit(self.df.attrs[section].replace("\n", "; ")) else: missing_values.append(index) - df = self.df[renames.keys()].to_dataframe().reset_index() - attrs = {k: v.replace("\n", "; ") for k, v in attrs.items()} - df = df.rename(columns=renames) - df = df.assign(**attrs) - df[disables] = np.nan - df = df.apply(lambda x: replace_empty_strings(x)) - df[missing_values] = False - return df + df: pl.DataFrame = pl.from_pandas( + self.df[renames.keys()].to_dataframe().reset_index() + ) + df = df.rename(mapping=renames) + df = df.with_columns(**attrs) + df = df.with_columns( + [pl.lit(None).alias(missing) for missing in missing_values] + ) + df = df.with_columns([pl.lit(None).alias(disable) for disable in disables]) + # Replace empty or whitespace string with None + df = df.with_columns( + cs.binary().cast(pl.String).str.strip_chars().replace("", None) + ) + + # Create missing mask + mask_df = df.select(pl.all().is_not_null()) + mask_df = mask_df.with_columns( + [pl.lit(True).alias(c) for c in missing_values + disables] + ) + return df, mask_df diff --git a/cdm_reader_mapper/mdf_reader/utils/converters.py b/cdm_reader_mapper/mdf_reader/utils/converters.py index 4c14aa5e..9485bb45 100755 --- a/cdm_reader_mapper/mdf_reader/utils/converters.py +++ b/cdm_reader_mapper/mdf_reader/utils/converters.py @@ -2,10 +2,11 @@ from __future__ import annotations -import pandas as pd +import logging + +import polars as pl from .. import properties -from .utilities import convert_str_boolean class df_converters: @@ -16,24 +17,59 @@ def __init__(self, dtype): self.numeric_scale = 1.0 if self.dtype == "float" else 1 self.numeric_offset = 0.0 if self.dtype == "float" else 0 - def to_numeric(self, data, offset, scale): - """Convert object type elements of a pandas series to numeric type.""" + def _check_conversion(self, data: pl.Series, converted: pl.Series, threshold: int): + if ( + bad_converts := data.filter(converted.is_null() & data.is_not_null()) + ).len() > 0: + msg = f"Have {bad_converts.len()} values that failed to be converted to {self.dtype}" + if bad_converts.len() <= threshold: + msg += f": values = {', '.join(bad_converts)}" + logging.warning(msg) + return None + + def _drop_whitespace_vals(self, data: pl.Series): + data_name = data.name + return ( + data.to_frame() + .select( + pl.when(data.str.contains(r"^\s*$")) + .then(pl.lit(None)) + .otherwise(data) + .alias(data_name) + ) + .get_column(data_name) + ) + + # May not be needed? + def decode(self, data): + """Decode object type elements of a pandas series to UTF-8.""" - def _to_numeric(x): - x = convert_str_boolean(x) - if isinstance(x, bool): + def _decode(x): + if not isinstance(x, str): return x - if isinstance(x, str): - x = x.strip() - x.replace(" ", "0") + try: - return offset + float(x) * scale - except ValueError: - return False + encoded = x.encode("latin1") + return encoded.decode("utf-8") + except (UnicodeDecodeError, UnicodeEncodeError): + return x + + return data.apply(lambda x: _decode(x)) - return data.apply(lambda x: _to_numeric(x)) + def to_numeric(self, data: pl.Series): + """Convert object type elements of a pandas series to numeric type.""" + data = self._drop_whitespace_vals(data) + # str method fails if all nan, pd.Series.replace method is not the same + # as pd.Series.str.replace! + data = data.str.strip_chars().str.replace_all(" ", "0") + + converted = data.cast(self.dtype, strict=False) + self._check_conversion(data, converted, 20) - def object_to_numeric(self, data, scale=None, offset=None): + # Convert to numeric, then scale (?!) and give it's actual int type + return converted + + def object_to_numeric(self, data: pl.Series, scale=None, offset=None): """ Convert the object type elements of a pandas series to numeric type. @@ -63,36 +99,40 @@ def object_to_numeric(self, data, scale=None, offset=None): """ scale = scale if scale else self.numeric_scale offset = offset if offset else self.numeric_offset - if data.dtype == "object": - data = self.to_numeric(data, offset, scale) - return data + if not data.dtype.is_numeric(): + data = self.to_numeric(data) + + data = offset + data * scale + return data.cast(self.dtype) - def object_to_object(self, data, disable_white_strip=False): + def object_to_object(self, data: pl.Series, disable_white_strip=None): """DOCUMENTATION.""" if data.dtype != "object": return data - - if not disable_white_strip: - data = data.str.strip() - elif disable_white_strip == "l": - data = data.str.rstrip() - elif disable_white_strip == "r": - data = data.str.lstrip() - return data.apply( - lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x - ) - - def object_to_datetime(self, data, datetime_format="%Y%m%d"): + data = self.decode(data) + if disable_white_strip is None: + data = data.str.strip_chars(" ") + else: + if disable_white_strip == "l": + data = data.str.strip_chars_end(" ") + elif disable_white_strip == "r": + data = data.str.strip_chars_start(" ") + return self._drop_whitespace_vals(data) + + def object_to_datetime(self, data: pl.Series, datetime_format="%Y%m%d"): """DOCUMENTATION.""" if data.dtype != "object": return data - return pd.to_datetime(data, format=datetime_format, errors="coerce") + converted = data.str.to_datetime(format=datetime_format, strict=False) + self._check_conversion(data, converted, 20) + return converted converters = dict() for dtype in properties.numeric_types: converters[dtype] = df_converters(dtype).object_to_numeric -converters["datetime"] = df_converters("datetime").object_to_datetime -converters["str"] = df_converters("str").object_to_object -converters["object"] = df_converters("object").object_to_object -converters["key"] = df_converters("key").object_to_object +converters[pl.Datetime] = df_converters(pl.Datetime).object_to_datetime +converters[pl.String] = df_converters(pl.String).object_to_object +converters[pl.Utf8] = df_converters(pl.Utf8).object_to_object +converters[pl.Object] = df_converters(pl.Object).object_to_object +converters[pl.Categorical] = df_converters(pl.Categorical).object_to_object diff --git a/cdm_reader_mapper/mdf_reader/utils/decoders.py b/cdm_reader_mapper/mdf_reader/utils/decoders.py index e1d75ba1..1437042e 100755 --- a/cdm_reader_mapper/mdf_reader/utils/decoders.py +++ b/cdm_reader_mapper/mdf_reader/utils/decoders.py @@ -2,8 +2,11 @@ from __future__ import annotations +import logging + +import polars as pl + from .. import properties -from .utilities import convert_str_boolean class df_decoders: @@ -11,21 +14,37 @@ class df_decoders: def __init__(self, dtype): # Return as object, conversion to actual type in converters only! - self.dtype = "object" + self.dtype = pl.String + + def _check_decode( + self, data: pl.Series, decoded: pl.Series, threshold: int, method: str + ): + if ( + bad_decode := data.filter(decoded.is_null() & data.is_not_null()) + ).len() > 0: + msg = f"Have {bad_decode.len()} values that failed to be {method} decoded" + if bad_decode.len() <= threshold: + msg += f": values = {', '.join(bad_decode)}" + logging.warning(msg) + return None def base36(self, data): """DOCUMENTATION.""" + # Caution: int(str(np.nan),36) ==> 30191 + decoded = ( + data.replace({"NaN": None}) + .str.strip_chars(" ") + .replace({"": None}) + .str.to_integer(base=36, strict=False) + .cast(self.dtype) + ) - def _base36(x): - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - return str(int(str(x), 36)) - - return data.apply(lambda x: _base36(x)) + self._check_decode(data, decoded, 20, "base36") + return decoded -decoders = {"base36": {}} +decoders = dict() +decoders["base36"] = dict() for dtype in properties.numeric_types: decoders["base36"][dtype] = df_decoders(dtype).base36 -decoders["base36"]["key"] = df_decoders("key").base36 +decoders["base36"][pl.Categorical] = df_decoders(pl.Categorical).base36 diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index 2ab03403..afd09275 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -2,12 +2,11 @@ from __future__ import annotations -import csv import logging import os from copy import deepcopy -from io import StringIO +import polars as pl import pandas as pd import xarray as xr @@ -81,31 +80,32 @@ def _adjust_schema(self, ds, dtypes): else: del self.schema["sections"][section]["elements"][data_var][attr] - def _select_years(self, df): - def get_years_from_datetime(date): - try: - return date.year - except AttributeError: - return date - + def _select_years(self, df: pl.DataFrame) -> pl.DataFrame: if self.year_init is None and self.year_end is None: return df + if self.imodel is None: + logging.error("Selection of years is not supported for custom schema") + return df + data_model = self.imodel.split("_")[0] - dates = df[properties.year_column[data_model]] - years = dates.apply(lambda x: get_years_from_datetime(x)) - years = years.astype(int) + dates = df.get_column(properties.year_column[data_model]) + if dates.dtype == pl.Datetime: + years = dates.dt.year() + else: + years = dates.cast(pl.Int64, strict=False) - mask = pd.Series([True] * len(years)) - if self.year_init: - mask[years < self.year_init] = False - if self.year_end: - mask[years > self.year_end] = False + mask = pl.repeat(True, df.height, eager=True) + if self.year_init and self.year_end: + mask = years.is_between(self.year_init, self.year_end, closed="both") + elif self.year_init: + mask = years.ge(self.year_init) + elif self.year_end: + mask = years.le(self.year_end) - index = mask[mask].index - return df.iloc[index].reset_index(drop=True) + return df.filter(mask) - def _read_pandas(self, **kwargs): + def _read_text(self, **kwargs): return pd.read_fwf( self.source, header=None, @@ -113,6 +113,8 @@ def _read_pandas(self, **kwargs): escapechar="\0", dtype=object, skip_blank_lines=False, + widths=[properties.MAX_FULL_REPORT_WIDTH], + names=["full_str"], **kwargs, ) @@ -126,21 +128,26 @@ def _read_sections( TextParser, order, valid, - open_with, + format, ): - if open_with == "pandas": - df = Configurator( + if format == "text": + df, mask_df = Configurator( df=TextParser, schema=self.schema, order=order, valid=valid - ).open_pandas() - elif open_with == "netcdf": - df = Configurator( + ).open_text() + elif format == "netcdf": + df, mask_df = Configurator( df=TextParser, schema=self.schema, order=order, valid=valid ).open_netcdf() else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") + raise ValueError("format has to be one of ['text', 'netcdf']") + + # missing_values = df.select(["index", "missing_values"]).pipe(set_missing_values) + df = df.pipe(self._select_years) self.columns = df.columns - return self._select_years(df) + # Replace None with NaN - is this necessary for polars? + # df = df.where(df.notnull(), np.nan) + return df, mask_df def get_configurations(self, order, valid): """DOCUMENTATION.""" @@ -154,51 +161,53 @@ def get_configurations(self, order, valid): def open_data( self, - order, - valid, chunksize, - open_with="pandas", + format="text", ): """DOCUMENTATION.""" encoding = self.schema["header"].get("encoding") - if open_with == "netcdf": + if format == "netcdf": TextParser = self._read_netcdf() - elif open_with == "pandas": - TextParser = self._read_pandas( + # NOTE: Chunking - polars does have pl.read_csv_batched, but batch_size + # is not respected: https://github.com/pola-rs/polars/issues/19978 + # alternative: lazy? + elif format == "text": + TextParser = self._read_text( encoding=encoding, - widths=[properties.MAX_FULL_REPORT_WIDTH], skiprows=self.skiprows, chunksize=chunksize, ) else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") - - if isinstance(TextParser, pd.DataFrame) or isinstance(TextParser, xr.Dataset): - return self._read_sections(TextParser, order, valid, open_with=open_with) - else: - data_buffer = StringIO() - for i, df_ in enumerate(TextParser): - df = self._read_sections(df_, order, valid, open_with=open_with) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - parse_dates=self.parse_dates, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data + raise ValueError("format has to be one of ['text', 'netcdf']") + + return TextParser + # if isinstance(TextParser, (pl.DataFrame, xr.Dataset)): + # df, mask_df = self._read_sections(TextParser, order, valid, open_with=open_with) + # return df, mask_df + # else: + # data_buffer = StringIO() + # for i, df_ in enumerate(TextParser): + # df = self._read_sections(df_, order, valid, open_with=open_with) + # df.to_csv( + # data_buffer, + # header=False, + # mode="a", + # encoding="utf-8", + # index=False, + # quoting=csv.QUOTE_NONE, + # sep=properties.internal_delimiter, + # quotechar="\0", + # escapechar="\0", + # ) + # data_buffer.seek(0) + # data = pd.read_csv( + # data_buffer, + # names=df.columns, + # chunksize=self.chunksize, + # dtype=object, + # parse_dates=self.parse_dates, + # delimiter=properties.internal_delimiter, + # quotechar="\0", + # escapechar="\0", + # ) + # return data diff --git a/cdm_reader_mapper/mdf_reader/utils/validators.py b/cdm_reader_mapper/mdf_reader/utils/validators.py index 8787dfbb..7f2fd0d1 100755 --- a/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -5,7 +5,7 @@ import logging import numpy as np -import pandas as pd +import polars as pl from .. import properties from ..codes import codes @@ -13,36 +13,24 @@ from .utilities import convert_str_boolean -def validate_datetime(elements, data): +def validate_datetime(mask, elements, data: pl.DataFrame): """DOCUMENTATION.""" + for element in elements: + col = data.get_column(element) + if not col.dtype.is_temporal(): + mask = mask.with_columns(pl.lit(False).alias(element)) + continue - def is_date_object(object): - if hasattr(object, "year"): - return True + mask = mask.with_columns((pl.col(element) & col.is_not_null()).alias(element)) - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - mask[elements] = ( - data[elements].apply(np.vectorize(is_date_object)) | data[elements].isna() - ) return mask -def validate_numeric(elements, data, schema): +def validate_numeric(mask: pl.DataFrame, elements, data: pl.DataFrame, schema): """DOCUMENTATION.""" - - # Find thresholds in schema. Flag if not available -> warn - def _to_numeric(x): - if x is None: - return np.nan - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - return float(x) - - data[elements] = data[elements].map(_to_numeric) - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - lower = {x: schema.get(x).get("valid_min", -np.inf) for x in elements} - upper = {x: schema.get(x).get("valid_max", np.inf) for x in elements} + # Handle cases where value is explicitly None in the dictionary + lower = {x: schema.get(x).get("valid_min") or -np.inf for x in elements} + upper = {x: schema.get(x).get("valid_max") or np.inf for x in elements} set_elements = [ x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf @@ -57,26 +45,45 @@ def _to_numeric(x): logging.warning( "Corresponding upper and/or lower bounds set to +/-inf for validation" ) - mask[elements] = ( - (data[elements] >= [lower.get(x) for x in elements]) - & (data[elements] <= [upper.get(x) for x in elements]) - ) | data[elements].isna() + + mask = mask.with_columns( + [ + ( + pl.col(element) + | ( + data[element].is_not_null() + & data[element].is_not_nan() + & data[element].is_between( + lower.get(element), upper.get(element), closed="both" + ) + ) + ).alias(element) + for element in elements + ] + ) return mask -def validate_str(elements, data): +def validate_str(mask, elements, data): """DOCUMENTATION.""" - return pd.DataFrame(index=data.index, data=True, columns=elements) + return mask -def validate_codes(elements, data, schema, imodel, ext_table_path): +def validate_codes( + mask: pl.DataFrame, + elements: list[str], + data: pl.DataFrame, + schema, + imodel, + ext_table_path, +): """DOCUMENTATION.""" - mask = pd.DataFrame(index=data.index, data=False, columns=elements) for element in elements: code_table_name = schema.get(element).get("codetable") if not code_table_name: logging.error(f"Code table not defined for element {element}") logging.warning("Element mask set to False") + mask = mask.with_columns(pl.lit(False).alias(element)) continue table = codes.read_table( @@ -87,19 +94,18 @@ def validate_codes(elements, data, schema, imodel, ext_table_path): if not table: continue - dtype = properties.pandas_dtypes.get(schema.get(element).get("column_type")) + dtype = properties.polars_dtypes.get(schema.get(element).get("column_type")) table_keys = list(table.keys()) - validation_df = data[element] - value = validation_df.astype(dtype).astype("str") - valid = validation_df.notna() - mask_ = value.isin(table_keys) - mask[element] = mask_.where(valid, True) + col = data.get_column(element) + col = col.cast(dtype).cast(pl.String) + valid = col.is_not_null() & col.is_not_nan() & col.is_in(table_keys) + mask = mask.with_columns((pl.col(element) | valid).alias(element)) return mask -def _get_elements(elements, element_atts, key): +def _get_elements(elements, element_atts, key) -> list[str]: def _condition(x): column_types = element_atts.get(x).get("column_type") if key == "numeric_types": @@ -125,7 +131,8 @@ def _mask_boolean(x, boolean): def validate( - data, + data: pl.DataFrame, + mask: pl.DataFrame, imodel, ext_table_path, schema, @@ -159,37 +166,25 @@ def validate( filename=None, ) # Check input - if not isinstance(data, pd.DataFrame): # or not isinstance(mask0, pd.DataFrame): + if not isinstance(data, pl.DataFrame) or not isinstance(mask, pl.DataFrame): # logging.error("Input data and mask must be a pandas data frame object") logging.error("input data must be a pandas DataFrame.") return - mask = pd.DataFrame(index=data.index, columns=data.columns, dtype=object) - if data.empty: + if data.is_empty(): return mask + disables = disables or [] + # Get the data elements from the input data: might be just a subset of # data model and flatten the schema to get a simple and sequential list # of elements included in the input data - elements = [x for x in data if x not in disables] + elements = [x for x in data.columns if x not in disables] element_atts = schemas.df_schema(elements, schema) - # See what elements we need to validate + # 1. Numeric elements numeric_elements = _get_elements(elements, element_atts, "numeric_types") - datetime_elements = _get_elements(elements, element_atts, "datetime") - coded_elements = _get_elements(elements, element_atts, "key") - str_elements = _get_elements(elements, element_atts, "str") - - if _element_tuples(numeric_elements, datetime_elements, coded_elements): - validated_columns = pd.MultiIndex.from_tuples( - list(set(numeric_elements + coded_elements + datetime_elements)) - ) - else: - validated_columns = list( - set(numeric_elements + coded_elements + datetime_elements) - ) - - mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts) + mask = validate_numeric(mask, numeric_elements, data, element_atts) # 2. Table coded elements # See following: in multiple keys code tables, the non parameter element, @@ -201,8 +196,10 @@ def validate( # Get the full list of keys combinations (tuples, triplets...) and check the column combination against that: if it fails, mark the element! # Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element # pd.DatetimeIndex(df['_datetime']).year + coded_elements = _get_elements(elements, element_atts, "key") if len(coded_elements) > 0: - mask[coded_elements] = validate_codes( + mask = validate_codes( + mask, coded_elements, data, element_atts, @@ -211,21 +208,12 @@ def validate( ) # 3. Datetime elements - mask[datetime_elements] = validate_datetime(datetime_elements, data) + datetime_elements = _get_elements(elements, element_atts, "datetime") + mask = validate_datetime(mask, datetime_elements, data) # 4. str elements - mask[str_elements] = validate_str(str_elements, data) - - # 5. Set False values - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=False), - False, - ) - - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=True), - True, - ) + str_elements = _get_elements(elements, element_atts, "str") + mask = validate_str(mask, str_elements, data) - mask[disables] = np.nan + mask = mask.with_columns([pl.lit(None).alias(disable) for disable in disables]) return mask diff --git a/cdm_reader_mapper/properties.py b/cdm_reader_mapper/properties.py index 59668b9f..2ecaef89 100755 --- a/cdm_reader_mapper/properties.py +++ b/cdm_reader_mapper/properties.py @@ -2,8 +2,23 @@ from __future__ import annotations -numeric_types = ["Int64", "int", "float"] +import polars as pl -object_types = ["str", "object", "key", "datetime"] +# numeric_types = ["Int64", "int", "float"] +numeric_types = [ + pl.Float64, + pl.Float32, + pl.Int64, + pl.Int32, + pl.Int16, + pl.Int8, + pl.UInt64, + pl.UInt32, + pl.UInt16, + pl.UInt8, +] + +# object_types = ["str", "object", "key", "datetime"] +object_types = [pl.String, pl.Utf8, pl.Datetime, pl.Object, pl.Categorical] supported_data_models = ["craid", "gcc", "icoads", "pub47"] diff --git a/docs/example_notebooks/read_overview.ipynb b/docs/example_notebooks/read_overview.ipynb index de546ab3..ced54b4b 100755 --- a/docs/example_notebooks/read_overview.ipynb +++ b/docs/example_notebooks/read_overview.ipynb @@ -16,7 +16,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-01-07 11:44:05,441 - root - INFO - init basic configure of logging success\n" + "2025-01-31 09:29:23,553 - root - INFO - init basic configure of logging success\n", + "/Users/josidd/git/github/cdm_fork/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], @@ -24,6 +26,7 @@ "from __future__ import annotations\n", "\n", "import pandas as pd\n", + "import polars as pl\n", "\n", "from cdm_reader_mapper import properties, read_mdf, test_data" ] @@ -51,7 +54,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-01-07 11:44:09,196 - root - INFO - Attempting to fetch remote file: icoads/r300/d704/input/icoads_r300_d704_1878-10-01_subset.imma.md5\n" + "2025-01-31 09:29:27,576 - root - INFO - Attempting to fetch remote file: icoads/r300/d704/input/icoads_r300_d704_1878-10-01_subset.imma.md5\n" ] }, { @@ -130,7 +133,7 @@ "A **schema** file gathers a collection of descriptors that enable the `mdf_reader` tool to access the content\n", "of a `data model/ schema` and extract the sections of the raw data file that contains meaningful information. These **schema files** are the `bones` of the data model, basically `.json` files outlining the structure of the incoming raw data.\n", "\n", - "The `mdf_reader` takes this information and translate the characteristics of the data to a python pandas dataframe.\n", + "The `mdf_reader` takes this information and translate the characteristics of the data to a python polars dataframe.\n", "\n", "The tool has several **schema** templates build in." ] @@ -171,19 +174,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-01-07 11:44:10,310 - root - INFO - READING DATA MODEL SCHEMA FILE...\n", - "2025-01-07 11:44:10,327 - root - INFO - EXTRACTING DATA FROM MODEL: icoads_r300_d704\n", - "2025-01-07 11:44:10,327 - root - INFO - Getting data string from source...\n", - "2025-01-07 11:44:11,139 - root - WARNING - Data numeric elements with missing upper or lower threshold: ('c1', 'BSI'),('c1', 'AQZ'),('c1', 'AQA'),('c1', 'UQZ'),('c1', 'UQA'),('c1', 'VQZ'),('c1', 'VQA'),('c1', 'PQZ'),('c1', 'PQA'),('c1', 'DQZ'),('c1', 'DQA'),('c5', 'OS'),('c5', 'OP'),('c5', 'FM'),('c5', 'IMMV'),('c5', 'IX'),('c5', 'W2'),('c5', 'WMI'),('c5', 'SD2'),('c5', 'SP2'),('c5', 'IS'),('c5', 'RS'),('c5', 'IC1'),('c5', 'IC2'),('c5', 'IC3'),('c5', 'IC4'),('c5', 'IC5'),('c5', 'IR'),('c5', 'RRR'),('c5', 'TR'),('c5', 'NU'),('c5', 'QCI'),('c5', 'QI1'),('c5', 'QI2'),('c5', 'QI3'),('c5', 'QI4'),('c5', 'QI5'),('c5', 'QI6'),('c5', 'QI7'),('c5', 'QI8'),('c5', 'QI9'),('c5', 'QI10'),('c5', 'QI11'),('c5', 'QI12'),('c5', 'QI13'),('c5', 'QI14'),('c5', 'QI15'),('c5', 'QI16'),('c5', 'QI17'),('c5', 'QI18'),('c5', 'QI19'),('c5', 'QI20'),('c5', 'QI21'),('c5', 'QI22'),('c5', 'QI23'),('c5', 'QI24'),('c5', 'QI25'),('c5', 'QI26'),('c5', 'QI27'),('c5', 'QI28'),('c5', 'QI29'),('c5', 'RHI'),('c5', 'AWSI'),('c6', 'FBSRC'),('c6', 'MST'),('c7', 'OPM'),('c7', 'LOT'),('c9', 'CCe'),('c9', 'WWe'),('c9', 'Ne'),('c9', 'NHe'),('c9', 'He'),('c9', 'CLe'),('c9', 'CMe'),('c9', 'CHe'),('c9', 'SBI'),('c95', 'DPRO'),('c95', 'DPRP'),('c95', 'UFR'),('c95', 'ASIR'),('c96', 'ASII'),('c97', 'ASIE'),('c99_journal', 'vessel_length'),('c99_journal', 'vessel_beam'),('c99_journal', 'hold_depth'),('c99_journal', 'tonnage'),('c99_journal', 'baro_height'),('c99_daily', 'year'),('c99_daily', 'month'),('c99_daily', 'day'),('c99_daily', 'distance'),('c99_daily', 'lat_deg_an'),('c99_daily', 'lat_min_an'),('c99_daily', 'lon_deg_an'),('c99_daily', 'lon_min_an'),('c99_daily', 'lat_deg_on'),('c99_daily', 'lat_min_on'),('c99_daily', 'lon_deg_of'),('c99_daily', 'lon_min_of'),('c99_daily', 'current_speed'),('c99_data4', 'year'),('c99_data4', 'month'),('c99_data4', 'day'),('c99_data4', 'hour'),('c99_data4', 'ship_speed'),('c99_data4', 'compass_correction'),('c99_data4', 'attached_thermometer'),('c99_data4', 'air_temperature'),('c99_data4', 'wet_bulb_temperature'),('c99_data4', 'sea_temperature'),('c99_data4', 'sky_clear'),('c99_data5', 'year'),('c99_data5', 'month'),('c99_data5', 'day'),('c99_data5', 'hour'),('c99_data5', 'ship_speed'),('c99_data5', 'attached_thermometer'),('c99_data5', 'air_temperature'),('c99_data5', 'wet_bulb_temperature'),('c99_data5', 'sea_temperature'),('c99_data5', 'sky_clear'),('c99_data5', 'compass_correction')\n", - "2025-01-07 11:44:11,139 - root - WARNING - Corresponding upper and/or lower bounds set to +/-inf for validation\n", - "2025-01-07 11:44:12,409 - root - INFO - CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL\n" + "2025-01-31 09:29:27,710 - root - INFO - READING DATA MODEL SCHEMA FILE...\n", + "2025-01-31 09:29:27,718 - root - INFO - EXTRACTING DATA FROM MODEL: icoads_r300_d704\n", + "2025-01-31 09:29:27,718 - root - INFO - Getting data string from source...\n", + "2025-01-31 09:29:27,789 - root - INFO - Extracting and reading sections\n", + "2025-01-31 09:29:27,875 - root - INFO - Create output DataBundle object\n" ] } ], "source": [ "schema = \"icoads_r300_d704\"\n", "\n", - "data = read_mdf(data_path, imodel=schema)" + "data = read_mdf(data_path, imodel=schema, validate=False)" ] }, { @@ -215,7 +217,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now metadata information can be extracted as a component of the padas dataframe." + "The full output `polars.DataFrame` can be accessed:" ] }, { @@ -226,196 +228,31 @@ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentinalreel_nojournal_noframe_noship_namejournal_edrigship_materialvessel_typevessel_length...hold_depthtonnagebaro_typebaro_heightbaro_cdatebaro_locbaro_unitsbaro_corthermo_mountSST_I
0100200180003Panay780111187...231190214NaNBulkhead of cabin1- .1022NaN
1100200180003Panay780111187...231190214NaNBulkhead of cabin1- .1022NaN
2100200180003Panay780111187...231190214NaNBulkhead of cabin1- .1022NaN
3100200180003Panay780111187...231190214NaNBulkhead of cabin1- .1022NaN
4100200180003Panay780111187...231190214NaNBulkhead of cabin1- .1022NaN
\n", - "

5 rows × 24 columns

\n", - "
" + "shape: (5, 418)
index_core_missingcore:YRcore:MOcore:DYcore:HRcore:LATcore:LONcore:IMcore:ATTCcore:TIcore:LIcore:DScore:VScore:NIDcore:IIcore:IDcore:C1core:DIcore:Dcore:WIcore:Wcore:VIcore:VVcore:WWcore:W1core:SLPcore:Acore:PPPcore:ITcore:ATcore:WBTIcore:WBTcore:DPTIcore:DPTcore:SIcore:SSTc99_data4:sea_temperaturec99_data4:present_weatherc99_data4:cloudsc99_data4:sky_clearc99_data4:sea_state_c99_data5_missingc99_data5:sentinalc99_data5:reel_noc99_data5:journal_noc99_data5:frame_startc99_data5:framec99_data5:yearc99_data5:monthc99_data5:dayc99_data5:time_indc99_data5:hourc99_data5:ship_speedc99_data5:compass_indc99_data5:ship_course_compassc99_data5:blankc99_data5:ship_course_truec99_data5:wind_dir_magc99_data5:wind_dir_truec99_data5:wind_forcec99_data5:barometerc99_data5:temp_indc99_data5:attached_thermometerc99_data5:air_temperaturec99_data5:wet_bulb_temperaturec99_data5:sea_temperaturec99_data5:present_weatherc99_data5:cloudsc99_data5:sky_clearc99_data5:sea_statec99_data5:compass_correction_indc99_data5:compass_correctionc99_data5:compass_correction_dir
u32booli64i64i64f64f64f64stri64strstrstrstrstrstrstrstrstri64strf64strstrstrstrf64strf64strf64strf64strf64strf64f64strstri64strboolstrstrstrstrstri64i64i64stri64f64strstrstrstrstrstrstrstrstrf64f64f64f64strstri64strstrf64str
0false187810206.042.28291.59"1"3"0""6""2""3"null"10""Panay"null"1"232"5"12.3nullnullnullnull996.1nullnullnullnullnullnullnullnullnullnullnull"BOC""CU"5"R"truenullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
1false187810208.042.31291.97"1"3"0""6""2""3"null"10""Panay"null"1"232"5"12.3nullnullnullnull996.3nullnullnullnullnullnullnullnullnullnullnull"BOC""SC"3"R"truenullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
2false1878102010.042.33292.36"1"3"0""6""2""3"null"10""Panay"null"1"254"5"12.3nullnullnullnull996.9nullnull"7"8.9nullnullnullnull"1"11.15.2"OCG""SC"0"R"truenullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
3false1878102012.042.35292.71"1"3"0""6""2""3"null"10""Panay"null"1"254"5"12.3nullnullnullnull997.6nullnull"7"8.9nullnullnullnull"1"11.15.2"CG""SC"0"R"truenullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
4false1878102014.042.37293.1"1"3"0""6""2""3"null"10""Panay"null"1"254"5"12.3nullnullnullnull999.2nullnull"7"8.9nullnullnullnull"1"10.05.0"BC""SC"2"L"truenullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
" ], "text/plain": [ - " sentinal reel_no journal_no frame_no ship_name journal_ed rig ship_material \\\n", - "0 1 002 0018 0003 Panay 78 01 1 \n", - "1 1 002 0018 0003 Panay 78 01 1 \n", - "2 1 002 0018 0003 Panay 78 01 1 \n", - "3 1 002 0018 0003 Panay 78 01 1 \n", - "4 1 002 0018 0003 Panay 78 01 1 \n", - "\n", - " vessel_type vessel_length ... hold_depth tonnage baro_type baro_height \\\n", - "0 1 187 ... 23 1190 2 14 \n", - "1 1 187 ... 23 1190 2 14 \n", - "2 1 187 ... 23 1190 2 14 \n", - "3 1 187 ... 23 1190 2 14 \n", - "4 1 187 ... 23 1190 2 14 \n", - "\n", - " baro_cdate baro_loc baro_units baro_cor thermo_mount SST_I \n", - "0 NaN Bulkhead of cabin 1 - .102 2 NaN \n", - "1 NaN Bulkhead of cabin 1 - .102 2 NaN \n", - "2 NaN Bulkhead of cabin 1 - .102 2 NaN \n", - "3 NaN Bulkhead of cabin 1 - .102 2 NaN \n", - "4 NaN Bulkhead of cabin 1 - .102 2 NaN \n", - "\n", - "[5 rows x 24 columns]" + "shape: (5, 418)\n", + "┌───────┬─────────────┬─────────┬─────────┬───┬─────────────┬────────────┬────────────┬────────────┐\n", + "│ index ┆ _core_missi ┆ core:YR ┆ core:MO ┆ … ┆ c99_data5:s ┆ c99_data5: ┆ c99_data5: ┆ c99_data5: │\n", + "│ --- ┆ ng ┆ --- ┆ --- ┆ ┆ ea_state ┆ compass_co ┆ compass_co ┆ compass_co │\n", + "│ u32 ┆ --- ┆ i64 ┆ i64 ┆ ┆ --- ┆ rrection_i ┆ rrection ┆ rrection_d │\n", + "│ ┆ bool ┆ ┆ ┆ ┆ str ┆ … ┆ --- ┆ … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ f64 ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ str │\n", + "╞═══════╪═════════════╪═════════╪═════════╪═══╪═════════════╪════════════╪════════════╪════════════╡\n", + "│ 0 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 1 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 2 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 3 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 4 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "└───────┴─────────────┴─────────┴─────────┴───┴─────────────┴────────────┴────────────┴────────────┘" ] }, "execution_count": 5, @@ -424,29 +261,72 @@ } ], "source": [ - "data.data.c99_journal" + "data.data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To learn how to construct a schema or data model for a particular deck/source, visit this other [tutorial notebook](https://github.com/glamod/cdm_reader_mapper/blob/main/docs/example_notebooks/CLIWOC_datamodel.ipynb)" + "Now metadata information can be extracted as a component of the polars dataframe." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 24)
c99_journal:sentinalc99_journal:reel_noc99_journal:journal_noc99_journal:frame_noc99_journal:ship_namec99_journal:journal_edc99_journal:rigc99_journal:ship_materialc99_journal:vessel_typec99_journal:vessel_lengthc99_journal:vessel_beamc99_journal:commanderc99_journal:countryc99_journal:screw_paddlec99_journal:hold_depthc99_journal:tonnagec99_journal:baro_typec99_journal:baro_heightc99_journal:baro_cdatec99_journal:baro_locc99_journal:baro_unitsc99_journal:baro_corc99_journal:thermo_mountc99_journal:SST_I
strstrstrstrstrstrstrstrstri64i64strstrstri64i64stri64strstrstrstrstrstr
"1""002""0018""0003""Panay""78""01""1""1"18737"S.P.Bray,Jr""01""3"231190"2"14null"Bulkhead of cabin""1""- .102""2"null
"1""002""0018""0003""Panay""78""01""1""1"18737"S.P.Bray,Jr""01""3"231190"2"14null"Bulkhead of cabin""1""- .102""2"null
"1""002""0018""0003""Panay""78""01""1""1"18737"S.P.Bray,Jr""01""3"231190"2"14null"Bulkhead of cabin""1""- .102""2"null
"1""002""0018""0003""Panay""78""01""1""1"18737"S.P.Bray,Jr""01""3"231190"2"14null"Bulkhead of cabin""1""- .102""2"null
"1""002""0018""0003""Panay""78""01""1""1"18737"S.P.Bray,Jr""01""3"231190"2"14null"Bulkhead of cabin""1""- .102""2"null
" + ], + "text/plain": [ + "shape: (5, 24)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ c99_journ ┆ c99_journ ┆ c99_journ ┆ c99_journ ┆ … ┆ c99_journ ┆ c99_journ ┆ c99_journ ┆ c99_jour │\n", + "│ al:sentin ┆ al:reel_n ┆ al:journa ┆ al:frame_ ┆ ┆ al:baro_u ┆ al:baro_c ┆ al:thermo ┆ nal:SST_ │\n", + "│ al ┆ o ┆ l_no ┆ no ┆ ┆ nits ┆ or ┆ _mount ┆ I │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n", + "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n", + "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n", + "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n", + "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.data.select(pl.col(\"^c99_journal:.*$\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To learn how to construct a schema or data model for a particular deck/source, visit this other [tutorial notebook](https://github.com/glamod/cdm_reader_mapper/blob/main/docs/example_notebooks/CLIWOC_datamodel.ipynb)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "cdm", "language": "python", - "name": "python3" + "name": "cdm" }, "language_info": { "codemirror_mode": { @@ -458,7 +338,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 35532d0e..730b639a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ dependencies = [ "numba >=0.60.0", "pandas>=2.2.0", "platformdirs >4.0.0", + "polars >= 1.26", "pyarrow >=15.0.0", "python-slugify", "recordlinkage >= 0.15", @@ -225,7 +226,9 @@ target-version = "py312" exclude = [ ".git", "build", - ".eggs" + ".eggs", + ".venv", + "venv" ] extend-include = [ "*.ipynb" # Include notebooks @@ -261,6 +264,7 @@ check-typed-exception = true "matplotlib.pyplot" = "plt" numpy = "np" pandas = "pd" +polars = "pl" scipy = "sp" xarray = "xr"