diff --git a/.gitignore b/.gitignore
index 5471529b..f49e3a69 100755
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,6 @@ ENV/
# IDE settings
.vscode/
.idea/
+
+# UV
+uv.lock
diff --git a/cdm_reader_mapper/mdf_reader/properties.py b/cdm_reader_mapper/mdf_reader/properties.py
index 5289d9ae..f6f491be 100755
--- a/cdm_reader_mapper/mdf_reader/properties.py
+++ b/cdm_reader_mapper/mdf_reader/properties.py
@@ -2,6 +2,8 @@
from __future__ import annotations
+import polars as pl
+
from ..properties import numeric_types, object_types, supported_data_models # noqa
_base = "cdm_reader_mapper.mdf_reader"
@@ -16,22 +18,23 @@
"craid": ("drifter_measurements", "JULD"),
}
-pandas_dtypes = {}
+polars_dtypes = {}
for dtype in object_types:
- pandas_dtypes[dtype] = "object"
-pandas_dtypes.update({x: x for x in numeric_types})
-pandas_dtypes["datetime"] = "datetime"
+ polars_dtypes[dtype] = pl.String
+polars_dtypes.update({x: x for x in numeric_types})
+polars_dtypes[pl.Datetime] = pl.Datetime
-pandas_int = "Int64"
+polars_int = pl.Int64
# ....and how they are managed
data_type_conversion_args = {}
for dtype in numeric_types:
data_type_conversion_args[dtype] = ["scale", "offset"]
-data_type_conversion_args["str"] = ["disable_white_strip"]
-data_type_conversion_args["object"] = ["disable_white_strip"]
-data_type_conversion_args["key"] = ["disable_white_strip"]
-data_type_conversion_args["datetime"] = ["datetime_format"]
+data_type_conversion_args[pl.Utf8] = ["disable_white_strip"]
+data_type_conversion_args[pl.String] = ["disable_white_strip"]
+data_type_conversion_args[pl.Categorical] = ["disable_white_strip"]
+data_type_conversion_args[pl.Object] = ["disable_white_strip"]
+data_type_conversion_args[pl.Datetime] = ["datetime_format"]
# Misc ------------------------------------------------------------------------
dummy_level = "_SECTION_"
diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py
index 9a0ef14e..0a514b74 100755
--- a/cdm_reader_mapper/mdf_reader/reader.py
+++ b/cdm_reader_mapper/mdf_reader/reader.py
@@ -6,9 +6,11 @@
import csv
import logging
import os
-from io import StringIO as StringIO
+from io import StringIO
import pandas as pd
+import polars as pl
+import xarray as xr
from cdm_reader_mapper.common.json_dict import open_json_file
from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy
@@ -45,24 +47,26 @@ def _convert_and_decode(
converter_kwargs,
decoder_dict,
):
+        """Apply decoders, then converters, to each configured section column of *df*."""
for section in converter_dict.keys():
if section not in df.columns:
continue
if section in decoder_dict.keys():
decoded = decoder_dict[section](df[section])
- decoded.index = df[section].index
- df[section] = decoded
+ df = df.with_columns(decoded.alias(section))
converted = converter_dict[section](
df[section], **converter_kwargs[section]
)
- converted.index = df[section].index
- df[section] = converted
+            # No index realignment needed here: polars has no index, with_columns aligns by position
+ df = df.with_columns(converted.alias(section))
return df
- def _validate(self, df):
+ def _validate(self, df, mask) -> pl.DataFrame:
+        """Validate *df* entries against the imodel schema, returning an updated boolean *mask*."""
return validate(
data=df,
+ mask=mask,
imodel=self.imodel,
ext_table_path=self.ext_table_path,
schema=self.schema,
@@ -115,78 +119,31 @@ def convert_and_decode_entries(
if decode is not True:
decoder_dict = {}
- if isinstance(data, pd.DataFrame):
- data = self._convert_and_decode(
- data,
- converter_dict,
- converter_kwargs,
- decoder_dict,
- )
- else:
- data_buffer = StringIO()
- TextParser = make_copy(data)
- for i, df_ in enumerate(TextParser):
- df = self._convert_and_decode(
- df_,
- converter_dict,
- converter_kwargs,
- decoder_dict,
- )
- df.to_csv(
- data_buffer,
- header=False,
- mode="a",
- encoding=self.encoding,
- index=False,
- quoting=csv.QUOTE_NONE,
- sep=properties.internal_delimiter,
- quotechar="\0",
- escapechar="\0",
- )
-
- data_buffer.seek(0)
- data = pd.read_csv(
- data_buffer,
- names=df.columns,
- chunksize=self.chunksize,
- dtype=object,
- delimiter=properties.internal_delimiter,
- quotechar="\0",
- escapechar="\0",
- )
+ data = self._convert_and_decode(
+ data,
+ converter_dict,
+ converter_kwargs,
+ decoder_dict,
+ )
return data
- def validate_entries(self, data, validate):
+ def validate_entries(self, data, mask, validate) -> pl.DataFrame:
"""Validate data entries by using a pre-defined data model.
Fill attribute `valid` with boolean mask.
"""
if validate is not True:
- mask = pd.DataFrame()
- elif isinstance(data, pd.DataFrame):
- mask = self._validate(data)
+ mask = pl.DataFrame()
+ elif isinstance(data, pl.DataFrame):
+ mask = self._validate(data, mask)
else:
- data_buffer = StringIO()
- TextParser_ = make_copy(data)
- for i, df_ in enumerate(TextParser_):
- mask_ = self._validate(df_)
- mask_.to_csv(
- data_buffer,
- header=False,
- mode="a",
- encoding=self.encoding,
- index=False,
- )
- data_buffer.seek(0)
- mask = pd.read_csv(
- data_buffer,
- names=df_.columns,
- chunksize=self.chunksize,
- )
+ raise TypeError("Unknown data type")
return mask
def remove_boolean_values(self, data):
"""DOCUMENTATION"""
+ if isinstance(data, pl.DataFrame):
+ return data
if isinstance(data, pd.DataFrame):
data = data.map(remove_boolean_values)
dtype = adjust_dtype(self.dtypes, data)
@@ -273,6 +230,7 @@ def read(
self.chunksize = chunksize
self.skiprows = skiprows
+ self.format = properties.open_file.get(self.imodel, "text")
# 2. READ AND VALIDATE DATA
logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}")
@@ -287,26 +245,71 @@ def read(
# a list with a single dataframe or a pd.io.parsers.TextFileReader
logging.info("Getting data string from source...")
self.configurations = self.get_configurations(read_sections_list, sections)
- data = self.open_data(
- read_sections_list,
- sections,
- # INFO: Set default as "pandas" to account for custom schema
- open_with=properties.open_file.get(self.imodel, "pandas"),
+
+ TextParser = self.open_data(
chunksize=chunksize,
+ format=self.format,
)
# 2.3. Extract, read and validate data in same loop
- logging.info("Extracting and reading sections")
- data = self.convert_and_decode_entries(
- data,
- convert=convert,
- decode=decode,
- )
- mask = self.validate_entries(data, validate)
+ if isinstance(TextParser, (pd.DataFrame, xr.Dataset)):
+ data, mask = self._read_loop(
+ TextParser, read_sections_list, sections, decode, convert, validate
+ )
+ else:
+ data_buffer = StringIO()
+ mask_buffer = StringIO()
+ for df_ in TextParser:
+ df, mask_ = self._read_loop(
+ df_, read_sections_list, sections, decode, convert, validate
+ )
+ df.to_csv(
+ data_buffer,
+ header=False,
+ mode="a",
+ encoding="utf-8",
+ index=False,
+ quoting=csv.QUOTE_NONE,
+ sep=properties.internal_delimiter,
+ quotechar="\0",
+ escapechar="\0",
+ )
+ mask_.to_csv(
+ mask_buffer,
+ header=False,
+ mode="a",
+ encoding="utf-8",
+ index=False,
+ quoting=csv.QUOTE_NONE,
+ sep=properties.internal_delimiter,
+ quotechar="\0",
+ escapechar="\0",
+ )
+ data_buffer.seek(0)
+ data = pd.read_csv(
+ data_buffer,
+ names=self.columns,
+ chunksize=self.chunksize,
+ dtype=object,
+ parse_dates=self.parse_dates,
+ delimiter=properties.internal_delimiter,
+ quotechar="\0",
+ escapechar="\0",
+ )
+ mask_buffer.seek(0)
+ mask = pd.read_csv(
+ mask_buffer,
+ names=self.columns,
+ chunksize=self.chunksize,
+ dtype=object,
+ parse_dates=self.parse_dates,
+ delimiter=properties.internal_delimiter,
+ quotechar="\0",
+ escapechar="\0",
+ )
# 3. Create output DataBundle object
- logging.info("Creata output DataBundle object")
- data = self.remove_boolean_values(data)
+ logging.info("Create output DataBundle object")
return DataBundle(
data=data,
columns=self.columns,
@@ -317,6 +320,29 @@ def read(
imodel=self.imodel,
)
+ def _read_loop(
+ self, TextParser, order, valid, decode, convert, validate
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
+ logging.info("Extracting and reading sections")
+ data, mask = self._read_sections(TextParser, order, valid, format=self.format)
+
+ logging.info("Decoding and converting entries")
+ data = self.convert_and_decode_entries(
+ data,
+ convert=convert,
+ decode=decode,
+ )
+
+ logging.info("Extracting and reading sections")
+ mask = self.validate_entries(data, mask, validate)
+
+ renames = {c: tuple(c.split(":")) for c in data.columns}
+
+ return (
+ data.to_pandas().rename(columns=renames),
+ mask.to_pandas().rename(columns=renames),
+ )
+
def read_mdf(
source,
diff --git a/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/cdm_reader_mapper/mdf_reader/schemas/schemas.py
index 1a0d115c..2afe668a 100755
--- a/cdm_reader_mapper/mdf_reader/schemas/schemas.py
+++ b/cdm_reader_mapper/mdf_reader/schemas/schemas.py
@@ -13,30 +13,64 @@
import os
from pathlib import Path
+import polars as pl
+
from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts
from .. import properties
+def _get_type_map():
+ return {
+ "float": pl.Float64,
+ "float64": pl.Float64,
+ "float32": pl.Float32,
+ "int": pl.Int64,
+ "int64": pl.Int64,
+ "int32": pl.Int32,
+ "int16": pl.Int16,
+ "int8": pl.Int8,
+ "uint": pl.UInt64,
+ "uint64": pl.UInt64,
+ "uint32": pl.UInt32,
+ "uint16": pl.UInt16,
+ "uint8": pl.UInt8,
+ "str": pl.String,
+ "object": pl.String,
+ "key": pl.String,
+ "date": pl.Datetime,
+ "datetime": pl.Datetime,
+ "time": pl.Datetime,
+ }
+
+
def convert_dtype_to_default(dtype, section, element):
"""Convert data type to defaults (int, float)."""
if dtype is None:
- return
- elif dtype == "float":
- return dtype
- elif dtype == "int":
- return properties.pandas_int
- elif "float" in dtype.lower():
+ return # pl.String
+ dtype_map = _get_type_map()
+ dtype = dtype.lower()
+
+ polars_dtype = dtype_map.get(dtype)
+ if polars_dtype is not None:
+ return polars_dtype
+
+ if "float" in dtype:
+ logging.warning(
+ f"Set column type of ({section}, {element}) from deprecated {dtype} to float 32."
+ )
+ return pl.Float32
+ elif "uint" in dtype:
logging.warning(
- f"Set column type of ({section}, {element}) from deprecated {dtype} to float."
+ f"Set column type of ({section}, {element}) from deprecated {dtype} to uint 32."
)
- return "float"
- elif "int" in dtype.lower():
+ return pl.UInt32
+ elif "int" in dtype:
logging.warning(
- f"Set column type of ({section}, {element}) from deprecated {dtype} to int."
+ f"Set column type of ({section}, {element}) from deprecated {dtype} to int 32."
)
- return properties.pandas_int
- return dtype
+ return pl.Int32
+ return pl.String
def _read_schema(schema):
@@ -154,7 +188,7 @@ def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None):
else:
imodel = imodel.split("_")
if imodel[0] not in properties.supported_data_models:
- logging.error("Input data model " f"{imodel[0]}" " not supported")
+ logging.error(f"Input data model {imodel[0]} not supported")
return
schema_files = collect_json_files(*imodel, base=f"{properties._base}.schemas")
@@ -209,7 +243,7 @@ def clean_schema(columns, schema):
def get_index(idx, lst, section):
if len(lst) == 1:
return idx
- return (section, idx)
+ return ":".join([section, idx])
flat_schema = dict()
for section in schema.get("sections"):
diff --git a/cdm_reader_mapper/mdf_reader/utils/configurator.py b/cdm_reader_mapper/mdf_reader/utils/configurator.py
index 652c02bd..effe3fb3 100755
--- a/cdm_reader_mapper/mdf_reader/utils/configurator.py
+++ b/cdm_reader_mapper/mdf_reader/utils/configurator.py
@@ -3,11 +3,11 @@
from __future__ import annotations
import ast
-import csv
-import logging
-import numpy as np
import pandas as pd
+import polars as pl
+import polars.selectors as cs
+import xarray as xr
from .. import properties
from . import converters, decoders
@@ -19,7 +19,7 @@ class Configurator:
def __init__(
self,
- df=pd.DataFrame(),
+ df: pd.DataFrame | pl.DataFrame | xr.Dataset = pl.DataFrame(),
schema=None,
order=None,
valid=None,
@@ -38,7 +38,7 @@ def _get_index(self, section, order):
if len(self.orders) == 1:
return section
else:
- return (order, section)
+ return ":".join([order, section])
def _get_ignore(self, section_dict):
ignore = section_dict.get("ignore")
@@ -47,7 +47,7 @@ def _get_ignore(self, section_dict):
return ignore
def _get_dtype(self):
- return properties.pandas_dtypes.get(self.sections_dict.get("column_type"))
+ return properties.polars_dtypes.get(self.sections_dict.get("column_type"))
def _get_converter(self):
return converters.get(self.sections_dict.get("column_type"))
@@ -137,92 +137,148 @@ def get_configuration(self):
},
}
- def open_pandas(self):
- """Open TextParser to pd.DataSeries."""
- return self.df.apply(lambda x: self._read_line(x[0]), axis=1)
-
- def _read_line(self, line: str):
- i = j = 0
- data_dict = {}
- for order in self.orders:
- header = self.schema["sections"][order]["header"]
-
- disable_read = header.get("disable_read")
- if disable_read is True:
- data_dict[order] = line[i : properties.MAX_FULL_REPORT_WIDTH]
- continue
+ def open_text(self):
+        """Open TextParser to a polars.DataFrame."""
+ if isinstance(self.df, pd.DataFrame):
+ self.df = pl.from_pandas(self.df)
+ if not isinstance(self.df, pl.DataFrame):
+ raise TypeError(f"Cannot open with polars for {type(self.df) = }")
+ self.df = self.df
+ mask_df = pl.DataFrame()
+ for section in self.orders:
+ header = self.schema["sections"][section]["header"]
sentinal = header.get("sentinal")
- bad_sentinal = sentinal is not None and not self._validate_sentinal(
- i, line, sentinal
- )
section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH)
- sections = self.schema["sections"][order]["elements"]
+ # Get data associated with current section
+ if sentinal is not None:
+ self.df = self.df.with_columns(
+ [
+ (
+ pl.when(pl.col("full_str").str.starts_with(sentinal))
+ .then(pl.col("full_str").str.head(section_length))
+ .otherwise(pl.lit(None))
+ .alias(section)
+ ),
+ (
+ pl.when(pl.col("full_str").str.starts_with(sentinal))
+ .then(pl.col("full_str").str.tail(-section_length))
+ .otherwise(pl.col("full_str"))
+ .alias("full_str")
+ ),
+ ]
+ )
+ else:
+ # Sentinal is None, the section is always present
+ self.df = self.df.with_columns(
+ [
+ pl.col("full_str").str.head(section_length).alias(section),
+ pl.col("full_str").str.tail(-section_length).alias("full_str"),
+ ]
+ )
+
+ # Used for validation
+ section_missing = self.df.get_column(section).is_null()
+
+ # Don't read fields
+ disable_read = header.get("disable_read", False)
+ if disable_read is True:
+ continue
+
+ fields = self.schema["sections"][section]["elements"]
field_layout = header.get("field_layout")
delimiter = header.get("delimiter")
+
+ # Handle delimited section
if delimiter is not None:
delimiter_format = header.get("format")
if delimiter_format == "delimited":
# Read as CSV
- field_names = sections.keys()
- fields = list(csv.reader([line[i:]], delimiter=delimiter))[0]
- for field_name, field in zip(field_names, fields):
- index = self._get_index(field_name, order)
- data_dict[index] = field.strip()
- i += len(field)
- j = i
+ field_names = fields.keys()
+ field_names = [self._get_index(x, section) for x in field_names]
+ n_fields = len(field_names)
+ self.df = self.df.with_columns(
+ pl.col(section)
+ .str.splitn(delimiter, n_fields)
+ .struct.rename_fields(field_names)
+ .struct.unnest()
+ )
+ for field in field_names:
+ self.df = self.df.with_columns(
+ pl.col(field).str.strip_chars(" ").name.keep()
+ )
+ mask_df = mask_df.with_columns(
+ (
+ section_missing
+ | self.df.get_column(field).is_not_null()
+ ).alias(field)
+ )
+ self.df = self.df.drop([section])
+
continue
elif field_layout != "fixed_width":
- logging.error(
- f"Delimiter for {order} is set to {delimiter}. Please specify either format or field_layout in your header schema {header}."
+ raise ValueError(
+ f"Delimiter for {section} is set to {delimiter}. "
+ + f"Please specify either format or field_layout in your header schema {header}."
)
- return
- k = i + section_length
- for section, section_dict in sections.items():
- missing = True
- index = self._get_index(section, order)
- ignore = (order not in self.valid) or self._get_ignore(section_dict)
- na_value = section_dict.get("missing_value")
- field_length = section_dict.get(
+ # Loop through fixed-width fields
+ for field, field_dict in fields.items():
+ index = self._get_index(field, section)
+ ignore = (section not in self.valid) or self._get_ignore(field_dict)
+ field_length = field_dict.get(
"field_length", properties.MAX_FULL_REPORT_WIDTH
)
+ na_value = field_dict.get("missing_value")
- j = (i + field_length) if not bad_sentinal else i
- if j > k:
- missing = False
- j = k
-
- if ignore is not True:
- value = line[i:j]
-
- if not value.strip():
- value = True
- if value == na_value:
- value = True
-
- if i == j and missing is True:
- value = False
+ if ignore:
+ # Move to next field
+ self.df = self.df.with_columns(
+ pl.col(section).str.tail(-field_length).name.keep(),
+ )
+ if delimiter is not None:
+ self.df = self.df.with_columns(
+ pl.col(section).str.strip_prefix(delimiter).name.keep()
+ )
+ continue
- data_dict[index] = value
+ missing_map = {"": None}
+ if na_value is not None:
+ missing_map[na_value] = None
+
+ self.df = self.df.with_columns(
+ [
+ # If section not present in a row, then both these are null
+ (
+ pl.col(section)
+ .str.head(field_length)
+ .str.strip_chars(" ")
+ .replace(missing_map)
+ .alias(index)
+ ),
+ pl.col(section).str.tail(-field_length).name.keep(),
+ ]
+ )
+ mask_df = mask_df.with_columns(
+ (section_missing | self.df.get_column(index).is_not_null()).alias(
+ index
+ )
+ )
+ if delimiter is not None:
+ self.df = self.df.with_columns(
+ pl.col(section).str.strip_prefix(delimiter).name.keep()
+ )
- if delimiter is not None and line[j : j + len(delimiter)] == delimiter:
- j += len(delimiter)
- i = j
+ self.df = self.df.drop([section])
- return pd.Series(data_dict)
+ return self.df.drop("full_str"), mask_df
def open_netcdf(self):
- """Open netCDF to pd.Series."""
-
- def replace_empty_strings(series):
- if series.dtype == "object":
- series = series.str.decode("utf-8")
- series = series.str.strip()
- series = series.map(lambda x: True if x == "" else x)
- return series
+ """Open netCDF to polars.DataFrame."""
+ if not isinstance(self.df, xr.Dataset):
+ raise TypeError(f"Cannot open with netCDF for {type(self.df) = }")
missing_values = []
attrs = {}
@@ -249,15 +305,28 @@ def replace_empty_strings(series):
elif section in self.df.dims:
renames[section] = index
elif section in self.df.attrs:
- attrs[index] = self.df.attrs[index]
+ # Initialise a constant column
+ attrs[index] = pl.lit(self.df.attrs[section].replace("\n", "; "))
else:
missing_values.append(index)
- df = self.df[renames.keys()].to_dataframe().reset_index()
- attrs = {k: v.replace("\n", "; ") for k, v in attrs.items()}
- df = df.rename(columns=renames)
- df = df.assign(**attrs)
- df[disables] = np.nan
- df = df.apply(lambda x: replace_empty_strings(x))
- df[missing_values] = False
- return df
+ df: pl.DataFrame = pl.from_pandas(
+ self.df[renames.keys()].to_dataframe().reset_index()
+ )
+ df = df.rename(mapping=renames)
+ df = df.with_columns(**attrs)
+ df = df.with_columns(
+ [pl.lit(None).alias(missing) for missing in missing_values]
+ )
+ df = df.with_columns([pl.lit(None).alias(disable) for disable in disables])
+ # Replace empty or whitespace string with None
+ df = df.with_columns(
+ cs.binary().cast(pl.String).str.strip_chars().replace("", None)
+ )
+
+ # Create missing mask
+ mask_df = df.select(pl.all().is_not_null())
+ mask_df = mask_df.with_columns(
+ [pl.lit(True).alias(c) for c in missing_values + disables]
+ )
+ return df, mask_df
diff --git a/cdm_reader_mapper/mdf_reader/utils/converters.py b/cdm_reader_mapper/mdf_reader/utils/converters.py
index 4c14aa5e..9485bb45 100755
--- a/cdm_reader_mapper/mdf_reader/utils/converters.py
+++ b/cdm_reader_mapper/mdf_reader/utils/converters.py
@@ -2,10 +2,11 @@
from __future__ import annotations
-import pandas as pd
+import logging
+
+import polars as pl
from .. import properties
-from .utilities import convert_str_boolean
class df_converters:
@@ -16,24 +17,59 @@ def __init__(self, dtype):
self.numeric_scale = 1.0 if self.dtype == "float" else 1
self.numeric_offset = 0.0 if self.dtype == "float" else 0
- def to_numeric(self, data, offset, scale):
- """Convert object type elements of a pandas series to numeric type."""
+ def _check_conversion(self, data: pl.Series, converted: pl.Series, threshold: int):
+ if (
+ bad_converts := data.filter(converted.is_null() & data.is_not_null())
+ ).len() > 0:
+ msg = f"Have {bad_converts.len()} values that failed to be converted to {self.dtype}"
+ if bad_converts.len() <= threshold:
+ msg += f": values = {', '.join(bad_converts)}"
+ logging.warning(msg)
+ return None
+
+ def _drop_whitespace_vals(self, data: pl.Series):
+ data_name = data.name
+ return (
+ data.to_frame()
+ .select(
+ pl.when(data.str.contains(r"^\s*$"))
+ .then(pl.lit(None))
+ .otherwise(data)
+ .alias(data_name)
+ )
+ .get_column(data_name)
+ )
+
+ # May not be needed?
+ def decode(self, data):
+        """Decode object type elements of a polars Series to UTF-8."""
- def _to_numeric(x):
- x = convert_str_boolean(x)
- if isinstance(x, bool):
+ def _decode(x):
+ if not isinstance(x, str):
return x
- if isinstance(x, str):
- x = x.strip()
- x.replace(" ", "0")
+
try:
- return offset + float(x) * scale
- except ValueError:
- return False
+ encoded = x.encode("latin1")
+ return encoded.decode("utf-8")
+ except (UnicodeDecodeError, UnicodeEncodeError):
+ return x
+
+ return data.apply(lambda x: _decode(x))
- return data.apply(lambda x: _to_numeric(x))
+ def to_numeric(self, data: pl.Series):
+        """Convert object type elements of a polars Series to numeric type."""
+ data = self._drop_whitespace_vals(data)
+        # str namespace methods can fail if the column is all-null; note that
+        # Series.str.replace_all (substring replace) differs from Series.replace (value map)
+ data = data.str.strip_chars().str.replace_all(" ", "0")
+
+ converted = data.cast(self.dtype, strict=False)
+ self._check_conversion(data, converted, 20)
- def object_to_numeric(self, data, scale=None, offset=None):
+        # Return the converted values; scale/offset and the final cast happen in object_to_numeric
+ return converted
+
+ def object_to_numeric(self, data: pl.Series, scale=None, offset=None):
"""
Convert the object type elements of a pandas series to numeric type.
@@ -63,36 +99,40 @@ def object_to_numeric(self, data, scale=None, offset=None):
"""
scale = scale if scale else self.numeric_scale
offset = offset if offset else self.numeric_offset
- if data.dtype == "object":
- data = self.to_numeric(data, offset, scale)
- return data
+ if not data.dtype.is_numeric():
+ data = self.to_numeric(data)
+
+ data = offset + data * scale
+ return data.cast(self.dtype)
- def object_to_object(self, data, disable_white_strip=False):
+ def object_to_object(self, data: pl.Series, disable_white_strip=None):
"""DOCUMENTATION."""
if data.dtype != "object":
return data
-
- if not disable_white_strip:
- data = data.str.strip()
- elif disable_white_strip == "l":
- data = data.str.rstrip()
- elif disable_white_strip == "r":
- data = data.str.lstrip()
- return data.apply(
- lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x
- )
-
- def object_to_datetime(self, data, datetime_format="%Y%m%d"):
+ data = self.decode(data)
+ if disable_white_strip is None:
+ data = data.str.strip_chars(" ")
+ else:
+ if disable_white_strip == "l":
+ data = data.str.strip_chars_end(" ")
+ elif disable_white_strip == "r":
+ data = data.str.strip_chars_start(" ")
+ return self._drop_whitespace_vals(data)
+
+ def object_to_datetime(self, data: pl.Series, datetime_format="%Y%m%d"):
"""DOCUMENTATION."""
if data.dtype != "object":
return data
- return pd.to_datetime(data, format=datetime_format, errors="coerce")
+ converted = data.str.to_datetime(format=datetime_format, strict=False)
+ self._check_conversion(data, converted, 20)
+ return converted
converters = dict()
for dtype in properties.numeric_types:
converters[dtype] = df_converters(dtype).object_to_numeric
-converters["datetime"] = df_converters("datetime").object_to_datetime
-converters["str"] = df_converters("str").object_to_object
-converters["object"] = df_converters("object").object_to_object
-converters["key"] = df_converters("key").object_to_object
+converters[pl.Datetime] = df_converters(pl.Datetime).object_to_datetime
+converters[pl.String] = df_converters(pl.String).object_to_object
+converters[pl.Utf8] = df_converters(pl.Utf8).object_to_object
+converters[pl.Object] = df_converters(pl.Object).object_to_object
+converters[pl.Categorical] = df_converters(pl.Categorical).object_to_object
diff --git a/cdm_reader_mapper/mdf_reader/utils/decoders.py b/cdm_reader_mapper/mdf_reader/utils/decoders.py
index e1d75ba1..1437042e 100755
--- a/cdm_reader_mapper/mdf_reader/utils/decoders.py
+++ b/cdm_reader_mapper/mdf_reader/utils/decoders.py
@@ -2,8 +2,11 @@
from __future__ import annotations
+import logging
+
+import polars as pl
+
from .. import properties
-from .utilities import convert_str_boolean
class df_decoders:
@@ -11,21 +14,37 @@ class df_decoders:
def __init__(self, dtype):
# Return as object, conversion to actual type in converters only!
- self.dtype = "object"
+ self.dtype = pl.String
+
+ def _check_decode(
+ self, data: pl.Series, decoded: pl.Series, threshold: int, method: str
+ ):
+ if (
+ bad_decode := data.filter(decoded.is_null() & data.is_not_null())
+ ).len() > 0:
+ msg = f"Have {bad_decode.len()} values that failed to be {method} decoded"
+ if bad_decode.len() <= threshold:
+ msg += f": values = {', '.join(bad_decode)}"
+ logging.warning(msg)
+ return None
def base36(self, data):
"""DOCUMENTATION."""
+ # Caution: int(str(np.nan),36) ==> 30191
+ decoded = (
+ data.replace({"NaN": None})
+ .str.strip_chars(" ")
+ .replace({"": None})
+ .str.to_integer(base=36, strict=False)
+ .cast(self.dtype)
+ )
- def _base36(x):
- x = convert_str_boolean(x)
- if isinstance(x, bool):
- return x
- return str(int(str(x), 36))
-
- return data.apply(lambda x: _base36(x))
+ self._check_decode(data, decoded, 20, "base36")
+ return decoded
-decoders = {"base36": {}}
+decoders = dict()
+decoders["base36"] = dict()
for dtype in properties.numeric_types:
decoders["base36"][dtype] = df_decoders(dtype).base36
-decoders["base36"]["key"] = df_decoders("key").base36
+decoders["base36"][pl.Categorical] = df_decoders(pl.Categorical).base36
diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py
index 2ab03403..afd09275 100755
--- a/cdm_reader_mapper/mdf_reader/utils/filereader.py
+++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py
@@ -2,12 +2,11 @@
from __future__ import annotations
-import csv
import logging
import os
from copy import deepcopy
-from io import StringIO
+import polars as pl
import pandas as pd
import xarray as xr
@@ -81,31 +80,32 @@ def _adjust_schema(self, ds, dtypes):
else:
del self.schema["sections"][section]["elements"][data_var][attr]
- def _select_years(self, df):
- def get_years_from_datetime(date):
- try:
- return date.year
- except AttributeError:
- return date
-
+ def _select_years(self, df: pl.DataFrame) -> pl.DataFrame:
if self.year_init is None and self.year_end is None:
return df
+ if self.imodel is None:
+ logging.error("Selection of years is not supported for custom schema")
+ return df
+
data_model = self.imodel.split("_")[0]
- dates = df[properties.year_column[data_model]]
- years = dates.apply(lambda x: get_years_from_datetime(x))
- years = years.astype(int)
+ dates = df.get_column(properties.year_column[data_model])
+ if dates.dtype == pl.Datetime:
+ years = dates.dt.year()
+ else:
+ years = dates.cast(pl.Int64, strict=False)
- mask = pd.Series([True] * len(years))
- if self.year_init:
- mask[years < self.year_init] = False
- if self.year_end:
- mask[years > self.year_end] = False
+ mask = pl.repeat(True, df.height, eager=True)
+ if self.year_init and self.year_end:
+ mask = years.is_between(self.year_init, self.year_end, closed="both")
+ elif self.year_init:
+ mask = years.ge(self.year_init)
+ elif self.year_end:
+ mask = years.le(self.year_end)
- index = mask[mask].index
- return df.iloc[index].reset_index(drop=True)
+ return df.filter(mask)
- def _read_pandas(self, **kwargs):
+ def _read_text(self, **kwargs):
return pd.read_fwf(
self.source,
header=None,
@@ -113,6 +113,8 @@ def _read_pandas(self, **kwargs):
escapechar="\0",
dtype=object,
skip_blank_lines=False,
+ widths=[properties.MAX_FULL_REPORT_WIDTH],
+ names=["full_str"],
**kwargs,
)
@@ -126,21 +128,26 @@ def _read_sections(
TextParser,
order,
valid,
- open_with,
+ format,
):
- if open_with == "pandas":
- df = Configurator(
+ if format == "text":
+ df, mask_df = Configurator(
df=TextParser, schema=self.schema, order=order, valid=valid
- ).open_pandas()
- elif open_with == "netcdf":
- df = Configurator(
+ ).open_text()
+ elif format == "netcdf":
+ df, mask_df = Configurator(
df=TextParser, schema=self.schema, order=order, valid=valid
).open_netcdf()
else:
- raise ValueError("open_with has to be one of ['pandas', 'netcdf']")
+ raise ValueError("format has to be one of ['text', 'netcdf']")
+
+ # missing_values = df.select(["index", "missing_values"]).pipe(set_missing_values)
+ df = df.pipe(self._select_years)
self.columns = df.columns
- return self._select_years(df)
+ # Replace None with NaN - is this necessary for polars?
+ # df = df.where(df.notnull(), np.nan)
+ return df, mask_df
def get_configurations(self, order, valid):
"""DOCUMENTATION."""
@@ -154,51 +161,53 @@ def get_configurations(self, order, valid):
def open_data(
self,
- order,
- valid,
chunksize,
- open_with="pandas",
+ format="text",
):
"""DOCUMENTATION."""
encoding = self.schema["header"].get("encoding")
- if open_with == "netcdf":
+ if format == "netcdf":
TextParser = self._read_netcdf()
- elif open_with == "pandas":
- TextParser = self._read_pandas(
+ # NOTE: Chunking - polars does have pl.read_csv_batched, but batch_size
+ # is not respected: https://github.com/pola-rs/polars/issues/19978
+ # alternative: lazy?
+ elif format == "text":
+ TextParser = self._read_text(
encoding=encoding,
- widths=[properties.MAX_FULL_REPORT_WIDTH],
skiprows=self.skiprows,
chunksize=chunksize,
)
else:
- raise ValueError("open_with has to be one of ['pandas', 'netcdf']")
-
- if isinstance(TextParser, pd.DataFrame) or isinstance(TextParser, xr.Dataset):
- return self._read_sections(TextParser, order, valid, open_with=open_with)
- else:
- data_buffer = StringIO()
- for i, df_ in enumerate(TextParser):
- df = self._read_sections(df_, order, valid, open_with=open_with)
- df.to_csv(
- data_buffer,
- header=False,
- mode="a",
- encoding=encoding,
- index=False,
- quoting=csv.QUOTE_NONE,
- sep=properties.internal_delimiter,
- quotechar="\0",
- escapechar="\0",
- )
- data_buffer.seek(0)
- data = pd.read_csv(
- data_buffer,
- names=df.columns,
- chunksize=self.chunksize,
- dtype=object,
- parse_dates=self.parse_dates,
- delimiter=properties.internal_delimiter,
- quotechar="\0",
- escapechar="\0",
- )
- return data
+ raise ValueError("format has to be one of ['text', 'netcdf']")
+
+ return TextParser
+ # if isinstance(TextParser, (pl.DataFrame, xr.Dataset)):
+ # df, mask_df = self._read_sections(TextParser, order, valid, open_with=open_with)
+ # return df, mask_df
+ # else:
+ # data_buffer = StringIO()
+ # for i, df_ in enumerate(TextParser):
+ # df = self._read_sections(df_, order, valid, open_with=open_with)
+ # df.to_csv(
+ # data_buffer,
+ # header=False,
+ # mode="a",
+ # encoding="utf-8",
+ # index=False,
+ # quoting=csv.QUOTE_NONE,
+ # sep=properties.internal_delimiter,
+ # quotechar="\0",
+ # escapechar="\0",
+ # )
+ # data_buffer.seek(0)
+ # data = pd.read_csv(
+ # data_buffer,
+ # names=df.columns,
+ # chunksize=self.chunksize,
+ # dtype=object,
+ # parse_dates=self.parse_dates,
+ # delimiter=properties.internal_delimiter,
+ # quotechar="\0",
+ # escapechar="\0",
+ # )
+ # return data
diff --git a/cdm_reader_mapper/mdf_reader/utils/validators.py b/cdm_reader_mapper/mdf_reader/utils/validators.py
index 8787dfbb..7f2fd0d1 100755
--- a/cdm_reader_mapper/mdf_reader/utils/validators.py
+++ b/cdm_reader_mapper/mdf_reader/utils/validators.py
@@ -5,7 +5,7 @@
import logging
import numpy as np
-import pandas as pd
+import polars as pl
from .. import properties
from ..codes import codes
@@ -13,36 +13,24 @@
from .utilities import convert_str_boolean
-def validate_datetime(elements, data):
+def validate_datetime(mask, elements, data: pl.DataFrame):
"""DOCUMENTATION."""
+ for element in elements:
+ col = data.get_column(element)
+ if not col.dtype.is_temporal():
+ mask = mask.with_columns(pl.lit(False).alias(element))
+ continue
- def is_date_object(object):
- if hasattr(object, "year"):
- return True
+ mask = mask.with_columns((pl.col(element) & col.is_not_null()).alias(element))
- mask = pd.DataFrame(index=data.index, data=False, columns=elements)
- mask[elements] = (
- data[elements].apply(np.vectorize(is_date_object)) | data[elements].isna()
- )
return mask
-def validate_numeric(elements, data, schema):
+def validate_numeric(mask: pl.DataFrame, elements, data: pl.DataFrame, schema):
"""DOCUMENTATION."""
-
- # Find thresholds in schema. Flag if not available -> warn
- def _to_numeric(x):
- if x is None:
- return np.nan
- x = convert_str_boolean(x)
- if isinstance(x, bool):
- return x
- return float(x)
-
- data[elements] = data[elements].map(_to_numeric)
- mask = pd.DataFrame(index=data.index, data=False, columns=elements)
- lower = {x: schema.get(x).get("valid_min", -np.inf) for x in elements}
- upper = {x: schema.get(x).get("valid_max", np.inf) for x in elements}
+ # Handle cases where value is explicitly None in the dictionary
+    lower = {x: -np.inf if schema.get(x).get("valid_min") is None else schema.get(x).get("valid_min") for x in elements}
+    upper = {x: np.inf if schema.get(x).get("valid_max") is None else schema.get(x).get("valid_max") for x in elements}
set_elements = [
x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf
@@ -57,26 +45,45 @@ def _to_numeric(x):
logging.warning(
"Corresponding upper and/or lower bounds set to +/-inf for validation"
)
- mask[elements] = (
- (data[elements] >= [lower.get(x) for x in elements])
- & (data[elements] <= [upper.get(x) for x in elements])
- ) | data[elements].isna()
+
+ mask = mask.with_columns(
+ [
+ (
+ pl.col(element)
+ | (
+ data[element].is_not_null()
+                    & data[element].cast(pl.Float64).is_not_nan()
+ & data[element].is_between(
+ lower.get(element), upper.get(element), closed="both"
+ )
+ )
+ ).alias(element)
+ for element in elements
+ ]
+ )
return mask
-def validate_str(elements, data):
+def validate_str(mask, elements, data):
"""DOCUMENTATION."""
- return pd.DataFrame(index=data.index, data=True, columns=elements)
+ return mask
-def validate_codes(elements, data, schema, imodel, ext_table_path):
+def validate_codes(
+ mask: pl.DataFrame,
+ elements: list[str],
+ data: pl.DataFrame,
+ schema,
+ imodel,
+ ext_table_path,
+):
"""DOCUMENTATION."""
- mask = pd.DataFrame(index=data.index, data=False, columns=elements)
for element in elements:
code_table_name = schema.get(element).get("codetable")
if not code_table_name:
logging.error(f"Code table not defined for element {element}")
logging.warning("Element mask set to False")
+ mask = mask.with_columns(pl.lit(False).alias(element))
continue
table = codes.read_table(
@@ -87,19 +94,18 @@ def validate_codes(elements, data, schema, imodel, ext_table_path):
if not table:
continue
- dtype = properties.pandas_dtypes.get(schema.get(element).get("column_type"))
+ dtype = properties.polars_dtypes.get(schema.get(element).get("column_type"))
table_keys = list(table.keys())
- validation_df = data[element]
- value = validation_df.astype(dtype).astype("str")
- valid = validation_df.notna()
- mask_ = value.isin(table_keys)
- mask[element] = mask_.where(valid, True)
+ col = data.get_column(element)
+ col = col.cast(dtype).cast(pl.String)
+        valid = col.is_not_null() & col.is_in(table_keys)
+ mask = mask.with_columns((pl.col(element) | valid).alias(element))
return mask
-def _get_elements(elements, element_atts, key):
+def _get_elements(elements, element_atts, key) -> list[str]:
def _condition(x):
column_types = element_atts.get(x).get("column_type")
if key == "numeric_types":
@@ -125,7 +131,8 @@ def _mask_boolean(x, boolean):
def validate(
- data,
+ data: pl.DataFrame,
+ mask: pl.DataFrame,
imodel,
ext_table_path,
schema,
@@ -159,37 +166,25 @@ def validate(
filename=None,
)
# Check input
- if not isinstance(data, pd.DataFrame): # or not isinstance(mask0, pd.DataFrame):
+ if not isinstance(data, pl.DataFrame) or not isinstance(mask, pl.DataFrame):
# logging.error("Input data and mask must be a pandas data frame object")
logging.error("input data must be a pandas DataFrame.")
return
- mask = pd.DataFrame(index=data.index, columns=data.columns, dtype=object)
- if data.empty:
+ if data.is_empty():
return mask
+ disables = disables or []
+
# Get the data elements from the input data: might be just a subset of
# data model and flatten the schema to get a simple and sequential list
# of elements included in the input data
- elements = [x for x in data if x not in disables]
+ elements = [x for x in data.columns if x not in disables]
element_atts = schemas.df_schema(elements, schema)
- # See what elements we need to validate
+ # 1. Numeric elements
numeric_elements = _get_elements(elements, element_atts, "numeric_types")
- datetime_elements = _get_elements(elements, element_atts, "datetime")
- coded_elements = _get_elements(elements, element_atts, "key")
- str_elements = _get_elements(elements, element_atts, "str")
-
- if _element_tuples(numeric_elements, datetime_elements, coded_elements):
- validated_columns = pd.MultiIndex.from_tuples(
- list(set(numeric_elements + coded_elements + datetime_elements))
- )
- else:
- validated_columns = list(
- set(numeric_elements + coded_elements + datetime_elements)
- )
-
- mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts)
+ mask = validate_numeric(mask, numeric_elements, data, element_atts)
# 2. Table coded elements
# See following: in multiple keys code tables, the non parameter element,
@@ -201,8 +196,10 @@ def validate(
# Get the full list of keys combinations (tuples, triplets...) and check the column combination against that: if it fails, mark the element!
# Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element
# pd.DatetimeIndex(df['_datetime']).year
+ coded_elements = _get_elements(elements, element_atts, "key")
if len(coded_elements) > 0:
- mask[coded_elements] = validate_codes(
+ mask = validate_codes(
+ mask,
coded_elements,
data,
element_atts,
@@ -211,21 +208,12 @@ def validate(
)
# 3. Datetime elements
- mask[datetime_elements] = validate_datetime(datetime_elements, data)
+ datetime_elements = _get_elements(elements, element_atts, "datetime")
+ mask = validate_datetime(mask, datetime_elements, data)
# 4. str elements
- mask[str_elements] = validate_str(str_elements, data)
-
- # 5. Set False values
- mask[validated_columns] = mask[validated_columns].mask(
- data[validated_columns].map(_mask_boolean, boolean=False),
- False,
- )
-
- mask[validated_columns] = mask[validated_columns].mask(
- data[validated_columns].map(_mask_boolean, boolean=True),
- True,
- )
+ str_elements = _get_elements(elements, element_atts, "str")
+ mask = validate_str(mask, str_elements, data)
- mask[disables] = np.nan
+ mask = mask.with_columns([pl.lit(None).alias(disable) for disable in disables])
return mask
diff --git a/cdm_reader_mapper/properties.py b/cdm_reader_mapper/properties.py
index 59668b9f..2ecaef89 100755
--- a/cdm_reader_mapper/properties.py
+++ b/cdm_reader_mapper/properties.py
@@ -2,8 +2,23 @@
from __future__ import annotations
-numeric_types = ["Int64", "int", "float"]
+import polars as pl
-object_types = ["str", "object", "key", "datetime"]
+# numeric_types = ["Int64", "int", "float"]
+numeric_types = [
+ pl.Float64,
+ pl.Float32,
+ pl.Int64,
+ pl.Int32,
+ pl.Int16,
+ pl.Int8,
+ pl.UInt64,
+ pl.UInt32,
+ pl.UInt16,
+ pl.UInt8,
+]
+
+# object_types = ["str", "object", "key", "datetime"]
+object_types = [pl.String, pl.Utf8, pl.Datetime, pl.Object, pl.Categorical]
supported_data_models = ["craid", "gcc", "icoads", "pub47"]
diff --git a/docs/example_notebooks/read_overview.ipynb b/docs/example_notebooks/read_overview.ipynb
index de546ab3..ced54b4b 100755
--- a/docs/example_notebooks/read_overview.ipynb
+++ b/docs/example_notebooks/read_overview.ipynb
@@ -16,7 +16,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2025-01-07 11:44:05,441 - root - INFO - init basic configure of logging success\n"
+ "2025-01-31 09:29:23,553 - root - INFO - init basic configure of logging success\n",
+ "/Users/josidd/git/github/cdm_fork/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
@@ -24,6 +26,7 @@
"from __future__ import annotations\n",
"\n",
"import pandas as pd\n",
+ "import polars as pl\n",
"\n",
"from cdm_reader_mapper import properties, read_mdf, test_data"
]
@@ -51,7 +54,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2025-01-07 11:44:09,196 - root - INFO - Attempting to fetch remote file: icoads/r300/d704/input/icoads_r300_d704_1878-10-01_subset.imma.md5\n"
+ "2025-01-31 09:29:27,576 - root - INFO - Attempting to fetch remote file: icoads/r300/d704/input/icoads_r300_d704_1878-10-01_subset.imma.md5\n"
]
},
{
@@ -130,7 +133,7 @@
"A **schema** file gathers a collection of descriptors that enable the `mdf_reader` tool to access the content\n",
"of a `data model/ schema` and extract the sections of the raw data file that contains meaningful information. These **schema files** are the `bones` of the data model, basically `.json` files outlining the structure of the incoming raw data.\n",
"\n",
- "The `mdf_reader` takes this information and translate the characteristics of the data to a python pandas dataframe.\n",
+ "The `mdf_reader` takes this information and translate the characteristics of the data to a python polars dataframe.\n",
"\n",
"The tool has several **schema** templates build in."
]
@@ -171,19 +174,18 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2025-01-07 11:44:10,310 - root - INFO - READING DATA MODEL SCHEMA FILE...\n",
- "2025-01-07 11:44:10,327 - root - INFO - EXTRACTING DATA FROM MODEL: icoads_r300_d704\n",
- "2025-01-07 11:44:10,327 - root - INFO - Getting data string from source...\n",
- "2025-01-07 11:44:11,139 - root - WARNING - Data numeric elements with missing upper or lower threshold: ('c1', 'BSI'),('c1', 'AQZ'),('c1', 'AQA'),('c1', 'UQZ'),('c1', 'UQA'),('c1', 'VQZ'),('c1', 'VQA'),('c1', 'PQZ'),('c1', 'PQA'),('c1', 'DQZ'),('c1', 'DQA'),('c5', 'OS'),('c5', 'OP'),('c5', 'FM'),('c5', 'IMMV'),('c5', 'IX'),('c5', 'W2'),('c5', 'WMI'),('c5', 'SD2'),('c5', 'SP2'),('c5', 'IS'),('c5', 'RS'),('c5', 'IC1'),('c5', 'IC2'),('c5', 'IC3'),('c5', 'IC4'),('c5', 'IC5'),('c5', 'IR'),('c5', 'RRR'),('c5', 'TR'),('c5', 'NU'),('c5', 'QCI'),('c5', 'QI1'),('c5', 'QI2'),('c5', 'QI3'),('c5', 'QI4'),('c5', 'QI5'),('c5', 'QI6'),('c5', 'QI7'),('c5', 'QI8'),('c5', 'QI9'),('c5', 'QI10'),('c5', 'QI11'),('c5', 'QI12'),('c5', 'QI13'),('c5', 'QI14'),('c5', 'QI15'),('c5', 'QI16'),('c5', 'QI17'),('c5', 'QI18'),('c5', 'QI19'),('c5', 'QI20'),('c5', 'QI21'),('c5', 'QI22'),('c5', 'QI23'),('c5', 'QI24'),('c5', 'QI25'),('c5', 'QI26'),('c5', 'QI27'),('c5', 'QI28'),('c5', 'QI29'),('c5', 'RHI'),('c5', 'AWSI'),('c6', 'FBSRC'),('c6', 'MST'),('c7', 'OPM'),('c7', 'LOT'),('c9', 'CCe'),('c9', 'WWe'),('c9', 'Ne'),('c9', 'NHe'),('c9', 'He'),('c9', 'CLe'),('c9', 'CMe'),('c9', 'CHe'),('c9', 'SBI'),('c95', 'DPRO'),('c95', 'DPRP'),('c95', 'UFR'),('c95', 'ASIR'),('c96', 'ASII'),('c97', 'ASIE'),('c99_journal', 'vessel_length'),('c99_journal', 'vessel_beam'),('c99_journal', 'hold_depth'),('c99_journal', 'tonnage'),('c99_journal', 'baro_height'),('c99_daily', 'year'),('c99_daily', 'month'),('c99_daily', 'day'),('c99_daily', 'distance'),('c99_daily', 'lat_deg_an'),('c99_daily', 'lat_min_an'),('c99_daily', 'lon_deg_an'),('c99_daily', 'lon_min_an'),('c99_daily', 'lat_deg_on'),('c99_daily', 'lat_min_on'),('c99_daily', 'lon_deg_of'),('c99_daily', 'lon_min_of'),('c99_daily', 'current_speed'),('c99_data4', 'year'),('c99_data4', 'month'),('c99_data4', 'day'),('c99_data4', 'hour'),('c99_data4', 'ship_speed'),('c99_data4', 'compass_correction'),('c99_data4', 'attached_thermometer'),('c99_data4', 
'air_temperature'),('c99_data4', 'wet_bulb_temperature'),('c99_data4', 'sea_temperature'),('c99_data4', 'sky_clear'),('c99_data5', 'year'),('c99_data5', 'month'),('c99_data5', 'day'),('c99_data5', 'hour'),('c99_data5', 'ship_speed'),('c99_data5', 'attached_thermometer'),('c99_data5', 'air_temperature'),('c99_data5', 'wet_bulb_temperature'),('c99_data5', 'sea_temperature'),('c99_data5', 'sky_clear'),('c99_data5', 'compass_correction')\n",
- "2025-01-07 11:44:11,139 - root - WARNING - Corresponding upper and/or lower bounds set to +/-inf for validation\n",
- "2025-01-07 11:44:12,409 - root - INFO - CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL\n"
+ "2025-01-31 09:29:27,710 - root - INFO - READING DATA MODEL SCHEMA FILE...\n",
+ "2025-01-31 09:29:27,718 - root - INFO - EXTRACTING DATA FROM MODEL: icoads_r300_d704\n",
+ "2025-01-31 09:29:27,718 - root - INFO - Getting data string from source...\n",
+ "2025-01-31 09:29:27,789 - root - INFO - Extracting and reading sections\n",
+ "2025-01-31 09:29:27,875 - root - INFO - Create output DataBundle object\n"
]
}
],
"source": [
"schema = \"icoads_r300_d704\"\n",
"\n",
- "data = read_mdf(data_path, imodel=schema)"
+ "data = read_mdf(data_path, imodel=schema, validate=False)"
]
},
{
@@ -215,7 +217,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now metadata information can be extracted as a component of the padas dataframe."
+ "The full output `polars.DataFrame` can be accessed:"
]
},
{
@@ -226,196 +228,31 @@
{
"data": {
"text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " sentinal | \n",
- " reel_no | \n",
- " journal_no | \n",
- " frame_no | \n",
- " ship_name | \n",
- " journal_ed | \n",
- " rig | \n",
- " ship_material | \n",
- " vessel_type | \n",
- " vessel_length | \n",
- " ... | \n",
- " hold_depth | \n",
- " tonnage | \n",
- " baro_type | \n",
- " baro_height | \n",
- " baro_cdate | \n",
- " baro_loc | \n",
- " baro_units | \n",
- " baro_cor | \n",
- " thermo_mount | \n",
- " SST_I | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " 002 | \n",
- " 0018 | \n",
- " 0003 | \n",
- " Panay | \n",
- " 78 | \n",
- " 01 | \n",
- " 1 | \n",
- " 1 | \n",
- " 187 | \n",
- " ... | \n",
- " 23 | \n",
- " 1190 | \n",
- " 2 | \n",
- " 14 | \n",
- " NaN | \n",
- " Bulkhead of cabin | \n",
- " 1 | \n",
- " - .102 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " 002 | \n",
- " 0018 | \n",
- " 0003 | \n",
- " Panay | \n",
- " 78 | \n",
- " 01 | \n",
- " 1 | \n",
- " 1 | \n",
- " 187 | \n",
- " ... | \n",
- " 23 | \n",
- " 1190 | \n",
- " 2 | \n",
- " 14 | \n",
- " NaN | \n",
- " Bulkhead of cabin | \n",
- " 1 | \n",
- " - .102 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1 | \n",
- " 002 | \n",
- " 0018 | \n",
- " 0003 | \n",
- " Panay | \n",
- " 78 | \n",
- " 01 | \n",
- " 1 | \n",
- " 1 | \n",
- " 187 | \n",
- " ... | \n",
- " 23 | \n",
- " 1190 | \n",
- " 2 | \n",
- " 14 | \n",
- " NaN | \n",
- " Bulkhead of cabin | \n",
- " 1 | \n",
- " - .102 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1 | \n",
- " 002 | \n",
- " 0018 | \n",
- " 0003 | \n",
- " Panay | \n",
- " 78 | \n",
- " 01 | \n",
- " 1 | \n",
- " 1 | \n",
- " 187 | \n",
- " ... | \n",
- " 23 | \n",
- " 1190 | \n",
- " 2 | \n",
- " 14 | \n",
- " NaN | \n",
- " Bulkhead of cabin | \n",
- " 1 | \n",
- " - .102 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1 | \n",
- " 002 | \n",
- " 0018 | \n",
- " 0003 | \n",
- " Panay | \n",
- " 78 | \n",
- " 01 | \n",
- " 1 | \n",
- " 1 | \n",
- " 187 | \n",
- " ... | \n",
- " 23 | \n",
- " 1190 | \n",
- " 2 | \n",
- " 14 | \n",
- " NaN | \n",
- " Bulkhead of cabin | \n",
- " 1 | \n",
- " - .102 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 24 columns
\n",
- "
"
+ "shape: (5, 418)| index | _core_missing | core:YR | core:MO | core:DY | core:HR | core:LAT | core:LON | core:IM | core:ATTC | core:TI | core:LI | core:DS | core:VS | core:NID | core:II | core:ID | core:C1 | core:DI | core:D | core:WI | core:W | core:VI | core:VV | core:WW | core:W1 | core:SLP | core:A | core:PPP | core:IT | core:AT | core:WBTI | core:WBT | core:DPTI | core:DPT | core:SI | core:SST | … | c99_data4:sea_temperature | c99_data4:present_weather | c99_data4:clouds | c99_data4:sky_clear | c99_data4:sea_state | _c99_data5_missing | c99_data5:sentinal | c99_data5:reel_no | c99_data5:journal_no | c99_data5:frame_start | c99_data5:frame | c99_data5:year | c99_data5:month | c99_data5:day | c99_data5:time_ind | c99_data5:hour | c99_data5:ship_speed | c99_data5:compass_ind | c99_data5:ship_course_compass | c99_data5:blank | c99_data5:ship_course_true | c99_data5:wind_dir_mag | c99_data5:wind_dir_true | c99_data5:wind_force | c99_data5:barometer | c99_data5:temp_ind | c99_data5:attached_thermometer | c99_data5:air_temperature | c99_data5:wet_bulb_temperature | c99_data5:sea_temperature | c99_data5:present_weather | c99_data5:clouds | c99_data5:sky_clear | c99_data5:sea_state | c99_data5:compass_correction_ind | c99_data5:compass_correction | c99_data5:compass_correction_dir |
|---|
| u32 | bool | i64 | i64 | i64 | f64 | f64 | f64 | str | i64 | str | str | str | str | str | str | str | str | str | i64 | str | f64 | str | str | str | str | f64 | str | f64 | str | f64 | str | f64 | str | f64 | str | f64 | … | f64 | str | str | i64 | str | bool | str | str | str | str | str | i64 | i64 | i64 | str | i64 | f64 | str | str | str | str | str | str | str | str | str | f64 | f64 | f64 | f64 | str | str | i64 | str | str | f64 | str |
| 0 | false | 1878 | 10 | 20 | 6.0 | 42.28 | 291.59 | "1" | 3 | "0" | "6" | "2" | "3" | null | "10" | "Panay" | null | "1" | 232 | "5" | 12.3 | null | null | null | null | 996.1 | null | null | null | null | null | null | null | null | null | null | … | null | "BOC" | "CU" | 5 | "R" | true | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| 1 | false | 1878 | 10 | 20 | 8.0 | 42.31 | 291.97 | "1" | 3 | "0" | "6" | "2" | "3" | null | "10" | "Panay" | null | "1" | 232 | "5" | 12.3 | null | null | null | null | 996.3 | null | null | null | null | null | null | null | null | null | null | … | null | "BOC" | "SC" | 3 | "R" | true | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| 2 | false | 1878 | 10 | 20 | 10.0 | 42.33 | 292.36 | "1" | 3 | "0" | "6" | "2" | "3" | null | "10" | "Panay" | null | "1" | 254 | "5" | 12.3 | null | null | null | null | 996.9 | null | null | "7" | 8.9 | null | null | null | null | "1" | 11.1 | … | 5.2 | "OCG" | "SC" | 0 | "R" | true | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| 3 | false | 1878 | 10 | 20 | 12.0 | 42.35 | 292.71 | "1" | 3 | "0" | "6" | "2" | "3" | null | "10" | "Panay" | null | "1" | 254 | "5" | 12.3 | null | null | null | null | 997.6 | null | null | "7" | 8.9 | null | null | null | null | "1" | 11.1 | … | 5.2 | "CG" | "SC" | 0 | "R" | true | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| 4 | false | 1878 | 10 | 20 | 14.0 | 42.37 | 293.1 | "1" | 3 | "0" | "6" | "2" | "3" | null | "10" | "Panay" | null | "1" | 254 | "5" | 12.3 | null | null | null | null | 999.2 | null | null | "7" | 8.9 | null | null | null | null | "1" | 10.0 | … | 5.0 | "BC" | "SC" | 2 | "L" | true | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
"
],
"text/plain": [
- " sentinal reel_no journal_no frame_no ship_name journal_ed rig ship_material \\\n",
- "0 1 002 0018 0003 Panay 78 01 1 \n",
- "1 1 002 0018 0003 Panay 78 01 1 \n",
- "2 1 002 0018 0003 Panay 78 01 1 \n",
- "3 1 002 0018 0003 Panay 78 01 1 \n",
- "4 1 002 0018 0003 Panay 78 01 1 \n",
- "\n",
- " vessel_type vessel_length ... hold_depth tonnage baro_type baro_height \\\n",
- "0 1 187 ... 23 1190 2 14 \n",
- "1 1 187 ... 23 1190 2 14 \n",
- "2 1 187 ... 23 1190 2 14 \n",
- "3 1 187 ... 23 1190 2 14 \n",
- "4 1 187 ... 23 1190 2 14 \n",
- "\n",
- " baro_cdate baro_loc baro_units baro_cor thermo_mount SST_I \n",
- "0 NaN Bulkhead of cabin 1 - .102 2 NaN \n",
- "1 NaN Bulkhead of cabin 1 - .102 2 NaN \n",
- "2 NaN Bulkhead of cabin 1 - .102 2 NaN \n",
- "3 NaN Bulkhead of cabin 1 - .102 2 NaN \n",
- "4 NaN Bulkhead of cabin 1 - .102 2 NaN \n",
- "\n",
- "[5 rows x 24 columns]"
+ "shape: (5, 418)\n",
+ "┌───────┬─────────────┬─────────┬─────────┬───┬─────────────┬────────────┬────────────┬────────────┐\n",
+ "│ index ┆ _core_missi ┆ core:YR ┆ core:MO ┆ … ┆ c99_data5:s ┆ c99_data5: ┆ c99_data5: ┆ c99_data5: │\n",
+ "│ --- ┆ ng ┆ --- ┆ --- ┆ ┆ ea_state ┆ compass_co ┆ compass_co ┆ compass_co │\n",
+ "│ u32 ┆ --- ┆ i64 ┆ i64 ┆ ┆ --- ┆ rrection_i ┆ rrection ┆ rrection_d │\n",
+ "│ ┆ bool ┆ ┆ ┆ ┆ str ┆ … ┆ --- ┆ … │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ f64 ┆ --- │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ str │\n",
+ "╞═══════╪═════════════╪═════════╪═════════╪═══╪═════════════╪════════════╪════════════╪════════════╡\n",
+ "│ 0 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
+ "│ 1 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
+ "│ 2 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
+ "│ 3 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
+ "│ 4 ┆ false ┆ 1878 ┆ 10 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
+ "└───────┴─────────────┴─────────┴─────────┴───┴─────────────┴────────────┴────────────┴────────────┘"
]
},
"execution_count": 5,
@@ -424,29 +261,72 @@
}
],
"source": [
- "data.data.c99_journal"
+ "data.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "To learn how to construct a schema or data model for a particular deck/source, visit this other [tutorial notebook](https://github.com/glamod/cdm_reader_mapper/blob/main/docs/example_notebooks/CLIWOC_datamodel.ipynb)"
+ "Now metadata information can be extracted as a component of the polars dataframe."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 24)| c99_journal:sentinal | c99_journal:reel_no | c99_journal:journal_no | c99_journal:frame_no | c99_journal:ship_name | c99_journal:journal_ed | c99_journal:rig | c99_journal:ship_material | c99_journal:vessel_type | c99_journal:vessel_length | c99_journal:vessel_beam | c99_journal:commander | c99_journal:country | c99_journal:screw_paddle | c99_journal:hold_depth | c99_journal:tonnage | c99_journal:baro_type | c99_journal:baro_height | c99_journal:baro_cdate | c99_journal:baro_loc | c99_journal:baro_units | c99_journal:baro_cor | c99_journal:thermo_mount | c99_journal:SST_I |
|---|
| str | str | str | str | str | str | str | str | str | i64 | i64 | str | str | str | i64 | i64 | str | i64 | str | str | str | str | str | str |
| "1" | "002" | "0018" | "0003" | "Panay" | "78" | "01" | "1" | "1" | 187 | 37 | "S.P.Bray,Jr" | "01" | "3" | 23 | 1190 | "2" | 14 | null | "Bulkhead of cabin" | "1" | "- .102" | "2" | null |
| "1" | "002" | "0018" | "0003" | "Panay" | "78" | "01" | "1" | "1" | 187 | 37 | "S.P.Bray,Jr" | "01" | "3" | 23 | 1190 | "2" | 14 | null | "Bulkhead of cabin" | "1" | "- .102" | "2" | null |
| "1" | "002" | "0018" | "0003" | "Panay" | "78" | "01" | "1" | "1" | 187 | 37 | "S.P.Bray,Jr" | "01" | "3" | 23 | 1190 | "2" | 14 | null | "Bulkhead of cabin" | "1" | "- .102" | "2" | null |
| "1" | "002" | "0018" | "0003" | "Panay" | "78" | "01" | "1" | "1" | 187 | 37 | "S.P.Bray,Jr" | "01" | "3" | 23 | 1190 | "2" | 14 | null | "Bulkhead of cabin" | "1" | "- .102" | "2" | null |
| "1" | "002" | "0018" | "0003" | "Panay" | "78" | "01" | "1" | "1" | 187 | 37 | "S.P.Bray,Jr" | "01" | "3" | 23 | 1190 | "2" | 14 | null | "Bulkhead of cabin" | "1" | "- .102" | "2" | null |
"
+ ],
+ "text/plain": [
+ "shape: (5, 24)\n",
+ "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
+ "│ c99_journ ┆ c99_journ ┆ c99_journ ┆ c99_journ ┆ … ┆ c99_journ ┆ c99_journ ┆ c99_journ ┆ c99_jour │\n",
+ "│ al:sentin ┆ al:reel_n ┆ al:journa ┆ al:frame_ ┆ ┆ al:baro_u ┆ al:baro_c ┆ al:thermo ┆ nal:SST_ │\n",
+ "│ al ┆ o ┆ l_no ┆ no ┆ ┆ nits ┆ or ┆ _mount ┆ I │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n",
+ "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
+ "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n",
+ "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n",
+ "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n",
+ "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n",
+ "│ 1 ┆ 002 ┆ 0018 ┆ 0003 ┆ … ┆ 1 ┆ - .102 ┆ 2 ┆ null │\n",
+ "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.data.select(pl.col(\"^c99_journal:.*$\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To learn how to construct a schema or data model for a particular deck/source, visit this other [tutorial notebook](https://github.com/glamod/cdm_reader_mapper/blob/main/docs/example_notebooks/CLIWOC_datamodel.ipynb)"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "cdm",
"language": "python",
- "name": "python3"
+ "name": "cdm"
},
"language_info": {
"codemirror_mode": {
@@ -458,7 +338,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.3"
+ "version": "3.12.8"
}
},
"nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 35532d0e..730b639a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
"numba >=0.60.0",
"pandas>=2.2.0",
"platformdirs >4.0.0",
+ "polars >= 1.26",
"pyarrow >=15.0.0",
"python-slugify",
"recordlinkage >= 0.15",
@@ -225,7 +226,9 @@ target-version = "py312"
exclude = [
".git",
"build",
- ".eggs"
+ ".eggs",
+ ".venv",
+ "venv"
]
extend-include = [
"*.ipynb" # Include notebooks
@@ -261,6 +264,7 @@ check-typed-exception = true
"matplotlib.pyplot" = "plt"
numpy = "np"
pandas = "pd"
+polars = "pl"
scipy = "sp"
xarray = "xr"