Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
1c0a2e8
deps: add polars
jtsiddons Jan 27, 2025
90aedfb
refactor!: add functions for reading sections using polars
jtsiddons Jan 27, 2025
9cca765
chore: remove print
jtsiddons Jan 28, 2025
8bc58b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
6ee5757
deps: add polars to ci requirements/environment files
jtsiddons Jan 28, 2025
9c98891
fix: typo in log message
jtsiddons Jan 28, 2025
f286232
refactor: missing values are only assigned for missing sections
jtsiddons Jan 28, 2025
c129c9b
ignore: add uv.lock
jtsiddons Jan 28, 2025
58bb58d
refactor: return polars Frame after reading netCDF
jtsiddons Jan 29, 2025
3f7f5a5
fix: use ":" as delimiter in column names
jtsiddons Jan 29, 2025
9263d4e
refactor: use polars operations. Remove chunksize option for polars.
jtsiddons Jan 29, 2025
e8b1219
refactor: set_missing_values to polars
jtsiddons Jan 29, 2025
61a5216
feat: add polars dtypes
jtsiddons Jan 29, 2025
476e6c4
refactor: decoders into polars
jtsiddons Jan 29, 2025
f8a78b7
refactor: converters to polars
jtsiddons Jan 29, 2025
0ee0297
refactor: remove chunk looping from convert_and_decode_entries
jtsiddons Jan 29, 2025
2eca5d0
refactor: update properties to reflect polars, minor corrections
jtsiddons Jan 30, 2025
3d12a12
docs: update and run example notebook
jtsiddons Jan 30, 2025
bfa2bbd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 30, 2025
25cd538
fix: simplify convert_dtype_to_default
jtsiddons Jan 30, 2025
8df5370
fix: replace todo comment with note
jtsiddons Jan 30, 2025
a96ef2b
fix: ensure output polars frame has index when reading sections from …
jtsiddons Jan 30, 2025
b1e7f5c
chore: ruff linter fixes
jtsiddons Jan 30, 2025
1f65f8c
fix: get fields after checking if disable_read is True
jtsiddons Jan 30, 2025
1854b58
tool: add ruff formatting settings to match black
jtsiddons Jan 31, 2025
7464d03
opt: use str.head/str.tail rather than str.slice
jtsiddons Jan 31, 2025
c135c9d
Merge branch 'main' into polarising
jtsiddons Jan 31, 2025
9af7950
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2025
d662b84
chore: remove unused function
jtsiddons Jan 31, 2025
fc6e37a
Merge branch 'main' into polarising
jtsiddons Jan 31, 2025
a7b4a87
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2025
ddfaa8b
Merge branch 'main' into polarising
ludwiglierhammer Feb 12, 2025
b1219c7
Merge branch 'main' into polarising
jtsiddons Mar 27, 2025
3b18679
refactor: use pandas to read, open_with -> format with 'text', 'netcd…
jtsiddons Mar 27, 2025
fb75779
refactor: Configurator open_ methods now return two polars Frames. Re…
jtsiddons Mar 27, 2025
60e45e0
refactor: perform all read steps in one loop rather than repeatedly s…
jtsiddons Mar 27, 2025
d84e119
fix: remove duplicate "widths" argument being passed to _read_text me…
jtsiddons Mar 27, 2025
b738f2c
fix: set column name for full-string read by read_text
jtsiddons Mar 27, 2025
97e60fd
fix: correct call to get field name from _get_index
jtsiddons Mar 28, 2025
b59ccd5
fix: cast binary to string
jtsiddons Mar 28, 2025
219f75a
fix: correct column name indexing and naming for mask
jtsiddons Mar 28, 2025
e3d450c
fix: drop section from data if delimited
jtsiddons Mar 28, 2025
e38b9a1
fix: add row index at return
jtsiddons Mar 28, 2025
536406c
opt: use tail rather than slice
jtsiddons Mar 28, 2025
d905c23
fix: don't convert to polars in read_loop
jtsiddons Mar 28, 2025
2b077c5
fix: following polars method column name
jtsiddons Mar 28, 2025
e849ff3
fix: don't add index to data and mask polars frames
jtsiddons Mar 28, 2025
90fd5d9
refactor(validators)!: polarise validators, pass mask as first argument
jtsiddons Mar 28, 2025
4973677
chore: remove debug print statement
jtsiddons Mar 28, 2025
56ab3fe
fix: reduce complexity, handle explicit None in schema for numeric bo…
jtsiddons Mar 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,6 @@ ENV/
# IDE settings
.vscode/
.idea/

# UV
uv.lock
21 changes: 12 additions & 9 deletions cdm_reader_mapper/mdf_reader/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import polars as pl

from ..properties import numeric_types, object_types, supported_data_models # noqa

_base = "cdm_reader_mapper.mdf_reader"
Expand All @@ -16,22 +18,23 @@
"craid": ("drifter_measurements", "JULD"),
}

pandas_dtypes = {}
polars_dtypes = {}
for dtype in object_types:
pandas_dtypes[dtype] = "object"
pandas_dtypes.update({x: x for x in numeric_types})
pandas_dtypes["datetime"] = "datetime"
polars_dtypes[dtype] = pl.String
polars_dtypes.update({x: x for x in numeric_types})
polars_dtypes[pl.Datetime] = pl.Datetime

pandas_int = "Int64"
polars_int = pl.Int64

# ....and how they are managed
data_type_conversion_args = {}
for dtype in numeric_types:
data_type_conversion_args[dtype] = ["scale", "offset"]
data_type_conversion_args["str"] = ["disable_white_strip"]
data_type_conversion_args["object"] = ["disable_white_strip"]
data_type_conversion_args["key"] = ["disable_white_strip"]
data_type_conversion_args["datetime"] = ["datetime_format"]
data_type_conversion_args[pl.Utf8] = ["disable_white_strip"]
data_type_conversion_args[pl.String] = ["disable_white_strip"]
data_type_conversion_args[pl.Categorical] = ["disable_white_strip"]
data_type_conversion_args[pl.Object] = ["disable_white_strip"]
data_type_conversion_args[pl.Datetime] = ["datetime_format"]

# Misc ------------------------------------------------------------------------
dummy_level = "_SECTION_"
Expand Down
186 changes: 106 additions & 80 deletions cdm_reader_mapper/mdf_reader/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import csv
import logging
import os
from io import StringIO as StringIO
from io import StringIO

import pandas as pd
import polars as pl
import xarray as xr

from cdm_reader_mapper.common.json_dict import open_json_file
from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy
Expand Down Expand Up @@ -45,24 +47,26 @@ def _convert_and_decode(
converter_kwargs,
decoder_dict,
):
"""DOCUMENTATION."""
for section in converter_dict.keys():
if section not in df.columns:
continue
if section in decoder_dict.keys():
decoded = decoder_dict[section](df[section])
decoded.index = df[section].index
df[section] = decoded
df = df.with_columns(decoded.alias(section))

converted = converter_dict[section](
df[section], **converter_kwargs[section]
)
converted.index = df[section].index
df[section] = converted
# converted.index = df[section].index
df = df.with_columns(converted.alias(section))
return df

def _validate(self, df):
def _validate(self, df, mask) -> pl.DataFrame:
"""DOCUMENTATION."""
return validate(
data=df,
mask=mask,
imodel=self.imodel,
ext_table_path=self.ext_table_path,
schema=self.schema,
Expand Down Expand Up @@ -115,78 +119,31 @@ def convert_and_decode_entries(
if decode is not True:
decoder_dict = {}

if isinstance(data, pd.DataFrame):
data = self._convert_and_decode(
data,
converter_dict,
converter_kwargs,
decoder_dict,
)
else:
data_buffer = StringIO()
TextParser = make_copy(data)
for i, df_ in enumerate(TextParser):
df = self._convert_and_decode(
df_,
converter_dict,
converter_kwargs,
decoder_dict,
)
df.to_csv(
data_buffer,
header=False,
mode="a",
encoding=self.encoding,
index=False,
quoting=csv.QUOTE_NONE,
sep=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)

data_buffer.seek(0)
data = pd.read_csv(
data_buffer,
names=df.columns,
chunksize=self.chunksize,
dtype=object,
delimiter=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)
data = self._convert_and_decode(
data,
converter_dict,
converter_kwargs,
decoder_dict,
)
return data

def validate_entries(self, data, validate):
def validate_entries(self, data, mask, validate) -> pl.DataFrame:
"""Validate data entries by using a pre-defined data model.

Fill attribute `valid` with boolean mask.
"""
if validate is not True:
mask = pd.DataFrame()
elif isinstance(data, pd.DataFrame):
mask = self._validate(data)
mask = pl.DataFrame()
elif isinstance(data, pl.DataFrame):
mask = self._validate(data, mask)
else:
data_buffer = StringIO()
TextParser_ = make_copy(data)
for i, df_ in enumerate(TextParser_):
mask_ = self._validate(df_)
mask_.to_csv(
data_buffer,
header=False,
mode="a",
encoding=self.encoding,
index=False,
)
data_buffer.seek(0)
mask = pd.read_csv(
data_buffer,
names=df_.columns,
chunksize=self.chunksize,
)
raise TypeError("Unknown data type")
return mask

def remove_boolean_values(self, data):
"""DOCUMENTATION"""
if isinstance(data, pl.DataFrame):
return data
if isinstance(data, pd.DataFrame):
data = data.map(remove_boolean_values)
dtype = adjust_dtype(self.dtypes, data)
Expand Down Expand Up @@ -273,6 +230,7 @@ def read(

self.chunksize = chunksize
self.skiprows = skiprows
self.format = properties.open_file.get(self.imodel, "text")

# 2. READ AND VALIDATE DATA
logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}")
Expand All @@ -287,26 +245,71 @@ def read(
# a list with a single dataframe or a pd.io.parsers.TextFileReader
logging.info("Getting data string from source...")
self.configurations = self.get_configurations(read_sections_list, sections)
data = self.open_data(
read_sections_list,
sections,
# INFO: Set default as "pandas" to account for custom schema
open_with=properties.open_file.get(self.imodel, "pandas"),

TextParser = self.open_data(
chunksize=chunksize,
format=self.format,
)

# 2.3. Extract, read and validate data in same loop
logging.info("Extracting and reading sections")
data = self.convert_and_decode_entries(
data,
convert=convert,
decode=decode,
)
mask = self.validate_entries(data, validate)
if isinstance(TextParser, (pd.DataFrame, xr.Dataset)):
data, mask = self._read_loop(
TextParser, read_sections_list, sections, decode, convert, validate
)
else:
data_buffer = StringIO()
mask_buffer = StringIO()
for df_ in TextParser:
df, mask_ = self._read_loop(
df_, read_sections_list, sections, decode, convert, validate
)
df.to_csv(
data_buffer,
header=False,
mode="a",
encoding="utf-8",
index=False,
quoting=csv.QUOTE_NONE,
sep=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)
mask_.to_csv(
mask_buffer,
header=False,
mode="a",
encoding="utf-8",
index=False,
quoting=csv.QUOTE_NONE,
sep=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)
data_buffer.seek(0)
data = pd.read_csv(
data_buffer,
names=self.columns,
chunksize=self.chunksize,
dtype=object,
parse_dates=self.parse_dates,
delimiter=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)
mask_buffer.seek(0)
mask = pd.read_csv(
mask_buffer,
names=self.columns,
chunksize=self.chunksize,
dtype=object,
parse_dates=self.parse_dates,
delimiter=properties.internal_delimiter,
quotechar="\0",
escapechar="\0",
)

# 3. Create output DataBundle object
logging.info("Creata output DataBundle object")
data = self.remove_boolean_values(data)
logging.info("Create output DataBundle object")
return DataBundle(
data=data,
columns=self.columns,
Expand All @@ -317,6 +320,29 @@ def read(
imodel=self.imodel,
)

def _read_loop(
    self, TextParser, order, valid, decode, convert, validate
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read, decode/convert and validate one block of input data.

    The three steps are run back to back on the same input so that a
    chunked source only has to be iterated once by the caller.

    Parameters
    ----------
    TextParser
        Input handed unchanged to ``self._read_sections``. The caller in
        ``read`` passes either the whole opened object or a single chunk
        obtained by iterating over it.
    order
        Passed to ``self._read_sections`` as its second argument (the
        caller supplies the list of sections to read).
    valid
        Passed to ``self._read_sections`` as its third argument (the
        caller supplies the requested sections).
    decode
        Forwarded to ``self.convert_and_decode_entries`` as ``decode``.
    convert
        Forwarded to ``self.convert_and_decode_entries`` as ``convert``.
    validate
        Forwarded to ``self.validate_entries``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(data, mask)`` converted with ``to_pandas()``; every column name
        is replaced by the tuple obtained from splitting it on ``":"``.
    """
    logging.info("Extracting and reading sections")
    data, mask = self._read_sections(TextParser, order, valid, format=self.format)

    logging.info("Decoding and converting entries")
    data = self.convert_and_decode_entries(
        data,
        convert=convert,
        decode=decode,
    )

    # This step used to log the extraction message a second time, which made
    # the log claim extraction ran twice and hid the validation step.
    logging.info("Validating entries")
    mask = self.validate_entries(data, mask, validate)

    # Column names use ":" as the section/field delimiter; the pandas output
    # is keyed by tuples instead. Names without ":" become 1-tuples.
    # NOTE(review): the mapping is built from data.columns only and applied
    # to the mask as well - assumes both frames share column names; confirm.
    renames = {c: tuple(c.split(":")) for c in data.columns}

    return (
        data.to_pandas().rename(columns=renames),
        mask.to_pandas().rename(columns=renames),
    )


def read_mdf(
source,
Expand Down
Loading
Loading