From 3f13baacd3b708c38abddd72188d247bca6aeea3 Mon Sep 17 00:00:00 2001
From: Charvee Saraiya
Date: Tue, 28 Oct 2025 16:45:05 -0700
Subject: [PATCH] Add standardize() transform and tests; flake8 clean

---
 petl/__init__.py              |   1 +
 petl/transform/standardize.py | 183 ++++++++++++++++++++++++++++++++++
 tests/test_standardize.py     | 109 ++++++++++++++++++++
 3 files changed, 293 insertions(+)
 create mode 100644 petl/transform/standardize.py
 create mode 100644 tests/test_standardize.py

diff --git a/petl/__init__.py b/petl/__init__.py
index 36ac6344..08f153a9 100644
--- a/petl/__init__.py
+++ b/petl/__init__.py
@@ -13,3 +13,4 @@
 from petl import config
 from petl import errors
 from petl.errors import *
+from petl.transform.standardize import standardize
diff --git a/petl/transform/standardize.py b/petl/transform/standardize.py
new file mode 100644
index 00000000..adddac11
--- /dev/null
+++ b/petl/transform/standardize.py
@@ -0,0 +1,183 @@
+# petl/transform/standardize.py
+from __future__ import annotations
+from typing import (
+    Any, Callable, Iterable, Iterator, List, Optional, Sequence, Tuple, Union,
+)
+import math
+
+
+Row = Sequence[Any]
+Table = Iterable[Row]
+Field = Union[str, int]
+Fields = Union[Field, Sequence[Field]]
+
+
+def _as_list(x: Fields) -> List[Field]:
+    """Wrap a single field in a list; copy a list/tuple of fields."""
+    if isinstance(x, (list, tuple)):
+        return list(x)
+    return [x]
+
+
+def _is_number(x: Any) -> bool:
+    """Return True for finite int/float values (bool, NaN, inf excluded)."""
+    if isinstance(x, bool) or not isinstance(x, (int, float)):
+        return False
+    # A single NaN/inf would turn the column mean and std into NaN, so
+    # non-finite floats are treated like any other non-numeric cell.
+    return isinstance(x, int) or math.isfinite(x)
+
+
+def _mean_std(values: List[float], ddof: int = 0) -> Tuple[float, float]:
+    """Return (mean, std) of `values`, or (0.0, 0.0) when it is empty."""
+    if not values:
+        return 0.0, 0.0
+    m = math.fsum(values) / len(values)
+    # Clamp the divisor to 1 so ddof >= len(values) cannot divide by zero
+    # or yield a negative variance.
+    var = math.fsum((v - m) ** 2 for v in values) / max(1, len(values) - ddof)
+    return m, math.sqrt(var)
+
+
+class _StandardizedView:
+    """Re-iterable result: each iter() call replays the output rows."""
+
+    def __init__(self, rowsource: Callable[[], Iterator[Row]]) -> None:
+        self._rowsource = rowsource
+
+    def __iter__(self) -> Iterator[Row]:
+        return self._rowsource()
+
+
+def standardize(
+    table: Table,
+    fields: Fields,
+    newfields: Optional[Fields] = None,
+    ddof: int = 0,
+) -> Table:
+    """
+    Return a new table with the given numeric field(s) standardized to
+    mean 0 and std 1.
+
+    Parameters
+    ----------
+    table : PETL-style table (first row is header)
+    fields : str | int | sequence
+        Column name(s) or integer column index(es) to standardize.
+    newfields : Optional[str | sequence]
+        If provided, write standardized values into these new columns
+        (same length as `fields`). Otherwise, overwrite the original columns.
+    ddof : int
+        Delta degrees of freedom for std. Use 0 for population (default),
+        1 for sample.
+
+    Raises
+    ------
+    ValueError
+        If `newfields` is given but differs in length from `fields`.
+    KeyError
+        If a field name is not found in the header.
+    IndexError
+        If an integer field index is outside the header.
+
+    Notes
+    -----
+    - The input is read fully when `standardize` is called, because the
+      statistics need a complete pass before any row can be emitted.
+    - The result can be iterated more than once.
+    - Non-numeric, None, NaN and infinite values are passed through
+      unchanged and do not contribute to the mean/std.
+    - If std is 0 (constant column), standardized values are 0.0.
+    """
+    fields_list = _as_list(fields)
+    if newfields is not None:
+        newfields_list = _as_list(newfields)
+        if len(newfields_list) != len(fields_list):
+            raise ValueError("newfields must be the same length as fields")
+    else:
+        newfields_list = None
+
+    # Materialize the table so we can compute stats and then yield again.
+    rows = list(table)
+    if not rows:
+        return rows  # empty table
+
+    header = list(rows[0])
+    # Map field names to indices
+    name_to_idx = {name: i for i, name in enumerate(header)}
+
+    idxs: List[int] = []
+    for f in fields_list:
+        if isinstance(f, int):
+            # Reject bad indices up front instead of failing part-way
+            # through the data rows (or never, for a header-only table).
+            if not -len(header) <= f < len(header):
+                raise IndexError(f"field index {f!r} out of range")
+            idxs.append(f)
+        else:
+            if f not in name_to_idx:
+                raise KeyError(f"field {f!r} not found in header")
+            idxs.append(name_to_idx[f])
+
+    # Compute mean/std for each selected field
+    col_stats: List[Tuple[float, float]] = []
+    data_rows = rows[1:]
+    for ci in idxs:
+        nums = [float(r[ci]) for r in data_rows if _is_number(r[ci])]
+        m, s = _mean_std(nums, ddof=ddof)
+        col_stats.append((m, s))
+
+    # If adding new fields, extend header
+    if newfields_list:
+        header_out = header + list(newfields_list)
+    else:
+        header_out = header
+
+    def _iter() -> Iterator[Row]:
+        # yield header
+        yield tuple(header_out)
+
+        for r in data_rows:
+            r_list = list(r)
+
+            # compute standardized values for each selected column
+            zvals: List[Any] = []
+            for (ci, (m, s)) in zip(idxs, col_stats):
+                val = r_list[ci]
+                if _is_number(val):
+                    if s == 0 or math.isnan(s):
+                        z = 0.0
+                    else:
+                        z = (float(val) - m) / s
+                    zvals.append(z)
+                else:
+                    zvals.append(val)  # keep as-is (e.g., None or text)
+
+            if newfields_list:
+                # append new z-values at the end
+                yield tuple(r_list + zvals)
+            else:
+                # overwrite original fields in-place
+                for offset, ci in enumerate(idxs):
+                    r_list[ci] = zvals[offset]
+                yield tuple(r_list)
+
+    # A bare generator would be exhausted after one pass; wrap the generator
+    # function so that every iteration of the result starts afresh.
+    return _StandardizedView(_iter)
+
+
+if __name__ == "__main__":
+    # Example usage
+    import petl as etl
+
+    table = [
+        ('id', 'score'),
+        (1, 50),
+        (2, 60),
+        (3, 70),
+    ]
+
+    result = etl.standardize(table, 'score')
+    for row in result:
+        print(row)
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
new file mode 100644
index 00000000..3334861c
--- /dev/null
+++ b/tests/test_standardize.py
@@ -0,0 +1,109 @@
+# tests/test_standardize.py
+import math
+
+from petl import standardize
+
+
+def approx_equal(a, b, tol=1e-9):
+    return abs(a - b) <= tol
+
+
+def test_standardize_overwrite_single_column():
+    table = [
+        ('id', 'x', 'y'),
+        (1, 10.0, 100),
+        (2, 20.0, 200),
+        (3, 30.0, 300),
+    ]
+
+    out = list(standardize(table, 'x'))  # overwrite x
+    header, rows = out[0], out[1:]
+    assert header == ('id', 'x', 'y')
+
+    # standardized x should have mean ~0 and std ~1 (population, ddof=0)
+    xs = [r[1] for r in rows]
+    m = sum(xs) / len(xs)
+    var = sum((v - m) ** 2 for v in xs) / len(xs)
+    assert approx_equal(m, 0.0, 1e-12)
+    assert approx_equal(var, 1.0, 1e-12)
+
+
+def test_standardize_write_new_column_multiple_fields():
+    table = [
+        ('id', 'a', 'b'),
+        (1, 5, 10.0),
+        (2, 6, 20.0),
+        (3, 7, 30.0),
+        (4, 8, None),
+    ]
+
+    out = list(
+        standardize(table, fields=['a', 'b'], newfields=['a_z', 'b_z'], ddof=1)
+    )
+    header, rows = out[0], out[1:]
+    assert header == ('id', 'a', 'b', 'a_z', 'b_z')
+
+    # Check lengths and None handling
+    assert len(rows) == 4
+    assert rows[-1][3] is not None  # a_z exists
+    assert rows[-1][4] is None  # b was None, so b_z stays None
+
+
+def test_standardize_constant_column_returns_zeroes():
+    table = [
+        ('id', 'c'),
+        (1, 5.0),
+        (2, 5.0),
+        (3, 5.0),
+    ]
+    out = list(standardize(table, 'c'))
+    xs = [r[1] for r in out[1:]]
+    assert all(approx_equal(x, 0.0) for x in xs)
+
+
+def test_standardize_raises_on_missing_field():
+    table = [
+        ('id', 'x'),
+        (1, 1.0),
+    ]
+    try:
+        standardize(table, 'nope')
+    except KeyError:
+        pass
+    else:
+        assert False, "expected KeyError"
+
+
+def test_standardize_raises_on_out_of_range_index():
+    table = [
+        ('id', 'x'),
+    ]
+    try:
+        standardize(table, 5)
+    except IndexError:
+        pass
+    else:
+        assert False, "expected IndexError"
+
+
+def test_standardize_ignores_non_finite_values():
+    table = [
+        ('id', 'x'),
+        (1, 10.0),
+        (2, float('nan')),
+        (3, 30.0),
+    ]
+    xs = [r[1] for r in list(standardize(table, 'x'))[1:]]
+    assert approx_equal(xs[0], -1.0)
+    assert math.isnan(xs[1])  # NaN passes through; stats use 10 and 30
+    assert approx_equal(xs[2], 1.0)
+
+
+def test_standardize_result_can_be_iterated_twice():
+    table = [
+        ('id', 'x'),
+        (1, 1.0),
+        (2, 3.0),
+    ]
+    out = standardize(table, 'x')
+    assert list(out) == list(out) == [('id', 'x'), (1, -1.0), (2, 1.0)]