From 3f13baacd3b708c38abddd72188d247bca6aeea3 Mon Sep 17 00:00:00 2001
From: Charvee Saraiya <charveesaraiya@gmail.com>
Date: Tue, 28 Oct 2025 16:45:05 -0700
Subject: [PATCH] Add standardize() transform and tests; flake8 clean

---
 petl/__init__.py              |   1 +
 petl/transform/standardize.py | 142 ++++++++++++++++++++++++++++++++++
 tests/test_standardize.py     |  65 ++++++++++++++++
 3 files changed, 208 insertions(+)
 create mode 100644 petl/transform/standardize.py
 create mode 100644 tests/test_standardize.py

diff --git a/petl/__init__.py b/petl/__init__.py
index 36ac6344..08f153a9 100644
--- a/petl/__init__.py
+++ b/petl/__init__.py
@@ -13,3 +13,4 @@
 from petl import config
 from petl import errors
 from petl.errors import *
+from petl.transform.standardize import standardize
diff --git a/petl/transform/standardize.py b/petl/transform/standardize.py
new file mode 100644
index 00000000..adddac11
--- /dev/null
+++ b/petl/transform/standardize.py
@@ -0,0 +1,142 @@
+# petl/transform/standardize.py
+from __future__ import annotations
+from typing import Iterable, Iterator, List, Sequence, Tuple, Union, Optional, Any
+import math
+
+
+Row = Sequence[Any]  # one table row: the header or a row of values
+Table = Iterable[Row]  # iterable of rows; standardize() treats the first as header
+Field = Union[str, int]  # a column name, or an integer index into the header/rows
+Fields = Union[Field, Sequence[Field]]  # a single field or a sequence of them
+
+
+def _as_list(x: Fields) -> List[Field]:
+    """Return list/tuple input as a new list; wrap any other value."""
+    is_sequence = isinstance(x, (list, tuple))
+    return list(x) if is_sequence else [x]
+
+
+def _is_number(x: Any) -> bool:  # bool subclasses int, so it is excluded
+    return not isinstance(x, bool) and isinstance(x, (int, float))
+
+
+def _mean_std(values: List[float], ddof: int = 0) -> Tuple[float, float]:
+    if len(values) == 0:  # no numeric values: zero mean, zero spread
+        return 0.0, 0.0
+    mean = sum(values) / len(values)
+    sq_dev = sum((v - mean) ** 2 for v in values)
+    return mean, math.sqrt(sq_dev / max(1, len(values) - ddof))  # divisor >= 1
+
+
+def standardize(
+    table: Table,
+    fields: Fields,
+    newfields: Optional[Fields] = None,
+    ddof: int = 0,
+) -> Table:
+    """
+    Return a new table with the given numeric field(s) standardized to mean 0 and std 1.
+
+    Parameters
+    ----------
+    table : PETL-style table (first row is header)
+    fields : str | int | sequence
+        Column name(s), or integer position(s) in the header, to standardize.
+    newfields : Optional[str | sequence]
+        If provided, write standardized values into these new columns
+        (same length as `fields`). Otherwise, overwrite the original columns.
+    ddof : int
+        Delta degrees of freedom for std. Use 0 for population (default), 1 for sample.
+
+    Returns
+    -------
+    An empty list for an empty table, otherwise a one-pass iterator of row tuples.
+
+    Raises
+    ------
+    ValueError
+        If `newfields` is given with a different length than `fields`.
+    KeyError
+        If a named field is not in the header.
+    IndexError
+        If an integer field position lies outside the header.
+
+    Notes
+    -----
+    - The whole table is read up front, so field errors surface from this call.
+    - Non-numeric or None values are passed through unchanged.
+    - If std is 0 (constant column), standardized values are 0.0.
+    """
+    fields_list = _as_list(fields)
+    newfields_list = None if newfields is None else _as_list(newfields)
+    if newfields_list is not None and len(newfields_list) != len(fields_list):
+        raise ValueError("newfields must be the same length as fields")
+
+    # Materialize the table so we can compute stats and then yield again.
+    rows = list(table)
+    if not rows:
+        return rows  # empty table
+
+    header = list(rows[0])
+    name_to_idx = {name: i for i, name in enumerate(header)}
+
+    # Resolve every requested field to a column index before touching the data.
+    idxs: List[int] = []
+    for f in fields_list:
+        if isinstance(f, int):
+            # Fail here, with a clear message, rather than on a later row lookup.
+            if not -len(header) <= f < len(header):
+                raise IndexError(f"field position {f!r} out of range for header")
+            idxs.append(f)
+        elif f in name_to_idx:
+            idxs.append(name_to_idx[f])
+        else:
+            raise KeyError(f"field {f!r} not found in header")
+
+    # Compute (mean, std) for each selected field from its numeric values only.
+    data_rows = rows[1:]
+    col_stats: List[Tuple[float, float]] = [
+        _mean_std([float(r[ci]) for r in data_rows if _is_number(r[ci])], ddof=ddof)
+        for ci in idxs
+    ]
+
+    header_out = header + list(newfields_list) if newfields_list else header
+
+    def _iter() -> Iterator[Row]:
+        yield tuple(header_out)
+        for r in data_rows:
+            r_list = list(r)
+            # Standardized value (or untouched original) for each selected column.
+            zvals: List[Any] = []
+            for ci, (m, s) in zip(idxs, col_stats):
+                val = r_list[ci]
+                if not _is_number(val):
+                    zvals.append(val)  # keep as-is (e.g., None or text)
+                elif s == 0 or math.isnan(s):
+                    zvals.append(0.0)  # no usable spread for this column
+                else:
+                    zvals.append((float(val) - m) / s)
+            if newfields_list:
+                yield tuple(r_list + zvals)  # append z-values at the end
+            else:
+                for offset, ci in enumerate(idxs):
+                    r_list[ci] = zvals[offset]  # overwrite in place
+                yield tuple(r_list)
+
+    return _iter()
+
+
+if __name__ == "__main__":
+    # Demo: standardize the 'score' column of a tiny table and print the rows.
+    import petl as etl
+
+    demo_table = [
+        ('id', 'score'),
+        (1, 50),
+        (2, 60),
+        (3, 70),
+    ]
+
+    standardized = etl.standardize(demo_table, 'score')
+    for out_row in standardized:
+        print(out_row)
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
new file mode 100644
index 00000000..3334861c
--- /dev/null
+++ b/tests/test_standardize.py
@@ -0,0 +1,65 @@
+# tests/test_standardize.py
+import math
+from petl import standardize
+
+def approx_equal(a, b, tol=1e-9):  # absolute tolerance only: |a - b| <= tol
+    return math.isclose(a, b, rel_tol=0.0, abs_tol=tol)
+
+def test_standardize_overwrite_single_column():
+    """Overwriting 'x' keeps the header; z-values have mean ~0, variance ~1."""
+    source = [
+        ('id', 'x', 'y'),
+        (1, 10.0, 100),
+        (2, 20.0, 200),
+        (3, 30.0, 300),
+    ]
+
+    header, *rows = standardize(source, 'x')  # no newfields: 'x' is overwritten
+    assert header == ('id', 'x', 'y')
+
+    zs = [row[1] for row in rows]
+    mean = sum(zs) / len(zs)
+    variance = sum((z - mean) ** 2 for z in zs) / len(zs)
+    # ddof defaults to 0, so the population variance is the one expected to be 1.
+    assert approx_equal(mean, 0.0, 1e-12)
+    assert approx_equal(variance, 1.0, 1e-12)
+
+def test_standardize_write_new_column_multiple_fields():
+    """With newfields, z-scores are appended and a None input stays None."""
+    src = [
+        ('id', 'a', 'b'),
+        (1, 5, 10.0),
+        (2, 6, 20.0),
+        (3, 7, 30.0),
+        (4, 8, None),
+    ]
+
+    got = list(standardize(src, fields=['a', 'b'], newfields=['a_z', 'b_z'], ddof=1))
+    assert got[0] == ('id', 'a', 'b', 'a_z', 'b_z')
+
+    data = got[1:]
+    assert len(data) == 4
+    assert data[-1][3] is not None  # 'a' was numeric, so a_z is populated
+    assert data[-1][4] is None  # 'b' was None and passes through untouched
+
+def test_standardize_constant_column_returns_zeroes():
+    """A constant column has std 0, so every standardized value must be 0.0."""
+    src = [
+        ('id', 'c'),
+        (1, 5.0),
+        (2, 5.0),
+        (3, 5.0),
+    ]
+    _header, *data = standardize(src, 'c')
+    assert all(approx_equal(row[1], 0.0) for row in data)
+
+def test_standardize_raises_on_missing_field():
+    """An unknown field name must raise KeyError from the standardize() call."""
+    table = [('id', 'x'), (1, 1.0)]
+    # Fail explicitly if no error was raised (avoids the `assert False` idiom).
+    try:
+        standardize(table, 'nope')
+    except KeyError:
+        pass  # expected: 'nope' is not in the header
+    else:
+        raise AssertionError("expected KeyError")