From edaafdb561d2a27bd39616bddf096b9cb111e79f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 14 May 2025 18:00:38 +0200 Subject: [PATCH 1/5] FEAT: added support for dataframes with MultiIndex in columns in from_frame (closes #466) --- doc/source/changes/version_0_35.rst.inc | 3 + larray/inout/pandas.py | 79 +++++++++++++++++-------- larray/tests/test_array.py | 76 ++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 25 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 22167ef49..8cca1c95f 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -92,6 +92,9 @@ Miscellaneous improvements >>> arr.plot.bar(stack='gender') +* :py:obj:`from_frame()` and :py:obj:`asarray()` now support Pandas DataFrames + with more than one level (row) of columns (closes :issue:`466`). + * :py:obj:`Array.to_frame()` gained an ``ncolaxes`` argument to control how many axes should be used as columns (defaults to 1, as before). diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py index d67fec7e8..52ab48144 100644 --- a/larray/inout/pandas.py +++ b/larray/inout/pandas.py @@ -6,7 +6,6 @@ from larray.core.array import Array from larray.core.axis import Axis, AxisCollection from larray.core.constants import nan -from larray.util.misc import unique_list def decode(s, encoding='utf-8', errors='strict'): @@ -46,34 +45,51 @@ def index_to_labels(idx, sort=True): """ if isinstance(idx, pd.MultiIndex): if sort: - return list(idx.levels) + return list(idx.levels) # list of pd.Index else: - return [unique_list(idx.get_level_values(label)) for label in range(idx.nlevels)] + # requires Pandas >= 0.23 (and it does NOT sort the values) + # TODO: unsure to_list is necessary (larray tests pass without it + # but I am not sure this code path is covered by tests) + # and there might be a subtle difference. The type + # of the returned object without to_list() is pd.Index + return [idx.unique(level).to_list() for level in range(idx.nlevels)] else: assert isinstance(idx, pd.Index) labels = list(idx.values) return [sorted(labels) if sort else labels] -def cartesian_product_df(df, sort_rows=False, sort_columns=False, fill_value=nan, **kwargs): - idx = df.index - labels = index_to_labels(idx, sort=sort_rows) +def product_index(idx, sort=False): + """ + Converts a pandas (Multi)Index to an (Multi)Index with a cartesian + product of the labels present in each level + """ + labels = index_to_labels(idx, sort=sort) if isinstance(idx, pd.MultiIndex): - if sort_rows: - new_index = pd.MultiIndex.from_product(labels) - else: - new_index = pd.MultiIndex.from_tuples(list(product(*labels))) + return pd.MultiIndex.from_product(labels), labels else: - if sort_rows: - new_index = pd.Index(labels[0], name=idx.name) + assert isinstance(idx, pd.Index) + if sort: + return pd.Index(labels[0], name=idx.name), labels else: - new_index = idx - columns = sorted(df.columns) if sort_columns else list(df.columns) - # the prodlen test is meant to avoid the more expensive array_equal test - prodlen = np.prod([len(axis_labels) for axis_labels in labels]) - if prodlen == len(df) and columns == list(df.columns) and np.array_equal(idx.values, new_index.values): - return df, labels - return df.reindex(index=new_index, columns=columns, fill_value=fill_value, **kwargs), labels + return idx, labels + + +def cartesian_product_df(df, sort_rows=False, sort_columns=False, + fill_value=nan, **kwargs): + idx = df.index + columns = df.columns + prod_index, index_labels = product_index(idx, sort=sort_rows) + prod_columns, column_labels = product_index(columns, sort=sort_columns) + combined_labels = index_labels + column_labels + # the len() tests are meant to avoid the more expensive array_equal tests + if (len(prod_index) == len(idx) and + len(prod_columns) == len(columns) and + np.array_equal(idx.values, prod_index.values) and + np.array_equal(columns.values, prod_columns.values)): + return df, combined_labels + return df.reindex(index=prod_index, columns=prod_columns, + fill_value=fill_value, **kwargs), combined_labels def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs) -> Array: @@ -124,8 +140,13 @@ def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs) -> Arra a1 b1 6.0 7.0 """ if isinstance(s.index, pd.MultiIndex): - # TODO: use argument sort=False when it will be available - # (see https://github.com/pandas-dev/pandas/issues/15105) + # Using unstack sort argument (requires Pandas >= 2.1) would make this + # code simpler, but it makes it even slower than it already is. + # As of Pandas 2.3.3 on 12/2025, a series with a large MultiIndex is + # extremely slow to unstack, whether sort is used or not: + # >>> arr = ndtest((200, 200, 200)) + # >>> s = arr.to_series() # 31.4 ms + # >>> s.unstack(level=-1, fill_value=np.nan) # 1.5s !!! df = s.unstack(level=-1, fill_value=fill_value) # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns if not sort_rows: @@ -211,13 +232,15 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo # handle 2 or more dimensions with the last axis name given using \ if unfold_last_axis_name: + # Note that having several axes in columns (and using df.columns.names) + # in this case does not make sense if isinstance(axes_names[-1], str) and '\\' in axes_names[-1]: last_axes = [name.strip() for name in axes_names[-1].split('\\')] axes_names = axes_names[:-1] + last_axes else: axes_names += [None] else: - axes_names += [df.columns.name] + axes_names += df.columns.names if cartesian_prod: df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns, @@ -226,12 +249,18 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo if sort_rows or sort_columns: raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. ' 'Please call the method sort_labels on the returned array to sort rows or columns') - axes_labels = index_to_labels(df.index, sort=False) + index_labels = index_to_labels(df.index, sort=False) + column_labels = index_to_labels(df.columns, sort=False) + axes_labels = index_labels + column_labels # Pandas treats column labels as column names (strings) so we need to convert them to values - last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values) - axes_labels.append(last_axis_labels) + if parse_header: + ncolaxes = df.columns.nlevels + for i in range(len(axes_labels) - ncolaxes, len(axes_labels)): + axes_labels[i] = [parse(cell) for cell in axes_labels[i]] + # TODO: use zip(..., strict=True) instead when we drop support for Python 3.9 + assert len(axes_labels) == len(axes_names) axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)]) data = df.values.reshape(axes.shape) return Array(data, axes, meta=meta) diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 5beb0f54d..050a4692a 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -4121,6 +4121,7 @@ def test_to_frame(): assert df.columns.to_list() == ['c0', 'c1'] assert df.index.names == ['a', r'b\c'] + def test_from_frame(): # 1) data = scalar # ================ @@ -4530,6 +4531,81 @@ def test_from_frame(): res = from_frame(df, fill_value=-1) assert_larray_equal(res, expected) + # 6) with a multi-index in columns + # ================================ + + # a) normal + arr = ndtest((2, 2, 2, 2)) + df = arr.to_frame(ncolaxes=2) + res = from_frame(df) + assert_larray_equal(res, arr) + + # b) with duplicated axis names + arr = ndtest("a=a0,a1;a=b0,b1;a=c0,c1;a=d0,d1") + df = arr.to_frame(ncolaxes=2) + res = from_frame(df) + assert_larray_equal(res, arr) + + # c) with duplicated axes names and labels + arr = ndtest("a=a0,a1;a=a0,a1;a=a0,a1;a=a0,a1") + df = arr.to_frame(ncolaxes=2) + res = from_frame(df) + assert_larray_equal(res, arr) + + # d) with unsorted labels + arr = ndtest("a=a1,a0;b=b1,b0;c=c1,c0;d=d1,d0") + df = arr.to_frame(ncolaxes=2) + res = from_frame(df) + assert_larray_equal(res, arr) + + # e) with sorting of unsorted column labels + arr = ndtest("a=a1,a0;b=b1,b0;c=c1,c0;d=d1,d0") + df = arr.to_frame(ncolaxes=2) + expected = from_string(r""" + a b c\d d0 d1 + a1 b1 c0 3 2 + a1 b1 c1 1 0 + a1 b0 c0 7 6 + a1 b0 c1 5 4 + a0 b1 c0 11 10 + a0 b1 c1 9 8 + a0 b0 c0 15 14 + a0 b0 c1 13 12""") + res = from_frame(df, sort_columns=True) + assert_larray_equal(res, expected) + + # f) with sorting of unsorted row labels + arr = ndtest("a=a1,a0;b=b1,b0;c=c1,c0;d=d1,d0") + df = arr.to_frame(ncolaxes=2) + expected = from_string(r""" + a b c\d d1 d0 + a0 b0 c1 12 13 + a0 b0 c0 14 15 + a0 b1 c1 8 9 + a0 b1 c0 10 11 + a1 b0 c1 4 5 + a1 b0 c0 6 7 + a1 b1 c1 0 1 + a1 b1 c0 2 3""") + res = from_frame(df, sort_rows=True) + assert_larray_equal(res, expected) + + # g) with sorting of all unsorted labels + arr = ndtest("a=a1,a0;b=b1,b0;c=c1,c0;d=d1,d0") + df = arr.to_frame(ncolaxes=2) + expected = from_string(r""" + a b c\d d0 d1 + a0 b0 c0 15 14 + a0 b0 c1 13 12 + a0 b1 c0 11 10 + a0 b1 c1 9 8 + a1 b0 c0 7 6 + a1 b0 c1 5 4 + a1 b1 c0 3 2 + a1 b1 c1 1 0""") + res = from_frame(df, sort_rows=True, sort_columns=True) + assert_larray_equal(res, expected) + def test_asarray(): series = pd.Series([0, 1, 2], ['a0', 'a1', 'a2'], name='a') From 539a06b7ada8ad12735a62bc2c8893452874faf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 1 Dec 2025 10:17:19 +0100 Subject: [PATCH 2/5] MAINT: added support for Python 3.13 (closes #1128) --- .github/workflows/ci.yml | 2 +- doc/source/changes/version_0_35.rst.inc | 2 ++ setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6d9d1f9a6..5b3ed14d8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: fail-fast: false matrix: # os: ["ubuntu-latest", "macos-latest", "windows-latest"] - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] defaults: run: diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 8cca1c95f..ffd9d2187 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -31,6 +31,8 @@ Backward incompatible changes New features ^^^^^^^^^^^^ +* added support for Python 3.13 (closes :issue:`1128`). + * :py:obj:`Array.plot()` now has an ``animate`` argument to produce animated plots. The argument takes an axis (it also supports several axes but that is rarely useful) and will create an animation, with one image per label of that diff --git a/setup.py b/setup.py index db6a8d818..4cde7409a 100644 --- a/setup.py +++ b/setup.py @@ -30,11 +30,11 @@ def readlocal(fname): 'Intended Audience :: Developers', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', 'Topic :: Scientific/Engineering', 'Topic :: Software Development :: Libraries', ] From f98850006b88741bb2cf1b1ed5fa9662320aa7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 2 Dec 2025 16:53:13 +0100 Subject: [PATCH 3/5] MAINT: modernize github actions --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b3ed14d8..b205e5def 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: with: # Pin ruff version to make sure we do not break our builds at the # worst times - version: "0.14.5" + version: "0.14.7" test: # name: Test (${{ matrix.python-version }}, ${{ matrix.os }}) From e3dcb33932faee1780ee29f363a919714bec90c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 3 Dec 2025 11:36:35 +0100 Subject: [PATCH 4/5] MAINT: use https links --- condarecipe/larray/meta.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/condarecipe/larray/meta.yaml b/condarecipe/larray/meta.yaml index 13cd7e886..fbb062431 100644 --- a/condarecipe/larray/meta.yaml +++ b/condarecipe/larray/meta.yaml @@ -35,7 +35,7 @@ test: - pytest --pyargs larray about: - home: http://github.com/larray-project/larray + home: https://github.com/larray-project/larray license: GPL-3.0-only license_family: GPL license_file: LICENSE @@ -43,8 +43,8 @@ about: description: | LArray is an open source Python library that aims to provide tools for easy exploration and manipulation of N-dimensional labelled data structures. - doc_url: http://larray.readthedocs.io/ - dev_url: http://github.com/larray-project/larray + doc_url: https://larray.readthedocs.io/ + dev_url: https://github.com/larray-project/larray extra: recipe-maintainers: From 459295c7bbe679bd13ab700919f9080fbeb6d7ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 3 Dec 2025 12:39:53 +0100 Subject: [PATCH 5/5] FEAT: better error message for AxisCollection.index(name) this is also Python3.14-proof, while we used to rely on the list.index builtin message, which changed (for the worse IMO) in Python 3.14 --- larray/core/axis.py | 5 ++++- larray/tests/test_axiscollection.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 6c58a878a..9b18c4905 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2302,7 +2302,10 @@ def index(self, axis, compatible=False) -> int: name = axis if name is None: raise ValueError(f"{axis!r} is not in collection") - return self.names.index(name) + try: + return self.names.index(name) + except ValueError: + raise ValueError(f"axis {name!r} is not in collection") # XXX: we might want to return a new AxisCollection (same question for other inplace operations: # append, extend, pop, __delitem__, __setitem__) diff --git a/larray/tests/test_axiscollection.py b/larray/tests/test_axiscollection.py index dd34478d9..e0d26bdf9 100644 --- a/larray/tests/test_axiscollection.py +++ b/larray/tests/test_axiscollection.py @@ -221,9 +221,9 @@ def test_contains(col): def test_index(col): assert col.index('lipro') == 0 - with must_raise(ValueError, msg="'nonexisting' is not in list"): + with must_raise(ValueError, msg="axis 'nonexisting' is not in collection"): col.index('nonexisting') - assert col.index(0) == 0 + assert col.index(0) == 0 assert col.index(1) == 1 assert col.index(2) == 2 assert col.index(-1) == -1 @@ -237,9 +237,9 @@ def test_index(col): assert col.index(sex) == 1 assert col.index(age) == 2 assert col.index(sex2) == 1 - with must_raise(ValueError, msg="'geo' is not in list"): + with must_raise(ValueError, msg="axis 'geo' is not in collection"): col.index(geo) - with must_raise(ValueError, msg="'value' is not in list"): + with must_raise(ValueError, msg="axis 'value' is not in collection"): col.index(value) # test anonymous axes