From f6e3324168dff5ab921cf8ea3af62d9714b6aea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 24 Nov 2025 10:29:58 +0100 Subject: [PATCH 01/22] DOC: small design note --- design.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/design.txt b/design.txt index c3037d85c..b44ff5d46 100644 --- a/design.txt +++ b/design.txt @@ -1131,6 +1131,9 @@ subset = pop.q('M, age.sum(10:20 >> yada1, 20:30 >> yada2')) # without ambiguity, that would be subset = pop.q('M, sum(10:20 >> yada1, 20:30 >> yada2')) +# this could work too: +subset = pop.q('M', sum(age[10:20] >> 'yada1', age[20:30] >> 'yada2')) + # if using a function (like .q) we could also "rename" axes on the fly. the above would create an aggregated axis # named "age" but the code below would create "toto" instead subset = pop.q('M', toto=age.sum[10:20, 20:30]) From 9f42939cb3b51d6ddff982b160e6f9388e87bc05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Nov 2025 11:38:45 +0100 Subject: [PATCH 02/22] FIX: fixed reindex(axis_name=labels_def) (fixes #1120) purposefully not including this in the changelog because I might deprecate that syntax in 0.36 (see #1142) --- larray/core/array.py | 20 ++++++++++++-------- larray/tests/test_array.py | 12 ++++++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index e4aec2ea7..35a91f0bd 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -1720,6 +1720,14 @@ def is_axis_def(axis_def): return ((isinstance(axis_def, str) and '=' in axis_def) or isinstance(axis_def, Group)) + def axes_refs_and_defs_to_axes(axes_to_reindex: dict): + new_axes_to_reindex = {} + for k, v in axes_to_reindex.items(): + src_axis = axis_ref_to_axis(self.axes, k) + dst_axis = labels_def_and_name_to_axis(v, src_axis.name) + new_axes_to_reindex[src_axis] = dst_axis + return new_axes_to_reindex + if new_axis is None: if isinstance(axes_to_reindex, Axis) and not 
isinstance(axes_to_reindex, AxisReference): axes_to_reindex = {axes_to_reindex: axes_to_reindex} @@ -1757,14 +1765,10 @@ def is_axis_def(axis_def): else: # TODO: move this to AxisCollection.replace if isinstance(axes_to_reindex, dict): - new_axes_to_reindex = {} - for k, v in axes_to_reindex.items(): - src_axis = axis_ref_to_axis(self.axes, k) - dst_axis = labels_def_and_name_to_axis(v, src_axis.name) - new_axes_to_reindex[src_axis] = dst_axis - axes_to_reindex = new_axes_to_reindex - - res_axes = self.axes.replace(axes_to_reindex, **kwargs) + axes_to_reindex = axes_refs_and_defs_to_axes(axes_to_reindex) + res_axes = self.axes.replace(axes_to_reindex) + if kwargs: + res_axes = res_axes.replace(axes_refs_and_defs_to_axes(kwargs)) res = full(res_axes, fill_value, dtype=common_dtype((self.data, fill_value))) def get_group(res_axes, self_axis): diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 656314f47..65b3ddf08 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -3195,6 +3195,18 @@ def test_reindex(): res = arr.reindex({'a': 'a0,a1,a2'}) assert_larray_nan_equal(res, expected) + # using the **kwargs syntax + res = arr.reindex(a=new_a) + assert_larray_nan_equal(res, expected) + + # using the **kwargs syntax with a list of labels + res = arr.reindex(a=['a0', 'a1', 'a2']) + assert_larray_nan_equal(res, expected) + + # using the **kwargs syntax with a labels def string (issue #1120) + res = arr.reindex(a='a0,a1,a2') + assert_larray_nan_equal(res, expected) + # test error conditions msg = ("In Array.reindex, when using an axis reference ('axis name', X.axis_name or " "axis_integer_position) as axes_to_reindex, you must provide a value for `new_axis`.") From 7d9901655f4137e978ee30e7bd39ce59ec0c023f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Nov 2025 13:38:15 +0100 Subject: [PATCH 03/22] CLN/PERF: simplified Array.eq implementation should also be faster for object dtypes and when rtol=0 
and atol=0 and nans_equal is True --- larray/core/array.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 35a91f0bd..04d640241 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -814,8 +814,6 @@ def _doc_agg_method(func, by=False, long_name='', action_verb='perform', extra_a _always_return_float = {np.mean, np.nanmean, np.median, np.nanmedian, np.percentile, np.nanpercentile, np.std, np.nanstd, np.var, np.nanvar} -obj_isnan = np.vectorize(lambda x: x != x, otypes=[bool]) - def element_equal(a1, a2, rtol=0, atol=0, nan_equals=False): warnings.warn("element_equal() is deprecated. Use array1.eq(array2, rtol, atol, nan_equals) instead.", @@ -6027,24 +6025,15 @@ def eq(self, other, rtol=0, atol=0, nans_equal=False) -> 'Array': """ other = asarray(other) + (self_data, other_data), res_axes = raw_broadcastable([self, other]) if rtol == 0 and atol == 0: - if not nans_equal: - return self == other - else: - from larray.core.npufuncs import isnan - - def general_isnan(a): - if issubclass(a.dtype.type, np.inexact): - return isnan(a) - elif a.dtype.type is np.object_: - return Array(obj_isnan(a), a.axes) - else: - return False - - return (self == other) | (general_isnan(self) & general_isnan(other)) + res_data = self_data == other_data + if nans_equal: + res_data |= (self_data != self_data) & (other_data != other_data) else: - (a1_data, a2_data), res_axes = raw_broadcastable([self, other]) - return Array(np.isclose(a1_data, a2_data, rtol=rtol, atol=atol, equal_nan=nans_equal), res_axes) + res_data = np.isclose(self_data, other_data, + rtol=rtol, atol=atol, equal_nan=nans_equal) + return Array(res_data, res_axes) def isin(self, test_values, assume_unique=False, invert=False) -> 'Array': r""" From d4b47fc4f2808863eecbbb8ddec994bbdcfc7e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Nov 2025 15:21:48 +0100 Subject: [PATCH 04/22] FEAT: 
implemented nan_to_num and isnan for object arrays --- larray/__init__.py | 6 +- larray/core/npufuncs.py | 2 - larray/core/ufuncs.py | 186 +++++++++++++++++++++++++++++++++++++ larray/tests/test_array.py | 34 ++++++- 4 files changed, 221 insertions(+), 7 deletions(-) diff --git a/larray/__init__.py b/larray/__init__.py index eb3d91aaa..eddf1ee86 100644 --- a/larray/__init__.py +++ b/larray/__init__.py @@ -11,15 +11,15 @@ from larray.core.checked import CheckedArray, CheckedSession, CheckedParameters from larray.core.constants import nan, inf, pi, e, euler_gamma from larray.core.metadata import Metadata -from larray.core.ufuncs import wrap_elementwise_array_func, maximum, minimum, where +from larray.core.ufuncs import wrap_elementwise_array_func, maximum, minimum, where, isnan, nan_to_num from larray.core.npufuncs import (sin, cos, tan, arcsin, arccos, arctan, hypot, arctan2, degrees, radians, unwrap, sinh, cosh, tanh, arcsinh, arccosh, arctanh, angle, real, imag, conj, round, around, rint, fix, floor, ceil, trunc, exp, expm1, exp2, log, log10, log2, log1p, logaddexp, logaddexp2, i0, sinc, signbit, copysign, frexp, ldexp, - convolve, clip, sqrt, absolute, fabs, sign, fmax, fmin, nan_to_num, - real_if_close, interp, isnan, isinf, inverse) + convolve, clip, sqrt, absolute, fabs, sign, fmax, fmin, + real_if_close, interp, isinf, inverse) from larray.core.misc import isscalar from larray.inout.misc import from_lists, from_string diff --git a/larray/core/npufuncs.py b/larray/core/npufuncs.py index bd40ad067..5d8fb5a58 100644 --- a/larray/core/npufuncs.py +++ b/larray/core/npufuncs.py @@ -118,12 +118,10 @@ sign = wrap_numpy_func(np.sign) fmax = wrap_numpy_func(np.fmax) fmin = wrap_numpy_func(np.fmin) -nan_to_num = wrap_numpy_func(np.nan_to_num) real_if_close = wrap_numpy_func(np.real_if_close) # TODO: add examples for functions below sqrt = wrap_numpy_func(np.sqrt) -isnan = wrap_numpy_func(np.isnan) isinf = wrap_numpy_func(np.isinf) inverse = 
wrap_numpy_func(np.linalg.inv) diff --git a/larray/core/ufuncs.py b/larray/core/ufuncs.py index 55eb075f2..481285a26 100644 --- a/larray/core/ufuncs.py +++ b/larray/core/ufuncs.py @@ -300,3 +300,189 @@ def wrap_numpy_func(func, doc=None): a0 6 2 5 0 a1 5 2 3 0 """) + +def _generalized_isnan(arr, out=None, where=True, **kwargs): + if isinstance(arr, np.ndarray) and arr.dtype.kind == 'O': + if out is not None or where is not True or kwargs: + raise ValueError("The 'out', 'where' and other keyword arguments " + "are not supported for object arrays.") + return arr != arr + else: + return np.isnan(arr, out=out, where=where, **kwargs) + +isnan = wrap_elementwise_array_func(_generalized_isnan, r""" +Test element-wise for NaN and return result as a boolean array. + +Parameters +---------- +x : array_like + Input array. +out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. +where : array_like, optional + This condition is broadcast over the input. At locations where the + condition is True, the `out` array will be set to the ufunc result. + Elsewhere, the `out` array will retain its original value. + Note that if an uninitialized `out` array is created via the default + ``out=None``, locations within it where the condition is False will + remain uninitialized. +**kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs <ufuncs.kwargs>`. + +Returns +------- +y : ndarray or bool + True where ``x`` is NaN, false otherwise. + This is a scalar if `x` is a scalar. + +See Also +-------- +isinf, isneginf, isposinf, isfinite, isnat + +Notes +----- +Contrary to the numpy implementation, this function supports object arrays. + +NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic +(IEEE 754).
This means that Not a Number is not equivalent to infinity. + +Examples +-------- +>>> import larray as la +>>> la.isnan(la.nan) +True +>>> la.isnan(la.inf) +False +>>> arr = la.Array([la.nan, 1, la.inf], +... la.Axis(3, 'values')) +>>> la.isnan(arr) +values* 0 1 2 + True False False +>>> arr = la.Array(['abc', 1, la.nan], +... la.Axis(3, 'values'), dtype=object) +>>> la.isnan(arr) +values* 0 1 2 + False False True +""") + + +def _generalized_nan_to_num(arr, copy=True, nan=0, posinf=None, neginf=None): + if isinstance(arr, np.ndarray) and arr.dtype.kind == 'O': + import sys + if posinf is None: + posinf = sys.float_info.max + if neginf is None: + neginf = -sys.float_info.max + res = arr.copy() if copy else arr + is_nan_value = arr != arr + is_pos_inf_value = arr == np.inf + is_neg_inf_value = arr == -np.inf + if isinstance(nan, np.ndarray): + # each array argument is reshaped to a compatible shape for + # broadcasting by larray machinery but not actually broadcasted yet + nan = np.broadcast_to(nan, arr.shape)[is_nan_value] + res[is_nan_value] = nan + if isinstance(posinf, np.ndarray): + posinf = np.broadcast_to(posinf, arr.shape)[is_pos_inf_value] + res[is_pos_inf_value] = posinf + if isinstance(neginf, np.ndarray): + neginf = np.broadcast_to(neginf, arr.shape)[is_neg_inf_value] + res[is_neg_inf_value] = neginf + return res + else: + return np.nan_to_num(arr, copy=copy, nan=nan, posinf=posinf, neginf=neginf) + +nan_to_num = wrap_elementwise_array_func(_generalized_nan_to_num,r""" + Replace NaN with zero and infinity with large finite numbers (default + behaviour) or with the numbers defined by the user using the `nan`, + `posinf` and/or `neginf` keywords. 
+ + If `x` is inexact or an object array, NaN is replaced by zero or by the user + defined value in `nan` keyword, infinity is replaced by the largest finite + floating point value representable by ``x.dtype`` or by the user defined + value in `posinf` keyword and -infinity is replaced by the most negative + finite floating point value representable by ``x.dtype`` or by the user + defined value in `neginf` keyword. + + For complex dtypes, the above is applied to each of the real and + imaginary components of `x` separately. + + If `x` is not inexact or object, then no replacements are made. + + Parameters + ---------- + x : scalar or array_like + Input data. + copy : bool, optional + Whether to create a copy of `x` (True) or to replace values + in-place (False). The in-place operation only occurs if + casting to an array does not require a copy. + Default is True. + nan : int, float or array_like, optional + Value to be used to fill NaN values. If no value is passed + then NaN values will be replaced with 0.0. + posinf : int, float, optional + Value to be used to fill positive infinity values. If no value is + passed then positive infinity values will be replaced with the largest + finite floating point value representable by ``x.dtype``. + neginf : int, float, optional + Value to be used to fill negative infinity values. If no value is + passed then negative infinity values will be replaced with the most + negative finite floating point value representable by ``x.dtype``. + + Returns + ------- + out : Array or scalar + `x`, with the non-finite values replaced. If `copy` is False, this may + be `x` itself. + + See Also + -------- + isinf : Shows which elements are positive or negative infinity. + isneginf : Shows which elements are negative infinity. + isposinf : Shows which elements are positive infinity. + isnan : Shows which elements are Not a Number (NaN). 
+ isfinite : Shows which elements are finite (not NaN, not infinity) + + Notes + ----- + Contrary to the numpy implementation, this function supports object arrays. + + NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import larray as la + + >>> la.nan_to_num(la.inf) + 1.7976931348623157e+308 + >>> la.nan_to_num(-la.inf) + -1.7976931348623157e+308 + >>> la.nan_to_num(np.nan) + 0.0 + + >>> x = la.Array([-la.inf, 1, la.nan, 2, la.inf], la.Axis(5, 'values')) + >>> la.nan_to_num(x) + values* 0 1 2 3 4 + -1.7976931348623157e+308 1.0 0.0 2.0 1.7976931348623157e+308 + >>> la.nan_to_num(x, nan=-1, posinf=999, neginf=-999) + values* 0 1 2 3 4 + -999.0 1.0 -1.0 2.0 999.0 + + >>> x = la.Array([1, 'abc', la.nan, 2], la.Axis(4, 'values'), dtype=object) + >>> la.nan_to_num(x) + values* 0 1 2 3 + 1 abc 0 2 + + >>> y = la.Array([complex(la.inf, la.nan), la.nan, complex(la.nan, la.inf)], + ...
la.Axis(3, 'values')) + >>> la.nan_to_num(y) + values* 0 1 2 + (1.7976931348623157e+308+0j) 0j 1.7976931348623157e+308j + """ +) \ No newline at end of file diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 65b3ddf08..1a0c1c6a6 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -1,4 +1,5 @@ import os +import sys import pytest import numpy as np @@ -17,7 +18,7 @@ asarray, union, clip, exp, where, X, mean, inf, nan, isnan, round, read_hdf, read_csv, read_eurostat, read_excel, open_excel, from_lists, from_string, from_frame, from_series, - zip_array_values, zip_array_items) + zip_array_values, zip_array_items, nan_to_num) from larray.core.axis import _to_ticks, _to_tick, _to_key from larray.util.misc import LHDFStore @@ -5829,7 +5830,9 @@ def test_deprecated_methods(): def test_eq(): a = ndtest((2, 3, 4)) ao = a.astype(object) - assert_larray_equal(ao.eq(ao['c0'], nans_equal=True), a == a['c0']) + res = ao.eq(ao['c0'], nans_equal=True) + expected = a == a['c0'] + assert_larray_equal(res, expected) def test_zip_array_values(): @@ -5897,6 +5900,33 @@ def test_np_array(): assert res is not arr.data +def test_nan_to_num(): + a = Axis('a=a0..a4') + arr = Array([1.0, nan, inf, 2.0, -inf], axes=a) + res = nan_to_num(arr) + max_float = np.finfo(arr.dtype).max + expected = Array([1.0, 0.0, max_float, 2.0, -max_float], axes=a) + assert_larray_equal(res, expected) + + max_float = sys.float_info.max + arr = Array(["abc", 1.0, nan, inf, -inf], axes=a, dtype=object) + res = nan_to_num(arr, neginf=-10, posinf=10) + expected = Array(["abc", 1.0, 0.0, 10.0, -10.0], axes=a, dtype=object) + assert_larray_equal(res, expected) + + +def test_isnan(): + a = Axis('a=a0..a3') + arr = Array([1.0, nan, inf, -inf], axes=a) + res = isnan(arr) + expected = Array([False, True, False, False], axes=a) + assert_larray_equal(res, expected) + + oarr = arr.astype(object) + res = isnan(oarr) + assert_larray_equal(res, expected) + + if __name__ == 
"__main__": # import doctest # import unittest From f2a5dd9c7e77e96680de2f30547fe9735118c727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 8 Apr 2025 16:15:31 +0200 Subject: [PATCH 05/22] FIX: added support for Pandas Series in asarray() (fixes #895) considered a fix because it sort of worked but silently ignored the index and name of the series --- doc/source/changes/version_0_35.rst.inc | 5 ++++- larray/core/array.py | 3 +++ larray/tests/test_array.py | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 754a05e52..924ba45c5 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -85,4 +85,7 @@ Miscellaneous improvements Fixes ^^^^^ -* fixed something (closes :issue:`1`). +* added support for Pandas Series in :py:obj:`asarray()`. This is considered a + fix because it kind of worked but silently ignored the index and name of the + series (closes :issue:`895`). 
+ diff --git a/larray/core/array.py b/larray/core/array.py index 04d640241..015c21837 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -8351,6 +8351,9 @@ def asarray(a, meta=None) -> Array: elif isinstance(a, pd.DataFrame): from larray.inout.pandas import from_frame return from_frame(a, meta=meta) + elif isinstance(a, pd.Series): + from larray.inout.pandas import from_series + return from_series(a, meta=meta) else: return Array(a, meta=meta) diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 1a0c1c6a6..d543aaec9 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -4482,6 +4482,13 @@ def test_from_frame(): assert_larray_equal(res, expected) +def test_asarray(): + series = pd.Series([0, 1, 2], ['a0', 'a1', 'a2'], name='a') + expected = ndtest(3) + res = asarray(series) + assert_larray_equal(res, expected) + + def test_to_csv(tmp_path): io_3d.to_csv(tmp_path / 'out3d.csv') assert (tmp_path / 'out3d.csv').read_text() == """\ From 273892428ea92cdc828c17d16e437f09d987e434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Nov 2025 16:23:54 +0100 Subject: [PATCH 06/22] DOC: fixed typo in changelog --- doc/source/changes/version_0_35.rst.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 924ba45c5..40a5b7876 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -79,7 +79,7 @@ Miscellaneous improvements always stacking the last axis. 
For example, a plot with genders stacked could be specified as: - >>> arr.plot.bar(stacked='gender') + >>> arr.plot.bar(stack='gender') Fixes From 5fb3d37cd06021a82241a46cd3d5f5f407939fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Nov 2025 10:20:33 +0100 Subject: [PATCH 07/22] FEAT: improved lazy expressions (ExprNode) reprs the fix I will commit next will make it more likely users see them, so we might as well make it more useful --- larray/core/expr.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/larray/core/expr.py b/larray/core/expr.py index 92aebb2ba..1df5d499d 100644 --- a/larray/core/expr.py +++ b/larray/core/expr.py @@ -1,4 +1,6 @@ -from larray.core.abstractbases import ABCArray +import numpy as np + +from larray.core.abstractbases import ABCAxisReference, ABCAxis, ABCArray class ExprNode: @@ -81,6 +83,25 @@ def expr_eval(expr, context): return expr.evaluate(context) if isinstance(expr, ExprNode) else expr +def value_summary(value): + if isinstance(value, ABCArray): + axes = value.axes + axes_info = ' x '.join(f'{name} ({length})' + for name, length + in zip(axes.display_names, axes.shape)) + return f"Array(<{axes_info}>)" + elif isinstance(value, ABCAxisReference): + return f"X.{value.name}" + elif isinstance(value, ABCAxis): + return f"Axis(<{value.name} ({len(value)})>)" + elif isinstance(value, ExprNode): + return repr(value) + else: + assert np.isscalar(value), (f"Expected scalar value, " + f"got {type(value).__name__}") + return repr(value) + + class BinaryOp(ExprNode): def __init__(self, op, expr1, expr2): self.opname = f'__{op}__' @@ -94,7 +115,9 @@ def evaluate(self, context): return getattr(expr1, self.opname)(expr2) def __repr__(self): - return f"BinaryOp({self.opname[2:-2]!r}, {self.expr1!r}, {self.expr2!r})" + return (f"BinaryOp({self.opname[2:-2]!r}, " + f"{value_summary(self.expr1)}, " + f"{value_summary(self.expr2)})") class UnaryOp(ExprNode): @@ 
-108,4 +131,4 @@ def evaluate(self, context): return getattr(expr, self.opname)() def __repr__(self): - return f"UnaryOp({self.opname[2:-2]!r}, {self.expr!r})" + return f"UnaryOp({self.opname[2:-2]!r}, {value_summary(self.expr)})" From a5ebdbcd8b47254fdb6cb732bb57ac2d24bb7b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Nov 2025 17:13:11 +0100 Subject: [PATCH 08/22] FIX: delay binops with X.axis when not all axes available (fixes #1129) --- doc/source/changes/version_0_35.rst.inc | 3 +++ larray/core/array.py | 11 ++++++-- larray/core/axis.py | 3 +++ larray/core/expr.py | 36 ++++++++++++++++++++++--- larray/tests/test_array.py | 12 +++++++++ 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 40a5b7876..3d4af077c 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -89,3 +89,6 @@ Fixes fix because it kind of worked but silently ignored the index and name of the series (closes :issue:`895`). +* fixed evaluating operations involving X.axis and an array when + that operation is only valid in the context of a larger array by delaying + the evaluation until the larger array is known (closes :issue:`1129`). 
\ No newline at end of file diff --git a/larray/core/array.py b/larray/core/array.py index 015c21837..ee1a64d51 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -50,7 +50,7 @@ from larray.core.abstractbases import ABCArray from larray.core.constants import nan, inf from larray.core.metadata import Metadata -from larray.core.expr import ExprNode +from larray.core.expr import ExprNode, BinaryOp from larray.core.group import (Group, IGroup, LGroup, _to_key, _to_keys, _translate_sheet_name, _translate_group_key_hdf) from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis # noqa: F401 @@ -5557,8 +5557,15 @@ def _binop(opname): super_method = getattr(np.ndarray, fullname) def opmethod(self, other) -> 'Array': + # we could implement this more cleanly by returning + # NotImplemented in this case and letting the ExprNode reverse + # op (r*) handle it, but this can change the result axes order + # so I am unsure about that. if isinstance(other, ExprNode): - other = other.evaluate(self.axes) + if other.can_evaluate_with(self.axes): + other = other.evaluate(self.axes) + else: + return BinaryOp(opname, self, other) # XXX: unsure what happens for non scalar Groups. # we might want to be more general than this and .eval all Groups? diff --git a/larray/core/axis.py b/larray/core/axis.py index 80b2664bc..7ff953b46 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -3817,6 +3817,9 @@ def evaluate(self, context) -> Axis: """ return context[self.name] + def can_evaluate_with(self, context) -> set: # set[int] + return self.name in context + # Use the default hash. 
We have to specify it explicitly because we define __eq__ via ExprNode and # ExprNode.__hash__ (which is not set explicitly) takes precedence over Axis.__hash__ __hash__ = object.__hash__ diff --git a/larray/core/expr.py b/larray/core/expr.py index 1df5d499d..0e4e98aea 100644 --- a/larray/core/expr.py +++ b/larray/core/expr.py @@ -9,9 +9,13 @@ def __bool__(self): # method factory def _binop(opname): + # reversed = opname.startswith('r') and opname != 'rshift' def opmethod(self, other): - # evaluate eagerly when possible - if isinstance(other, ABCArray): + assert isinstance(self, ExprNode), \ + (f"Expected ExprNode, got {type(self).__name__} " + f"({self=} {other=})") + if (isinstance(other, ABCArray) and + self.can_evaluate_with(other.axes)): self_value = self.evaluate(other.axes) return getattr(self_value, f'__{opname}__')(other) else: @@ -20,6 +24,7 @@ def opmethod(self, other): opmethod.__name__ = f'__{opname}__' return opmethod + __rmatmul__ = _binop('rmatmul') __matmul__ = _binop('matmul') __ror__ = _binop('ror') __or__ = _binop('or') @@ -28,7 +33,7 @@ def opmethod(self, other): __rand__ = _binop('rand') __and__ = _binop('and') __rrshift__ = _binop('rrshift') - __rshift__ = _binop('rshift') + __rshift__ = _binop('rshift') # not reverse even though it starts with 'r' __rlshift__ = _binop('rlshift') __lshift__ = _binop('lshift') __rpow__ = _binop('rpow') @@ -68,6 +73,21 @@ def opmethod(self): __abs__ = _unaryop('abs') __invert__ = _unaryop('invert') + def can_evaluate_with(self, context): + """ + Returns whether this expression can be evaluated using the given context.
+ + Parameters + ---------- + context : AxisCollection + Use axes from this collection + + Returns + ------- + bool + """ + raise NotImplementedError() + def evaluate(self, context): """ Parameters @@ -114,6 +134,13 @@ def evaluate(self, context): expr2 = expr_eval(self.expr2, context) return getattr(expr1, self.opname)(expr2) + def can_evaluate_with(self, context): + return ( + (self.expr1.can_evaluate_with(context) if isinstance(self.expr1, ExprNode) else True) + and + (self.expr2.can_evaluate_with(context) if isinstance(self.expr2, ExprNode) else True) + ) + def __repr__(self): return (f"BinaryOp({self.opname[2:-2]!r}, " f"{value_summary(self.expr1)}, " @@ -130,5 +157,8 @@ def evaluate(self, context): expr = expr_eval(self.expr, context) return getattr(expr, self.opname)() + def can_evaluate_with(self, context): + return self.expr.can_evaluate_with(context) + def __repr__(self): return f"UnaryOp({self.opname[2:-2]!r}, {value_summary(self.expr)})" diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index d543aaec9..a8cebf110 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -829,6 +829,18 @@ def test_getitem_bool_larray_key_arr_whout_bool_axis(): res = arr[X.b < 2] assert_nparray_equal(res.data, raw[:, :2]) + # using an AxisReference (ExprNode) with an Array which cannot be evaluated + # outside of the getitem context (issue #1129) + threshold = stack({'a0': 2, 'a1': 3}, 'a') + expected = Array([2, 3, 7], Axis('b_a=2_a0,3_a0,3_a1')) + res = arr[X.b >= threshold] + assert_larray_equal(res, expected) + + # same situation when the array is first (the code path is different) + expected = Array([2, 3, 7], Axis('a_b=a0_2,a0_3,a1_3')) + res = arr[threshold <= X.b] + assert_larray_equal(res, expected) + def test_getitem_bool_larray_key_arr_wh_bool_axis(): gender = Axis([False, True], 'gender') From 6f21c0ce75634fcfc65775f4e3c706220054bf74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Sun, 17 Aug 2025 
13:40:31 +0200 Subject: [PATCH 09/22] DOC: fix comment --- larray/inout/xw_excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/larray/inout/xw_excel.py b/larray/inout/xw_excel.py index e15567805..254e4b795 100644 --- a/larray/inout/xw_excel.py +++ b/larray/inout/xw_excel.py @@ -150,7 +150,7 @@ def __init__(self, filepath=None, overwrite_file=False, visible=None, silent=Non app = global_app assert isinstance(app, xw.App) - # activate XLA(M) addins, if nee + # activate XLA(M) addins, if needed # By default, add-ins are not activated when an Excel Workbook is opened via COM if load_addins: xl_app = app.api From 421db842c29d41f3dcf4dde841a2d8a3265881c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Nov 2025 12:10:37 +0100 Subject: [PATCH 10/22] PERF/FIX: removed chunking code it slowed things down in the usual case and messed some error messages --- larray/core/axis.py | 32 +------------------------------- larray/tests/test_array.py | 1 + 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 7ff953b46..a91b84f6f 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2805,8 +2805,6 @@ def _translate_axis_key(self, axis_key): """ # called from _key_to_igroups - from .array import Array - # Need to convert string keys to groups otherwise command like # >>> ndtest((5, 5)).drop('1[a0]') # will work although it shouldn't @@ -2827,35 +2825,7 @@ def _translate_axis_key(self, axis_key): else: axis_key = axis_key.labels - # TODO: do it for Group without axis too - if isinstance(axis_key, (tuple, list, np.ndarray, Array)): - axis = None - # TODO: I should actually do some benchmarks to see if this is useful, and estimate which numbers to use - # FIXME: check that size is < than key size - for size in (1, 10, 100, 1000): - # TODO: do not recheck already checked elements - key_chunk = axis_key.i[:size] if isinstance(axis_key, Array) else axis_key[:size] - try: 
- axis, ikey = self._translate_axis_key_chunk(key_chunk) - # if key is unambiguous (did not raise an exception), we know the axis - # TODO: if len(axis_key) < size, we can return axis, ikey directly - break - # TODO: we should only continue when ValueError is caused by an ambiguous key, otherwise we only delay - # an inevitable failure - except ValueError: - continue - # the (start of the) key match a single axis - if axis is not None: - # make sure we have an Axis object - # TODO: we should make sure the tkey returned from _translate_axis_key_chunk always contains a - # real Axis (and thus kill this line) - # axis = self[axis] - # wrap key in LGroup - axis_key = axis[axis_key] - # XXX: reuse tkey chunks and only translate the rest? - return self._translate_axis_key_chunk(axis_key) - else: - return self._translate_axis_key_chunk(axis_key) + return self._translate_axis_key_chunk(axis_key) def _key_to_axis_indices_dict(self, key): """ diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index a8cebf110..db0c40a13 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -1101,6 +1101,7 @@ def test_getitem_single_larray_key_guess(): assert_larray_equal(arr[key], expected) + def test_getitem_multiple_larray_key_guess(): a, b, c, d, e = ndtest((2, 3, 2, 3, 2)).axes arr = ndtest((a, b)) From 5ead3d308e187ca437a46fe6079886c5c66cdb0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Nov 2025 11:54:37 +0100 Subject: [PATCH 11/22] FIX/FEAT: fixed and improved arr[array_key_with_bad_values] error message (fixes #1134) * the code to compute the error message was itself raising an exception when the array key had ndim > 1 * mention the bad key parts if not too large --- doc/source/changes/version_0_35.rst.inc | 4 ++ larray/core/axis.py | 51 +++++++++++++++++++++---- larray/tests/test_array.py | 40 ++++++++++++++++--- 3 files changed, 82 insertions(+), 13 deletions(-) diff --git 
a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 3d4af077c..43aec8ef8 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -85,6 +85,10 @@ Miscellaneous improvements Fixes ^^^^^ +* fixed error message when trying to take a subset of an array with an array + key which has ndim > 1 and some bad values in the key. The message was also + improved (see the issue for details). Closes :issue:`1134`. + * added support for Pandas Series in :py:obj:`asarray()`. This is considered a fix because it kind of worked but silently ignored the index and name of the series (closes :issue:`895`). diff --git a/larray/core/axis.py b/larray/core/axis.py index a91b84f6f..cd06bc867 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2681,19 +2681,33 @@ def _translate_nice_key(self, axis_key): except KeyError: continue if not valid_axes: - # if the key has several labels + # transform string key to object (Group, list, slice, range, scalar) nicer_key = _to_key(axis_key) sequence_types = (tuple, list, np.ndarray, ABCArray) if (isinstance(nicer_key, sequence_types) or (isinstance(nicer_key, Group) and isinstance(nicer_key.key, sequence_types))): - # we use a different "base" message in this case (because axis_key is not really a *label*) - msg = f"{axis_key!r} is not a valid subset for any axis:\n{self._axes_summary()}" - # ... 
and check for partial matches if isinstance(nicer_key, Group): nicer_key = nicer_key.eval() - key_label_set = set(nicer_key) + + # we transform arrays with ndim > 1 to flat arrays because + # otherwise the elements are arrays themselves which are not + # hashable and thus we cannot compute a set of them + if isinstance(nicer_key, (ABCArray, np.ndarray)): + key_flat_values = nicer_key.data.flat if isinstance(nicer_key, ABCArray) else nicer_key.flat + array_key = True + msg = (f"The values of the array key:\n\n{axis_key!r}\n\n" + f"do not all correspond to labels of a single axis " + f"of the subsetted array which has the following " + f"axes:\n\n{self._axes_summary()}\n") + else: + key_flat_values = nicer_key + array_key = False + msg = (f"{axis_key!r} is not a valid subset for any axis:\n" + f"{self._axes_summary()}") + + key_label_set = set(key_flat_values) partial_matches = {} for axis in self: missing_labels = key_label_set - set(axis.labels) @@ -2704,12 +2718,33 @@ def _translate_nice_key(self, axis_key): partial_matches_str = '\n'.join( f" * axis '{self.axis_id(axis)}' contains {len(key_label_set) - len(missing_labels)}" f' out of {len(key_label_set)}' - f' labels (missing labels: {", ".join(repr(label) for label in missing_labels)})' + f' labels (labels not found: {", ".join(repr(label) for label in missing_labels)})' for axis, missing_labels in partial_matches.items() ) - msg += f"\nSome of those labels are valid though:\n{partial_matches_str}" + what = 'key values' if array_key else 'labels' + msg += f"\nSome of those {what} correspond though:\n{partial_matches_str}" + + # if we have a single partial match and an la.Array key (we + # don't do it for np.ndarray keys), we compute the problematic + # parts of the key and mention them if they are small enough + if len(partial_matches) == 1 and isinstance(nicer_key, ABCArray): + axis = next(iter(partial_matches.keys())) + is_bad_key_value = (~nicer_key.isin(axis.labels)).compact() + bad_indices_per_axis = 
is_bad_key_value.data.nonzero() + SMALL_BAD_PART_THRESHOLD = 5 + small_bad_parts_locations = [ + f" {axis.name}: " + + ' '.join(repr(label) + for label in axis.labels[axis_indices]) + for axis, axis_indices in zip(nicer_key.axes, bad_indices_per_axis) + if len(axis_indices) <= SMALL_BAD_PART_THRESHOLD + ] + if small_bad_parts_locations: + msg += ("\n\nNote that all the bad key values are " + "located within the following labels:\n") + msg += '\n'.join(small_bad_parts_locations) else: - # we have single label + # we have a single label msg = f"{axis_key!r} is not a valid label for any axis:\n{self._axes_summary()}" raise ValueError(msg) diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index db0c40a13..3ab2ed513 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -675,9 +675,13 @@ def test_getitem_guess_axis(array): # key with partial invalid list (ie list containing a label not found # on any axis) - # FIXME: this should not mention the a axis specifically (this is due to the chunking code) - with must_raise(ValueError, "a[3, 999] is not a valid label for the 'a' axis with labels: 0, 1, 2, 3, 4, 5, 6, " - "7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18"): + with must_raise(ValueError, """[3, 999] is not a valid subset for any axis: + a [19]: 0 1 2 ... 16 17 18 + b [12]: 'b0' 'b1' 'b2' ... 'b10' 'b11' 'b3' + c [2]: 'c0' 'c1' + d [6]: 'd1' 'd2' 'd3' 'd4' 'd5' 'd6' +Some of those labels correspond though: + * axis 'a' contains 1 out of 2 labels (labels not found: 999)"""): _ = array[[1, 2], [3, 999]] with must_raise(ValueError, """[999, 4] is not a valid subset for any axis: @@ -685,8 +689,8 @@ def test_getitem_guess_axis(array): b [12]: 'b0' 'b1' 'b2' ... 
'b10' 'b11' 'b3' c [2]: 'c0' 'c1' d [6]: 'd1' 'd2' 'd3' 'd4' 'd5' 'd6' -Some of those labels are valid though: - * axis 'a' contains 1 out of 2 labels (missing labels: 999)"""): +Some of those labels correspond though: + * axis 'a' contains 1 out of 2 labels (labels not found: 999)"""): _ = array[[1, 2], [999, 4]] # ambiguous key @@ -1100,6 +1104,32 @@ def test_getitem_single_larray_key_guess(): a1 b2 2 5 5 2""") assert_larray_equal(arr[key], expected) + arr = ndtest((2, 3)) + # key with invalid label (ie label not found on any axis) + key = from_string(r""" + a\b b0 b1 b2 + a0 a0 a1 a0 + a1 a1 a0 a2""").astype(str) + with must_raise(ValueError, r"""The values of the array key: + +a\b b0 b1 b2 + a0 a0 a1 a0 + a1 a1 a0 a2 + +do not all correspond to labels of a single axis of the subsetted array which has the following axes: + + a [2]: 'a0' 'a1' + b [3]: 'b0' 'b1' 'b2' + +Some of those key values correspond though: + * axis 'a' contains 2 out of 3 labels (labels not found: 'a2') + +Note that all the bad key values are located within the following labels: + a: 'a1' + b: 'b2'"""): + _ = arr[key] + + def test_getitem_multiple_larray_key_guess(): From 5cf2058a543d7200441aac1cfa04765ca0c49892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 14 May 2025 16:30:24 +0200 Subject: [PATCH 12/22] CLN: moved Array.to_frame(fold_last_axis_name) from doctest to unittest that should be an internal thing --- larray/inout/pandas.py | 17 ----------------- larray/tests/test_array.py | 6 ++++++ 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py index 3696350f5..d67fec7e8 100644 --- a/larray/inout/pandas.py +++ b/larray/inout/pandas.py @@ -205,23 +205,6 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 - - Names of the last two axes written as ``before_last_axis_name\\last_axis_name`` - - >>> df = ndtest((2, 2, 
2)).to_frame(fold_last_axis_name=True) - >>> df # doctest: +NORMALIZE_WHITESPACE - c0 c1 - a b\c - a0 b0 0 1 - b1 2 3 - a1 b0 4 5 - b1 6 7 - >>> from_frame(df, unfold_last_axis_name=True) - a b\c c0 c1 - a0 b0 0 1 - a0 b1 2 3 - a1 b0 4 5 - a1 b1 6 7 """ axes_names = [decode(name, 'utf8') if isinstance(name, bytes) else name for name in df.index.names] diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 3ab2ed513..5beb0f54d 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -4114,6 +4114,12 @@ def test_to_frame(): assert df.columns.to_list() == ['c0'] assert df.index.names == ['a', 'b'] + # fold_last_axis_name + arr = ndtest((2, 2, 2)) + df = arr.to_frame(fold_last_axis_name=True) + assert df.columns.name is None + assert df.columns.to_list() == ['c0', 'c1'] + assert df.index.names == ['a', r'b\c'] def test_from_frame(): # 1) data = scalar From b4023b24a938277e743dc90d60ec618ff21c2c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 14 May 2025 14:49:12 +0200 Subject: [PATCH 13/22] FEAT: implemented ncolaxes in Array.to_frame --- doc/source/changes/version_0_35.rst.inc | 3 ++ larray/core/array.py | 66 ++++++++++++++++++------- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 43aec8ef8..437e4df8f 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -81,6 +81,9 @@ Miscellaneous improvements >>> arr.plot.bar(stack='gender') +* :py:obj:`Array.to_frame()` gained an ``ncolaxes`` argument to control how many + axes should be used as columns (defaults to 1, as before). 
+ Fixes ^^^^^ diff --git a/larray/core/array.py b/larray/core/array.py index ee1a64d51..4502c39e1 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -1204,7 +1204,8 @@ def ipoints(self) -> ArrayPositionalPointsIndexer: return ArrayPositionalPointsIndexer(self) ipoints.__doc__ = ArrayPositionalPointsIndexer.__doc__ - def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: + def to_frame(self, fold_last_axis_name=False, + dropna=None, ncolaxes=1) -> pd.DataFrame: r""" Convert an Array into a Pandas DataFrame. @@ -1217,6 +1218,8 @@ def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: * any : if any NA values are present, drop that label * all : if all values are NA, drop that label * None by default. + ncolaxes : int, optional + Number of axes to be used as columns. Defaults to 1. Returns ------- @@ -1250,26 +1253,55 @@ def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: b1 2 3 a1 b0 4 5 b1 6 7 + >>> arr.to_frame(ncolaxes=2) # doctest: +NORMALIZE_WHITESPACE + b b0 b1 + c c0 c1 c0 c1 + a + a0 0 1 2 3 + a1 4 5 6 7 """ - last_name = self.axes[-1].name - columns_name = None if fold_last_axis_name else last_name - columns = np_array_to_pd_index(self.axes[-1].labels, name=columns_name) - if self.ndim > 1: - axes_names = self.axes.names[:-1] + if ncolaxes != 1: + if not (0 < ncolaxes < self.ndim): + raise ValueError(f"ncolaxes is {ncolaxes} but it must be " + f"0 < ncolaxes < {self.ndim} (number of " + f"dimensions)") if fold_last_axis_name: - tmp = axes_names[-1] if axes_names[-1] is not None else '' - if last_name: - axes_names[-1] = f"{tmp}\\{last_name}" - if self.ndim == 2: - index = np_array_to_pd_index(self.axes[0].labels, name=axes_names[0]) + raise ValueError("ncolaxes cannot be used in combination with" + "fold_last_axis_name=True") + + axes = list(self.axes) + if fold_last_axis_name and self.ndim > 1: + assert ncolaxes == 1 + + # the goal is to move the last axis name from the column 
index + # to the row index name (ndim=2) or last level name (ndim>2) + col_axis_name = axes[-1].name + if col_axis_name: + last_row_axis_name = axes[-2].name if axes[-2].name is not None else '' + axes[-2] = axes[-2].rename(f"{last_row_axis_name}\\{col_axis_name}") + axes[-1] = axes[-1].rename(None) + + def _axes_to_index(axes: list): + assert len(axes) > 0 + if len(axes) == 1: + return np_array_to_pd_index(axes[0].labels, name=axes[0].name) else: - index = pd.MultiIndex.from_product(self.axes.labels[:-1], names=axes_names) + return pd.MultiIndex.from_product( + [axis.labels for axis in axes], + names=[axis.name for axis in axes] + ) + + if self.ndim > 1: + row_index = _axes_to_index(axes[:-ncolaxes]) else: - index = pd.Index(['']) - if fold_last_axis_name: - index.name = self.axes.names[-1] - data = np.asarray(self).reshape((len(index), len(columns))) - df = pd.DataFrame(data, index, columns) + row_index = pd.Index(['']) + col_index = _axes_to_index(axes[-ncolaxes:]) + if fold_last_axis_name and self.ndim == 1: + row_index.name = col_index.name + col_index.name = None + + data = self.data.reshape((len(row_index), len(col_index))) + df = pd.DataFrame(data, row_index, col_index) if dropna is not None: dropna = dropna if dropna is not True else 'all' df.dropna(inplace=True, how=dropna) From f5f585c4d42aa116590fcc9960311e960893e60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 16 Oct 2024 18:04:13 +0200 Subject: [PATCH 14/22] PERF: slightly faster ipfp * avoid useless copy of the main array when display_progress is False * avoid computing the sum on the last axis at last iteration * also added a special case for ndim == 1, but that is just to avoid breaking that corner case by the other two changes rather than making it more efficient given that it will probably never be used Overall, this makes a ~7% speed improvement for tiny test arrays. No idea for larger arrays. 
--- doc/source/changes/version_0_35.rst.inc | 2 ++ larray/extra/ipfp.py | 43 +++++++++++++++---------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 437e4df8f..1c42cbe66 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -84,6 +84,8 @@ Miscellaneous improvements * :py:obj:`Array.to_frame()` gained an ``ncolaxes`` argument to control how many axes should be used as columns (defaults to 1, as before). +* made :py:obj:`ipfp()` slightly faster when display_progress is False. + Fixes ^^^^^ diff --git a/larray/extra/ipfp.py b/larray/extra/ipfp.py index 6b5bbf47a..a471f42b3 100644 --- a/larray/extra/ipfp.py +++ b/larray/extra/ipfp.py @@ -154,10 +154,10 @@ def ipfp(target_sums, a=None, axes=None, maxiter=1000, threshold=0.5, stepstoabo target_sums = [asarray(ts) for ts in target_sums] - n = len(target_sums) + ndim = len(target_sums) if axes is None: - axes = list(range(n)) + axes = list(range(ndim)) def has_anonymous_axes(a): return any(axis.name is None for axis in a.axes) @@ -252,32 +252,41 @@ def has_anonymous_axes(a): # Here is the nice version of the algorithm # for i in range(maxiter): - # for axis, axis_target in zip(axes, target_sums): - # r *= axis_target.divnot0(r.sum(axis)) - # max_sum_diff = max(abs(r.sum(axis) - axis_target).max() - # for axis, axis_target in zip(axes, target_sums)) + # for axis, axis_target_sum in zip(axes, target_sums): + # r *= axis_target_sum.divnot0(r.sum(axis)) + # max_sum_diff = max(abs(r.sum(axis) - axis_target_sum).max() + # for axis, axis_target_sum in zip(axes, target_sums)) # step_sum_improvement = ... 
- # Here is the ugly optimized version which use only numpy operations and avoids computing the sum for the first - # axis twice per iteration + # Here is the ugly optimized version which use only numpy operations and reuses the sum for the first + # axis from the previous iteration "check phase" target_sums = [axis_target.data for axis_target in target_sums] res_data = a.data.astype(float) axes_indices = [a.axes.index(axis) for axis in axes] axis0_sum = res_data.sum(axes_indices[0]) + if ndim == 1: + # When there is only one dimension, the algorithm always + # terminates after a single iteration + res_data *= np.expand_dims(divnot0(target_sums[0], axis0_sum), axes_indices[0]) + return Array(res_data, a.axes) + for i in range(maxiter): - startr = res_data.copy() + if display_progress: + startr = res_data.copy() + # r = r * target_sums[0].divnot0(axis0_sum) res_data *= np.expand_dims(divnot0(target_sums[0], axis0_sum), axes_indices[0]) - for axis_idx, axis_target in zip(axes_indices[1:], target_sums[1:]): - # r = r * axis_target.divnot0(r.sum(axis)) - res_data *= np.expand_dims(divnot0(axis_target, res_data.sum(axis_idx)), axis_idx) - - # XXX: can't we skip computing the sum and max_diff for the last axis which should be good for each - # iteration??? 
- axes_sum = [res_data.sum(axis_idx) for axis_idx in axes_indices] + for axis_target_sum, axis_idx in zip(target_sums[1:], axes_indices[1:]): + # r = r * axis_target_sum.divnot0(r.sum(axis)) + res_data *= np.expand_dims(divnot0(axis_target_sum, res_data.sum(axis_idx)), axis_idx) + + # We avoid computing the sum and max_diff for the last axis which is always equal + # to the corresponding target_sum modulo numerical inaccuracy, hence the two [:-1] + # in the 3 following lines + axes_sum = [res_data.sum(axis_idx) for axis_idx in axes_indices[:-1]] max_sum_diff = max(abs(axis_sum - axis_target).max() - for axis_sum, axis_target in zip(axes_sum, target_sums)) + for axis_sum, axis_target in zip(axes_sum, target_sums[:-1])) axis0_sum = axes_sum[0] if display_progress: From 1f71d34a017880161188444ec5070be6f4fa49c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 7 Mar 2025 10:45:28 +0100 Subject: [PATCH 15/22] FIX: fixed AxisCollection.index(X[0]) --- larray/core/axis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index cd06bc867..a95338549 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2223,13 +2223,14 @@ def index(self, axis, compatible=False) -> int: >>> col.index('sex') 1 """ + if isinstance(axis, AxisReference): + axis = axis.name + # not using an elif because name can be an int if isinstance(axis, int): if -len(self) <= axis < len(self): return axis else: raise ValueError(f"axis {axis} is not in collection") - elif isinstance(axis, AxisReference): - name = axis.name elif isinstance(axis, Axis): try: # 1) first look for that particular axis object From e749f1a39ed9dd950d3af0851f41d69438513924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 11 Oct 2024 14:50:40 +0200 Subject: [PATCH 16/22] FIX: changed type error to TypeError instead of NotImplementedError --- larray/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 4502c39e1..19deb5247 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2902,8 +2902,8 @@ def to_labelgroup(key, stack_depth=1): elif isinstance(key, (Group, int, str, list, slice)): return self.axes._guess_axis(key) else: - key_type = type(key).__name__ - raise NotImplementedError(f"{key} has invalid type ({key_type}) for a group aggregate key") + raise TypeError(f"{key} has invalid type ({type(key).__name__})" + f" for a group aggregate key") def standardise_arg(arg, stack_depth=1): if self.axes.isaxis(arg): From f5b0f2bb8e6bc4bfc583b1064c57e959c2520423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 28 Nov 2024 12:16:11 +0100 Subject: [PATCH 17/22] FEAT: added explicit check about Axis("name=labels", "name") --- larray/core/axis.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/larray/core/axis.py b/larray/core/axis.py index a95338549..8325811f3 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -102,6 +102,10 @@ def __init__(self, labels, name=None): name = name.name if isinstance(labels, str): if '=' in labels: + if name is not None: + raise ValueError("Axis(labels, name=None) cannot have " + "both a string labels with an '=' sign " + "and a value for the name argument") name, labels = [o.strip() for o in labels.split('=')] elif '..' 
not in labels and ',' not in labels: warnings.warn("Arguments 'name' and 'labels' of Axis constructor have been inverted in " From 7cbee1e26823268659d682fac747302852628046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 28 Oct 2024 15:04:18 +0100 Subject: [PATCH 18/22] SYNTAX: raise an error on Axis.id since I think (hope), nobody uses this, go directly to the exception step instead of a warning --- larray/core/axis.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 8325811f3..800234c42 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1027,13 +1027,9 @@ def index(self, key) -> Union[int, np.ndarray, slice]: translate = renamed_to(index, 'translate', raise_error=True) - # FIXME: remove id @property def id(self) -> str: - if self.name is not None: - return self.name - else: - raise ValueError('Axis has no name, so no id') + raise NotImplementedError('Axis.id is deprecated. Please use Axis.name instead.') def __str__(self) -> str: name = str(self.name) if self.name is not None else '{?}' From 58a658cfd0e58413faa7aac6b7b1d45ce78cfb6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 28 Nov 2025 16:25:09 +0100 Subject: [PATCH 19/22] DOC: improved changelog wording and links --- doc/source/changes/version_0_35.rst.inc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 1c42cbe66..db5234325 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -15,29 +15,30 @@ Syntax changes Backward incompatible changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Plots made with Array.plot() in a Python script will be shown by default, - unless either the filepath (see below) or ax arguments are used. 
Shown plots - will open a window and pause the running script until the window is closed by - the user. To revert to the previous behavior, use show=False. +* Plots made with :py:obj:`Array.plot()` in a Python script will be shown by + default, unless either the filepath (see below) or ax arguments are used. + Shown plots will open a window and pause the running script until the window + is closed by the user. To revert to the previous behavior, use show=False. New features ^^^^^^^^^^^^ -* Array.plot now has an ´animate´ argument to produce animated plots. The - argument takes an axis (it also supports several axes but that is rarely - useful) and will create an animation, with one image per label of that axis. - For example, +* :py:obj:`Array.plot()` now has an ``animate`` argument to produce animated + plots. The argument takes an axis (it also supports several axes but that is + rarely useful) and will create an animation, with one image per label of that + axis. For example, >>> arr.plot.bar(animate='year') will create an animated bar plot with one frame per year. -* implemented Array.plot `filepath` argument to save plots to a file directly, - without having to use the matplotlib API. +* implemented :py:obj:`Array.plot()` ``filepath`` argument to save plots to a + file directly, without having to use the matplotlib API. -* implemented Array.plot `show` argument to display plots directly, without - having to use the matplotlib API. This is the new default behavior. +* implemented :py:obj:`Array.plot()` ``show`` argument to display plots + directly, without having to use the matplotlib API. This is the new default + behavior, unless a ``filepath`` is given. * implemented a new kind of plot: `heatmap`. 
It can be used like this: From 2033098eae94773dee638d74e03b59d95c4e0fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 22 Jan 2025 16:33:00 +0100 Subject: [PATCH 20/22] MAINT: bump minimum Python version to 3.9 --- condarecipe/larray/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/condarecipe/larray/meta.yaml b/condarecipe/larray/meta.yaml index ea3615335..13cd7e886 100644 --- a/condarecipe/larray/meta.yaml +++ b/condarecipe/larray/meta.yaml @@ -16,11 +16,11 @@ build: requirements: host: - - python >=3.7 + - python >=3.9 - pip run: - - python >=3.7 + - python >=3.9 - numpy >=1.22 - pandas >=0.20 From 3588c643565f305cadb386de103b7ee5f5e37bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 19 Jun 2024 15:12:17 +0200 Subject: [PATCH 21/22] FEAT: implemented Session.align(other_session) (closes #501) in the process had to implement Axis.align(*axes), AxisCollection.align(*axis_collections) and Array.align(*arrays) also deprecated passing non (arrays or scalar) to Array.align and check that each aligned axis is either anonymous or have the same name than others --- doc/source/changes/version_0_35.rst.inc | 14 ++ larray/core/array.py | 54 ++++--- larray/core/axis.py | 178 +++++++++++++++++------- larray/core/session.py | 93 +++++++++++++ larray/tests/test_session.py | 48 ++++++- larray/util/misc.py | 4 + 6 files changed, 320 insertions(+), 71 deletions(-) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index db5234325..138207137 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -11,6 +11,11 @@ Syntax changes (:py:obj:`Array.plot.area()`, :py:obj:`Array.plot.bar()`, :py:obj:`Array.plot.barh()`, and :py:obj:`Array.plot.line()`). 
+* all align() methods (:py:obj:`Axis.align()`, :py:obj:`AxisCollection.align()` + and :py:obj:`Array.align()`) only take options (``join``, ``axes`` and/or + ``fill_value``) as keyword arguments. Extra positional arguments will be + considered as more objects to align (see below). + Backward incompatible changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -44,6 +49,9 @@ New features >>> arr.plot.heatmap() +* implemented :py:obj:`Session.align()` to align all the arrays in several + sessions at once. Closes :issue:`501`. + * added a feature (see the :ref:`miscellaneous section ` for details). It works on :ref:`api-axis` and :ref:`api-group` objects. @@ -87,6 +95,12 @@ Miscellaneous improvements * made :py:obj:`ipfp()` slightly faster when display_progress is False. +* all align() methods (:py:obj:`Axis.align()`, :py:obj:`AxisCollection.align()` + and :py:obj:`Array.align()`) now support aligning more than two objects at + once by passing them as positional arguments. For example: + + >>> array1.align(array2, array3, join='outer') + Fixes ^^^^^ diff --git a/larray/core/array.py b/larray/core/array.py index 19deb5247..501a65d4d 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -54,6 +54,7 @@ from larray.core.group import (Group, IGroup, LGroup, _to_key, _to_keys, _translate_sheet_name, _translate_group_key_hdf) from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis # noqa: F401 +from larray.core.axis import align_axis_collections from larray.core.plot import PlotObject from larray.util.misc import (table2str, size2str, ReprString, float_error_handler_factory, light_product, common_dtype, @@ -853,6 +854,34 @@ def np_array_to_pd_index(array, name=None, tupleize_cols=True): return pd.Index(array, dtype=dtype, name=name, tupleize_cols=tupleize_cols) +def align_arrays(values, join='outer', fill_value=nan, axes=None): + bad_values = [value for value in values + if not isinstance(value, Array) and not np.isscalar(value)] + if bad_values: +
bad_types = set(type(v) for v in bad_values) + bad_type_names = sorted(t.__name__ for t in bad_types) + raise TypeError("align only supports Arrays and scalars but got:" + f"{', '.join(bad_type_names)}") + axis_collections = [ + value.axes if isinstance(value, Array) else AxisCollection() + for value in values + ] + # fail early because reindex does not currently support anonymous axes + if any(any(name is None for name in axis_col.names) + for axis_col in axis_collections): + raise ValueError("arrays with anonymous axes are currently not " + "supported by Array.align") + try: + aligned_axis_collections = align_axis_collections(axis_collections, + join=join, axes=axes) + except ValueError as e: + raise ValueError(f"Arrays are not aligned because {e}") + return tuple(value.reindex(aligned_axes, fill_value=fill_value) + if isinstance(value, Array) + else value + for value, aligned_axes in zip(values, aligned_axis_collections)) + + class Array(ABCArray): r""" An Array object represents a multidimensional, homogeneous array of fixed-size items with labeled axes. @@ -1817,14 +1846,14 @@ def get_group(res_axes, self_axis): else: return res - def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array', 'Array']: - r"""Align two arrays on their axes with the specified join method. + def align(self, *other, join='outer', fill_value=nan, axes=None) -> Tuple['Array', 'Array']: + r"""Align array with other(s) on their axes with the specified join method. In other words, it ensure all common axes are compatible. Those arrays can then be used in binary operations. Parameters ---------- - other : Array-like + *other : Array-like join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Join method. For each axis common to both arrays: - outer: will use a label if it is in either arrays axis (ordered like the first array). 
@@ -1837,13 +1866,13 @@ def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array' Value used to fill cells corresponding to label combinations which are not common to both arrays. Defaults to NaN. axes : AxisReference or sequence of them, optional - Axes to align. Need to be valid in both arrays. Defaults to None (all common axes). This must be specified + Axes to align. Need to be valid in all arrays. Defaults to None (all common axes). This must be specified when mixing anonymous and non-anonymous axes. Returns ------- - (left, right) : (Array, Array) - Aligned objects + arrays : tuple of Array + Aligned arrays Notes ----- @@ -1989,18 +2018,11 @@ def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array' >>> arr1.align(arr2, join='exact') # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: Both arrays are not aligned because align method with join='exact' + ValueError: Arrays are not aligned because align method with join='exact' expected Axis(['a0', 'a1'], 'a') to be equal to Axis(['a0', 'a1', 'a2'], 'a') """ - other = asarray(other) - # reindex does not currently support anonymous axes - if any(name is None for name in self.axes.names) or any(name is None for name in other.axes.names): - raise ValueError("arrays with anonymous axes are currently not supported by Array.align") - try: - left_axes, right_axes = self.axes.align(other.axes, join=join, axes=axes) - except ValueError as e: - raise ValueError(f"Both arrays are not aligned because {e}") - return self.reindex(left_axes, fill_value=fill_value), other.reindex(right_axes, fill_value=fill_value) + return align_arrays((self, *other), + join=join, fill_value=fill_value, axes=axes) @deprecate_kwarg('reverse', 'ascending', {True: False, False: True}) def sort_values(self, key=None, axis=None, ascending=True) -> 'Array': diff --git a/larray/core/axis.py b/larray/core/axis.py index 800234c42..c173a82d7 100644 --- a/larray/core/axis.py 
+++ b/larray/core/axis.py @@ -17,9 +17,9 @@ from larray.util.misc import (duplicates, array_lookup2, ReprString, index_by_id, renamed_to, LHDFStore, lazy_attribute, _isnoneslice, unique_list, unique_multi, Product, argsort, has_duplicates, exactly_one, concatenate_ndarrays) +from larray.util.misc import first from larray.util.types import Scalar - np_frompyfunc = np.frompyfunc @@ -1330,12 +1330,12 @@ def difference(self, other) -> 'Axis': to_drop = set(other) return Axis([label for label in self.labels if label not in to_drop], self.name) - def align(self, other, join='outer') -> 'Axis': + def align(self, *other, join='outer') -> 'Axis': r"""Align axis with other object using specified join method. Parameters ---------- - other : Axis or label sequence + *other : Axis or label sequence join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Defaults to 'outer'. @@ -1366,22 +1366,16 @@ def align(self, other, join='outer') -> 'Axis': ValueError: align method with join='exact' expected Axis(['a0', 'a1', 'a2'], 'a') to be equal to Axis(['a1', 'a2', 'a3'], 'a') """ - assert join in {'outer', 'inner', 'left', 'right', 'exact'} - if join == 'outer': - return self.union(other) - elif join == 'inner': - return self.intersection(other) - elif join == 'left': - return self - elif join == 'right': - if not isinstance(other, Axis): - other = Axis(other) - return other - elif join == 'exact': - if not self.equals(other): - raise ValueError(f"align method with join='exact' expected {self!r} to be equal to {other!r}") - else: - return self + bad_objs = [obj for obj in other if not isinstance(obj, Axis)] + if bad_objs: + for obj in bad_objs: + obj_type = type(obj).__name__ + warnings.warn(f"aligning an Axis to a non-Axis object " + f"({obj_type}) is deprecated. 
Please convert to " + f"an Axis first.", FutureWarning, stacklevel=2) + other = [obj if isinstance(obj, Axis) else Axis(obj) + for obj in other] + return align_axes((self, *other), join=join) def to_hdf(self, filepath, key=None) -> None: r""" @@ -1462,6 +1456,50 @@ def ignore_labels(self) -> 'Axis': return Axis(len(self), self.name) +def align_axes(axes: Sequence[Axis], join: str = 'outer') -> Axis: + if not all(isinstance(axis, Axis) for axis in axes): + raise TypeError("all objects to align must be Axis objects") + + if join not in {'outer', 'inner', 'left', 'right', 'exact'}: + raise ValueError(f"join must be one of 'outer', 'inner', 'left', " + f"'right' or 'exact', got {join!r}") + + names = [axis.name for axis in axes] + first_name = first((name for name in names if name is not None), + default=None) + if first_name is not None: + if not all(name is None or name == first_name for name in names): + raise ValueError("In align, all axes must be anonymous or " + "have the same name: " + f"{', '.join(repr(name) for name in names)}") + + def join_left(axis1, axis2): + return axis1 + def join_right(axis1, axis2): + return axis2 + def join_exact(axis1, axis2): + if not axis1.equals(axis2): + raise ValueError(f"align method with join='exact' expected " + f"{axis1!r} to be equal to {axis2!r}") + else: + return axis1 + if join == 'outer': + join_labels_func = Axis.union + elif join == 'inner': + join_labels_func = Axis.intersection + elif join == 'left': + join_labels_func = join_left + elif join == 'right': + join_labels_func = join_right + else: + assert join == 'exact' + join_labels_func = join_exact + aligned_axis = axes[0] + for axis in axes[1:]: + aligned_axis = join_labels_func(aligned_axis, axis) + return aligned_axis + + def _make_axis(obj) -> Axis: if isinstance(obj, Axis): return obj @@ -3552,23 +3590,25 @@ def _prepare_split_axes(self, axes, names, sep) -> dict: split_axis = renamed_to(split_axes, 'split_axis', raise_error=True) - def align(self, other, 
join='outer', axes=None) -> Tuple['AxisCollection', 'AxisCollection']: - r"""Align this axis collection with another. + def align(self, *other, join='outer', axes=None) -> Tuple['AxisCollection']: + r"""Align this AxisCollection with (an)other AxisCollection(s). This ensures all common axes are compatible. Parameters ---------- - other : AxisCollection + *other : AxisCollection + AxisCollection(s) to align with this one. join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Defaults to 'outer'. axes : AxisReference or sequence of them, optional - Axes to align. Need to be valid in both arrays. Defaults to None (all common axes). This must be specified + Axes to align. Need to be valid in all axis collections. + Defaults to None (all common axes). This must be specified when mixing anonymous and non-anonymous axes. Returns ------- - (left, right) : (AxisCollection, AxisCollection) + tuple of AxisCollection Aligned collections See Also @@ -3631,31 +3671,20 @@ def align(self, other, join='outer', axes=None) -> Tuple['AxisCollection', 'Axis Axis(['c0'], None) ]) """ - if join not in {'outer', 'inner', 'left', 'right', 'exact'}: - raise ValueError("join should be one of 'outer', 'inner', 'left', 'right' or 'exact'") - other = other if isinstance(other, AxisCollection) else AxisCollection(other) - - # if axes not specified - if axes is None: - # and we have only anonymous axes on both sides - if all(name is None for name in self.names) and all(name is None for name in other.names): - # use N first axes by position - join_axes = list(range(min(len(self), len(other)))) - elif any(name is None for name in self.names) or any(name is None for name in other.names): - raise ValueError("axes collections with mixed anonymous/non anonymous axes are not supported by align" - "without specifying axes explicitly") - else: - assert all(name is not None for name in self.names) and all(name is not None for name in other.names) - # use all common axes - join_axes = 
list(OrderedSet(self.names) & OrderedSet(other.names)) - else: - if isinstance(axes, (int, str, Axis)): - axes = [axes] - join_axes = axes - new_axes = [self_axis.align(other_axis, join=join) - for self_axis, other_axis in zip(self[join_axes], other[join_axes])] - axes_changes = list(zip(join_axes, new_axes)) - return self.replace(axes_changes), other.replace(axes_changes) + # For backward compatibility with older code using align with a + # non-AxisCollection second argument, we only support aligning more + # than two collection when other contains actual AxisCollection objects + bad_objs = [obj for obj in other if not isinstance(obj, AxisCollection)] + if bad_objs: + for obj in bad_objs: + obj_type = type(obj).__name__ + warnings.warn(f"aligning an AxisCollection to a " + f"non-AxisCollection object ({obj_type}) is " + f"deprecated. Please convert to an AxisCollection " + f"first.", FutureWarning, stacklevel=2) + other = [AxisCollection(obj) for obj in other] + + return align_axis_collections((self, *other), join=join, axes=axes) # XXX: make this into a public method/property? AxisCollection.flat_labels[flat_indices]? 
def _flat_lookup(self, flat_indices): @@ -3802,6 +3831,57 @@ def _adv_keys_to_combined_axes(self, key, wildcard=False, sep='_'): return AxisCollection(combined_axis) +def align_axis_collections(axis_collections, join='outer', axes=None): + if join not in {'outer', 'inner', 'left', 'right', 'exact'}: + raise ValueError("join should be one of 'outer', 'inner', 'left', " + "'right' or 'exact'") + + # if axes not specified + if axes is None: + # and we have only anonymous axes + if all(name is None for col in axis_collections + for name in col.names): + # use all axes by position + max_length = max(len(col) for col in axis_collections) + join_axes_refs = list(range(max_length)) + elif any(name is None for col in axis_collections + for name in col.names): + raise ValueError( + "axes collections with mixed anonymous/non anonymous axes " + "are not supported by align without specifying axes " + "explicitly") + else: + assert all(name is not None for col in axis_collections + for name in col.names) + # use all axes by name + join_axes_refs = OrderedSet(axis_collections[0].names) + for col in axis_collections[1:]: + join_axes_refs |= OrderedSet(col.names) + else: + if isinstance(axes, (int, str, Axis)): + axes = [axes] + join_axes_refs = axes + + # first compute all aligned axes for all collections + axes_changes = { + axis_ref: align_axes([axis_col[axis_ref] + for axis_col in axis_collections + if axis_ref in axis_col], + join=join) + for axis_ref in join_axes_refs + } + + # then apply the changed axes for the collections where the axis exists + return tuple( + axis_col.replace({ + axis_ref: aligned_axis + for axis_ref, aligned_axis in axes_changes.items() + if axis_ref in axis_col + }) + for axis_col in axis_collections + ) + + class AxisReference(ABCAxisReference, ExprNode, Axis): def __init__(self, name): self.name = name diff --git a/larray/core/session.py b/larray/core/session.py index 65d05c858..d81cb1e83 100644 --- a/larray/core/session.py +++ 
b/larray/core/session.py @@ -13,7 +13,9 @@ from larray.core.axis import Axis from larray.core.constants import nan from larray.core.array import Array, get_axes, ndtest, zeros, zeros_like, sequence # noqa: F401 +from larray.core.array import align_arrays from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, size2str +from larray.util.misc import unique_list, first from larray.inout.session import ext_default_engine, get_file_handler @@ -1578,6 +1580,97 @@ def memory_used(self) -> str: """ return size2str(self.nbytes) + def align(self, *other, join='outer', fill_value=nan): + r"""Align the current session with (an)other session(s). + + Arrays from all sessions will be aligned with the corresponding arrays + in all other sessions where arrays with the same name are present. + + Non-Array objects (e.g. Axis, Group) are not aligned, but simply copied + to the resulting sessions. + + Parameters + ---------- + *other : Session + Session(s) to align with. + join : {'outer', 'inner', 'left', 'right', 'exact'}, optional + How to handle common axes when aligning arrays. + See :py:obj:`Array.align()` for details. Defaults to 'outer'. + fill_value : scalar or Array, optional + Value used to fill cells corresponding to label combinations which + are not present in an array. Defaults to NaN. + + Returns + ------- + sessions: tuple of Session + Aligned sessions. 
+ + Examples + -------- + >>> arr1 = ndtest('a=a0,a1; b=b0,b1') + >>> arr1 + a\b b0 b1 + a0 0 1 + a1 2 3 + >>> arr2 = ndtest('a=a1,a2; b=b1,b2') + >>> arr2 + a\b b1 b2 + a1 0 1 + a2 2 3 + >>> s1 = Session({'a': arr1.a, 'arr': arr1}) + >>> s2 = Session({'a': arr2.a, 'arr': arr2}) + >>> s1_aligned, s2_aligned = s1.align(s2, join='outer', fill_value=-1) + >>> s1_aligned.arr + a\b b0 b1 b2 + a0 0 1 -1 + a1 2 3 -1 + a2 -1 -1 -1 + >>> s2_aligned.arr + a\b b0 b1 b2 + a0 -1 -1 -1 + a1 -1 0 1 + a2 -1 2 3 + >>> s1_aligned.a + Axis(['a0', 'a1'], 'a') + """ + sessions = (self, other) if isinstance(other, Session) else (self, *other) + if not all(isinstance(s, Session) for s in sessions): + raise TypeError("Session.align only supports aligning with other " + "Session objects") + + seen = set() + all_keys = [] + for s in sessions: + unique_list(s.keys(), all_keys, seen) + + def rename_anonymous_axes(obj): + if not isinstance(obj, Array): + return obj + if not any(axis.name is None for axis in obj.axes): + return obj + return obj.rename({ + axis_num: axis.name + if axis.name is not None else f'axis{axis_num}' + for axis_num, axis in enumerate(obj.axes) + }) + + res_sessions = tuple(Session() for s in sessions) + for name in all_keys: + objects = [s.get(name, np.nan) for s in sessions] + first_array = first((obj for obj in objects + if isinstance(obj, Array))) + if first_array is None: + # not a single array, copy the objects as is + aligned_objects = objects + else: + # rename anonymous axes because they are not supported by align + objects = [rename_anonymous_axes(obj) for obj in objects] + aligned_objects = align_arrays(objects, join=join, + fill_value=fill_value) + for res_session, obj in zip(res_sessions, aligned_objects): + res_session[name] = obj + return res_sessions + def _exclude_private_vars(vars_dict: Dict[str, Any]) -> Dict[str, Any]: return {k: v for k, v in vars_dict.items() if not k.startswith('_')} diff --git a/larray/tests/test_session.py 
b/larray/tests/test_session.py index 5bc08a8bb..183562b3b 100644 --- a/larray/tests/test_session.py +++ b/larray/tests/test_session.py @@ -7,13 +7,18 @@ import pandas as pd import pytest -from larray.tests.common import meta -from larray.tests.common import (assert_larray_equal, assert_array_nan_equal, inputpath, - needs_xlwings, needs_pytables, needs_openpyxl, must_warn, must_raise) +from larray.tests.common import ( + meta, inputpath, + assert_larray_equal, assert_array_nan_equal, assert_larray_nan_equal, + needs_xlwings, needs_pytables, needs_openpyxl, + must_warn, must_raise +) from larray.inout.common import _supported_scalars_types -from larray import (Session, Axis, Array, Group, isnan, zeros_like, ndtest, ones_like, - ones, full, full_like, stack, local_arrays, global_arrays, arrays, CheckedSession) - +from larray import (Session, Axis, Array, Group, CheckedSession, + isnan, + zeros_like, ndtest, ones_like, ones, full, full_like, + stack, from_string, + local_arrays, global_arrays, arrays) # avoid flake8 errors meta = meta @@ -688,5 +693,36 @@ def test_stack(): assert_larray_equal(res.arr2, expected_arr2) +def test_align(): + s1 = Session(arr1=ndtest(" a=a0,a1 ;b=b0,b1"), + arr2=ndtest(" a=a0,a1 ;b=b0,b1")) + s2 = Session(arr1=ndtest(" a=a0,a1,a2;b=b0,b1"), # extra label + arr2=ndtest("c=c0,c1;a=a0,a1 ;b=b0,b1"), # extra axis + arr3=ndtest(" a=a0,a1 ;b=b0,b1")) # extra array + s3 = Session( # missing array + arr2=ndtest(" a=a0,a1 ;b= b1"), # missing label + arr3=ndtest(" b=b0,b1")) # missing axis + + al_s1, al_s2, al_s3 = s1.align(s2, s3) + + assert_larray_nan_equal(al_s1.arr1, from_string(r""" + a\b b0 b1 + a0 0.0 1.0 + a1 2.0 3.0 + a2 nan nan""")) + assert_larray_equal(al_s1.arr2, s1.arr2) # no change + assert isnan(al_s1.arr3) + + assert_larray_equal(al_s2.arr1, s2.arr1) # no change + assert_larray_equal(al_s2.arr2, s2.arr2) # no change + assert_larray_equal(al_s2.arr3, s2.arr3) # no change + + assert isnan(al_s3.arr1) + 
assert_larray_nan_equal(al_s3.arr2, from_string(r""" + a\b b0 b1 + a0 nan 0.0 + a1 nan 1.0""")) + assert_larray_equal(al_s3.arr3, s3.arr3) # no change + if __name__ == "__main__": pytest.main() diff --git a/larray/util/misc.py b/larray/util/misc.py index 5dd470029..ff6c62f22 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -1077,3 +1077,7 @@ def concatenate_ndarrays(arrays) -> np.ndarray: arrays = [np.asarray(labels, dtype=object) for labels in arrays] # TODO: try using the new dtype argument to concatenate instead of converting labels explicitly as above return np.concatenate(arrays) + + +def first(iterable, default=None): + return next(iter(iterable), default) From ceffdf84aee7e832f4238b076947b79f265500e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Nov 2025 15:36:49 +0100 Subject: [PATCH 22/22] DOC: added mention and test about Axis.intersection conserving duplicate labels --- larray/core/axis.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index c173a82d7..6c58a878a 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1265,8 +1265,8 @@ def union(self, other) -> 'Axis': def intersection(self, other) -> 'Axis': r"""Return axis with the (set) intersection of this axis labels and other labels. - In other words, this will use labels from this axis if they are also in other. Labels relative order will be - kept intact. + In other words, this will use labels from this axis if they are also in + other. Duplicate labels and labels relative order will be kept intact. Parameters ---------- @@ -1290,6 +1290,8 @@ def intersection(self, other) -> 'Axis': Axis(['a1', 'a2'], 'a') >>> a.intersection(['a1', 'a2', 'a3']) Axis(['a1', 'a2'], 'a') + >>> Axis('a=a0,a1,a0').intersection('a1,a0') + Axis(['a0', 'a1', 'a0'], 'a') """ non_string_scalar = np.isscalar(other) and not isinstance(other, str) other = [other] if non_string_scalar else _to_ticks(other)