diff --git a/condarecipe/larray/meta.yaml b/condarecipe/larray/meta.yaml index ea3615335..13cd7e886 100644 --- a/condarecipe/larray/meta.yaml +++ b/condarecipe/larray/meta.yaml @@ -16,11 +16,11 @@ build: requirements: host: - - python >=3.7 + - python >=3.9 - pip run: - - python >=3.7 + - python >=3.9 - numpy >=1.22 - pandas >=0.20 diff --git a/design.txt b/design.txt index c3037d85c..b44ff5d46 100644 --- a/design.txt +++ b/design.txt @@ -1131,6 +1131,9 @@ subset = pop.q('M, age.sum(10:20 >> yada1, 20:30 >> yada2')) # without ambiguity, that would be subset = pop.q('M, sum(10:20 >> yada1, 20:30 >> yada2')) +# this could work too: +subset = pop.q('M', sum(age[10:20] >> 'yada1', age[20:30] >> 'yada2')) + # if using a function (like .q) we could also "rename" axes on the fly. the above would create an aggregated axis # named "age" but the code below would create "toto" instead subset = pop.q('M', toto=age.sum[10:20, 20:30]) diff --git a/doc/source/changes/version_0_35.rst.inc b/doc/source/changes/version_0_35.rst.inc index 754a05e52..138207137 100644 --- a/doc/source/changes/version_0_35.rst.inc +++ b/doc/source/changes/version_0_35.rst.inc @@ -11,38 +11,47 @@ Syntax changes (:py:obj:`Array.plot.area()`, :py:obj:`Array.plot.bar()`, :py:obj:`Array.plot.barh()`, and :py:obj:`Array.plot.line()`). +* all align() methods (:py:obj:`Axis.align()`, :py:obj:`AxisCollection.align()` + and :py:obj:`Array.align()`) only take options (``join``, ``axes`` and/or + ``fill_value``) as keyword arguments. Extra positional arguments will be + considered as more objects to align (see below). + Backward incompatible changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Plots made with Array.plot() in a Python script will be shown by default, - unless either the filepath (see below) or ax arguments are used. Shown plots - will open a window and pause the running script until the window is closed by - the user. To revert to the previous behavior, use show=False. 
+* Plots made with :py:obj:`Array.plot()` in a Python script will be shown by + default, unless either the filepath (see below) or ax arguments are used. + Shown plots will open a window and pause the running script until the window + is closed by the user. To revert to the previous behavior, use show=False. New features ^^^^^^^^^^^^ -* Array.plot now has an ´animate´ argument to produce animated plots. The - argument takes an axis (it also supports several axes but that is rarely - useful) and will create an animation, with one image per label of that axis. - For example, +* :py:obj:`Array.plot()` now has an ``animate`` argument to produce animated + plots. The argument takes an axis (it also supports several axes but that is + rarely useful) and will create an animation, with one image per label of that + axis. For example, >>> arr.plot.bar(animate='year') will create an animated bar plot with one frame per year. -* implemented Array.plot `filepath` argument to save plots to a file directly, - without having to use the matplotlib API. +* implemented :py:obj:`Array.plot()` ``filepath`` argument to save plots to a + file directly, without having to use the matplotlib API. -* implemented Array.plot `show` argument to display plots directly, without - having to use the matplotlib API. This is the new default behavior. +* implemented :py:obj:`Array.plot()` ``show`` argument to display plots + directly, without having to use the matplotlib API. This is the new default + behavior, unless a ``filepath`` is given. * implemented a new kind of plot: `heatmap`. It can be used like this: >>> arr.plot.heatmap() +* implemented :py:obj:`Session.align()` to align all the arrays in several + sessions at once. Closes :issue:`501`. + * added a feature (see the :ref:`miscellaneous section ` for details). It works on :ref:`api-axis` and :ref:`api-group` objects. @@ -79,10 +88,31 @@ Miscellaneous improvements always stacking the last axis. 
For example, a plot with genders stacked could be specified as: - >>> arr.plot.bar(stacked='gender') + >>> arr.plot.bar(stack='gender') + +* :py:obj:`Array.to_frame()` gained an ``ncolaxes`` argument to control how many + axes should be used as columns (defaults to 1, as before). + +* made :py:obj:`ipfp()` slightly faster when display_progress is False. + +* all align() methods (:py:obj:`Axis.align()`, :py:obj:`AxisCollection.align()` + and :py:obj:`Array.align()`) now support aligning more than two objects at + once by passing them as positional arguments. For example: + + >>> array1.align(array2, array3, join='outer') Fixes ^^^^^ -* fixed something (closes :issue:`1`). +* fixed error message when trying to take a subset of an array with an array + key which has ndim > 1 and some bad values in the key. The message was also + improved (see the issue for details). Closes :issue:`1134`. + +* added support for Pandas Series in :py:obj:`asarray()`. This is considered a + fix because it kind of worked but silently ignored the index and name of the + series (closes :issue:`895`). + +* fixed evaluating operations involving X.axis and an array when + that operation is only valid in the context of a larger array by delaying + the evaluation until the larger array is known (closes :issue:`1129`). 
\ No newline at end of file diff --git a/larray/__init__.py b/larray/__init__.py index eb3d91aaa..eddf1ee86 100644 --- a/larray/__init__.py +++ b/larray/__init__.py @@ -11,15 +11,15 @@ from larray.core.checked import CheckedArray, CheckedSession, CheckedParameters from larray.core.constants import nan, inf, pi, e, euler_gamma from larray.core.metadata import Metadata -from larray.core.ufuncs import wrap_elementwise_array_func, maximum, minimum, where +from larray.core.ufuncs import wrap_elementwise_array_func, maximum, minimum, where, isnan, nan_to_num from larray.core.npufuncs import (sin, cos, tan, arcsin, arccos, arctan, hypot, arctan2, degrees, radians, unwrap, sinh, cosh, tanh, arcsinh, arccosh, arctanh, angle, real, imag, conj, round, around, rint, fix, floor, ceil, trunc, exp, expm1, exp2, log, log10, log2, log1p, logaddexp, logaddexp2, i0, sinc, signbit, copysign, frexp, ldexp, - convolve, clip, sqrt, absolute, fabs, sign, fmax, fmin, nan_to_num, - real_if_close, interp, isnan, isinf, inverse) + convolve, clip, sqrt, absolute, fabs, sign, fmax, fmin, + real_if_close, interp, isinf, inverse) from larray.core.misc import isscalar from larray.inout.misc import from_lists, from_string diff --git a/larray/core/array.py b/larray/core/array.py index e4aec2ea7..501a65d4d 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -50,10 +50,11 @@ from larray.core.abstractbases import ABCArray from larray.core.constants import nan, inf from larray.core.metadata import Metadata -from larray.core.expr import ExprNode +from larray.core.expr import ExprNode, BinaryOp from larray.core.group import (Group, IGroup, LGroup, _to_key, _to_keys, _translate_sheet_name, _translate_group_key_hdf) from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis # noqa: F401 +from larray.core.axis import align_axis_collections from larray.core.plot import PlotObject from larray.util.misc import (table2str, size2str, ReprString, float_error_handler_factory, 
light_product, common_dtype, @@ -814,8 +815,6 @@ def _doc_agg_method(func, by=False, long_name='', action_verb='perform', extra_a _always_return_float = {np.mean, np.nanmean, np.median, np.nanmedian, np.percentile, np.nanpercentile, np.std, np.nanstd, np.var, np.nanvar} -obj_isnan = np.vectorize(lambda x: x != x, otypes=[bool]) - def element_equal(a1, a2, rtol=0, atol=0, nan_equals=False): warnings.warn("element_equal() is deprecated. Use array1.eq(array2, rtol, atol, nan_equals) instead.", @@ -855,6 +854,34 @@ def np_array_to_pd_index(array, name=None, tupleize_cols=True): return pd.Index(array, dtype=dtype, name=name, tupleize_cols=tupleize_cols) +def align_arrays(values, join='outer', fill_value=nan, axes=None): + bad_values = [value for value in values + if not isinstance(value, Array) and not np.isscalar(value)] + if bad_values: + bad_types = set(type(v) for v in bad_values) + bad_type_names = sorted(t.__name__ for t in bad_types) + raise TypeError("align only supports Arrays and scalars but got: " + f"{', '.join(bad_type_names)}") + axis_collections = [ + value.axes if isinstance(value, Array) else AxisCollection() + for value in values + ] + # fail early because reindex does not currently support anonymous axes + if any(any(name is None for name in axis_col.names) + for axis_col in axis_collections): + raise ValueError("arrays with anonymous axes are currently not " + "supported by Array.align") + try: + aligned_axis_collections = align_axis_collections(axis_collections, + join=join, axes=axes) + except ValueError as e: + raise ValueError(f"Arrays are not aligned because {e}") + return tuple(value.reindex(aligned_axes, fill_value=fill_value) + if isinstance(value, Array) + else value + for value, aligned_axes in zip(values, aligned_axis_collections)) + + class Array(ABCArray): r""" An Array object represents a multidimensional, homogeneous array of fixed-size items with labeled axes. 
@@ -1206,7 +1233,8 @@ def ipoints(self) -> ArrayPositionalPointsIndexer: return ArrayPositionalPointsIndexer(self) ipoints.__doc__ = ArrayPositionalPointsIndexer.__doc__ - def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: + def to_frame(self, fold_last_axis_name=False, + dropna=None, ncolaxes=1) -> pd.DataFrame: r""" Convert an Array into a Pandas DataFrame. @@ -1219,6 +1247,8 @@ def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: * any : if any NA values are present, drop that label * all : if all values are NA, drop that label * None by default. + ncolaxes : int, optional + Number of axes to be used as columns. Defaults to 1. Returns ------- @@ -1252,26 +1282,55 @@ def to_frame(self, fold_last_axis_name=False, dropna=None) -> pd.DataFrame: b1 2 3 a1 b0 4 5 b1 6 7 + >>> arr.to_frame(ncolaxes=2) # doctest: +NORMALIZE_WHITESPACE + b b0 b1 + c c0 c1 c0 c1 + a + a0 0 1 2 3 + a1 4 5 6 7 """ - last_name = self.axes[-1].name - columns_name = None if fold_last_axis_name else last_name - columns = np_array_to_pd_index(self.axes[-1].labels, name=columns_name) - if self.ndim > 1: - axes_names = self.axes.names[:-1] + if ncolaxes != 1: + if not (0 < ncolaxes < self.ndim): + raise ValueError(f"ncolaxes is {ncolaxes} but it must be " + f"0 < ncolaxes < {self.ndim} (number of " + f"dimensions)") if fold_last_axis_name: - tmp = axes_names[-1] if axes_names[-1] is not None else '' - if last_name: - axes_names[-1] = f"{tmp}\\{last_name}" - if self.ndim == 2: - index = np_array_to_pd_index(self.axes[0].labels, name=axes_names[0]) + raise ValueError("ncolaxes cannot be used in combination with " + "fold_last_axis_name=True") + + axes = list(self.axes) + if fold_last_axis_name and self.ndim > 1: + assert ncolaxes == 1 + + # the goal is to move the last axis name from the column index + # to the row index name (ndim=2) or last level name (ndim>2) + col_axis_name = axes[-1].name + if col_axis_name: + last_row_axis_name = axes[-2].name if 
axes[-2].name is not None else '' + axes[-2] = axes[-2].rename(f"{last_row_axis_name}\\{col_axis_name}") + axes[-1] = axes[-1].rename(None) + + def _axes_to_index(axes: list): + assert len(axes) > 0 + if len(axes) == 1: + return np_array_to_pd_index(axes[0].labels, name=axes[0].name) else: - index = pd.MultiIndex.from_product(self.axes.labels[:-1], names=axes_names) + return pd.MultiIndex.from_product( + [axis.labels for axis in axes], + names=[axis.name for axis in axes] + ) + + if self.ndim > 1: + row_index = _axes_to_index(axes[:-ncolaxes]) else: - index = pd.Index(['']) - if fold_last_axis_name: - index.name = self.axes.names[-1] - data = np.asarray(self).reshape((len(index), len(columns))) - df = pd.DataFrame(data, index, columns) + row_index = pd.Index(['']) + col_index = _axes_to_index(axes[-ncolaxes:]) + if fold_last_axis_name and self.ndim == 1: + row_index.name = col_index.name + col_index.name = None + + data = self.data.reshape((len(row_index), len(col_index))) + df = pd.DataFrame(data, row_index, col_index) if dropna is not None: dropna = dropna if dropna is not True else 'all' df.dropna(inplace=True, how=dropna) @@ -1720,6 +1779,14 @@ def is_axis_def(axis_def): return ((isinstance(axis_def, str) and '=' in axis_def) or isinstance(axis_def, Group)) + def axes_refs_and_defs_to_axes(axes_to_reindex: dict): + new_axes_to_reindex = {} + for k, v in axes_to_reindex.items(): + src_axis = axis_ref_to_axis(self.axes, k) + dst_axis = labels_def_and_name_to_axis(v, src_axis.name) + new_axes_to_reindex[src_axis] = dst_axis + return new_axes_to_reindex + if new_axis is None: if isinstance(axes_to_reindex, Axis) and not isinstance(axes_to_reindex, AxisReference): axes_to_reindex = {axes_to_reindex: axes_to_reindex} @@ -1757,14 +1824,10 @@ def is_axis_def(axis_def): else: # TODO: move this to AxisCollection.replace if isinstance(axes_to_reindex, dict): - new_axes_to_reindex = {} - for k, v in axes_to_reindex.items(): - src_axis = axis_ref_to_axis(self.axes, k) - 
dst_axis = labels_def_and_name_to_axis(v, src_axis.name) - new_axes_to_reindex[src_axis] = dst_axis - axes_to_reindex = new_axes_to_reindex - - res_axes = self.axes.replace(axes_to_reindex, **kwargs) + axes_to_reindex = axes_refs_and_defs_to_axes(axes_to_reindex) + res_axes = self.axes.replace(axes_to_reindex) + if kwargs: + res_axes = res_axes.replace(axes_refs_and_defs_to_axes(kwargs)) res = full(res_axes, fill_value, dtype=common_dtype((self.data, fill_value))) def get_group(res_axes, self_axis): @@ -1783,14 +1846,14 @@ def get_group(res_axes, self_axis): else: return res - def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array', 'Array']: - r"""Align two arrays on their axes with the specified join method. + def align(self, *other, join='outer', fill_value=nan, axes=None) -> Tuple['Array', 'Array']: + r"""Align array with other(s) on their axes with the specified join method. In other words, it ensure all common axes are compatible. Those arrays can then be used in binary operations. Parameters ---------- - other : Array-like + *other : Array-like join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Join method. For each axis common to both arrays: - outer: will use a label if it is in either arrays axis (ordered like the first array). @@ -1803,13 +1866,13 @@ def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array' Value used to fill cells corresponding to label combinations which are not common to both arrays. Defaults to NaN. axes : AxisReference or sequence of them, optional - Axes to align. Need to be valid in both arrays. Defaults to None (all common axes). This must be specified + Axes to align. Need to be valid in all arrays. Defaults to None (all common axes). This must be specified when mixing anonymous and non-anonymous axes. 
Returns ------- - (left, right) : (Array, Array) - Aligned objects + arrays : tuple of Array + Aligned arrays Notes ----- @@ -1955,18 +2018,11 @@ def align(self, other, join='outer', fill_value=nan, axes=None) -> Tuple['Array' >>> arr1.align(arr2, join='exact') # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: Both arrays are not aligned because align method with join='exact' + ValueError: Arrays are not aligned because align method with join='exact' expected Axis(['a0', 'a1'], 'a') to be equal to Axis(['a0', 'a1', 'a2'], 'a') """ - other = asarray(other) - # reindex does not currently support anonymous axes - if any(name is None for name in self.axes.names) or any(name is None for name in other.axes.names): - raise ValueError("arrays with anonymous axes are currently not supported by Array.align") - try: - left_axes, right_axes = self.axes.align(other.axes, join=join, axes=axes) - except ValueError as e: - raise ValueError(f"Both arrays are not aligned because {e}") - return self.reindex(left_axes, fill_value=fill_value), other.reindex(right_axes, fill_value=fill_value) + return align_arrays((self, *other), + join=join, fill_value=fill_value, axes=axes) @deprecate_kwarg('reverse', 'ascending', {True: False, False: True}) def sort_values(self, key=None, axis=None, ascending=True) -> 'Array': @@ -2868,8 +2924,8 @@ def to_labelgroup(key, stack_depth=1): elif isinstance(key, (Group, int, str, list, slice)): return self.axes._guess_axis(key) else: - key_type = type(key).__name__ - raise NotImplementedError(f"{key} has invalid type ({key_type}) for a group aggregate key") + raise TypeError(f"{key} has invalid type ({type(key).__name__})" + f" for a group aggregate key") def standardise_arg(arg, stack_depth=1): if self.axes.isaxis(arg): @@ -5555,8 +5611,15 @@ def _binop(opname): super_method = getattr(np.ndarray, fullname) def opmethod(self, other) -> 'Array': + # we could implement this more cleanly by returning + # NotImplemented in 
this case and letting the ExprNode reverse + # op (r*) handle it, but this can change the result axes order + # so I am unsure about that. if isinstance(other, ExprNode): - other = other.evaluate(self.axes) + if other.can_evaluate_with(self.axes): + other = other.evaluate(self.axes) + else: + return BinaryOp(opname, self, other) # XXX: unsure what happens for non scalar Groups. # we might want to be more general than this and .eval all Groups? @@ -6023,24 +6086,15 @@ def eq(self, other, rtol=0, atol=0, nans_equal=False) -> 'Array': """ other = asarray(other) + (self_data, other_data), res_axes = raw_broadcastable([self, other]) if rtol == 0 and atol == 0: - if not nans_equal: - return self == other - else: - from larray.core.npufuncs import isnan - - def general_isnan(a): - if issubclass(a.dtype.type, np.inexact): - return isnan(a) - elif a.dtype.type is np.object_: - return Array(obj_isnan(a), a.axes) - else: - return False - - return (self == other) | (general_isnan(self) & general_isnan(other)) + res_data = self_data == other_data + if nans_equal: + res_data |= (self_data != self_data) & (other_data != other_data) else: - (a1_data, a2_data), res_axes = raw_broadcastable([self, other]) - return Array(np.isclose(a1_data, a2_data, rtol=rtol, atol=atol, equal_nan=nans_equal), res_axes) + res_data = np.isclose(self_data, other_data, + rtol=rtol, atol=atol, equal_nan=nans_equal) + return Array(res_data, res_axes) def isin(self, test_values, assume_unique=False, invert=False) -> 'Array': r""" @@ -8358,6 +8412,9 @@ def asarray(a, meta=None) -> Array: elif isinstance(a, pd.DataFrame): from larray.inout.pandas import from_frame return from_frame(a, meta=meta) + elif isinstance(a, pd.Series): + from larray.inout.pandas import from_series + return from_series(a, meta=meta) else: return Array(a, meta=meta) diff --git a/larray/core/axis.py b/larray/core/axis.py index 80b2664bc..6c58a878a 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -17,9 +17,9 @@ from 
larray.util.misc import (duplicates, array_lookup2, ReprString, index_by_id, renamed_to, LHDFStore, lazy_attribute, _isnoneslice, unique_list, unique_multi, Product, argsort, has_duplicates, exactly_one, concatenate_ndarrays) +from larray.util.misc import first from larray.util.types import Scalar - np_frompyfunc = np.frompyfunc @@ -102,6 +102,10 @@ def __init__(self, labels, name=None): name = name.name if isinstance(labels, str): if '=' in labels: + if name is not None: + raise ValueError("Axis(labels, name=None) cannot have " + "both a string labels with an '=' sign " + "and a value for the name argument") name, labels = [o.strip() for o in labels.split('=')] elif '..' not in labels and ',' not in labels: warnings.warn("Arguments 'name' and 'labels' of Axis constructor have been inverted in " @@ -1023,13 +1027,9 @@ def index(self, key) -> Union[int, np.ndarray, slice]: translate = renamed_to(index, 'translate', raise_error=True) - # FIXME: remove id @property def id(self) -> str: - if self.name is not None: - return self.name - else: - raise ValueError('Axis has no name, so no id') + raise NotImplementedError('Axis.id is deprecated. Please use Axis.name instead.') def __str__(self) -> str: name = str(self.name) if self.name is not None else '{?}' @@ -1265,8 +1265,8 @@ def union(self, other) -> 'Axis': def intersection(self, other) -> 'Axis': r"""Return axis with the (set) intersection of this axis labels and other labels. - In other words, this will use labels from this axis if they are also in other. Labels relative order will be - kept intact. + In other words, this will use labels from this axis if they are also in + other. Duplicate labels and labels relative order will be kept intact. 
Parameters ---------- @@ -1290,6 +1290,8 @@ def intersection(self, other) -> 'Axis': Axis(['a1', 'a2'], 'a') >>> a.intersection(['a1', 'a2', 'a3']) Axis(['a1', 'a2'], 'a') + >>> Axis('a=a0,a1,a0').intersection('a1,a0') + Axis(['a0', 'a1', 'a0'], 'a') """ non_string_scalar = np.isscalar(other) and not isinstance(other, str) other = [other] if non_string_scalar else _to_ticks(other) @@ -1330,12 +1332,12 @@ def difference(self, other) -> 'Axis': to_drop = set(other) return Axis([label for label in self.labels if label not in to_drop], self.name) - def align(self, other, join='outer') -> 'Axis': + def align(self, *other, join='outer') -> 'Axis': r"""Align axis with other object using specified join method. Parameters ---------- - other : Axis or label sequence + *other : Axis or label sequence join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Defaults to 'outer'. @@ -1366,22 +1368,16 @@ def align(self, other, join='outer') -> 'Axis': ValueError: align method with join='exact' expected Axis(['a0', 'a1', 'a2'], 'a') to be equal to Axis(['a1', 'a2', 'a3'], 'a') """ - assert join in {'outer', 'inner', 'left', 'right', 'exact'} - if join == 'outer': - return self.union(other) - elif join == 'inner': - return self.intersection(other) - elif join == 'left': - return self - elif join == 'right': - if not isinstance(other, Axis): - other = Axis(other) - return other - elif join == 'exact': - if not self.equals(other): - raise ValueError(f"align method with join='exact' expected {self!r} to be equal to {other!r}") - else: - return self + bad_objs = [obj for obj in other if not isinstance(obj, Axis)] + if bad_objs: + for obj in bad_objs: + obj_type = type(obj).__name__ + warnings.warn(f"aligning an Axis to a non-Axis object " + f"({obj_type}) is deprecated. 
Please convert to " + f"an Axis first.", FutureWarning, stacklevel=2) + other = [obj if isinstance(obj, Axis) else Axis(obj) + for obj in other] + return align_axes((self, *other), join=join) def to_hdf(self, filepath, key=None) -> None: r""" @@ -1462,6 +1458,50 @@ def ignore_labels(self) -> 'Axis': return Axis(len(self), self.name) +def align_axes(axes: Sequence[Axis], join: str = 'outer') -> Axis: + if not all(isinstance(axis, Axis) for axis in axes): + raise TypeError("all objects to align must be Axis objects") + + if join not in {'outer', 'inner', 'left', 'right', 'exact'}: + raise ValueError(f"join must be one of 'outer', 'inner', 'left', " + f"'right' or 'exact', got {join!r}") + + names = [axis.name for axis in axes] + first_name = first((name for name in names if name is not None), + default=None) + if first_name is not None: + if not all(name is None or name == first_name for name in names): + raise ValueError("In align, all axes must be anonymous or " + "have the same name: " + f"{', '.join(repr(name) for name in names)}") + + def join_left(axis1, axis2): + return axis1 + def join_right(axis1, axis2): + return axis2 + def join_exact(axis1, axis2): + if not axis1.equals(axis2): + raise ValueError(f"align method with join='exact' expected " + f"{axis1!r} to be equal to {axis2!r}") + else: + return axis1 + if join == 'outer': + join_labels_func = Axis.union + elif join == 'inner': + join_labels_func = Axis.intersection + elif join == 'left': + join_labels_func = join_left + elif join == 'right': + join_labels_func = join_right + else: + assert join == 'exact' + join_labels_func = join_exact + aligned_axis = axes[0] + for axis in axes[1:]: + aligned_axis = join_labels_func(aligned_axis, axis) + return aligned_axis + + def _make_axis(obj) -> Axis: if isinstance(obj, Axis): return obj @@ -2223,13 +2263,14 @@ def index(self, axis, compatible=False) -> int: >>> col.index('sex') 1 """ + if isinstance(axis, AxisReference): + axis = axis.name + # not using an elif 
because name can be an int if isinstance(axis, int): if -len(self) <= axis < len(self): return axis else: raise ValueError(f"axis {axis} is not in collection") - elif isinstance(axis, AxisReference): - name = axis.name elif isinstance(axis, Axis): try: # 1) first look for that particular axis object @@ -2681,19 +2722,33 @@ def _translate_nice_key(self, axis_key): except KeyError: continue if not valid_axes: - # if the key has several labels + # transform string key to object (Group, list, slice, range, scalar) nicer_key = _to_key(axis_key) sequence_types = (tuple, list, np.ndarray, ABCArray) if (isinstance(nicer_key, sequence_types) or (isinstance(nicer_key, Group) and isinstance(nicer_key.key, sequence_types))): - # we use a different "base" message in this case (because axis_key is not really a *label*) - msg = f"{axis_key!r} is not a valid subset for any axis:\n{self._axes_summary()}" - # ... and check for partial matches if isinstance(nicer_key, Group): nicer_key = nicer_key.eval() - key_label_set = set(nicer_key) + + # we transform arrays with ndim > 1 to flat arrays because + # otherwise the elements are arrays themselves which are not + # hashable and thus we cannot compute a set of them + if isinstance(nicer_key, (ABCArray, np.ndarray)): + key_flat_values = nicer_key.data.flat if isinstance(nicer_key, ABCArray) else nicer_key.flat + array_key = True + msg = (f"The values of the array key:\n\n{axis_key!r}\n\n" + f"do not all correspond to labels of a single axis " + f"of the subsetted array which has the following " + f"axes:\n\n{self._axes_summary()}\n") + else: + key_flat_values = nicer_key + array_key = False + msg = (f"{axis_key!r} is not a valid subset for any axis:\n" + f"{self._axes_summary()}") + + key_label_set = set(key_flat_values) partial_matches = {} for axis in self: missing_labels = key_label_set - set(axis.labels) @@ -2704,12 +2759,33 @@ def _translate_nice_key(self, axis_key): partial_matches_str = '\n'.join( f" * axis '{self.axis_id(axis)}' 
contains {len(key_label_set) - len(missing_labels)}" f' out of {len(key_label_set)}' - f' labels (missing labels: {", ".join(repr(label) for label in missing_labels)})' + f' labels (labels not found: {", ".join(repr(label) for label in missing_labels)})' for axis, missing_labels in partial_matches.items() ) - msg += f"\nSome of those labels are valid though:\n{partial_matches_str}" + what = 'key values' if array_key else 'labels' + msg += f"\nSome of those {what} correspond though:\n{partial_matches_str}" + + # if we have a single partial match and an la.Array key (we + # don't do it for np.ndarray keys), we compute the problematic + # parts of the key and mention them if they are small enough + if len(partial_matches) == 1 and isinstance(nicer_key, ABCArray): + axis = next(iter(partial_matches.keys())) + is_bad_key_value = (~nicer_key.isin(axis.labels)).compact() + bad_indices_per_axis = is_bad_key_value.data.nonzero() + SMALL_BAD_PART_THRESHOLD = 5 + small_bad_parts_locations = [ + f" {axis.name}: " + + ' '.join(repr(label) + for label in axis.labels[axis_indices]) + for axis, axis_indices in zip(nicer_key.axes, bad_indices_per_axis) + if len(axis_indices) <= SMALL_BAD_PART_THRESHOLD + ] + if small_bad_parts_locations: + msg += ("\n\nNote that all the bad key values are " + "located within the following labels:\n") + msg += '\n'.join(small_bad_parts_locations) else: - # we have single label + # we have a single label msg = f"{axis_key!r} is not a valid label for any axis:\n{self._axes_summary()}" raise ValueError(msg) @@ -2805,8 +2881,6 @@ def _translate_axis_key(self, axis_key): """ # called from _key_to_igroups - from .array import Array - # Need to convert string keys to groups otherwise command like # >>> ndtest((5, 5)).drop('1[a0]') # will work although it shouldn't @@ -2827,35 +2901,7 @@ def _translate_axis_key(self, axis_key): else: axis_key = axis_key.labels - # TODO: do it for Group without axis too - if isinstance(axis_key, (tuple, list, np.ndarray, 
Array)): - axis = None - # TODO: I should actually do some benchmarks to see if this is useful, and estimate which numbers to use - # FIXME: check that size is < than key size - for size in (1, 10, 100, 1000): - # TODO: do not recheck already checked elements - key_chunk = axis_key.i[:size] if isinstance(axis_key, Array) else axis_key[:size] - try: - axis, ikey = self._translate_axis_key_chunk(key_chunk) - # if key is unambiguous (did not raise an exception), we know the axis - # TODO: if len(axis_key) < size, we can return axis, ikey directly - break - # TODO: we should only continue when ValueError is caused by an ambiguous key, otherwise we only delay - # an inevitable failure - except ValueError: - continue - # the (start of the) key match a single axis - if axis is not None: - # make sure we have an Axis object - # TODO: we should make sure the tkey returned from _translate_axis_key_chunk always contains a - # real Axis (and thus kill this line) - # axis = self[axis] - # wrap key in LGroup - axis_key = axis[axis_key] - # XXX: reuse tkey chunks and only translate the rest? - return self._translate_axis_key_chunk(axis_key) - else: - return self._translate_axis_key_chunk(axis_key) + return self._translate_axis_key_chunk(axis_key) def _key_to_axis_indices_dict(self, key): """ @@ -3546,23 +3592,25 @@ def _prepare_split_axes(self, axes, names, sep) -> dict: split_axis = renamed_to(split_axes, 'split_axis', raise_error=True) - def align(self, other, join='outer', axes=None) -> Tuple['AxisCollection', 'AxisCollection']: - r"""Align this axis collection with another. + def align(self, *other, join='outer', axes=None) -> Tuple['AxisCollection']: + r"""Align this AxisCollection with (an)other AxisCollection(s). This ensures all common axes are compatible. Parameters ---------- - other : AxisCollection + *other : AxisCollection + AxisCollection(s) to align with this one. join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Defaults to 'outer'. 
axes : AxisReference or sequence of them, optional - Axes to align. Need to be valid in both arrays. Defaults to None (all common axes). This must be specified + Axes to align. Need to be valid in all axis collections. + Defaults to None (all common axes). This must be specified when mixing anonymous and non-anonymous axes. Returns ------- - (left, right) : (AxisCollection, AxisCollection) + tuple of AxisCollection Aligned collections See Also @@ -3625,31 +3673,20 @@ def align(self, other, join='outer', axes=None) -> Tuple['AxisCollection', 'Axis Axis(['c0'], None) ]) """ - if join not in {'outer', 'inner', 'left', 'right', 'exact'}: - raise ValueError("join should be one of 'outer', 'inner', 'left', 'right' or 'exact'") - other = other if isinstance(other, AxisCollection) else AxisCollection(other) - - # if axes not specified - if axes is None: - # and we have only anonymous axes on both sides - if all(name is None for name in self.names) and all(name is None for name in other.names): - # use N first axes by position - join_axes = list(range(min(len(self), len(other)))) - elif any(name is None for name in self.names) or any(name is None for name in other.names): - raise ValueError("axes collections with mixed anonymous/non anonymous axes are not supported by align" - "without specifying axes explicitly") - else: - assert all(name is not None for name in self.names) and all(name is not None for name in other.names) - # use all common axes - join_axes = list(OrderedSet(self.names) & OrderedSet(other.names)) - else: - if isinstance(axes, (int, str, Axis)): - axes = [axes] - join_axes = axes - new_axes = [self_axis.align(other_axis, join=join) - for self_axis, other_axis in zip(self[join_axes], other[join_axes])] - axes_changes = list(zip(join_axes, new_axes)) - return self.replace(axes_changes), other.replace(axes_changes) + # For backward compatibility with older code using align with a + # non-AxisCollection second argument, we only support aligning more + # than 
two collection when other contains actual AxisCollection objects + bad_objs = [obj for obj in other if not isinstance(obj, AxisCollection)] + if bad_objs: + for obj in bad_objs: + obj_type = type(obj).__name__ + warnings.warn(f"aligning an AxisCollection to a " + f"non-AxisCollection object ({obj_type}) is " + f"deprecated. Please convert to an AxisCollection " + f"first.", FutureWarning, stacklevel=2) + other = [AxisCollection(obj) for obj in other] + + return align_axis_collections((self, *other), join=join, axes=axes) # XXX: make this into a public method/property? AxisCollection.flat_labels[flat_indices]? def _flat_lookup(self, flat_indices): @@ -3796,6 +3833,57 @@ def _adv_keys_to_combined_axes(self, key, wildcard=False, sep='_'): return AxisCollection(combined_axis) +def align_axis_collections(axis_collections, join='outer', axes=None): + if join not in {'outer', 'inner', 'left', 'right', 'exact'}: + raise ValueError("join should be one of 'outer', 'inner', 'left', " + "'right' or 'exact'") + + # if axes not specified + if axes is None: + # and we have only anonymous axes + if all(name is None for col in axis_collections + for name in col.names): + # use all axes by position + max_length = max(len(col) for col in axis_collections) + join_axes_refs = list(range(max_length)) + elif any(name is None for col in axis_collections + for name in col.names): + raise ValueError( + "axes collections with mixed anonymous/non anonymous axes " + "are not supported by align without specifying axes " + "explicitly") + else: + assert all(name is not None for col in axis_collections + for name in col.names) + # use all axes by name + join_axes_refs = OrderedSet(axis_collections[0].names) + for col in axis_collections[1:]: + join_axes_refs |= OrderedSet(col.names) + else: + if isinstance(axes, (int, str, Axis)): + axes = [axes] + join_axes_refs = axes + + # first compute all aligned axes for all collections + axes_changes = { + axis_ref: align_axes([axis_col[axis_ref] + for 
axis_col in axis_collections + if axis_ref in axis_col], + join=join) + for axis_ref in join_axes_refs + } + + # then apply the changed axes for the collections where the axis exists + return tuple( + axis_col.replace({ + axis_ref: aligned_axis + for axis_ref, aligned_axis in axes_changes.items() + if axis_ref in axis_col + }) + for axis_col in axis_collections + ) + + class AxisReference(ABCAxisReference, ExprNode, Axis): def __init__(self, name): self.name = name @@ -3817,6 +3905,9 @@ def evaluate(self, context) -> Axis: """ return context[self.name] + def can_evaluate_with(self, context) -> set: # set[int] + return self.name in context + # Use the default hash. We have to specify it explicitly because we define __eq__ via ExprNode and # ExprNode.__hash__ (which is not set explicitly) takes precedence over Axis.__hash__ __hash__ = object.__hash__ diff --git a/larray/core/expr.py b/larray/core/expr.py index 92aebb2ba..0e4e98aea 100644 --- a/larray/core/expr.py +++ b/larray/core/expr.py @@ -1,4 +1,6 @@ -from larray.core.abstractbases import ABCArray +import numpy as np + +from larray.core.abstractbases import ABCAxisReference, ABCAxis, ABCArray class ExprNode: @@ -7,9 +9,13 @@ def __bool__(self): # method factory def _binop(opname): + # reversed = opname.startswith('r') and opname != 'rshift' def opmethod(self, other): - # evaluate eagerly when possible - if isinstance(other, ABCArray): + assert isinstance(self, ExprNode), \ + (f"Expected ExprNode, got {type(self).__name__} " + f"({self=} {other=})") + if (isinstance(other, ABCArray) and + self.can_evaluate_with(other.axes)): self_value = self.evaluate(other.axes) return getattr(self_value, f'__{opname}__')(other) else: @@ -18,6 +24,7 @@ def opmethod(self, other): opmethod.__name__ = f'__{opname}__' return opmethod + __rmatmul__ = _binop('rmatmul') __matmul__ = _binop('matmul') __ror__ = _binop('ror') __or__ = _binop('or') @@ -26,7 +33,7 @@ def opmethod(self, other): __rand__ = _binop('rand') __and__ = 
_binop('and') __rrshift__ = _binop('rrshift') - __rshift__ = _binop('rshift') + __rshift__ = _binop('rshift') # not reverse even though it starts with 'r' __rlshift__ = _binop('rlshift') __lshift__ = _binop('lshift') __rpow__ = _binop('rpow') @@ -66,6 +73,21 @@ def opmethod(self): __abs__ = _unaryop('abs') __invert__ = _unaryop('invert') + def can_evaluate_with(self, context): + """ + Returns whether this expression can be evaluated using the given context. + + Parameters + ---------- + context : AxisCollection + Use axes from this collection + + Returns + ------- + bool + """ + raise NotImplementedError() + def evaluate(self, context): """ Parameters @@ -81,6 +103,25 @@ def expr_eval(expr, context): return expr.evaluate(context) if isinstance(expr, ExprNode) else expr +def value_summary(value): + if isinstance(value, ABCArray): + axes = value.axes + axes_info = ' x '.join(f'{name} ({length})' + for name, length + in zip(axes.display_names, axes.shape)) + return f"Array(<{axes_info}>)" + elif isinstance(value, ABCAxisReference): + return f"X.{value.name}" + elif isinstance(value, ABCAxis): + return f"Axis(<{value.name} ({len(value)})>)" + elif isinstance(value, ExprNode): + return repr(value) + else: + assert np.isscalar(value), (f"Expected scalar value, " + f"got {type(value).__name__}") + return repr(value) + + class BinaryOp(ExprNode): def __init__(self, op, expr1, expr2): self.opname = f'__{op}__' @@ -93,8 +134,17 @@ def evaluate(self, context): expr2 = expr_eval(self.expr2, context) return getattr(expr1, self.opname)(expr2) + def can_evaluate_with(self, context): + return ( + (self.expr1.can_evaluate_with(context) if isinstance(self.expr1, ExprNode) else True) + and + (self.expr2.can_evaluate_with(context) if isinstance(self.expr2, ExprNode) else True) + ) + def __repr__(self): - return f"BinaryOp({self.opname[2:-2]!r}, {self.expr1!r}, {self.expr2!r})" + return (f"BinaryOp({self.opname[2:-2]!r}, " + f"{value_summary(self.expr1)}, " +
f"{value_summary(self.expr2)})") class UnaryOp(ExprNode): @@ -107,5 +157,8 @@ def evaluate(self, context): expr = expr_eval(self.expr, context) return getattr(expr, self.opname)() + def can_evaluate_with(self, context): + return self.expr.can_evaluate_with(context) + def __repr__(self): - return f"UnaryOp({self.opname[2:-2]!r}, {self.expr!r})" + return f"UnaryOp({self.opname[2:-2]!r}, {value_summary(self.expr)})" diff --git a/larray/core/npufuncs.py b/larray/core/npufuncs.py index bd40ad067..5d8fb5a58 100644 --- a/larray/core/npufuncs.py +++ b/larray/core/npufuncs.py @@ -118,12 +118,10 @@ sign = wrap_numpy_func(np.sign) fmax = wrap_numpy_func(np.fmax) fmin = wrap_numpy_func(np.fmin) -nan_to_num = wrap_numpy_func(np.nan_to_num) real_if_close = wrap_numpy_func(np.real_if_close) # TODO: add examples for functions below sqrt = wrap_numpy_func(np.sqrt) -isnan = wrap_numpy_func(np.isnan) isinf = wrap_numpy_func(np.isinf) inverse = wrap_numpy_func(np.linalg.inv) diff --git a/larray/core/session.py b/larray/core/session.py index 65d05c858..d81cb1e83 100644 --- a/larray/core/session.py +++ b/larray/core/session.py @@ -13,7 +13,9 @@ from larray.core.axis import Axis from larray.core.constants import nan from larray.core.array import Array, get_axes, ndtest, zeros, zeros_like, sequence # noqa: F401 +from larray.core.array import align_arrays from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, size2str +from larray.util.misc import unique_list, first from larray.inout.session import ext_default_engine, get_file_handler @@ -1578,6 +1580,97 @@ def memory_used(self) -> str: """ return size2str(self.nbytes) + def align(self, *other, join='outer', fill_value=nan): + r"""Align the current session with (an)other session(s) + + Arrays from all sessions will be aligned with the corresponding arrays + in all other sessions where arrays with the same name are present. + + Non-Array objects (eg. 
Axis, Group) are not aligned, but simply copied + to the resulting sessions. + + Parameters + ---------- + *other : Session + Session(s) to align with. + join : {'outer', 'inner', 'left', 'right', 'exact'}, optional + How to handle common axes when aligning arrays. + See :py:obj:`Array.align()` for details. Defaults to 'outer'. + fill_value : scalar or Array, optional + Value used to fill cells corresponding to label combinations which + are not present in an array. Defaults to NaN. + + Returns + ------- + sessions: tuple of Session + Aligned sessions. + + Examples + -------- + >>> arr1 = ndtest('a=a0,a1; b=b0,b1') + >>> arr1 + a\b b0 b1 + a0 0 1 + a1 2 3 + >>> arr2 = ndtest('a=a1,a2; b=b1,b2') + >>> arr2 + a\b b1 b2 + a1 0 1 + a2 2 3 + >>> s1 = Session({'a': arr1.a, 'arr': arr1}) + >>> s2 = Session({'a': arr2.a, 'arr': arr2}) + >>> s1_aligned, s2_aligned = s1.align(s2, join='outer', fill_value=-1) + >>> s1_aligned.arr + a\b b0 b1 b2 + a0 0 1 -1 + a1 2 3 -1 + a2 -1 -1 -1 + >>> s2_aligned.arr + a\b b0 b1 b2 + a0 -1 -1 -1 + a1 -1 0 1 + a2 -1 2 3 + >>> s1_aligned.a + Axis(['a0', 'a1'], 'a') + """ + sessions = (self, other) if isinstance(other, Session) else (self, *other) + if not all(isinstance(s, Session) for s in sessions): + raise TypeError("Session.align only supports aligning with other " + "Session objects") + + seen = set() + all_keys = [] + for s in sessions: + unique_list(s.keys(), all_keys, seen) + + def rename_anonymous_axes(obj): + if not isinstance(obj, Array): + return obj + if not any(axis.name is None for axis in obj.axes): + return obj + return obj.rename({ + axis_num: axis.name + if axis.name is not None else f'axis{axis_num}' + for axis_num, axis in enumerate(obj.axes) + }) + + res_sessions = tuple(Session() for s in sessions) + for name in all_keys: + objects = [s.get(name, np.nan) for s in sessions] + first_array = first((obj for obj in objects + if isinstance(obj, Array))) + if first_array is None: + # not a single array, copy the objects as is 
+ aligned_objects = objects + else: + # rename anonymous axes because they are not supported by align + objects = [rename_anonymous_axes(obj) for obj in objects] + aligned_objects = align_arrays(objects, join=join, + fill_value=fill_value) + for res_session, obj in zip(res_sessions, aligned_objects): + res_session[name] = obj + return res_sessions + def _exclude_private_vars(vars_dict: Dict[str, Any]) -> Dict[str, Any]: return {k: v for k, v in vars_dict.items() if not k.startswith('_')} diff --git a/larray/core/ufuncs.py b/larray/core/ufuncs.py index 55eb075f2..481285a26 100644 --- a/larray/core/ufuncs.py +++ b/larray/core/ufuncs.py @@ -300,3 +300,189 @@ def wrap_numpy_func(func, doc=None): a0 6 2 5 0 a1 5 2 3 0 """) + +def _generalized_isnan(arr, out=None, where=True, **kwargs): + if isinstance(arr, np.ndarray) and arr.dtype.kind == 'O': + if out is not None or where is not True or kwargs: + raise ValueError("The 'out', 'where' and other keyword arguments " + "are not supported for object arrays.") + return arr != arr + else: + return np.isnan(arr, out=out, where=where, **kwargs) + +isnan = wrap_elementwise_array_func(_generalized_isnan, r""" +Test element-wise for NaN and return result as a boolean array. + +Parameters +---------- +x : array_like + Input array. +out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. +where : array_like, optional + This condition is broadcast over the input. At locations where the + condition is True, the `out` array will be set to the ufunc result. + Elsewhere, the `out` array will retain its original value. 
+ Note that if an uninitialized `out` array is created via the default + ``out=None``, locations within it where the condition is False will + remain uninitialized. +**kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs <ufuncs.kwargs>`. + +Returns +------- +y : ndarray or bool + True where ``x`` is NaN, false otherwise. + This is a scalar if `x` is a scalar. + +See Also +-------- +isinf, isneginf, isposinf, isfinite, isnat + +Notes +----- +Contrary to the numpy implementation, this function supports object arrays. + +NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic +(IEEE 754). This means that Not a Number is not equivalent to infinity. + +Examples +-------- +>>> import larray as la +>>> la.isnan(la.nan) +True +>>> la.isnan(la.inf) +False +>>> arr = la.Array([la.nan, 1, la.inf], +... la.Axis(3, 'values')) +>>> la.isnan(arr) +values* 0 1 2 + True False False +>>> arr = la.Array(['abc', 1, la.nan], +... la.Axis(3, 'values'), dtype=object) +>>> la.isnan(arr) +values* 0 1 2 + False False True +""") + + +def _generalized_nan_to_num(arr, copy=True, nan=0, posinf=None, neginf=None): + if isinstance(arr, np.ndarray) and arr.dtype.kind == 'O': + import sys + if posinf is None: + posinf = sys.float_info.max + if neginf is None: + neginf = -sys.float_info.max + res = arr.copy() if copy else arr + is_nan_value = arr != arr + is_pos_inf_value = arr == np.inf + is_neg_inf_value = arr == -np.inf + if isinstance(nan, np.ndarray): + # each array argument is reshaped to a compatible shape for + # broadcasting by larray machinery but not actually broadcasted yet + nan = np.broadcast_to(nan, arr.shape)[is_nan_value] + res[is_nan_value] = nan + if isinstance(posinf, np.ndarray): + posinf = np.broadcast_to(posinf, arr.shape)[is_pos_inf_value] + res[is_pos_inf_value] = posinf + if isinstance(neginf, np.ndarray): + neginf = np.broadcast_to(neginf, arr.shape)[is_neg_inf_value] + res[is_neg_inf_value] = neginf + return res + else: + return np.nan_to_num(arr,
copy=copy, nan=nan, posinf=posinf, neginf=neginf) + +nan_to_num = wrap_elementwise_array_func(_generalized_nan_to_num,r""" + Replace NaN with zero and infinity with large finite numbers (default + behaviour) or with the numbers defined by the user using the `nan`, + `posinf` and/or `neginf` keywords. + + If `x` is inexact or an object array, NaN is replaced by zero or by the user + defined value in `nan` keyword, infinity is replaced by the largest finite + floating point value representable by ``x.dtype`` or by the user defined + value in `posinf` keyword and -infinity is replaced by the most negative + finite floating point value representable by ``x.dtype`` or by the user + defined value in `neginf` keyword. + + For complex dtypes, the above is applied to each of the real and + imaginary components of `x` separately. + + If `x` is not inexact or object, then no replacements are made. + + Parameters + ---------- + x : scalar or array_like + Input data. + copy : bool, optional + Whether to create a copy of `x` (True) or to replace values + in-place (False). The in-place operation only occurs if + casting to an array does not require a copy. + Default is True. + nan : int, float or array_like, optional + Value to be used to fill NaN values. If no value is passed + then NaN values will be replaced with 0.0. + posinf : int, float, optional + Value to be used to fill positive infinity values. If no value is + passed then positive infinity values will be replaced with the largest + finite floating point value representable by ``x.dtype``. + neginf : int, float, optional + Value to be used to fill negative infinity values. If no value is + passed then negative infinity values will be replaced with the most + negative finite floating point value representable by ``x.dtype``. + + Returns + ------- + out : Array or scalar + `x`, with the non-finite values replaced. If `copy` is False, this may + be `x` itself. 
+ + See Also + -------- + isinf : Shows which elements are positive or negative infinity. + isneginf : Shows which elements are negative infinity. + isposinf : Shows which elements are positive infinity. + isnan : Shows which elements are Not a Number (NaN). + isfinite : Shows which elements are finite (not NaN, not infinity) + + Notes + ----- + Contrary to the numpy implementation, this function supports object arrays. + + NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import larray as la + + >>> la.nan_to_num(la.inf) + 1.7976931348623157e+308 + >>> la.nan_to_num(-la.inf) + -1.7976931348623157e+308 + >>> la.nan_to_num(np.nan) + 0.0 + + >>> x = la.Array([-la.inf, 1, la.nan, 2, la.inf], la.Axis(5, 'values')) + >>> la.nan_to_num(x) + values* 0 1 2 3 4 + -1.7976931348623157e+308 1.0 0.0 2.0 1.7976931348623157e+308 + >>> la.nan_to_num(x, nan=-1, posinf=999, neginf=-999) + values* 0 1 2 3 4 + -999.0 1.0 -1.0 2.0 999.0 + + >>> x = la.Array([1, 'abc', la.nan, 2], la.Axis(4, 'values'), dtype=object) + >>> la.nan_to_num(x) + values* 0 1 2 3 + 1 abc 0 2 + + >>> y = la.Array([complex(la.inf, la.nan), la.nan, complex(la.nan, la.inf)], + ...
la.Axis(3, 'values')) + >>> la.nan_to_num(y) + values* 0 1 2 + (1.7976931348623157e+308+0j) 0j 1.7976931348623157e+308j + """ +) \ No newline at end of file diff --git a/larray/extra/ipfp.py b/larray/extra/ipfp.py index 6b5bbf47a..a471f42b3 100644 --- a/larray/extra/ipfp.py +++ b/larray/extra/ipfp.py @@ -154,10 +154,10 @@ def ipfp(target_sums, a=None, axes=None, maxiter=1000, threshold=0.5, stepstoabo target_sums = [asarray(ts) for ts in target_sums] - n = len(target_sums) + ndim = len(target_sums) if axes is None: - axes = list(range(n)) + axes = list(range(ndim)) def has_anonymous_axes(a): return any(axis.name is None for axis in a.axes) @@ -252,32 +252,41 @@ def has_anonymous_axes(a): # Here is the nice version of the algorithm # for i in range(maxiter): - # for axis, axis_target in zip(axes, target_sums): - # r *= axis_target.divnot0(r.sum(axis)) - # max_sum_diff = max(abs(r.sum(axis) - axis_target).max() - # for axis, axis_target in zip(axes, target_sums)) + # for axis, axis_target_sum in zip(axes, target_sums): + # r *= axis_target_sum.divnot0(r.sum(axis)) + # max_sum_diff = max(abs(r.sum(axis) - axis_target_sum).max() + # for axis, axis_target_sum in zip(axes, target_sums)) # step_sum_improvement = ... 
- # Here is the ugly optimized version which use only numpy operations and avoids computing the sum for the first - # axis twice per iteration + # Here is the ugly optimized version which uses only numpy operations and reuses the sum for the first + # axis from the previous iteration "check phase" target_sums = [axis_target.data for axis_target in target_sums] res_data = a.data.astype(float) axes_indices = [a.axes.index(axis) for axis in axes] axis0_sum = res_data.sum(axes_indices[0]) + if ndim == 1: + # When there is only one dimension, the algorithm always + # terminates after a single iteration + res_data *= np.expand_dims(divnot0(target_sums[0], axis0_sum), axes_indices[0]) + return Array(res_data, a.axes) + for i in range(maxiter): - startr = res_data.copy() + if display_progress: + startr = res_data.copy() + # r = r * target_sums[0].divnot0(axis0_sum) res_data *= np.expand_dims(divnot0(target_sums[0], axis0_sum), axes_indices[0]) - for axis_idx, axis_target in zip(axes_indices[1:], target_sums[1:]): - # r = r * axis_target.divnot0(r.sum(axis)) - res_data *= np.expand_dims(divnot0(axis_target, res_data.sum(axis_idx)), axis_idx) - - # XXX: can't we skip computing the sum and max_diff for the last axis which should be good for each - # iteration???
- axes_sum = [res_data.sum(axis_idx) for axis_idx in axes_indices] + for axis_target_sum, axis_idx in zip(target_sums[1:], axes_indices[1:]): + # r = r * axis_target_sum.divnot0(r.sum(axis)) + res_data *= np.expand_dims(divnot0(axis_target_sum, res_data.sum(axis_idx)), axis_idx) + + # We avoid computing the sum and max_diff for the last axis which is always equal + # to the corresponding target_sum modulo numerical inaccuracy, hence the two [:-1] + # in the 3 following lines + axes_sum = [res_data.sum(axis_idx) for axis_idx in axes_indices[:-1]] max_sum_diff = max(abs(axis_sum - axis_target).max() - for axis_sum, axis_target in zip(axes_sum, target_sums)) + for axis_sum, axis_target in zip(axes_sum, target_sums[:-1])) axis0_sum = axes_sum[0] if display_progress: diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py index 3696350f5..d67fec7e8 100644 --- a/larray/inout/pandas.py +++ b/larray/inout/pandas.py @@ -205,23 +205,6 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo a0 b1 2 3 a1 b0 4 5 a1 b1 6 7 - - Names of the last two axes written as ``before_last_axis_name\\last_axis_name`` - - >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True) - >>> df # doctest: +NORMALIZE_WHITESPACE - c0 c1 - a b\c - a0 b0 0 1 - b1 2 3 - a1 b0 4 5 - b1 6 7 - >>> from_frame(df, unfold_last_axis_name=True) - a b\c c0 c1 - a0 b0 0 1 - a0 b1 2 3 - a1 b0 4 5 - a1 b1 6 7 """ axes_names = [decode(name, 'utf8') if isinstance(name, bytes) else name for name in df.index.names] diff --git a/larray/inout/xw_excel.py b/larray/inout/xw_excel.py index e15567805..254e4b795 100644 --- a/larray/inout/xw_excel.py +++ b/larray/inout/xw_excel.py @@ -150,7 +150,7 @@ def __init__(self, filepath=None, overwrite_file=False, visible=None, silent=Non app = global_app assert isinstance(app, xw.App) - # activate XLA(M) addins, if nee + # activate XLA(M) addins, if needed # By default, add-ins are not activated when an Excel Workbook is opened via COM if 
load_addins: xl_app = app.api diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 656314f47..5beb0f54d 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -1,4 +1,5 @@ import os +import sys import pytest import numpy as np @@ -17,7 +18,7 @@ asarray, union, clip, exp, where, X, mean, inf, nan, isnan, round, read_hdf, read_csv, read_eurostat, read_excel, open_excel, from_lists, from_string, from_frame, from_series, - zip_array_values, zip_array_items) + zip_array_values, zip_array_items, nan_to_num) from larray.core.axis import _to_ticks, _to_tick, _to_key from larray.util.misc import LHDFStore @@ -674,9 +675,13 @@ def test_getitem_guess_axis(array): # key with partial invalid list (ie list containing a label not found # on any axis) - # FIXME: this should not mention the a axis specifically (this is due to the chunking code) - with must_raise(ValueError, "a[3, 999] is not a valid label for the 'a' axis with labels: 0, 1, 2, 3, 4, 5, 6, " - "7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18"): + with must_raise(ValueError, """[3, 999] is not a valid subset for any axis: + a [19]: 0 1 2 ... 16 17 18 + b [12]: 'b0' 'b1' 'b2' ... 'b10' 'b11' 'b3' + c [2]: 'c0' 'c1' + d [6]: 'd1' 'd2' 'd3' 'd4' 'd5' 'd6' +Some of those labels correspond though: + * axis 'a' contains 1 out of 2 labels (labels not found: 999)"""): _ = array[[1, 2], [3, 999]] with must_raise(ValueError, """[999, 4] is not a valid subset for any axis: @@ -684,8 +689,8 @@ def test_getitem_guess_axis(array): b [12]: 'b0' 'b1' 'b2' ... 
'b10' 'b11' 'b3' c [2]: 'c0' 'c1' d [6]: 'd1' 'd2' 'd3' 'd4' 'd5' 'd6' -Some of those labels are valid though: - * axis 'a' contains 1 out of 2 labels (missing labels: 999)"""): +Some of those labels correspond though: + * axis 'a' contains 1 out of 2 labels (labels not found: 999)"""): _ = array[[1, 2], [999, 4]] # ambiguous key @@ -828,6 +833,18 @@ def test_getitem_bool_larray_key_arr_whout_bool_axis(): res = arr[X.b < 2] assert_nparray_equal(res.data, raw[:, :2]) + # using an AxisReference (ExprNode) with an Array which cannot be evaluated + # outside of the getitem context (issue #1129) + threshold = stack({'a0': 2, 'a1': 3}, 'a') + expected = Array([2, 3, 7], Axis('b_a=2_a0,3_a0,3_a1')) + res = arr[X.b >= threshold] + assert_larray_equal(res, expected) + + # same situation when the array is first (the code path is different) + expected = Array([2, 3, 7], Axis('a_b=a0_2,a0_3,a1_3')) + res = arr[threshold <= X.b] + assert_larray_equal(res, expected) + def test_getitem_bool_larray_key_arr_wh_bool_axis(): gender = Axis([False, True], 'gender') @@ -1087,6 +1104,33 @@ def test_getitem_single_larray_key_guess(): a1 b2 2 5 5 2""") assert_larray_equal(arr[key], expected) + arr = ndtest((2, 3)) + # key with invalid label (ie label not found on any axis) + key = from_string(r""" + a\b b0 b1 b2 + a0 a0 a1 a0 + a1 a1 a0 a2""").astype(str) + with must_raise(ValueError, r"""The values of the array key: + +a\b b0 b1 b2 + a0 a0 a1 a0 + a1 a1 a0 a2 + +do not all correspond to labels of a single axis of the subsetted array which has the following axes: + + a [2]: 'a0' 'a1' + b [3]: 'b0' 'b1' 'b2' + +Some of those key values correspond though: + * axis 'a' contains 2 out of 3 labels (labels not found: 'a2') + +Note that all the bad key values are located within the following labels: + a: 'a1' + b: 'b2'"""): + _ = arr[key] + + + def test_getitem_multiple_larray_key_guess(): a, b, c, d, e = ndtest((2, 3, 2, 3, 2)).axes @@ -3195,6 +3239,18 @@ def test_reindex(): res = 
arr.reindex({'a': 'a0,a1,a2'}) assert_larray_nan_equal(res, expected) + # using the **kwargs syntax + res = arr.reindex(a=new_a) + assert_larray_nan_equal(res, expected) + + # using the **kwargs syntax with a list of labels + res = arr.reindex(a=['a0', 'a1', 'a2']) + assert_larray_nan_equal(res, expected) + + # using the **kwargs syntax with a labels def string (issue #1120) + res = arr.reindex(a='a0,a1,a2') + assert_larray_nan_equal(res, expected) + # test error conditions msg = ("In Array.reindex, when using an axis reference ('axis name', X.axis_name or " "axis_integer_position) as axes_to_reindex, you must provide a value for `new_axis`.") @@ -4058,6 +4114,12 @@ def test_to_frame(): assert df.columns.to_list() == ['c0'] assert df.index.names == ['a', 'b'] + # fold_last_axis_name + arr = ndtest((2, 2, 2)) + df = arr.to_frame(fold_last_axis_name=True) + assert df.columns.name is None + assert df.columns.to_list() == ['c0', 'c1'] + assert df.index.names == ['a', r'b\c'] def test_from_frame(): # 1) data = scalar @@ -4469,6 +4531,13 @@ def test_from_frame(): assert_larray_equal(res, expected) +def test_asarray(): + series = pd.Series([0, 1, 2], ['a0', 'a1', 'a2'], name='a') + expected = ndtest(3) + res = asarray(series) + assert_larray_equal(res, expected) + + def test_to_csv(tmp_path): io_3d.to_csv(tmp_path / 'out3d.csv') assert (tmp_path / 'out3d.csv').read_text() == """\ @@ -5817,7 +5886,9 @@ def test_deprecated_methods(): def test_eq(): a = ndtest((2, 3, 4)) ao = a.astype(object) - assert_larray_equal(ao.eq(ao['c0'], nans_equal=True), a == a['c0']) + res = ao.eq(ao['c0'], nans_equal=True) + expected = a == a['c0'] + assert_larray_equal(res, expected) def test_zip_array_values(): @@ -5885,6 +5956,33 @@ def test_np_array(): assert res is not arr.data +def test_nan_to_num(): + a = Axis('a=a0..a4') + arr = Array([1.0, nan, inf, 2.0, -inf], axes=a) + res = nan_to_num(arr) + max_float = np.finfo(arr.dtype).max + expected = Array([1.0, 0.0, max_float, 2.0, -max_float], 
axes=a) + assert_larray_equal(res, expected) + + max_float = sys.float_info.max + arr = Array(["abc", 1.0, nan, inf, -inf], axes=a, dtype=object) + res = nan_to_num(arr, neginf=-10, posinf=10) + expected = Array(["abc", 1.0, 0.0, 10.0, -10.0], axes=a, dtype=object) + assert_larray_equal(res, expected) + + +def test_isnan(): + a = Axis('a=a0..a3') + arr = Array([1.0, nan, inf, -inf], axes=a) + res = isnan(arr) + expected = Array([False, True, False, False], axes=a) + assert_larray_equal(res, expected) + + oarr = arr.astype(object) + res = isnan(oarr) + assert_larray_equal(res, expected) + + if __name__ == "__main__": # import doctest # import unittest diff --git a/larray/tests/test_session.py b/larray/tests/test_session.py index 5bc08a8bb..183562b3b 100644 --- a/larray/tests/test_session.py +++ b/larray/tests/test_session.py @@ -7,13 +7,18 @@ import pandas as pd import pytest -from larray.tests.common import meta -from larray.tests.common import (assert_larray_equal, assert_array_nan_equal, inputpath, - needs_xlwings, needs_pytables, needs_openpyxl, must_warn, must_raise) +from larray.tests.common import ( + meta, inputpath, + assert_larray_equal, assert_array_nan_equal, assert_larray_nan_equal, + needs_xlwings, needs_pytables, needs_openpyxl, + must_warn, must_raise +) from larray.inout.common import _supported_scalars_types -from larray import (Session, Axis, Array, Group, isnan, zeros_like, ndtest, ones_like, - ones, full, full_like, stack, local_arrays, global_arrays, arrays, CheckedSession) - +from larray import (Session, Axis, Array, Group, CheckedSession, + isnan, + zeros_like, ndtest, ones_like, ones, full, full_like, + stack, from_string, + local_arrays, global_arrays, arrays) # avoid flake8 errors meta = meta @@ -688,5 +693,36 @@ def test_stack(): assert_larray_equal(res.arr2, expected_arr2) +def test_align(): + s1 = Session(arr1=ndtest(" a=a0,a1 ;b=b0,b1"), + arr2=ndtest(" a=a0,a1 ;b=b0,b1")) + s2 = Session(arr1=ndtest(" a=a0,a1,a2;b=b0,b1"), # extra 
label + arr2=ndtest("c=c0,c1;a=a0,a1 ;b=b0,b1"), # extra axis + arr3=ndtest(" a=a0,a1 ;b=b0,b1")) # extra array + s3 = Session( # missing array + arr2=ndtest(" a=a0,a1 ;b= b1"), # missing label + arr3=ndtest(" b=b0,b1")) # missing axis + + al_s1, al_s2, al_s3 = s1.align(s2, s3) + + assert_larray_nan_equal(al_s1.arr1, from_string(r""" + a\b b0 b1 + a0 0.0 1.0 + a1 2.0 3.0 + a2 nan nan""")) + assert_larray_equal(al_s1.arr2, s1.arr2) # no change + assert isnan(al_s1.arr3) + + assert_larray_equal(al_s2.arr1, s2.arr1) # no change + assert_larray_equal(al_s2.arr2, s2.arr2) # no change + assert_larray_equal(al_s2.arr3, s2.arr3) # no change + + assert isnan(al_s3.arr1) + assert_larray_nan_equal(al_s3.arr2, from_string(r""" + a\b b0 b1 + a0 nan 0.0 + a1 nan 1.0""")) + assert_larray_equal(al_s3.arr3, s3.arr3) # no change + if __name__ == "__main__": pytest.main() diff --git a/larray/util/misc.py b/larray/util/misc.py index 5dd470029..ff6c62f22 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -1077,3 +1077,7 @@ def concatenate_ndarrays(arrays) -> np.ndarray: arrays = [np.asarray(labels, dtype=object) for labels in arrays] # TODO: try using the new dtype argument to concatenate instead of converting labels explicitly as above return np.concatenate(arrays) + + +def first(iterable, default=None): + return next(iter(iterable), default)