66from larray .core .array import Array
77from larray .core .axis import Axis , AxisCollection
88from larray .core .constants import nan
9- from larray .util .misc import unique_list
109
1110
1211def decode (s , encoding = 'utf-8' , errors = 'strict' ):
@@ -46,34 +45,51 @@ def index_to_labels(idx, sort=True):
4645 """
4746 if isinstance (idx , pd .MultiIndex ):
4847 if sort :
49- return list (idx .levels )
48+ return list (idx .levels ) # list of pd.Index
5049 else :
51- return [unique_list (idx .get_level_values (label )) for label in range (idx .nlevels )]
50+ # requires Pandas >= 0.23 (and it does NOT sort the values)
51+ # TODO: unsure to_list is necessary (larray tests pass without it
52+ # but I am not sure this code path is covered by tests)
53+ # and there might be a subtle difference. The type
54+ # of the returned object without to_list() is pd.Index
55+ return [idx .unique (level ).to_list () for level in range (idx .nlevels )]
5256 else :
5357 assert isinstance (idx , pd .Index )
5458 labels = list (idx .values )
5559 return [sorted (labels ) if sort else labels ]
5660
5761
def product_index(idx, sort=False):
    """
    Expand a pandas (Multi)Index to the cartesian product of its labels.

    Parameters
    ----------
    idx : pd.Index or pd.MultiIndex
        Index to expand.
    sort : bool, optional
        Forwarded to index_to_labels to request sorted labels for each
        level. Defaults to False.

    Returns
    -------
    tuple
        ``(index, labels)`` where ``labels`` is the list of per-level labels
        returned by index_to_labels and ``index`` is:

        * a pd.MultiIndex holding the product of those labels (with the
          level names of ``idx``) when ``idx`` is a MultiIndex,
        * a new pd.Index built from the sorted labels (with the name of
          ``idx``) when ``idx`` is a flat index and ``sort`` is True,
        * ``idx`` itself otherwise.
    """
    labels = index_to_labels(idx, sort=sort)
    if isinstance(idx, pd.MultiIndex):
        # Pass the level names explicitly: from_product can only infer them
        # from labels which are pd.Index objects, so they would be lost
        # whenever index_to_labels returns plain lists (sort=False).
        return pd.MultiIndex.from_product(labels, names=idx.names), labels
    assert isinstance(idx, pd.Index)
    if sort:
        return pd.Index(labels[0], name=idx.name), labels
    # an unsorted flat index already is the product of its single level
    return idx, labels
76+
77+
def cartesian_product_df(df, sort_rows=False, sort_columns=False,
                         fill_value=nan, **kwargs):
    """
    Reindex a DataFrame on the cartesian product of its row and column labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to expand.
    sort_rows : bool, optional
        Forwarded as ``sort`` to product_index for the row index.
        Defaults to False.
    sort_columns : bool, optional
        Forwarded as ``sort`` to product_index for the columns.
        Defaults to False.
    fill_value : scalar, optional
        Passed to ``df.reindex`` as ``fill_value``. Defaults to nan.
    **kwargs
        Extra keyword arguments forwarded to ``df.reindex``.

    Returns
    -------
    tuple
        ``(frame, labels)``: ``frame`` is ``df`` itself when its index and
        columns already match the product indexes, the reindexed frame
        otherwise. ``labels`` is the row labels followed by the column
        labels, as returned by product_index.
    """
    row_index = df.index
    col_index = df.columns
    full_rows, row_labels = product_index(row_index, sort=sort_rows)
    full_cols, col_labels = product_index(col_index, sort=sort_columns)
    all_labels = row_labels + col_labels

    # the cheap length comparisons come first so that the more expensive
    # element-wise comparisons only run when the sizes already agree
    already_complete = (
        len(full_rows) == len(row_index)
        and len(full_cols) == len(col_index)
        and np.array_equal(row_index.values, full_rows.values)
        and np.array_equal(col_index.values, full_cols.values)
    )
    if already_complete:
        return df, all_labels

    expanded = df.reindex(index=full_rows, columns=full_cols,
                          fill_value=fill_value, **kwargs)
    return expanded, all_labels
7793
7894
7995def from_series (s , sort_rows = False , fill_value = nan , meta = None , ** kwargs ) -> Array :
@@ -124,8 +140,13 @@ def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs) -> Arra
124140 a1 b1 6.0 7.0
125141 """
126142 if isinstance (s .index , pd .MultiIndex ):
127- # TODO: use argument sort=False when it will be available
128- # (see https://github.com/pandas-dev/pandas/issues/15105)
143+ # Using unstack sort argument (requires Pandas >= 2.1) would make this
144+ # code simpler, but it makes it even slower than it already is.
145+ # As of Pandas 2.3.3 on 12/2025, a series with a large MultiIndex is
146+ # extremely slow to unstack, whether sort is used or not:
147+ # >>> arr = ndtest((200, 200, 200))
148+ # >>> s = arr.to_series() # 31.4 ms
149+ # >>> s.unstack(level=-1, fill_value=np.nan) # 1.5s !!!
129150 df = s .unstack (level = - 1 , fill_value = fill_value )
130151 # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns
131152 if not sort_rows :
@@ -211,13 +232,15 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
211232
212233 # handle 2 or more dimensions with the last axis name given using \
213234 if unfold_last_axis_name :
235+ # Note that having several axes in columns (and using df.columns.names)
236+ # in this case does not make sense
214237 if isinstance (axes_names [- 1 ], str ) and '\\ ' in axes_names [- 1 ]:
215238 last_axes = [name .strip () for name in axes_names [- 1 ].split ('\\ ' )]
216239 axes_names = axes_names [:- 1 ] + last_axes
217240 else :
218241 axes_names += [None ]
219242 else :
220- axes_names += [ df .columns .name ]
243+ axes_names += df .columns .names
221244
222245 if cartesian_prod :
223246 df , axes_labels = cartesian_product_df (df , sort_rows = sort_rows , sort_columns = sort_columns ,
@@ -226,12 +249,18 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
226249 if sort_rows or sort_columns :
227250 raise ValueError ('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
228251 'Please call the method sort_labels on the returned array to sort rows or columns' )
229- axes_labels = index_to_labels (df .index , sort = False )
252+ index_labels = index_to_labels (df .index , sort = False )
253+ column_labels = index_to_labels (df .columns , sort = False )
254+ axes_labels = index_labels + column_labels
230255
231256 # Pandas treats column labels as column names (strings) so we need to convert them to values
232- last_axis_labels = [parse (cell ) for cell in df .columns .values ] if parse_header else list (df .columns .values )
233- axes_labels .append (last_axis_labels )
257+ if parse_header :
258+ ncolaxes = df .columns .nlevels
259+ for i in range (len (axes_labels ) - ncolaxes , len (axes_labels )):
260+ axes_labels [i ] = [parse (cell ) for cell in axes_labels [i ]]
234261
262+ # TODO: use zip(..., strict=True) instead when we drop support for Python 3.9
263+ assert len (axes_labels ) == len (axes_names )
235264 axes = AxisCollection ([Axis (labels , name ) for labels , name in zip (axes_labels , axes_names )])
236265 data = df .values .reshape (axes .shape )
237266 return Array (data , axes , meta = meta )
0 commit comments