1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15+ import datetime
1516import functools
1617import hashlib
1718import logging
@@ -1034,7 +1035,7 @@ def binning_data(
10341035def bin_data (
10351036 df : pd .DataFrame ,
10361037 bins : int | dict [str , list ],
1037- non_categorical_label_style : Literal ["bounded" , "unbounded" ] = "unbounded" ,
1038+ non_categorical_label_style : Literal ["bounded" , "unbounded" , "lower" ] = "unbounded" ,
10381039) -> tuple [pd .DataFrame , dict [str , list ]]:
10391040 """
10401041 Splits data into bins.
@@ -1048,41 +1049,32 @@ def bin_data(
10481049
10491050 # Note that we create a new pd.DataFrame to avoid the fragmentation warnings that can occur if we try to
10501051 # replace hundreds of columns of a large dataset.
1051- cols = {}
1052-
1053- bins_dct = {}
1054- num_cols = [c for c in df .columns if pd .api .types .is_numeric_dtype (df [c ])]
1055- dat_cols = [c for c in df .columns if pd .api .types .is_datetime64_any_dtype (df [c ])]
1056- cat_cols = [c for c in df .columns if c not in num_cols + dat_cols ]
1052+ cols , bins_dct = {}, {}
10571053 if isinstance (bins , int ):
1058- for col in num_cols :
1059- cols [col ], bins_dct [col ] = bin_numeric (df [col ], bins , label_style = non_categorical_label_style )
1060- for col in dat_cols :
1061- cols [col ], bins_dct [col ] = bin_datetime (df [col ], bins , label_style = non_categorical_label_style )
1062- for col in cat_cols :
1063- cols [col ], bins_dct [col ] = bin_categorical (df [col ], bins )
1064- else : # bins is a dict
1065- for col in num_cols :
1066- if col in bins :
1067- cols [col ], _ = bin_numeric (df [col ], bins [col ], label_style = non_categorical_label_style )
1054+ for col in df .columns :
1055+ if pd .api .types .is_numeric_dtype (df [col ]):
1056+ cols [col ], bins_dct [col ] = bin_numeric (df [col ], bins , label_style = non_categorical_label_style )
1057+ elif pd .api .types .is_datetime64_any_dtype (df [col ]):
1058+ cols [col ], bins_dct [col ] = bin_datetime (df [col ], bins , label_style = non_categorical_label_style )
10681059 else :
1069- _LOG .warning (f"'{ col } ' is missing in bins" )
1070- for col in dat_cols :
1071- if col in bins :
1072- cols [col ], _ = bin_datetime (df [col ], bins [col ], label_style = non_categorical_label_style )
1073- else :
1074- _LOG .warning (f"'{ col } ' is missing in bins" )
1075- for col in cat_cols :
1060+ cols [col ], bins_dct [col ] = bin_categorical (df [col ], bins )
1061+ else : # bins is a dict
1062+ for col in df .columns :
10761063 if col in bins :
1077- cols [col ], _ = bin_categorical (df [col ], bins [col ])
1064+ if isinstance (bins [col ][0 ], (int , float , np .integer , np .floating )):
1065+ cols [col ], _ = bin_numeric (df [col ], bins [col ], label_style = non_categorical_label_style )
1066+ elif isinstance (bins [col ][0 ], (datetime .date , datetime .datetime , np .datetime64 )):
1067+ cols [col ], _ = bin_datetime (df [col ], bins [col ], label_style = non_categorical_label_style )
1068+ else :
1069+ cols [col ], _ = bin_categorical (df [col ], bins [col ])
10781070 else :
1079- _LOG . warning ( f"' { col } ' is missing in bins" )
1071+ cols [ col ] = df [ col ]
10801072 bins_dct = bins
10811073 return pd .DataFrame (cols ), bins_dct
10821074
10831075
10841076def bin_numeric (
1085- col : pd .Series , bins : int | list [str ], label_style : Literal ["bounded" , "unbounded" ] = "unbounded"
1077+ col : pd .Series , bins : int | list [str ], label_style : Literal ["bounded" , "unbounded" , "lower" ] = "unbounded"
10861078) -> tuple [pd .Categorical , list ]:
10871079 def _clip (col , bins ):
10881080 if isinstance (bins , list ):
@@ -1131,7 +1123,7 @@ def _adjust_breaks(breaks):
11311123
11321124
11331125def bin_datetime (
1134- col : pd .Series , bins : int | list [str ], label_style : Literal ["bounded" , "unbounded" ] = "unbounded"
1126+ col : pd .Series , bins : int | list [str ], label_style : Literal ["bounded" , "unbounded" , "lower" ] = "unbounded"
11351127) -> tuple [pd .Categorical , list ]:
11361128 def _clip (col , bins ):
11371129 if isinstance (bins , list ):
@@ -1184,7 +1176,7 @@ def bin_non_categorical(
11841176 clip_and_breaks : Callable ,
11851177 create_labels : Callable ,
11861178 adjust_breaks : Callable ,
1187- label_style : Literal ["bounded" , "unbounded" ] = "unbounded" ,
1179+ label_style : Literal ["bounded" , "unbounded" , "lower" ] = "unbounded" ,
11881180) -> tuple [pd .Categorical , list ]:
11891181 col = col .fillna (np .nan ).infer_objects (copy = False )
11901182
@@ -1203,7 +1195,9 @@ def bin_non_categorical(
12031195 )
12041196 labels = [str (b ) for b in breaks [:- 1 ]]
12051197
1206- if label_style == "unbounded" :
1198+ if label_style == "lower" :
1199+ new_labels_map = {label : f"{ label } " for label in labels }
1200+ elif label_style == "unbounded" :
12071201 new_labels_map = {label : f"⪰ { label } " for label in labels }
12081202 else : # label_style == "bounded"
12091203 new_labels_map = {label : f"⪰ { label } ≺ { next_label } " for label , next_label in zip (labels , labels [1 :] + ["∞" ])}
0 commit comments