From 94ed2e8bfc9d449fb08db028e8f1c71994674f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Gouv=C3=AAa?= Date: Sat, 26 Apr 2025 20:15:45 +0200 Subject: [PATCH] Update vanilla.py Implemented clipping of the features after scaling the data. Some features may only have reasonable values for a subset of samples that wasn't included in the training set, so when the fitted scaler is applied to the test set their scaled values blow up. This may cause inf values and disrupt the predictions overall. --- modnet/models/vanilla.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 6eb8b4ff..5f42a58e 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -1383,6 +1383,24 @@ class OR only return the most probable class. if self._scale_impute is not None: x = self._scale_impute.transform(x) + # Clip the scaled features to safe bounds + if self._scaler is not None: + if isinstance(self._scaler, MinMaxScaler): + # For MinMaxScaler, use 5x the feature range as bounds + lower_bound = 5.0 * self._scaler.feature_range[0] + upper_bound = 5.0 * self._scaler.feature_range[1] + elif isinstance(self._scaler, StandardScaler): + # For StandardScaler, use 10 standard deviations as bounds + lower_bound = -10.0 + upper_bound = 10.0 + else: + # For other scalers, use default wide bounds + lower_bound = -np.inf + upper_bound = np.inf + + # Clip the features + x = np.clip(x, lower_bound, upper_bound) + p = np.array(self.model.predict(x)) if len(p.shape) == 2: @@ -1445,6 +1463,24 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame: if self._scale_impute is not None: x = self._scale_impute.transform(x) + # Clip the scaled features to safe bounds + if self._scaler is not None: + if isinstance(self._scaler, MinMaxScaler): + # For MinMaxScaler, use 5x the feature range as bounds + lower_bound = 5.0 * self._scaler.feature_range[0] + upper_bound = 5.0 * self._scaler.feature_range[1] + elif 
isinstance(self._scaler, StandardScaler): + # For StandardScaler, use 10 standard deviations as bounds + lower_bound = -10.0 + upper_bound = 10.0 + else: + # For other scalers, use default wide bounds + lower_bound = -np.inf + upper_bound = np.inf + + # Clip the features + x = np.clip(x, lower_bound, upper_bound) + y_pred = np.array(self.model.predict(x)) if len(y_pred.shape) == 2: y_pred = np.array([y_pred])