Conversation
HenriChabert
left a comment
There was a problem hiding this comment.
Seems great! Some minor changes but nothing really important.
Could you also update the Makefile by adding your package please?
|
|
||
|
|
||
| ```python | ||
| from dku_model_accessor import get_model_handler, ModelAccessor |
There was a problem hiding this comment.
| from dku_model_accessor import get_model_handler, ModelAccessor | |
| import dataiku | |
| from dku_model_accessor import get_model_handler, ModelAccessor |
| ```python | ||
| from dku_model_accessor import get_model_handler, ModelAccessor | ||
|
|
||
| model_id = 'XQyU0TO0' |
There was a problem hiding this comment.
| model_id = 'XQyU0TO0' | |
| model_id = 'YOUR_MODEL_ID' |
Maybe use a more explicit ID
| class DkuModelAccessorConstants(object): | ||
| MODEL_ID = 'model_id' | ||
| VERSION_ID = 'version_id' | ||
| REGRRSSION_TYPE = 'REGRESSION' |
There was a problem hiding this comment.
| REGRRSSION_TYPE = 'REGRESSION' | |
| REGRESSION_TYPE = 'REGRESSION' |
typo
| def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | ||
| try: | ||
| full_test_df = self.model_handler.get_test_df()[0] | ||
| test_df = full_test_df[:limit] | ||
| logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | ||
| return test_df | ||
| except Exception as e: | ||
| logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | ||
| full_test_df = self.model_handler.get_full_df()[0] | ||
| test_df = full_test_df[:limit] | ||
| logger.info('Loading {}/{} rows of the whole original test set'.format(len(test_df), len(full_test_df))) | ||
| return test_df |
There was a problem hiding this comment.
| def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | |
| try: | |
| full_test_df = self.model_handler.get_test_df()[0] | |
| test_df = full_test_df[:limit] | |
| logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | |
| return test_df | |
| except Exception as e: | |
| logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | |
| full_test_df = self.model_handler.get_full_df()[0] | |
| test_df = full_test_df[:limit] | |
| logger.info('Loading {}/{} rows of the whole original test set'.format(len(test_df), len(full_test_df))) | |
| return test_df | |
| def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): | |
| try: | |
| full_test_df = self.model_handler.get_test_df()[0] | |
| except Exception as e: | |
| logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) | |
| full_test_df = self.model_handler.get_full_df()[0] | |
| test_df = full_test_df[:limit] | |
| logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) | |
| return test_df |
Code repeated, I would only try...catch on what can really raise an exception.
| logger.info('Fitting surrogate model ...') | ||
| surrogate_model = SurrogateModel(self.get_prediction_type()) | ||
| original_test_df = self.get_original_test_df() | ||
| predictions_on_original_test_df = self.get_predictor().predict(original_test_df) | ||
| surrogate_df = original_test_df[self.get_selected_features()] | ||
| surrogate_df[DkuModelAccessorConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction'] | ||
| surrogate_model.fit(surrogate_df, DkuModelAccessorConstants.SURROGATE_TARGET) | ||
| feature_names = surrogate_model.get_features() | ||
| feature_importances = surrogate_model.clf.feature_importances_ |
There was a problem hiding this comment.
Maybe it would be better to wrap into a new method fit_surrogate_model()
| def get_selected_features(self): | ||
| """ | ||
| Return only features used in the model | ||
| """ | ||
| selected_features = [] | ||
| for feat, feat_info in self.get_per_feature().items(): | ||
| if feat_info.get('role') == 'INPUT': | ||
| selected_features.append(feat) | ||
| return selected_features | ||
|
|
||
| def get_selected_and_rejected_features(self): | ||
| """ | ||
| Return all features in the input dataset except the target | ||
| """ | ||
| selected_features = [] | ||
| for feat, feat_info in self.get_per_feature().items(): | ||
| if feat_info.get('role') in ['INPUT', 'REJECT']: | ||
| selected_features.append(feat) | ||
| return selected_features |
There was a problem hiding this comment.
| def get_selected_features(self): | |
| """ | |
| Return only features used in the model | |
| """ | |
| selected_features = [] | |
| for feat, feat_info in self.get_per_feature().items(): | |
| if feat_info.get('role') == 'INPUT': | |
| selected_features.append(feat) | |
| return selected_features | |
| def get_selected_and_rejected_features(self): | |
| """ | |
| Return all features in the input dataset except the target | |
| """ | |
| selected_features = [] | |
| for feat, feat_info in self.get_per_feature().items(): | |
| if feat_info.get('role') in ['INPUT', 'REJECT']: | |
| selected_features.append(feat) | |
| return selected_features | |
| def get_features_by_status(self, status): | |
| return [feat for feat, feat_info in self.get_per_feature().items() if feat_info.get('role') in status] | |
| def get_selected_features(self): | |
| """ | |
| Return only features used in the model | |
| """ | |
| return self.get_features_by_status(['INPUT']) | |
| def get_selected_and_rejected_features(self): | |
| """ | |
| Return all features in the input dataset except the target | |
| """ | |
| return self.get_features_by_status(['INPUT', 'REJECT']) |
DRY
| for algorithm in ALGORITHMS_WITH_VARIABLE_IMPORTANCE: | ||
| if isinstance(algo, algorithm): | ||
| return True | ||
| elif predictor.params.modeling_params.get('algorithm') in [DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF, DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION]: |
There was a problem hiding this comment.
I would place this array in a new var tree_based_algo = [DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF, DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION] for PEP8 and ease of addition
No description provided.