From 5b813b9255fcdb7d146e2732ccede1f2ae43ad09 Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Fri, 16 May 2025 19:43:30 -0400 Subject: [PATCH 1/7] =?UTF-8?q?Bump=20version:=200.0.5.dev9=20=E2=86=92=20?= =?UTF-8?q?0.0.5.dev10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- setup.py | 2 +- zephyr_ml/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 05eddc8..b5212e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.5.dev9 +current_version = 0.0.5.dev10 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 9fc7774..2ac9a80 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/zephyr', - version='0.0.5.dev9', + version='0.0.5.dev10', zip_safe=False, ) diff --git a/zephyr_ml/__init__.py b/zephyr_ml/__init__.py index 99e651b..6213460 100644 --- a/zephyr_ml/__init__.py +++ b/zephyr_ml/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dai-lab@mit.edu' -__version__ = '0.0.5.dev9' +__version__ = '0.0.5.dev10' import os From 5bb1e70db60c45906044bb64dc784a44a58176ce Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Sat, 24 May 2025 19:50:53 -0700 Subject: [PATCH 2/7] moved evaluation metrics to separate file --- zephyr_ml/core.py | 4 +- zephyr_ml/primitives/evaluation.py | 64 +++++++++++++++++++ ...imitives.evaluation.confusion_matrix.json} | 4 +- ...s.evaluation.roc_auc_score_and_curve.json} | 4 +- zephyr_ml/primitives/postprocessing.py | 60 +---------------- 5 files changed, 71 insertions(+), 65 deletions(-) create mode 100644 zephyr_ml/primitives/evaluation.py rename zephyr_ml/primitives/jsons/{zephyr_ml.primitives.postprocessing.confusion_matrix.json => zephyr_ml.primitives.evaluation.confusion_matrix.json} (85%) rename zephyr_ml/primitives/jsons/{zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json => zephyr_ml.primitives.evaluation.roc_auc_score_and_curve.json} (84%) diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py index 2e5bb8d..8417207 100644 --- a/zephyr_ml/core.py +++ b/zephyr_ml/core.py @@ -21,8 +21,8 @@ "sklearn.metrics.precision_score", "sklearn.metrics.f1_score", "sklearn.metrics.recall_score", - "zephyr_ml.primitives.postprocessing.confusion_matrix", - "zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve", + "zephyr_ml.primitives.evaluation.confusion_matrix", + "zephyr_ml.primitives.evaluation.roc_auc_score_and_curve", ] LOGGER = logging.getLogger(__name__) diff --git a/zephyr_ml/primitives/evaluation.py b/zephyr_ml/primitives/evaluation.py new file mode 100644 index 0000000..0f51a59 --- /dev/null +++ b/zephyr_ml/primitives/evaluation.py @@ -0,0 +1,64 @@ +""" +Evaluation metrics +""" + +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn import metrics + + +def confusion_matrix( + y_true, + y_pred, + labels=None, + sample_weight=None, + normalize=None): + conf_matrix = metrics.confusion_matrix( + y_true, y_pred, labels=labels, sample_weight=sample_weight, normalize=normalize + ) + fig = plt.figure() + ax = fig.add_axes(sns.heatmap(conf_matrix, annot=True, cmap="Blues")) + + ax.set_title("Confusion Matrix\n") + ax.set_xlabel("\nPredicted Values") + ax.set_ylabel("Actual Values") + + ax.xaxis.set_ticklabels(["False", "True"]) + ax.yaxis.set_ticklabels(["False", "True"]) + + return conf_matrix, fig + + +def roc_auc_score_and_curve( + y_true, y_proba, pos_label=None, sample_weight=None, drop_intermediate=True +): + if y_proba.ndim > 1: + y_proba = y_proba[:, 1] + fpr, tpr, _ = metrics.roc_curve( + y_true, + y_proba, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + ns_probs = [0 for _ in range(len(y_true))] + ns_fpr, ns_tpr, _ = metrics.roc_curve( + y_true, + ns_probs, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + + auc = metrics.roc_auc_score(y_true, y_proba) + fig, ax = plt.subplots(1, 1) + + ax.plot(fpr, tpr, "ro") + ax.plot(fpr, tpr) + ax.plot(ns_fpr, ns_tpr, linestyle="--", color="green") + + ax.set_ylabel("True Positive Rate") + ax.set_xlabel("False Positive Rate") + ax.set_title("AUC: %.3f" % auc) + + return auc, fig diff --git a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.confusion_matrix.json similarity index 85% rename from zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json rename to zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.confusion_matrix.json index 766ca5f..670357c 100644 --- a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json +++ b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.confusion_matrix.json @@ -1,12 +1,12 @@ { - "name": "zephyr_ml.primitives.postprocessing.confusion_matrix", + "name": "zephyr_ml.primitives.evaluation.confusion_matrix", "contributors": ["Raymond Pan "], "description": "Create and plot confusion matrix.", "classifiers": { "type": "helper" }, "modalities": [], - "primitive": "zephyr_ml.primitives.postprocessing.confusion_matrix", + "primitive": "zephyr_ml.primitives.evaluation.confusion_matrix", "produce": { "args": [ { diff --git a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.roc_auc_score_and_curve.json similarity index 84% rename from zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json rename to zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.roc_auc_score_and_curve.json index 778bde9..2f18120 100644 --- a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json +++ b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.evaluation.roc_auc_score_and_curve.json @@ -1,12 +1,12 @@ { - "name": "zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve", + "name": "zephyr_ml.primitives.evaluation.roc_auc_score_and_curve", "contributors": ["Raymond Pan "], "description": "Calculate ROC AUC score and plot curve.", "classifiers": { "type": "helper" }, "modalities": [], - "primitive": "zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve", + "primitive": "zephyr_ml.primitives.evaluation.roc_auc_score_and_curve", "produce": { "args": [ { diff --git a/zephyr_ml/primitives/postprocessing.py b/zephyr_ml/primitives/postprocessing.py index 2ae0af1..b9621ff 100644 --- a/zephyr_ml/primitives/postprocessing.py +++ b/zephyr_ml/primitives/postprocessing.py @@ -4,14 +4,12 @@ import logging -import matplotlib.pyplot as plt import numpy as np -import seaborn as sns import sklearn -from sklearn import metrics LOGGER = logging.getLogger(__name__) + METRICS = { "accuracy": sklearn.metrics.accuracy_score, "precision": sklearn.metrics.precision_score, @@ -85,59 +83,3 @@ def apply_threshold(self, y_proba): binary = [1 if x else 0 for x in y_proba > self._threshold] return binary, self._threshold, self._scores - -def confusion_matrix( - y_true, - y_pred, - labels=None, - sample_weight=None, - normalize=None): - conf_matrix = metrics.confusion_matrix( - y_true, y_pred, labels=labels, sample_weight=sample_weight, normalize=normalize - ) - fig = plt.figure() - ax = fig.add_axes(sns.heatmap(conf_matrix, annot=True, cmap="Blues")) - - ax.set_title("Confusion Matrix\n") - ax.set_xlabel("\nPredicted Values") - ax.set_ylabel("Actual Values") - - ax.xaxis.set_ticklabels(["False", "True"]) - ax.yaxis.set_ticklabels(["False", "True"]) - - return conf_matrix, fig - - -def roc_auc_score_and_curve( - y_true, y_proba, pos_label=None, sample_weight=None, drop_intermediate=True -): - if y_proba.ndim > 1: - y_proba = y_proba[:, 1] - fpr, tpr, _ = metrics.roc_curve( - y_true, - y_proba, - pos_label=pos_label, - sample_weight=sample_weight, - drop_intermediate=drop_intermediate, - ) - ns_probs = [0 for _ in range(len(y_true))] - ns_fpr, ns_tpr, _ = metrics.roc_curve( - y_true, - ns_probs, - pos_label=pos_label, - sample_weight=sample_weight, - drop_intermediate=drop_intermediate, - ) - - auc = metrics.roc_auc_score(y_true, y_proba) - fig, ax = plt.subplots(1, 1) - - ax.plot(fpr, tpr, "ro") - ax.plot(fpr, tpr) - ax.plot(ns_fpr, ns_tpr, linestyle="--", color="green") - - ax.set_ylabel("True Positive Rate") - ax.set_xlabel("False Positive Rate") - ax.set_title("AUC: %.3f" % auc) - - return auc, fig From a68fbabae6083c26b085fc8c40e737b1dedc1ab3 Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Sun, 25 May 2025 13:43:36 -0700 Subject: [PATCH 3/7] lint --- zephyr_ml/primitives/postprocessing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zephyr_ml/primitives/postprocessing.py b/zephyr_ml/primitives/postprocessing.py index b9621ff..f1ce832 100644 --- a/zephyr_ml/primitives/postprocessing.py +++ b/zephyr_ml/primitives/postprocessing.py @@ -82,4 +82,3 @@ def apply_threshold(self, y_proba): binary = [1 if x else 0 for x in y_proba > self._threshold] return binary, self._threshold, self._scores - From a14469b5127be6bdf5af9b1831bda773b1f8b844 Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Mon, 26 May 2025 23:22:12 -0700 Subject: [PATCH 4/7] error handling --- zephyr_ml/core.py | 87 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py index 8417207..1c70fc8 100644 --- a/zephyr_ml/core.py +++ b/zephyr_ml/core.py @@ -125,7 +125,7 @@ def try_log_backwards_set_method_warning(self, name, next_step): f"\tAll other steps' results will be considered stale.")) def try_log_backwards_key_method_warning(self, name, next_step): - steps_in_between = self.get_steps_in_between(next_step, self.current_step+1) + steps_in_between = self.get_steps_in_between(next_step, self.current_step + 1) if len(steps_in_between) > 0: steps_in_between_str = (f"\tAny results produced by the following steps " f"will be considered stale:\n" @@ -353,7 +353,7 @@ def __init__(self): [self.get_train_test_split]), ([self.fit_pipeline], [self.set_fitted_pipeline], [self.get_fitted_pipeline]), ([self.predict, self.evaluate], [], []) - ] + ] self._guide_handler = GuideHandler(step_order) def GET_ENTITYSET_TYPES(self): @@ -553,7 +553,9 @@ def generate_label_times( AssertionError: If entityset has not been generated or set or labeling_fn is not a string and not callable. """ - assert self._entityset is not None, "entityset has not been set" + + if self._entityset is None: + raise ValueError("entityset has not been set") if isinstance(labeling_fn, str): # get predefined labeling function labeling_fn_map = get_labeling_functions_map() @@ -630,6 +632,9 @@ def get_label_times(self, visualize=False): Returns: tuple: (composeml.LabelTimes, dict) The label times and metadata. """ + if self._label_times is None: + raise ValueError("Label times have not been set" + "Call generate_label_times or set_label_times first.") if visualize: cp.label_times.plots.LabelPlots(self._label_times).distribution() return self._label_times, self._label_times_meta @@ -724,7 +729,20 @@ def generate_feature_matrix( Returns: tuple: (pd.DataFrame, list, featuretools.EntitySet) Feature matrix, feature definitions, and the processed entityset. + + Raises: + ValueError: If required attributes are missing. """ + if self._entityset is None: + raise ValueError( + "Entityset has not been set. Call generate_entityset or " + "set_entityset first.") + + if self._label_times is None: + raise ValueError( + "Label times have not been set. Call generate_label_times or " + "set_label_times first.") + entityset_copy = copy.deepcopy(self._entityset) # perform signal processing if signal_dataframe_name is not None and signal_column is not None: @@ -784,6 +802,9 @@ def get_feature_matrix(self): tuple: (pd.DataFrame, str, list) The feature matrix, label column name, and feature definitions. """ + if self._feature_matrix is None: + raise ValueError("Feature matrix has not been generated. " + "Call generate_feature_matrix or set_feature_matrix first.") return self._feature_matrix, self._label_col_name, self._features @guide @@ -830,6 +851,11 @@ def generate_train_test_split( Returns: tuple: (X_train, X_test, y_train, y_test) The split feature matrices and labels. """ + if self._feature_matrix is None: + raise ValueError( + "Feature matrix has not been generated. Call generate_feature_matrix " + "or set_feature_matrix first.") + feature_matrix = self._feature_matrix.copy() labels = feature_matrix.pop(self._label_col_name) @@ -880,7 +906,9 @@ def get_train_test_split(self): """ if (self._X_train is None or self._X_test is None or self._y_train is None or self._y_test is None): - return None + raise ValueError( + "Train-test split has not been generated. " + "Call generate_train_test_split or set_train_test_split first.") return self._X_train, self._X_test, self._y_train, self._y_test @guide @@ -894,8 +922,8 @@ def set_fitted_pipeline(self, pipeline): @guide def fit_pipeline( - self, pipeline="xgb_classifier", pipeline_hyperparameters=None, - X=None, y=None, visual=False, **kwargs): + self, pipeline="xgb_classifier", + pipeline_hyperparameters=None, visual=False, **kwargs): """Fit a machine learning pipeline. Args: @@ -905,28 +933,29 @@ def fit_pipeline( - Dictionary with pipeline specification - MLPipeline instance pipeline_hyperparameters (dict, optional): Hyperparameters for the pipeline. - X (pd.DataFrame, optional): Training features. If None, uses stored training set. - y (array-like, optional): Training labels. If None, uses stored training labels. visual (bool, optional): Whether to return visualization data. Defaults to False. **kwargs: Additional arguments passed to the pipeline's fit method. Returns: dict or None: If visual=True, returns visualization data dictionary. + + Raises: + ValueError: If required attributes are missing. """ - self._pipeline = self._get_mlpipeline( - pipeline, pipeline_hyperparameters) + if self._X_train is None or self._y_train is None: + raise ValueError( + "No training data provided. Call generate_train_test_split " + "or set_train_test_split first.") - if X is None: - X = self._X_train - if y is None: - y = self._y_train + self._pipeline = self._get_mlpipeline(pipeline, pipeline_hyperparameters) if visual: outputs_spec, visual_names = self._get_outputs_spec(False) else: outputs_spec = None - outputs = self._pipeline.fit(X, y, output_=outputs_spec, **kwargs) + outputs = self._pipeline.fit(X=self._X_train, y=self._y_train, + output_=outputs_spec, **kwargs) if visual and outputs is not None: return dict(zip(visual_names, outputs)) @@ -951,9 +980,22 @@ def predict(self, X=None, visual=False, **kwargs): Returns: array-like or tuple: Predictions, and if visual=True, also returns visualization data. + + Raises: + ValueError: If required attributes or parameters are missing. """ - if X is None: + if self._pipeline is None: + raise ValueError( + "No pipeline has been fitted. Call fit_pipeline or set_fitted_pipeline first.") + + if X is None and self._X_test is None: + raise ValueError( + "No test data provided. Pass in test data or " + "call generate_train_test_split or set_train_test_split first.") + + elif X is None: X = self._X_test + if visual: outputs_spec, visual_names = self._get_outputs_spec() else: @@ -984,9 +1026,22 @@ def evaluate( Returns: dict: A dictionary mapping metric names to their computed values. + + Raises: + ValueError: If required attributes are missing. """ + if self._pipeline is None: + raise ValueError( + "No pipeline has been fitted. Call fit_pipeline or set_fitted_pipeline first.") + + if (X is None and self._X_test is None) or (y is None and self._y_test is None): + raise ValueError( + "No test data provided. Pass in test data or " + "call generate_train_test_split or set_train_test_split first.") + if X is None: X = self._X_test + if y is None: y = self._y_test From ad9a1173319e07bde1f5a78fc76ac2b83c42e226 Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Sun, 1 Jun 2025 15:07:07 -0700 Subject: [PATCH 5/7] privatize other code --- tests/labeling/test_data_labeler.py | 2 +- tests/labeling/test_helpers.py | 2 +- tests/test_entityset.py | 6 +- tests/test_feature_engineering.py | 24 +- tests/test_guide.py | 2 +- tests/test_metadata.py | 2 +- zephyr_ml/__init__.py | 2 - zephyr_ml/{entityset.py => _entityset.py} | 4 +- ...engineering.py => _feature_engineering.py} | 0 zephyr_ml/_guide_handler.py | 296 +++++++++++++++++ zephyr_ml/{labeling => _labeling}/__init__.py | 6 +- .../{labeling => _labeling}/data_labeler.py | 0 .../_labeling/labeling_functions/__init__.py | 5 + .../labeling_functions/brake_pad_presence.py | 2 +- .../converter_replacement_presence.py | 2 +- .../labeling_functions/planet_bearing.py | 2 +- .../labeling_functions/total_power_loss.py | 2 +- zephyr_ml/{labeling => _labeling}/utils.py | 0 zephyr_ml/{metadata.py => _metadata.py} | 0 zephyr_ml/core.py | 301 +----------------- .../labeling/labeling_functions/__init__.py | 5 - 21 files changed, 334 insertions(+), 331 deletions(-) rename zephyr_ml/{entityset.py => _entityset.py} (98%) rename zephyr_ml/{feature_engineering.py => _feature_engineering.py} (100%) create mode 100644 zephyr_ml/_guide_handler.py rename zephyr_ml/{labeling => _labeling}/__init__.py (88%) rename zephyr_ml/{labeling => _labeling}/data_labeler.py (100%) create mode 100644 zephyr_ml/_labeling/labeling_functions/__init__.py rename zephyr_ml/{labeling => _labeling}/labeling_functions/brake_pad_presence.py (97%) rename zephyr_ml/{labeling => _labeling}/labeling_functions/converter_replacement_presence.py (97%) rename zephyr_ml/{labeling => _labeling}/labeling_functions/planet_bearing.py (97%) rename zephyr_ml/{labeling => _labeling}/labeling_functions/total_power_loss.py (96%) rename zephyr_ml/{labeling => _labeling}/utils.py (100%) rename zephyr_ml/{metadata.py => _metadata.py} (100%) delete mode 100644 zephyr_ml/labeling/labeling_functions/__init__.py diff --git a/tests/labeling/test_data_labeler.py b/tests/labeling/test_data_labeler.py index a0176ff..ed36772 100644 --- a/tests/labeling/test_data_labeler.py +++ b/tests/labeling/test_data_labeler.py @@ -2,7 +2,7 @@ import featuretools as ft -from zephyr_ml.labeling import DataLabeler +from zephyr_ml._labeling.data_labeler import DataLabeler class TestDataLabeler: diff --git a/tests/labeling/test_helpers.py b/tests/labeling/test_helpers.py index a43ca9b..95fdd38 100644 --- a/tests/labeling/test_helpers.py +++ b/tests/labeling/test_helpers.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from zephyr_ml.labeling.utils import ( +from zephyr_ml._labeling.utils import ( aggregate_by_column, categorical_presence, denormalize, greater_than, keyword_in_text, merge_binary_labeling_functions, total_duration) diff --git a/tests/test_entityset.py b/tests/test_entityset.py index ab3ec76..580fb03 100644 --- a/tests/test_entityset.py +++ b/tests/test_entityset.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from zephyr_ml import _create_entityset +from zephyr_ml._entityset import create_entityset @pytest.fixture @@ -120,11 +120,11 @@ def scada_dfs(base_dfs): def create_pidata_entityset(pidata_dfs): - return _create_entityset(pidata_dfs, es_type="pidata") + return create_entityset(pidata_dfs, es_type="pidata") def create_scada_entityset(scada_dfs): - return _create_entityset(scada_dfs, es_type="scada") + return create_entityset(scada_dfs, es_type="scada") def test_create_pidata_missing_entities(pidata_dfs): diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index e4c08ba..fbc6e54 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,8 +1,8 @@ import pandas as pd import pytest -from zephyr_ml import _create_entityset -from zephyr_ml.feature_engineering import process_signals +from zephyr_ml._entityset import create_entityset +from zephyr_ml._feature_engineering import process_signals @pytest.fixture @@ -122,12 +122,12 @@ def scada_dfs(base_dfs): @pytest.fixture def pidata_es(pidata_dfs): - return _create_entityset(pidata_dfs, "pidata") + return create_entityset(pidata_dfs, "pidata") @pytest.fixture def scada_es(scada_dfs): - return _create_entityset(scada_dfs, "scada") + return create_entityset(scada_dfs, "scada") @pytest.fixture @@ -153,8 +153,8 @@ def test_process_signals_pidata(pidata_es, transformations, aggregations): replace_dataframe = False before = pidata_es['pidata'].copy() - process_signals(pidata_es, signal_dataframe_name, signal_column, transformations, aggregations, - window_size, replace_dataframe) + process_signals(pidata_es, signal_dataframe_name, signal_column, transformations, + aggregations, window_size, replace_dataframe) processed = pidata_es['pidata_processed'].copy() after = pidata_es['pidata'].copy() @@ -189,8 +189,8 @@ def test_process_signals_pidata_replace( window_size = '1m' replace_dataframe = True - process_signals(pidata_es, signal_dataframe_name, signal_column, transformations, aggregations, - window_size, replace_dataframe) + process_signals(pidata_es, signal_dataframe_name, signal_column, transformations, + aggregations, window_size, replace_dataframe) processed = pidata_es['pidata'].copy() @@ -224,8 +224,8 @@ def test_process_signals_scada(scada_es, transformations, aggregations): replace_dataframe = False before = scada_es['scada'].copy() - process_signals(scada_es, signal_dataframe_name, signal_column, transformations, aggregations, - window_size, replace_dataframe) + process_signals(scada_es, signal_dataframe_name, signal_column, transformations, + aggregations, window_size, replace_dataframe) expected = pd.DataFrame({ "_index": [0, 1, 2], @@ -256,8 +256,8 @@ def test_process_signals_scada_replace( window_size = '1m' replace_dataframe = True - process_signals(scada_es, signal_dataframe_name, signal_column, transformations, aggregations, - window_size, replace_dataframe) + process_signals(scada_es, signal_dataframe_name, signal_column, transformations, + aggregations, window_size, replace_dataframe) expected = pd.DataFrame({ "_index": [0, 1, 2], diff --git a/tests/test_guide.py b/tests/test_guide.py index e0dcabc..db54bef 100644 --- a/tests/test_guide.py +++ b/tests/test_guide.py @@ -1,4 +1,4 @@ -from zephyr_ml.core import GuideHandler, guide +from zephyr_ml._guide_handler import GuideHandler, guide class DummyObject: diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8d8f923..6dd6f93 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,6 +1,6 @@ import pytest -from zephyr_ml.metadata import DEFAULT_ES_KWARGS, DEFAULT_ES_TYPE_KWARGS, get_mapped_kwargs +from zephyr_ml._metadata import DEFAULT_ES_KWARGS, DEFAULT_ES_TYPE_KWARGS, get_mapped_kwargs def test_default_scada_mapped_kwargs(): diff --git a/zephyr_ml/__init__.py b/zephyr_ml/__init__.py index 6213460..3e633d1 100644 --- a/zephyr_ml/__init__.py +++ b/zephyr_ml/__init__.py @@ -9,8 +9,6 @@ import os from zephyr_ml.core import Zephyr -from zephyr_ml.entityset import VALIDATE_DATA_FUNCTIONS, _create_entityset -from zephyr_ml.labeling import DataLabeler MLBLOCKS_PRIMITIVES = os.path.join( os.path.dirname(__file__), "primitives", "jsons") diff --git a/zephyr_ml/entityset.py b/zephyr_ml/_entityset.py similarity index 98% rename from zephyr_ml/entityset.py rename to zephyr_ml/_entityset.py index 514c325..8b3e4db 100644 --- a/zephyr_ml/entityset.py +++ b/zephyr_ml/_entityset.py @@ -2,7 +2,7 @@ import featuretools as ft -from zephyr_ml.metadata import get_mapped_kwargs +from zephyr_ml._metadata import get_mapped_kwargs def _validate_data(dfs, es_type, es_kwargs): @@ -159,7 +159,7 @@ def validate_vibrations_data(dfs, new_kwargs_mapping=None): } -def _create_entityset(entities, es_type, new_kwargs_mapping=None): +def create_entityset(entities, es_type, new_kwargs_mapping=None): validate_func = VALIDATE_DATA_FUNCTIONS[es_type] es_kwargs = validate_func(entities, new_kwargs_mapping) diff --git a/zephyr_ml/feature_engineering.py b/zephyr_ml/_feature_engineering.py similarity index 100% rename from zephyr_ml/feature_engineering.py rename to zephyr_ml/_feature_engineering.py diff --git a/zephyr_ml/_guide_handler.py b/zephyr_ml/_guide_handler.py new file mode 100644 index 0000000..995217b --- /dev/null +++ b/zephyr_ml/_guide_handler.py @@ -0,0 +1,296 @@ +import logging +from functools import wraps + + +LOGGER = logging.getLogger(__name__) + + +class GuideHandler: + + def __init__(self, ordered_steps): + self.cur_iteration = 0 + self.current_step = -1 + self.start_point = -1 + self.ordered_steps = ordered_steps + self.set_methods = set() + + self.producer_to_step_map = {} + self.getter_to_step_map = {} + + self.iterations = [] + for idx, (keys, sets, gets) in enumerate(self.ordered_steps): + self.iterations.append(-1) + + for prod in keys: + self.producer_to_step_map[prod.__name__] = idx + for prod in sets: + self.producer_to_step_map[prod.__name__] = idx + self.set_methods.add(prod.__name__) + + for get in gets: + self.getter_to_step_map[get.__name__] = idx + + def or_join(self, methods): + return " or ".join([method.__name__ for method in methods]) + + def get_get_steps_in_between(self, cur_step, next_step): + step_strs = [] + for step in range(cur_step + 1, next_step): + step_strs.append( + f"{step} {self.or_join(self.ordered_steps[step][2])}") + return step_strs + + def get_last_up_to_date(self, next_step): + latest_up_to_date = 0 + for step in range(next_step): + if self.iterations[step] == self.cur_iteration: + latest_up_to_date = step + return latest_up_to_date + + def join_steps(self, step_strs): + return "\n\t".join(step_strs) + + def get_steps_in_between(self, cur_step, next_step): + step_strs = [] + for step in range(cur_step + 1, next_step): + option_strs = [] + option_strs.extend(self.ordered_steps[step][0]) + option_strs.extend(self.ordered_steps[step][1]) + step_strs.append(f"{step}. {self.or_join(option_strs)}") + return step_strs + + def log_next_producer_step(self, name): + next_step = self.current_step + 1 + + if next_step >= len(self.ordered_steps): + cur_step_name = self.or_join(self.ordered_steps[self.current_step][0]) + LOGGER.warning((f"[GUIDE] DONE: {name}.\n" + f"\tYou have reached the end of the " + f"predictive engineering workflow.\n" + f"\tYou can call {cur_step_name} again or re-perform previous steps " + f"based on results.")) + else: + next_step_name = self.or_join(self.ordered_steps[next_step][0]) + LOGGER.warning(f"[GUIDE] DONE: {name}.\n" + f"\tYou can perform the next step by calling {next_step_name}.") + + def perform_producer_step(self, zephyr, method, + *method_args, **method_kwargs): + step_num = self.producer_to_step_map[method.__name__] + res = method(zephyr, *method_args, **method_kwargs) + self.current_step = step_num + self.iterations[step_num] = self.cur_iteration + self.log_next_producer_step(method.__name__) + return res + + def try_log_forward_set_method_warning(self, name, next_step): + if self.current_step != -1: + from_str = (f"Going from step {self.current_step} to " + f"step {next_step} by performing {name}.") + else: + from_str = (f"Performing step {next_step} with {name}.") + LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" + f"\t{from_str}\n" + f"\tThis is a forward step via a set method.\n" + f"\tAll previous steps' results will be considered stale.")) + + def try_log_backwards_set_method_warning(self, name, next_step): + LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" + f"\tGoing from step {self.current_step} to " + f"step {next_step} by performing {name}.\n" + f"\tThis is a backwards step via a set method.\n" + f"\tAll other steps' results will be considered stale.")) + + def try_log_backwards_key_method_warning(self, name, next_step): + steps_in_between = self.get_steps_in_between(next_step, self.current_step + 1) + if len(steps_in_between) > 0: + steps_in_between_str = (f"\tAny results produced by the following steps " + f"will be considered stale:\n" + f"\t{self.join_steps(steps_in_between)}") + else: + steps_in_between_str = "" + + LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" + f"\tGoing from step {self.current_step} to " + f"step {next_step} by performing {name}.\n" + f"\tThis is a backwards step via a key method.\n" + f"{steps_in_between_str}")) + + def log_get_inconsistent_warning(self, name, next_step): + prod_steps_str = self.or_join(self.ordered_steps[next_step][0]) + prod_steps = f"{next_step}.{prod_steps_str}" + latest_up_to_date = self.get_last_up_to_date(next_step) + LOGGER.warning((f"[GUIDE] INCONSISTENCY WARNING: {name}.\n" + f"Unable to perform {name} because" + f"{prod_steps} has not been run yet.\n" + f"Run steps starting at or before {latest_up_to_date}.")) + + def log_get_stale_warning(self, name, next_step): + latest_up_to_date = self.get_last_up_to_date(next_step) + LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" + f"This data is potentially stale.\n" + f"Re-run steps starting at or before {latest_up_to_date}" + f"to ensure data is up to date.")) + + # tries to perform step if possible -> warns that data might be stale + + def try_perform_forward_producer_step( + self, zephyr, method, *method_args, **method_kwargs): + name = method.__name__ + next_step = self.producer_to_step_map[name] + if name in self.set_methods: # set method will update start point and start new iteration + self.try_log_forward_set_method_warning(name, next_step) + self.start_point = next_step + self.cur_iteration += 1 + # next_step == 0, set method (already warned), or previous step is up + # to term + res = self.perform_producer_step( + zephyr, method, *method_args, **method_kwargs) + return res + + def try_perform_backward_producer_step( + self, zephyr, method, *method_args, **method_kwargs): + name = method.__name__ + next_step = self.producer_to_step_map[name] + # starting new iteration + self.cur_iteration += 1 + if next_step == 0 or name in self.set_methods: + self.start_point = next_step + else: # key method + # mark everything from start point to next step as current term + for i in range(self.start_point, next_step): + if self.iterations[i] != -1: + self.iterations[i] = self.cur_iteration + + if name in self.set_methods: + self.try_log_backwards_set_method_warning(name, next_step) + else: + self.try_log_backwards_key_method_warning(name, next_step) + + res = self.perform_producer_step( + zephyr, method, *method_args, **method_kwargs) + + return res + + def try_perform_producer_step( + self, zephyr, method, *method_args, **method_kwargs): + name = method.__name__ + next_step = self.producer_to_step_map[name] + if next_step >= self.current_step: + res = self.try_perform_forward_producer_step( + zephyr, method, *method_args, **method_kwargs) + return res + else: + res = self.try_perform_backward_producer_step( + zephyr, method, *method_args, **method_kwargs) + return res + + # dont update current step or terms + + def try_perform_inconsistent_producer_step( # add using stale and overwriting + self, zephyr, method, *method_args, **method_kwargs): + name = method.__name__ + next_step = self.producer_to_step_map[name] + # inconsistent forward step: performing key method but previous step is + # not up to date + if (next_step >= self.current_step and + self.iterations[next_step - 1] != self.cur_iteration): + prev_step = next_step - 1 + prev_set_method = self.or_join(self.ordered_steps[prev_step][1]) + prev_key_method = self.or_join(self.ordered_steps[prev_step][0]) + if next_step == len(self.ordered_steps) - 1: + final_text = (f"\tOtherwise, you can regenerate the data of the previous " + f"step by calling {prev_key_method}, and then call {name} again.") + else: + corr_set_method = self.or_join(self.ordered_steps[next_step][1]) + final_text = (f"\tIf you already have the data for THIS step, you can use " + f"{corr_set_method} to set the data.\n" + f"\tOtherwise, you can regenerate the data of the " + f"previous step by calling {prev_key_method}, " + f"and then call {name} again.") + LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: {name}\n" + f"\tUnable to perform {name} because you are " + f"performing a key method at step {next_step} but the result of the " + f"previous step, step {prev_step}, is stale.\n" + f"\tIf you want to use the stale result or " + f"already have the data for step {prev_step}, you can use " + f"{prev_set_method} to set the data.\n" + f"{final_text}") + elif (next_step < self.current_step and + self.iterations[next_step - 1] != self.cur_iteration): + prev_step = next_step - 1 + prev_key_method = self.or_join(self.ordered_steps[prev_step][0]) + prev_set_method = self.or_join(self.ordered_steps[prev_step][1]) + + if next_step == len(self.ordered_steps) - 1: + final_text = (f"\tOtherwise, you can regenerate the data of the previous " + f"step by calling {prev_key_method}, and then call {name} again.") + else: + corr_set_method = self.or_join(self.ordered_steps[next_step][1]) + final_text = (f"\tIf you already have the data for THIS step, you can use " + f"{corr_set_method} to set the data.\n" + f"\tOtherwise, you can regenerate the data of the " + f"previous step by calling {prev_key_method}, " + f"and then call {name} again.") + LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: {name}\n" + f"\tUnable to perform {name} because " + f"you are going backwards and starting a new iteration by " + f"performing a key method at step {next_step} but the result of the " + f"previous step, step {prev_step}, is STALE.\n" + f"\tIf you want to use the STALE result or " + f"already have the data for step {prev_step}, you can use " + f"{prev_set_method} to set the data.\n" + f"{final_text}") + + def try_perform_getter_step( + self, zephyr, method, *method_args, **method_kwargs): + name = method.__name__ + # either inconsistent, stale, or up to date + step_num = self.getter_to_step_map[name] + step_iteration = self.iterations[step_num] + if step_iteration == -1: + self.log_get_inconsistent_warning(name, step_num) + elif step_iteration == self.cur_iteration: + res = method(zephyr, *method_args, **method_kwargs) + return res + else: + self.log_get_stale_warning(name, step_num) + res = method(zephyr, *method_args, **method_kwargs) + return res + + def guide_step(self, zephyr, method, *method_args, **method_kwargs): + method_name = method.__name__ + if method_name in self.producer_to_step_map: + # up-todate + next_step = self.producer_to_step_map[method_name] + if (next_step == 0 or # 0 step always valid, starting new iteration + # set method always valid, but will update start point and + # start new iteration + method_name in self.set_methods or + # key method valid if previous step is up to date + self.iterations[next_step - 1] == self.cur_iteration): + # forward step only valid if set method or key method w/ no + # skips + res = self.try_perform_producer_step( + zephyr, method, *method_args, **method_kwargs) + return res + else: # stale or inconsistent + res = self.try_perform_inconsistent_producer_step( + zephyr, method, *method_args, **method_kwargs) + return res + elif method_name in self.getter_to_step_map: + res = self.try_perform_getter_step( + zephyr, method, *method_args, **method_kwargs) + return res + else: + print(f"Method {method_name} does not need to be wrapped") + + +def guide(method): + + @wraps(method) + def guided_step(instance, *method_args, **method_kwargs): + return instance._guide_handler.guide_step( + instance, method, *method_args, **method_kwargs) + + return guided_step diff --git a/zephyr_ml/labeling/__init__.py b/zephyr_ml/_labeling/__init__.py similarity index 88% rename from zephyr_ml/labeling/__init__.py rename to zephyr_ml/_labeling/__init__.py index 60cb5d7..4a8c9bb 100644 --- a/zephyr_ml/labeling/__init__.py +++ b/zephyr_ml/_labeling/__init__.py @@ -1,6 +1,6 @@ -from zephyr_ml.labeling import utils -from zephyr_ml.labeling.data_labeler import DataLabeler -from zephyr_ml.labeling.labeling_functions import ( +from zephyr_ml._labeling import utils +from zephyr_ml._labeling.data_labeler import DataLabeler +from zephyr_ml._labeling.labeling_functions import ( brake_pad_presence, converter_replacement_presence, gearbox_replace_presence, total_power_loss) LABELING_FUNCTIONS = [ diff --git a/zephyr_ml/labeling/data_labeler.py b/zephyr_ml/_labeling/data_labeler.py similarity index 100% rename from zephyr_ml/labeling/data_labeler.py rename to zephyr_ml/_labeling/data_labeler.py diff --git a/zephyr_ml/_labeling/labeling_functions/__init__.py b/zephyr_ml/_labeling/labeling_functions/__init__.py new file mode 100644 index 0000000..241cbe1 --- /dev/null +++ b/zephyr_ml/_labeling/labeling_functions/__init__.py @@ -0,0 +1,5 @@ +from zephyr_ml._labeling.labeling_functions.brake_pad_presence import brake_pad_presence +from zephyr_ml._labeling.labeling_functions.converter_replacement_presence import ( + converter_replacement_presence) +from zephyr_ml._labeling.labeling_functions.planet_bearing import gearbox_replace_presence +from zephyr_ml._labeling.labeling_functions.total_power_loss import total_power_loss diff --git a/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py b/zephyr_ml/_labeling/labeling_functions/brake_pad_presence.py similarity index 97% rename from zephyr_ml/labeling/labeling_functions/brake_pad_presence.py rename to zephyr_ml/_labeling/labeling_functions/brake_pad_presence.py index fe32c33..3d73b33 100644 --- a/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py +++ b/zephyr_ml/_labeling/labeling_functions/brake_pad_presence.py @@ -1,4 +1,4 @@ -from zephyr_ml.labeling.utils import denormalize +from zephyr_ml._labeling.utils import denormalize def brake_pad_presence(es, column_map={}): diff --git a/zephyr_ml/labeling/labeling_functions/converter_replacement_presence.py b/zephyr_ml/_labeling/labeling_functions/converter_replacement_presence.py similarity index 97% rename from zephyr_ml/labeling/labeling_functions/converter_replacement_presence.py rename to zephyr_ml/_labeling/labeling_functions/converter_replacement_presence.py index eef8bd8..11884ce 100644 --- a/zephyr_ml/labeling/labeling_functions/converter_replacement_presence.py +++ b/zephyr_ml/_labeling/labeling_functions/converter_replacement_presence.py @@ -1,4 +1,4 @@ -from zephyr_ml.labeling.utils import denormalize +from zephyr_ml._labeling.utils import denormalize def converter_replacement_presence(es, column_map={}): diff --git a/zephyr_ml/labeling/labeling_functions/planet_bearing.py b/zephyr_ml/_labeling/labeling_functions/planet_bearing.py similarity index 97% rename from zephyr_ml/labeling/labeling_functions/planet_bearing.py rename to zephyr_ml/_labeling/labeling_functions/planet_bearing.py index 36a5412..fdd07af 100644 --- a/zephyr_ml/labeling/labeling_functions/planet_bearing.py +++ b/zephyr_ml/_labeling/labeling_functions/planet_bearing.py @@ -1,4 +1,4 @@ -from zephyr_ml.labeling.utils import denormalize +from zephyr_ml._labeling.utils import denormalize def gearbox_replace_presence(es, column_map={}): diff --git a/zephyr_ml/labeling/labeling_functions/total_power_loss.py b/zephyr_ml/_labeling/labeling_functions/total_power_loss.py similarity index 96% rename from zephyr_ml/labeling/labeling_functions/total_power_loss.py rename to zephyr_ml/_labeling/labeling_functions/total_power_loss.py index a866a05..343b17b 100644 --- a/zephyr_ml/labeling/labeling_functions/total_power_loss.py +++ b/zephyr_ml/_labeling/labeling_functions/total_power_loss.py @@ -1,5 +1,5 @@ -from zephyr_ml.labeling.utils import denormalize +from zephyr_ml._labeling.utils import denormalize def total_power_loss(es, column_map={}): diff --git a/zephyr_ml/labeling/utils.py b/zephyr_ml/_labeling/utils.py similarity index 100% rename from zephyr_ml/labeling/utils.py rename to zephyr_ml/_labeling/utils.py diff --git a/zephyr_ml/metadata.py b/zephyr_ml/_metadata.py similarity index 100% rename from zephyr_ml/metadata.py rename to zephyr_ml/_metadata.py diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py index 1c70fc8..ffaff68 100644 --- a/zephyr_ml/core.py +++ b/zephyr_ml/core.py @@ -2,7 +2,6 @@ import json import logging import os -from functools import wraps from inspect import getfullargspec import composeml as cp @@ -12,9 +11,10 @@ from mlblocks import MLBlock, MLPipeline from sklearn.model_selection import train_test_split -from zephyr_ml.entityset import VALIDATE_DATA_FUNCTIONS, _create_entityset -from zephyr_ml.feature_engineering import process_signals -from zephyr_ml.labeling import get_labeling_functions, get_labeling_functions_map +from zephyr_ml._entityset import create_entityset, VALIDATE_DATA_FUNCTIONS +from zephyr_ml._feature_engineering import process_signals +from zephyr_ml._labeling import get_labeling_functions, get_labeling_functions_map +from zephyr_ml._guide_handler import guide, GuideHandler DEFAULT_METRICS = [ "sklearn.metrics.accuracy_score", @@ -28,297 +28,6 @@ LOGGER = logging.getLogger(__name__) -class GuideHandler: - - def __init__(self, ordered_steps): - self.cur_iteration = 0 - self.current_step = -1 - self.start_point = -1 - self.ordered_steps = ordered_steps - self.set_methods = set() - - self.producer_to_step_map = {} - self.getter_to_step_map = {} - - self.iterations = [] - for idx, (keys, sets, gets) in enumerate(self.ordered_steps): - self.iterations.append(-1) - - for prod in keys: - self.producer_to_step_map[prod.__name__] = idx - for prod in sets: - self.producer_to_step_map[prod.__name__] = idx - self.set_methods.add(prod.__name__) - - for get in gets: - self.getter_to_step_map[get.__name__] = idx - - def or_join(self, methods): - return " or ".join([method.__name__ for method in methods]) - - def get_get_steps_in_between(self, cur_step, next_step): - step_strs = [] - for step in range(cur_step + 1, next_step): - step_strs.append( - f"{step} {self.or_join(self.ordered_steps[step][2])}") - return step_strs - - def get_last_up_to_date(self, next_step): - latest_up_to_date = 0 - for step in range(next_step): - if self.iterations[step] == self.cur_iteration: - latest_up_to_date = step - return latest_up_to_date - - def join_steps(self, step_strs): - return "\n\t".join(step_strs) - - def get_steps_in_between(self, cur_step, next_step): - step_strs = [] - for step in range(cur_step + 1, next_step): - option_strs = [] - option_strs.extend(self.ordered_steps[step][0]) - option_strs.extend(self.ordered_steps[step][1]) - step_strs.append(f"{step}. {self.or_join(option_strs)}") - return step_strs - - def log_next_producer_step(self, name): - next_step = self.current_step + 1 - - if next_step >= len(self.ordered_steps): - cur_step_name = self.or_join(self.ordered_steps[self.current_step][0]) - LOGGER.warning((f"[GUIDE] DONE: {name}.\n" - f"\tYou have reached the end of the " - f"predictive engineering workflow.\n" - f"\tYou can call {cur_step_name} again or re-perform previous steps " - f"based on results.")) - else: - next_step_name = self.or_join(self.ordered_steps[next_step][0]) - LOGGER.warning(f"[GUIDE] DONE: {name}.\n" - f"\tYou can perform the next step by calling {next_step_name}.") - - def perform_producer_step(self, zephyr, method, - *method_args, **method_kwargs): - step_num = self.producer_to_step_map[method.__name__] - res = method(zephyr, *method_args, **method_kwargs) - self.current_step = step_num - self.iterations[step_num] = self.cur_iteration - self.log_next_producer_step(method.__name__) - return res - - def try_log_forward_set_method_warning(self, name, next_step): - if self.current_step != -1: - from_str = (f"Going from step {self.current_step} to " - f"step {next_step} by performing {name}.") - else: - from_str = (f"Performing step {next_step} with {name}.") - LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" - f"\t{from_str}\n" - f"\tThis is a forward step via a set method.\n" - f"\tAll previous steps' results will be considered stale.")) - - def try_log_backwards_set_method_warning(self, name, next_step): - LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" - f"\tGoing from step {self.current_step} to " - f"step {next_step} by performing {name}.\n" - f"\tThis is a backwards step via a set method.\n" - f"\tAll other steps' results will be considered stale.")) - - def try_log_backwards_key_method_warning(self, name, next_step): - steps_in_between = self.get_steps_in_between(next_step, self.current_step + 1) - if len(steps_in_between) > 0: - steps_in_between_str = (f"\tAny results produced by the following steps " - f"will be considered stale:\n" - f"\t{self.join_steps(steps_in_between)}") - else: - steps_in_between_str = "" - - LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" - f"\tGoing from step {self.current_step} to " - f"step {next_step} by performing {name}.\n" - f"\tThis is a backwards step via a key method.\n" - f"{steps_in_between_str}")) - - def log_get_inconsistent_warning(self, name, next_step): - prod_steps_str = self.or_join(self.ordered_steps[next_step][0]) - prod_steps = f"{next_step}.{prod_steps_str}" - latest_up_to_date = self.get_last_up_to_date(next_step) - LOGGER.warning((f"[GUIDE] INCONSISTENCY WARNING: {name}.\n" - f"Unable to perform {name} because" - f"{prod_steps} has not been run yet.\n" - f"Run steps starting at or before {latest_up_to_date}.")) - - def log_get_stale_warning(self, name, next_step): - latest_up_to_date = self.get_last_up_to_date(next_step) - LOGGER.warning((f"[GUIDE] STALE WARNING: {name}.\n" - f"This data is potentially stale.\n" - f"Re-run steps starting at or before {latest_up_to_date}" - f"to ensure data is up to date.")) - - # tries to perform step if possible -> warns that data might be stale - - def try_perform_forward_producer_step( - self, zephyr, method, *method_args, **method_kwargs): - name = method.__name__ - next_step = self.producer_to_step_map[name] - if name in self.set_methods: # set method will update start point and start new iteration - self.try_log_forward_set_method_warning(name, next_step) - self.start_point = next_step - self.cur_iteration += 1 - # next_step == 0, set method (already warned), or previous step is up - # to term - res = self.perform_producer_step( - zephyr, method, *method_args, **method_kwargs) - return res - - def try_perform_backward_producer_step( - self, zephyr, method, *method_args, **method_kwargs): - name = method.__name__ - next_step = self.producer_to_step_map[name] - # starting new iteration - self.cur_iteration += 1 - if next_step == 0 or name in self.set_methods: - self.start_point = next_step - else: # key method - # mark everything from start point to next step as current term - for i in range(self.start_point, next_step): - if self.iterations[i] != -1: - self.iterations[i] = self.cur_iteration - - if name in self.set_methods: - self.try_log_backwards_set_method_warning(name, next_step) - else: - self.try_log_backwards_key_method_warning(name, next_step) - - res = self.perform_producer_step( - zephyr, method, *method_args, **method_kwargs) - - return res - - def try_perform_producer_step( - self, zephyr, method, *method_args, **method_kwargs): - name = method.__name__ - next_step = self.producer_to_step_map[name] - if next_step >= self.current_step: - res = self.try_perform_forward_producer_step( - zephyr, method, *method_args, **method_kwargs) - return res - else: - res = self.try_perform_backward_producer_step( - zephyr, method, *method_args, **method_kwargs) - return res - - # dont update current step or terms - - def try_perform_inconsistent_producer_step( # add using stale and overwriting - self, zephyr, method, *method_args, **method_kwargs): - name = method.__name__ - next_step = self.producer_to_step_map[name] - # inconsistent forward step: performing key method but previous step is - # not up to date - if (next_step >= self.current_step and - self.iterations[next_step - 1] != self.cur_iteration): - prev_step = next_step - 1 - prev_set_method = self.or_join(self.ordered_steps[prev_step][1]) - prev_key_method = self.or_join(self.ordered_steps[prev_step][0]) - if next_step == len(self.ordered_steps) - 1: - final_text = (f"\tOtherwise, you can regenerate the data of the previous " - f"step by calling {prev_key_method}, and then call {name} again.") - else: - corr_set_method = self.or_join(self.ordered_steps[next_step][1]) - final_text = (f"\tIf you already have the data for THIS step, you can use " - f"{corr_set_method} to set the data.\n" - f"\tOtherwise, you can regenerate the data of the " - f"previous step by calling {prev_key_method}, " - f"and then call {name} again.") - LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: {name}\n" - f"\tUnable to perform {name} because you are " - f"performing a key method at step {next_step} but the result of the " - f"previous step, step {prev_step}, is stale.\n" - f"\tIf you want to use the stale result or " - f"already have the data for step {prev_step}, you can use " - f"{prev_set_method} to set the data.\n" - f"{final_text}") - elif (next_step < self.current_step and - self.iterations[next_step - 1] != self.cur_iteration): - prev_step = next_step - 1 - prev_key_method = self.or_join(self.ordered_steps[prev_step][0]) - prev_set_method = self.or_join(self.ordered_steps[prev_step][1]) - - if next_step == len(self.ordered_steps) - 1: - final_text = (f"\tOtherwise, you can regenerate the data of the previous " - f"step by calling {prev_key_method}, and then call {name} again.") - else: - corr_set_method = self.or_join(self.ordered_steps[next_step][1]) - final_text = (f"\tIf you already have the data for THIS step, you can use " - f"{corr_set_method} to set the data.\n" - f"\tOtherwise, you can regenerate the data of the " - f"previous step by calling {prev_key_method}, " - f"and then call {name} again.") - LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: {name}\n" - f"\tUnable to perform {name} because " - f"you are going backwards and starting a new iteration by " - f"performing a key method at step {next_step} but the result of the " - f"previous step, step {prev_step}, is STALE.\n" - f"\tIf you want to use the STALE result or " - f"already have the data for step {prev_step}, you can use " - f"{prev_set_method} to set the data.\n" - f"{final_text}") - - def try_perform_getter_step( - self, zephyr, method, *method_args, **method_kwargs): - name = method.__name__ - # either inconsistent, stale, or up to date - step_num = self.getter_to_step_map[name] - step_iteration = self.iterations[step_num] - if step_iteration == -1: - self.log_get_inconsistent_warning(name, step_num) - elif step_iteration == self.cur_iteration: - res = method(zephyr, *method_args, **method_kwargs) - return res - else: - self.log_get_stale_warning(name, step_num) - res = method(zephyr, *method_args, **method_kwargs) - return res - - def guide_step(self, zephyr, method, *method_args, **method_kwargs): - method_name = method.__name__ - if method_name in self.producer_to_step_map: - # up-todate - next_step = self.producer_to_step_map[method_name] - if (next_step == 0 or # 0 step always valid, starting new iteration - # set method always valid, but will update start point and - # start new iteration - method_name in self.set_methods or - # key method valid if previous step is up to date - self.iterations[next_step - 1] == self.cur_iteration): - # forward step only valid if set method or key method w/ no - # skips - res = self.try_perform_producer_step( - zephyr, method, *method_args, **method_kwargs) - return res - else: # stale or inconsistent - res = self.try_perform_inconsistent_producer_step( - zephyr, method, *method_args, **method_kwargs) - return res - elif method_name in self.getter_to_step_map: - res = self.try_perform_getter_step( - zephyr, method, *method_args, **method_kwargs) - return res - else: - print(f"Method {method_name} does not need to be wrapped") - - -def guide(method): - - @wraps(method) - def guided_step(instance, *method_args, **method_kwargs): - return instance._guide_handler.guide_step( - instance, method, *method_args, **method_kwargs) - - return guided_step - - class Zephyr: """Zephyr Class. @@ -435,7 +144,7 @@ def generate_entityset( raise ValueError( f"Invalid entityset type: {es_type}. Please use one of the following types:\ {VALIDATE_DATA_FUNCTIONS.keys()}") - entityset = _create_entityset(dfs, es_type, custom_kwargs_mapping) + entityset = create_entityset(dfs, es_type, custom_kwargs_mapping) # perform signal processing if signal_dataframe_name is not None and signal_column is not None: diff --git a/zephyr_ml/labeling/labeling_functions/__init__.py b/zephyr_ml/labeling/labeling_functions/__init__.py deleted file mode 100644 index b756735..0000000 --- a/zephyr_ml/labeling/labeling_functions/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from zephyr_ml.labeling.labeling_functions.brake_pad_presence import brake_pad_presence -from zephyr_ml.labeling.labeling_functions.converter_replacement_presence import ( - converter_replacement_presence) -from zephyr_ml.labeling.labeling_functions.planet_bearing import gearbox_replace_presence -from zephyr_ml.labeling.labeling_functions.total_power_loss import total_power_loss From 0e77f46153340ce418eed8dd7513d985265dd3e3 Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Sun, 29 Jun 2025 15:13:19 -0700 Subject: [PATCH 6/7] fix ga --- notebooks/modeling.ipynb | 68 ++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/notebooks/modeling.ipynb b/notebooks/modeling.ipynb index 377a6e2..e3c4017 100644 --- a/notebooks/modeling.ipynb +++ b/notebooks/modeling.ipynb @@ -545,9 +545,13 @@ "text": [ "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n", " warnings.warn(\n", - "[GUIDE] Successfully performed set_feature_matrix.\n", + "[GUIDE] STALE WARNING: set_feature_matrix.\n", + "\tPerforming step 2 with set_feature_matrix.\n", + "\tThis is a forward step via a set method.\n", + "\tAll previous steps' results will be considered stale.\n", + "[GUIDE] DONE: set_feature_matrix.\n", "\tYou can perform the next step by calling generate_train_test_split.\n", - "[GUIDE] Successfully performed generate_train_test_split.\n", + "[GUIDE] DONE: generate_train_test_split.\n", "\tYou can perform the next step by calling fit_pipeline.\n" ] }, @@ -604,7 +608,16 @@ "execution_count": 3, "id": "edffee03", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[GUIDE] DONE: fit_pipeline.\n", + "\tYou can perform the next step by calling predict or evaluate.\n" + ] + } + ], "source": [ "hyperparameters = {\n", " \"xgboost.XGBClassifier#1\": {\n", @@ -629,6 +642,15 @@ "id": "78187756", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[GUIDE] DONE: predict.\n", + "\tYou have reached the end of the predictive engineering workflow.\n", + "\tYou can call predict or evaluate again or re-perform previous steps based on results.\n" + ] + }, { "data": { "text/plain": [ @@ -644,6 +666,14 @@ "zephyr.predict()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c8440ee", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "24cda971", @@ -657,7 +687,17 @@ "execution_count": 5, "id": "cd097853", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[GUIDE] DONE: evaluate.\n", + "\tYou have reached the end of the predictive engineering workflow.\n", + "\tYou can call predict or evaluate again or re-perform previous steps based on results.\n" + ] + } + ], "source": [ "res = zephyr.evaluate()" ] @@ -675,10 +715,10 @@ " 'sklearn.metrics.precision_score': 0.5,\n", " 'sklearn.metrics.f1_score': 0.6666666666666666,\n", " 'sklearn.metrics.recall_score': 1.0,\n", - " 'zephyr_ml.primitives.postprocessing.confusion_matrix': (array([[1, 1],\n", + " 'zephyr_ml.primitives.evaluation.confusion_matrix': (array([[1, 1],\n", " [0, 1]]),\n", "
),\n", - " 'zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve': (0.5,\n", + " 'zephyr_ml.primitives.evaluation.roc_auc_score_and_curve': (0.5,\n", "
)}" ] }, @@ -719,13 +759,13 @@ ], "source": [ "%matplotlib inline\n", - "_, conf_matrix_fig = res[\"zephyr_ml.primitives.postprocessing.confusion_matrix\"]\n", + "_, conf_matrix_fig = res[\"zephyr_ml.primitives.evaluation.confusion_matrix\"]\n", "conf_matrix_fig" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "d59e86b1", "metadata": {}, "outputs": [ @@ -736,17 +776,25 @@ "
" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", - "_, roc_fig = res[\"zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve\"]\n", + "_, roc_fig = res[\"zephyr_ml.primitives.evaluation.roc_auc_score_and_curve\"]\n", "\n", "roc_fig\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d109a1", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 59b0be706d9adc2abf30322994c0324dcb0ac2ce Mon Sep 17 00:00:00 2001 From: Raymond Pan Date: Sun, 29 Jun 2025 15:48:28 -0700 Subject: [PATCH 7/7] lint --- zephyr_ml/_guide_handler.py | 1 - zephyr_ml/core.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/zephyr_ml/_guide_handler.py b/zephyr_ml/_guide_handler.py index 995217b..e2725ae 100644 --- a/zephyr_ml/_guide_handler.py +++ b/zephyr_ml/_guide_handler.py @@ -1,7 +1,6 @@ import logging from functools import wraps - LOGGER = logging.getLogger(__name__) diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py index ffaff68..87f3762 100644 --- a/zephyr_ml/core.py +++ b/zephyr_ml/core.py @@ -11,10 +11,10 @@ from mlblocks import MLBlock, MLPipeline from sklearn.model_selection import train_test_split -from zephyr_ml._entityset import create_entityset, VALIDATE_DATA_FUNCTIONS +from zephyr_ml._entityset import VALIDATE_DATA_FUNCTIONS, create_entityset from zephyr_ml._feature_engineering import process_signals +from zephyr_ml._guide_handler import GuideHandler, guide from zephyr_ml._labeling import get_labeling_functions, get_labeling_functions_map -from zephyr_ml._guide_handler import guide, GuideHandler DEFAULT_METRICS = [ "sklearn.metrics.accuracy_score",