From a4de9bf9aeb68b23bc31c84f698ccdc95542b3ff Mon Sep 17 00:00:00 2001 From: "akem134@elan" Date: Mon, 18 May 2020 11:23:36 +1200 Subject: [PATCH 1/5] Implement decision tree with polynomial features --- scripts/forecast_model.py | 33 ++++++++++++++++++++------------- whakaari/__init__.py | 12 +++++++++++- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/scripts/forecast_model.py b/scripts/forecast_model.py index 328e4ad..af61c5d 100644 --- a/scripts/forecast_model.py +++ b/scripts/forecast_model.py @@ -58,24 +58,31 @@ def forecast_dec2019(): fm.hires_forecast(ti=te-fm.dtw-fm.dtf, tf=te+month/30, recalculate=True, save=r'{:s}/forecast_hires.png'.format(fm.plotdir), n_jobs=n_jobs) -def forecast_test(): +def forecast_test(two_features=False): ''' test scale forecast model ''' # constants month = timedelta(days=365.25/12) - - # set up model - data_streams = ['rsam','mf','hf','dsar'] - fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, - look_forward=2., data_streams=data_streams, root='test') - + # set the available CPUs higher or lower as appropriate n_jobs = 6 - - # train the model - drop_features = ['linear_trend_timewise','agg_linear_trend'] - fm.train(ti='2012-04-01', tf='2012-10-01', drop_features=drop_features, retrain=True, - n_jobs=n_jobs) + + # set up model + if two_features: + data_streams = ['rsam'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='testPF') + fm.train(ti='2012-04-01', tf='2012-10-01', retrain=True, + n_jobs=n_jobs, classifier="DTPF", + use_only_features=['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"']) + else: + data_streams = ['rsam', 'mf', 'hf', 'dsar'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='test') + # train the model + drop_features = ['linear_trend_timewise', 'agg_linear_trend'] + fm.train(ti='2012-04-01', tf='2012-10-01', drop_features=drop_features, retrain=True, + n_jobs=n_jobs) # plot a forecast for a future eruption te = fm.data.tes[1] @@ -116,6 +123,6 @@ def forecast_now(): if __name__ == "__main__": #forecast_dec2019() - forecast_test() + forecast_test(two_features=True) #forecast_now() \ No newline at end of file diff --git a/whakaari/__init__.py b/whakaari/__init__.py index 80df64f..fe4fddd 100644 --- a/whakaari/__init__.py +++ b/whakaari/__init__.py @@ -55,9 +55,11 @@ from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import PolynomialFeatures datas = ['rsam','mf','hf','dsar'] -all_classifiers = ["SVM","KNN",'DT','RF','NN','NB','LR'] +all_classifiers = ["SVM","KNN",'DT', 'DTPF', 'RF','NN','NB','LR'] _MONTH = timedelta(days=365.25/12) _DAY = timedelta(days=1.) @@ -969,6 +971,7 @@ def train(self, ti=None, tf=None, Nfts=20, Ncl=100, retrain=False, classifier="D SVM - Support Vector Machine. KNN - k-Nearest Neighbors DT - Decision Tree + DTPF - Decision Tree on Polynomial Features RF - Random Forest NN - Neural Network NB - Naive Bayes @@ -1607,6 +1610,13 @@ def get_classifier(classifier): model = DecisionTreeClassifier(class_weight='balanced') grid = {'max_depth': [3,5,7], 'criterion': ['gini','entropy'], 'max_features': ['auto','sqrt','log2',None]} + elif classifier == 'DTPF': + model = Pipeline([('polynomial', PolynomialFeatures()), + ('clf', DecisionTreeClassifier()) + ]) + grid = {'clf__max_depth': [3, 5, 7], 'clf__criterion': ['gini', 'entropy'], + 'clf__max_features': ['auto', 'sqrt', 'log2', None], + 'polynomial__degree': [1, 2, 3, 4, 5]} elif classifier == "RF": # random forest model = RandomForestClassifier(class_weight='balanced') grid = {'n_estimators': [10,30,100], 'max_depth': [3,5,7], 'criterion': ['gini','entropy'], From 839c57ccc4c1e3f93333f192b8911a742851d53f Mon Sep 17 00:00:00 2001 From: "akem134@elan" Date: Mon, 18 May 2020 11:36:20 +1200 Subject: [PATCH 2/5] Forecasting Dec 2019 with two features and polynomial basis functions --- scripts/forecast_model.py | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/scripts/forecast_model.py b/scripts/forecast_model.py index af61c5d..9b6f247 100644 --- a/scripts/forecast_model.py +++ b/scripts/forecast_model.py @@ -15,33 +15,44 @@ warnings.filterwarnings("ignore", category=FitFailedWarning) -def forecast_dec2019(): +def forecast_dec2019(two_features=False): ''' forecast model for Dec 2019 eruption ''' # constants month = timedelta(days=365.25/12) day = timedelta(days=1) td = TremorData() - - # construct model object - data_streams = ['rsam','mf','hf','dsar'] - fm = ForecastModel(ti='2011-01-01', tf='2020-01-01', window=2., overlap=0.75, - look_forward=2., data_streams=data_streams) - - # columns to manually drop from feature matrix because they are highly correlated to other - # linear regressors - drop_features = ['linear_trend_timewise','agg_linear_trend'] - + # set the available CPUs higher or lower as appropriate n_jobs = 6 + # construct model object + if two_features: + data_streams = ['rsam'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='twoFeatures') + use_only_features = ['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"'] + classifier = "DTPF" + drop_features = [] + else: + data_streams = ['rsam','mf','hf','dsar'] + fm = ForecastModel(ti='2011-01-01', tf='2020-01-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams) + + # columns to manually drop from feature matrix because they are highly correlated to other + # linear regressors + drop_features = ['linear_trend_timewise', 'agg_linear_trend'] + use_only_features = [] + classifier = "DT" + # train the model, excluding 2019 eruption - # note: building the feature matrix may take several hours, but only has to be done once + # note: building the feature matrix may take several hours, but only has to be done once # and will intermittantly save progress in ../features/ # trained scikit-learn models will be saved to ../models/*root*/ te = td.tes[-1] - fm.train(ti='2011-01-01', tf='2020-01-01', drop_features=drop_features, retrain=True, - exclude_dates=[[te-month,te+month],], n_jobs=n_jobs) + fm.train(ti='2011-01-01', tf='2020-01-01', drop_features=drop_features, retrain=True, + exclude_dates=[[te-month,te+month],], n_jobs=n_jobs, + classifier=classifier, use_only_features=use_only_features) # run forecast from 2011 to 2020 # model predictions will be saved to ../predictions/*root*/ @@ -122,7 +133,7 @@ def forecast_now(): save='current_forecast.png', nztimezone=True, n_jobs=n_jobs) if __name__ == "__main__": - #forecast_dec2019() - forecast_test(two_features=True) + forecast_dec2019(two_features=True) + #forecast_test(two_features=True) #forecast_now() \ No newline at end of file From 50c215d094218321c7dfe74dddb3cbf68e6c3722 Mon Sep 17 00:00:00 2001 From: "akem134@elan" Date: Mon, 18 May 2020 11:39:38 +1200 Subject: [PATCH 3/5] Changed abbreviation of classifier --- scripts/forecast_model.py | 4 ++-- whakaari/__init__.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/forecast_model.py b/scripts/forecast_model.py index 9b6f247..29a7a18 100644 --- a/scripts/forecast_model.py +++ b/scripts/forecast_model.py @@ -32,7 +32,7 @@ def forecast_dec2019(two_features=False): fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, look_forward=2., data_streams=data_streams, root='twoFeatures') use_only_features = ['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"'] - classifier = "DTPF" + classifier = 'DTPBF' drop_features = [] else: data_streams = ['rsam','mf','hf','dsar'] @@ -84,7 +84,7 @@ def forecast_test(two_features=False): fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, look_forward=2., data_streams=data_streams, root='testPF') fm.train(ti='2012-04-01', tf='2012-10-01', retrain=True, - n_jobs=n_jobs, classifier="DTPF", + n_jobs=n_jobs, classifier='DTPBF', use_only_features=['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"']) else: data_streams = ['rsam', 'mf', 'hf', 'dsar'] diff --git a/whakaari/__init__.py b/whakaari/__init__.py index fe4fddd..b9e235e 100644 --- a/whakaari/__init__.py +++ b/whakaari/__init__.py @@ -59,7 +59,7 @@ from sklearn.preprocessing import PolynomialFeatures datas = ['rsam','mf','hf','dsar'] -all_classifiers = ["SVM","KNN",'DT', 'DTPF', 'RF','NN','NB','LR'] +all_classifiers = ["SVM","KNN",'DT', 'DTPBF', 'RF','NN','NB','LR'] _MONTH = timedelta(days=365.25/12) _DAY = timedelta(days=1.) @@ -1593,6 +1593,7 @@ def get_classifier(classifier): SVM - Support Vector Machine. KNN - k-Nearest Neighbors DT - Decision Tree + DTPBF - Decision Tree with Polynomial Basis Functions RF - Random Forest NN - Neural Network NB - Naive Bayes @@ -1610,7 +1611,7 @@ def get_classifier(classifier): model = DecisionTreeClassifier(class_weight='balanced') grid = {'max_depth': [3,5,7], 'criterion': ['gini','entropy'], 'max_features': ['auto','sqrt','log2',None]} - elif classifier == 'DTPF': + elif classifier == 'DTPBF': model = Pipeline([('polynomial', PolynomialFeatures()), ('clf', DecisionTreeClassifier()) ]) From 1b947b6ce230fd504ddddab983817fcaa1a227ad Mon Sep 17 00:00:00 2001 From: "akem134@elan" Date: Mon, 18 May 2020 11:41:40 +1200 Subject: [PATCH 4/5] Compute test case by default --- scripts/forecast_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/forecast_model.py b/scripts/forecast_model.py index 29a7a18..e1693a3 100644 --- a/scripts/forecast_model.py +++ b/scripts/forecast_model.py @@ -133,7 +133,7 @@ def forecast_now(): save='current_forecast.png', nztimezone=True, n_jobs=n_jobs) if __name__ == "__main__": - forecast_dec2019(two_features=True) - #forecast_test(two_features=True) + #forecast_dec2019(two_features=True) + forecast_test(two_features=True) #forecast_now() \ No newline at end of file From 890aeed8af329ee56b3668edca897ce4225d4792 Mon Sep 17 00:00:00 2001 From: "akem134@elan" Date: Mon, 18 May 2020 12:00:01 +1200 Subject: [PATCH 5/5] Update doc string --- whakaari/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whakaari/__init__.py b/whakaari/__init__.py index b9e235e..3083c35 100644 --- a/whakaari/__init__.py +++ b/whakaari/__init__.py @@ -971,7 +971,7 @@ def train(self, ti=None, tf=None, Nfts=20, Ncl=100, retrain=False, classifier="D SVM - Support Vector Machine. KNN - k-Nearest Neighbors DT - Decision Tree - DTPF - Decision Tree on Polynomial Features + DTPBF - Decision Tree with Polynomial Basis Functions RF - Random Forest NN - Neural Network NB - Naive Bayes