diff --git a/scripts/forecast_model.py b/scripts/forecast_model.py index 328e4ad..e1693a3 100644 --- a/scripts/forecast_model.py +++ b/scripts/forecast_model.py @@ -15,33 +15,44 @@ warnings.filterwarnings("ignore", category=FitFailedWarning) -def forecast_dec2019(): +def forecast_dec2019(two_features=False): ''' forecast model for Dec 2019 eruption ''' # constants month = timedelta(days=365.25/12) day = timedelta(days=1) td = TremorData() - - # construct model object - data_streams = ['rsam','mf','hf','dsar'] - fm = ForecastModel(ti='2011-01-01', tf='2020-01-01', window=2., overlap=0.75, - look_forward=2., data_streams=data_streams) - - # columns to manually drop from feature matrix because they are highly correlated to other - # linear regressors - drop_features = ['linear_trend_timewise','agg_linear_trend'] - + # set the available CPUs higher or lower as appropriate n_jobs = 6 + # construct model object + if two_features: + data_streams = ['rsam'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='twoFeatures') + use_only_features = ['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"'] + classifier = 'DTPBF' + drop_features = [] + else: + data_streams = ['rsam','mf','hf','dsar'] + fm = ForecastModel(ti='2011-01-01', tf='2020-01-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams) + + # columns to manually drop from feature matrix because they are highly correlated to other + # linear regressors + drop_features = ['linear_trend_timewise', 'agg_linear_trend'] + use_only_features = [] + classifier = "DT" + # train the model, excluding 2019 eruption - # note: building the feature matrix may take several hours, but only has to be done once + # note: building the feature matrix may take several hours, but only has to be done once # and will intermittantly save progress in ../features/ # trained scikit-learn models will be saved to ../models/*root*/ te = td.tes[-1] - fm.train(ti='2011-01-01', tf='2020-01-01', drop_features=drop_features, retrain=True, - exclude_dates=[[te-month,te+month],], n_jobs=n_jobs) + fm.train(ti='2011-01-01', tf='2020-01-01', drop_features=drop_features, retrain=True, + exclude_dates=[[te-month,te+month],], n_jobs=n_jobs, + classifier=classifier, use_only_features=use_only_features) # run forecast from 2011 to 2020 # model predictions will be saved to ../predictions/*root*/ @@ -58,24 +69,31 @@ def forecast_dec2019(): fm.hires_forecast(ti=te-fm.dtw-fm.dtf, tf=te+month/30, recalculate=True, save=r'{:s}/forecast_hires.png'.format(fm.plotdir), n_jobs=n_jobs) -def forecast_test(): +def forecast_test(two_features=False): ''' test scale forecast model ''' # constants month = timedelta(days=365.25/12) - - # set up model - data_streams = ['rsam','mf','hf','dsar'] - fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, - look_forward=2., data_streams=data_streams, root='test') - + # set the available CPUs higher or lower as appropriate n_jobs = 6 - - # train the model - drop_features = ['linear_trend_timewise','agg_linear_trend'] - fm.train(ti='2012-04-01', tf='2012-10-01', drop_features=drop_features, retrain=True, - n_jobs=n_jobs) + + # set up model + if two_features: + data_streams = ['rsam'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='testPF') + fm.train(ti='2012-04-01', tf='2012-10-01', retrain=True, + n_jobs=n_jobs, classifier='DTPBF', + use_only_features=['rsam__maximum', 'rsam__fft_coefficient__coeff_12__attr_"abs"']) + else: + data_streams = ['rsam', 'mf', 'hf', 'dsar'] + fm = ForecastModel(ti='2012-04-01', tf='2012-10-01', window=2., overlap=0.75, + look_forward=2., data_streams=data_streams, root='test') + # train the model + drop_features = ['linear_trend_timewise', 'agg_linear_trend'] + fm.train(ti='2012-04-01', tf='2012-10-01', drop_features=drop_features, retrain=True, + n_jobs=n_jobs) # plot a forecast for a future eruption te = fm.data.tes[1] @@ -115,7 +133,7 @@ def forecast_now(): save='current_forecast.png', nztimezone=True, n_jobs=n_jobs) if __name__ == "__main__": - #forecast_dec2019() - forecast_test() + #forecast_dec2019(two_features=True) + forecast_test(two_features=True) #forecast_now() \ No newline at end of file diff --git a/whakaari/__init__.py b/whakaari/__init__.py index 80df64f..3083c35 100644 --- a/whakaari/__init__.py +++ b/whakaari/__init__.py @@ -55,9 +55,11 @@ from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import PolynomialFeatures datas = ['rsam','mf','hf','dsar'] -all_classifiers = ["SVM","KNN",'DT','RF','NN','NB','LR'] +all_classifiers = ["SVM","KNN",'DT', 'DTPBF', 'RF','NN','NB','LR'] _MONTH = timedelta(days=365.25/12) _DAY = timedelta(days=1.) @@ -969,6 +971,7 @@ def train(self, ti=None, tf=None, Nfts=20, Ncl=100, retrain=False, classifier="D SVM - Support Vector Machine. KNN - k-Nearest Neighbors DT - Decision Tree + DTPBF - Decision Tree with Polynomial Basis Functions RF - Random Forest NN - Neural Network NB - Naive Bayes @@ -1590,6 +1593,7 @@ def get_classifier(classifier): SVM - Support Vector Machine. KNN - k-Nearest Neighbors DT - Decision Tree + DTPBF - Decision Tree with Polynomial Basis Functions RF - Random Forest NN - Neural Network NB - Naive Bayes @@ -1607,6 +1611,13 @@ def get_classifier(classifier): model = DecisionTreeClassifier(class_weight='balanced') grid = {'max_depth': [3,5,7], 'criterion': ['gini','entropy'], 'max_features': ['auto','sqrt','log2',None]} + elif classifier == 'DTPBF': + model = Pipeline([('polynomial', PolynomialFeatures()), + ('clf', DecisionTreeClassifier()) + ]) + grid = {'clf__max_depth': [3, 5, 7], 'clf__criterion': ['gini', 'entropy'], + 'clf__max_features': ['auto', 'sqrt', 'log2', None], + 'polynomial__degree': [1, 2, 3, 4, 5]} elif classifier == "RF": # random forest model = RandomForestClassifier(class_weight='balanced') grid = {'n_estimators': [10,30,100], 'max_depth': [3,5,7], 'criterion': ['gini','entropy'],