-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpostprocessing.py
More file actions
137 lines (118 loc) · 5.66 KB
/
postprocessing.py
File metadata and controls
137 lines (118 loc) · 5.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
def RMSE(actual_y, predict_y):
    """
    Return the root-mean-squared error between actual and predicted values.

    Implemented with NumPy rather than
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in later releases, so the old
    call raises ``TypeError`` on current installations.

    Parameters:
        actual_y: array-like of shape (n_samples,) or (n_samples, n_outputs).
        predict_y: array-like with the same shape as ``actual_y``.

    Returns:
        float: the RMSE. For 2-D inputs the RMSE is computed per output
        column and the columns are averaged uniformly.

    Raises:
        ValueError: if the shapes differ, the inputs are empty, or they
            contain NaN/inf values.
    """
    actual = np.asarray(actual_y, dtype=float)
    predicted = np.asarray(predict_y, dtype=float)

    # Treat 1-D input as a single output column so that (n,) and (n, 1)
    # inputs are compared element-wise instead of being broadcast to (n, n).
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)

    if actual.shape != predicted.shape:
        raise ValueError(
            'actual_y and predict_y must have the same shape, got '
            '{} and {}'.format(actual.shape, predicted.shape)
        )
    if actual.size == 0:
        raise ValueError('RMSE requires at least one sample')
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError('Input contains NaN or infinity')

    per_output_rmse = np.sqrt(np.mean((actual - predicted) ** 2, axis=0))
    return float(np.mean(per_output_rmse))
def MAPE(actual_y, predict_y):
    """
    Return the mean absolute percentage error of the predictions.

    Thin wrapper that forwards both arguments, in the same order, to
    scikit-learn's ``mean_absolute_percentage_error`` and returns its result
    unchanged.
    """
    return mean_absolute_percentage_error(actual_y, predict_y)
def plotTrainTest(actual_y, predict_y, train_percentage, model_name):
    """
    Plot actual and predicted case numbers over the entire timeframe
    (train and test).

    Parameters:
        actual_y: sliceable sequence of actual daily case counts covering
            both the train and the test period.
        predict_y: predicted case counts for the test period only; it is
            plotted against the test days, so it presumably must have
            ``len(actual_y) - split_idx`` entries (confirm against callers).
        train_percentage: fraction of ``actual_y`` that belongs to the
            training period; the split index is
            ``round(len(actual_y) * train_percentage)``.
        model_name: text prepended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    total_days = len(actual_y)
    split_idx = round(total_days * train_percentage)

    # Day numbers are 1-based and consecutive: the train period covers days
    # 1..split_idx and the test period continues at split_idx + 1. Using
    # linspace(split_idx, total_days, num=total_days - split_idx) here would
    # repeat the last train day and space the test days unevenly.
    train_days = np.arange(1, split_idx + 1)
    test_days = np.arange(split_idx + 1, total_days + 1)

    plt.plot(train_days, actual_y[:split_idx], label='Actual (Train)')
    plt.plot(test_days, actual_y[split_idx:], label='Actual (Test)')
    plt.plot(test_days, predict_y, label='Predicted (Test)')

    plt.xlabel('Day Number')
    plt.ylabel('Daily Case Count')
    plt.title(model_name + ' Case Prediction')
    plt.legend()
    plt.show()
# RF plotting stuff - moved over from the other file ######
def plotRF(features, feature_list, labels, predictions):
    """
    Show two figures for the random-forest model.

    Figure 1 plots the actual labels as a blue line and the predictions as
    red dots. Figure 2 plots the actual labels together with the
    'covid symptoms', 'coronavirus' and 'covid' feature columns.

    Parameters:
        features: indexed as ``features[:, column]``, so presumably a 2-D
            NumPy array with one row per label (confirm against callers).
        feature_list: list of column names; ``feature_list.index(name)``
            gives the column of ``features`` that holds ``name``.
        labels: actual values, one per day.
        predictions: predicted values, one per day.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    # Integer day indices stand in for real dates.
    actual_frame = pd.DataFrame({'date': range(len(labels)), 'actual': labels})
    predicted_frame = pd.DataFrame(
        {'date': range(len(predictions)), 'prediction': predictions}
    )

    # Figure 1: actual values versus predictions.
    plt.plot(actual_frame['date'], actual_frame['actual'], 'b-', label='actual')
    plt.plot(
        predicted_frame['date'],
        predicted_frame['prediction'],
        'ro',
        label='prediction',
    )
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Covid Cases')
    plt.title('Actual and Predicted Values')
    plt.show()

    # Copy the search-query feature columns next to the actual values.
    for term in ('covid symptoms', 'coronavirus', 'covid'):
        actual_frame[term] = features[:, feature_list.index(term)]

    # Figure 2: (column, line format, opacity) for every series, in draw order.
    series_styles = (
        ('actual', 'b-', 1.0),
        ('covid symptoms', 'y-', 1.0),
        ('coronavirus', 'k-', 0.8),
        ('covid', 'r-', 0.3),
    )
    for column, line_format, opacity in series_styles:
        plt.plot(
            actual_frame['date'],
            actual_frame[column],
            line_format,
            label=column,
            alpha=opacity,
        )
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Cases')
    plt.title('Actual Cases and Search Queries')
    plt.show()
def plotRFTrees(rf, feature_list, train_features, train_labels):
    """
    Export two decision trees as Graphviz ``.dot`` files and PNG images.

    Writes ``tree.dot``/``tree.png`` for one tree of the fitted forest ``rf``
    and ``small_tree.dot``/``small_tree.png`` for one tree of a new, shallow
    forest (``max_depth=3``) fitted on the given training data. Also prints
    the depth of the tree taken from ``rf``.

    Parameters:
        rf: fitted forest exposing ``estimators_``; the tree at index 5 is
            used, so it needs at least six estimators.
        feature_list: feature names passed to ``export_graphviz``.
        train_features: training features used to fit the shallow forest.
        train_labels: training labels used to fit the shallow forest.

    Returns:
        None. Output files are written to the current working directory.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import export_graphviz
    import pydot
    import os

    # Raw string: in a normal literal the "\b" of "\bin" is a backspace
    # character, which silently corrupts the directory name.
    graphviz_bin = r'C:\Program Files\Graphviz\bin'
    current_path = os.environ.get("PATH", "")
    # Append only once so repeated calls do not keep growing PATH.
    if graphviz_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = os.pathsep.join(
            part for part in (current_path, graphviz_bin) if part
        )

    # Visualization of decision tree
    # Pull out one tree from the forest
    tree = rf.estimators_[5]
    # Export the image to a dot file
    export_graphviz(tree, out_file='tree.dot', feature_names=feature_list, rounded=True, precision=1)
    # Use dot file to create a graph
    (graph, ) = pydot.graph_from_dot_file('tree.dot')
    # Write graph to a png file
    graph.write_png('tree.png')
    print('The depth of this tree is:', tree.tree_.max_depth)

    # Fit a small forest whose trees are limited to a depth of 3
    rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
    rf_small.fit(train_features, train_labels)
    # Extract the small tree
    tree_small = rf_small.estimators_[5]
    # Save the tree as a png image
    export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_list, rounded=True, precision=1)
    (graph, ) = pydot.graph_from_dot_file('small_tree.dot')
    graph.write_png('small_tree.png')
def getRFImportances(rf, feature_list, bool_plotImportances):
    """
    Print the feature importances of a fitted forest and optionally plot them.

    Parameters:
        rf: fitted model exposing ``feature_importances_``.
        feature_list: feature names, in the same order as
            ``rf.feature_importances_``.
        bool_plotImportances: when truthy, also show a bar chart of the
            unrounded importances in ``feature_list`` order. This switches
            the global matplotlib style to 'fivethirtyeight'.

    Returns:
        list of ``(feature, importance)`` tuples, with the importance
        rounded to two decimals, sorted from most to least important.
        Features with equal rounded importance keep their original order.
    """
    # Get numerical feature importances
    importances = list(rf.feature_importances_)

    # Pair every feature with its rounded importance, most important first
    feature_importances = sorted(
        (
            (feature, round(importance, 2))
            for feature, importance in zip(feature_list, importances)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Print out the feature and importances
    for pair in feature_importances:
        print('Variable: {:20} Importance: {}'.format(*pair))

    if bool_plotImportances:
        # Set the style
        plt.style.use('fivethirtyeight')
        # List of x locations for plotting
        x_values = list(range(len(importances)))
        # Make a bar chart
        plt.bar(x_values, importances, orientation='vertical')
        # Tick labels for x axis
        plt.xticks(x_values, feature_list, rotation='vertical')
        # Axis labels and title
        plt.ylabel('Importance')
        plt.xlabel('Variable')
        plt.title('Variable Importances')
        plt.show()

    return feature_importances
def showAccuracyInfo(predictions, test_labels):
    """
    Print the mean absolute error and the accuracy of the predictions.

    Accuracy is ``100 - MAPE``, where MAPE is the mean of
    ``100 * |prediction - label| / label``. Labels equal to zero are left
    out of the MAPE, because dividing by them would turn the whole accuracy
    into inf/nan; they still count towards the mean absolute error. If every
    label is zero the accuracy is reported as ``nan``.

    Parameters:
        predictions: array-like of predicted values (lists are accepted).
        test_labels: array-like of actual values, broadcastable against
            ``predictions``.

    Returns:
        None. Both figures are printed, rounded to two decimals.
    """
    # Convert up front so plain Python lists work and both arrays share a shape.
    predicted, actual = np.broadcast_arrays(
        np.asarray(predictions, dtype=float),
        np.asarray(test_labels, dtype=float),
    )

    # Calculate the absolute errors
    errors = np.abs(predicted - actual)
    # Print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'Cases.')

    # Calculate mean absolute percentage error (MAPE) over non-zero labels
    nonzero = actual != 0
    if nonzero.any():
        mape = 100 * (errors[nonzero] / actual[nonzero])
        accuracy = 100 - np.mean(mape)
    else:
        accuracy = float('nan')

    # Calculate and display accuracy
    print('Accuracy:', round(accuracy, 2), '%.')