PredictDev/rf.py at main · alycialeonard/PredictDev · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Filename: rf.py
Author: Alycia Leonard
Date: 2025-12-05
Version: 1.0
Description: rf modelling script for UPV dataset - run a single experiment
License: GNU GPL-3.0
Contact: alycia.leonard@eng.ox.ac.uk
"""

from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shap
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from functions import csv_to_list, plot_confusion_matrix_count, plot_confusion_matrix_norm, plot_precision_recall_f1, plot_roc_auc, plot_precision_recall_curve, plot_feature_importance
import json
from datetime import datetime
import sys

# Suppress tight layout warnings (inevitable due to SHAP code)
warnings.filterwarnings("ignore", message=".*Tight layout not applied.*")

def run_rf_experiment(target, target_short, stems_to_drop, clf, clf_short, param_dist):

    # -------- DEFINE EXPERIMENT PATH & START LOGGING ---------- #
    print(f"Starting experiment: Predicting {target_short} with {clf_short}!")
    cwd = os.getcwd()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_path = os.path.join(cwd, 'results', clf_short, target_short, timestamp)
    os.makedirs(experiment_path, exist_ok=True)
    log_file = os.path.join(experiment_path, "console_output.txt")
    print(f"Console output will log to {log_file}.")
    log_f = open(log_file, "w")
    sys.stdout = log_f
    sys.stderr = log_f

    # -------- DEFINE SAVE PATHS ------------#
    model_save_path = os.path.join(experiment_path, 'models')
    os.makedirs(model_save_path, exist_ok=True)
    plots_save_path = os.path.join(experiment_path, 'plots' )
    os.makedirs(plots_save_path, exist_ok=True)
    metrics_save_path = os.path.join(experiment_path, 'metrics')
    os.makedirs(metrics_save_path, exist_ok=True)

    # ----- PRINT EXPERIMENT SETTINGS TO LOG FILE ----- #
    print("Experiment settings:")
    print(f"- Target: {target}")
    print(f"- Target short name: {target_short}")
    print(f"- Question stems being dropped: {stems_to_drop}\n")

    # ------------ LOAD DATA ------------ #
    print("Experiment execution:\nLoading data...")
    data_path = os.path.join(cwd, 'data')
    df = pd.read_csv(os.path.join(data_path, "Kenya_UPV_Survey_Preprocessed_EncodedCols_2.csv"), low_memory=False)
    print(f"Data loaded from {os.path.join(data_path, "Kenya_UPV_Survey_Preprocessed_EncodedCols_2.csv")}!")
    num_cols = csv_to_list(os.path.join(data_path, "cols", "numeric_cols.csv"), 'numeric_cols')

    # --------- PREPARE DATA ------------- #
    print("Preparing data...")
    # Drop rows where target is missing & set target to y
    df = df.dropna(subset=[target]).copy()
    y = df[target]
    # Set predictors as X
    cols_to_drop = [c for c in df.columns if any(c.startswith(stem) for stem in stems_to_drop)]
    X = df.drop(columns=[target] + cols_to_drop)
    # Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # -------- DEFINE CLASSIFICATION PIPELINE -------- #
    print("Defining classification pipeline & hyperparameter search space...")
    # Define preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), num_cols
            ),
        ],
        remainder='passthrough'  # already processed pass through
    )

    # Define Random Forest classifier pipeline
    clf_pipeline = Pipeline([('preprocessing', preprocessor), ('classifier', clf)])

    # ------- HYPERPARAMETER SEARCH ------- #
    # Define random search to execute: 100 random combinations, 5-fold validation
    random_search = RandomizedSearchCV(
        estimator=clf_pipeline,
        param_distributions=param_dist,
        n_iter=100,
        scoring= 'f1_weighted',
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=42,
        refit=True
    )
    # Run the search
    print("Running randomized hyperparameter search...")
    random_search.fit(X_train, y_train)
    print("- Best parameters:", random_search.best_params_)
    print("- Best CV weighted F1:", random_search.best_score_)
    # Get the best model pipeline from the search
    best_model = random_search.best_estimator_
    # Save the best model
    joblib.dump(best_model, os.path.join(model_save_path, 'rf.pkl'))

    # ------- EVALUATE MODEL -------- #
    # Evaluate the best model
    print("Performance of best model:")
    test_score = best_model.score(X_test, y_test)
    print(f"- Test accuracy with best params: {test_score:.3f}")
    train_score = best_model.score(X_train, y_train)
    print(f"- Train accuracy with best params: {train_score:.3f}")
    y_pred_test = best_model.predict(X_test)
    print("- Classification report:\n")
    print(classification_report(y_test, y_pred_test, zero_division=0))
    print("- Confusion matrix:\n", confusion_matrix(y_test, y_pred_test))

    # ----------- SAVE METRICS -------- #
    print("Saving metrics...")
    # Convert classification report to dataframe & save as CSV
    report_dict = classification_report(y_test, y_pred_test, zero_division=0, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.to_csv(os.path.join(metrics_save_path, 'classification_report.csv'), index=True)

    # Convert confusion matrix to dataframe & save as CSV
    cm = confusion_matrix(y_test, y_pred_test)
    cm_df = pd.DataFrame(cm, index=best_model.classes_, columns=best_model.classes_)
    cm_df.to_csv(os.path.join(metrics_save_path, 'confusion_matrix.csv'))

    # Create summary CSV for experiment tracking
    summary = {
        'target': target,
        'target_short': target_short,
        'train_accuracy': float(train_score),
        'test_accuracy': float(test_score),
        'best_cv_weighted_f1': float(random_search.best_score_),
        'best_params': json.dumps(random_search.best_params_, default=str),
        'n_train': int(X_train.shape[0]),
        'n_test': int(X_test.shape[0]),
        'timestamp': pd.Timestamp.now().isoformat()
    }
    summary_df = pd.DataFrame([summary])
    summary_df.to_csv(os.path.join(metrics_save_path, 'metrics_summary.csv'), index=False)
    print(f"Performance metrics saved to {metrics_save_path}!")

    # -------- PLOT PERFORMANCE -------- #

    print("Plotting performance...")
    plot_confusion_matrix_count(best_model, X_test, y_test, os.path.join(plots_save_path, 'confusion_matrix_counts.png'))
    plot_confusion_matrix_norm(best_model, X_test, y_test, os.path.join(plots_save_path, 'confusion_matrix_normalized.png'))
    plot_precision_recall_f1(best_model, X_test, y_test, os.path.join(plots_save_path, 'precision_recall_f1_per_class.png'))
    plot_roc_auc(best_model, X_test, y_test, os.path.join(plots_save_path, 'roc_curves.png'))
    plot_precision_recall_curve(best_model, X_test, y_test, os.path.join(plots_save_path, 'precision_recall_curves.png'))
    plot_feature_importance(best_model, os.path.join(plots_save_path, "feature_importance.png"))
    print(f"Performance plots saved to {plots_save_path}!")

    # ---------- GET SHAP VALUES ----------- #
    print("Getting SHAP values...")
    # Get preprocessor and classifier from best model
    preprocessor = best_model.named_steps['preprocessing']
    clf = best_model.named_steps['classifier']

    # Transform training data for SHAP
    X_train_transformed = preprocessor.transform(X_train)
    feature_names = preprocessor.get_feature_names_out(X_train.columns)
    X_shap = pd.DataFrame(X_train_transformed, columns=feature_names)
    n_samples = X_shap.shape[0]
    n_features = X_shap.shape[1]
    print(f"X_shap shape: samples={n_samples}, features={n_features}")

    # Use TreeExplainer on the classifier and compute SHAP values
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_shap)
    # shap_for_plot = shap_values[..., 1]  # positive class

    # Hand the fact that SHAP can output about a million different shapes.
    print("Raw SHAP type:", type(shap_values))
    # Case 1: One list per class
    if isinstance(shap_values, list):
        shap_for_plot = shap_values[1]  # positive class
    # Case 2: 3D array: (samples, features, classes)
    elif shap_values.ndim == 3 and shap_values.shape[2] >= 2:
        shap_for_plot = shap_values[:, :, 1]
    # Case 3: 2D array matching (samples, features)
    elif shap_values.ndim == 2 and shap_values.shape[1] > 1:
        shap_for_plot = shap_values
    # Case 4: SHAP returned only one value per sample (shape: (n,1)). Fill it across all features so code doesn't crash
    elif shap_values.ndim == 2 and shap_values.shape[1] == 1:
        shap_for_plot = np.repeat(shap_values, X_shap.shape[1], axis=1)
    else:
        raise ValueError(f"Unexpected SHAP shape: {shap_values.shape}")

    # Get per-sample SHAP values (positive class) as DataFrame, add sample identifiers, save to CSV
    shap_df = pd.DataFrame(shap_for_plot, columns=X_shap.columns, index=X_shap.index)
    shap_df.insert(0, 'sample_index', shap_df.index)
    shap_df.to_csv(os.path.join(metrics_save_path, 'shap_values_per_sample.csv'), index=False)
    print(f"SHAP per sample saved to {os.path.join(metrics_save_path, 'shap_values_per_sample.csv')}!")

    # Get feature-level summary (mean absolute SHAP, std, rank) & save to csv
    mean_abs = np.nanmean(np.abs(shap_for_plot), axis=0)
    std_abs  = np.nanstd(np.abs(shap_for_plot), axis=0)
    feat_summary = pd.DataFrame({'feature': X_shap.columns, 'mean_abs_shap': mean_abs, 'std_abs_shap': std_abs})
    feat_summary = feat_summary.sort_values('mean_abs_shap', ascending=False).reset_index(drop=True)
    feat_summary['rank'] = feat_summary['mean_abs_shap'].rank(ascending=False, method='dense').astype(int)
    feat_summary.to_csv(os.path.join(metrics_save_path, 'shap_feature_summary.csv'), index=False)
    print(f"SHAP summary saved to {os.path.join(metrics_save_path, 'shap_feature_summary.csv')}!")

    print("Plotting SHAP values...")
    # Plot summary dot plot
    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_for_plot, X_shap, max_display=20, show=False)
    plt.title(f"SHAP Summary (positive class)")
    plt.savefig(os.path.join(plots_save_path, "SHAP_summary_dot.png"), bbox_inches='tight')
    plt.close()
    print(f"SHAP summary plot saved to {os.path.join(plots_save_path, "SHAP_summary_dot.png")}!")

    # ------- STOP LOGGING ------- #
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__
    log_f.close()
    print(f"Console log saved to {log_file}")


# --------- RUN THE EXPERIMENT ---------- #
def main():

    # Define target for prediction.
    tar = 'Have electricity and value it'

    # Define short-form of target to use in file saving
    tar_short = 'specific_electricityaccess1value1'

    # Define questions to drop from predictors
    to_drop = ['Which of the following items do you have access to in your daily life?',
               'Which 5 items are most important to you in your daily life? Please indicate these in order of importance, starting with the most important',
               'Given the chosen climate event - which 3 items are most useful to you?',
               'annotation_',
               'What is the main source of electricity in your home?',
               'What is the electricity used for?',
               'Do you use electricity for your business?',
               'What do you use electricity for in your business?',
               'What is the main source of lighting?',
               'Have electricity and do not value it',
               'No electricity access but still values it'
               ]

    # Define classifier
    clf = RandomForestClassifier(random_state=42)
    clf_short = "rf"

    # Define parameters to test for hyperparameters
    param_dist = {
        'classifier__n_estimators': randint(200, 801),
        'classifier__max_depth': [None, 8, 12, 16, 20],
        'classifier__min_samples_leaf': randint(1, 6),
        'classifier__min_samples_split': randint(2, 11),
        'classifier__max_features': ['sqrt', 'log2', 0.2, 0.5],
        'classifier__class_weight': ['balanced', 'balanced_subsample']
    }

    # Run the experiment
    run_rf_experiment(tar, tar_short, to_drop, clf, clf_short, param_dist)


if __name__ == "__main__":
    main()