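"""Preprocess the F0-F7 'L' and 'M' datasets from dataset/.

Feature selection (variance thresholding plus correlation pruning) is learned
on F0L and F0M; the surviving columns are then applied to F1-F7. Each file is
z-score standardized and the results are written to processed/TRAINING.csv and
processed/TESTING.csv as shuffled 50/50 splits.
"""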
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import os
# Create the output directory if it does not already exist
output_directory = 'processed'
os.makedirs(output_directory, exist_ok=True)

# Process F0L and F0M first to learn which features survive selection
f0l_processed = None
f0m_processed = None
for filename_base in ['F0L', 'F0M']:
    input_filename = f'dataset/{filename_base}.csv'
    try:
        df = pd.read_csv(input_filename)
        print(f"\nProcessing {input_filename}")

        # Feature Selection
        # 1. Variance Thresholding: drop near-constant columns
        threshold_variance = 0.1  # Adjust as needed
        selector_variance = VarianceThreshold(threshold=threshold_variance)
        df_high_variance = pd.DataFrame(
            selector_variance.fit_transform(df),
            columns=df.columns[selector_variance.get_support()])
        print(f"Removed {df.shape[1] - df_high_variance.shape[1]} low variance features.")
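        # Note: VarianceThreshold is fit on the full frame, so this assumes
        # every column (including the leading time column) is numeric. If the
        # time column's variance fell below the threshold it would be dropped
        # here, and the positional time-column handling below would shift.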
        # 2. Correlation Analysis: drop one of each pair of highly correlated features
        def remove_highly_correlated(df_corr, threshold_corr):
            corr_matrix = df_corr.corr().abs()
            # Inspect only the upper triangle so each pair is considered once
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            to_drop = [column for column in upper.columns if any(upper[column] > threshold_corr)]
            df_cleaned = df_corr.drop(columns=to_drop)
            return df_cleaned, to_drop

        correlation_threshold = 0.9  # Adjust as needed
        df_selected_features, dropped_corr = remove_highly_correlated(df_high_variance, correlation_threshold)
        print(f"Removed {len(dropped_corr)} highly correlated features: {dropped_corr}")
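        # The pruning is greedy and order-dependent: for each correlated pair,
        # the column that appears later in the frame is the one dropped.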
        # Z-score Standardization
        if not df_selected_features.empty and df_selected_features.shape[1] > 1:
            time_column = df_selected_features.iloc[:, 0]
            data_to_scale = df_selected_features.iloc[:, 1:]
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(data_to_scale)
            scaled_df = pd.DataFrame(scaled_data, columns=data_to_scale.columns)
            scaled_df = pd.concat([time_column.reset_index(drop=True), scaled_df], axis=1)
            if filename_base == 'F0L':
                f0l_processed = scaled_df
            else:
                f0m_processed = scaled_df
            print(f"Z-score scaling complete for {filename_base}.")
        elif df_selected_features.shape[1] <= 1:
            print(f"Warning: Only one or zero columns remaining after feature selection for {filename_base}. Skipping scaling.")
            if filename_base == 'F0L':
                f0l_processed = df_selected_features
            else:
                f0m_processed = df_selected_features
        else:
            print(f"Warning: DataFrame is empty after feature selection for {filename_base}.")
            if filename_base == 'F0L':
                f0l_processed = pd.DataFrame()
            else:
                f0m_processed = pd.DataFrame()
    except FileNotFoundError:
        print(f"Error: Could not find {input_filename}")
    except Exception as e:
        print(f"An error occurred while processing {input_filename}: {e}")
# Process the remaining datasets (F1L-F7L and F1M-F7M)
# using the feature selection learned from F0L and F0M
f0l_selected_columns = f0l_processed.columns.tolist() if f0l_processed is not None else []
f0m_selected_columns = f0m_processed.columns.tolist() if f0m_processed is not None else []
if not f0l_selected_columns or not f0m_selected_columns:
    print("Error: Could not retrieve surviving columns from processed F0L or F0M.")
    raise SystemExit(1)

processed_data = {}
for i in range(1, 8):
    for suffix in ['L', 'M']:
        input_filename = f'dataset/F{i}{suffix}.csv'
        selected_columns = f0l_selected_columns if suffix == 'L' else f0m_selected_columns
        try:
            df = pd.read_csv(input_filename)
            print(f"\nProcessing {input_filename}")
            # Keep only the columns that survived selection on F0L/F0M
            common_columns = [col for col in df.columns if col in selected_columns]
            df_selected = df[common_columns]
            print(f"Kept {len(common_columns)} columns based on F0{suffix}.")
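            # Design note: a fresh StandardScaler is fit on each file, so every
            # file is standardized against its own statistics rather than the
            # mean/std learned from F0L/F0M.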
            # Z-score Standardization
            if not df_selected.empty and df_selected.shape[1] > 1:
                time_column = df_selected.iloc[:, 0]
                data_to_scale = df_selected.iloc[:, 1:]
                scaler = StandardScaler()
                scaled_data = scaler.fit_transform(data_to_scale)
                scaled_df = pd.DataFrame(scaled_data, columns=data_to_scale.columns)
                scaled_df = pd.concat([time_column.reset_index(drop=True), scaled_df], axis=1)
                processed_data[f'F{i}{suffix}'] = scaled_df
                print(f"Z-score scaling complete for F{i}{suffix}.")
            elif df_selected.shape[1] <= 1:
                print(f"Warning: Only one or zero columns remaining for F{i}{suffix}. No scaling.")
                processed_data[f'F{i}{suffix}'] = df_selected
            else:
                print(f"Warning: DataFrame is empty for F{i}{suffix}.")
                processed_data[f'F{i}{suffix}'] = pd.DataFrame()
        except FileNotFoundError:
            print(f"Warning: Could not find {input_filename}")
        except Exception as e:
            print(f"An error occurred while processing {input_filename}: {e}")
training_data = []
testing_data = []
print("Loading data for concatenation.")
for i in range(1, 8):
    for suffix in ['L', 'M']:
        key = f'F{i}{suffix}'
        if key in processed_data and not processed_data[key].empty:
            df = processed_data[key].copy()
            # Sequential 50/50 split: the first half of each file goes to
            # TRAINING, the second half to TESTING
            train_split = int(0.5 * len(df))
            # Add a 'source' column only to the training data
            train_df = df.iloc[:train_split].copy()
            train_df['source'] = key
            training_data.append(train_df)
            # Testing data is kept without the 'source' column
            test_df = df.iloc[train_split:].copy()
            testing_data.append(test_df)
        else:
            print(f"Warning: No data available for {key} for concatenation.")
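# Note: pd.concat raises a ValueError when given an empty list, so the step
# below assumes at least one file yielded usable data.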
# Concatenate all training and testing data
training_df = pd.concat(training_data, ignore_index=True)
testing_df = pd.concat(testing_data, ignore_index=True)
# Shuffle both datasets with a fixed seed for reproducibility and write them out
training_df.sample(frac=1, random_state=69).to_csv('processed/TRAINING.csv', index=False)
testing_df.sample(frac=1, random_state=69).to_csv('processed/TESTING.csv', index=False)
print("Done.")