-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFault_detector.py
More file actions
112 lines (93 loc) · 4.25 KB
/
Fault_detector.py
File metadata and controls
112 lines (93 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib # Critical for saving the model for hybrid deployment
# --- Configuration ---
FILE_PATH = 'pv_fault_dataset.csv'        # CSV produced by the MATLAB simulator (Phase 1)
MODEL_FILENAME = 'pv_fault_model.joblib'  # trained classifier artifact
SCALER_FILENAME = 'pv_scaler.joblib'      # fitted StandardScaler artifact
RANDOM_STATE = 42                         # fixed seed for reproducible split/training

# --- 1. Load Data ---
try:
    # Load the CSV generated by the MATLAB script (Phase 1)
    df = pd.read_csv(FILE_PATH)
    print(f"Successfully loaded data from {FILE_PATH}. Shape: {df.shape}")
except FileNotFoundError:
    print(f"ERROR: The file '{FILE_PATH}' was not found.")
    print("Please ensure the MATLAB script 'pv_data_simulator.m' has been run to generate the dataset.")
    # raise SystemExit(1) instead of exit(): exit() is added by the `site`
    # module and may not exist (e.g. under `python -S` or when frozen), and
    # the original exit() returned status 0 even though this is an error path.
    raise SystemExit(1)
# --- 2. Separate Features (X) and Target (y) ---
# Every column except the last holds one of the 50 current measurements
# sampled along the I-V curve; the final 'LABEL' column is the class.
features = df.iloc[:, :-1]
# Fault classification label (0 = Normal, 1 = Fault).
target = df['LABEL']

# --- 3. Data Preprocessing (Scaling) ---
# Standardize to zero mean / unit variance so each of the 50 I-V curve
# points contributes on an equal scale.
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
print("\nData scaled successfully using StandardScaler.")

# --- 4. Split Data into Training and Testing Sets ---
# Stratify on the label so train and test keep the same fault ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features_std,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=target,
)
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
# --- 5. Choose and Train the Model (Logistic Regression) ---
# max_iter is raised to 1000 so the solver has room to converge on the
# 50-feature input.
model = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
print("\nTraining the Logistic Regression model...")
model.fit(X_train, y_train)
print("Training complete.")

# --- 6. Model Persistence (Phase 3 Conclusion: Saving for Real-Time Use) ---
# Persist both the classifier and the scaler so 'realtime_monitor.py' can
# classify incoming samples without retraining every time.
for artifact, destination in ((model, MODEL_FILENAME), (scaler, SCALER_FILENAME)):
    joblib.dump(artifact, destination)
print(f"\nModel and Scaler saved successfully to {MODEL_FILENAME} and {SCALER_FILENAME}.")
# --- Stress Test: Adding Artificial Noise ---
# We add random "jitter" to the test data to simulate real-world sensor noise.
# 0.5 is a significant amount of noise for standardized (unit-variance) data.
noise_factor = 0.5
# Seed the noise generator with RANDOM_STATE: the original unseeded
# np.random.normal made the stress-test accuracy vary run-to-run, defeating
# the reproducibility the rest of the script sets up via RANDOM_STATE.
rng = np.random.default_rng(RANDOM_STATE)
noise = rng.normal(0, noise_factor, X_test.shape)
X_test_noisy = X_test + noise

print(f"\n--- STRESS TEST (Noise Factor: {noise_factor}) ---")
y_pred_noisy = model.predict(X_test_noisy)
accuracy_noisy = accuracy_score(y_test, y_pred_noisy)
print(f"Accuracy on NOISY Data: {accuracy_noisy * 100:.2f}%")
if accuracy_noisy == 1.0:
    print("Result: The model is extremely robust (or the fault pattern is very obvious).")
elif accuracy_noisy > 0.90:
    print("Result: The model is robust against sensor noise.")
else:
    print("Result: The model struggles with noise.")
# --- 7. Model Evaluation ---
# Baseline performance on the clean (noise-free) held-out test set.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Normal (0)', 'Fault (1)'])
print(f"\n--- Model Performance ---")
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")
print("\nClassification Report (0=Normal, 1=Fault):")
print(report)
# --- 8. Visualization of a Test Prediction ---
# Pull one faulty sample from the test set and plot its scaled feature
# vector to visually confirm the input pattern.
try:
    # Positional index of the first sample labelled Fault (1) in the test set.
    first_fault_pos = np.where(y_test == 1)[0][0]
    realtime_sample = X_test[first_fault_pos, :].reshape(1, -1)

    plt.figure(figsize=(8, 5))
    # Plot the 50 current feature values for this one sample.
    feature_positions = range(X_test.shape[1])
    plt.plot(
        feature_positions,
        realtime_sample[0],
        'r',
        linewidth=2,
        label='Tested I-V Curve (Scaled)',
    )
    plt.title('Visualization of Faulty I-V Curve Pattern (Feature Vector)')
    plt.xlabel('Feature Index (I_V1 to I_V50)')
    plt.ylabel('Scaled Current Value')
    plt.grid(True)
    plt.legend()
    plt.show()
except IndexError:
    # np.where found no label == 1 rows, so [0][0] raised.
    print("\nCould not find a fault sample in the test set for plotting.")
except Exception as e:
    # Broad catch is acceptable at this script-level boundary: plotting is
    # best-effort and must not abort the run.
    print(f"\nError during plotting: {e}")