-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSupervised_Learning.py
More file actions
71 lines (57 loc) · 2.47 KB
/
Supervised_Learning.py
File metadata and controls
71 lines (57 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# --- Dependencies and dataset upload --------------------------------------
# The script expects a CSV with numeric feature columns and a binary
# "Outcome" target column (a diabetes dataset, per the labels used below).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Prompt the user to upload the dataset via the Colab file picker, then
# read the first (and only expected) uploaded file into a DataFrame.
uploaded_file = files.upload()
filename = list(uploaded_file.keys())[0]
data = pd.read_csv(filename)

# Quick numeric summary (count/mean/std/quartiles) of every column.
print(data.describe())
# --- Exploratory visualisation --------------------------------------------
# Histogram of every feature (target excluded) to inspect distributions.
data.drop('Outcome', axis=1).hist(figsize=(8, 8), bins=20)
plt.show()

# Class balance: count of each Outcome value
# (0 = no diabetes, 1 = diabetes — inferred from the labels used below;
# confirm against the dataset's documentation).
outcome_counts = data["Outcome"].value_counts().sort_index()
outcome_counts.plot.bar(color=["red", "blue"])
plt.xlabel("Outcome of diabetes")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

# Mean glucose level per class, with human-readable category labels.
# (The original also sliced per-class glucose Series that were never used;
# those dead locals have been removed.)
glucose_means = data.groupby("Outcome")["Glucose"].mean().rename({0: "No Diabetes", 1: "Diabetes"})
plt.figure(figsize=(5, 4))
plt.bar(glucose_means.index, glucose_means.values, color=["skyblue", "salmon"])
plt.xticks([0, 1])
plt.ylabel("Mean of glucose")
plt.tight_layout()
plt.show()
print("")
print("From the diagram, we observe that for individuals without diabetes, the mean glucose level is significantly lower (almost 30 units) compared to individuals with diabetes.")
print("Therefore, we can conclude that if a patient has high glucose levels, it is likely they have diabetes!")
# --- Train/validation split, scaling, and model fit -----------------------
# Features are every column except the target; labels are the target itself.
x = data.drop('Outcome', axis=1).values
y = data['Outcome'].values
# Hold out 30% of the rows for validation; fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=0)
print("30% is for validation and 70% is for training.")
# Scale each feature into [0, 1]. The scaler is fitted on the training
# split only and re-used on the validation split, avoiding data leakage.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
# Logistic-regression classifier; max_iter raised to 1000 so the solver
# converges on this data, fixed seed for reproducibility.
model = LogisticRegression(random_state=0, max_iter=1000)
model.fit(x_train_scaled, y_train)
# Evaluate on the held-out rows: fraction of correct predictions.
y_pred = model.predict(x_val_scaled)
accuracy = accuracy_score(y_val, y_pred)
print(f"The model's accuracy is: {accuracy:.2f}")
print("Therefore, we observe that the model has a quite high success rate!")
# --- Feature importance from the fitted model -----------------------------
# Absolute logistic-regression coefficients serve as an importance proxy;
# comparing magnitudes is reasonable here because every feature was
# min-max scaled to [0, 1] before fitting.
importances = np.abs(model.coef_[0])
# Derive the feature names the same way x was built (drop the target
# column) instead of assuming "Outcome" is the last column of the CSV.
features = data.drop('Outcome', axis=1).columns
plt.figure(figsize=(8, 6))
plt.barh(range(len(features)), importances, align='center')
plt.yticks(ticks=range(len(features)), labels=features)
plt.xlabel("Importance")
plt.ylabel("Categories")
plt.gca().invert_yaxis()  # list the first feature at the top of the chart
plt.show()

# Report the three largest coefficients, most important first.
top_features = features[np.argsort(importances)[-3:][::-1]]
print("The 3 most important features based on the results are:", list(top_features))
print("")