-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSupervised_Learning.py
More file actions
71 lines (57 loc) · 2.47 KB
/
Supervised_Learning.py
File metadata and controls
71 lines (57 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# --- Dependencies and dataset upload --------------------------------------
# The script expects a CSV with numeric feature columns and a binary
# "Outcome" target column (a diabetes dataset, per the labels used below).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Prompt the user to upload the dataset via the Colab file picker, then
# read the first (and only expected) uploaded file into a DataFrame.
uploaded_file = files.upload()
filename = list(uploaded_file.keys())[0]
data = pd.read_csv(filename)

# Quick numeric summary (count/mean/std/quartiles) of every column.
print(data.describe())
# --- Exploratory visualisation --------------------------------------------
# Histogram of every feature (target excluded) to inspect distributions.
data.drop('Outcome', axis=1).hist(figsize=(8, 8), bins=20)
plt.show()

# Class balance: count of each Outcome value
# (0 = no diabetes, 1 = diabetes — inferred from the labels used below;
# confirm against the dataset's documentation).
outcome_counts = data["Outcome"].value_counts().sort_index()
outcome_counts.plot.bar(color=["red", "blue"])
plt.xlabel("Outcome of diabetes")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

# Mean glucose level per class, with human-readable category labels.
# (The original also sliced per-class glucose Series that were never used;
# those dead locals have been removed.)
glucose_means = data.groupby("Outcome")["Glucose"].mean().rename({0: "No Diabetes", 1: "Diabetes"})
plt.figure(figsize=(5, 4))
plt.bar(glucose_means.index, glucose_means.values, color=["skyblue", "salmon"])
plt.xticks([0, 1])
plt.ylabel("Mean of glucose")
plt.tight_layout()
plt.show()
print("")
print("From the diagram, we observe that for individuals without diabetes, the mean glucose level is significantly lower (almost 30 units) compared to individuals with diabetes.")
print("Therefore, we can conclude that if a patient has high glucose levels, it is likely they have diabetes!")
# --- Train/validation split, scaling, and model fit -----------------------
# Features are every column except the target; labels are the target itself.
x = data.drop('Outcome', axis=1).values
y = data['Outcome'].values
# Hold out 30% of the rows for validation; fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=0)
print("30% is for validation and 70% is for training.")
# Scale each feature into [0, 1]. The scaler is fitted on the training
# split only and re-used on the validation split, avoiding data leakage.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
# Logistic-regression classifier; max_iter raised to 1000 so the solver
# converges on this data, fixed seed for reproducibility.
model = LogisticRegression(random_state=0, max_iter=1000)
model.fit(x_train_scaled, y_train)
# Evaluate on the held-out rows: fraction of correct predictions.
y_pred = model.predict(x_val_scaled)
accuracy = accuracy_score(y_val, y_pred)
print(f"The model's accuracy is: {accuracy:.2f}")
print("Therefore, we observe that the model has a quite high success rate!")
# --- Feature importance from the fitted model -----------------------------
# Absolute logistic-regression coefficients serve as an importance proxy;
# comparing magnitudes is reasonable here because every feature was
# min-max scaled to [0, 1] before fitting.
importances = np.abs(model.coef_[0])
# Derive the feature names the same way x was built (drop the target
# column) instead of assuming "Outcome" is the last column of the CSV.
features = data.drop('Outcome', axis=1).columns
plt.figure(figsize=(8, 6))
plt.barh(range(len(features)), importances, align='center')
plt.yticks(ticks=range(len(features)), labels=features)
plt.xlabel("Importance")
plt.ylabel("Categories")
plt.gca().invert_yaxis()  # list the first feature at the top of the chart
plt.show()

# Report the three largest coefficients, most important first.
top_features = features[np.argsort(importances)[-3:][::-1]]
print("The 3 most important features based on the results are:", list(top_features))
print("")