-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUnsupervised_Learning.py
More file actions
72 lines (56 loc) · 2.64 KB
/
Unsupervised_Learning.py
File metadata and controls
72 lines (56 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
# Ask for a file through google.colab's upload helper and load the first
# uploaded entry as a CSV.
uploaded_file = files.upload()
filename = list(uploaded_file)[0]  # first key of the mapping returned by upload()
data = pd.read_csv(filename)

# Descriptive columns are kept aside in `new_data`; they are not clustered on.
wanted_columns = [
    "track_id",
    "artists",
    "album_name",
    "track_name",
    "track_genre",
]
new_data = data.loc[:, wanted_columns].copy()

# Every remaining column is treated as a feature.
# NOTE(review): this assumes all remaining columns are numeric and meaningful
# as features (e.g. no leftover index column from the CSV) - confirm.
features_data = data.drop(wanted_columns, axis=1)
# Min-max scale the feature columns (scaler left at its default settings) and
# wrap the result in a DataFrame that keeps the original column names.
scaler = MinMaxScaler()
scaler.fit(features_data)
scaled_data = scaler.transform(features_data)
scaled = pd.DataFrame(data=scaled_data, columns=features_data.columns)

# Fit KMeans with 9 clusters (fixed seed, 10 initialisations) on the scaled
# features and store each row's cluster label next to its metadata.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=0)
new_data["cluster"] = kmeans.fit_predict(scaled)
# Group the genre labels by cluster once and derive both summaries from it.
genres_by_cluster = new_data.groupby("cluster")["track_genre"]

# Share (in percent) of the most frequent genre inside each cluster.
genre_percentages = genres_by_cluster.apply(
    lambda genres: genres.value_counts(normalize=True).iloc[0] * 100
)
# Name of the most frequent genre inside each cluster.
dominant_genres = genres_by_cluster.agg(
    lambda genres: genres.value_counts().idxmax()
)

print("Dominant genre per cluster:", "", dominant_genres, sep="\n")

# Bar chart: one bar per cluster, height = share of its dominant genre.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(genre_percentages.index, genre_percentages.values)
ax.set_xlabel("Cluster")
ax.set_ylabel("Percentage of dominant genre")
ax.set_xticks(genre_percentages.index)  # one tick per cluster label
plt.show()
# Look up one track by name and list the tracks from the same cluster whose
# scaled features are closest to it.
my_track = "Shape Of You"

# Exact, case-sensitive comparison on the track name; more than one row may
# match, in which case the first matching row is used below.
track_row = new_data[new_data["track_name"] == my_track]

if track_row.empty:
    print("The song was not found. Make sure the name is correct.")
else:
    print(f"The song '{track_row['track_name'].values[0]}' belongs to the artist {track_row['artists'].values[0]} and the cluster is {track_row['cluster'].values[0]}.")

    index = track_row.index[0]
    cluster = track_row["cluster"].values[0]
    display_columns = ["track_name", "artists", "track_genre", "cluster"]

    # Feature vector of the selected track as a single-row 2-D array, the
    # shape pairwise_distances expects. `scaled` is looked up by the same
    # label as `new_data`, so both frames must share their row index.
    features = scaled.loc[index].values.reshape(1, -1)

    # Candidates: every other row assigned to the same cluster. Only the
    # selected row itself is excluded; other rows carrying the same track
    # name remain candidates.
    same_cluster_indices = new_data[new_data["cluster"] == cluster].index
    same_cluster_indices = same_cluster_indices[same_cluster_indices != index]

    # Distances (default metric of pairwise_distances) from the selected
    # track to each candidate, then candidates ordered nearest first. The
    # ordering is computed once and sliced for both listings.
    distances = pairwise_distances(features, scaled.loc[same_cluster_indices])[0]
    nearest_first = same_cluster_indices[np.argsort(distances)]

    closest_indices = nearest_first[:10]
    recommended_songs = new_data.loc[closest_indices, display_columns]
    print("Top-10 similar tracks:")
    print("")
    # `display` is not imported in this file; presumably it is supplied by
    # the IPython/Colab runtime - confirm before running elsewhere.
    display(recommended_songs)
    print("")

    print("The 3 closest are:")
    recommended = new_data.loc[closest_indices[:3], display_columns]
    display(recommended)
    print("")
    # NOTE(review): this sentence is printed unconditionally; the code does
    # not check that the three rows actually match the selected song.
    print("The 3 closest songs appearing in the result match the song I had selected!")