ironhack-labs · jonathansada · May 23, 2025 · May 23, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+dataset/*
+output/*
diff --git a/Music Popularity Analysis.pdf b/Music Popularity Analysis.pdf
diff --git a/data_analysis_helpers.py b/data_analysis_helpers.py
@@ -0,0 +1,88 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+# Expected to be used with the parameter na_values of the method pd.read_csv
+def na_values(exclude=None):
+    """Return the strings that should be read as missing values by ``pd.read_csv``.
+
+    Args:
+        exclude: Optional iterable of strings to drop from the default list,
+            e.g. ``["None"]`` to keep the literal text "None" as data.
+
+    Returns:
+        A freshly built list on every call, so callers may modify it freely.
+
+    Raises:
+        ValueError: If an excluded value is not one of the defaults.
+    """
+    # Default values defined in doc: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
+    defaults = [" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"]
+    # None (instead of a shared [] default) keeps the signature free of mutable state.
+    if exclude is not None:
+        for value in exclude:
+            defaults.remove(value)
+    return defaults
+
+def cleanColNames(df, ncol=None):
+    """Normalize the column names of ``df`` in place and return the same frame.
+
+    Args:
+        df: DataFrame whose ``columns`` attribute is reassigned.
+        ncol: Optional mapping ``{original_name: new_name}``. Columns found in
+            it are renamed to the mapped value; every other string column is
+            lower-cased and has its spaces replaced by underscores.
+
+    Returns:
+        The same ``df`` object, with its columns renamed.
+    """
+    # None (instead of a shared {} default) avoids a mutable default argument.
+    overrides = {} if ncol is None else ncol
+
+    def _normalize(column):
+        if column in overrides:
+            return overrides[column]
+        # Non-string labels (e.g. integers) have no .lower(); leave them untouched.
+        if not isinstance(column, str):
+            return column
+        return column.lower().replace(" ", "_")
+
+    df.columns = [_normalize(column) for column in df.columns]
+    return df
+
+def getNullValues(df, onlyEmpty=False):
+    """Count the missing values of each column of ``df``.
+
+    Args:
+        df: DataFrame to inspect.
+        onlyEmpty: When truthy, keep only the columns that have at least one
+            missing value.
+
+    Returns:
+        A Series indexed by column name holding the number of missing values.
+    """
+    null_counts = df.isna().sum()
+    if not onlyEmpty:
+        return null_counts
+    has_nulls = null_counts > 0
+    return null_counts[has_nulls]
+
+def cleanNullRows(df):
+    """Return a copy of ``df`` without the rows in which every value is missing."""
+    without_empty_rows = df.dropna(how="all", axis=0)
+    return without_empty_rows
+
+def getDuplicated(df):
+    """Return how many rows of ``df`` repeat an earlier row."""
+    duplicate_mask = df.duplicated()
+    return duplicate_mask.sum()
+
+def cleanDuplicated(df, keep="first", drop_index=False):
+    """Remove duplicated rows from ``df`` in place and renumber its index.
+
+    Args:
+        df: DataFrame to modify in place.
+        keep: Forwarded to ``DataFrame.drop_duplicates`` to choose which
+            occurrence of a duplicated row survives.
+        drop_index: When False (the default, kept for backward compatibility)
+            the previous index is inserted into the frame as a new column by
+            ``reset_index``. Pass True to discard the old index so that no
+            extra column is added to the data.
+
+    Returns:
+        The same ``df`` object, deduplicated and with a fresh index.
+    """
+    df.drop_duplicates(keep=keep, inplace=True)
+    df.reset_index(drop=drop_index, inplace=True)
+    return df
+
+def getFrequencyTable(df, column):
+    """Build a table with the absolute and relative frequency of each value.
+
+    Args:
+        df: DataFrame holding the data.
+        column: Name of the column to summarize.
+
+    Returns:
+        A DataFrame indexed by the distinct values of ``column`` with the
+        occurrence counts first and the proportions (rounded to 2 decimals)
+        second.
+    """
+    values = df[column]
+    counts = values.value_counts()
+    proportions = values.value_counts(normalize=True).round(2)
+    return pd.concat([counts, proportions], axis=1)
+
+# TODO: Call show in the notebook, function only calculate and return values
+def analyze_cat_values(df, column, chart_size=False, ax_labels = False, ay_labels = False):
+    """Print the frequency table of a categorical column and plot it.
+
+    Columns with at most two distinct values are drawn as a pie chart of
+    proportions; any other column is drawn as a bar chart of counts.
+
+    Args:
+        df: DataFrame holding the data.
+        column: Name of the categorical column to analyze.
+        chart_size: Figure size tuple; a falsy value selects (6, 6) for the
+            pie chart and (6, 3) for the bar chart.
+        ax_labels: Optional labels for the pie wedges or the bar chart x ticks.
+        ay_labels: Optional labels for the bar chart y ticks (unused by the
+            pie chart).
+    """
+    print(f"Proportion table for {column}: ")
+    frequency_table = getFrequencyTable(df, column)
+    # display is not imported by this module; it is normally injected by
+    # IPython/Jupyter. Fall back to print so the helper also works in scripts.
+    try:
+        show_table = display
+    except NameError:
+        show_table = print
+    show_table(frequency_table)
+
+    if len(df[column].unique()) <= 2: # If it is binary draw a pie plot
+        chart_size = chart_size if chart_size else (6,6)
+        plt.figure(figsize=chart_size)
+        labels = ax_labels if ax_labels else frequency_table.index
+        plt.pie(frequency_table["proportion"], labels=labels, autopct='%1.1f%%')
+        plt.title(f'Proportion of values for the feature {column}')
+    else: # else draw a barchart
+        chart_size = chart_size if chart_size else (6,3)
+        plt.figure(figsize=chart_size)
+        ax = sns.barplot(y=frequency_table["count"], x=frequency_table.index, data=frequency_table, legend=False)
+        plt.title(f'Proportion of values for the feature {column}')
+        if ax_labels:
+            ax.set_xticklabels(ax_labels, rotation=0)
+        if ay_labels:
+            # The y labels belong on the y axis; they previously overwrote the x tick labels.
+            ax.set_yticklabels(ay_labels, rotation=0)
+
+    plt.show()
+
+# TODO: Call print and show in the notebook, function only calculate and return values
+def analyze_num_values(df, column, ax_labels = False, ay_labels = False):
+    """Print descriptive statistics of a numeric column and plot its histogram.
+
+    Args:
+        df: DataFrame holding the data.
+        column: Name of the numeric column to analyze.
+        ax_labels: Optional tick labels for the histogram x axis.
+        ay_labels: Optional tick labels for the histogram y axis.
+    """
+    series = df[column]
+
+    print(f"Statistic values for {column}:")
+    print("Count:\t\t", series.count())
+    print("AVG:\t\t", series.mean())
+    print("Min:\t\t", series.min())
+    print("Quantile 25:\t", series.quantile(0.25))
+    print("Quantile 50:\t", series.quantile(0.5))
+    print("Quantile 75:\t", series.quantile(0.75))
+    print("Max:\t\t", series.max())
+    print("Mode:\t\t", list(series.mode()))
+    print("Variance:\t", series.var())
+    print("STD:\t\t", series.std())
+    print("Skewness:\t", series.skew())
+    print("Kurtosis:\t", series.kurt())
+
+    plt.figure(figsize=(10,5))
+    # histplot returns the Axes it drew on; the label calls below must use it
+    # (they previously referenced an undefined name and raised NameError).
+    ax = sns.histplot(series, bins=100)
+    if ax_labels:
+        ax.set_xticklabels(ax_labels, rotation=0)
+    if ay_labels:
+        # y labels go on the y axis rather than overwriting the x tick labels.
+        ax.set_yticklabels(ay_labels, rotation=0)
+
+    plt.show()
+
+def isChisquareStrong(chi2_pvalue):
+    """Tell whether a chi-square p-value is below the 0.05 threshold.
+
+    Args:
+        chi2_pvalue: The p-value, as a number or anything ``float`` accepts.
+
+    Returns:
+        True when the p-value is strictly lower than 0.05, False otherwise.
+    """
+    threshold = 0.05
+    p_value = float(chi2_pvalue)
+    return p_value < threshold
+
+def getCramervRelation(cramer_v):
+    """Translate a Cramér's V value into a qualitative strength label.
+
+    Each 0.1 step moves one label up: [0, 0.1) is "Negligible", [0.1, 0.2) is
+    "Weak", [0.2, 0.3) is "Moderate", [0.3, 0.4) is "Strong" and anything from
+    0.4 upwards is "Very Strong".
+
+    Args:
+        cramer_v: The Cramér's V statistic.
+
+    Returns:
+        One of the five labels above.
+    """
+    labels = ["Negligible", "Weak", "Moderate", "Strong", "Very Strong"]
+    # Clamp the bucket: values of 0.5 and above used to raise IndexError, and
+    # negative values silently wrapped around to the end of the list.
+    bucket = min(max(int(cramer_v * 10), 0), len(labels) - 1)
+    return labels[bucket]
diff --git a/spotify_songs_questions.sql b/spotify_songs_questions.sql
@@ -0,0 +1,98 @@
+-- The database is not included in the GitHub repo (due to file size restrictions).
+-- To run these queries, first generate the database by executing all the cells in w3-project.ipynb.
+
+
+-- 1. How do song properties affect their popularity?
+-- Answered in the Bivariate Analysis
+
+-- 2. How popular are acoustic songs compared to the AVG?
+-- Assuming a song is acoustic when its acousticness is 0.9 or more.
+-- Both averages are computed in one pass: AVG skips the NULLs produced for non-acoustic rows.
+SELECT AVG(CASE WHEN acousticness >= 0.9 THEN popularity END) AS acoustic_popularity,
+       AVG(popularity) AS avg_popularity
+FROM songs;
+
+-- 3. How popular are instrumental songs compared to the AVG?
+-- Assuming a song is instrumental when its instrumentalness is 0.9 or more.
+-- Both averages are computed in one pass: AVG skips the NULLs produced for non-instrumental rows.
+SELECT AVG(CASE WHEN instrumentalness >= 0.9 THEN popularity END) AS instrumental_popularity,
+       AVG(popularity) AS avg_popularity
+FROM songs;
+
+-- 4. How popular are live songs compared to the AVG?
+-- Assuming a song is live when its liveness is 0.9 or more.
+-- Both averages are computed in one pass: AVG skips the NULLs produced for non-live rows.
+SELECT AVG(CASE WHEN liveness >= 0.9 THEN popularity END) AS live_popularity,
+       AVG(popularity) AS avg_popularity
+FROM songs;
+
+-- 5. What are the most popular songs?
+-- Averages popularity per spotify_id and keeps the 5 songs with the highest average.
+-- NOTE(review): name, artists and the continent tie-breaker are neither grouped nor
+-- aggregated, so their values come from whichever row the engine picks for each
+-- spotify_id -- confirm this is acceptable (ties are not ordered deterministically).
+SELECT  name, artists, AVG(popularity) as avg_popularity 
+FROM songs  
+GROUP BY spotify_id 
+ORDER BY avg_popularity DESC, continent DESC 
+LIMIT 5;
+
+-- 6. What are the 5 most popular songs in Europe?
+-- Averages popularity per song over the rows whose continent is 'EU'.
+-- 'EU' is single-quoted so it is always parsed as a string literal; a double-quoted
+-- "EU" is an identifier and only works through SQLite's legacy fallback.
+-- continent is constant after the filter, so it is not needed as a sort key.
+SELECT continent, name, artists, AVG(popularity) AS avg_popularity
+FROM songs
+WHERE continent = 'EU'
+GROUP BY spotify_id, continent
+ORDER BY avg_popularity DESC
+LIMIT 5;
+
+-- 7. What are the most popular songs per continent?
+-- The inner query averages popularity per (song, continent); the outer query keeps
+-- the highest of those averages for each continent.
+-- NOTE(review): s.name and s.artists are not grouped or aggregated; this appears to
+-- rely on SQLite returning bare columns from the row that holds MAX(...) -- confirm
+-- before running this on another database engine.
+SELECT s.continent, s.name, s.artists, MAX(s.avg_popularity) as popularity
+FROM (
+	SELECT s1.spotify_id, s1.name, s1.artists, AVG(s1.popularity) as avg_popularity, s1.continent
+	FROM songs as s1  
+	GROUP BY s1.spotify_id, s1.continent) as s
+GROUP BY s.continent;
+
+-- 8. What are the 5 most popular artists?
+-- Ranks artists by the average popularity of all their rows (column 2 of the result).
+SELECT artists,
+       AVG(popularity) AS avg_popularity
+FROM songs
+GROUP BY artists
+ORDER BY 2 DESC
+LIMIT 5;
+
+-- 9. What are the properties of the songs of the most popular artist?
+-- The subquery picks the single artist with the highest average popularity;
+-- GROUP BY spotify_id then returns one row per song of that artist.
+-- Dividing by 60000.0 forces real division, so the duration keeps its fractional
+-- minutes instead of being truncated when duration_ms is stored as an INTEGER.
+SELECT ("duration_ms" / 60000.0) AS duration_min, "is_explicit", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"
+FROM songs WHERE artists = (
+	SELECT s2.artists
+	FROM songs AS s2
+	GROUP BY s2.artists
+	ORDER BY AVG(s2.popularity) DESC
+	LIMIT 1
+) GROUP BY spotify_id;
+
+-- 10. How do song properties evolve over time? (properties vs release date)
+-- Only the measures that can be averaged are kept (excluded: is_explicit, mode, key, time_signature).
+SELECT
+	strftime('%Y', datetime("album_release_date")) AS year,
+	(AVG("duration_ms") / 60000)                   AS "avg_duration_min",
+	AVG("danceability")                            AS "avg_danceability",
+	AVG("energy")                                  AS "avg_energy",
+	AVG("loudness")                                AS "avg_loudness",
+	AVG("speechiness")                             AS "avg_speechiness",
+	AVG("acousticness")                            AS "avg_acousticness",
+	AVG("instrumentalness")                        AS "avg_instrumentalness",
+	AVG("liveness")                                AS "avg_liveness",
+	AVG("valence")                                 AS "avg_valence",
+	AVG("tempo")                                   AS "avg_tempo"
+FROM songs
+GROUP BY year
+ORDER BY year;
+
+-- 11. How does time impact popularity? (popularity vs release date)
+-- One row per (snapshot year, release year) pair with the average popularity of its songs.
+SELECT
+	strftime('%Y', datetime("snapshot_date"))      AS pop_year,
+	strftime('%Y', datetime("album_release_date")) AS release_year,
+	AVG(popularity)                                AS popularity
+FROM songs
+GROUP BY pop_year, release_year
+ORDER BY pop_year, release_year;
+
+-- 12. How many albums were released by the most popular artists?
+-- top_artists holds the 5 artists with the highest average popularity;
+-- the main query counts the distinct album names of each of them.
+WITH top_artists AS (
+	SELECT s1.artists
+	FROM songs AS s1
+	GROUP BY s1.artists
+	ORDER BY AVG(s1.popularity) DESC
+	LIMIT 5
+)
+SELECT s2.artists, COUNT(DISTINCT s2.album_name) AS album
+FROM songs AS s2
+WHERE s2.artists IN (SELECT artists FROM top_artists)
+GROUP BY s2.artists;
+
+-- 13. When was the release of the first album of each of the most popular artists?
+-- The subquery selects the 5 artists with the highest average popularity; the outer
+-- query returns, per artist, the year of the earliest album_release_date.
+-- NOTE(review): s2.album_name is not grouped or aggregated; this appears to rely on
+-- SQLite taking it from the row that holds MIN("album_release_date") -- confirm
+-- before running this on another database engine.
+SELECT s2.artists, s2.album_name, strftime('%Y', datetime(MIN("album_release_date"))) as release_year
+FROM songs as s2
+WHERE artists IN (
+	SELECT s1.artists FROM songs as s1 GROUP BY s1.artists ORDER BY AVG(s1.popularity) DESC LIMIT 5
+) GROUP BY s2.artists;