Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__pycache__/
dataset/*
output/*
Binary file added Music Popularity Analysis.pdf
Binary file not shown.
88 changes: 88 additions & 0 deletions data_analysis_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def na_values(exclude=None):
    """Return the NA marker strings to pass as ``pd.read_csv(na_values=...)``.

    The markers follow the defaults listed in the pandas ``read_csv`` docs:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

    Args:
        exclude: Optional iterable of markers to leave out of the result,
            e.g. ``["NA"]`` when "NA" is a legitimate value in the data.
            ``None`` (the default) excludes nothing.

    Returns:
        A fresh list of marker strings; callers may mutate it freely.

    Raises:
        ValueError: If an entry of ``exclude`` is not one of the markers.
    """
    markers = [
        " ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
        "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None",
        "n/a", "nan", "null",
    ]
    # ``None`` replaces the former mutable ``[]`` default argument.
    if exclude is not None:
        for marker in exclude:
            markers.remove(marker)
    return markers

def cleanColNames(df, ncol=None):
    """Normalise the column names of ``df`` in place and return ``df``.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.
        ncol: Optional mapping ``{original_name: new_name}``. Columns found
            in it are renamed to the mapped value verbatim; every other
            column is lower-cased and has its spaces replaced by "_".

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    # ``None`` replaces the former mutable ``{}`` default argument.
    overrides = {} if ncol is None else ncol
    df.columns = [
        overrides[column] if column in overrides else column.lower().replace(" ", "_")
        for column in df.columns
    ]
    return df

def getNullValues(df, onlyEmpty=False):
    """Count the missing values in each column of ``df``.

    Args:
        df: DataFrame to inspect.
        onlyEmpty: When truthy, keep only the columns that have at least
            one missing value.

    Returns:
        A Series indexed by column name holding the number of NA cells.
    """
    missing_per_column = df.isna().sum()
    if not onlyEmpty:
        return missing_per_column
    has_missing = missing_per_column > 0
    return missing_per_column[has_missing]

def cleanNullRows(df):
    """Return a copy of ``df`` without the rows in which every cell is NA.

    Rows that contain at least one non-missing value are kept; the input
    DataFrame itself is not modified.
    """
    rows_with_data = df.dropna(how="all", axis="index")
    return rows_with_data

def getDuplicated(df):
    """Return how many rows of ``df`` repeat an earlier, identical row."""
    repeated_rows = df.duplicated()
    return repeated_rows.sum()

def cleanDuplicated(df, keep="first"):
    """Drop duplicated rows from ``df`` in place and renumber its index.

    Args:
        df: DataFrame to de-duplicate; it is modified in place.
        keep: Forwarded to ``DataFrame.drop_duplicates`` ("first", "last"
            or ``False``) to choose which occurrence survives.

    Returns:
        The same ``df`` object, with a fresh 0..n-1 index.
    """
    df.drop_duplicates(keep=keep, inplace=True)
    # drop=True discards the old labels. Without it reset_index inserts them
    # as a stray "index" column and raises if the function is called twice.
    df.reset_index(drop=True, inplace=True)
    return df

def getFrequencyTable(df, column):
    """Build a frequency table for one column of ``df``.

    Args:
        df: Source DataFrame.
        column: Name of the column to summarise.

    Returns:
        A DataFrame indexed by the distinct values of ``column`` with two
        columns: the absolute counts and the relative frequencies rounded
        to two decimals.
    """
    absolute = df[column].value_counts()
    relative = df[column].value_counts(normalize=True).round(2)
    return pd.concat([absolute, relative], axis="columns")

# TODO: Call show in the notebook, function only calculate and return values
def analyze_cat_values(df, column, chart_size=False, ax_labels=False, ay_labels=False):
    """Print the frequency table of a categorical column and chart it.

    A column with at most two distinct values is drawn as a pie chart of
    proportions; anything else is drawn as a bar chart of counts.

    Args:
        df: Source DataFrame.
        column: Name of the categorical column to analyse.
        chart_size: Optional ``(width, height)`` figure size. Falsy values
            select (6, 6) for the pie chart and (6, 3) for the bar chart.
        ax_labels: Optional labels. They replace the wedge labels of the pie
            chart, or the x tick labels of the bar chart.
        ay_labels: Optional y tick labels; only used by the bar chart.

    Returns:
        None. The table is displayed and the figure is shown.
    """
    print(f"Proportion table for {column}: ")
    frequency_table = getFrequencyTable(df, column)
    try:
        # `display` is only available as a global inside IPython/Jupyter.
        display(frequency_table)
    except NameError:
        print(frequency_table)

    if len(df[column].unique()) <= 2:  # If it is binary draw a pie plot
        plt.figure(figsize=chart_size if chart_size else (6, 6))
        labels = ax_labels if ax_labels else frequency_table.index
        plt.pie(frequency_table["proportion"], labels=labels, autopct='%1.1f%%')
        plt.title(f'Proportion of values for the feature {column}')
    else:  # else draw a barchart
        plt.figure(figsize=chart_size if chart_size else (6, 3))
        ax = sns.barplot(
            y=frequency_table["count"],
            x=frequency_table.index,
            data=frequency_table,
            legend=False,
        )
        plt.title(f'Proportion of values for the feature {column}')
        if ax_labels:
            ax.set_xticklabels(ax_labels, rotation=0)
        if ay_labels:
            # Previously this overwrote the x tick labels instead of the y ones.
            ax.set_yticklabels(ay_labels, rotation=0)

    plt.show()

# TODO: Call print and show in the notebook, function only calculate and return values
def analyze_num_values(df, column, ax_labels=False, ay_labels=False):
    """Print descriptive statistics of a numeric column and plot its histogram.

    Args:
        df: Source DataFrame.
        column: Name of the numeric column to analyse.
        ax_labels: Optional x tick labels for the histogram.
        ay_labels: Optional y tick labels for the histogram.

    Returns:
        None. The statistics are printed and the figure is shown.
    """
    values = df[column]

    print(f"Statistic values for {column}:")
    print("Count:\t\t", values.count())
    print("AVG:\t\t", values.mean())
    print("Min:\t\t", values.min())
    print("Quantile 25:\t", values.quantile(0.25))
    print("Quantile 50:\t", values.quantile(0.5))
    print("Quantile 75:\t", values.quantile(0.75))
    print("Max:\t\t", values.max())
    print("Mode:\t\t", list(values.mode()))
    print("Variance:\t", values.var())
    print("STD:\t\t", values.std())
    print("Skewness:\t", values.skew())
    print("Kurtosis:\t", values.kurt())

    plt.figure(figsize=(10, 5))
    # Bind the axes as `ax`: the label code below used `ax` while the axes
    # was stored under another name, so any label argument raised NameError.
    ax = sns.histplot(values, bins=100)
    if ax_labels:
        ax.set_xticklabels(ax_labels, rotation=0)
    if ay_labels:
        # Previously this overwrote the x tick labels instead of the y ones.
        ax.set_yticklabels(ay_labels, rotation=0)
    # For heavily skewed data a log scale can be enabled: ax.set(yscale='log')

    plt.show()

def isChisquareStrong(chi2_pvalue):
    """Tell whether a chi-square p-value is below the 0.05 threshold.

    Args:
        chi2_pvalue: The p-value, as a number or anything ``float()`` accepts.

    Returns:
        True when the p-value is strictly smaller than 0.05.
    """
    threshold = 0.05
    p_value = float(chi2_pvalue)
    return p_value < threshold

def getCramervRelation(cramer_v):
    """Translate a Cramér's V value into a qualitative strength label.

    The value is bucketed in steps of 0.1: [0, 0.1) is "Negligible",
    [0.1, 0.2) "Weak", [0.2, 0.3) "Moderate", [0.3, 0.4) "Strong" and
    everything from 0.4 upwards "Very Strong".

    Args:
        cramer_v: The Cramér's V statistic.

    Returns:
        One of "Negligible", "Weak", "Moderate", "Strong", "Very Strong".
    """
    labels = ("Negligible", "Weak", "Moderate", "Strong", "Very Strong")
    bucket = int(cramer_v * 10)
    # Clamp the index: values >= 0.5 used to raise IndexError, and negative
    # values wrapped around to the wrong end of the list.
    bucket = max(0, min(bucket, len(labels) - 1))
    return labels[bucket]
98 changes: 98 additions & 0 deletions spotify_songs_questions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
-- Database is not included in the GitHub repo (due to file size restrictions)
-- To execute these queries, the database must first be generated by executing all the cells in w3-project.ipynb


-- 1. How do song properties affect their popularity?
-- Answered in the Bivariate Analysis

-- 2. How popular are acoustic songs compared to the AVG?
-- Assuming acoustic means an acousticness of 0.9 or more
-- Returns a single row: the average popularity of the rows with
-- acousticness >= 0.9 next to the average popularity of all rows in songs.
SELECT (SELECT AVG(popularity) as popularity FROM songs WHERE acousticness >= 0.9) AS acoustic_popularity ,
(SELECT AVG(popularity) as popularity FROM songs) AS avg_popularity;

-- 3. How popular are instrumental songs compared to the AVG?
-- Assuming instrumental means an instrumentalness of 0.9 or more
-- Returns a single row: the average popularity of the rows with
-- instrumentalness >= 0.9 next to the average popularity of all rows in songs.
SELECT (SELECT AVG(popularity) as popularity FROM songs WHERE instrumentalness >= 0.9) AS instrumental_popularity ,
(SELECT AVG(popularity) as popularity FROM songs) AS avg_popularity;

-- 4. How popular are live songs compared to the AVG?
-- Assuming live means a liveness of 0.9 or more
-- Returns a single row: the average popularity of the rows with
-- liveness >= 0.9 next to the average popularity of all rows in songs.
SELECT (SELECT AVG(popularity) as popularity FROM songs WHERE liveness >= 0.9) AS live_popularity ,
(SELECT AVG(popularity) as popularity FROM songs) AS avg_popularity;

-- 5. What are the most popular songs?
-- Averages popularity over all rows that share a spotify_id and keeps the
-- 5 highest averages.
-- NOTE(review): name, artists and continent are neither grouped nor
-- aggregated; this presumably relies on SQLite's bare-column behaviour
-- (the file uses strftime elsewhere) — confirm the engine.
SELECT name, artists, AVG(popularity) as avg_popularity
FROM songs
GROUP BY spotify_id
ORDER BY avg_popularity DESC, continent DESC
LIMIT 5;

-- 6. What are the 5 most popular songs in Europe?
-- Averages popularity per song over the rows whose continent is 'EU' and
-- keeps the 5 highest averages.
-- The literal is single-quoted: double quotes denote identifiers in SQL, and
-- SQLite only falls back to treating "EU" as a string as a legacy quirk
-- (it would silently compare against a column if one were named EU).
SELECT continent, name, artists, AVG(popularity) as avg_popularity
FROM songs
WHERE continent = 'EU'
GROUP BY spotify_id, continent
ORDER BY avg_popularity DESC, continent DESC
LIMIT 5;

-- 7. What are the most popular songs per continent?
-- The inner query averages popularity per (spotify_id, continent); the outer
-- query keeps the highest of those averages for each continent.
-- NOTE(review): s.name and s.artists are bare columns next to MAX(); this
-- presumably relies on SQLite taking them from the row that holds the
-- maximum — confirm before porting to another engine.
SELECT s.continent, s.name, s.artists, MAX(s.avg_popularity) as popularity
FROM (
SELECT s1.spotify_id, s1.name, s1.artists, AVG(s1.popularity) as avg_popularity, s1.continent
FROM songs as s1
GROUP BY s1.spotify_id, s1.continent) as s
GROUP BY s.continent;

-- 8. What are the 5 most popular artists?
-- Averages popularity over all rows that share the same artists value and
-- keeps the 5 highest averages.
-- NOTE(review): grouping is on the whole artists value; if that column can
-- hold several names, each combination counts as its own "artist" — verify.
SELECT artists, AVG(popularity) as avg_popularity
FROM songs
GROUP BY artists
ORDER BY avg_popularity DESC
LIMIT 5;

-- 9. What are the properties of the songs of the most popular artists?
-- The subquery selects the single artists value with the highest average
-- popularity (LIMIT 1); the outer query returns one row per spotify_id of
-- that artist with its audio properties.
-- Dividing by 60000.0 forces floating-point division, so fractional minutes
-- are kept even when duration_ms is stored as an integer.
SELECT ("duration_ms" / 60000.0) as duration_min, "is_explicit", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"
FROM songs WHERE artists = (
SELECT s2.artists
FROM songs as s2
GROUP BY s2.artists
ORDER BY AVG(s2.popularity) DESC
LIMIT 1
) GROUP BY spotify_id;

-- 10. How do song properties evolve over time? (properties vs release date)
-- Keeping only the measures that can be averaged (excluded: is_explicit, mode, key)
-- Returns one row per album release year with the average of each measure;
-- the average duration is converted from milliseconds to minutes.
SELECT strftime('%Y', datetime("album_release_date")) as year,
(AVG("duration_ms") / 60000) as "avg_duration_min",
AVG("danceability") as "avg_danceability",
AVG("energy") as "avg_energy",
AVG("loudness") as "avg_loudness",
AVG("speechiness") as "avg_speechiness",
AVG("acousticness") as "avg_acousticness",
AVG("instrumentalness") as "avg_instrumentalness",
AVG("liveness") as "avg_liveness",
AVG("valence") as "avg_valence",
AVG("tempo") as "avg_tempo"
FROM songs
GROUP BY year
ORDER BY year ASC;

-- 11. How does the time impact popularity? (popularity vs release date)
-- Returns the average popularity for every combination of snapshot year
-- (pop_year) and album release year (release_year), ordered chronologically.
SELECT strftime('%Y', datetime("snapshot_date")) as pop_year, strftime('%Y', datetime("album_release_date")) as release_year, AVG(popularity) as popularity
FROM songs
GROUP BY pop_year, release_year
ORDER BY pop_year ASC, release_year ASC;

-- 12. How many albums were released by the most popular artists?
-- The subquery selects the 5 artists values with the highest average
-- popularity; the outer query counts the distinct album names of each.
-- Only albums that appear in the songs table are counted.
SELECT s2.artists, count(DISTINCT s2.album_name) as album
FROM songs as s2
WHERE artists IN (
SELECT s1.artists FROM songs as s1 GROUP BY s1.artists ORDER BY AVG(s1.popularity) DESC LIMIT 5
) GROUP BY s2.artists;

-- 13. When was the release of the first album of the most popular artists?
-- For each of the 5 artists values with the highest average popularity,
-- returns the year of the earliest album_release_date found in songs.
-- NOTE(review): s2.album_name is a bare column next to MIN(); this presumably
-- relies on SQLite taking it from the row that holds the minimum — confirm.
SELECT s2.artists, s2.album_name, strftime('%Y', datetime(MIN("album_release_date"))) as release_year
FROM songs as s2
WHERE artists IN (
SELECT s1.artists FROM songs as s1 GROUP BY s1.artists ORDER BY AVG(s1.popularity) DESC LIMIT 5
) GROUP BY s2.artists;
Loading