# get_data.py
# Fetches Spotify track/artist metadata for playlist tracks and writes it to CSV files.
import spotipy
import pandas as pd
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
import json
import os
from tqdm import tqdm
import re
from multiprocessing import Pool
#function to write the songs to a csv file
# Matches the "<digits>-<digits>" playlist-id range embedded in a slice file name.
_SLICE_RANGE_RE = re.compile(r'\d+-\d+')
# Audio-feature fields that are not written to the CSV output.
_UNUSED_COLUMNS = ["analysis_url", "track_href", "type", "uri"]


def _list_slice_files(directory):
    """Return the slice file names in `directory`, ordered by their first playlist id.

    Only names containing a "<digits>-<digits>" range are kept, because that
    range is needed to name the output CSV. os.listdir() returns entries in
    arbitrary order, so the names are sorted to make file indices reproducible.
    """
    ranged = []
    for name in os.listdir(directory):
        match = _SLICE_RANGE_RE.search(name)
        if match:
            first_pid = int(match.group().split('-')[0])
            ranged.append((first_pid, name))
    return [name for _, name in sorted(ranged)]


def _fetch_track_row(client, track, playlist_name, artist_cache):
    """Return one output row (dict) for `track`, or None if its data is unavailable.

    `artist_cache` maps artist URI -> artist object so that an artist appearing
    on many tracks is only requested once.
    """
    track_uri = track["track_uri"]
    artist_uri = track["artist_uri"]
    try:
        track_obj = client.track(track_uri)
        if artist_uri not in artist_cache:
            artist_cache[artist_uri] = client.artist(artist_uri)
        features_list = client.audio_features(track_uri)
    except (spotipy.SpotifyException, OSError) as err:
        # Best effort: API errors and network errors (which derive from
        # OSError) skip the track instead of aborting the whole run.
        tqdm.write(f"Skipping {track_uri}: {err}")
        return None
    features = features_list[0] if features_list else None
    if features is None:
        # No audio features are available for this track.
        return None
    artist_obj = artist_cache[artist_uri]
    # Audio features first, then the track metadata (same key order as before).
    return {
        **features,
        "artist_name": track["artist_name"],
        "track_name": track["track_name"],
        "artist_popularity": artist_obj['popularity'],
        "track_popularity": track_obj["popularity"],
        "artist_genres": artist_obj["genres"],
        "playlist_name": playlist_name,
    }


def write_songs_to_csv(directory = 'spotify_million_playlist_dataset/data/', write_directory = 'playlist_dataframes/',file1_idx = 0, file2_idx = 1):
    """Write the tracks of the selected playlist slice files to CSV.

    For every selected JSON file, each track of each playlist is looked up
    through the module-level Spotify client `sp` (track, artist and audio
    features), and the combined rows are written to
    `<write_directory>/<first pid>-<last pid>.csv`, one CSV per input file.

    Args:
        directory: Folder containing the playlist slice JSON files.
        write_directory: Existing folder that receives the CSV files.
        file1_idx: Index of the first file to process (files are ordered by
            the first playlist id in their name).
        file2_idx: Index one past the last file to process.

    Raises:
        RuntimeError: If no module-level Spotify client named `sp` exists.
    """
    try:
        client = sp
    except NameError as err:
        # Fail loudly up front instead of silently skipping every track.
        raise RuntimeError(
            "write_songs_to_csv needs a module-level Spotify client named 'sp'"
        ) from err

    artist_cache = {}
    for file_name in _list_slice_files(directory)[file1_idx:file2_idx]:
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as file:
            data = json.load(file)

        slice_range = _SLICE_RANGE_RE.search(file_name).group()
        csv_path = os.path.join(write_directory, slice_range + '.csv')
        # Column order of the first chunk written; later chunks are appended
        # without a header and therefore must use exactly the same order.
        columns = None

        for playlist in tqdm(data["playlists"], desc="Loading playlists"):
            rows = []
            for track in playlist["tracks"]:
                row = _fetch_track_row(client, track, playlist["name"], artist_cache)
                if row is not None:
                    rows.append(row)
            if not rows:
                # Nothing could be resolved for this playlist.
                continue

            song_df = pd.DataFrame(rows).drop(columns=_UNUSED_COLUMNS, errors="ignore")
            if columns is None:
                # First chunk for this file: start a fresh CSV with a header.
                columns = list(song_df.columns)
                song_df.to_csv(csv_path, mode='w', header=True, index=False)
            else:
                song_df = song_df.reindex(columns=columns)
                song_df.to_csv(csv_path, mode='a', header=False, index=False)
def _init_worker(cid, secret):
    """Create this process's Spotify client and publish it as the global `sp`.

    Runs once in every pool worker (and can be called directly for a
    single-process run) so that write_songs_to_csv finds its client.
    """
    global sp
    #Authentication - without user
    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


def main():
    """Read the API credentials and convert the first four slice files in parallel."""
    #read cid from file (strip the trailing newline so authentication does not fail)
    with open("ids/cid.txt", "r") as file:
        cid = file.read().strip()
    #read secret from file
    with open("ids/secret.txt", "r") as file:
        secret = file.read().strip()

    directory = 'spotify_million_playlist_dataset/data/'
    write_directory = 'playlist_dataframes/'
    os.makedirs('playlist_dataframes', exist_ok=True)

    #use multiprocessing to write the songs to csv files
    # starmap passes each tuple positionally, so it must follow the parameter
    # order of write_songs_to_csv: (directory, write_directory, file1_idx, file2_idx).
    jobs = [(directory, write_directory, idx, idx + 1) for idx in range(4)]
    with Pool(initializer=_init_worker, initargs=(cid, secret)) as pool:
        pool.starmap(write_songs_to_csv, jobs)
        pool.close()
        pool.join()
    # For a single-process run: call _init_worker(cid, secret) and then
    # write_songs_to_csv(file1_idx=0, file2_idx=1).


if __name__ == '__main__':
    main()