Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/minidata/data/*.pkl
75 changes: 75 additions & 0 deletions data/clean/tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# The tag table is very fragmented: it initially holds about 270k tags.

import json
import mysql.connector
from tqdm import tqdm
import difflib
import pickle

# Tunables for the near-duplicate merge below.
max_step = 10000  # how many of the most recent representatives to compare against
threshold = 0.8  # minimum quick_ratio() for two values to count as the same tag


def merge_similar_tags(values, max_step=max_step, threshold=threshold):
    """Greedily collapse near-duplicate tag values onto a representative.

    Values are visited in order. Each value is compared against the most
    recent ``max_step`` representatives (oldest of that window first); the
    first representative whose ``SequenceMatcher.quick_ratio()`` reaches
    ``threshold`` becomes the value's canonical form. A value with no such
    match becomes a new representative itself.

    Comparing only against a bounded window (instead of every earlier value)
    is what keeps this from being a full double loop over all tags.

    NOTE(review): ``quick_ratio()`` is documented as an upper bound on
    ``ratio()``, so this can merge values that ``ratio()`` would keep apart.

    Args:
        values: Iterable of tag value strings, consumed once, in order.
        max_step: Size of the comparison window; must be at least 1.
        threshold: Similarity score at or above which two values are merged.

    Returns:
        ``(representatives, canonical)`` where ``representatives`` is the
        list of values kept as-is (in first-seen order) and ``canonical``
        maps every input value to its representative.

    Raises:
        ValueError: If ``max_step`` is smaller than 1 (``lst[-0:]`` would
            silently select the whole list rather than an empty window).
    """
    if max_step < 1:
        raise ValueError("max_step must be at least 1")

    representatives = []
    canonical = {}
    for value in values:
        for kept in representatives[-max_step:]:
            if difflib.SequenceMatcher(None, value, kept).quick_ratio() >= threshold:
                canonical[value] = kept
                break
        else:
            # No representative in the window was similar enough.
            representatives.append(value)
            canonical[value] = value
    return representatives, canonical


def main():
    """Build the tag-merge mapping for the sampled tag set and pickle it."""
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration that is not committed.
    mydb = mysql.connector.connect(
        host="172.16.0.176",  # database host
        user="root",  # database user
        passwd="admin",  # database password
        database="bigdata",
    )
    try:
        cursor = mydb.cursor()
        try:
            cursor.execute("SELECT id, value FROM tag")
            tag_map = {row[0]: row[1] for row in cursor.fetchall()}
        finally:
            cursor.close()
    finally:
        # The connection is only needed for the single SELECT above.
        mydb.close()

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open('../minidata/data/tag_set.pkl', 'rb') as f:
        tag_set = pickle.load(f)

    tags = list(tag_set)
    # Restrict the id -> value mapping to the sampled tags and persist it.
    tag_map = {tag_id: tag_map[tag_id] for tag_id in tag_set}
    with open('../minidata/data/tag_map.pkl', 'wb') as f:
        pickle.dump(tag_map, f)
    print(len(tags))

    # With ~270k tags a full double loop is too expensive; the windowed
    # comparison reportedly brought the run time from 3h+ down to 24s.
    # NOTE(review): the original notes describe stripping spaces, '-' and
    # apostrophes from the values and then sorting them; neither step is
    # applied here. The lower-casing below is applied to the entries of
    # tag_set, which are used as tag_map keys, not to the tag values --
    # confirm that this is intended.
    tags = [tag_id.lower() for tag_id in tags]

    # tqdm wraps the key list so the progress bar knows the total; values are
    # looked up lazily as the merge consumes them.
    values = (tag_map[tag_id] for tag_id in tqdm(tags))
    tag_lst, tag_dict = merge_similar_tags(values, max_step, threshold)

    print(len(tag_lst))
    print(len(set(tag_dict.values())))

    with open('tag_dict.pkl', 'wb') as f:
        pickle.dump(tag_dict, f)


if __name__ == "__main__":
    main()
28 changes: 28 additions & 0 deletions data/minidata/load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json
import mysql.connector
from tqdm import tqdm
import pickle

def main():
    """Print how many playlist creators are also in the pickled user-id set.

    Prints three numbers, one per line: the size of the overlap, the number
    of distinct ``creator_id`` rows returned by the query, and the number of
    distinct pickled user ids.
    """
    # Load the local file first so a missing pickle fails before a database
    # connection is opened.
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    file = "data/user_ids.pkl"
    with open(file, "rb") as f:
        user_ids = set(pickle.load(f))

    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration that is not committed.
    mydb = mysql.connector.connect(
        host="172.16.0.176",  # database host
        user="root",  # database user
        passwd="admin",  # database password
        database="bigdata",
    )
    try:
        cursor = mydb.cursor()
        try:
            cursor.execute("select distinct creator_id from playlist")
            creators = [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        mydb.close()

    users = user_ids.intersection(creators)
    print(len(users))
    print(len(creators))
    print(len(user_ids))


if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions data/minidata/read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import json
import mysql.connector
from tqdm import tqdm
import pickle

def build_user_query(track_ids):
    """Build the parameterised query for users linked to any of *track_ids*.

    The ids are passed as bound parameters instead of being spliced into the
    SQL text, so their content can never change the statement's structure.

    Args:
        track_ids: Non-empty sized iterable of track ids.

    Returns:
        ``(sql, params)`` suitable for ``cursor.execute(sql, params)``: the
        statement contains one ``%s`` placeholder per id and ``params`` is a
        tuple of the ids in iteration order.

    Raises:
        ValueError: If ``track_ids`` is empty (``IN ()`` is not valid SQL).
    """
    if not track_ids:
        raise ValueError("track_ids must not be empty")
    placeholders = ",".join(["%s"] * len(track_ids))
    sql = f"SELECT DISTINCT user_id FROM preference WHERE track_id IN ({placeholders})"
    return sql, tuple(track_ids)


def main():
    """Collect the users that have a preference row for the pickled tracks.

    Reads ``data/track_ids.pkl``, queries the ``preference`` table for the
    distinct ``user_id`` values matching those tracks, and writes the result
    list to ``data/user_ids.pkl``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open("data/track_ids.pkl", "rb") as f:
        track_ids = pickle.load(f)

    user_ids = []
    if track_ids:  # nothing to look up for an empty id list
        # NOTE(review): host and credentials are hard-coded in source;
        # consider moving them to configuration that is not committed.
        mydb = mysql.connector.connect(
            host="172.16.0.176",  # database host
            user="root",  # database user
            passwd="admin",  # database password
            database="bigdata",
        )
        try:
            cursor = mydb.cursor()
            try:
                sql, params = build_user_query(track_ids)
                cursor.execute(sql, params)
                user_ids = [row[0] for row in cursor.fetchall()]
            finally:
                cursor.close()
        finally:
            mydb.close()

    with open('data/user_ids.pkl', 'wb') as f:
        pickle.dump(user_ids, f)


if __name__ == "__main__":
    main()
33 changes: 33 additions & 0 deletions data/minidata/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pickle
from tqdm import tqdm
import json
import mysql.connector

def iter_track_tag_pairs(lines):
    """Yield each distinct ``(track_id, tag_id)`` pair found in *lines*.

    Every non-blank line is split on tabs: the second field is taken as the
    track id and the last field is parsed as JSON whose ``"tags"`` entry is a
    list of objects carrying an ``"id"``. A pair is yielded only the first
    time it is seen, so the caller never inserts the same key twice.

    Args:
        lines: Iterable of tab-separated text lines.

    Yields:
        ``(track_id, tag_id)`` tuples; ``track_id`` is the raw string field,
        ``tag_id`` is whatever value the JSON holds under ``"id"``.

    Raises:
        json.JSONDecodeError: If the last field of a line is not valid JSON.
        IndexError: If a non-blank line has no second field.
        KeyError: If the JSON has no ``"tags"`` key or a tag has no ``"id"``.
    """
    seen = set()
    for line in lines:
        if not line.strip():
            continue  # e.g. a trailing empty line at the end of the file
        fields = line.split("\t")
        track_id = fields[1]
        # Tolerate a JSON null in place of the tag list.
        tags = json.loads(fields[-1])["tags"] or ()
        for tag in tags:
            pair = (track_id, tag["id"])
            if pair in seen:
                continue
            seen.add(pair)
            yield pair


def main():
    """Insert the track/tag links from the mini tracks file into ``track_tag``."""
    filepath = "C:/Users/HP/Desktop/音乐推荐/entities/mini_tracks.idomaar"
    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration that is not committed.
    mydb = mysql.connector.connect(
        host="172.16.0.176",  # database host
        user="root",  # database user
        passwd="admin",  # database password
        database="minidata",
    )
    try:
        cursor = mydb.cursor()
        try:
            sql = "INSERT INTO track_tag (track_id, tag_id) VALUES (%s, %s)"
            for val in iter_track_tag_pairs(tqdm(lines)):
                try:
                    cursor.execute(sql, val)
                except mysql.connector.Error as e:
                    # Best effort: report the failed row and keep going.
                    print(e)
            mydb.commit()
        finally:
            cursor.close()
    finally:
        mydb.close()


if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions data/minidata/write.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import mysql.connector
import re
import pickle
from tqdm import tqdm


def main():
    """Insert every ``id -> value`` entry of the pickled tag map into ``tag``.

    All rows are inserted in one transaction; nothing is committed if any
    insert raises.
    """
    # Load the local file first so a missing pickle fails before a database
    # connection is opened.
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open('./data/tag_map.pkl', 'rb') as f:
        tag_map = pickle.load(f)

    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration that is not committed.
    mydb = mysql.connector.connect(
        host="172.16.0.176",  # database host
        user="root",  # database user
        passwd="admin",  # database password
        database="minidata",
    )
    try:
        cursor = mydb.cursor()
        try:
            sql = "INSERT INTO tag (id, value) VALUES (%s, %s)"
            for val in tqdm(tag_map.items()):
                cursor.execute(sql, val)
            mydb.commit()
        finally:
            cursor.close()
    finally:
        mydb.close()


if __name__ == "__main__":
    main()