Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions avg_score_by_iso_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
,0,1
0,AE,4.714285714285714
1,AL,5.0
2,AR,4.9
3,AT,4.461538461538462
4,AU,4.50561797752809
5,BE,4.777777777777778
6,BG,5.0
7,BH,4.0
8,BN,5.0
9,BR,4.612903225806452
10,BY,5.0
11,CA,4.468421052631579
12,CH,4.818181818181818
13,CL,4.625
14,CN,4.862068965517241
15,CO,5.0
16,CR,5.0
17,CY,5.0
18,CZ,5.0
19,DE,4.734939759036145
20,DK,4.678571428571429
21,DO,5.0
22,EC,3.6666666666666665
23,EG,4.666666666666667
24,ES,4.833333333333333
25,FI,4.363636363636363
26,FR,4.722222222222222
27,GB,4.664576802507837
28,GR,4.6
29,GT,5.0
30,HK,4.6
31,HN,4.0
32,HR,4.6
33,HU,4.333333333333333
34,ID,4.833333333333333
35,IE,4.6
36,IL,5.0
37,IN,4.84375
38,IS,4.0
39,IT,4.9411764705882355
40,JO,5.0
41,JP,4.9
42,KH,5.0
43,KR,4.2
44,KW,5.0
45,KZ,5.0
46,LB,5.0
47,LK,5.0
48,LT,5.0
49,LV,5.0
50,MK,5.0
51,MT,5.0
52,MX,4.620689655172414
53,MY,4.407407407407407
54,NE,5.0
55,NG,4.5
56,NL,4.622222222222222
57,NO,4.775
58,NZ,4.5
59,OM,4.0
60,PE,4.833333333333333
61,PH,4.857142857142857
62,PK,4.0
63,PL,4.666666666666667
64,PT,5.0
65,RO,4.714285714285714
66,RU,4.684210526315789
67,SE,4.5
68,SG,4.964285714285714
69,SI,5.0
70,SK,5.0
71,TH,4.888888888888889
72,TN,5.0
73,TR,4.75
74,TW,4.6
75,TZ,5.0
76,UA,4.857142857142857
77,US,4.507191994996873
78,UY,5.0
79,VN,5.0
80,ZA,4.529411764705882
50 changes: 50 additions & 0 deletions avg_score_over_time_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
,0,1
0,2017-05-23,4.3478260869565215
1,2017-05-24,4.688524590163935
2,2017-05-25,4.440860215053763
3,2017-05-26,4.595238095238095
4,2017-05-27,4.783333333333333
5,2017-05-28,4.863636363636363
6,2017-05-29,4.626373626373627
7,2017-05-30,4.602941176470588
8,2017-05-31,4.280405405405405
9,2017-06-01,4.516624040920716
10,2017-06-02,4.523605150214593
11,2017-06-03,4.670731707317073
12,2017-06-04,4.574468085106383
13,2017-06-05,4.752
14,2017-06-06,4.71304347826087
15,2017-06-07,4.536082474226804
16,2017-06-08,4.538461538461538
17,2017-06-09,4.595238095238095
18,2017-06-10,4.59375
19,2017-06-11,4.791666666666667
20,2017-06-12,4.589285714285714
21,2017-06-13,4.597560975609756
22,2017-06-14,4.388888888888889
23,2017-06-15,3.625
24,2017-06-16,3.625
25,2017-06-17,3.8333333333333335
26,2017-06-18,3.2777777777777777
27,2017-06-19,3.2222222222222223
28,2017-06-20,3.8
29,2017-06-21,2.3333333333333335
30,2017-06-22,2.8
31,2017-06-23,3.090909090909091
32,2017-06-24,3.2
33,2017-06-25,2.8
34,2017-06-26,3.4
35,2017-06-27,4.562893081761007
36,2017-06-28,4.667493796526054
37,2017-06-29,4.644268774703558
38,2017-06-30,4.67741935483871
39,2017-07-01,4.71875
40,2017-07-02,4.689655172413793
41,2017-07-03,4.734513274336283
42,2017-07-04,4.785046728971962
43,2017-07-05,4.534883720930233
44,2017-07-06,4.537634408602151
45,2017-07-07,4.525773195876289
46,2017-07-08,4.677083333333333
47,2017-07-09,4.466666666666667
48,2017-07-10,4.928571428571429
6 changes: 6 additions & 0 deletions max_score_by_app_bought_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
,0,1
0,"(-0.001, 20.0]",5
1,"(20.0, 38.0]",5
2,"(38.0, 58.0]",5
3,"(58.0, 79.0]",5
4,"(79.0, 100.0]",5
34 changes: 34 additions & 0 deletions reddit_data_SQL_insert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import sqlite3
import pandas as pd

def populate_database(database_path, data_path):
    """Bulk-load the cleaned review CSV at *data_path* into the ``reviews``
    table of the SQLite database at *database_path*.

    Failures (missing file, schema mismatch, locked database, ...) are
    printed rather than raised, preserving the script's best-effort
    behaviour; callers do not expect exceptions to propagate.
    """
    database = None
    try:
        data = pd.read_csv(data_path)

        # One positional placeholder per column of the reviews table.
        # NOTE(review): clean_data() writes its CSV with the index column,
        # but this read_csv has no index_col — that would give 10 values per
        # row against 9 placeholders. Confirm the CSV layout matches.
        sql_insert_reddit_database = '''INSERT INTO reviews(review, title, iso,
                                score, date, apps_bought, money_spent,
                                apps_bought_bucket, money_spent_bucket)
                                VALUES(?,?,?,?,?,?,?,?,?)'''

        # executemany() wants an iterable of row tuples, not numpy rows.
        tupled_data = [tuple(i) for i in data.values]

        database = sqlite3.connect(database_path)
        cursor = database.cursor()
        cursor.executemany(sql_insert_reddit_database, tupled_data)
        database.commit()
    except Exception as e:
        # Deliberately broad: the original swallowed and printed any error.
        print(e)
    finally:
        # Always release the connection, even when the insert fails
        # (the original leaked it on any error after connect()).
        if database is not None:
            database.close()


# Script entry point: load the cleaned review CSV into the SQLite database.
populate_database("exercise_database.db", "reddit_exercise_data_clean.csv")

"""NB I did see that there was a one-line code you coud use to add a pandas
dataframe to SQL but 1) I've never used SQL before so it was more fun to learn
it and 2) it felt like it was going against the "spirit" of the challenge"""
46 changes: 46 additions & 0 deletions reddit_data_SQL_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import sqlite3
import pandas as pd

commands = {
"avg_score_by_iso_grouping": '''SELECT iso, AVG(score)
FROM reviews
GROUP BY iso''',
"max_score_by_app_bought_grouping": '''SELECT apps_bought_bucket, MAX(score)
FROM reviews
GROUP BY apps_bought_bucket''',
"avg_score_over_time_grouping": '''SELECT date, AVG(score)
FROM reviews
GROUP BY date'''
}

def sql_query(command, database_path):
    """Execute one read-only SQL *command* against the SQLite database at
    *database_path*.

    Returns the fetched rows as a list of tuples, or None when the
    connection or the query fails (the error is printed, not raised,
    preserving the original best-effort behaviour).
    """
    try:
        database = sqlite3.connect(database_path)
        try:
            cursor = database.cursor()
            cursor.execute(command)
            return cursor.fetchall()
        finally:
            # Close even when execute() raises — the original leaked the
            # connection on any query error.
            database.close()
    except sqlite3.Error as e:
        # Narrowed from `except Exception`: only database errors are
        # expected here, and they are reported rather than propagated.
        print(e)

def write_all_to_csv(database_path):
    """Run every canned query in `commands` against *database_path* and save
    each result set to '<query name>.csv' in the working directory."""
    for name, statement in commands.items():
        rows = sql_query(statement, database_path)
        pd.DataFrame(rows).to_csv(name + ".csv")


# Script entry point: run every canned query and dump each result to CSV.
write_all_to_csv("exercise_database.db")










56 changes: 56 additions & 0 deletions reddit_data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pandas as pd
import re
from datetime import datetime

def strip_dates(date):
    """Drop any time-of-day portion: everything from the first space on."""
    if ' ' not in date:
        return date
    return re.sub(r' .*$', '', date)

def uniform_format_dates(date):
    """Normalise assorted date spellings to ISO 'YYYY-MM-DD'.

    Handles 'MM/DD/YY' and 'YY-MM-DD'; any other string is returned
    unchanged (it is assumed to already be 'YYYY-MM-DD').
    Assumes these are the only odd formats present in this data set.
    """
    if "/" in date:
        # Slash format is assumed unambiguous, so a parse failure here is a
        # real data error and is allowed to propagate (as in the original).
        return datetime.strptime(date, '%m/%d/%y').strftime('%Y-%m-%d')
    try:
        return datetime.strptime(date, '%y-%m-%d').strftime('%Y-%m-%d')
    except ValueError:
        # Narrowed from a bare `except:` — only a failed parse means
        # "already ISO formatted"; anything else should not be hidden.
        return date

def clean_dates(date):
    """Fully normalise one raw date string: strip the time portion, then
    unify the year/month/day format to ISO."""
    return uniform_format_dates(strip_dates(date))

def clean_data(dirty_data_location, clean_data_location='reddit_exercise_data_clean.csv'):
    """Read the raw review CSV at *dirty_data_location*, clean it, and write
    the result to *clean_data_location* (default matches the original
    hard-coded output name, so existing callers are unaffected).

    Cleaning steps: add quantile buckets for spend/purchase counts,
    normalise the date column, and drop the redundant product_name column.
    """
    data = pd.read_csv(dirty_data_location, index_col=0)

    # Quantile bucketing, because fixed-width buckets could be meaningless
    # if there are outliers. Five buckets chosen arbitrarily.
    data['app_bought_bucket'] = pd.qcut(data['app_bought'], 5)
    data['money_spent_bucket'] = pd.qcut(data['money_spent'], 5)

    # Inspection of the data shows the date formats are inconsistent.
    data['date'] = data['date'].apply(clean_dates)

    # product_name contains no unique info, delete.
    data = data.drop(columns="product_name")

    # Write the cleaned data out as a new .csv file.
    data.to_csv(clean_data_location)

    return None


# Script entry point: clean the raw export and write the cleaned CSV.
dirty_data_location = 'reddit_exercise_data.csv'
clean_data(dirty_data_location)










Loading