Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.pyc
*.pyo
*.DS_Store
21 changes: 21 additions & 0 deletions exercise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from functions.data_cleaning import clean_data
from functions.populate_db import populate_db
from functions.sql_queries import run_queries


import os

# Path of the raw input CSV file:
original_csv_path = 'reddit_exercise_data.csv'

# Clean the raw data into a DataFrame ready for the database:
df = clean_data(original_csv_path)

# Make sure the output directory exists before writing into it
# (to_csv / run_queries both write under results/ and would otherwise
# fail with FileNotFoundError on a fresh checkout).
os.makedirs('results', exist_ok=True)

# Persist the cleaned data so populate_db can load it:
clean_data_path = 'results/clean_data.csv'
df.to_csv(clean_data_path, encoding='utf-8', index=False)

# Load the cleaned CSV into the SQLite database:
db_path = 'exercise_database.db'
populate_db(db_path, clean_data_path)

# Run the metrics SQL queries; each result is saved to its own CSV under results/:
run_queries(db_path)
Binary file modified exercise_database.db
Binary file not shown.
2 changes: 2 additions & 0 deletions functions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-

40 changes: 40 additions & 0 deletions functions/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import pandas as pd
import datetime

def clean_data(csv_path):
    """Read the raw reviews CSV and return a cleaned DataFrame.

    Cleaning steps:
      * cast ``money_spent`` to float (becomes NUMERIC in SQLite),
      * drop ``product_name`` (single constant value, no information),
      * normalise ``date`` to the ISO ``YYYY-MM-DD`` string format that
        SQLite date functions understand (https://www.sqlite.org/lang_datefunc.html),
      * add equal-width bucket columns for ``app_bought`` and ``money_spent``.

    Parameters
    ----------
    csv_path : str or file-like
        Path (or buffer) of the raw CSV file; must contain the columns
        ``money_spent``, ``product_name``, ``date``, ``app_bought``.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, including the two new ``*_bucket`` columns.
    """
    df = pd.read_csv(csv_path, encoding='utf-8')

    # Vectorised cast instead of a per-row lambda; same result, faster.
    df['money_spent'] = df['money_spent'].astype(float)

    # product_name only has one value (checked with value_counts()) telling us
    # this is about the Reddit app, so it carries no information — drop it.
    df = df.drop(columns=['product_name'])

    # Every row's time is 00:00, so only the date part matters, but the date
    # format changes between rows: all dates are year-first except the ones
    # using '/' instead of '-', which are month-first with a 2-digit year.
    def _to_year_first(date_string):
        # Normalise 'MM/DD/YY' to 'YYYY-MM-DD'; year-first strings pass through.
        if '/' in date_string:
            return datetime.datetime.strptime(date_string, '%m/%d/%y').strftime('%Y-%m-%d')
        return date_string

    df['date'] = df['date'].apply(_to_year_first)
    # Parse and re-format so every remaining year-first variant ends up ISO.
    df['date'] = pd.to_datetime(df['date'], yearfirst=True).dt.strftime('%Y-%m-%d')

    # Bucket columns for apps bought and money spent.
    # EDA suggested hand-picked bins, e.g.:
    #   bins_apps  = [-1, 1, 5, 10, 25, 50, 100]
    #   bins_money = [-1, 1, 5, 10, 50, 100, 200, 300, 400, 450, 500]
    # but the business requirements are unknown and this must stay
    # production-ready for any data, so use 20 equal-width bins instead.
    df['app_bought_bucket'] = pd.cut(df['app_bought'], 20)
    df['money_spent_bucket'] = pd.cut(df['money_spent'], 20)

    return df
9 changes: 9 additions & 0 deletions functions/populate_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
import pandas as pd
import sqlite3

def populate_db(db_path, clean_data_path):
    """Load the cleaned CSV into the ``reviews`` table of a SQLite database.

    Any existing ``reviews`` table is replaced (``if_exists='replace'``).

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file (created if missing).
    clean_data_path : str
        Path of the cleaned CSV produced by ``clean_data``.
    """
    # Read the CSV before opening the connection so a parse error cannot
    # leave a dangling database handle.
    clean_data = pd.read_csv(clean_data_path)
    connection = sqlite3.connect(db_path)
    try:
        clean_data.to_sql(name='reviews', con=connection, if_exists='replace', index=False)
    finally:
        # Close even if to_sql fails — the original leaked the connection on error.
        connection.close()
18 changes: 18 additions & 0 deletions functions/sql_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
import pandas as pd
import sqlite3

# Metric name -> SQL query; each query is run against the `reviews` table
# and its result saved as results/<name>.csv.
queries_dict = {
    'average_score_by_iso': """SELECT iso ,AVG(score) FROM reviews GROUP BY iso """,
    'max_score_by_app_bucket': """SELECT app_bought_bucket , MAX(score) FROM reviews GROUP BY app_bought_bucket """,
    'average_score_over_time': """SELECT date , AVG(score) FROM reviews GROUP BY date ORDER BY date(date)"""
}


def run_queries(db_path):
    """Run every query in ``queries_dict`` and write one CSV per query.

    Output files are written to ``results/<query_name>.csv`` (the directory
    is created if it does not exist).

    Parameters
    ----------
    db_path : str
        Path of the SQLite database containing the ``reviews`` table.
    """
    import os
    # The original crashed with FileNotFoundError when results/ was missing.
    os.makedirs('results', exist_ok=True)

    connection = sqlite3.connect(db_path)
    try:
        for name, query in queries_dict.items():
            result_df = pd.read_sql(query, con=connection)
            result_df.to_csv(f'results/{name}.csv', index=False)
    finally:
        # Close even if a query fails — the original leaked the connection on error.
        connection.close()
82 changes: 82 additions & 0 deletions results/average_score_by_iso.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
iso,AVG(score)
AE,4.714285714285714
AL,5.0
AR,4.9
AT,4.461538461538462
AU,4.50561797752809
BE,4.777777777777778
BG,5.0
BH,4.0
BN,5.0
BR,4.612903225806452
BY,5.0
CA,4.468421052631579
CH,4.818181818181818
CL,4.625
CN,4.862068965517241
CO,5.0
CR,5.0
CY,5.0
CZ,5.0
DE,4.734939759036145
DK,4.678571428571429
DO,5.0
EC,3.6666666666666665
EG,4.666666666666667
ES,4.833333333333333
FI,4.363636363636363
FR,4.722222222222222
GB,4.664576802507837
GR,4.6
GT,5.0
HK,4.6
HN,4.0
HR,4.6
HU,4.333333333333333
ID,4.833333333333333
IE,4.6
IL,5.0
IN,4.84375
IS,4.0
IT,4.9411764705882355
JO,5.0
JP,4.9
KH,5.0
KR,4.2
KW,5.0
KZ,5.0
LB,5.0
LK,5.0
LT,5.0
LV,5.0
MK,5.0
MT,5.0
MX,4.620689655172414
MY,4.407407407407407
NE,5.0
NG,4.5
NL,4.622222222222222
NO,4.775
NZ,4.5
OM,4.0
PE,4.833333333333333
PH,4.857142857142857
PK,4.0
PL,4.666666666666667
PT,5.0
RO,4.714285714285714
RU,4.684210526315789
SE,4.5
SG,4.964285714285714
SI,5.0
SK,5.0
TH,4.888888888888889
TN,5.0
TR,4.75
TW,4.6
TZ,5.0
UA,4.857142857142857
US,4.507191994996873
UY,5.0
VN,5.0
ZA,4.529411764705882
50 changes: 50 additions & 0 deletions results/average_score_over_time.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
date,AVG(score)
2017-05-23,4.3478260869565215
2017-05-24,4.688524590163935
2017-05-25,4.440860215053763
2017-05-26,4.595238095238095
2017-05-27,4.783333333333333
2017-05-28,4.863636363636363
2017-05-29,4.626373626373627
2017-05-30,4.602941176470588
2017-05-31,4.280405405405405
2017-06-01,4.516624040920716
2017-06-02,4.523605150214593
2017-06-03,4.670731707317073
2017-06-04,4.574468085106383
2017-06-05,4.752
2017-06-06,4.71304347826087
2017-06-07,4.536082474226804
2017-06-08,4.538461538461538
2017-06-09,4.595238095238095
2017-06-10,4.59375
2017-06-11,4.791666666666667
2017-06-12,4.589285714285714
2017-06-13,4.597560975609756
2017-06-14,4.388888888888889
2017-06-15,3.625
2017-06-16,3.625
2017-06-17,3.8333333333333335
2017-06-18,3.2777777777777777
2017-06-19,3.2222222222222223
2017-06-20,3.8
2017-06-21,2.3333333333333335
2017-06-22,2.8
2017-06-23,3.090909090909091
2017-06-24,3.2
2017-06-25,2.8
2017-06-26,3.4
2017-06-27,4.562893081761007
2017-06-28,4.667493796526054
2017-06-29,4.644268774703558
2017-06-30,4.67741935483871
2017-07-01,4.71875
2017-07-02,4.689655172413793
2017-07-03,4.734513274336283
2017-07-04,4.785046728971962
2017-07-05,4.534883720930233
2017-07-06,4.537634408602151
2017-07-07,4.525773195876289
2017-07-08,4.677083333333333
2017-07-09,4.466666666666667
2017-07-10,4.928571428571429
Loading