Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.pyc
*.pyo
*.DS_Store
21 changes: 21 additions & 0 deletions exercise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from functions.data_cleaning import clean_data
from functions.populate_db import populate_db
from functions.sql_queries import run_queries


import os

# Path of the raw input CSV file:
original_csv_path = 'reddit_exercise_data.csv'

# Clean the raw data into a DataFrame ready for the database:
df = clean_data(original_csv_path)

# Make sure the output directory exists before writing into it
# (to_csv / run_queries both write under results/ and would otherwise
# fail with FileNotFoundError on a fresh checkout).
os.makedirs('results', exist_ok=True)

# Persist the cleaned data so populate_db can load it:
clean_data_path = 'results/clean_data.csv'
df.to_csv(clean_data_path, encoding='utf-8', index=False)

# Load the cleaned CSV into the SQLite database:
db_path = 'exercise_database.db'
populate_db(db_path, clean_data_path)

# Run the metrics SQL queries; each result is saved to its own CSV under results/:
run_queries(db_path)
Binary file modified exercise_database.db
Binary file not shown.
2 changes: 2 additions & 0 deletions functions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-

40 changes: 40 additions & 0 deletions functions/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import pandas as pd
import datetime

def clean_data(csv_path):
    """Read the raw reviews CSV and return a cleaned DataFrame.

    Cleaning steps:
      * cast ``money_spent`` to float (becomes NUMERIC in SQLite),
      * drop ``product_name`` (single constant value, no information),
      * normalise ``date`` to the ISO ``YYYY-MM-DD`` string format that
        SQLite date functions understand (https://www.sqlite.org/lang_datefunc.html),
      * add equal-width bucket columns for ``app_bought`` and ``money_spent``.

    Parameters
    ----------
    csv_path : str or file-like
        Path (or buffer) of the raw CSV file; must contain the columns
        ``money_spent``, ``product_name``, ``date``, ``app_bought``.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, including the two new ``*_bucket`` columns.
    """
    df = pd.read_csv(csv_path, encoding='utf-8')

    # Vectorised cast instead of a per-row lambda; same result, faster.
    df['money_spent'] = df['money_spent'].astype(float)

    # product_name only has one value (checked with value_counts()) telling us
    # this is about the Reddit app, so it carries no information — drop it.
    df = df.drop(columns=['product_name'])

    # Every row's time is 00:00, so only the date part matters, but the date
    # format changes between rows: all dates are year-first except the ones
    # using '/' instead of '-', which are month-first with a 2-digit year.
    def _to_year_first(date_string):
        # Normalise 'MM/DD/YY' to 'YYYY-MM-DD'; year-first strings pass through.
        if '/' in date_string:
            return datetime.datetime.strptime(date_string, '%m/%d/%y').strftime('%Y-%m-%d')
        return date_string

    df['date'] = df['date'].apply(_to_year_first)
    # Parse and re-format so every remaining year-first variant ends up ISO.
    df['date'] = pd.to_datetime(df['date'], yearfirst=True).dt.strftime('%Y-%m-%d')

    # Bucket columns for apps bought and money spent.
    # EDA suggested hand-picked bins, e.g.:
    #   bins_apps  = [-1, 1, 5, 10, 25, 50, 100]
    #   bins_money = [-1, 1, 5, 10, 50, 100, 200, 300, 400, 450, 500]
    # but the business requirements are unknown and this must stay
    # production-ready for any data, so use 20 equal-width bins instead.
    df['app_bought_bucket'] = pd.cut(df['app_bought'], 20)
    df['money_spent_bucket'] = pd.cut(df['money_spent'], 20)

    return df
9 changes: 9 additions & 0 deletions functions/populate_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
import pandas as pd
import sqlite3

def populate_db(db_path, clean_data_path):
    """Load the cleaned CSV into the ``reviews`` table of a SQLite database.

    Any existing ``reviews`` table is replaced (``if_exists='replace'``).

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file (created if missing).
    clean_data_path : str
        Path of the cleaned CSV produced by ``clean_data``.
    """
    # Read the CSV before opening the connection so a parse error cannot
    # leave a dangling database handle.
    clean_data = pd.read_csv(clean_data_path)
    connection = sqlite3.connect(db_path)
    try:
        clean_data.to_sql(name='reviews', con=connection, if_exists='replace', index=False)
    finally:
        # Close even if to_sql fails — the original leaked the connection on error.
        connection.close()
18 changes: 18 additions & 0 deletions functions/sql_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
import pandas as pd
import sqlite3

# Metric name -> SQL query; each query is run against the `reviews` table
# and its result saved as results/<name>.csv.
queries_dict = {
    'average_score_by_iso': """SELECT iso ,AVG(score) FROM reviews GROUP BY iso """,
    'max_score_by_app_bucket': """SELECT app_bought_bucket , MAX(score) FROM reviews GROUP BY app_bought_bucket """,
    'average_score_over_time': """SELECT date , AVG(score) FROM reviews GROUP BY date ORDER BY date(date)"""
}


def run_queries(db_path):
    """Run every query in ``queries_dict`` and write one CSV per query.

    Output files are written to ``results/<query_name>.csv`` (the directory
    is created if it does not exist).

    Parameters
    ----------
    db_path : str
        Path of the SQLite database containing the ``reviews`` table.
    """
    import os
    # The original crashed with FileNotFoundError when results/ was missing.
    os.makedirs('results', exist_ok=True)

    connection = sqlite3.connect(db_path)
    try:
        for name, query in queries_dict.items():
            result_df = pd.read_sql(query, con=connection)
            result_df.to_csv(f'results/{name}.csv', index=False)
    finally:
        # Close even if a query fails — the original leaked the connection on error.
        connection.close()
82 changes: 82 additions & 0 deletions results/average_score_by_iso.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
iso,AVG(score)
AE,4.714285714285714
AL,5.0
AR,4.9
AT,4.461538461538462
AU,4.50561797752809
BE,4.777777777777778
BG,5.0
BH,4.0
BN,5.0
BR,4.612903225806452
BY,5.0
CA,4.468421052631579
CH,4.818181818181818
CL,4.625
CN,4.862068965517241
CO,5.0
CR,5.0
CY,5.0
CZ,5.0
DE,4.734939759036145
DK,4.678571428571429
DO,5.0
EC,3.6666666666666665
EG,4.666666666666667
ES,4.833333333333333
FI,4.363636363636363
FR,4.722222222222222
GB,4.664576802507837
GR,4.6
GT,5.0
HK,4.6
HN,4.0
HR,4.6
HU,4.333333333333333
ID,4.833333333333333
IE,4.6
IL,5.0
IN,4.84375
IS,4.0
IT,4.9411764705882355
JO,5.0
JP,4.9
KH,5.0
KR,4.2
KW,5.0
KZ,5.0
LB,5.0
LK,5.0
LT,5.0
LV,5.0
MK,5.0
MT,5.0
MX,4.620689655172414
MY,4.407407407407407
NE,5.0
NG,4.5
NL,4.622222222222222
NO,4.775
NZ,4.5
OM,4.0
PE,4.833333333333333
PH,4.857142857142857
PK,4.0
PL,4.666666666666667
PT,5.0
RO,4.714285714285714
RU,4.684210526315789
SE,4.5
SG,4.964285714285714
SI,5.0
SK,5.0
TH,4.888888888888889
TN,5.0
TR,4.75
TW,4.6
TZ,5.0
UA,4.857142857142857
US,4.507191994996873
UY,5.0
VN,5.0
ZA,4.529411764705882
50 changes: 50 additions & 0 deletions results/average_score_over_time.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
date,AVG(score)
2017-05-23,4.3478260869565215
2017-05-24,4.688524590163935
2017-05-25,4.440860215053763
2017-05-26,4.595238095238095
2017-05-27,4.783333333333333
2017-05-28,4.863636363636363
2017-05-29,4.626373626373627
2017-05-30,4.602941176470588
2017-05-31,4.280405405405405
2017-06-01,4.516624040920716
2017-06-02,4.523605150214593
2017-06-03,4.670731707317073
2017-06-04,4.574468085106383
2017-06-05,4.752
2017-06-06,4.71304347826087
2017-06-07,4.536082474226804
2017-06-08,4.538461538461538
2017-06-09,4.595238095238095
2017-06-10,4.59375
2017-06-11,4.791666666666667
2017-06-12,4.589285714285714
2017-06-13,4.597560975609756
2017-06-14,4.388888888888889
2017-06-15,3.625
2017-06-16,3.625
2017-06-17,3.8333333333333335
2017-06-18,3.2777777777777777
2017-06-19,3.2222222222222223
2017-06-20,3.8
2017-06-21,2.3333333333333335
2017-06-22,2.8
2017-06-23,3.090909090909091
2017-06-24,3.2
2017-06-25,2.8
2017-06-26,3.4
2017-06-27,4.562893081761007
2017-06-28,4.667493796526054
2017-06-29,4.644268774703558
2017-06-30,4.67741935483871
2017-07-01,4.71875
2017-07-02,4.689655172413793
2017-07-03,4.734513274336283
2017-07-04,4.785046728971962
2017-07-05,4.534883720930233
2017-07-06,4.537634408602151
2017-07-07,4.525773195876289
2017-07-08,4.677083333333333
2017-07-09,4.466666666666667
2017-07-10,4.928571428571429
Loading