Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions avg_score_by_iso_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
,0,1
0,AE,4.714285714285714
1,AL,5.0
2,AR,4.9
3,AT,4.461538461538462
4,AU,4.50561797752809
5,BE,4.777777777777778
6,BG,5.0
7,BH,4.0
8,BN,5.0
9,BR,4.612903225806452
10,BY,5.0
11,CA,4.468421052631579
12,CH,4.818181818181818
13,CL,4.625
14,CN,4.862068965517241
15,CO,5.0
16,CR,5.0
17,CY,5.0
18,CZ,5.0
19,DE,4.734939759036145
20,DK,4.678571428571429
21,DO,5.0
22,EC,3.6666666666666665
23,EG,4.666666666666667
24,ES,4.833333333333333
25,FI,4.363636363636363
26,FR,4.722222222222222
27,GB,4.664576802507837
28,GR,4.6
29,GT,5.0
30,HK,4.6
31,HN,4.0
32,HR,4.6
33,HU,4.333333333333333
34,ID,4.833333333333333
35,IE,4.6
36,IL,5.0
37,IN,4.84375
38,IS,4.0
39,IT,4.9411764705882355
40,JO,5.0
41,JP,4.9
42,KH,5.0
43,KR,4.2
44,KW,5.0
45,KZ,5.0
46,LB,5.0
47,LK,5.0
48,LT,5.0
49,LV,5.0
50,MK,5.0
51,MT,5.0
52,MX,4.620689655172414
53,MY,4.407407407407407
54,NE,5.0
55,NG,4.5
56,NL,4.622222222222222
57,NO,4.775
58,NZ,4.5
59,OM,4.0
60,PE,4.833333333333333
61,PH,4.857142857142857
62,PK,4.0
63,PL,4.666666666666667
64,PT,5.0
65,RO,4.714285714285714
66,RU,4.684210526315789
67,SE,4.5
68,SG,4.964285714285714
69,SI,5.0
70,SK,5.0
71,TH,4.888888888888889
72,TN,5.0
73,TR,4.75
74,TW,4.6
75,TZ,5.0
76,UA,4.857142857142857
77,US,4.507191994996873
78,UY,5.0
79,VN,5.0
80,ZA,4.529411764705882
50 changes: 50 additions & 0 deletions avg_score_over_time_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
,0,1
0,2017-05-23,4.3478260869565215
1,2017-05-24,4.688524590163935
2,2017-05-25,4.440860215053763
3,2017-05-26,4.595238095238095
4,2017-05-27,4.783333333333333
5,2017-05-28,4.863636363636363
6,2017-05-29,4.626373626373627
7,2017-05-30,4.602941176470588
8,2017-05-31,4.280405405405405
9,2017-06-01,4.516624040920716
10,2017-06-02,4.523605150214593
11,2017-06-03,4.670731707317073
12,2017-06-04,4.574468085106383
13,2017-06-05,4.752
14,2017-06-06,4.71304347826087
15,2017-06-07,4.536082474226804
16,2017-06-08,4.538461538461538
17,2017-06-09,4.595238095238095
18,2017-06-10,4.59375
19,2017-06-11,4.791666666666667
20,2017-06-12,4.589285714285714
21,2017-06-13,4.597560975609756
22,2017-06-14,4.388888888888889
23,2017-06-15,3.625
24,2017-06-16,3.625
25,2017-06-17,3.8333333333333335
26,2017-06-18,3.2777777777777777
27,2017-06-19,3.2222222222222223
28,2017-06-20,3.8
29,2017-06-21,2.3333333333333335
30,2017-06-22,2.8
31,2017-06-23,3.090909090909091
32,2017-06-24,3.2
33,2017-06-25,2.8
34,2017-06-26,3.4
35,2017-06-27,4.562893081761007
36,2017-06-28,4.667493796526054
37,2017-06-29,4.644268774703558
38,2017-06-30,4.67741935483871
39,2017-07-01,4.71875
40,2017-07-02,4.689655172413793
41,2017-07-03,4.734513274336283
42,2017-07-04,4.785046728971962
43,2017-07-05,4.534883720930233
44,2017-07-06,4.537634408602151
45,2017-07-07,4.525773195876289
46,2017-07-08,4.677083333333333
47,2017-07-09,4.466666666666667
48,2017-07-10,4.928571428571429
6 changes: 6 additions & 0 deletions max_score_by_app_bought_grouping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
,0,1
0,"(-0.001, 20.0]",5
1,"(20.0, 38.0]",5
2,"(38.0, 58.0]",5
3,"(58.0, 79.0]",5
4,"(79.0, 100.0]",5
34 changes: 34 additions & 0 deletions reddit_data_SQL_insert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import sqlite3
import pandas as pd

def populate_database(database_path, data_path):
    """Bulk-load the cleaned review CSV at *data_path* into the ``reviews``
    table of the SQLite database at *database_path*.

    Failures (missing file, schema mismatch, locked database, ...) are
    printed rather than raised, preserving the script's best-effort
    behaviour; callers do not expect exceptions to propagate.
    """
    database = None
    try:
        data = pd.read_csv(data_path)

        # One positional placeholder per column of the reviews table.
        # NOTE(review): clean_data() writes its CSV with the index column,
        # but this read_csv has no index_col — that would give 10 values per
        # row against 9 placeholders. Confirm the CSV layout matches.
        sql_insert_reddit_database = '''INSERT INTO reviews(review, title, iso,
                                score, date, apps_bought, money_spent,
                                apps_bought_bucket, money_spent_bucket)
                                VALUES(?,?,?,?,?,?,?,?,?)'''

        # executemany() wants an iterable of row tuples, not numpy rows.
        tupled_data = [tuple(i) for i in data.values]

        database = sqlite3.connect(database_path)
        cursor = database.cursor()
        cursor.executemany(sql_insert_reddit_database, tupled_data)
        database.commit()
    except Exception as e:
        # Deliberately broad: the original swallowed and printed any error.
        print(e)
    finally:
        # Always release the connection, even when the insert fails
        # (the original leaked it on any error after connect()).
        if database is not None:
            database.close()


# Script entry point: load the cleaned review CSV into the SQLite database.
populate_database("exercise_database.db", "reddit_exercise_data_clean.csv")

"""NB I did see that there was a one-line code you coud use to add a pandas
dataframe to SQL but 1) I've never used SQL before so it was more fun to learn
it and 2) it felt like it was going against the "spirit" of the challenge"""
46 changes: 46 additions & 0 deletions reddit_data_SQL_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import sqlite3
import pandas as pd

commands = {
"avg_score_by_iso_grouping": '''SELECT iso, AVG(score)
FROM reviews
GROUP BY iso''',
"max_score_by_app_bought_grouping": '''SELECT apps_bought_bucket, MAX(score)
FROM reviews
GROUP BY apps_bought_bucket''',
"avg_score_over_time_grouping": '''SELECT date, AVG(score)
FROM reviews
GROUP BY date'''
}

def sql_query(command, database_path):
    """Execute one read-only SQL *command* against the SQLite database at
    *database_path*.

    Returns the fetched rows as a list of tuples, or None when the
    connection or the query fails (the error is printed, not raised,
    preserving the original best-effort behaviour).
    """
    try:
        database = sqlite3.connect(database_path)
        try:
            cursor = database.cursor()
            cursor.execute(command)
            return cursor.fetchall()
        finally:
            # Close even when execute() raises — the original leaked the
            # connection on any query error.
            database.close()
    except sqlite3.Error as e:
        # Narrowed from `except Exception`: only database errors are
        # expected here, and they are reported rather than propagated.
        print(e)

def write_all_to_csv(database_path):
    """Run every canned query in `commands` against *database_path* and save
    each result set to '<query name>.csv' in the working directory."""
    for name, statement in commands.items():
        rows = sql_query(statement, database_path)
        pd.DataFrame(rows).to_csv(name + ".csv")


# Script entry point: run every canned query and dump each result to CSV.
write_all_to_csv("exercise_database.db")










56 changes: 56 additions & 0 deletions reddit_data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pandas as pd
import re
from datetime import datetime

def strip_dates(date):
    """Drop any time-of-day portion: everything from the first space on."""
    if ' ' not in date:
        return date
    return re.sub(r' .*$', '', date)

def uniform_format_dates(date):
    """Normalise assorted date spellings to ISO 'YYYY-MM-DD'.

    Handles 'MM/DD/YY' and 'YY-MM-DD'; any other string is returned
    unchanged (it is assumed to already be 'YYYY-MM-DD').
    Assumes these are the only odd formats present in this data set.
    """
    if "/" in date:
        # Slash format is assumed unambiguous, so a parse failure here is a
        # real data error and is allowed to propagate (as in the original).
        return datetime.strptime(date, '%m/%d/%y').strftime('%Y-%m-%d')
    try:
        return datetime.strptime(date, '%y-%m-%d').strftime('%Y-%m-%d')
    except ValueError:
        # Narrowed from a bare `except:` — only a failed parse means
        # "already ISO formatted"; anything else should not be hidden.
        return date

def clean_dates(date):
    """Fully normalise one raw date string: strip the time portion, then
    unify the year/month/day format to ISO."""
    return uniform_format_dates(strip_dates(date))

def clean_data(dirty_data_location, clean_data_location='reddit_exercise_data_clean.csv'):
    """Read the raw review CSV at *dirty_data_location*, clean it, and write
    the result to *clean_data_location* (default matches the original
    hard-coded output name, so existing callers are unaffected).

    Cleaning steps: add quantile buckets for spend/purchase counts,
    normalise the date column, and drop the redundant product_name column.
    """
    data = pd.read_csv(dirty_data_location, index_col=0)

    # Quantile bucketing, because fixed-width buckets could be meaningless
    # if there are outliers. Five buckets chosen arbitrarily.
    data['app_bought_bucket'] = pd.qcut(data['app_bought'], 5)
    data['money_spent_bucket'] = pd.qcut(data['money_spent'], 5)

    # Inspection of the data shows the date formats are inconsistent.
    data['date'] = data['date'].apply(clean_dates)

    # product_name contains no unique info, delete.
    data = data.drop(columns="product_name")

    # Write the cleaned data out as a new .csv file.
    data.to_csv(clean_data_location)

    return None


# Script entry point: clean the raw export and write the cleaned CSV.
dirty_data_location = 'reddit_exercise_data.csv'
clean_data(dirty_data_location)










Loading