From f626822a9bb8956b9c0be0075d349885f8087654 Mon Sep 17 00:00:00 2001 From: rz233 <43656240+rz233@users.noreply.github.com> Date: Tue, 10 Aug 2021 14:19:08 -0400 Subject: [PATCH 1/2] Update checker.py --- data-checks/checker.py | 188 ++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 116 deletions(-) diff --git a/data-checks/checker.py b/data-checks/checker.py index 155f333..2703781 100644 --- a/data-checks/checker.py +++ b/data-checks/checker.py @@ -1,15 +1,11 @@ -# --------------------------------------------------------- -# Settings import os - import pandas as pd -pd.options.plotting.backend = "plotly" - import numpy as np import plotly import plotly.graph_objects as go import plotly.express as px import argparse +from plotly.subplots import make_subplots @@ -35,6 +31,7 @@ def __init__(self, level = None, ind_dict = None, prefix = None, + postfix = None, col_names_dict = None, htrahshold = -3): """ @@ -58,8 +55,6 @@ def __init__(self, self.outputs_path = self.path + '/' + 'out' if not os.path.exists(self.outputs_path): os.mkdir(self.outputs_path) - else: - self.outputs_path = outputs_path # List files in path self.files = os.listdir(self.path) @@ -76,7 +71,8 @@ def __init__(self, # 'i8' : 'mean_distance_per_week.csv', # 'i9': 'week_home_vs_day_location_per_day.csv', # 'i10' : 'origin_destination_matrix_time_per_day.csv', - # 'i11': 'unique_subscriber_home_locations_per_month.csv'} + # 'i11': 'unique_subscriber_home_locations_per_month.csv' + else: self.ind_dict = ind_dict @@ -85,6 +81,11 @@ def __init__(self, else: self.ind_dict = {k:prefix+v for (k,v) in self.ind_dict.items()} + if postfix is None: + pass + else: + self.ind_dict = {k:v+postfix for (k,v) in self.ind_dict.items()} + # Check if files exist files_bol = all([os.path.isfile(self.path + '/' + self.ind_dict[key]) for key in self.ind_dict.keys()]) assert files_bol,"Some indicators don't exist. Check defaults or set self.ind_dict" @@ -126,8 +127,8 @@ def load(file_path, timevar = None): # Load data df = pd.read_csv(file_path) # Patch cleannig of headers in the middle of the data - c1_name = df.columns[0] - df = df[~df[c1_name].astype(str).str.contains(c1_name)] + # c1_name = df.columns[0] + # df = df[~df[c1_name].astype(str).str.contains(c1_name)] # Convert date vars if timevar is None: timevar = df.columns[0] @@ -163,13 +164,8 @@ def time_complete(data, timevar = None, timefreq = 'D'): data = data.reindex(full_time_range, fill_value=0) return(data) - # Bolleans - i1bol = 'i1' in self.ind_dict - i3bol = 'i3' in self.ind_dict - i5bol = 'i5' in self.ind_dict - # Indicator 1 - if i1bol: + if 'i1' in self.ind_dict: # self.i1_hour = remove_missings(self.i1, regionvar = self.col_names_dict['i1']['Geography'])\ # .groupby(['date', self.col_names_dict['i1']['Time']])\ # .agg({self.col_names_dict['i1']['Geography'] : pd.Series.nunique , @@ -189,7 +185,7 @@ def time_complete(data, timevar = None, timefreq = 'D'): self.i1_date = time_complete(self.i1_date, 'date') # Indicator 3 - if i3bol: + if 'i3' in self.ind_dict: self.i3_date = remove_missings(self.i3, regionvar = self.col_names_dict['i3']['Geography'])\ .groupby('date')\ .agg({self.col_names_dict['i3']['Geography'] : pd.Series.nunique , @@ -201,7 +197,7 @@ def time_complete(data, timevar = None, timefreq = 'D'): self.i3_date = time_complete(self.i3_date, 'date') # Indicator 5 - if i5bol: + if 'i5' in self.ind_dict: i5_nmissing = remove_missings(remove_missings(self.i5, self.col_names_dict['i5']['Geography_from']), self.col_names_dict['i5']['Geography_to']) self.i5_date = i5_nmissing\ @@ -217,49 +213,8 @@ def time_complete(data, timevar = None, timefreq = 'D'): # Remove first day for plots since it doesn't have movements from the day before # so it is biased by definition. self.i5_date = self.i5_date[~(self.i5_date.index == self.i5_date.index.min())] - - # Create a merged dataset making sure all indicators are in the same resolution - if (i1bol & i3bol): - index = [self.col_names_dict['i1']['Time'], self.col_names_dict['i1']['Geography']] - - i3 = self.i3.rename(columns = {self.col_names_dict['i3']['Count'] : 'subs'}) - self.merged = self.i1\ - .groupby(index)\ - .agg({self.col_names_dict['i1']['Count'] : np.sum})\ - .reset_index()\ - .merge(i3, on = index, how = 'outer')\ - .fillna(0)\ - .rename(columns = {self.col_names_dict['i1']['Count'] : 'trans'}) - - if i5bol: - # Just movements out of region - i5_org = self.i5[[self.col_names_dict['i5']['Time'], - self.col_names_dict['i5']['Geography_from'], - self.col_names_dict['i5']['Count']]]\ - .rename(columns = {self.col_names_dict['i5']['Count'] : 'mov_out', - self.col_names_dict['i5']['Time'] : index[0], - self.col_names_dict['i5']['Geography_from'] : index[1]})\ - .groupby(index)\ - .agg({'mov_out' : np.sum})\ - .reset_index() - - # Just movements into a region - i5_dest = self.i5[[self.col_names_dict['i5']['Time'], - self.col_names_dict['i5']['Geography_to'], - self.col_names_dict['i5']['Count']]]\ - .rename(columns = {self.col_names_dict['i5']['Count'] : 'mov_in', - self.col_names_dict['i5']['Time'] : index[0], - self.col_names_dict['i5']['Geography_to'] : index[1]})\ - .groupby(index)\ - .agg({'mov_in' : np.sum})\ - .reset_index() - - self.merged = self.merged\ - .merge(i5_org, on = index, how = 'outer')\ - .merge(i5_dest, on = index, how = 'outer')\ - .fillna(0) - # --------------------------------------------------------- + # --------------------------------------------------------- # Plots def plot_i1_hist(self, show = True, export = True): @@ -282,7 +237,7 @@ def plot_i1_count(self, show = True, export = True): fig.update_layout(title_text="Indicator 1: Total number of transactions.") print("Plotting indicator 1 daily count series...") - + if export: file_name = self.outputs_path + '/' + 'i1_count.html' print('Saving: ' + file_name) @@ -341,7 +296,6 @@ def plot_i5_count(self, show = True, export = True): plotly.offline.plot(fig, filename = file_name, auto_open=False) if show: fig.show() - def plot_i5_region_count(self, show = True, export = True): fig = go.Figure() fig.add_trace(go.Scatter(x=self.i5_date.index, @@ -371,51 +325,72 @@ def plot_region_missings(self, show = True, export = True): plotly.offline.plot(fig, filename = file_name, auto_open=False) if show: fig.show() - - # Subscribers vs transactions scatter - def plot_subs_v_trans(self, show = True, export = True): - df = self.merged - # Variable to mark red obs with zeros - df['color'] = np.where((df['trans']== 0) | (df['subs']== 0), - 'zeros', - 'normal') - i1_i3 = df\ - .rename(columns = {self.col_names_dict['i1']['Time'] : 'Date', - self.col_names_dict['i1']['Geography'] : 'Region', - 'subs' : 'Number of subscribers', - 'trans': 'Number of transactions'}) - fig = i1_i3.plot.scatter(x="Number of subscribers", - y="Number of transactions", - color = 'color', - hover_data=['Date', 'Region'], - title = 'Number of subscrivers vs number of transactions.') - fig.update_layout(showlegend=False) - - print("Plotting indicators 1 and 3 scatter...") + + + def plot_i1_i3_count(self, show = True, export = True): + + y = self.i1_date[self.col_names_dict['i1']['Count']]/self.i3_date[self.col_names_dict['i3']['Count']] + + fig = go.Figure(data=go.Scatter(x=self.i1_date.index, + y=y)) + + fig.update_layout(title_text="Indicator 1/ Indicator 3: Total number of transactions per subscriber.") + + print("Plotting Total number of transactions per subscriber...") + if export: - file_name = self.outputs_path + '/' + 'i3_vs_i1.html' + file_name = self.outputs_path + '/' + 'i1_per_i3_count.html' print('Saving: ' + file_name) plotly.offline.plot(fig, filename = file_name, auto_open=False) if show: fig.show() + + def plot_i5_i3_count(self, show = True, export = True): + + y = self.i5_date[self.col_names_dict['i5']['Count']]/self.i3_date[self.col_names_dict['i3']['Count']] + + fig = go.Figure(data=go.Scatter(x=self.i3_date.index, + y=y)) + + fig.update_layout(title_text="Indicator 5/ Indicator 3: Total number of movements per subscriber.") + + print("Plotting Total number of movements per subscriber...") + + if export: + file_name = self.outputs_path + '/' + 'i5_per_i3_count.html' + print('Saving: ' + file_name) + plotly.offline.plot(fig, filename = file_name, auto_open=False) + if show: + fig.show() + + + + + + + - # --------------------------------------------------------- + # --------------------------------------------------------- # Check pipelines - def completeness_checks(self, export = True, show = True): + def completeness_checks(self, export = True): if 'i1' in self.ind_dict: - self.plot_i1_hist(export = export, show = show) - self.plot_region_missings(export = export, show = show) - self.plot_i1_count(export = export, show = show) - self.plot_i1_n_regions(export = export, show = show) + self.plot_i1_hist(export = export) + self.plot_region_missings(export = export) + self.plot_i1_count(export = export) + self.plot_i1_n_regions(export = export) if 'i3' in self.ind_dict: - self.plot_i3_hist(export = export, show = show) - self.plot_i3_count(export = export, show = show) + self.plot_i3_hist(export = export) + self.plot_i3_count(export = export) if 'i5' in self.ind_dict: - self.plot_i5_count(export = export, show = show) + self.plot_i5_count(export = export) # self.plot_i5_region_count(export = export) - if ('i1' in self.ind_dict) & ('i3' in self.ind_dict): - self.plot_subs_v_trans(export = export, show = show) + if ('i1' in self.ind_dict) & ('i3' in self.ind_dict): + self.plot_i1_i3_count(export = export) + + if ('i5' in self.ind_dict) & ('i3' in self.ind_dict): + self.plot_i5_i3_count(export = export) + # USAGE OUTILERS: Indicator wards and days with towers down def usage_outliers(self, htrahshold = None): @@ -460,29 +435,10 @@ def usage_outliers(self, htrahshold = None): file.close() + # --------------------------------------------------------- # Run script from the terminal -# if __name__ == "__main__": - -# # Initializ parser -# parser = argparse.ArgumentParser() - -# # Adding optional argument -# parser.add_argument("-p", "--Path") -# parser.add_argument("--Prefix") -# parser.add_argument("--Output") - -# # Read arguments from command line -# args = parser.parse_args() - -# # Create checker instance -# indicators_checker = checker(path = args.Path, prefix = args.Prefix, outputs_path = args.Output) -# #------------------------------------------------------------------------------ -# # Export completeness plots -# indicators_checker.completeness_checks() -# #------------------------------------------------------------------------------ -# # Export towers down sheet -# indicators_checker.usage_outliers() + From bbcac73a20157f3f3d7b2f98365ed80046ad9386 Mon Sep 17 00:00:00 2001 From: rz233 <43656240+rz233@users.noreply.github.com> Date: Tue, 10 Aug 2021 16:57:59 -0400 Subject: [PATCH 2/2] Update checker.py --- data-checks/checker.py | 51 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/data-checks/checker.py b/data-checks/checker.py index 2703781..0baec09 100644 --- a/data-checks/checker.py +++ b/data-checks/checker.py @@ -235,6 +235,7 @@ def plot_i1_count(self, show = True, export = True): fig = go.Figure(data=go.Scatter(x=self.i1_date.index, y=self.i1_date[self.col_names_dict['i1']['Count']])) fig.update_layout(title_text="Indicator 1: Total number of transactions.") + fig.update_yaxes(rangemode="tozero") print("Plotting indicator 1 daily count series...") @@ -262,6 +263,7 @@ def plot_i3_count(self, show = True, export = True): fig = go.Figure(data=go.Scatter(x=self.i3_date.index, y=self.i3_date[self.col_names_dict['i3']['Count']])) fig.update_layout(title_text="Indicator 3: Total number of daily active subscribers.") + fig.update_yaxes(rangemode="tozero") print("Plotting indicator 3 histogram...") if export: @@ -289,6 +291,8 @@ def plot_i5_count(self, show = True, export = True): fig = go.Figure(data=go.Scatter(x=self.i5_date.index, y=self.i5_date[self.col_names_dict['i5']['Count']])) fig.update_layout(title_text="Indicator 5: Total number of movements.") + fig.update_yaxes(rangemode="tozero") + print("Plotting indicator 5 daily movement counts...") if export: file_name = self.outputs_path + '/' + 'i5_count.html' @@ -335,6 +339,7 @@ def plot_i1_i3_count(self, show = True, export = True): y=y)) fig.update_layout(title_text="Indicator 1/ Indicator 3: Total number of transactions per subscriber.") + fig.update_yaxes(rangemode="tozero") print("Plotting Total number of transactions per subscriber...") @@ -354,6 +359,7 @@ def plot_i5_i3_count(self, show = True, export = True): y=y)) fig.update_layout(title_text="Indicator 5/ Indicator 3: Total number of movements per subscriber.") + fig.update_yaxes(rangemode="tozero") print("Plotting Total number of movements per subscriber...") @@ -363,10 +369,11 @@ def plot_i5_i3_count(self, show = True, export = True): plotly.offline.plot(fig, filename = file_name, auto_open=False) if show: fig.show() + + - - + @@ -390,6 +397,46 @@ def completeness_checks(self, export = True): if ('i5' in self.ind_dict) & ('i3' in self.ind_dict): self.plot_i5_i3_count(export = export) + + if ('i5' in self.ind_dict) & ('i3' in self.ind_dict) & ('i1' in self.ind_dict): + + html_string = ''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + ''' + + file_name = self.outputs_path + '/' + 'data_check_report.html' + f = open(file_name,'w') + f.write(html_string) + f.close() + # USAGE OUTILERS: Indicator wards and days with towers down