-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexplore.py
More file actions
134 lines (102 loc) · 4.26 KB
/
explore.py
File metadata and controls
134 lines (102 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats
def plot_churn_overall(df):
    '''Draw a bar chart of the share of observations in each churn class.

    argument: df, a dataframe with a 'churn' column
    return: None (the chart is drawn on a new 12x6 matplotlib figure)'''
    # one figure holding a single axes for the chart
    axes = plt.figure(figsize=(12, 6)).add_subplot(111)

    # fraction of rows per churn class, one bar per class
    churn_share = df['churn'].value_counts(normalize=True)
    churn_share.plot.bar(ax=axes, color=['#1f77b4', '#ff7f0e'])

    # title and axis labels
    axes.set_title(
        'Proportion of observations of the response variable', fontsize=18
    )
    axes.set_xlabel('churn', fontsize=14)
    axes.set_ylabel('proportion of observations', fontsize=14)
    axes.tick_params(rotation='auto')

    # hide all four borders so only the bars remain
    for side in ('top', 'right', 'bottom', 'left'):
        axes.spines[side].set_visible(False)
def column_split(df):
    '''Split a dataframe's columns into categorical and continuous lists.
    Ex: cat_cols, num_cols = column_split(df)

    A column is categorical when it holds labels (object, string or
    category dtype) or when it has fewer than 10 distinct values.
    Every other column is treated as continuous.

    arguments: dataframe
    return: cat_cols, num_cols (lists of column names, in dataframe order)'''
    cat_cols, num_cols = [], []
    for col in df:
        series = df[col]
        dtype = series.dtype
        # Checking only for dtype 'O' misses pandas' dedicated string and
        # category dtypes, which would then be judged on their distinct-value
        # count alone and could land in the continuous list.
        holds_labels = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        # a numeric column with only a handful of values is treated as
        # categorical as well
        if holds_labels or series.nunique() < 10:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    return cat_cols, num_cols
def stacked_plot(col_to_stack, df):
    '''Plot one stacked percentage bar chart per column, showing how each
    category of that column splits across the values of 'churn'.

    arguments: column list (e.g. cat_cols from column_split), dataframe
    with a 'churn' column
    return: None (one bar chart is drawn per column)'''
    for column in col_to_stack:
        # row-normalised crosstab: every category's bar adds up to 100
        pct_by_category = pd.crosstab(
            df[column], df['churn'], normalize='index'
        ) * 100
        # label the axes that was just drawn instead of relying on
        # pyplot's notion of the current axes
        ax = pct_by_category.plot(kind='bar', stacked=True)
        ax.set_ylabel('Percentage')
        ax.set_xlabel(column)
# answer given by customers who have no internet plan; those rows are left
# out of the add-on service tests below
_NO_INTERNET = 'No internet service'


def _print_churn_chi2(df, feature, skip_value=None):
    '''Run a chi-square test of independence between churn and one column
    and print the statistic and p-value to four decimal places.

    arguments: dataframe with a 'churn' column, name of the column to
    test, optional value of that column whose rows are left out
    return: None (the results are printed)'''
    rows = df if skip_value is None else df[df[feature] != skip_value]
    observed = pd.crosstab(rows['churn'], rows[feature])
    # only the statistic and p-value are reported; the degrees of freedom
    # and expected frequencies that follow them are not needed
    chi2, p, *_ = stats.chi2_contingency(observed)
    print(f'chi^2 = {chi2:.4f}')
    print(f'p = {p:.4f}')


def get_chi_os(df):
    '''print the chi-square statistic and p-value for churn and
    online_security, ignoring customers without internet service'''
    _print_churn_chi2(df, 'online_security', skip_value=_NO_INTERNET)


def get_chi_ob(df):
    '''print the chi-square statistic and p-value for churn and
    online_backup, ignoring customers without internet service'''
    _print_churn_chi2(df, 'online_backup', skip_value=_NO_INTERNET)


def get_chi_dp(df):
    '''print the chi-square statistic and p-value for churn and
    device_protection, ignoring customers without internet service'''
    _print_churn_chi2(df, 'device_protection', skip_value=_NO_INTERNET)


def get_chi_ts(df):
    '''print the chi-square statistic and p-value for churn and
    tech_support, ignoring customers without internet service'''
    _print_churn_chi2(df, 'tech_support', skip_value=_NO_INTERNET)


def get_chi_ct(df):
    '''print the chi-square statistic and p-value for churn and
    contract_type (all rows are used)'''
    _print_churn_chi2(df, 'contract_type')