Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
4,457 changes: 4,457 additions & 0 deletions .ipynb_checkpoints/EDA Notebook-checkpoint.ipynb

Large diffs are not rendered by default.

Binary file added ALLEN AYODEJI JAMES_LISUM08::30.12-May-2022.pdf
Binary file not shown.
186 changes: 186 additions & 0 deletions Cab_Flask_Setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 12 19:48:29 2022

@author: allenayodeji
"""

import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


def load_pickles(model_pickle_path, label_encoder_pickle_path):
    """
    Loading pickled model and label encoder from the training stage.

    model_pickle_path: path of the pickled model file.
    label_encoder_pickle_path: path of the pickled label-encoder object
        (used by the callers as a mapping of column name -> encoder).

    Returns the tuple (model, label_encoder_dict).

    Raises whatever ``open`` / ``pickle.load`` raise for a missing or
    corrupt file.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # these paths at artefacts produced by our own training run.
    # ``with`` guarantees both handles are closed, even if unpickling fails.
    with open(model_pickle_path, "rb") as model_file:
        model = pickle.load(model_file)

    with open(label_encoder_pickle_path, "rb") as encoder_file:
        label_encoder_dict = pickle.load(encoder_file)

    return model, label_encoder_dict


def pre_process_data(df, label_encoder_dict):
    """
    Prepare a raw frame for scoring, re-using the encoders saved at
    training time.

    The three columns that are not model features are removed from ``df``
    in place, then every remaining column that has an entry in
    ``label_encoder_dict`` is replaced by that encoder's ``transform``
    output. Columns without an encoder are left untouched.

    Returns the same (mutated) ``df`` object.
    """
    # removed one at a time, in this order, exactly like the training script
    for unused_column in ("Customer_ID", "Transaction_ID", "Payment_Mode"):
        df.drop(unused_column, axis=1, inplace=True)

    for column_name in df.columns:
        if column_name not in label_encoder_dict:
            continue
        encoder = label_encoder_dict[column_name]
        df.loc[:, column_name] = encoder.transform(df.loc[:, column_name])

    return df


def make_predictions(processed_df, model):
    """
    Score the pre-processed rows and return the result as a JSON string.

    For every row the output holds the label from ``model.predict`` and
    the entry at index 1 of the matching ``model.predict_proba`` row
    (the positive-class probability for a binary model).

    Returns a JSON string in "records" orientation with the keys
    "prediction" and "probability".
    """
    predicted_labels = model.predict(processed_df)
    class_probabilities = model.predict_proba(processed_df)

    # keep only the second probability of each row
    positive_probabilities = [row[1] for row in class_probabilities]

    result_frame = pd.DataFrame({
        "prediction": predicted_labels,
        "probability": positive_probabilities,
    })
    return result_frame.to_json(orient="records")


def generate_predictions():
    """
    Run the whole scoring pipeline on the holdout file.

    Reads "holdout_test.json", loads the pickled model and label encoders,
    pre-processes the holdout rows and returns the JSON string produced by
    ``make_predictions``.
    """
    # holdout rows are read first, before any pickle is touched
    holdout_frame = pd.read_json("holdout_test.json")

    fitted_model, encoders = load_pickles(
        "Cab_prediction_model.pkl",
        "Cab_prediction_label_encoders.pkl",
    )

    encoded_frame = pre_process_data(holdout_frame, encoders)
    return make_predictions(encoded_frame, fitted_model)

# When executed as a script, score the holdout file once. The resulting JSON
# string is only bound to `prediction_json`; it is not printed or written.
if __name__ == '__main__':
    prediction_json = generate_predictions()














from flask import Flask, request
import numpy as np
import pandas as pd
import pickle

# Flask application object; routes below register themselves on it.
app = Flask(__name__)

# Load the pickled model once at import time. ``with`` closes the file handle,
# which the bare ``open(...)`` call used to leak.
# NOTE(review): the /predict handler in this file loads its own model from a
# different path - confirm whether 'model.pkl' is still needed here.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def generate_predictions():
    """
    Master-function for applying all steps needed to generate predictions
    for the rows posted to /predict.

    The request body must be JSON: either a list of records, a single
    record (dict of scalars) or a column-oriented dict of lists.

    Returns a JSON string in "records" orientation with the keys
    "prediction" and "probability", or a 400 response when the body is
    empty.
    """
    # pulling the input json out of the request
    payload = request.json
    if not payload:
        return "Request body must be a non-empty JSON object or list of records", 400

    # converting input json to DF. A single record is wrapped in a list so it
    # becomes exactly one row (passing an index sized by the number of keys
    # used to duplicate the record once per key).
    if isinstance(payload, dict):
        column_oriented = any(isinstance(value, (list, tuple))
                              for value in payload.values())
        df = pd.DataFrame(payload if column_oriented else [payload])
    else:
        df = pd.DataFrame(payload)

    # paths to the pickles written by the training script (Cab_Model.py);
    # the previous "churn_prediction_*" names are not produced anywhere.
    model_pickle_path = "Cab_prediction_model.pkl"
    label_encoder_pickle_path = "Cab_prediction_label_encoders.pkl"

    # NOTE(review): the pickles are re-read on every request; consider
    # loading them once at start-up if latency matters.
    model, label_encoder_dict = load_pickles(model_pickle_path,
                                             label_encoder_pickle_path)
    # calling pre-processing functions
    processed_df = pre_process_data(df, label_encoder_dict)
    # calling prediction functions
    prediction, probability = make_predictions(processed_df, model)

    # getting only the probability at index 1 (positive class) for each row
    probabilities = [prob_array[1] for prob_array in probability]

    # packaging the predictions into a DF
    predictions_df = pd.DataFrame({"prediction": prediction,
                                   "probability": probabilities})
    # converting predictions DF to json
    return predictions_df.to_json(orient="records")


def load_pickles(model_pickle_path, label_encoder_pickle_path):
    """
    Loading pickled model and label encoder from the training stage.

    model_pickle_path: path of the pickled model file.
    label_encoder_pickle_path: path of the pickled label-encoder object.

    Returns the tuple (model, label_encoder_dict) as unpickled.
    """
    # NOTE: only unpickle files written by our own training script;
    # pickle.load can execute arbitrary code from the file.
    # Context managers close each file even when unpickling raises.
    with open(model_pickle_path, "rb") as model_pickle_opener:
        model = pickle.load(model_pickle_opener)

    with open(label_encoder_pickle_path, "rb") as label_encoder_pickle_opener:
        label_encoder_dict = pickle.load(label_encoder_pickle_opener)

    return model, label_encoder_dict


def pre_process_data(df, label_encoder_dict):
    """
    Converting all non-numeric columns to numeric, using the saved
    encoder from the training stage.

    df: frame built from the request payload; it is modified in place.
    label_encoder_dict: mapping of column name -> fitted encoder exposing
        ``transform``.

    Returns the same ``df`` object with the non-feature columns removed and
    every column that has an encoder replaced by its encoded values.
    """
    # The cab data names the id column "Customer_ID"; "customerID" is kept
    # for backward compatibility with payloads using the old spelling.
    # errors="ignore" lets a payload omit columns that are not features.
    non_feature_columns = ["Customer_ID", "customerID",
                           "Transaction_ID", "Payment_Mode"]
    df.drop(columns=non_feature_columns, errors="ignore", inplace=True)

    for col in df.columns:
        if col in label_encoder_dict:
            column_le = label_encoder_dict[col]
            # plain column assignment replaces the column, so the encoded
            # values get their own numeric dtype instead of the old one
            df[col] = column_le.transform(df[col])
    return df


def make_predictions(processed_df, model):
    """
    Run the saved model on the pre-processed rows.

    Returns the pair (predictions, probabilities): the outputs of
    ``model.predict`` and ``model.predict_proba`` for ``processed_df``,
    in that order and without any post-processing.
    """
    return model.predict(processed_df), model.predict_proba(processed_df)


# Start the Flask server with its default settings when this file is run
# directly rather than imported.
if __name__ == "__main__":
    app.run()
107 changes: 107 additions & 0 deletions Cab_Model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 29 07:11:15 2022

@author: allenayodeji
"""

import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xlrd

df = pd.read_csv('data_cleaning.csv')

# 'Date of Travel' is converted with xlrd's Excel-date helper (datemode 0)
# and only the calendar date part is kept.
date = [xlrd.xldate_as_datetime(xl_date, 0).date()
        for xl_date in df['Date of Travel']]
df['Date'] = date

# To delete whole columns: the 'Unnamed: 0' column and the raw date column
df = df.drop(['Unnamed: 0', 'Date of Travel'], axis=1)

# To remove spaces in column names
df.columns = df.columns.str.replace(' ', '_')

# profit per trip, with losses floored at 0
profit = (df.Price_Charged - df.Cost_of_Trip)
df['Profit_of_Cabs'] = profit.apply(lambda x: x if x >= 0 else 0)


# saving a holdout test set made of the last 3000 rows
holdout = df.iloc[-3000:, :]
# saving as a json to test later
holdout.to_json("holdout_test.json", orient="records")

# train data is the FIRST 3000 rows.
# NOTE(review): this is not "everything except the holdout"; if that was the
# intent the slice should be df.iloc[:-3000, :]. With fewer than 6000 rows the
# two slices overlap - confirm which behaviour is wanted.
# .copy() makes train an independent frame, because pre_process_data mutates
# it in place and a bare slice of df would trigger SettingWithCopy problems.
train = df.iloc[:3000, :].copy()

def pre_process_data(train):
    """
    Label-encode the training frame and keep the fitted encoders.

    train: training DataFrame; it is modified in place.

    Returns the tuple (train, col_mapper, class_names_mapper) where
    ``col_mapper`` maps column name -> fitted LabelEncoder and
    ``class_names_mapper`` maps column name -> that encoder's ``classes_``.
    """
    # dropping Customer_ID / Transaction_ID / Payment_Mode: they are not
    # used as features (the ids are unique per customer / transaction).
    for unused_column in ("Customer_ID", "Transaction_ID", "Payment_Mode"):
        train.drop(unused_column, axis=1, inplace=True)

    categorical_columns = ['Company', 'City', 'KM_Travelled', 'Price_Charged',
                           'Cost_of_Trip', 'Gender', 'Age',
                           'Income_(USD/Month)', 'Date', 'Profit_of_Cabs']

    # converting all the listed columns to integer codes
    col_mapper = {}
    class_names_mapper = {}
    for col in categorical_columns:
        le = LabelEncoder()
        # plain column assignment replaces the column so the integer codes
        # are stored with their own dtype
        train[col] = le.fit_transform(train[col])
        # saving encoder for each column to be able to inverse-transform later
        col_mapper[col] = le
        class_names_mapper[col] = le.classes_

    # handling issue where numeric columns have blank rows; this runs after
    # the encoding above, so it only affects columns that were not encoded
    train.replace(" ", "0", inplace=True)

    # return the per-column mapping; previously only the classes of the last
    # encoded column were returned, so the pickled "class names dict" was
    # a single array
    return train, col_mapper, class_names_mapper


# applying pre-process function
processed_train, col_mapper, class_names_mapper = pre_process_data(train)


# splitting into X and Y: Profit_of_Cabs is the target, the rest are features
x_train = processed_train.drop("Profit_of_Cabs", axis=1)
y_train = processed_train.loc[:, "Profit_of_Cabs"]


# training out-of-the-box Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)

# getting predictions on the training rows themselves, so the figure printed
# below is training accuracy, not a holdout score
predictions = model.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
# checking accuracy of predictions
print(accuracy)

# The three artefacts below are written with context managers so each file
# is closed even if pickle.dump raises.

# pickling mdl
with open("Cab_prediction_model.pkl", "wb") as pickler:
    pickle.dump(model, pickler)

# pickling le dict
with open("Cab_prediction_label_encoders.pkl", "wb") as pickler:
    pickle.dump(col_mapper, pickler)

# pickling the class-names object returned by pre_process_data
with open("Cab_prediction_class_names.pkl", "wb") as pickler:
    pickle.dump(class_names_mapper, pickler)

Binary file added Cab_prediction_class_names.pkl
Binary file not shown.
Binary file added Cab_prediction_label_encoders.pkl
Binary file not shown.
Binary file added Cab_prediction_model.pkl
Binary file not shown.
Binary file added Data Intake Report.pdf
Binary file not shown.
Loading