# script.py from the yiboTR/Transforming-Data-into-Features repository (main branch) on GitHub

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Load the profile data from profiles.csv into a DataFrame.
df = pd.read_csv("profiles.csv")

# Question 1 (classification): can a user's gender be predicted from
# status, income, job, height, body type, and orientation?
# Question 2 (regression): can a user's height be predicted from
# diet, body type, gender, and drug use?

# Encode the categorical columns as integer codes.
# Values missing from a lookup table map to NaN.
status_codes = {
    "single": 0,
    "seeing_someone": 1,
    "available": 2,
    "married": 3,
    "unknown": 4,
}
sex_codes = {"m": 0, "f": 1}
orientation_codes = {"straight": 0, "gay": 1, "bisexual": 2}

df["status_code"] = df["status"].map(status_codes)
df["sex_code"] = df["sex"].map(sex_codes)
df["sexuality_code"] = df["orientation"].map(orientation_codes)
# Encode the job column as integer codes. The codes are derived from the
# position in this list so that every category is guaranteed a unique code
# (a hand-numbered table previously gave two categories the same code).
# Values not listed here map to NaN.
job_categories = [
    "other",
    "student",
    "science / tech / engineering",
    "computer / hardware / software",
    "artistic / musical / writer",
    "sales / marketing / biz dev",
    "medicine / health",
    "education / academia",
    "executive / management",
    "banking / financial / real estate",
    "entertainment / media",
    "law / legal services",
    "hospitality / travel",
    "construction / craftsmanship",
    "clerical / administrative",
    "political / government",
    "rather not say",
    "transportation",
    "unemployed",
    "retired",
    "military",
]
df["job_code"] = df["job"].map({name: code for code, name in enumerate(job_categories)})
# Encode body type as integer codes; each category's code is its position
# in this list. Values not listed here map to NaN.
body_type_categories = [
    "average",
    "fit",
    "athletic",
    "thin",
    "curvy",
    "a little extra",
    "skinny",
    "full figured",
    "overweight",
    "jacked",
    "used up",
    "rather not say",
]
body_type_codes = {name: code for code, name in enumerate(body_type_categories)}
df["body_code"] = df["body_type"].map(body_type_codes)
#Normalize
# Build the classification inputs (features) and target (labels).
# Incomplete rows are dropped from features and labels TOGETHER, so every
# remaining feature row stays paired with its own label. Dropping NaNs from
# each frame separately and then truncating the labels to a fixed length
# would pair features with labels from different profiles.
feature_columns = ["status_code", "income", "job_code", "sexuality_code", "height", "body_code"]
label_columns = ["sex_code"]
complete_rows = df[feature_columns + label_columns].dropna()

x = complete_rows[feature_columns].values
y = complete_rows[label_columns].values

# Min-max scale the features only; the labels are class codes and are
# used as-is.
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)

# Both frames share the index of the surviving rows.
features = pd.DataFrame(x_scaled, columns=feature_columns, index=complete_rows.index)
labels = pd.DataFrame(y, columns=label_columns, index=complete_rows.index)

# Number of complete rows available for modelling.
print(len(features.index))

#Making data splits
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 1)

# The label frames have a single column; flatten them once to the 1-D
# arrays passed to fit/score instead of re-flattening on every iteration.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

# Sweep one hyperparameter value per model: k for K-Nearest Neighbors and
# max_depth for the Random Forest, recording the test-set score of each.
sweep_values = range(1, 151)
knn_scores = []
congress_approval = []
for k in sweep_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train_flat)
    knn_scores.append(knn.score(X_test, y_test_flat))
    #I call Random Forests Congress because they run on votes and every tree is confused
    congress = RandomForestClassifier(max_depth = k)
    congress.fit(X_train, y_train_flat)
    congress_approval.append(congress.score(X_test, y_test_flat))

#Visualize best K/Max depth
plt.plot(sweep_values, knn_scores)
plt.plot(sweep_values, congress_approval)
plt.xlabel("k/max depth values")
plt.ylabel("Scores")
plt.show()
#Setting up actual models for analysis
# Flatten the single-column label frames to 1-D arrays for fitting and scoring.
final_y_train = y_train.values.ravel()
final_y_test = y_test.values.ravel()

#KNN
# k is fixed at 147 here; presumably chosen from the sweep plot — confirm.
knn_classifier = KNeighborsClassifier(n_neighbors=147)
knn_classifier.fit(X_train, final_y_train)
knn_predictions = knn_classifier.predict(X_test)
knn_precision = precision_score(final_y_test, knn_predictions)
knn_recall = recall_score(final_y_test, knn_predictions)
knn_accuracy = accuracy_score(final_y_test, knn_predictions)

#Random Forest
# Uses scikit-learn's default settings (no max_depth given).
congress_classifier = RandomForestClassifier()
congress_classifier.fit(X_train, final_y_train)
congress_predictions = congress_classifier.predict(X_test)
congress_precision = precision_score(final_y_test, congress_predictions)
congress_recall = recall_score(final_y_test, congress_predictions)
congress_accuracy = accuracy_score(final_y_test, congress_predictions)

# Report the test-set metrics of both models side by side.
print(f'''KNN Precision: {knn_precision}
KNN Recall: {knn_recall}
KNN Accuracy: {knn_accuracy}
RF Precision: {congress_precision}
RF Recall: {congress_recall}
RF Accuracy: {congress_accuracy}''')
# Encode diet and drug use as integer codes for the regression models.
# Each diet's code is its position in this list; values not listed map to NaN.
diet_categories = [
    "mostly anything",
    "anything",
    "strictly anything",
    "mostly vegetarian",
    "mostly other",
    "strictly vegetarian",
    "vegetarian",
    "strictly other",
    "mostly vegan",
    "other",
    "strictly vegan",
    "vegan",
    "mostly kosher",
    "mostly halal",
    "strictly halal",
    "strictly kosher",
    "halal",
    "kosher",
]
diet_codes = {name: code for code, name in enumerate(diet_categories)}
drug_codes = {"never": 0, "sometimes": 1, "often": 2}

df["diet_code"] = df["diet"].map(diet_codes)
df["drug_code"] = df["drugs"].map(drug_codes)
#Normalize and adjust for regression
# Build the regression inputs (reg_features) and target (reg_labels).
# Incomplete rows are dropped from both TOGETHER so that each feature row
# keeps its own height value; dropping NaNs separately and truncating the
# labels to a fixed length would pair rows from different profiles.
reg_feature_columns = ["diet_code", "body_code", "drug_code", "sex_code"]
reg_label_columns = ["height"]
reg_complete_rows = df[reg_feature_columns + reg_label_columns].dropna()

rx = reg_complete_rows[reg_feature_columns].values
ry = reg_complete_rows[reg_label_columns].values

# Min-max scale the features and actually use the scaled values downstream.
# A dedicated scaler is used so the one fitted on the classification
# features is left untouched. The height target is kept unscaled so the
# reported errors stay in the original height units.
reg_scaler = preprocessing.MinMaxScaler()
rx_scaled = reg_scaler.fit_transform(rx)

# Both frames share the index of the surviving rows.
reg_features = pd.DataFrame(rx_scaled, columns=reg_feature_columns, index=reg_complete_rows.index)
reg_labels = pd.DataFrame(ry, columns=reg_label_columns, index=reg_complete_rows.index)

# Number of complete rows available for the regression models.
print(len(reg_features.index))

#Recreate training data for regression
X_train, X_test, y_train, y_test = train_test_split(
    reg_features,
    reg_labels,
    random_state=1,
)

#K-Nearest Neighbors Regressor
# Fit with scikit-learn's default settings, then predict the held-out rows.
regressor = KNeighborsRegressor().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Actual vs. predicted height for the test set.
plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, c="crimson")
plt.title("KNeighborsRegressor Actual v. Predictions")
plt.xlabel("Actual Height", fontsize=15)
plt.ylabel("Predicted Height", fontsize=15)
plt.show()

# Error metrics of the KNN regressor on the test set.
error = mean_absolute_error(y_test, y_pred)
sq_error = mean_squared_error(y_test, y_pred)
print(f'''Mean Absolute Error: {error}
Mean Squared Error: {sq_error}''')
#Logistic Regression Model
# NOTE(review): LogisticRegression is a classifier, so every distinct height
# value is treated as its own class. Confirm this is the intended comparison
# model for a height-prediction task (it will reject targets that contain
# non-integer values).
height_train = y_train.values.ravel()
height_test = y_test.values.ravel()

log_regressor = LogisticRegression(max_iter = 1000)
log_regressor.fit(X_train, height_train)

log_pred = log_regressor.predict(X_test)

plt.figure(figsize=(10,10))
# Scatter rather than a line plot: the test rows are in no particular order,
# so joining consecutive points with lines would draw a meaningless zigzag.
# This also matches the KNeighborsRegressor plot.
plt.scatter(height_test, log_pred, c='cyan')

plt.xlabel('Actual Height', fontsize=15)
plt.ylabel('Predicted Height', fontsize=15)
plt.title("LogisticRegression Actual v. Predictions")

plt.show()
#Statistics for Logistic Regression
log_error = mean_absolute_error(height_test, log_pred)
sq_log_error = mean_squared_error(height_test, log_pred)
print(f'''Mean Absolute Error: {log_error}
Mean Squared Error: {sq_log_error}''')