forked from ironhack-labs/project-2-eda-sql
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
111 lines (67 loc) · 2.76 KB
/
main.py
File metadata and controls
111 lines (67 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# IMPORTING LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import text
# READING THE DATA
df = pd.read_csv('Top 100 Languages.csv')
# EXPLORATORY DATA ANALYSIS (EDA)
print(df.head())
print(df.info())
print(df.describe())
# CLEANING THE DATA
print(df.isnull().sum())
print(df.duplicated().sum())
df = df.drop_duplicates()
df = df.dropna()
# RENAMING COLUMNS FOR CONSISTENCY (REMOVE SPACES)
print(df.columns)
df.columns = df.columns.str.replace(' ', '_')
# SORTING THE DATA
df_sorted = df.sort_values(by='Total_Speakers', ascending=False)
# PLOTTING THE TOP 10 LANGUAGES BY TOTAL SPEAKERS
plt.figure(figsize=(10,6))
sns.barplot(x='Language', y='Total_Speakers', data=df_sorted.head(10))
plt.title('Top 5 Most Spoken Languages in the World')
plt.xticks(rotation=45)
plt.show()
# COMPARISON BETWEEN NATIVE AND TOTAL SPEAKERS FOR TOP 10 LANGUAGES
df_sorted = df.sort_values(by='Total_Speakers', ascending=False)
plt.figure(figsize=(10,6))
x = np.arange(len(df_sorted['Language'].head(10)))
width = 0.4
plt.bar(x - width/2, df_sorted['Native_Speakers'].head(10), width, label='Native Speakers', color='blue', alpha=0.6)
plt.bar(x + width/2, df_sorted['Total_Speakers'].head(10), width, label='Total Speakers', color='orange', alpha=0.6)
plt.title('Comparison of Native and Total Speakers - Top 5 Languages')
plt.xlabel('Language')
plt.ylabel('Number of Speakers')
plt.xticks(ticks=x, labels=df_sorted['Language'].head(10), rotation=45)
plt.legend()
plt.show()
# CORRELATION ANALYSIS BETWEEN NATIVE AND TOTAL SPEAKERS
correlation = df[['Native_Speakers', 'Total_Speakers']].corr()
print(correlation)
# HEATMAP OF CORRELATION MATRIX
sns.heatmap(correlation, annot=True)
plt.title('Correlation between Native and Total Speakers')
plt.show()
# CREATING THE ENGINE TO CONNECT TO THE DATABASE
engine = create_engine('mysql+pymysql://root:Adrian021218@localhost/top_languages_db')
# CREATING A NEW DATAFRAME FOR DATABASE INSERTION
df = pd.DataFrame({
'Language': ['English', 'Mandarin Chinese', 'Hindi', 'Spanish', 'French'],
'Total Speakers': [1132366680, 1116596640, 615475540, 534335730, 279821930],
'Native Speakers': [379007140, 917868640, 341208640, 460093030, 77177210],
'Origin': ['Indo-European', 'Sino-Tibetan', 'Indo-European', 'Indo-European', 'Indo-European']
})
# UPLOADING THE DATA TO THE MYSQL DATABASE
df.to_sql('top_languages_db', con=engine, if_exists='replace', index=False)
# EXECUTING A QUERY TO RETRIEVE DATA
with engine.connect() as connection:
result = connection.execute(text("SELECT * FROM top_languages"))
for row in result:
print(row)
# EXPORTING DATA BACK TO CSV (if needed)
# df.to_csv('top_languages_db.csv', index=False)