project-2-eda-sql/main.py at master · cbertopt/project-2-eda-sql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# IMPORTING LIBRARIES

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import text


# READING THE DATA

df = pd.read_csv('Top 100 Languages.csv')


# EXPLORATORY DATA ANALYSIS (EDA)

print(df.head())
print(df.info())
print(df.describe())


# CLEANING THE DATA

print(df.isnull().sum())
print(df.duplicated().sum())

df = df.drop_duplicates()
df = df.dropna()


# RENAMING COLUMNS FOR CONSISTENCY (REMOVE SPACES)

print(df.columns)
df.columns = df.columns.str.replace(' ', '_')


# SORTING THE DATA

df_sorted = df.sort_values(by='Total_Speakers', ascending=False)


# PLOTTING THE TOP 10 LANGUAGES BY TOTAL SPEAKERS

plt.figure(figsize=(10,6))
sns.barplot(x='Language', y='Total_Speakers', data=df_sorted.head(10))
plt.title('Top 5 Most Spoken Languages in the World')
plt.xticks(rotation=45)
plt.show()


# COMPARISON BETWEEN NATIVE AND TOTAL SPEAKERS FOR TOP 10 LANGUAGES

df_sorted = df.sort_values(by='Total_Speakers', ascending=False)

plt.figure(figsize=(10,6))
x = np.arange(len(df_sorted['Language'].head(10)))
width = 0.4
plt.bar(x - width/2, df_sorted['Native_Speakers'].head(10), width, label='Native Speakers', color='blue', alpha=0.6)
plt.bar(x + width/2, df_sorted['Total_Speakers'].head(10), width, label='Total Speakers', color='orange', alpha=0.6)
plt.title('Comparison of Native and Total Speakers - Top 5 Languages')
plt.xlabel('Language')
plt.ylabel('Number of Speakers')
plt.xticks(ticks=x, labels=df_sorted['Language'].head(10), rotation=45)
plt.legend()
plt.show()


# CORRELATION ANALYSIS BETWEEN NATIVE AND TOTAL SPEAKERS

correlation = df[['Native_Speakers', 'Total_Speakers']].corr()
print(correlation)


# HEATMAP OF CORRELATION MATRIX

sns.heatmap(correlation, annot=True)
plt.title('Correlation between Native and Total Speakers')
plt.show()


# CREATING THE ENGINE TO CONNECT TO THE DATABASE

engine = create_engine('mysql+pymysql://root:Adrian021218@localhost/top_languages_db')


# CREATING A NEW DATAFRAME FOR DATABASE INSERTION

df = pd.DataFrame({
    'Language': ['English', 'Mandarin Chinese', 'Hindi', 'Spanish', 'French'],
    'Total Speakers': [1132366680, 1116596640, 615475540, 534335730, 279821930],
    'Native Speakers': [379007140, 917868640, 341208640, 460093030, 77177210],
    'Origin': ['Indo-European', 'Sino-Tibetan', 'Indo-European', 'Indo-European', 'Indo-European']
})


# UPLOADING THE DATA TO THE MYSQL DATABASE

df.to_sql('top_languages_db', con=engine, if_exists='replace', index=False)


# EXECUTING A QUERY TO RETRIEVE DATA

with engine.connect() as connection:
    result = connection.execute(text("SELECT * FROM top_languages"))
    for row in result:
        print(row)


# EXPORTING DATA BACK TO CSV (if needed)
# df.to_csv('top_languages_db.csv', index=False)