railbotan · evin76 · Dec 22, 2021
diff --git a/homework_task.py b/homework_task.py
@@ -0,0 +1,80 @@
+"""Load works.csv into works.sqlite and normalise two columns.
+
+HTML tags are stripped from ``skills`` and ``otherInfo``, the rows are appended
+to the ``works`` table, and the ``gender`` / ``educationType`` values are moved
+into the lookup tables ``genders`` / ``education`` that ``works`` references by
+id.
+"""
+import re
+import sqlite3
+
+import matplotlib.pyplot as plt
+import numpy
+import pandas as pd
+
+con = sqlite3.connect('works.sqlite')
+df = pd.read_csv("works.csv")
+cursor = con.cursor()
+
+
+def clean(field):
+    """Return *field* with ``<...>`` tags removed.
+
+    Missing values (``None`` / NaN) are returned unchanged so they are stored
+    as NULL rather than as the text ``'nan'``; anything else is converted to
+    ``str`` before the tags are stripped.
+    """
+    if pd.isna(field):
+        return field
+    return re.sub(r'\<[^>]*\>', '', str(field))
+
+
+df['skills'] = df['skills'].apply(clean)
+df['otherInfo'] = df['otherInfo'].apply(clean)
+
+# NOTE: 'append' adds to whatever ``works`` table already exists. After this
+# script has dropped ``gender`` / ``educationType`` below, appending the same
+# CSV columns again is expected to fail - run it on a freshly loaded table.
+df.to_sql("works", con, if_exists='append', index=False)
+con.commit()
+
+# Move gender into its own lookup table and reference it by id.
+cursor.execute('drop table if exists genders')
+cursor.execute('CREATE TABLE genders('
+               'id INTEGER PRIMARY KEY AUTOINCREMENT,'
+               'gender TEXT)')
+# Continuation fragments start with a space: adjacent string literals are
+# joined verbatim, so without it the keywords fuse (e.g. "genderFROM").
+cursor.execute('INSERT INTO genders(gender)'
+               ' SELECT DISTINCT gender'
+               ' FROM works WHERE gender IS NOT NULL')
+cursor.execute('ALTER TABLE works'
+               ' ADD COLUMN gender_id INTEGER REFERENCES genders(id)')
+cursor.execute('UPDATE works SET gender_id ='
+               ' (SELECT id FROM genders'
+               ' WHERE gender = works.gender)')
+# ALTER TABLE ... DROP COLUMN requires SQLite 3.35.0 or newer.
+cursor.execute('ALTER TABLE works'
+               ' DROP COLUMN gender')
+con.commit()
+
+# Same normalisation for the education level.
+cursor.execute('drop table if exists education')
+cursor.execute('CREATE TABLE education'
+               '(id INTEGER PRIMARY KEY AUTOINCREMENT, '
+               'level_of_edu TEXT)')
+cursor.execute('INSERT INTO education(level_of_edu)'
+               ' SELECT DISTINCT educationType '
+               'FROM works'
+               ' WHERE educationType IS NOT NULL')
+cursor.execute('ALTER TABLE works'
+               ' ADD COLUMN educationType_id INTEGER REFERENCES education(id)')
+cursor.execute('UPDATE works'
+               ' SET educationType_id ='
+               ' (SELECT id'
+               ' FROM education'
+               ' WHERE level_of_edu = works.educationType)')
+cursor.execute('ALTER TABLE works'
+               ' DROP COLUMN educationType')
+con.commit()
+con.close()
diff --git a/task.py b/task.py
@@ -0,0 +1,89 @@
+"""Rebuild the ``works`` table from works.csv and plot salaries by gender.
+
+Tasks 1-2 create, fill and index the table; tasks 3-7 run counting and
+filtering queries; tasks 8-9 draw salary histograms and quantile curves for
+men and women.
+"""
+import sqlite3
+
+import matplotlib.pyplot as plt
+import numpy
+import pandas as pd
+
+
+def fetch_salaries(cursor, gender):
+    """Return the non-NULL salaries of all ``works`` rows with this gender."""
+    cursor.execute(
+        'SELECT salary FROM works WHERE gender = ? AND salary IS NOT NULL',
+        (gender,))
+    return [row[0] for row in cursor.fetchall()]
+
+
+# task1 and task2
+con = sqlite3.connect('works.sqlite')
+cursor = con.cursor()
+cursor.execute('drop table if exists works')
+cursor.execute('create table works ('
+               'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
+               'salary INTEGER,'
+               'educationType TEXT,'
+               'jobTitle TEXT,'
+               'qualification TEXT,'
+               'gender TEXT,'
+               'dateModify TEXT,'
+               'skills TEXT,'
+               'otherInfo TEXT)')
+con.commit()
+
+# The table was just recreated, so 'append' loads the CSV into an empty table
+# while keeping the schema declared above.
+df = pd.read_csv("works.csv")
+df.to_sql("works", con, if_exists='append', index=False)
+con.commit()
+
+cursor.execute('create index salary_index on works (salary)')
+con.commit()
+
+# task 3, 4, 5, 6, 7
+cursor.execute('SELECT COUNT(*) FROM works')
+print(cursor.fetchall()[0][0])
+
+# Text values are bound as parameters or single-quoted: SQLite reads
+# double-quoted text as an identifier first and accepts it as a string only
+# as a legacy fallback.
+cursor.execute('SELECT COUNT(*) FROM works WHERE gender = ?', ('Женский',))
+w_count = cursor.fetchone()[0]
+cursor.execute('SELECT COUNT(*) FROM works WHERE gender = ?', ('Мужской',))
+m_count = cursor.fetchone()[0]
+# The results of the next three queries are not fetched.
+cursor.execute('SELECT gender, COUNT(*) FROM works GROUP BY gender')
+cursor.execute('SELECT skills FROM works WHERE skills NOT NULL')
+cursor.execute("SELECT salary FROM works WHERE skills LIKE '%Python%'")
+
+# tasks 8, 9
+# The plots need the salaries themselves, not the per-gender row counts.
+w_salary = fetch_salaries(cursor, 'Женский')
+m_salary = fetch_salaries(cursor, 'Мужской')
+con.close()
+
+percentiles = numpy.linspace(.1, 1, 10)
+
+# Quantiles get their own names so the histograms below still see the raw
+# salaries rather than ten summary values.
+w_quantiles = numpy.quantile(w_salary, percentiles)
+m_quantiles = numpy.quantile(m_salary, percentiles)
+
+plt.hist(m_salary, bins=100)
+plt.show()
+plt.hist(w_salary, bins=100)
+plt.show()
+
+plt.plot(percentiles, m_quantiles)
+plt.xlabel("Перцентили")
+plt.ylabel("Зарплата у мужчин")
+plt.show()
+
+plt.plot(percentiles, w_quantiles)
+plt.xlabel("Перцентили")
+plt.ylabel("Зарплата у женщин")
+plt.show()
diff --git a/works.sqlite b/works.sqlite