diff --git a/homework.py b/homework.py new file mode 100644 index 0000000..bfe5993 --- /dev/null +++ b/homework.py @@ -0,0 +1,80 @@ +import re +import sqlite3 +import pandas as pd + + +def createTables(): + cursor.execute("drop table if exists works") + cursor.execute("drop table if exists gender") + cursor.execute("drop table if exists education") + cursor.execute("create table gender(" + "id INTEGER PRIMARY KEY AUTOINCREMENT," + "gender TEXT" + ");") + cursor.execute("create table education(" + "id INTEGER primary key autoincrement," + "educationType TEXT" + ");") + cursor.execute("create unique index education_educationType_uindex on education (educationType);") + cursor.execute("create unique index gender_gender_uindex on gender (gender);") + cursor.execute("create table works(" + "ID INTEGER PRIMARY KEY AUTOINCREMENT," + "salary INTEGER," + "educationType references education," + "jobTitle TEXT," + "qualification TEXT," + "gender references gender," + "dateModify TEXT," + "skills TEXT," + "otherInfo TEXT" + ");") + + connection.commit() + + +def clearFromHtml(line): + if not isinstance(line, str): + return line + clean = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') + a = re.sub(clean, '', line) + return a + + +connection = sqlite3.connect("homework.sqlite") +cursor = connection.cursor() +createTables() + +data = pd.read_csv('works.csv') + +for i in data['gender'].drop_duplicates(): + cursor.execute(f"insert into gender(gender) values('{i}');") + +for i in data['educationType'].drop_duplicates(): + cursor.execute(f"insert into education (educationType) values ('{i}');") + +connection.commit() + +index = 1 +for key, values in data.iterrows(): + cursor.execute(f"select id from education where educationType='{values['educationType']}';") + education_id = cursor.fetchone()[0] + + cursor.execute(f"select id from gender where gender='{values['gender']}';") + gender_id = cursor.fetchone()[0] + + cursor.execute("insert into works " + "(salary, educationType, jobTitle, qualification, gender, dateModify, skills, otherInfo)" + f"values (" + f"'{values['salary']}'," + f"'{education_id}'," + f"'{values['jobTitle']}'," + f"'{values['qualification']}'," + f"'{gender_id}'," + f"'{values['dateModify']}'," + f"'{clearFromHtml(values['skills'])}'," + f"'{clearFromHtml(values['otherInfo'])}');") + connection.commit() + print(f"Добавленна {index} строка") + index += 1 + +connection.commit() diff --git a/main.py b/main.py new file mode 100644 index 0000000..30a38df --- /dev/null +++ b/main.py @@ -0,0 +1,87 @@ +import math +import sqlite3 +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import os.path + +connection = sqlite3.connect("task.sqlite") +cursor = connection.cursor() + +cursor.execute("drop table if exists works") + +cursor.execute("create table works(" + "ID INTEGER PRIMARY KEY AUTOINCREMENT," + "salary INTEGER," + "educationType TEXT," + "jobTitle TEXT," + "qualification TEXT," + "gender TEXT," + "dateModify TEXT," + "skills TEXT," + "otherInfo TEXT" + ");") + +connection.commit() + +data = pd.read_csv('works.csv') +data.to_sql('works', connection, if_exists="append", index=None) +connection.commit() +cursor.execute("select * from works limit 5") +print(cursor.fetchall()) +print(os.path.getsize("task.sqlite") / 1024 / 1024, "Mб") +cursor.execute("create index salary_index on works(salary);") +connection.commit() + +cursor.execute("select count(*) from works") +cursor.execute("select count(*) from works where gender='Мужской'") +cursor.execute("select count(*) from works where gender='Женский'") +cursor.execute("select gender,count(*) from works group by gender") +print(cursor.fetchall()) +cursor.execute('select count(*) from works where skills not null') +print(cursor.fetchall()) +cursor.execute('select skills from works where skills not null') +print(cursor.fetchall()) +cursor.execute("select salary from works where skills like '%Python%'") +print(cursor.fetchall()) +print(os.path.getsize("task.sqlite") / 1024 / 1024, "Mб") +cursor.execute("select salary from works where gender = 'Мужской'") +mens_salary = [row[0] for row in cursor.fetchall()] +cursor.execute("select salary from works where gender = 'Женский'") +women_salary = [row[0] for row in cursor.fetchall()] + +percintile = np.linspace(0.1, 1, 10) +q = np.quantile(mens_salary, percintile) +q1 = np.quantile(women_salary, percintile) +print(list(zip(percintile, q))) +print(list(zip(percintile, q1))) + +plt.plot(percintile, q, color="b") +plt.plot(percintile, q1, color="r") +plt.xlabel("Перцентили") +plt.ylabel("Зарплата женщин") +plt.show() + +params = [("Мужской", "Высшее"), ("Мужской", "Незаконченное высшее"), ("Мужской", "Среднее"), + ("Мужской", "Среднее профессиональное"), ("Женский", "Высшее"), ("Женский", "Незаконченное высшее"), + ("Женский", "Среднее"), ("Женский", "Среднее профессиональное")] + +for p in params: + sql_query = f"SELECT salary FROM works WHERE gender = '{p[0]}' and educationType = '{p[1]}'" + salary = [row[0] for row in cursor.execute(sql_query).fetchall()] + plt.hist(salary, bins=100) + plt.title(f"Зарплата {p[0]} с образованием {p[1]}") + plt.show() + +l = pow(10, -10) + +x = np.linspace(0.0 * l, 1 * l, 100) +y = [] +fig, ax = plt.subplots() +for i in x: + temp_x = i + temp_y = 2.0 / l * math.pow(np.sin(3.0 * 3.1415 * i / l), 2) + y.append(temp_y) + +plt.plot(x, y, scalex=True, scaley=True) +plt.show()