diff --git a/homeWork.py b/homeWork.py
new file mode 100644
index 0000000..4aad709
--- /dev/null
+++ b/homeWork.py
@@ -0,0 +1,54 @@
+"""Load works.csv into homeworks.sqlite and normalise gender/educationType."""
+import pandas as pd
+import sqlite3 as sql
+import re
+
+
+def clean(file):
+    """Return str(file) with every <...> tag removed."""
+    return re.sub(r'\<[^>]*\>', '', str(file))
+
+
+connection = sql.connect('homeworks.sqlite')
+cursor = connection.cursor()
+cursor.execute('drop table if exists works')
+cursor.execute('create table works ('
+               'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
+               'salary INTEGER,'
+               'educationType TEXT,'
+               'jobTitle TEXT,'
+               'qualification TEXT,'
+               'gender TEXT,'
+               'dateModify TEXT,'
+               'skills TEXT,'
+               'otherInfo TEXT)')
+connection.commit()
+
+df = pd.read_csv('works.csv')
+# Strip the markup BEFORE writing to the database: cleaning the DataFrame
+# after to_sql() never reaches the stored rows.  na_action='ignore' keeps
+# missing cells as NULL instead of turning them into the text 'nan'.
+df['skills'] = df['skills'].map(clean, na_action='ignore')
+df['otherInfo'] = df['otherInfo'].map(clean, na_action='ignore')
+df.to_sql("works", connection, if_exists='append', index=False)
+connection.commit()
+
+# Replace the text column works.gender with a reference into genders.
+cursor.execute('drop table if exists genders')
+cursor.execute('create table genders(id INTEGER PRIMARY KEY AUTOINCREMENT, gender_val TEXT)')
+cursor.execute('INSERT INTO genders(gender_val) SELECT DISTINCT gender FROM works WHERE gender IS NOT NULL')
+cursor.execute('ALTER TABLE works ADD COLUMN gender_id INTEGER REFERENCES genders(id)')
+cursor.execute('UPDATE works SET gender_id = (SELECT id FROM genders WHERE gender_val = works.gender)')
+# NOTE: ALTER TABLE ... DROP COLUMN requires SQLite 3.35 or newer.
+cursor.execute('ALTER TABLE works DROP COLUMN gender')
+connection.commit()
+
+# Same normalisation for works.educationType, using the education table.
+cursor.execute('drop table if exists education')
+cursor.execute('create table education(id INTEGER PRIMARY KEY AUTOINCREMENT, edu_val TEXT)')
+cursor.execute('INSERT INTO education(edu_val) SELECT DISTINCT educationType FROM works WHERE educationType IS NOT NULL')
+cursor.execute('ALTER TABLE works ADD COLUMN educationType_id INTEGER REFERENCES education(id)')
+cursor.execute('UPDATE works SET educationType_id = (SELECT id FROM education WHERE edu_val = works.educationType)')
+cursor.execute('ALTER TABLE works DROP COLUMN educationType')
+connection.commit()
+connection.close()
diff --git a/homeworks.sqlite b/homeworks.sqlite
new file mode 100644
index 0000000..0b4c85e
Binary files /dev/null and b/homeworks.sqlite differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..006f969
--- /dev/null
+++ b/main.py
@@ -0,0 +1,96 @@
+"""Explore works.csv through SQLite: counts, salary quantiles, histograms."""
+import math
+import numpy as np
+import sqlite3
+import matplotlib.pyplot as plt
+import os.path
+import pandas as pd
+
+
+def fetch_salaries(cursor, gender, education=None):
+    """Return the non-NULL salaries of the works rows with the given gender.
+
+    When education is given, only rows with that educationType are used.
+    Values are bound as SQL parameters instead of being formatted into the
+    query text, and NULL salaries are skipped so the result is all numbers.
+    """
+    query = "select salary from works where salary is not null and gender = ?"
+    args = [gender]
+    if education is not None:
+        query += " and educationType = ?"
+        args.append(education)
+    return [row[0] for row in cursor.execute(query, args)]
+
+
+connection = sqlite3.connect("task.sqlite")
+cursor = connection.cursor()
+cursor.execute("drop table if exists works")
+cursor.execute("create table works("
+               "ID INTEGER PRIMARY KEY AUTOINCREMENT,"
+               "salary INTEGER,"
+               "educationType TEXT,"
+               "jobTitle TEXT,"
+               "qualification TEXT,"
+               "gender TEXT,"
+               "dateModify TEXT,"
+               "skills TEXT,"
+               "otherInfo TEXT"
+               ");")
+connection.commit()
+data = pd.read_csv('works.csv')
+data.to_sql('works', connection, if_exists="append", index=False)
+connection.commit()
+cursor.execute("select * from works limit 5")
+print(cursor.fetchall())
+print(os.path.getsize("task.sqlite") / 1024 / 1024, "Mб")
+cursor.execute("create index salary_index on works(salary);")
+connection.commit()
+
+# Every query result is fetched and printed; running a count without
+# reading its row simply discards the answer.
+cursor.execute("select count(*) from works")
+print(cursor.fetchall())
+cursor.execute("select count(*) from works where gender='Мужской'")
+print(cursor.fetchall())
+cursor.execute("select count(*) from works where gender='Женский'")
+print(cursor.fetchall())
+cursor.execute("select gender,count(*) from works group by gender")
+print(cursor.fetchall())
+cursor.execute('select count(*) from works where skills not null')
+print(cursor.fetchall())
+cursor.execute('select skills from works where skills not null')
+print(cursor.fetchall())
+cursor.execute("select salary from works where skills like '%Python%'")
+print(cursor.fetchall())
+print(os.path.getsize("task.sqlite") / 1024 / 1024, "Mб")
+
+# Salary quantiles at 0.1, 0.2, ..., 1.0 for men and women on one chart.
+mens_salary = fetch_salaries(cursor, "Мужской")
+women_salary = fetch_salaries(cursor, "Женский")
+percentiles = np.linspace(0.1, 1, 10)
+q = np.quantile(mens_salary, percentiles)
+q1 = np.quantile(women_salary, percentiles)
+print(list(zip(percentiles, q)))
+print(list(zip(percentiles, q1)))
+plt.plot(percentiles, q, color="b", label="Мужчины")
+plt.plot(percentiles, q1, color="r", label="Женщины")
+plt.xlabel("Перцентили")
+# The axis carries both curves, so its label must not name only one group.
+plt.ylabel("Зарплата")
+plt.legend()
+plt.show()
+
+# One salary histogram per (gender, educationType) combination.
+for gender in ("Мужской", "Женский"):
+    for education in ("Высшее", "Незаконченное высшее", "Среднее", "Среднее профессиональное"):
+        plt.hist(fetch_salaries(cursor, gender, education), bins=100)
+        plt.title(f"Зарплата {gender} с образованием {education}")
+        plt.show()
+connection.close()
+
+# Plot y = 2/width * sin(3*pi*x/width)**2 for x in [0, width], width = 1e-10.
+width = pow(10, -10)
+x = np.linspace(0.0, width, 100)
+y = 2.0 / width * np.sin(3.0 * math.pi * x / width) ** 2
+plt.plot(x, y, scalex=True, scaley=True)
+plt.show()
diff --git a/task.sqlite b/task.sqlite
new file mode 100644
index 0000000..0d42f54
Binary files /dev/null and b/task.sqlite differ