diff --git a/homework_task.py b/homework_task.py
new file mode 100644
index 0000000..b8a2b3e
--- /dev/null
+++ b/homework_task.py
@@ -0,0 +1,107 @@
+"""Strip markup from works.csv and normalize the ``works`` table."""
+import sqlite3
+import pandas as pd
+import numpy
+import matplotlib.pyplot as plt
+import re
+
+con = sqlite3.connect('works.sqlite')
+df = pd.read_csv("works.csv")
+cursor = con.cursor()
+
+
+def clean(field):
+    """Return ``field`` as text with angle-bracket tags removed.
+
+    Missing values are returned unchanged so that they are stored as NULL
+    rather than as the literal string ``'nan'``.
+    """
+    if pd.isna(field):
+        return field
+    return re.sub(r'\<[^>]*\>', '', str(field))
+
+
+df['skills'] = df['skills'].apply(clean)
+df['otherInfo'] = df['otherInfo'].apply(clean)
+
+# Rebuild ``works`` with the schema task.py creates instead of appending to
+# whatever is already stored: appending duplicated every row when the table
+# was already populated and failed on a second run, because the ``gender``
+# and ``educationType`` columns are dropped further down.
+cursor.execute('DROP TABLE IF EXISTS works')
+cursor.execute('''
+    CREATE TABLE works (
+        ID INTEGER PRIMARY KEY AUTOINCREMENT,
+        salary INTEGER,
+        educationType TEXT,
+        jobTitle TEXT,
+        qualification TEXT,
+        gender TEXT,
+        dateModify TEXT,
+        skills TEXT,
+        otherInfo TEXT
+    )
+''')
+cursor.execute('CREATE INDEX salary_index ON works (salary)')
+df.to_sql("works", con, if_exists='append', index=False)
+con.commit()
+
+# Move the distinct gender values into a lookup table and replace the text
+# column in ``works`` with a foreign key to it.
+cursor.execute('DROP TABLE IF EXISTS genders')
+cursor.execute('''
+    CREATE TABLE genders (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        gender TEXT
+    )
+''')
+cursor.execute('''
+    INSERT INTO genders (gender)
+    SELECT DISTINCT gender
+    FROM works
+    WHERE gender IS NOT NULL
+''')
+cursor.execute('''
+    ALTER TABLE works
+    ADD COLUMN gender_id INTEGER REFERENCES genders(id)
+''')
+cursor.execute('''
+    UPDATE works
+    SET gender_id = (
+        SELECT id
+        FROM genders
+        WHERE genders.gender = works.gender
+    )
+''')
+cursor.execute('ALTER TABLE works DROP COLUMN gender')
+con.commit()
+
+# Same normalization for the education level.
+cursor.execute('DROP TABLE IF EXISTS education')
+cursor.execute('''
+    CREATE TABLE education (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        level_of_edu TEXT
+    )
+''')
+cursor.execute('''
+    INSERT INTO education (level_of_edu)
+    SELECT DISTINCT educationType
+    FROM works
+    WHERE educationType IS NOT NULL
+''')
+cursor.execute('''
+    ALTER TABLE works
+    ADD COLUMN educationType_id INTEGER REFERENCES education(id)
+''')
+cursor.execute('''
+    UPDATE works
+    SET educationType_id = (
+        SELECT id
+        FROM education
+        WHERE education.level_of_edu = works.educationType
+    )
+''')
+cursor.execute('ALTER TABLE works DROP COLUMN educationType')
+con.commit()
+con.close()
diff --git a/task.py b/task.py
new file mode 100644
index 0000000..1f1ed3a
--- /dev/null
+++ b/task.py
@@ -0,0 +1,90 @@
+"""Load works.csv into SQLite, print summary counts and plot salaries."""
+import sqlite3
+import pandas as pd
+import numpy
+import matplotlib.pyplot as plt
+# task1 and task2
+con = sqlite3.connect('works.sqlite')
+cursor = con.cursor()
+cursor.execute('drop table if exists works')
+cursor.execute('create table works ('
+               'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
+               'salary INTEGER,'
+               'educationType TEXT,'
+               'jobTitle TEXT,'
+               'qualification TEXT,'
+               'gender TEXT,'
+               'dateModify TEXT,'
+               'skills TEXT,'
+               'otherInfo TEXT)')
+con.commit()
+
+df = pd.read_csv("works.csv")
+df.to_sql("works", con, if_exists='append', index=False)
+con.commit()
+
+cursor.execute('create index salary_index on works (salary)')
+con.commit()
+
+
+def count_by_gender(gender):
+    """Return the number of ``works`` rows whose gender equals ``gender``."""
+    # Bound parameter instead of a double-quoted literal, which SQLite
+    # parses as an identifier first.
+    cursor.execute('SELECT COUNT(*) FROM works WHERE gender = ?', (gender,))
+    return cursor.fetchone()[0]
+
+
+def salaries_by_gender(gender):
+    """Return the non-NULL salaries of rows whose gender equals ``gender``."""
+    cursor.execute(
+        'SELECT salary FROM works WHERE gender = ? AND salary IS NOT NULL',
+        (gender,),
+    )
+    return [row[0] for row in cursor.fetchall()]
+
+
+# task 3, 4, 5, 6, 7
+cursor.execute('SELECT COUNT(*) FROM works')
+print(cursor.fetchone()[0])
+
+print(count_by_gender('Женский'))
+print(count_by_gender('Мужской'))
+
+cursor.execute('SELECT gender, COUNT(*) FROM works GROUP BY gender')
+print(cursor.fetchall())
+
+cursor.execute('SELECT skills FROM works WHERE skills IS NOT NULL')
+filled_skills = [row[0] for row in cursor.fetchall()]
+print(len(filled_skills))
+
+cursor.execute('SELECT salary FROM works WHERE skills LIKE ?', ('%Python%',))
+python_salaries = [row[0] for row in cursor.fetchall()]
+print(python_salaries)
+
+w_salary = salaries_by_gender('Женский')
+m_salary = salaries_by_gender('Мужской')
+con.close()
+
+# tasks 8, 9
+percentiles = numpy.linspace(.1, 1, 10)
+
+w_quantiles = numpy.quantile(w_salary, percentiles)
+m_quantiles = numpy.quantile(m_salary, percentiles)
+
+# Histograms show the salary distributions themselves; ten quantile values
+# spread over 100 bins would not show a distribution.
+plt.hist(m_salary, bins=100)
+plt.show()
+plt.hist(w_salary, bins=100)
+plt.show()
+
+plt.plot(percentiles, m_quantiles)
+plt.xlabel("Перцентили")
+plt.ylabel("Зарплата у мужчин")
+plt.show()
+
+plt.plot(percentiles, w_quantiles)
+plt.xlabel("Перцентили")
+plt.ylabel("Зарплата у женщин")
+plt.show()
diff --git a/works.sqlite b/works.sqlite
new file mode 100644
index 0000000..690feee
Binary files /dev/null and b/works.sqlite differ