diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..1c38b97 --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,12 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:C:\Users\honor\Desktop\sql\works.sqlite + $ProjectFileDir$ + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..cff1354 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d56657a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..b02178e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sql.iml b/.idea/sql.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/sql.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/dz.py b/dz.py new file mode 100644 index 0000000..dd65d70 --- /dev/null 
"""Load ``works.csv`` into SQLite, normalise two columns and plot salaries.

This module holds the two scripts that appear in the diff as ``dz.py``
(HTML-tag clean-up plus extraction of ``gender`` / ``educationType`` into
lookup tables) and ``tasks.py`` (table creation, CSV import, a few queries
and salary plots).  All work is done inside functions so that importing the
module has no side effects; running it as a script executes both parts in
the order they appear in the diff.
"""
import re
import sqlite3

import numpy
import pandas as pd

DB_PATH = 'works.sqlite'
CSV_PATH = 'works.csv'

# Compiled once; matches a single ``<...>`` markup tag.
_TAG_RE = re.compile(r'<[^>]*>')


# --------------------------------------------------------------------------
# dz.py
# --------------------------------------------------------------------------

def remove_tags(field):
    """Return *field* as text with every ``<...>`` tag stripped.

    Missing values (``None``, NaN, ``pd.NA``) are returned unchanged so that
    they are stored as NULL instead of the literal strings ``'nan'`` or
    ``'None'``.  Any other non-string value is converted with ``str()``.
    """
    if pd.api.types.is_scalar(field) and pd.isna(field):
        return field
    return _TAG_RE.sub('', str(field))


def extract_lookup_table(cursor, column, table, label, fk_column):
    """Replace ``works.<column>`` with a foreign key into a lookup table.

    Steps performed on *cursor* (the caller commits):

    1. (Re)create ``<table>(id, <label>)``.
    2. Fill it with the distinct non-NULL values of ``works.<column>``.
    3. Add ``works.<fk_column>`` referencing ``<table>(id)`` and populate it.
    4. Drop the original ``works.<column>``.

    The identifier arguments are interpolated into the SQL text (SQLite
    cannot bind identifiers), so they must be trusted constants, never user
    input.  ``ALTER TABLE ... DROP COLUMN`` needs SQLite 3.35 or newer.
    """
    cursor.execute(f'DROP TABLE IF EXISTS {table}')
    cursor.execute(
        f'CREATE TABLE {table} ('
        'id INTEGER PRIMARY KEY AUTOINCREMENT, '
        f'{label} TEXT)'
    )
    cursor.execute(
        f'INSERT INTO {table} ({label}) '
        f'SELECT DISTINCT {column} FROM works '
        f'WHERE {column} IS NOT NULL'
    )
    cursor.execute(
        'ALTER TABLE works '
        f'ADD COLUMN {fk_column} INTEGER REFERENCES {table}(id)'
    )
    # Column names are table-qualified because the lookup column may share
    # its name with the source column (e.g. genders.gender / works.gender).
    cursor.execute(
        f'UPDATE works SET {fk_column} = '
        f'(SELECT id FROM {table} WHERE {table}.{label} = works.{column})'
    )
    cursor.execute(f'ALTER TABLE works DROP COLUMN {column}')


def normalize_works_db():
    """Append the cleaned CSV to ``works`` and normalise two of its columns.

    ``skills`` and ``otherInfo`` are stripped of markup tags before the
    import; afterwards ``gender`` and ``educationType`` are moved into the
    ``genders`` and ``education`` lookup tables.
    """
    con = sqlite3.connect(DB_PATH)
    try:
        df = pd.read_csv(CSV_PATH)
        for column in ('skills', 'otherInfo'):
            df[column] = df[column].apply(remove_tags)
        df.to_sql("works", con, if_exists='append', index=False)
        con.commit()

        cursor = con.cursor()
        extract_lookup_table(cursor, 'gender', 'genders', 'gender', 'gender_id')
        con.commit()
        extract_lookup_table(
            cursor, 'educationType', 'education', 'level_of_edu',
            'educationType_id',
        )
        con.commit()
    finally:
        con.close()


# --------------------------------------------------------------------------
# tasks.py
# --------------------------------------------------------------------------

def create_works_table(cursor):
    """Drop and recreate the empty ``works`` table (tasks 1 and 2)."""
    cursor.execute('drop table if exists works')
    cursor.execute(
        'create table works ('
        'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
        'salary INTEGER,'
        'educationType TEXT,'
        'jobTitle TEXT,'
        'qualification TEXT,'
        'gender TEXT,'
        'dateModify TEXT,'
        'skills TEXT,'
        'otherInfo TEXT)'
    )


def salaries_by_gender(cursor, gender):
    """Return the list of non-NULL salaries of rows whose gender is *gender*.

    The value is passed as a bound parameter rather than a double-quoted
    literal, which SQLite only accepts as a string through a legacy fallback.
    """
    cursor.execute(
        'SELECT salary FROM works WHERE gender = ? AND salary IS NOT NULL',
        (gender,),
    )
    return [row[0] for row in cursor.fetchall()]


def salary_deciles(salaries, percentiles=None):
    """Return the quantiles of *salaries* at *percentiles*.

    *percentiles* defaults to the ten values 0.1, 0.2, ..., 1.0.
    """
    if percentiles is None:
        percentiles = numpy.linspace(.1, 1, 10)
    return numpy.quantile(salaries, percentiles)


def plot_salary_charts(m_salary, w_salary, percentiles):
    """Show salary histograms and decile curves for men and women.

    The histograms are drawn from the raw salaries; a 100-bin histogram of
    the ten decile values would carry no information.
    """
    # Imported here so the rest of the module works without matplotlib.
    import matplotlib.pyplot as plt

    for salaries in (m_salary, w_salary):
        plt.hist(salaries, bins=100)
        plt.show()

    labelled = (
        (m_salary, "Зарплата у мужчин"),
        (w_salary, "Зарплата у женщин"),
    )
    for salaries, ylabel in labelled:
        if len(salaries) == 0:
            # Quantiles of an empty sample are undefined; nothing to draw.
            continue
        plt.plot(percentiles, salary_deciles(salaries, percentiles))
        plt.xlabel("Перцентили")
        plt.ylabel(ylabel)
        plt.show()


def run_tasks():
    """Rebuild ``works`` from the CSV, run the queries and plot salaries."""
    con = sqlite3.connect(DB_PATH)
    try:
        cursor = con.cursor()

        # Tasks 1 and 2: schema, data import, index on salary.
        create_works_table(cursor)
        con.commit()
        pd.read_csv(CSV_PATH).to_sql(
            "works", con, if_exists='append', index=False)
        con.commit()
        cursor.execute(
            'create index if not exists salary_index on works (salary)')
        con.commit()

        # Tasks 3 - 7.
        cursor.execute('SELECT COUNT(*) FROM works')
        print(cursor.fetchone()[0])
        for gender in ('Женский', 'Мужской'):
            cursor.execute(
                'SELECT COUNT(*) FROM works WHERE gender = ?', (gender,))
            print(cursor.fetchone()[0])
        # The results of the next three queries are not consumed, exactly as
        # in the original script; they are kept for parity.
        cursor.execute('SELECT gender, COUNT(*) FROM works GROUP BY gender')
        cursor.execute('SELECT skills FROM works WHERE skills IS NOT NULL')
        cursor.execute(
            'SELECT salary FROM works WHERE skills LIKE ?', ('%Python%',))

        # Tasks 8 - 9 need the salaries themselves, not the row counts.
        w_salary = salaries_by_gender(cursor, 'Женский')
        m_salary = salaries_by_gender(cursor, 'Мужской')
    finally:
        con.close()

    plot_salary_charts(m_salary, w_salary, numpy.linspace(.1, 1, 10))


if __name__ == "__main__":
    # Same order as the scripts appear in the diff: dz.py, then tasks.py.
    normalize_works_db()
    run_tasks()