From d87f5c69c8534367bd508b5cae605a94aff55d24 Mon Sep 17 00:00:00 2001 From: Pasha Date: Fri, 24 Dec 2021 19:20:31 +0300 Subject: [PATCH] Add files --- sql_task.ipynb | 561 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 sql_task.ipynb diff --git a/sql_task.ipynb b/sql_task.ipynb new file mode 100644 index 0000000..5db7469 --- /dev/null +++ b/sql_task.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "44f097bf", + "metadata": {}, + "source": [ + "Задания\n", + "Повторим pandas-исследования. По возможности всю предобработку данных доверьте SQL.\n", + "\n", + "В скрипте на Питоне:\n", + "\n", + "Создайте и заполните таблицу запросами, создайте техническое поле ID с параметрами INTEGER PRIMARY KEY AUTOINCREMENT.\n", + "Добавьте индекс на поле salary. Изменится ли после этого размер файла? На сколько?\n", + "Выведите количество записей.\n", + "Выведите количество мужчин и женщин.\n", + "У скольких записей заполнены skills?\n", + "Получить заполненные скиллы.\n", + "Вывести зарплату только у тех, у кого в скиллах есть Python.\n", + "Построить перцентили и разброс по з/п у мужчин и женщин.\n", + "Построить графики распределения по з/п мужчин и женщин (а также в зависимости от высшего образования)." 
# %% [markdown]
# Task: create the `works` table with SQL statements and fill it. The
# technical ID column is declared as INTEGER PRIMARY KEY AUTOINCREMENT.

# %%
import os
import os.path
import re
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# %%
con = sqlite3.connect('works.sqlite')

# %% [markdown]
# Create the table; its first column holds the unique identifier.

# %%
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS works")
cur.execute('CREATE TABLE works ('
            'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
            'salary INTEGER,'
            'educationType TEXT,'
            'jobTitle TEXT,'
            'qualification TEXT,'
            'gender TEXT,'
            'dateModify TEXT,'
            'skills TEXT,'
            'otherInfo TEXT)')

# %% [markdown]
# Column descriptions of the whole table, obtained with PRAGMA table_info().

# %%
print(cur.execute("pragma table_info(works)").fetchall())

# %% [markdown]
# Use pandas to write the whole data frame into the SQL table.

# %%
data = pd.read_csv("works.csv")
data.to_sql('works', con, if_exists="append", index=False)

# %% [markdown]
# Task: add an index on the salary column. Does the file size change
# afterwards, and by how much?

# %%
# Commit first so the measured size reflects everything written so far.
con.commit()
weight_before = os.path.getsize(r'works.sqlite')
print(f"Размер до создания индекса: {weight_before}.")

# %%
# IF NOT EXISTS keeps this cell re-runnable on its own.
cur.execute("CREATE INDEX IF NOT EXISTS salary_index ON works (salary)")
con.commit()

# %%
weight_after = os.path.getsize(r'works.sqlite')
print(f"Размер после создания индекса: {weight_after} байт(а).")
# Growth is "after minus before"; the operands used to be swapped, which
# reported the change with the wrong sign.
print(f"Разница: {weight_after - weight_before} байт(а).")

# %% [markdown]
# NOTE(review): the recorded run of the CREATE INDEX cell failed with
# "OperationalError: database is locked" (presumably another connection to
# works.sqlite held an uncommitted write transaction), so the earlier
# conclusion "the file size does not change after adding the index" was not
# measured with an index in place. Re-run the three cells above and record
# the printed difference.

# %% [markdown]
# Task: print the number of records.

# %%
print(cur.execute("SELECT COUNT(*) FROM works").fetchone())

# %% [markdown]
# Result recorded by the author: 32683 records.

# %% [markdown]
# Task: print the number of men and women.

# %%
count = cur.execute("SELECT gender, count(*) FROM works GROUP BY works.gender").fetchall()
print(count)

# %% [markdown]
# Task: how many records have skills filled in?

# %%
skill_not_null = cur.execute("SELECT COUNT(*) FROM works WHERE skills IS NOT NULL").fetchone()
print(skill_not_null)

# %% [markdown]
# Result recorded by the author: 8972 rows have the skills column filled in.
# (This sentence used to live in a code cell, where it is a syntax error.)

# %% [markdown]
# Task: get the filled-in skills.

# %%
skills = cur.execute("SELECT skills FROM works WHERE skills IS NOT NULL").fetchall()
print(skills)
# %% [markdown]
# Task: print the salary only for people whose skills mention Python.

# %%
# Each spelling needs its own "lower(skills) LIKE ?" term: the earlier query
# ended in "... LIKE '%питон%' LIKE '%пайтон%'", which compares the 0/1 result
# of the first LIKE with the second pattern and therefore never matches.
# SQLite's built-in lower() and LIKE fold case for ASCII letters only (by
# default), so the capitalised Cyrillic spellings are listed explicitly.
python_patterns = ("%python%", "%питон%", "%Питон%", "%пайтон%", "%Пайтон%")
python_where = " OR ".join("lower(skills) LIKE ?" for _ in python_patterns)
python_skills = cur.execute(
    f"SELECT salary FROM works WHERE {python_where}",
    python_patterns,
).fetchall()
print(python_skills)

# %% [markdown]
# Task: build salary percentiles (deciles) for men and women.

# %%
# `genders` was never defined in the notebook; read the distinct values from
# the table so the loop does not depend on hidden state.
genders = [
    row[0]
    for row in cur.execute(
        "SELECT DISTINCT gender FROM works WHERE gender IS NOT NULL"
    ).fetchall()
]

q = [i / 10 for i in range(1, 11)]
for gender in genders:
    # Bound parameter instead of splicing the value into the SQL text.
    rows = cur.execute(
        "SELECT salary FROM works WHERE gender = ? AND salary IS NOT NULL",
        (gender,),
    ).fetchall()
    salaries = [slr[0] for slr in rows]
    if not salaries:
        # np.quantile raises on an empty array; nothing to report here.
        continue
    res = np.quantile(np.array(salaries), q)
    print(f"{gender=}")
    print(*[f"{i} -> {j}" for i, j in zip(q, res)], sep="\n")
# %% [markdown]
# Task: plot the salary distribution for men and women, additionally split
# by education type.

# %%
# `genders` is not defined anywhere earlier in the notebook, so it is read
# from the table here.
genders = [
    row[0]
    for row in cur.execute(
        "SELECT DISTINCT gender FROM works WHERE gender IS NOT NULL"
    ).fetchall()
]

# The fetched rows used to be unpacked from an undefined name `educations`.
gender_edu = cur.execute("SELECT DISTINCT educationType FROM works WHERE educationType IS NOT NULL").fetchall()
gender_edu = [ed[0] for ed in gender_edu]

for gender in genders:
    for education in gender_edu:
        # Bound parameters; the loop variable is `education` (the earlier
        # f-string referenced an undefined `educationType`).
        rows = cur.execute(
            "SELECT salary FROM works "
            "WHERE gender = ? AND educationType = ? AND salary IS NOT NULL",
            (gender, education),
        ).fetchall()
        salaries = [i[0] for i in rows]

        plt.figure(figsize=(10, 5))
        plt.grid()
        plt.title(f"Распределение по заработной плате для пол : '{gender}' с образованием: '{education}'")
        plt.xlabel("Зарплата")
        plt.ylabel("Количество")
        plt.hist(salaries, bins=25)

# Needed when this runs outside an inline notebook backend.
plt.show()

# %% [markdown]
# Homework: extract separate entities. Create a separate table for gender,
# fill it with values and reference it from `works` through a foreign key.
# Create a separate table for education as well.
# %% [markdown]
# Create a separate gender table, fill it with values and reference it from
# `works` through a foreign key.
#
# One connection is used for the whole step. Previously the connection was
# bound to a name spelled with a Cyrillic "с" while `to_sql` and the final
# `commit()` went through the other connection `con`; the transaction opened
# by the INSERT statements below was then never committed, which is the
# likely source of the "database is locked" error recorded in this notebook.

# %%
conn = sqlite3.connect('works.sqlite')
cursor = conn.cursor()
# Foreign-key enforcement is off by default and is a per-connection setting.
cursor.execute('PRAGMA foreign_keys = true')

# %% [markdown]
# Recreate and fill the source table.

# %%
# Drop a leftover from an interrupted earlier run first so the step is
# re-runnable.
cursor.execute('DROP TABLE IF EXISTS new_works')
cursor.execute('DROP TABLE IF EXISTS works')
cursor.execute('CREATE TABLE works ('
               'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
               'salary INTEGER,'
               'educationType TEXT,'
               'jobTitle TEXT,'
               'qualification TEXT,'
               'gender TEXT,'
               'dateModify TEXT,'
               'skills TEXT,'
               'otherInfo TEXT)')

# %%
df = pd.read_csv("works.csv")

# %% [markdown]
# Helper that strips HTML-like tags from a value.

# %%
_TAG_RE = re.compile(r'<[^>]*>')


def return_tags(value):
    """Return *value* as text with every ``<...>`` tag removed.

    Missing values (``None``/NaN) are returned as ``None`` rather than being
    converted to the literal string ``'nan'``, so they are still stored as
    SQL NULL and ``skills IS NOT NULL`` keeps meaning "filled in".
    """
    if pd.isna(value):
        return None
    return _TAG_RE.sub('', str(value))


# %% [markdown]
# Strip the tags from the text columns and write the frame into `works`.

# %%
df['skills'] = df['skills'].apply(return_tags)
df['otherInfo'] = df['otherInfo'].apply(return_tags)
df.to_sql("works", conn, if_exists='append', index=False)

# %% [markdown]
# Create and fill the lookup tables `genders` and `educations`.

# %%
cursor.execute('DROP TABLE IF EXISTS genders')
cursor.execute('CREATE TABLE genders(genderName TEXT PRIMARY KEY )')
cursor.execute('INSERT INTO genders SELECT DISTINCT gender FROM works WHERE gender IS NOT NULL')
cursor.execute('DROP TABLE IF EXISTS educations')
cursor.execute('CREATE TABLE educations(educationType TEXT PRIMARY KEY )')
cursor.execute('INSERT INTO educations SELECT DISTINCT educationType FROM works WHERE works.educationType IS NOT NULL')

# %% [markdown]
# Separate table for education: rebuild `works` with foreign keys to both
# lookup tables (built as `new_works`, then renamed over the old table).

# %%
cursor.execute('CREATE TABLE new_works ('
               'ID INTEGER PRIMARY KEY AUTOINCREMENT,'
               'salary INTEGER,'
               'educationType TEXT REFERENCES educations(educationType) ON DELETE CASCADE ON UPDATE CASCADE,'
               'jobTitle TEXT,'
               'qualification TEXT,'
               'gender TEXT REFERENCES genders(genderName) ON DELETE CASCADE ON UPDATE CASCADE,'
               'dateModify TEXT,'
               'skills TEXT,'
               'otherInfo TEXT)')
cursor.execute('INSERT INTO new_works SELECT * FROM works')
cursor.execute('DROP TABLE works')
cursor.execute('ALTER TABLE new_works RENAME TO works')
# Commit on the connection that actually executed the statements above.
conn.commit()