"""Scrape "Data Scientist" vacancies from hh.ru and store them in MongoDB.

Task 1 walks the paginated hh.ru search results and parses every vacancy card.
Task 2 saves the parsed vacancies into a MongoDB collection, skipping vacancies
whose link is already stored.
"""
from pprint import pprint
from urllib.parse import urljoin

# requests, bs4 and pymongo are imported at their point of use (parse_hh and
# the run section below) so that the pure parsing helpers remain importable
# and testable without the network / database stack.

User_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.0.2419 Yowser/2.5 Safari/537.36'
URL_first_page = 'https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1'

headers = {
    'User-Agent': User_agent,
}

# Base used to resolve the pager's "next" href into a full URL.
HH_BASE_URL = 'https://hh.ru'
# Seconds to wait for hh.ru before giving up; without a timeout a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30
# Tokens accepted as the separator of a "<min> – <max>" salary range.
SALARY_RANGE_DASHES = ('–', '—', '-')


def parse_hh(url_page, headers, result=None, index_page=1):
    """Collect vacancies from ``url_page`` and every following result page.

    Args:
        url_page: URL of the first search-results page to fetch.
        headers: HTTP headers sent with every request.
        result: Optional list to append parsed vacancies to. A fresh list is
            created when omitted, so repeated calls do not share state.
        index_page: Number printed for the first fetched page; incremented
            for each following page.

    Returns:
        The list of vacancy dicts produced by ``parse_vacancy_hh``. Parsing
        stops at the first non-200 response or when the page has no
        ``pager-next`` link.
    """
    import requests
    from bs4 import BeautifulSoup

    if result is None:
        result = []

    # Iterate instead of recursing: one loop turn per results page.
    while True:
        response = requests.get(url_page, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print('Парсинг завершен')
            return result
        print('Cтраница №%d, ссылка: %s' % (index_page, response.url))

        dom = BeautifulSoup(response.content, 'html.parser')
        for vacancy in dom.find_all('div', {'class': 'vacancy-serp-item__layout'}):
            result.append(parse_vacancy_hh(vacancy))

        link_next_page = dom.find('a', {'data-qa': 'pager-next'})
        if link_next_page is None:
            print('Парсинг завершен')
            return result

        # urljoin resolves root-relative hrefs against the site and leaves an
        # already absolute href untouched.
        url_page = urljoin(HH_BASE_URL, link_next_page['href'])
        index_page += 1


def parse_vacancy_hh(dom_vacancy):
    """Turn one vacancy card into a flat dict.

    Args:
        dom_vacancy: Parsed element of a single vacancy card; only its
            ``find`` method is used.

    Returns:
        Dict with the keys ``vacancy_name``, ``vacancy_salary``,
        ``min_salary``, ``max_salary``, ``currency_salary``, ``vacancy_link``
        and ``vacancy_source``. Values that are absent from the card are None.
    """
    # The first anchor of the card supplies both the title and the link.
    title_link = dom_vacancy.find('a')
    if title_link is not None:
        vacancy_name = title_link.text
        vacancy_link = title_link.get('href')
    else:
        vacancy_name = None
        vacancy_link = None

    # attrs must be a dict ({'class': ...}); a set literal here would be a typo.
    salary_tag = dom_vacancy.find('span', {'class': 'bloko-header-section-3'})
    if salary_tag is not None:
        vacancy_salary = salary_tag.text
        min_salary, max_salary, currency_salary = clean_salary(vacancy_salary)
    else:
        vacancy_salary = None
        min_salary, max_salary, currency_salary = None, None, None

    return {
        'vacancy_name': vacancy_name,
        'vacancy_salary': vacancy_salary,
        'min_salary': min_salary,
        'max_salary': max_salary,
        'currency_salary': currency_salary,
        'vacancy_link': vacancy_link,
        'vacancy_source': 'hh.ru',
    }


def _salary_amount(tokens, position):
    """Return ``tokens[position]`` as an int, or None if it is missing or not a number."""
    if not 0 <= position < len(tokens):
        return None
    try:
        return int(tokens[position])
    except ValueError:
        return None


def clean_salary(vacancy_salary_text, min_salary=None, max_salary=None, currency_salary=None):
    """Split a salary caption into lower bound, upper bound and currency.

    Recognised shapes: ``от <min>``, ``до <max>``, ``от <min> до <max>`` and
    ``<min> – <max>``, each followed by a currency token.

    Args:
        vacancy_salary_text: Raw salary caption of a vacancy card.
        min_salary: Value returned when no lower bound is found.
        max_salary: Value returned when no upper bound is found.
        currency_salary: Value returned when no currency is found.

    Returns:
        Tuple ``(min_salary, max_salary, currency_salary)``. The currency is
        the token that directly follows the last amount in the caption.
    """
    # Narrow no-break spaces (U+202F) are removed so grouped digits such as
    # "100 000" collapse into one numeric token before splitting
    # (presumably hh.ru's thousands separator).
    tokens = vacancy_salary_text.replace('\u202f', '').split()

    for position, token in enumerate(tokens):
        lower = upper = None
        if token == 'от':
            lower = _salary_amount(tokens, position + 1)
        elif token == 'до':
            upper = _salary_amount(tokens, position + 1)
        elif token in SALARY_RANGE_DASHES:
            lower = _salary_amount(tokens, position - 1)
            upper = _salary_amount(tokens, position + 1)
        if lower is not None:
            min_salary = lower
        if upper is not None:
            max_salary = upper

    # Take the currency from right after the last amount rather than from the
    # end of the caption, so trailing words do not end up as the currency.
    amount_positions = [
        position for position in range(len(tokens))
        if _salary_amount(tokens, position) is not None
    ]
    if amount_positions and amount_positions[-1] + 1 < len(tokens):
        currency_salary = tokens[amount_positions[-1] + 1]

    return min_salary, max_salary, currency_salary


def check_and_save_vacancies_in_db(vacancies, collection=None):
    """Insert the vacancies whose link is not stored yet.

    Args:
        vacancies: Iterable of vacancy dicts, each with a ``vacancy_link`` key.
        collection: Target collection exposing ``find_one`` and ``insert_one``.
            Defaults to the module-level ``collection_of_ds_vacancies``.

    Returns:
        Number of vacancies that were inserted.
    """
    if collection is None:
        collection = collection_of_ds_vacancies

    inserted = 0
    for vacancy in vacancies:
        # find_one stops at the first match instead of loading every
        # matching document just to test for existence.
        if collection.find_one({'vacancy_link': vacancy['vacancy_link']}) is None:
            collection.insert_one(vacancy)
            inserted += 1
    return inserted


# Run section. In a notebook kernel or when executed as a script __name__ is
# '__main__', so the globals below are created as before; a plain import of
# this module performs no network or database I/O.
if __name__ == '__main__':
    import pymongo as pm

    # Task 1: parse vacancies from hh.ru
    result = parse_hh(URL_first_page, headers)
    print(f'Вакансия: {result[0] if result else None}\nКол-во вакансий: {len(result)}')

    # Task 2: move the data into the database
    client = pm.MongoClient()

    db = client['vacancies_ds']
    collection_of_ds_vacancies = db['Data_Scientist']

    check_and_save_vacancies_in_db(result)
    result_find = list(collection_of_ds_vacancies.find())
    print(len(result_find))
No newline at end of file