Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 255 additions & 0 deletions parcing_4.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"from pprint import pprint\n",
"from bs4 import BeautifulSoup as bs\n",
"import requests\n",
"import pymongo as pm"
],
"metadata": {
"id": "NetlqVRWTCC1"
},
"execution_count": 48,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Task 1 Парсинг вакансий с hh.ru"
],
"metadata": {
"id": "hl1lg9CwUmVA"
}
},
{
"cell_type": "code",
"source": [
"# Browser-like User-Agent string sent with every request to hh.ru.\n",
"User_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.0.2419 Yowser/2.5 Safari/537.36'\n",
"# First page of the hh.ru vacancy search for the text 'Data Scientist'.\n",
"URL_first_page = 'https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1'\n",
"\n",
"# HTTP headers shared by all requests of the crawl.\n",
"headers = {'User-Agent': User_agent}"
],
"metadata": {
"id": "b-loiR9aTCID"
},
"execution_count": 49,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def parse_hh(url_page, headers, result=None, index_page=1):\n",
"    \"\"\"Collect vacancies from an hh.ru search page and every page that follows it.\n",
"\n",
"    Args:\n",
"        url_page: URL of the search-results page to start from.\n",
"        headers: HTTP headers passed to ``requests.get``.\n",
"        result: Optional list to append parsed vacancies to. A new list is\n",
"            created when omitted, so separate calls do not share state.\n",
"        index_page: Number printed for the first page in the progress log.\n",
"\n",
"    Returns:\n",
"        The list of dicts produced by ``parse_vacancy_hh``, one per vacancy card.\n",
"    \"\"\"\n",
"    # A mutable default ([]) would be shared by every call, so re-running the\n",
"    # crawl would silently append to the vacancies of the previous run.\n",
"    if result is None:\n",
"        result = []\n",
"\n",
"    # Follow the pager in a loop so the call depth does not grow with the page count.\n",
"    while True:\n",
"        # The timeout keeps a stalled connection from hanging the notebook forever.\n",
"        response = requests.get(url_page, headers=headers, timeout=30)\n",
"        if response.status_code != 200:\n",
"            break\n",
"        print('Cтраница №%d, ссылка: %s' % (index_page, response.url))\n",
"\n",
"        dom = bs(response.content, 'html.parser')\n",
"        for vacancy in dom.find_all('div', {'class': 'vacancy-serp-item__layout'}):\n",
"            result.append(parse_vacancy_hh(vacancy))\n",
"\n",
"        # The pager-next anchor carries a site-relative href; no anchor means last page.\n",
"        link_next_page = dom.find('a', {'data-qa': 'pager-next'})\n",
"        if not link_next_page:\n",
"            break\n",
"        url_page = 'https://hh.ru' + link_next_page['href']\n",
"        index_page += 1\n",
"\n",
"    print('Парсинг завершен')\n",
"    return result\n",
" \n",
"def parse_vacancy_hh(dom_vacancy):\n",
"    \"\"\"Turn one vacancy card from the search results into a flat dict.\n",
"\n",
"    Args:\n",
"        dom_vacancy: BeautifulSoup element of a single vacancy card.\n",
"\n",
"    Returns:\n",
"        Dict with the keys ``vacancy_name``, ``vacancy_salary`` (raw text or None),\n",
"        ``min_salary``, ``max_salary``, ``currency_salary``, ``vacancy_link`` and\n",
"        ``vacancy_source`` (always 'hh.ru').\n",
"    \"\"\"\n",
"    # Look the link up once; assumes the first <a> of the card is the vacancy title.\n",
"    title_link = dom_vacancy.find('a')\n",
"    # A card without any link yields None fields instead of aborting the whole crawl.\n",
"    vacancy_name = title_link.text if title_link is not None else None\n",
"    vacancy_link = title_link.get('href') if title_link is not None else None\n",
"\n",
"    # The attribute filter must be a dict; it was written as a set literal\n",
"    # ({'class', ...}) before, which is not an attribute mapping.\n",
"    vacancy_salary = dom_vacancy.find('span', {'class': 'bloko-header-section-3'})\n",
"    if vacancy_salary is not None:\n",
"        vacancy_salary = vacancy_salary.text\n",
"        min_salary, max_salary, currency_salary = clean_salary(vacancy_salary)\n",
"    else:\n",
"        min_salary, max_salary, currency_salary = None, None, None\n",
"\n",
"    return {\n",
"        'vacancy_name': vacancy_name,\n",
"        'vacancy_salary': vacancy_salary,\n",
"        'min_salary': min_salary,\n",
"        'max_salary': max_salary,\n",
"        'currency_salary': currency_salary,\n",
"        'vacancy_link': vacancy_link,\n",
"        'vacancy_source': 'hh.ru',\n",
"    }\n",
"\n",
"def clean_salary(vacancy_salary_text, min_salary=None, max_salary=None, currency_salary=None):\n",
"    \"\"\"Extract the salary bounds and the currency from a salary string.\n",
"\n",
"    Recognised markers are 'от' (lower bound follows), 'до' (upper bound\n",
"    follows) and '–' (range: the tokens on either side of it). Narrow\n",
"    no-break spaces (U+202F) are removed first so digit groups join into one number.\n",
"\n",
"    Args:\n",
"        vacancy_salary_text: Raw salary text.\n",
"        min_salary: Value returned when the text gives no lower bound.\n",
"        max_salary: Value returned when the text gives no upper bound.\n",
"        currency_salary: Value returned when the text has fewer than two tokens.\n",
"\n",
"    Returns:\n",
"        Tuple ``(min_salary, max_salary, currency_salary)``; the currency is the\n",
"        last whitespace-separated token of the text.\n",
"    \"\"\"\n",
"    tokens = vacancy_salary_text.replace('\\u202f', '').split()\n",
"    if len(tokens) < 2:\n",
"        # A single token cannot hold both a marker and an amount.\n",
"        return min_salary, max_salary, currency_salary\n",
"\n",
"    def _amount(token):\n",
"        # Non-numeric neighbours (stray words, notes) give None instead of raising.\n",
"        try:\n",
"            return int(token)\n",
"        except ValueError:\n",
"            return None\n",
"\n",
"    for position, token in enumerate(tokens[:-1]):\n",
"        following = _amount(tokens[position + 1])\n",
"        if token == 'от':\n",
"            if following is not None:\n",
"                min_salary = following\n",
"        elif token == 'до':\n",
"            if following is not None:\n",
"                max_salary = following\n",
"        elif token == '–':\n",
"            # position - 1 must not wrap around to the last token when the dash comes first.\n",
"            preceding = _amount(tokens[position - 1]) if position > 0 else None\n",
"            if preceding is not None:\n",
"                min_salary = preceding\n",
"            if following is not None:\n",
"                max_salary = following\n",
"\n",
"    currency_salary = tokens[-1]\n",
"    return min_salary, max_salary, currency_salary"
],
"metadata": {
"id": "nVxbHP0RTNu8"
},
"execution_count": 50,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Crawl the search results starting from URL_first_page and keep the parsed vacancies.\n",
"result = parse_hh(URL_first_page, headers)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "knn1xMkaTNyK",
"outputId": "8eacecfe-cc26-425f-fbe5-78cca625caf6"
},
"execution_count": 51,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cтраница №1, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1\n",
"Cтраница №2, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=1&hhtmFrom=vacancy_search_list\n",
"Cтраница №3, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=2&hhtmFrom=vacancy_search_list\n",
"Cтраница №4, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=3&hhtmFrom=vacancy_search_list\n",
"Cтраница №5, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=4&hhtmFrom=vacancy_search_list\n",
"Cтраница №6, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=5&hhtmFrom=vacancy_search_list\n",
"Cтраница №7, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=6&hhtmFrom=vacancy_search_list\n",
"Cтраница №8, ссылка: https://hh.ru/search/vacancy?search_field=name&search_field=company_name&search_field=description&text=Data+Scientist&from=suggest_post&customDomain=1&page=7&hhtmFrom=vacancy_search_list\n",
"Парсинг завершен\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Show the first parsed vacancy and the total; an empty crawl prints None\n",
"# instead of raising IndexError on result[0].\n",
"first_vacancy = result[0] if result else None\n",
"print(f'Вакансия: {first_vacancy}\\nКол-во вакансий: {len(result)}')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LiWF5lKiU23P",
"outputId": "64e2e897-f8ba-422b-a2a5-462f89b4039c"
},
"execution_count": 52,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Вакансия: {'vacancy_name': 'Специалист по углубленной аналитике данных (Data scientist)', 'vacancy_salary': None, 'min_salary': None, 'max_salary': None, 'currency_salary': None, 'vacancy_link': 'https://hh.ru/vacancy/72920605?from=vacancy_search_list&query=Data+Scientist', 'vacancy_source': 'hh.ru'}\n",
"Кол-во вакансий: 143\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Task 2 Перенос данных в базу"
],
"metadata": {
"id": "DmkARgQrVzjK"
}
},
{
"cell_type": "code",
"source": [
"# MongoClient() is called without arguments, so pymongo's default connection settings apply.\n",
"client = pm.MongoClient()\n",
"\n",
"# Database and collection that store the parsed vacancies.\n",
"db = client['vacancies_ds']\n",
"collection_of_ds_vacancies = db['Data_Scientist']"
],
"metadata": {
"id": "4gOSlK6TTzfD"
},
"execution_count": 53,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def check_and_save_vacancies_in_db(vacancies):\n",
"    \"\"\"Insert every vacancy whose link is not stored yet into ``collection_of_ds_vacancies``.\n",
"\n",
"    Args:\n",
"        vacancies: Iterable of dicts that each contain a 'vacancy_link' key.\n",
"\n",
"    Returns:\n",
"        None. Vacancies whose link already exists in the collection are skipped.\n",
"    \"\"\"\n",
"    for vacancy in vacancies:\n",
"        link_filter = {'vacancy_link': vacancy['vacancy_link']}\n",
"        # find_one stops at the first match instead of loading every matching\n",
"        # document into a list just to test for existence; only _id is fetched.\n",
"        if collection_of_ds_vacancies.find_one(link_filter, {'_id': 1}) is None:\n",
"            collection_of_ds_vacancies.insert_one(vacancy)"
],
"metadata": {
"id": "6r-cAODZT1hG"
},
"execution_count": 54,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Store the vacancies that are not in the collection yet, then read the whole collection back.\n",
"check_and_save_vacancies_in_db(result)\n",
"result_find = list(collection_of_ds_vacancies.find())"
],
"metadata": {
"id": "Qh8URYTKT1k4"
},
"execution_count": 55,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Number of documents read back from the collection.\n",
"len(result_find)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7gGUia1IUC0r",
"outputId": "2d2a932f-45f6-4dff-e68f-390406678133"
},
"execution_count": 56,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"143"
]
},
"metadata": {},
"execution_count": 56
}
]
}
]
}