Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added database/__init__.py
Empty file.
42 changes: 42 additions & 0 deletions database/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sqlalchemy.exc
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from . import models


class Database:
    """Thin persistence layer: creates the schema and stores parsed posts."""

    def __init__(self, db_url):
        """Create the engine, ensure all tables exist and prepare a session factory.

        :param db_url: SQLAlchemy connection string, e.g. "sqlite:///gb_blog.db".
        """
        self.engine = create_engine(db_url)
        models.Base.metadata.create_all(bind=self.engine)
        self.maker = sessionmaker(bind=self.engine)

    def get_or_create(self, session, model, **kwargs):
        """Return the first *model* row matching **kwargs, or a new unsaved instance.

        A newly created instance is not added to the session here; it is
        persisted later by cascade when the owning Post is added.
        """
        instance = session.query(model).filter_by(**kwargs).first()
        if not instance:
            instance = model(**kwargs)
        return instance

    def add_post(self, data):
        """Persist one parsed post together with its author, tags and comments.

        :param data: dict with keys "post_data", "author_data", "tags_data"
            and "comments_data", as produced by the parser.
        """
        session = self.maker()
        post = models.Post(
            **data["post_data"],
            author=self.get_or_create(session, models.Author, **data["author_data"]),
            tags=[self.get_or_create(session, models.Tag, **tag_params) for tag_params in data["tags_data"]],
            comments=[models.Comment(**comment_params) for comment_params in data["comments_data"]]
        )
        # BUGFIX: do NOT also append `post` to each tag's `posts` collection.
        # Post.tags and Tag.posts are two independent relationships over the
        # same tag_post table, so writing both sides inserted duplicate
        # association rows on flush.

        # Parent comments arrive in the same batch and are not yet committed,
        # so resolve parents from the in-memory batch first and fall back to
        # a DB lookup only for comments stored on a previous run.
        # BUGFIX: the old unconditional `itm.parent = query(...).first()`
        # assigned None for in-batch parents, nulling parent_id on flush.
        comments_by_id = {itm.id: itm for itm in post.comments}
        for itm in post.comments:
            if itm.parent_id:
                parent = comments_by_id.get(itm.parent_id) or session.query(
                    models.Comment
                ).filter_by(id=itm.parent_id).first()
                if parent is not None:
                    itm.parent = parent

        try:
            session.add(post)
            session.commit()
        except sqlalchemy.exc.IntegrityError:
            # Post (unique url) already stored — skip it.
            session.rollback()
        finally:
            session.close()
9 changes: 9 additions & 0 deletions database/mixins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from sqlalchemy import Column, Integer, String


class IdMixin:
    """Mixin supplying an auto-incrementing integer surrogate primary key."""

    id = Column(Integer, autoincrement=True, primary_key=True)


class UrlMixin:
    """Mixin supplying a required, unique source-URL column."""

    url = Column(String, nullable=False, unique=True)
52 changes: 52 additions & 0 deletions database/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

from sqlalchemy import Column, Integer, String, ForeignKey, Text, Table, DateTime

from .mixins import UrlMixin


# Single declarative base shared by every model in this package.
Base = declarative_base()

# Association table implementing the many-to-many link between posts and tags.
tag_post = Table(
    "tag_post",
    Base.metadata,
    Column("post_id", Integer, ForeignKey("post.id")),
    Column("tag_id", Integer, ForeignKey("tag.id")),
)


class Post(Base, UrlMixin):
    """A scraped blog post; `url` (from UrlMixin) is the unique natural key."""

    __tablename__ = "post"
    # NOTE(review): the parser supplies `id` from the site's commentable-id
    # attribute, so autoincrement is effectively bypassed for scraped rows.
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(250), nullable=False, unique=False)
    # First image inside the post body; some articles have none.
    img_url = Column(String, unique=False, nullable=True)
    date_published = Column(DateTime, unique=False, nullable=True)
    author_id = Column(Integer, ForeignKey("author.id"), nullable=True)
    author = relationship("Author", backref="posts")
    # NOTE(review): Tag.posts is a second, independent relationship over the
    # same tag_post secondary; without back_populates the two collections are
    # not kept in sync and writing both sides duplicates association rows.
    tags = relationship("Tag", secondary=tag_post)


class Author(Base, UrlMixin):
    """Post author, identified by the unique profile URL from UrlMixin."""

    __tablename__ = "author"
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(150), nullable=False)


class Tag(Base, UrlMixin):
    """Post tag, linked to posts through the tag_post association table."""

    __tablename__ = "tag"
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(150), nullable=False)
    # NOTE(review): duplicates Post.tags over the same secondary table
    # without back_populates — the two sides are not synchronized.
    posts = relationship(Post, secondary=tag_post)


class Comment(Base):
    """A comment on a post; threads are modeled via the self-referential
    parent_id adjacency list."""

    __tablename__ = "comment"
    # NOTE(review): `id` comes from the site's comment API, so autoincrement
    # is effectively bypassed for scraped rows.
    id = Column(Integer, primary_key=True, autoincrement=True)
    body = Column(Text, nullable=False, unique=False)
    created_at = Column(DateTime, unique=False, nullable=True)
    author_of_comment = Column(String(150), nullable=False)
    post_id = Column(Integer, ForeignKey("post.id"), nullable=False)
    parent_id = Column(Integer, ForeignKey("comment.id"), nullable=True)
    post = relationship(Post, backref="comments")
    # BUGFIX: a self-referential many-to-one needs remote_side=[id]; without
    # it SQLAlchemy resolves the join in the one-to-many direction, so
    # `parent` did not behave as a scalar link to the parent comment.
    parent = relationship("Comment", uselist=False, remote_side=[id], post_update=True)
131 changes: 131 additions & 0 deletions gb_blog_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import time
import typing
import datetime
import requests
from urllib.parse import urljoin
import bs4

from database.database import Database


class GbBlogParse:
    """Crawler for the GeekBrains blog.

    Walks the feed pagination, schedules one parse task per post page and
    hands every parsed post to the database wrapper *db*.
    """

    def __init__(self, start_url, db):
        self.time = time.time()  # timestamp of the previous HTTP request (throttle)
        self.start_url = start_url
        self.db = db
        self.done_urls = set()  # URLs already scheduled, to avoid re-parsing
        self.tasks = []
        start_task = self.get_task(self.start_url, self.parse_feed)
        self.tasks.append(start_task)
        self.done_urls.add(self.start_url)

    def _get_response(self, url, *args, **kwargs):
        """GET *url*, politely throttling to roughly one request per second.

        BUGFIX: the original condition was inverted — it slept only when
        MORE than 0.9 s had already passed since the previous request,
        i.e. exactly when no delay was needed, and never throttled rapid
        back-to-back requests.
        """
        if time.time() < self.time + 0.9:
            time.sleep(0.5)
        response = requests.get(url, *args, **kwargs)
        self.time = time.time()
        print(url)
        return response

    def _get_soup(self, url, *args, **kwargs):
        """Download *url* and return its parsed BeautifulSoup tree."""
        soup = bs4.BeautifulSoup(self._get_response(url, *args, **kwargs).text, "lxml")
        return soup

    def get_task(self, url: str, callback: typing.Callable) -> typing.Callable:
        """Wrap (url, callback) into a deferred task; no-op for known URLs."""

        def task():
            soup = self._get_soup(url)
            return callback(url, soup)

        if url in self.done_urls:
            return lambda *_, **__: None
        self.done_urls.add(url)
        return task

    def task_creator(self, url, tags_list, callback):
        """Schedule one task per unique absolute link found in *tags_list*."""
        links = set(
            urljoin(url, itm.attrs.get("href")) for itm in tags_list if itm.attrs.get("href")
        )
        for link in links:
            task = self.get_task(link, callback)
            self.tasks.append(task)

    def parse_feed(self, url, soup):
        """Schedule the other pagination pages and every post on this page."""
        ul_pagination = soup.find("ul", attrs={"class": "gb__pagination"})
        self.task_creator(url, ul_pagination.find_all("a"), self.parse_feed)
        post_wrapper = soup.find("div", attrs={"class": "post-items-wrapper"})
        self.task_creator(
            url, post_wrapper.find_all("a", attrs={"class": "post-item__title"}), self.parse_post
        )

    def parse_post(self, url, soup):
        """Extract post, author, tags and comments data from one post page."""
        author_tag = soup.find("div", attrs={"itemprop": "author"})

        first_img = soup.find("div", attrs={"class": "blogpost-content"}).find("img")
        if first_img:  # some articles have no image at all
            first_img_link = first_img.attrs.get("src")
        else:
            first_img_link = None

        date_published = soup.find("div", attrs={"class": "blogpost-date-views"}).find("time").attrs.get("datetime")
        date_obj = datetime.datetime.strptime(date_published, '%Y-%m-%dT%H:%M:%S%z')

        data = {
            "post_data": {
                "title": soup.find("h1", attrs={"class": "blogpost-title"}).text,
                "url": url,
                # the site's commentable-id doubles as the post's primary key
                "id": soup.find("comments").attrs.get("commentable-id"),
                "img_url": first_img_link,
                "date_published": date_obj,
            },
            "author_data": {
                "url": urljoin(url, author_tag.parent.attrs.get("href")),
                "name": author_tag.text,
            },
            "tags_data": [
                {"name": tag.text, "url": urljoin(url, tag.attrs.get("href"))}
                for tag in soup.find_all("a", attrs={"class": "small"})
            ],
            "comments_data": self._get_comments(soup.find("comments").attrs.get("commentable-id")),
        }
        return data

    def _get_comments(self, post_id):
        """Fetch and flatten the comment tree for *post_id* from the JSON API."""
        api_path = f"/api/v2/comments?commentable_type=Post&commentable_id={post_id}&order=desc"
        response = self._get_response(urljoin(self.start_url, api_path))
        data = response.json()
        data = self._restructure_comments(data, post_id)
        return data

    def _restructure_comments(self, data, post_id):
        """Flatten the nested comment tree into a list of model-ready dicts.

        Parents are appended before their children, which the database layer
        relies on when wiring up parent links.
        """
        comment_keys = ('id', 'parent_id', 'body', 'created_at')
        result = []
        for itm in data:
            tmp_dict = {}
            for param in comment_keys:
                tmp_dict.update({param: itm["comment"][param]})
            tmp_dict.update({"post_id": post_id})
            tmp_dict.update({"author_of_comment": itm["comment"]["user"]['full_name']})
            # Strip the milliseconds from 'created_at': parsing with the
            # format '%Y-%m-%dT%H:%M:%S.%f%z' raised an error on this feed.
            tmp_dict['created_at'] = tmp_dict['created_at'][:19] + tmp_dict['created_at'][23:]
            tmp_dict['created_at'] = datetime.datetime.strptime(tmp_dict['created_at'], '%Y-%m-%dT%H:%M:%S%z')
            result.append(tmp_dict)
            if itm["comment"]["children"]:
                for child in self._restructure_comments(itm["comment"]["children"], post_id):
                    result.append(child)
        return result

    def run(self):
        """Execute scheduled tasks; new tasks may be appended while iterating."""
        for task in self.tasks:
            task_result = task()
            if isinstance(task_result, dict):
                self.save(task_result)

    def save(self, data):
        """Persist one parsed post via the database wrapper."""
        self.db.add_post(data)


if __name__ == "__main__":
# collection = MongoClient()["gb_parse_20_04"]["gb_blog"]
db = Database("sqlite:///gb_blog.db")
parser = GbBlogParse("https://gb.ru/posts", db)
parser.run()