From 83e2d5efb5a3941249d42a012f603319c94f7b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=98=D0=BB=D1=8C=D1=8F?=
Date: Fri, 30 Apr 2021 18:40:09 +0300
Subject: [PATCH] add homework3

---
Review notes (text in this section is ignored when the patch is applied):
the request throttle was inverted, post/tag links were written twice, and
Comment.parent was mapped in the wrong direction. The blob ids on the
"index" lines below still refer to the content before this review.

 database/__init__.py |   0
 database/database.py |  57 +++++++++++++++++
 database/mixins.py   |   9 +++
 database/models.py   |  63 ++++++++++++++++++
 gb_blog_parse.py     | 148 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 277 insertions(+)
 create mode 100644 database/__init__.py
 create mode 100644 database/database.py
 create mode 100644 database/mixins.py
 create mode 100644 database/models.py
 create mode 100644 gb_blog_parse.py

diff --git a/database/__init__.py b/database/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/database/database.py b/database/database.py
new file mode 100644
index 0000000..f25596c
--- /dev/null
+++ b/database/database.py
@@ -0,0 +1,57 @@
+import sqlalchemy.exc
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from . import models
+
+
+class Database:
+    """Store parsed blog posts in a SQLAlchemy-backed database."""
+
+    def __init__(self, db_url):
+        """Create the engine for ``db_url`` and make sure all tables exist."""
+        self.engine = create_engine(db_url)
+        models.Base.metadata.create_all(bind=self.engine)
+        self.maker = sessionmaker(bind=self.engine)
+
+    def get_or_create(self, session, model, **kwargs):
+        """Return the stored ``model`` row matching ``kwargs`` or a new instance.
+
+        A new instance is not added to ``session`` here; it is saved through
+        the cascade of the post it gets attached to.
+        """
+        instance = session.query(model).filter_by(**kwargs).first()
+        if instance is None:
+            instance = model(**kwargs)
+        return instance
+
+    def add_post(self, data):
+        """Save one parsed post together with its author, tags and comments.
+
+        ``data`` is a dict with the keys ``post_data``, ``author_data``,
+        ``tags_data`` and ``comments_data``. A post that violates a database
+        constraint (for example an already stored url) is rolled back.
+        """
+        session = self.maker()
+        try:
+            post = models.Post(
+                **data["post_data"],
+                author=self.get_or_create(session, models.Author, **data["author_data"]),
+                tags=[
+                    self.get_or_create(session, models.Tag, **tag_params)
+                    for tag_params in data["tags_data"]
+                ],
+                comments=[
+                    models.Comment(**comment_params)
+                    for comment_params in data["comments_data"]
+                ],
+            )
+            # Post.tags and Tag.posts are two sides of one relationship, so
+            # filling post.tags is enough. Comments already carry parent_id
+            # from the API, so no lookup of the parent row is needed.
+            session.add(post)
+            session.commit()
+        except sqlalchemy.exc.IntegrityError:
+            session.rollback()
+        finally:
+            session.close()
diff --git a/database/mixins.py b/database/mixins.py
new file mode 100644
index 0000000..b0a7f50
--- /dev/null
+++ b/database/mixins.py
@@ -0,0 +1,9 @@
+from sqlalchemy import Column, Integer, String
+
+
+class IdMixin:
+    id = Column(Integer, primary_key=True, autoincrement=True)
+
+
+class UrlMixin:
+    url = Column(String, unique=True, nullable=False)
\ No newline at end of file
diff --git a/database/models.py b/database/models.py
new file mode 100644
index 0000000..dc84c60
--- /dev/null
+++ b/database/models.py
@@ -0,0 +1,63 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship
+
+from sqlalchemy import Column, Integer, String, ForeignKey, Text, Table, DateTime
+
+from .mixins import UrlMixin
+
+
+Base = declarative_base()
+
+# Association table for the many-to-many link between posts and tags.
+tag_post = Table(
+    "tag_post",
+    Base.metadata,
+    Column("post_id", Integer, ForeignKey("post.id")),
+    Column("tag_id", Integer, ForeignKey("tag.id")),
+)
+
+
+class Post(Base, UrlMixin):
+    """Blog post; the unique ``url`` column comes from ``UrlMixin``."""
+
+    __tablename__ = "post"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    title = Column(String(250), nullable=False, unique=False)
+    img_url = Column(String, unique=False, nullable=True)
+    date_published = Column(DateTime, unique=False, nullable=True)
+    author_id = Column(Integer, ForeignKey("author.id"), nullable=True)
+    author = relationship("Author", backref="posts")
+    # Paired with Tag.posts so each link is written to tag_post only once.
+    tags = relationship("Tag", secondary=tag_post, back_populates="posts")
+
+
+class Author(Base, UrlMixin):
+    """Author of a blog post."""
+
+    __tablename__ = "author"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(String(150), nullable=False)
+
+
+class Tag(Base, UrlMixin):
+    """Tag attached to blog posts."""
+
+    __tablename__ = "tag"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(String(150), nullable=False)
+    posts = relationship("Post", secondary=tag_post, back_populates="tags")
+
+
+class Comment(Base):
+    """Comment on a post; replies point at their parent through ``parent_id``."""
+
+    __tablename__ = "comment"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    body = Column(Text, nullable=False, unique=False)
+    created_at = Column(DateTime, unique=False, nullable=True)
+    author_of_comment = Column(String(150), nullable=False)
+    post_id = Column(Integer, ForeignKey("post.id"), nullable=False)
+    parent_id = Column(Integer, ForeignKey("comment.id"), nullable=True)
+    post = relationship(Post, backref="comments")
+    # remote_side marks this self-reference as child -> parent (many-to-one).
+    parent = relationship("Comment", remote_side=[id])
diff --git a/gb_blog_parse.py b/gb_blog_parse.py
new file mode 100644
index 0000000..9eb840c
--- /dev/null
+++ b/gb_blog_parse.py
@@ -0,0 +1,148 @@
+import time
+import typing
+import datetime
+import requests
+from urllib.parse import urljoin
+import bs4
+
+from database.database import Database
+
+
+class GbBlogParse:
+    """Crawl the blog feed from ``start_url`` and save every post to ``db``."""
+
+    def __init__(self, start_url, db):
+        """Remember the target database and queue the first feed page."""
+        self.time = time.time()
+        self.start_url = start_url
+        self.db = db
+        self.done_urls = set()
+        # get_task() also records start_url in done_urls.
+        self.tasks = [self.get_task(self.start_url, self.parse_feed)]
+
+    def _get_response(self, url, *args, **kwargs):
+        """GET ``url``, keeping at least 0.9 s between consecutive requests."""
+        elapsed = time.time() - self.time
+        if elapsed < 0.9:
+            # The previous request was too recent: wait out the rest.
+            time.sleep(0.9 - elapsed)
+        response = requests.get(url, *args, **kwargs)
+        self.time = time.time()
+        print(url)
+        return response
+
+    def _get_soup(self, url, *args, **kwargs):
+        """Fetch ``url`` and return its body parsed with the lxml parser."""
+        response = self._get_response(url, *args, **kwargs)
+        return bs4.BeautifulSoup(response.text, "lxml")
+
+    def get_task(self, url: str, callback: typing.Callable) -> typing.Callable:
+        """Return a deferred job that downloads ``url`` and runs ``callback``.
+
+        A url that was already scheduled yields a no-op job instead.
+        """
+
+        def task():
+            soup = self._get_soup(url)
+            return callback(url, soup)
+
+        if url in self.done_urls:
+            return lambda *_, **__: None
+        self.done_urls.add(url)
+        return task
+
+    def task_creator(self, url, tags_list, callback):
+        """Queue a task for every distinct ``href`` found in ``tags_list``."""
+        links = {
+            urljoin(url, itm.attrs.get("href")) for itm in tags_list if itm.attrs.get("href")
+        }
+        for link in links:
+            self.tasks.append(self.get_task(link, callback))
+
+    def parse_feed(self, url, soup):
+        """Queue the pagination pages and the posts linked from a feed page."""
+        ul_pagination = soup.find("ul", attrs={"class": "gb__pagination"})
+        # A page without the expected markup is skipped instead of crashing.
+        if ul_pagination is not None:
+            self.task_creator(url, ul_pagination.find_all("a"), self.parse_feed)
+        post_wrapper = soup.find("div", attrs={"class": "post-items-wrapper"})
+        if post_wrapper is not None:
+            self.task_creator(
+                url,
+                post_wrapper.find_all("a", attrs={"class": "post-item__title"}),
+                self.parse_post,
+            )
+
+    def parse_post(self, url, soup):
+        """Collect post, author, tag and comment data from a post page."""
+        author_tag = soup.find("div", attrs={"itemprop": "author"})
+
+        # Some posts have no images at all.
+        first_img = soup.find("div", attrs={"class": "blogpost-content"}).find("img")
+        first_img_link = first_img.attrs.get("src") if first_img else None
+
+        time_tag = soup.find("div", attrs={"class": "blogpost-date-views"}).find("time")
+        date_obj = datetime.datetime.strptime(
+            time_tag.attrs.get("datetime"), "%Y-%m-%dT%H:%M:%S%z"
+        )
+        post_id = soup.find("comments").attrs.get("commentable-id")
+
+        return {
+            "post_data": {
+                "title": soup.find("h1", attrs={"class": "blogpost-title"}).text,
+                "url": url,
+                "id": post_id,
+                "img_url": first_img_link,
+                "date_published": date_obj,
+            },
+            "author_data": {
+                "url": urljoin(url, author_tag.parent.attrs.get("href")),
+                "name": author_tag.text,
+            },
+            "tags_data": [
+                {"name": tag.text, "url": urljoin(url, tag.attrs.get("href"))}
+                for tag in soup.find_all("a", attrs={"class": "small"})
+            ],
+            "comments_data": self._get_comments(post_id),
+        }
+
+    def _get_comments(self, post_id):
+        """Download the comment tree of ``post_id`` and flatten it."""
+        api_path = f"/api/v2/comments?commentable_type=Post&commentable_id={post_id}&order=desc"
+        response = self._get_response(urljoin(self.start_url, api_path))
+        return self._restructure_comments(response.json(), post_id)
+
+    def _restructure_comments(self, data, post_id):
+        """Flatten nested comments into a list of ``Comment`` keyword dicts."""
+        comment_keys = ("id", "parent_id", "body", "created_at")
+        result = []
+        for itm in data:
+            comment = itm["comment"]
+            row = {key: comment[key] for key in comment_keys}
+            row["post_id"] = post_id
+            row["author_of_comment"] = comment["user"]["full_name"]
+            # Cut the milliseconds out of created_at: parsing with the format
+            # "%Y-%m-%dT%H:%M:%S.%f%z" raised an error.
+            created_at = row["created_at"][:19] + row["created_at"][23:]
+            row["created_at"] = datetime.datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%S%z")
+            result.append(row)
+            if comment["children"]:
+                result.extend(self._restructure_comments(comment["children"], post_id))
+        return result
+
+    def run(self):
+        """Run queued tasks, including those appended while iterating."""
+        for task in self.tasks:
+            task_result = task()
+            if isinstance(task_result, dict):
+                self.save(task_result)
+
+    def save(self, data):
+        """Store one parsed post in the database."""
+        self.db.add_post(data)
+
+
+if __name__ == "__main__":
+    db = Database("sqlite:///gb_blog.db")
+    parser = GbBlogParse("https://gb.ru/posts", db)
+    parser.run()