From 4a786d9e23e9a4954088141a8b17253e0ac1b982 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 16:43:01 +0100 Subject: [PATCH 1/5] refactor(scraper): cache pages in sqlite db --- alembic.ini | 12 +- alembic/env.py | 121 ++++++++++++++-- .../fd345d2b7d78_httpcache_table.py | 42 ++++++ api/env.py | 1 + api/models.py | 28 +++- api/util/db.py | 24 +++ justfile | 3 +- scraper/settings.py | 2 +- scraper/util/db_httpcache.py | 137 ++++++++++++++++++ scraper/util/url.py | 6 + 10 files changed, 355 insertions(+), 21 deletions(-) create mode 100644 alembic/meta_versions/fd345d2b7d78_httpcache_table.py create mode 100644 scraper/util/db_httpcache.py diff --git a/alembic.ini b/alembic.ini index 1b03b05..8dbe1ab 100644 --- a/alembic.ini +++ b/alembic.ini @@ -1,6 +1,6 @@ # A generic, single database configuration. -[alembic] +[DEFAULT] # path to migration scripts. # this is typically a path given in POSIX (e.g. forward slashes) # format, relative to the token %(here)s which refers to the location of this @@ -19,6 +19,7 @@ script_location = %(here)s/alembic prepend_sys_path = . + # timezone to use when rendering the date within the migration file # as well as the filename. # If specified, requires the tzdata library which can be installed by adding @@ -44,7 +45,6 @@ prepend_sys_path = . # directories, initial revisions must be specified with --version-path. # The path separator used here should be the separator specified by "path_separator" # below. -# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions # path_separator; This indicates what character is used to split lists of file # paths, including version_locations and prepend_sys_path within configparser @@ -81,11 +81,11 @@ path_separator = os # are written from script.py.mako # output_encoding = utf-8 -# database URL. This is consumed by the user-maintained env.py script only. -# other means of configuring database URLs may be customized within the env.py -# file. 
-sqlalchemy.url = driver://user:pass@localhost/dbname +[data_db] +version_locations = %(here)s/alembic/versions +[meta_db] +version_locations = %(here)s/alembic/meta_versions [post_write_hooks] # post_write_hooks defines scripts or Python functions that are run diff --git a/alembic/env.py b/alembic/env.py index c1ff233..a509feb 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,16 @@ -import pathlib +from collections.abc import MutableMapping from logging.config import fileConfig +from pathlib import Path +from typing import Literal + +from sqlalchemy import Table +from sqlalchemy.sql.schema import SchemaItem +from sqlmodel import SQLModel from alembic import context -from api.util.db import engine from api.env import Settings -from api.models import BaseModel +from api.models import BaseModel, MetadataModel +from api.util.db import engine, meta_engine # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -14,18 +20,109 @@ fileConfig(config.config_file_name) +def _get_table_names(base_cls: type) -> set[str]: + """Recursively collect __tablename__ from all table-model subclasses.""" + names: set[str] = set() + for cls in base_cls.__subclasses__(): + tablename = getattr(cls, "__tablename__", None) + if isinstance(tablename, str) and hasattr(cls, "__table__"): + names.add(tablename) + names |= _get_table_names(cls) + return names + + +# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata. +# We distinguish which tables belong to which DB by walking subclasses instead. 
+_base_table_names = _get_table_names(BaseModel) +_meta_table_names = _get_table_names(MetadataModel) + +IncludeNameType = Literal[ + "schema", + "table", + "column", + "index", + "unique_constraint", + "foreign_key_constraint", +] +ParentNamesType = MutableMapping[ + Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None +] + + +def _include_name_base( + name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType +) -> bool: + if type_ == "table": + return name in _base_table_names + return True + + +def _include_name_meta( + name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType +) -> bool: + if type_ == "table": + return name in _meta_table_names + return True + + +def _include_object_base( + object_: SchemaItem, + name: str | None, + type_: str, + _reflected: bool, + _compare_to: SchemaItem | None, +) -> bool: + if type_ == "table": + if isinstance(object_, Table): + return object_.name in _base_table_names + return name in _base_table_names if name else False + return True + + +def _include_object_meta( + object_: SchemaItem, + name: str | None, + type_: str, + _reflected: bool, + _compare_to: SchemaItem | None, +) -> bool: + if type_ == "table": + if isinstance(object_, Table): + return object_.name in _meta_table_names + return name in _meta_table_names if name else False + return True + + def run_migrations() -> None: - pathlib.Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True) + if "".join(config.get_version_locations_list() or "").endswith("meta_versions"): + # metadata db + Path(Settings().meta_db_path).parent.mkdir(parents=True, exist_ok=True) + with meta_engine.connect() as connection: + context.configure( + connection=connection, + target_metadata=SQLModel.metadata, + render_as_batch=True, + include_name=_include_name_meta, + include_object=_include_object_meta, + ) + + with context.begin_transaction(): + context.run_migrations() - with engine.connect() as connection: - 
context.configure( - connection=connection, - target_metadata=BaseModel.metadata, - render_as_batch=True, - ) + else: + # data db + Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True) + with engine.connect() as connection: + context.configure( + connection=connection, + target_metadata=SQLModel.metadata, + render_as_batch=True, + include_name=_include_name_base, + include_object=_include_object_base, + ) - with context.begin_transaction(): - context.run_migrations() + with context.begin_transaction(): + context.run_migrations() run_migrations() diff --git a/alembic/meta_versions/fd345d2b7d78_httpcache_table.py b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py new file mode 100644 index 0000000..cb67731 --- /dev/null +++ b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py @@ -0,0 +1,42 @@ +"""httpcache table + +Revision ID: fd345d2b7d78 +Revises: +Create Date: 2026-02-27 17:45:49.644585 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +import sqlmodel + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "fd345d2b7d78" +down_revision: Union[str, Sequence[str], None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "httpcache", + sa.Column("url", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("status_code", sa.Integer(), nullable=False), + sa.Column("body", sa.LargeBinary(), nullable=True), + sa.Column("headers", sa.JSON(), nullable=True), + sa.Column("scraped_at", sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint("url"), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("httpcache") + # ### end Alembic commands ### diff --git a/api/env.py b/api/env.py index 66f2d60..5574641 100644 --- a/api/env.py +++ b/api/env.py @@ -10,6 +10,7 @@ class Settings(BaseSettings): ) db_path: str = "data/db.sqlite" + meta_db_path: str = "data/meta_db.sqlite" cache_expiry: int = 60 * 60 * 24 * 30 # in seconds (30 days) sitemap_expiry: int = 86400 # in seconds plausible_url: str | None = None diff --git a/api/models.py b/api/models.py index 64dba0f..2ffd105 100644 --- a/api/models.py +++ b/api/models.py @@ -2,7 +2,7 @@ import time from enum import Enum -from typing import final, override +from typing import Mapping, final, override from pydantic import BaseModel as PydanticBaseModel from rapidfuzz import fuzz, process, utils @@ -586,3 +586,29 @@ class SectionPathView(BaseModel, table=True): class UnitDepartmentView(BaseModel, table=True): unit_id: int = Field(primary_key=True) department_id: int = Field(primary_key=True, index=True) + + +""" + + +METADATA DATABASE +(separate db) + + + +""" + + +class MetadataModel(SQLModel): + pass + + +class HTTPCache(MetadataModel, table=True): + url: str = Field(primary_key=True) + status_code: int + body: bytes | None = Field(default=None) + headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON)) + scraped_at: int = Field( + default_factory=lambda: int(time.time()), + sa_column=Column(INTEGER, nullable=False), + ) diff --git a/api/util/db.py b/api/util/db.py index d02f20c..abd4ab9 100644 --- a/api/util/db.py +++ b/api/util/db.py @@ -28,3 +28,27 @@ async def aget_session(): async with AsyncSession(aengine) as session: await session.execute(text("pragma mmap_size=30000000000")) yield session + + +meta_engine = create_engine( + f"sqlite+pysqlite:///{Settings().meta_db_path}", json_serializer=json_serializer +) + +ameta_engine = create_async_engine( + f"sqlite+aiosqlite:///{Settings().meta_db_path}", + json_serializer=json_serializer, + pool_size=20, + max_overflow=30, +) + + 
+def meta_get_session(): + with Session(meta_engine) as session: + session.execute(text("PRAGMA foreign_keys=ON")) + yield session + + +async def ameta_get_session(): + async with AsyncSession(ameta_engine) as session: + await session.execute(text("pragma mmap_size=30000000000")) + yield session diff --git a/justfile b/justfile index 1e9c18f..18f799a 100644 --- a/justfile +++ b/justfile @@ -8,7 +8,8 @@ dev: alias m := migrate migrate: - uv run alembic upgrade heads + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads alias s := scrape diff --git a/scraper/settings.py b/scraper/settings.py index a64c1c8..afc614d 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -93,7 +93,7 @@ HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_DIR = "httpcache" HTTPCACHE_IGNORE_HTTP_CODES = [] -HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" +HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" # Set settings whose default value is deprecated to a future-proof value FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py new file mode 100644 index 0000000..445c2cd --- /dev/null +++ b/scraper/util/db_httpcache.py @@ -0,0 +1,137 @@ +from email.parser import Parser +from pathlib import Path +from typing import final, override + +import yaml +from rich import print +from scrapy import Request, Spider +from scrapy.extensions import httpcache +from scrapy.http import Response +from scrapy.responsetypes import responsetypes +from scrapy.settings import BaseSettings +from sqlmodel import Session + +from api.models import HTTPCache +from api.util.db import meta_engine +from scraper.util.url import normalized_url + + +@final +class DBHTTPCache(httpcache.FilesystemCacheStorage): + def __init__(self, settings: BaseSettings | None): + if settings: + super().__init__(settings) + + @override + def open_spider(self, spider: Spider) -> None: + self.logger = spider.logger + + @override + def 
close_spider(self, spider: Spider) -> None: + pass + + @override + def retrieve_response(self, spider: Spider, request: Request) -> Response | None: + url = self._normalize_url(request.url) + with Session(meta_engine.connect()) as session: + entry = session.get(HTTPCache, url) + if not entry: + return None + + headers = ( + {k.encode(): v.encode() for k, v in entry.headers.items()} + if entry.headers + else {} + ) + + respcls = responsetypes.from_args( + headers=headers, + url=url, + body=entry.body, + ) + return respcls( + url=url, + headers=headers, + status=entry.status_code, + body=entry.body or b"", + ) + + @override + def store_response( + self, spider: Spider, request: Request, response: Response + ) -> None: + self.store(request.url, response, None) + + def store(self, url: str, response: Response, timestamp: float | None): + url = self._normalize_url(url) + headers: dict[str, str] = dict(response.headers.to_unicode_dict()) + with Session(meta_engine.connect()) as session: + entry = HTTPCache( + url=url, + status_code=response.status, + headers=headers, + body=response.body, + ) + if timestamp is not None: + entry.scraped_at = int(timestamp) + + session.merge(entry) + session.commit() + + def _normalize_url(self, url: str) -> str: + return normalized_url(url) + + +@final +class Migrator: + def __init__(self, cachedir: str) -> None: + self.cachedir = cachedir + self.cache = DBHTTPCache(None) + + def migrate(self): + for dir in self._walk(): + try: + with open(dir / "meta", "r") as f: + # yaml allows us to open the invalid formatted json file + data = yaml.load(f, Loader=yaml.SafeLoader) # pyright: ignore[reportAny] + url: str = data.get("url", "") # pyright: ignore[reportAny] + timestamp: float | None = data.get("timestamp") # pyright: ignore[reportAny] + status: int = data.get("status", 0) # pyright: ignore[reportAny] + with open(dir / "response_body", "rb") as f: + body = f.read() + with open(dir / "response_headers", "r") as f: + parsed = 
Parser().parse(f) + headers = dict(parsed.items()) + self._add(url, timestamp, status, headers, body) + print(f"Migrated {url} from {dir}") + except Exception as e: + print(f"[red]Failed to migrate from {dir}: {e}[/red]") + + def _add( + self, + url: str, + timestamp: float | None, + status: int, + headers: dict[str, str], + body: bytes, + ): + response = Response(url=url, status=status, headers=headers, body=body) + self.cache.store(url, response, timestamp) + + def _walk(self): + cachedir = Path(self.cachedir) + for spiderdir in cachedir.iterdir(): + if not spiderdir.is_dir(): + continue + for shortdir in spiderdir.iterdir(): + if not shortdir.is_dir(): + continue + for requestdir in shortdir.iterdir(): + if not requestdir.is_dir(): + continue + yield requestdir + + +if __name__ == "__main__": + migrator = Migrator(".scrapy/httpcache") + migrator.migrate() diff --git a/scraper/util/url.py b/scraper/util/url.py index 7556efe..d55d8b5 100644 --- a/scraper/util/url.py +++ b/scraper/util/url.py @@ -28,3 +28,9 @@ def sort_url_params(url: str) -> str: sorted_query = dict(sorted(query.items())) url_res = url_res._replace(query=urlencode(sorted_query, True)) return urlunparse(url_res) + + +def normalized_url(url: str) -> str: + sorted = sort_url_params(url) + sorted = sorted.replace(".vorlesungen.", ".vvz.").replace("http://", "https://") + return sorted.strip("/") From db0aad8ca45027984c890be52cf46b2600ec65d3 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 19:08:16 +0100 Subject: [PATCH 2/5] some more cleanup --- alembic/env.py | 89 +++-------- .../c28cde0a90db_move_over_cleanup_table.py | 37 +++++ .../3b1a337a1fe5_move_over_cleanup_table.py | 37 +++++ api/models.py | 22 +-- api/util/db.py | 6 +- scraper/env.py | 9 +- scraper/main.py | 105 +++++++------ scraper/pipelines.py | 2 +- scraper/settings.py | 4 +- scraper/spiders/units.py | 2 +- scraper/{util => types}/mappings.py | 0 scraper/util/cleanup_scrapy.py | 143 ------------------ 
scraper/util/db_httpcache.py | 2 + scraper/util/delete_cached.py | 48 ------ scraper/util/logging.py | 3 +- scraper/util/regex_rules.py | 2 +- scraper/util/table.py | 5 +- 17 files changed, 187 insertions(+), 329 deletions(-) create mode 100644 alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py create mode 100644 alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py rename scraper/{util => types}/mappings.py (100%) delete mode 100644 scraper/util/cleanup_scrapy.py delete mode 100644 scraper/util/delete_cached.py diff --git a/alembic/env.py b/alembic/env.py index a509feb..6b10ec6 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,7 @@ -from collections.abc import MutableMapping from logging.config import fileConfig from pathlib import Path -from typing import Literal -from sqlalchemy import Table -from sqlalchemy.sql.schema import SchemaItem +from sqlalchemy import MetaData from sqlmodel import SQLModel from alembic import context @@ -31,66 +28,28 @@ def _get_table_names(base_cls: type) -> set[str]: return names -# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata. -# We distinguish which tables belong to which DB by walking subclasses instead. +def _build_filtered_metadata(table_names: set[str]) -> MetaData: + """Build a new MetaData containing only the specified tables from SQLModel.metadata. + + SQLModel ignores the metadata= kwarg and registers all tables into a single + shared SQLModel.metadata. To make Alembic correctly detect additions, changes, + AND removals per-database, we construct a filtered MetaData that only contains + the tables belonging to that database. This way Alembic sees exactly which + tables should exist and can generate drops for any that are missing. + """ + filtered = MetaData() + for name, table in SQLModel.metadata.tables.items(): + if name in table_names: + table.to_metadata(filtered) + return filtered + + +# Distinguish which tables belong to which DB by walking model subclasses. 
_base_table_names = _get_table_names(BaseModel) _meta_table_names = _get_table_names(MetadataModel) -IncludeNameType = Literal[ - "schema", - "table", - "column", - "index", - "unique_constraint", - "foreign_key_constraint", -] -ParentNamesType = MutableMapping[ - Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None -] - - -def _include_name_base( - name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType -) -> bool: - if type_ == "table": - return name in _base_table_names - return True - - -def _include_name_meta( - name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType -) -> bool: - if type_ == "table": - return name in _meta_table_names - return True - - -def _include_object_base( - object_: SchemaItem, - name: str | None, - type_: str, - _reflected: bool, - _compare_to: SchemaItem | None, -) -> bool: - if type_ == "table": - if isinstance(object_, Table): - return object_.name in _base_table_names - return name in _base_table_names if name else False - return True - - -def _include_object_meta( - object_: SchemaItem, - name: str | None, - type_: str, - _reflected: bool, - _compare_to: SchemaItem | None, -) -> bool: - if type_ == "table": - if isinstance(object_, Table): - return object_.name in _meta_table_names - return name in _meta_table_names if name else False - return True +_base_metadata = _build_filtered_metadata(_base_table_names) +_meta_metadata = _build_filtered_metadata(_meta_table_names) def run_migrations() -> None: @@ -100,10 +59,8 @@ def run_migrations() -> None: with meta_engine.connect() as connection: context.configure( connection=connection, - target_metadata=SQLModel.metadata, + target_metadata=_meta_metadata, render_as_batch=True, - include_name=_include_name_meta, - include_object=_include_object_meta, ) with context.begin_transaction(): @@ -115,10 +72,8 @@ def run_migrations() -> None: with engine.connect() as connection: context.configure( connection=connection, - 
target_metadata=SQLModel.metadata, + target_metadata=_base_metadata, render_as_batch=True, - include_name=_include_name_base, - include_object=_include_object_base, ) with context.begin_transaction(): diff --git a/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py new file mode 100644 index 0000000..7777b31 --- /dev/null +++ b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py @@ -0,0 +1,37 @@ +"""move over cleanup table + +Revision ID: c28cde0a90db +Revises: fd345d2b7d78 +Create Date: 2026-02-27 19:08:43.207368 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = 'c28cde0a90db' +down_revision: Union[str, Sequence[str], None] = 'fd345d2b7d78' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('lastcleanup', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('timestamp', sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('lastcleanup') + # ### end Alembic commands ### diff --git a/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py new file mode 100644 index 0000000..b6b2efe --- /dev/null +++ b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py @@ -0,0 +1,37 @@ +"""move over cleanup table + +Revision ID: 3b1a337a1fe5 +Revises: aa860aba0a9f +Create Date: 2026-02-27 19:18:10.668579 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = '3b1a337a1fe5' +down_revision: Union[str, Sequence[str], None] = 'aa860aba0a9f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('lastcleanup') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('lastcleanup', + sa.Column('id', sa.INTEGER(), nullable=False), + sa.Column('timestamp', sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### diff --git a/api/models.py b/api/models.py index 2ffd105..dcc60a1 100644 --- a/api/models.py +++ b/api/models.py @@ -2,7 +2,7 @@ import time from enum import Enum -from typing import Mapping, final, override +from typing import final, override from pydantic import BaseModel as PydanticBaseModel from rapidfuzz import fuzz, process, utils @@ -530,16 +530,6 @@ class FinishedScrapingSemester(BaseModel, table=True): semkez: str = Field(primary_key=True) -class LastCleanup(BaseModel, table=True): - """Keeps track of when the last cleanup of the scrapy cache was performed.""" - - id: int | None = Field(default=None, primary_key=True) - timestamp: int = Field( - default_factory=lambda: int(time.time()), - sa_column=Column(INTEGER, nullable=False), - ) - - class Rating(BaseModel, table=True): """Course ratings scraped from the CourseReview site""" @@ -612,3 +602,13 @@ class HTTPCache(MetadataModel, table=True): default_factory=lambda: int(time.time()), sa_column=Column(INTEGER, nullable=False), ) + + +class LastCleanup(MetadataModel, table=True): + """Keeps track of when the last cleanup of the scrapy cache was performed.""" + + id: int | None = Field(default=None, primary_key=True) + timestamp: int = Field( + default_factory=lambda: int(time.time()), + sa_column=Column(INTEGER, nullable=False), + ) diff --git a/api/util/db.py b/api/util/db.py index abd4ab9..7900edf 100644 --- a/api/util/db.py +++ b/api/util/db.py @@ -27,6 +27,7 @@ def get_session(): async def aget_session(): async with AsyncSession(aengine) as session: await session.execute(text("pragma mmap_size=30000000000")) + await session.execute(text("PRAGMA foreign_keys=ON")) yield session @@ -42,13 +43,14 @@ async def aget_session(): ) -def meta_get_session(): +def get_meta_session(): with 
Session(meta_engine) as session: session.execute(text("PRAGMA foreign_keys=ON")) yield session -async def ameta_get_session(): +async def aget_meta_session(): async with AsyncSession(ameta_engine) as session: await session.execute(text("pragma mmap_size=30000000000")) + await session.execute(text("PRAGMA foreign_keys=ON")) yield session diff --git a/scraper/env.py b/scraper/env.py index d2555ea..86e1a9c 100644 --- a/scraper/env.py +++ b/scraper/env.py @@ -10,16 +10,13 @@ class Settings(BaseSettings): env_file=".env", env_file_encoding="utf-8", extra="ignore" ) - refresh_html: bool = False - """If html files, that are already cached locally, should be refetched""" - # Semester settings only apply to newly scraped semesters - # RESCRAPE_AMOUNT will overwrite this and cause only the last + # ENABLE_RESCRAPE will overwrite this and cause only the last # two already scraped semesters to be rescraped - start_year: int = date.today().year + start_year: int = date.today().year - 1 # automatically include next year (if it exists) end_year: int = date.today().year + 1 - semester: str = "W" + semester: str = "W,S" delay: float = 5.0 """Amount of seconds to at least wait between requests""" diff --git a/scraper/main.py b/scraper/main.py index 186dc08..e903ab8 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -8,16 +8,18 @@ from scrapy.crawler import CrawlerProcess from scrapy.settings import Settings from scrapy.utils.project import get_project_settings -from sqlmodel import text +from sqlmodel import col, distinct, select, text from api.env import Settings as APISettings +from api.models import LearningUnit from api.util.db import get_session from api.util.materialize import update_materialized_views from scraper.env import Settings as EnvSettings from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider -from scraper.util.delete_cached import delete_cached + +logger = 
logging.getLogger(__name__) def add_stdout_logging(settings: Settings): @@ -36,49 +38,66 @@ def add_stdout_logging(settings: Settings): root.addHandler(sh) -settings = get_project_settings() -add_stdout_logging(settings) +def crawl(): + settings = get_project_settings() + add_stdout_logging(settings) -process = CrawlerProcess(settings) + process = CrawlerProcess(settings) -# cleanup cache if required -if EnvSettings().enable_rescrape: - semkezs = delete_cached() - process.crawl(UnitsSpider, semkezs=semkezs) - process.crawl(LecturersSpider, semkezs=semkezs) - process.crawl(RatingsSpider) -else: - process.crawl(UnitsSpider) - process.crawl(LecturersSpider) - process.crawl(RatingsSpider) -process.start() + # cleanup cache if required + if EnvSettings().enable_rescrape: + with next(get_session()) as session: + semkezs = session.exec( + select(distinct(LearningUnit.semkez)) + .order_by(col(LearningUnit.semkez).desc()) + .limit(2) + ).all() + if not semkezs: + logger.info("No semesters found in database, scraping all semesters.") + semkezs = None + process.crawl(UnitsSpider, semkezs=semkezs) + # process.crawl(LecturersSpider, semkezs=semkezs) + # process.crawl(RatingsSpider) + else: + process.crawl(UnitsSpider) + # process.crawl(LecturersSpider) + # process.crawl(RatingsSpider) + process.start() -logger = logging.getLogger(__name__) -logger.info("Finished scraping data, updating materialized tables") -with next(get_session()) as session: - update_materialized_views(session) - -# vacuum/zip db -logger.info(f"Vacuuming database into {APISettings().vacuum_path}") -if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work - Path(APISettings().vacuum_path).unlink() -with next(get_session()) as session: - session.execute( - text("VACUUM INTO :vacuum_path"), - {"vacuum_path": f"{APISettings().vacuum_path}"}, +def update_materialized_view(): + logger.info("Finished scraping data, updating materialized tables") + with next(get_session()) as session: + 
update_materialized_views(session) + + +def vacuum(): + # vacuum/zip db + logger.info(f"Vacuuming database into {APISettings().vacuum_path}") + if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work + Path(APISettings().vacuum_path).unlink() + with next(get_session()) as session: + session.execute( + text("VACUUM INTO :vacuum_path"), + {"vacuum_path": f"{APISettings().vacuum_path}"}, + ) + logger.info("Finished vacuuming database") + logger.info(f"Creating database zip file at {APISettings().zip_path}") + with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z: + z.write(APISettings().vacuum_path, arcname="database.db") + logger.info("Finished creating database zip file") + db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024) + vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024) + zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024) + logger.info( + f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB" ) -logger.info("Finished vacuuming database") -logger.info(f"Creating database zip file at {APISettings().zip_path}") -with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z: - z.write(APISettings().vacuum_path, arcname="database.db") -logger.info("Finished creating database zip file") -db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024) -vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024) -zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024) -logger.info( - f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB" -) -logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}") -Path(APISettings().vacuum_path).unlink(missing_ok=True) -logger.info("Finished deleting vacuum file.") + logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}") + 
Path(APISettings().vacuum_path).unlink(missing_ok=True) + logger.info("Finished deleting vacuum file.") + + +if __name__ == "__main__": + crawl() + update_materialized_view() + vacuum() diff --git a/scraper/pipelines.py b/scraper/pipelines.py index ebd853c..17a8271 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -27,8 +27,8 @@ ) from api.util import db from scraper.spiders.units import UnitsSpider +from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.difference import find_unit_differences -from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.scrapercache import CACHE_PATH DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl" diff --git a/scraper/settings.py b/scraper/settings.py index afc614d..f8a5ffa 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -89,9 +89,7 @@ # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -HTTPCACHE_ENABLED = not Settings().refresh_html -HTTPCACHE_EXPIRATION_SECS = 0 -HTTPCACHE_DIR = "httpcache" +HTTPCACHE_ENABLED = True HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 07667ed..1eac4ca 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -38,8 +38,8 @@ WeekdayEnum, ) from scraper.env import Settings +from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.logging import KeywordLoggerSpider -from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.regex_rules import ( RE_ABSCHNITTID, RE_DATE, diff --git a/scraper/util/mappings.py b/scraper/types/mappings.py similarity index 100% rename from scraper/util/mappings.py rename to scraper/types/mappings.py diff --git a/scraper/util/cleanup_scrapy.py b/scraper/util/cleanup_scrapy.py 
deleted file mode 100644 index 0920b28..0000000 --- a/scraper/util/cleanup_scrapy.py +++ /dev/null @@ -1,143 +0,0 @@ -from datetime import datetime -from pathlib import Path -import argparse -import re -from shutil import rmtree -import time -from typing import TypedDict, cast - -import yaml - - -HTTP_CACHE_PATH = Path(".scrapy/httpcache") - -re_units_en = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/lerneinheit\.view\?ansicht=ALLE&lang=en&lerneinheitId=\d+&semkez=\d{4}\w" -re_root_units = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheLehrangebot\.view\?semkez=\d{4}\w&ansicht=2&seite=0(&deptId=\d+)?(&studiengangTyp=\w+)?&lang=\w\w" -re_legends = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/legendeStudienplanangaben\.view\?abschnittId=\d+&lang=en&semkez=\d{4}\w" - -re_lecturers_root = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheDozierende\.view\?lang=de&semkez=\d{4}\w&seite=0" - - -class FileMetadata(TypedDict): - url: str - timestamp: int - - -def get_files(path: Path): - if not path.is_dir(): - return - for top in path.iterdir(): - if top.is_dir(): - for bot in top.iterdir(): - if bot.is_dir(): - meta = bot / "meta" - if not meta.exists(): - yield "", bot, 0 - with open(meta, "r") as f: - # yaml allows us to open the invalid formatted json file - data = cast( - FileMetadata, - yaml.load(f, Loader=yaml.SafeLoader), - ) - yield data.get("url", ""), bot, data.get("timestamp", 0) - - -def cleanup_scrapy( - dry_run: bool = False, - delete_cached_semesters: list[str] | None = None, - amount: int = 100, - age_seconds: int = 0, -): - if delete_cached_semesters is None: - delete_cached_semesters = [] - - unts, lecrs = 0, 0 - cached_unts = 0 - units = HTTP_CACHE_PATH / "units" - lecturers = HTTP_CACHE_PATH / "lecturers" - now = time.time() - - for url, dir, timestamp in get_files(units): - dt = datetime.fromtimestamp(timestamp) - # delete files that we do not use anymore - if ( - not re.match(re_units_en, url) - and not 
re.match(re_root_units, url) - and not re.match(re_legends, url) - ): - print(f"Delete unit: {dir}: URL mismatch {url}: {dt}") - unts += 1 - if not dry_run: - rmtree(dir) - # delete files from cached semesters - if cached_unts < amount: - for sem in delete_cached_semesters: - if f"semkez={sem}" in url and (now - timestamp) > age_seconds: - print(f"Delete unit: {dir}: Cached semester {sem} {url}: {dt}") - unts += 1 - cached_unts += 1 - if not dry_run: - rmtree(dir) - break - - for url, dir, timestamp in get_files(lecturers): - dt = datetime.fromtimestamp(timestamp) - if not re.match(re_lecturers_root, url): - print(f"Delete lecturer: {dir}: URL mismatch {url}: {dt}") - lecrs += 1 - if not dry_run: - rmtree(dir) - - print( - f"===============\nDeleted {unts} files in lecturers dir\nDeleted {lecrs} files in lecturers dir" - ) - - -if __name__ == "__main__": - - class Arguments(argparse.Namespace): - dry_run: bool - delete_cached_semesters: list[str] - amount: int - age_seconds: int - - parser = argparse.ArgumentParser(description="Cleanup scrapy cache") - parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be deleted without actually deleting", - default=False, - ) - parser.add_argument( - "-d", - "--delete-cached-semesters", - nargs="+", - help="List of semesters to delete from cache, e.g., 2023W 2024S", - default=[], - ) - parser.add_argument( - "-n", - "--amount", - type=int, - help="Amount of cached semesters to delete", - default=100, - ) - parser.add_argument( - "--age-seconds", - type=int, - help="Delete cached files older than this many seconds", - default=0, - ) - parser.add_argument( - "--age-seconds", - type=int, - help="Delete cached files older than this many seconds", - default=0, - ) - args = parser.parse_args(namespace=Arguments()) - cleanup_scrapy( - dry_run=args.dry_run, - delete_cached_semesters=args.delete_cached_semesters, - amount=args.amount, - age_seconds=args.age_seconds, - ) diff --git 
a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py index 445c2cd..ec4360a 100644 --- a/scraper/util/db_httpcache.py +++ b/scraper/util/db_httpcache.py @@ -84,6 +84,8 @@ def _normalize_url(self, url: str) -> str: @final class Migrator: + """Used to migrate httpcache to dbhttpcache above""" + def __init__(self, cachedir: str) -> None: self.cachedir = cachedir self.cache = DBHTTPCache(None) diff --git a/scraper/util/delete_cached.py b/scraper/util/delete_cached.py deleted file mode 100644 index e42caa6..0000000 --- a/scraper/util/delete_cached.py +++ /dev/null @@ -1,48 +0,0 @@ -import time -from typing import Sequence -from sqlmodel import col, distinct, select -from api.models import LastCleanup, LearningUnit -from api.util.db import get_session -from scraper.env import Settings -from scraper.util.cleanup_scrapy import cleanup_scrapy - - -def delete_cached() -> Sequence[str]: - print("Checking if cached files should be deleted...") - with next(get_session()) as session: - last_cleanup = session.exec( - select(LastCleanup).order_by(col(LastCleanup.timestamp).desc()).limit(1) - ).first() - last_cleanup_time = last_cleanup.timestamp if last_cleanup else 0 - - # prevent cleaning up if pod is crash-looping - now = int(time.time()) - if now - last_cleanup_time < 22 * 3600: - print( - "Last cleanup was performed less than 22 hours ago, skipping cleanup." 
- ) - return [] - - last_semesters = session.exec( - select(distinct(LearningUnit.semkez)) - .order_by(col(LearningUnit.semkez).desc()) - .limit(2) - ).all() - if not last_semesters: - print("No semesters found in database, skipping cleanup.") - return [] - - print(f"Performing cleanup of cached files for semesters: {last_semesters}") - - cleanup_scrapy( - dry_run=True, - delete_cached_semesters=list(last_semesters), - amount=Settings().rescrape_amount, - age_seconds=Settings().rescrape_age_seconds, - ) - - last_cleanup = LastCleanup(timestamp=now) - session.add(last_cleanup) - session.commit() - - return last_semesters diff --git a/scraper/util/logging.py b/scraper/util/logging.py index 6024f76..24e6e52 100644 --- a/scraper/util/logging.py +++ b/scraper/util/logging.py @@ -1,12 +1,13 @@ # pyright: reportExplicitAny=false,reportAny=false from typing import Any, override + from scrapy.spiders import CrawlSpider from scrapy.utils.log import SpiderLoggerAdapter class KeywordLoggerAdapter(SpiderLoggerAdapter): - """A logger adapter which adds the 'keyword' attribute to log records.""" + """A logger adapter which adds the 'extra' attributes to log records.""" @override def process(self, msg: str, kwargs: Any): diff --git a/scraper/util/regex_rules.py b/scraper/util/regex_rules.py index 628d75a..107597b 100644 --- a/scraper/util/regex_rules.py +++ b/scraper/util/regex_rules.py @@ -8,7 +8,7 @@ - 17-412 1L (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=13629&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de) - 10-824 (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=6467&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de) """ -RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # 31.12.2023 +RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # Format: 31.12.2023 RE_SEMKEZ = r"semkez=(\w+)" RE_UNITID = r"lerneinheitId=(\d+)" RE_DOZIDE = r"dozide=(\d+)" diff --git a/scraper/util/table.py b/scraper/util/table.py index 1b82f6e..89767fe 100644 --- 
a/scraper/util/table.py +++ b/scraper/util/table.py @@ -1,13 +1,14 @@ from re import Pattern -from scrapy.http import Response + from parsel import Selector, SelectorList +from scrapy.http import Response from scraper.util.keymap import TranslationKey, translations class Table: """ - Takes a page and throws all table rows into a list of (key, columns) tuples. + Takes a page and transforms all table rows into a list of (key, columns) tuples. """ def __init__( From 1f9ec0ee66617bfb3f5741f197851ee7b0b85c89 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 22:24:59 +0100 Subject: [PATCH 3/5] rescrape and add option to flag pages --- .../meta_versions/357b241a4250_add_flagged.py | 43 +++++++++++ api/models.py | 2 + scraper/env.py | 4 +- scraper/main.py | 24 +----- scraper/settings.py | 2 +- scraper/spiders/lecturers.py | 14 +++- scraper/spiders/units.py | 32 +++++++- scraper/util/{ => caching}/db_httpcache.py | 11 +++ scraper/util/caching/rescrape.py | 73 +++++++++++++++++++ 9 files changed, 176 insertions(+), 29 deletions(-) create mode 100644 alembic/meta_versions/357b241a4250_add_flagged.py rename scraper/util/{ => caching}/db_httpcache.py (93%) create mode 100644 scraper/util/caching/rescrape.py diff --git a/alembic/meta_versions/357b241a4250_add_flagged.py b/alembic/meta_versions/357b241a4250_add_flagged.py new file mode 100644 index 0000000..34946b5 --- /dev/null +++ b/alembic/meta_versions/357b241a4250_add_flagged.py @@ -0,0 +1,43 @@ +"""add flagged + +Revision ID: 357b241a4250 +Revises: c28cde0a90db +Create Date: 2026-02-27 22:17:37.706745 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "357b241a4250" +down_revision: Union[str, Sequence[str], None] = "c28cde0a90db" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "flagged", sa.Boolean(), nullable=False, server_default=sa.false() + ) + ) + + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.alter_column("flagged", server_default=None) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.drop_column("flagged") + + # ### end Alembic commands ### diff --git a/api/models.py b/api/models.py index dcc60a1..6961a62 100644 --- a/api/models.py +++ b/api/models.py @@ -598,6 +598,8 @@ class HTTPCache(MetadataModel, table=True): status_code: int body: bytes | None = Field(default=None) headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON)) + flagged: bool = Field(default=False) + """if set, the entry be rescraped the next time it's accessed""" scraped_at: int = Field( default_factory=lambda: int(time.time()), sa_column=Column(INTEGER, nullable=False), diff --git a/scraper/env.py b/scraper/env.py index 86e1a9c..a1f4f66 100644 --- a/scraper/env.py +++ b/scraper/env.py @@ -24,10 +24,10 @@ class Settings(BaseSettings): log_append: bool = True disable_log_file: bool = False - # delete valid cached files + # rescrapes the full course pages, while only rescraping + # the given amount of the oldest cached unit pages enable_rescrape: bool = False rescrape_amount: int = 500 - rescrape_age_seconds: int = 24 * 3600 * 14 # 14 days def read_semesters(self) -> list[Literal["W", "S"]]: semesters: list[Literal["W", 
"S"]] = [] diff --git a/scraper/main.py b/scraper/main.py index e903ab8..40809f6 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -18,6 +18,7 @@ from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider +from scraper.util.caching.rescrape import get_last_semesters logger = logging.getLogger(__name__) @@ -41,27 +42,10 @@ def add_stdout_logging(settings: Settings): def crawl(): settings = get_project_settings() add_stdout_logging(settings) - process = CrawlerProcess(settings) - - # cleanup cache if required - if EnvSettings().enable_rescrape: - with next(get_session()) as session: - semkezs = session.exec( - select(distinct(LearningUnit.semkez)) - .order_by(col(LearningUnit.semkez).desc()) - .limit(2) - ).all() - if not semkezs: - logger.info("No semesters found in database, scraping all semesters.") - semkezs = None - process.crawl(UnitsSpider, semkezs=semkezs) - # process.crawl(LecturersSpider, semkezs=semkezs) - # process.crawl(RatingsSpider) - else: - process.crawl(UnitsSpider) - # process.crawl(LecturersSpider) - # process.crawl(RatingsSpider) + process.crawl(UnitsSpider) + # process.crawl(LecturersSpider) + # process.crawl(RatingsSpider) process.start() diff --git a/scraper/settings.py b/scraper/settings.py index f8a5ffa..fb2c50a 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -91,7 +91,7 @@ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings HTTPCACHE_ENABLED = True HTTPCACHE_IGNORE_HTTP_CODES = [] -HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" +HTTPCACHE_STORAGE = "scraper.util.caching.db_httpcache.DBHTTPCache" # Set settings whose default value is deprecated to a future-proof value FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraper/spiders/lecturers.py b/scraper/spiders/lecturers.py index 59d1229..2382a2d 100644 --- a/scraper/spiders/lecturers.py +++ b/scraper/spiders/lecturers.py @@ 
-5,6 +5,7 @@ from api.models import Lecturer from scraper.env import Settings +from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS from scraper.util.logging import KeywordLoggerSpider from scraper.util.regex_rules import RE_DOZIDE, RE_SEMKEZ @@ -18,11 +19,11 @@ def get_urls(year: int, semester: Literal["W", "S"]): class LecturersSpider(KeywordLoggerSpider): name: str = "lecturers" - def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] - if semkezs is not None: + def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] + if RESCRAPE_SEMKEZS is not None: self.start_urls: list[str] = [ url - for semkez in semkezs + for semkez in RESCRAPE_SEMKEZS for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W") ] else: @@ -36,6 +37,13 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr @override def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached lecturers page due to rescrape settings", + extra={"url": response.url}, + ) + return + semkez = re.search(RE_SEMKEZ, response.url) if not semkez: self.logger.error( diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 1eac4ca..48be03f 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -39,6 +39,7 @@ ) from scraper.env import Settings from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping +from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS from scraper.util.logging import KeywordLoggerSpider from scraper.util.regex_rules import ( RE_ABSCHNITTID, @@ -131,11 +132,11 @@ class UnitsSpider(KeywordLoggerSpider): ) course_ids: dict[str, set[int]] = defaultdict(set) - def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] - if 
semkezs is not None: + def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] + if RESCRAPE_SEMKEZS is not None: self.start_urls: list[str] = [ url - for semkez in semkezs + for semkez in RESCRAPE_SEMKEZS for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W") ] else: @@ -149,6 +150,16 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr @override def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached catalogue page due to rescrape settings", + extra={ + "url": response.url, + "request_url": response.request.url if response.request else None, + }, + ) + return + try: catalog_semkez = re.search(RE_SEMKEZ, response.url) if not catalog_semkez: @@ -361,6 +372,11 @@ def parse_unit( Example url: https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?semkez=2025W&ansicht=ALLE&lerneinheitId=192945&lang=en """ + if RESCRAPE_SEMKEZS and "cached" in response.flags: + # the http cache will automatically refetch a set of pages + # if there are explicit semkezs to rescrape + return + try: if "red9.ethz.ch" in response.url: self.logger.info( @@ -561,6 +577,16 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]: """ Example: www.vvz.ethz.ch/Vorlesungsverzeichnis/legendeStudienplanangaben.view?abschnittId=117361&semkez=2025W&lang=en """ + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached legend page due to rescrape settings", + extra={ + "url": response.url, + "request_url": response.request.url if response.request else None, + }, + ) + return + try: semkez = re.search(RE_SEMKEZ, response.url) id = re.search(RE_ABSCHNITTID, response.url) diff --git a/scraper/util/db_httpcache.py b/scraper/util/caching/db_httpcache.py similarity index 93% rename from scraper/util/db_httpcache.py rename to 
scraper/util/caching/db_httpcache.py index ec4360a..284b16b 100644 --- a/scraper/util/db_httpcache.py +++ b/scraper/util/caching/db_httpcache.py @@ -13,6 +13,7 @@ from api.models import HTTPCache from api.util.db import meta_engine +from scraper.util.caching.rescrape import should_rescrape from scraper.util.url import normalized_url @@ -33,10 +34,20 @@ def close_spider(self, spider: Spider) -> None: @override def retrieve_response(self, spider: Spider, request: Request) -> Response | None: url = self._normalize_url(request.url) + + if should_rescrape(url): + self.logger.info( + "URL marked for rescraping, skipping cache", + extra={"url": url}, + ) + return None + with Session(meta_engine.connect()) as session: entry = session.get(HTTPCache, url) if not entry: return None + if entry.flagged: + return None headers = ( {k.encode(): v.encode() for k, v in entry.headers.items()} diff --git a/scraper/util/caching/rescrape.py b/scraper/util/caching/rescrape.py new file mode 100644 index 0000000..7af5126 --- /dev/null +++ b/scraper/util/caching/rescrape.py @@ -0,0 +1,73 @@ +""" +Cache layout + +All pages are cached in an sqlite DB with DBHTTPCache. + +If enable_rescrape is enabled: +- At most rescrape_amount unit +pages of the last two semesters are rescraped. 
+- seite=0 pages are rescraped if older than an hour +""" + +from time import time + +from sqlmodel import col, distinct, or_, select + +from api.models import HTTPCache, LearningUnit +from api.util.db import get_meta_session, get_session +from scraper.env import Settings + +settings = Settings() +enable_rescrape = Settings().enable_rescrape +rescrape_amount = Settings().rescrape_amount + + +def get_last_semesters(n: int) -> list[str]: + with next(get_session()) as session: + semkezs = session.exec( + select(distinct(LearningUnit.semkez)) + .order_by(col(LearningUnit.semkez).desc()) + .limit(n) + ).all() + return list(semkezs) + + +RESCRAPE_SEMKEZS = get_last_semesters(1) if enable_rescrape else None + +# gets the outdated urls and any seite=0 urls +clauses = [] +oldest_urls = set[str]() +flagged = set[str]() +if RESCRAPE_SEMKEZS is not None: + clauses = or_( + *[ + col(HTTPCache.url).contains(f"semkez={semkez}") + for semkez in RESCRAPE_SEMKEZS + ] + ) + with next(get_meta_session()) as session: + oldest_urls = set( + session.exec( + select(HTTPCache.url) + .order_by(col(HTTPCache.scraped_at)) + .where(clauses) + .limit(rescrape_amount) + ).all() + ) + + # seite=0 pages + seite0_urls = session.exec( + select(HTTPCache.url) + .where( + clauses, + col(HTTPCache.url).contains("seite=0"), + col(HTTPCache.scraped_at) < int(time()) - 3600, # older than an hour + ) + .order_by(col(HTTPCache.scraped_at)) + .limit(50) + ).all() + oldest_urls.update(seite0_urls) + + +def should_rescrape(url: str): + return enable_rescrape and url in oldest_urls From 060abd838b288c8eaea1147f64fdfc83594497f9 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 22:30:24 +0100 Subject: [PATCH 4/5] partially remove unit changes --- api/models.py | 2 ++ api/routers/v1/units.py | 4 ++- scraper/pipelines.py | 41 +---------------------- scraper/util/difference.py | 66 -------------------------------------- 4 files changed, 6 insertions(+), 107 deletions(-) delete mode 100644 
scraper/util/difference.py diff --git a/api/models.py b/api/models.py index 6961a62..ba63621 100644 --- a/api/models.py +++ b/api/models.py @@ -514,6 +514,7 @@ def search_query(self) -> str: """ +# TODO: REMOVE class UnitChanges(BaseModel, table=True): """We keep track of changes that get applied to learning units""" @@ -524,6 +525,7 @@ class UnitChanges(BaseModel, table=True): """The scraped_at before the changes were applied""" +# TODO: move to metadata db class FinishedScrapingSemester(BaseModel, table=True): """Keeps track of which semesters have been fully scraped already.""" diff --git a/api/routers/v1/units.py b/api/routers/v1/units.py index 50616e2..88af1fd 100644 --- a/api/routers/v1/units.py +++ b/api/routers/v1/units.py @@ -70,9 +70,11 @@ async def get_unit_lecturers( @router.get( "/{unit_id}/changes", response_model=Sequence[UnitChanges], - description="Get a list of changes that the course details have undergone. " + description="WILL BE REMOVED BEGINNING OF MARCH 2026. It's too broken.\n" + + "Get a list of changes that the course details have undergone. 
" + "Changes are a JSON object that describe what the values were before they " + "got updated to either the next change or whatever the model currently has.", + deprecated=True, ) async def get_unit_changes( unit_id: int, diff --git a/scraper/pipelines.py b/scraper/pipelines.py index 17a8271..22ee58e 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from scrapy import Spider from scrapy.utils.log import SpiderLoggerAdapter -from sqlmodel import Session, col, select +from sqlmodel import Session, select from api.models import ( Course, @@ -16,10 +16,8 @@ LearningUnit, Lecturer, Level, - Overwriteable, Rating, Section, - UnitChanges, UnitExaminerLink, UnitLecturerLink, UnitSectionLink, @@ -28,7 +26,6 @@ from api.util import db from scraper.spiders.units import UnitsSpider from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping -from scraper.util.difference import find_unit_differences from scraper.util.scrapercache import CACHE_PATH DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl" @@ -143,42 +140,6 @@ def process_item(self, item: object, spider: Spider): if not old: self.session.add(item) self.session.commit() - elif isinstance(old, Overwriteable): - if isinstance(old, LearningUnit) and isinstance(item, LearningUnit): - # determine if there are any differences - if differences := find_unit_differences(old, item): - old_changes = self.session.exec( - select(UnitChanges) - .where( - UnitChanges.changes == differences.changes, - UnitChanges.unit_id == differences.unit_id, - ) - .order_by(col(UnitChanges.scraped_at).desc()) - ).one_or_none() - if old_changes: - self.logger.warning( - "Detecting duplicate changes. 
Only updating scraped_at", - extra={ - "unit_id": old.id, - "changes": differences.changes, - "changes_id": old_changes.id, - }, - ) - old_changes.scraped_at = differences.scraped_at - else: - self.logger.info( - "LearningUnit changes detected", - extra={ - "unit_id": old.id, - "changes": differences.changes, - }, - ) - self.session.add(differences) - - old.overwrite_with(item) - old.scraped_at = int(time.time()) - self.session.add(old) - self.session.commit() return item except Exception as e: diff --git a/scraper/util/difference.py b/scraper/util/difference.py deleted file mode 100644 index e6aff57..0000000 --- a/scraper/util/difference.py +++ /dev/null @@ -1,66 +0,0 @@ -# pyright: reportExplicitAny=false,reportAny=false - - -from typing import Any, Literal -from api.models import LearningUnit, UnitChanges - - -def _determine_lang(unit: LearningUnit) -> Literal["en", "de"]: - """ - Determines if a new unit model is added as an english or german part. - Defaults to "de" if no English fields are set. - """ - for field, value in unit: - if field.endswith("_english") and value is not None: - return "en" - return "de" - - -def _has_language_key(unit: LearningUnit, lang: Literal["en", "de"]): - """Determines if a model already has keys of a language""" - for field, value in unit: - if field.endswith("_english"): - if lang == "en" and value is not None: - return True - elif lang == "de" and getattr(unit, field[:-8]) is not None: - return True - return False - - -def find_unit_differences(old: LearningUnit, new: LearningUnit) -> UnitChanges | None: - """ - Determines if there are any differences between an already existing model (from the DB) - and from a newly yielded item. The new item is either an English or German unit, meaning - either the English or German catalogue data is filled out, while the other language fields - are None. 
In a scraping run we'll always get both a "German" and "English" catalogue data - unit as well as an "English" unit with the additional data. - By determining the language of a model we avoid the issue where `old` is "English", while - `new` is "German", so all the English fields are incorrectly identified as having been - removed because they're not present in the new model anymore. - """ - - if old.id != new.id: - raise ValueError("Can only compare LearningUnits with the same unit_id") - - new_lang = _determine_lang(new) - if not _has_language_key(old, new_lang): - # There are no differences to check, since the old model does not have any language - # specific values of the same language as the new item. - return None - - diffs: dict[str, Any] = {} - # only iterate over explicitly set fields to avoid checking default/None values - for field in new.model_fields_set: - val_old = getattr(old, field) - val_new = getattr(new, field) - if val_old != val_new: - diffs[field] = val_old - - if not diffs: - return None - - return UnitChanges( - unit_id=old.id, - changes=diffs, - scraped_at=old.scraped_at, - ) From 889fd2d886ea339006d4e55accaf43e5104888ca Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 23:04:13 +0100 Subject: [PATCH 5/5] add last-updated text on unit page --- .github/workflows/build.yml | 15 ++++++++++----- api/models.py | 3 +++ api/templates/pages/Unit/Index.jinja | 6 +++++- justfile | 4 ++++ scraper/main.py | 9 +++------ scraper/pipelines.py | 6 ++++++ scraper/spiders/lecturers.py | 2 +- scraper/spiders/units.py | 4 ++-- scraper/util/caching/db_httpcache.py | 7 +++++++ 9 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 21fa01d..776cb38 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -43,11 +43,16 @@ jobs: - name: Check SQLite Alembic Migrations run: | - uv run alembic upgrade heads - uv run alembic check - uv run alembic downgrade base - uv 
run alembic upgrade heads - uv run alembic check + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads + uv run alembic -n data_db check + uv run alembic -n meta_db check + uv run alembic -n data_db downgrade base + uv run alembic -n meta_db downgrade base + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads + uv run alembic -n data_db check + uv run alembic -n meta_db check build: needs: test diff --git a/api/models.py b/api/models.py index ba63621..56e7861 100644 --- a/api/models.py +++ b/api/models.py @@ -368,6 +368,9 @@ def departments_as_short_str(self) -> str: def levels_as_str(self) -> str: return ", ".join([str(level) for level in self.levels]) + def last_updated(self) -> str: + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.scraped_at)) + """ diff --git a/api/templates/pages/Unit/Index.jinja b/api/templates/pages/Unit/Index.jinja index 213bae2..0f80980 100644 --- a/api/templates/pages/Unit/Index.jinja +++ b/api/templates/pages/Unit/Index.jinja @@ -34,7 +34,7 @@

-
+
@@ -157,7 +157,11 @@ +
+

+ Last Updated: {{ unit.last_updated() }} +

diff --git a/justfile b/justfile index 18f799a..fc2cf46 100644 --- a/justfile +++ b/justfile @@ -11,6 +11,10 @@ migrate: uv run alembic -n data_db upgrade heads uv run alembic -n meta_db upgrade heads +check: + uv run alembic -n data_db check + uv run alembic -n meta_db check + alias s := scrape scrape: diff --git a/scraper/main.py b/scraper/main.py index 40809f6..f753d16 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -8,17 +8,14 @@ from scrapy.crawler import CrawlerProcess from scrapy.settings import Settings from scrapy.utils.project import get_project_settings -from sqlmodel import col, distinct, select, text +from sqlmodel import text from api.env import Settings as APISettings -from api.models import LearningUnit from api.util.db import get_session from api.util.materialize import update_materialized_views -from scraper.env import Settings as EnvSettings from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider -from scraper.util.caching.rescrape import get_last_semesters logger = logging.getLogger(__name__) @@ -44,8 +41,8 @@ def crawl(): add_stdout_logging(settings) process = CrawlerProcess(settings) process.crawl(UnitsSpider) - # process.crawl(LecturersSpider) - # process.crawl(RatingsSpider) + process.crawl(LecturersSpider) + process.crawl(RatingsSpider) process.start() diff --git a/scraper/pipelines.py b/scraper/pipelines.py index 22ee58e..54a1a10 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -16,6 +16,7 @@ LearningUnit, Lecturer, Level, + Overwriteable, Rating, Section, UnitExaminerLink, @@ -140,6 +141,11 @@ def process_item(self, item: object, spider: Spider): if not old: self.session.add(item) self.session.commit() + elif isinstance(old, Overwriteable): + old.overwrite_with(item) + old.scraped_at = int(time.time()) + self.session.add(old) + self.session.commit() return item except Exception as e: diff --git a/scraper/spiders/lecturers.py 
b/scraper/spiders/lecturers.py index 2382a2d..4c866a0 100644 --- a/scraper/spiders/lecturers.py +++ b/scraper/spiders/lecturers.py @@ -39,7 +39,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached lecturers page due to rescrape settings", + "RESCRAPE is on. Not implicitly rescraping lecturers page.", extra={"url": response.url}, ) return diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 48be03f..5fc4df8 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -152,7 +152,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached catalogue page due to rescrape settings", + "RESCRAPE is on. Not implicitly rescraping catalogue page.", extra={ "url": response.url, "request_url": response.request.url if response.request else None, @@ -579,7 +579,7 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]: """ if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached legend page due to rescrape settings", + "RESCRAPE is on. 
Not implicitly rescraping legend page.", extra={ "url": response.url, "request_url": response.request.url if response.request else None, diff --git a/scraper/util/caching/db_httpcache.py b/scraper/util/caching/db_httpcache.py index 284b16b..dcd7b8a 100644 --- a/scraper/util/caching/db_httpcache.py +++ b/scraper/util/caching/db_httpcache.py @@ -47,6 +47,10 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None if not entry: return None if entry.flagged: + self.logger.info( + "URL flagged for rescraping, skipping cache", + extra={"url": url}, + ) return None headers = ( @@ -74,6 +78,9 @@ def store_response( self.store(request.url, response, None) def store(self, url: str, response: Response, timestamp: float | None): + if response.status == 302: + return + url = self._normalize_url(url) headers: dict[str, str] = dict(response.headers.to_unicode_dict()) with Session(meta_engine.connect()) as session: