From 4a786d9e23e9a4954088141a8b17253e0ac1b982 Mon Sep 17 00:00:00 2001
From: Mark
Date: Fri, 27 Feb 2026 16:43:01 +0100
Subject: [PATCH 1/5] refactor(scraper): cache pages in sqlite db
---
alembic.ini | 12 +-
alembic/env.py | 121 ++++++++++++++--
.../fd345d2b7d78_httpcache_table.py | 42 ++++++
api/env.py | 1 +
api/models.py | 28 +++-
api/util/db.py | 24 +++
justfile | 3 +-
scraper/settings.py | 2 +-
scraper/util/db_httpcache.py | 137 ++++++++++++++++++
scraper/util/url.py | 6 +
10 files changed, 355 insertions(+), 21 deletions(-)
create mode 100644 alembic/meta_versions/fd345d2b7d78_httpcache_table.py
create mode 100644 scraper/util/db_httpcache.py
diff --git a/alembic.ini b/alembic.ini
index 1b03b05..8dbe1ab 100644
--- a/alembic.ini
+++ b/alembic.ini
@@ -1,6 +1,6 @@
# A generic, single database configuration.
-[alembic]
+[DEFAULT]
# path to migration scripts.
# this is typically a path given in POSIX (e.g. forward slashes)
# format, relative to the token %(here)s which refers to the location of this
@@ -19,6 +19,7 @@ script_location = %(here)s/alembic
prepend_sys_path = .
+
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the tzdata library which can be installed by adding
@@ -44,7 +45,6 @@ prepend_sys_path = .
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "path_separator"
# below.
-# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
# path_separator; This indicates what character is used to split lists of file
# paths, including version_locations and prepend_sys_path within configparser
@@ -81,11 +81,11 @@ path_separator = os
# are written from script.py.mako
# output_encoding = utf-8
-# database URL. This is consumed by the user-maintained env.py script only.
-# other means of configuring database URLs may be customized within the env.py
-# file.
-sqlalchemy.url = driver://user:pass@localhost/dbname
+[data_db]
+version_locations = %(here)s/alembic/versions
+[meta_db]
+version_locations = %(here)s/alembic/meta_versions
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
diff --git a/alembic/env.py b/alembic/env.py
index c1ff233..a509feb 100644
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -1,10 +1,16 @@
-import pathlib
+from collections.abc import MutableMapping
from logging.config import fileConfig
+from pathlib import Path
+from typing import Literal
+
+from sqlalchemy import Table
+from sqlalchemy.sql.schema import SchemaItem
+from sqlmodel import SQLModel
from alembic import context
-from api.util.db import engine
from api.env import Settings
-from api.models import BaseModel
+from api.models import BaseModel, MetadataModel
+from api.util.db import engine, meta_engine
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
@@ -14,18 +20,109 @@
fileConfig(config.config_file_name)
+def _get_table_names(base_cls: type) -> set[str]:
+ """Recursively collect __tablename__ from all table-model subclasses."""
+ names: set[str] = set()
+ for cls in base_cls.__subclasses__():
+ tablename = getattr(cls, "__tablename__", None)
+ if isinstance(tablename, str) and hasattr(cls, "__table__"):
+ names.add(tablename)
+ names |= _get_table_names(cls)
+ return names
+
+
+# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata.
+# We distinguish which tables belong to which DB by walking subclasses instead.
+_base_table_names = _get_table_names(BaseModel)
+_meta_table_names = _get_table_names(MetadataModel)
+
+IncludeNameType = Literal[
+ "schema",
+ "table",
+ "column",
+ "index",
+ "unique_constraint",
+ "foreign_key_constraint",
+]
+ParentNamesType = MutableMapping[
+ Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None
+]
+
+
+def _include_name_base(
+ name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType
+) -> bool:
+ if type_ == "table":
+ return name in _base_table_names
+ return True
+
+
+def _include_name_meta(
+ name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType
+) -> bool:
+ if type_ == "table":
+ return name in _meta_table_names
+ return True
+
+
+def _include_object_base(
+ object_: SchemaItem,
+ name: str | None,
+ type_: str,
+ _reflected: bool,
+ _compare_to: SchemaItem | None,
+) -> bool:
+ if type_ == "table":
+ if isinstance(object_, Table):
+ return object_.name in _base_table_names
+ return name in _base_table_names if name else False
+ return True
+
+
+def _include_object_meta(
+ object_: SchemaItem,
+ name: str | None,
+ type_: str,
+ _reflected: bool,
+ _compare_to: SchemaItem | None,
+) -> bool:
+ if type_ == "table":
+ if isinstance(object_, Table):
+ return object_.name in _meta_table_names
+ return name in _meta_table_names if name else False
+ return True
+
+
def run_migrations() -> None:
- pathlib.Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True)
+ if "".join(config.get_version_locations_list() or "").endswith("meta_versions"):
+ # metadata db
+ Path(Settings().meta_db_path).parent.mkdir(parents=True, exist_ok=True)
+ with meta_engine.connect() as connection:
+ context.configure(
+ connection=connection,
+ target_metadata=SQLModel.metadata,
+ render_as_batch=True,
+ include_name=_include_name_meta,
+ include_object=_include_object_meta,
+ )
+
+ with context.begin_transaction():
+ context.run_migrations()
- with engine.connect() as connection:
- context.configure(
- connection=connection,
- target_metadata=BaseModel.metadata,
- render_as_batch=True,
- )
+ else:
+ # data db
+ Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True)
+ with engine.connect() as connection:
+ context.configure(
+ connection=connection,
+ target_metadata=SQLModel.metadata,
+ render_as_batch=True,
+ include_name=_include_name_base,
+ include_object=_include_object_base,
+ )
- with context.begin_transaction():
- context.run_migrations()
+ with context.begin_transaction():
+ context.run_migrations()
run_migrations()
diff --git a/alembic/meta_versions/fd345d2b7d78_httpcache_table.py b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py
new file mode 100644
index 0000000..cb67731
--- /dev/null
+++ b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py
@@ -0,0 +1,42 @@
+"""httpcache table
+
+Revision ID: fd345d2b7d78
+Revises:
+Create Date: 2026-02-27 17:45:49.644585
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+import sqlmodel
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "fd345d2b7d78"
+down_revision: Union[str, Sequence[str], None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ """Upgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.create_table(
+ "httpcache",
+ sa.Column("url", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+ sa.Column("status_code", sa.Integer(), nullable=False),
+ sa.Column("body", sa.LargeBinary(), nullable=True),
+ sa.Column("headers", sa.JSON(), nullable=True),
+ sa.Column("scraped_at", sa.INTEGER(), nullable=False),
+ sa.PrimaryKeyConstraint("url"),
+ )
+ # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+ """Downgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.drop_table("httpcache")
+ # ### end Alembic commands ###
diff --git a/api/env.py b/api/env.py
index 66f2d60..5574641 100644
--- a/api/env.py
+++ b/api/env.py
@@ -10,6 +10,7 @@ class Settings(BaseSettings):
)
db_path: str = "data/db.sqlite"
+ meta_db_path: str = "data/meta_db.sqlite"
cache_expiry: int = 60 * 60 * 24 * 30 # in seconds (30 days)
sitemap_expiry: int = 86400 # in seconds
plausible_url: str | None = None
diff --git a/api/models.py b/api/models.py
index 64dba0f..2ffd105 100644
--- a/api/models.py
+++ b/api/models.py
@@ -2,7 +2,7 @@
import time
from enum import Enum
-from typing import final, override
+from typing import Mapping, final, override
from pydantic import BaseModel as PydanticBaseModel
from rapidfuzz import fuzz, process, utils
@@ -586,3 +586,29 @@ class SectionPathView(BaseModel, table=True):
class UnitDepartmentView(BaseModel, table=True):
unit_id: int = Field(primary_key=True)
department_id: int = Field(primary_key=True, index=True)
+
+
+"""
+
+
+METADATA DATABASE
+(separate db)
+
+
+
+"""
+
+
+class MetadataModel(SQLModel):
+ pass
+
+
+class HTTPCache(MetadataModel, table=True):
+ url: str = Field(primary_key=True)
+ status_code: int
+ body: bytes | None = Field(default=None)
+ headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON))
+ scraped_at: int = Field(
+ default_factory=lambda: int(time.time()),
+ sa_column=Column(INTEGER, nullable=False),
+ )
diff --git a/api/util/db.py b/api/util/db.py
index d02f20c..abd4ab9 100644
--- a/api/util/db.py
+++ b/api/util/db.py
@@ -28,3 +28,27 @@ async def aget_session():
async with AsyncSession(aengine) as session:
await session.execute(text("pragma mmap_size=30000000000"))
yield session
+
+
+meta_engine = create_engine(
+ f"sqlite+pysqlite:///{Settings().meta_db_path}", json_serializer=json_serializer
+)
+
+ameta_engine = create_async_engine(
+ f"sqlite+aiosqlite:///{Settings().meta_db_path}",
+ json_serializer=json_serializer,
+ pool_size=20,
+ max_overflow=30,
+)
+
+
+def meta_get_session():
+ with Session(meta_engine) as session:
+ session.execute(text("PRAGMA foreign_keys=ON"))
+ yield session
+
+
+async def ameta_get_session():
+ async with AsyncSession(ameta_engine) as session:
+ await session.execute(text("pragma mmap_size=30000000000"))
+ yield session
diff --git a/justfile b/justfile
index 1e9c18f..18f799a 100644
--- a/justfile
+++ b/justfile
@@ -8,7 +8,8 @@ dev:
alias m := migrate
migrate:
- uv run alembic upgrade heads
+ uv run alembic -n data_db upgrade heads
+ uv run alembic -n meta_db upgrade heads
alias s := scrape
diff --git a/scraper/settings.py b/scraper/settings.py
index a64c1c8..afc614d 100644
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -93,7 +93,7 @@
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
-HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
diff --git a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py
new file mode 100644
index 0000000..445c2cd
--- /dev/null
+++ b/scraper/util/db_httpcache.py
@@ -0,0 +1,137 @@
+from email.parser import Parser
+from pathlib import Path
+from typing import final, override
+
+import yaml
+from rich import print
+from scrapy import Request, Spider
+from scrapy.extensions import httpcache
+from scrapy.http import Response
+from scrapy.responsetypes import responsetypes
+from scrapy.settings import BaseSettings
+from sqlmodel import Session
+
+from api.models import HTTPCache
+from api.util.db import meta_engine
+from scraper.util.url import normalized_url
+
+
+@final
+class DBHTTPCache(httpcache.FilesystemCacheStorage):
+ def __init__(self, settings: BaseSettings | None):
+ if settings:
+ super().__init__(settings)
+
+ @override
+ def open_spider(self, spider: Spider) -> None:
+ self.logger = spider.logger
+
+ @override
+ def close_spider(self, spider: Spider) -> None:
+ pass
+
+ @override
+ def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
+ url = self._normalize_url(request.url)
+        with Session(meta_engine) as session:
+ entry = session.get(HTTPCache, url)
+ if not entry:
+ return None
+
+ headers = (
+ {k.encode(): v.encode() for k, v in entry.headers.items()}
+ if entry.headers
+ else {}
+ )
+
+ respcls = responsetypes.from_args(
+ headers=headers,
+ url=url,
+ body=entry.body,
+ )
+ return respcls(
+ url=url,
+ headers=headers,
+ status=entry.status_code,
+ body=entry.body or b"",
+ )
+
+ @override
+ def store_response(
+ self, spider: Spider, request: Request, response: Response
+ ) -> None:
+ self.store(request.url, response, None)
+
+ def store(self, url: str, response: Response, timestamp: float | None):
+ url = self._normalize_url(url)
+ headers: dict[str, str] = dict(response.headers.to_unicode_dict())
+        with Session(meta_engine) as session:
+ entry = HTTPCache(
+ url=url,
+ status_code=response.status,
+ headers=headers,
+ body=response.body,
+ )
+ if timestamp is not None:
+ entry.scraped_at = int(timestamp)
+
+ session.merge(entry)
+ session.commit()
+
+ def _normalize_url(self, url: str) -> str:
+ return normalized_url(url)
+
+
+@final
+class Migrator:
+ def __init__(self, cachedir: str) -> None:
+ self.cachedir = cachedir
+ self.cache = DBHTTPCache(None)
+
+ def migrate(self):
+ for dir in self._walk():
+ try:
+ with open(dir / "meta", "r") as f:
+                    # yaml allows us to parse the invalidly formatted json file
+ data = yaml.load(f, Loader=yaml.SafeLoader) # pyright: ignore[reportAny]
+ url: str = data.get("url", "") # pyright: ignore[reportAny]
+ timestamp: float | None = data.get("timestamp") # pyright: ignore[reportAny]
+ status: int = data.get("status", 0) # pyright: ignore[reportAny]
+ with open(dir / "response_body", "rb") as f:
+ body = f.read()
+ with open(dir / "response_headers", "r") as f:
+ parsed = Parser().parse(f)
+ headers = dict(parsed.items())
+ self._add(url, timestamp, status, headers, body)
+ print(f"Migrated {url} from {dir}")
+ except Exception as e:
+ print(f"[red]Failed to migrate from {dir}: {e}[/red]")
+
+ def _add(
+ self,
+ url: str,
+ timestamp: float | None,
+ status: int,
+ headers: dict[str, str],
+ body: bytes,
+ ):
+ response = Response(url=url, status=status, headers=headers, body=body)
+ self.cache.store(url, response, timestamp)
+
+ def _walk(self):
+ cachedir = Path(self.cachedir)
+ for spiderdir in cachedir.iterdir():
+ if not spiderdir.is_dir():
+ continue
+ for shortdir in spiderdir.iterdir():
+ if not shortdir.is_dir():
+ continue
+ for requestdir in shortdir.iterdir():
+ if not requestdir.is_dir():
+ continue
+ yield requestdir
+
+
+if __name__ == "__main__":
+ migrator = Migrator(".scrapy/httpcache")
+ migrator.migrate()
diff --git a/scraper/util/url.py b/scraper/util/url.py
index 7556efe..d55d8b5 100644
--- a/scraper/util/url.py
+++ b/scraper/util/url.py
@@ -28,3 +28,9 @@ def sort_url_params(url: str) -> str:
sorted_query = dict(sorted(query.items()))
url_res = url_res._replace(query=urlencode(sorted_query, True))
return urlunparse(url_res)
+
+
+def normalized_url(url: str) -> str:
+    result = sort_url_params(url)
+    result = result.replace(".vorlesungen.", ".vvz.").replace("http://", "https://")
+    return result.strip("/")
From db0aad8ca45027984c890be52cf46b2600ec65d3 Mon Sep 17 00:00:00 2001
From: Mark
Date: Fri, 27 Feb 2026 19:08:16 +0100
Subject: [PATCH 2/5] refactor(scraper): move cleanup table to meta db, split main into functions
---
alembic/env.py | 89 +++--------
.../c28cde0a90db_move_over_cleanup_table.py | 37 +++++
.../3b1a337a1fe5_move_over_cleanup_table.py | 37 +++++
api/models.py | 22 +--
api/util/db.py | 6 +-
scraper/env.py | 9 +-
scraper/main.py | 105 +++++++------
scraper/pipelines.py | 2 +-
scraper/settings.py | 4 +-
scraper/spiders/units.py | 2 +-
scraper/{util => types}/mappings.py | 0
scraper/util/cleanup_scrapy.py | 143 ------------------
scraper/util/db_httpcache.py | 2 +
scraper/util/delete_cached.py | 48 ------
scraper/util/logging.py | 3 +-
scraper/util/regex_rules.py | 2 +-
scraper/util/table.py | 5 +-
17 files changed, 187 insertions(+), 329 deletions(-)
create mode 100644 alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py
create mode 100644 alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py
rename scraper/{util => types}/mappings.py (100%)
delete mode 100644 scraper/util/cleanup_scrapy.py
delete mode 100644 scraper/util/delete_cached.py
diff --git a/alembic/env.py b/alembic/env.py
index a509feb..6b10ec6 100644
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -1,10 +1,7 @@
-from collections.abc import MutableMapping
from logging.config import fileConfig
from pathlib import Path
-from typing import Literal
-from sqlalchemy import Table
-from sqlalchemy.sql.schema import SchemaItem
+from sqlalchemy import MetaData
from sqlmodel import SQLModel
from alembic import context
@@ -31,66 +28,28 @@ def _get_table_names(base_cls: type) -> set[str]:
return names
-# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata.
-# We distinguish which tables belong to which DB by walking subclasses instead.
+def _build_filtered_metadata(table_names: set[str]) -> MetaData:
+ """Build a new MetaData containing only the specified tables from SQLModel.metadata.
+
+ SQLModel ignores the metadata= kwarg and registers all tables into a single
+ shared SQLModel.metadata. To make Alembic correctly detect additions, changes,
+ AND removals per-database, we construct a filtered MetaData that only contains
+ the tables belonging to that database. This way Alembic sees exactly which
+ tables should exist and can generate drops for any that are missing.
+ """
+ filtered = MetaData()
+ for name, table in SQLModel.metadata.tables.items():
+ if name in table_names:
+ table.to_metadata(filtered)
+ return filtered
+
+
+# Distinguish which tables belong to which DB by walking model subclasses.
_base_table_names = _get_table_names(BaseModel)
_meta_table_names = _get_table_names(MetadataModel)
-IncludeNameType = Literal[
- "schema",
- "table",
- "column",
- "index",
- "unique_constraint",
- "foreign_key_constraint",
-]
-ParentNamesType = MutableMapping[
- Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None
-]
-
-
-def _include_name_base(
- name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType
-) -> bool:
- if type_ == "table":
- return name in _base_table_names
- return True
-
-
-def _include_name_meta(
- name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType
-) -> bool:
- if type_ == "table":
- return name in _meta_table_names
- return True
-
-
-def _include_object_base(
- object_: SchemaItem,
- name: str | None,
- type_: str,
- _reflected: bool,
- _compare_to: SchemaItem | None,
-) -> bool:
- if type_ == "table":
- if isinstance(object_, Table):
- return object_.name in _base_table_names
- return name in _base_table_names if name else False
- return True
-
-
-def _include_object_meta(
- object_: SchemaItem,
- name: str | None,
- type_: str,
- _reflected: bool,
- _compare_to: SchemaItem | None,
-) -> bool:
- if type_ == "table":
- if isinstance(object_, Table):
- return object_.name in _meta_table_names
- return name in _meta_table_names if name else False
- return True
+_base_metadata = _build_filtered_metadata(_base_table_names)
+_meta_metadata = _build_filtered_metadata(_meta_table_names)
def run_migrations() -> None:
@@ -100,10 +59,8 @@ def run_migrations() -> None:
with meta_engine.connect() as connection:
context.configure(
connection=connection,
- target_metadata=SQLModel.metadata,
+ target_metadata=_meta_metadata,
render_as_batch=True,
- include_name=_include_name_meta,
- include_object=_include_object_meta,
)
with context.begin_transaction():
@@ -115,10 +72,8 @@ def run_migrations() -> None:
with engine.connect() as connection:
context.configure(
connection=connection,
- target_metadata=SQLModel.metadata,
+ target_metadata=_base_metadata,
render_as_batch=True,
- include_name=_include_name_base,
- include_object=_include_object_base,
)
with context.begin_transaction():
diff --git a/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py
new file mode 100644
index 0000000..7777b31
--- /dev/null
+++ b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py
@@ -0,0 +1,37 @@
+"""move over cleanup table
+
+Revision ID: c28cde0a90db
+Revises: fd345d2b7d78
+Create Date: 2026-02-27 19:08:43.207368
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'c28cde0a90db'
+down_revision: Union[str, Sequence[str], None] = 'fd345d2b7d78'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ """Upgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.create_table('lastcleanup',
+ sa.Column('id', sa.Integer(), nullable=False),
+ sa.Column('timestamp', sa.INTEGER(), nullable=False),
+ sa.PrimaryKeyConstraint('id')
+ )
+ # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+ """Downgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.drop_table('lastcleanup')
+ # ### end Alembic commands ###
diff --git a/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py
new file mode 100644
index 0000000..b6b2efe
--- /dev/null
+++ b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py
@@ -0,0 +1,37 @@
+"""move over cleanup table
+
+Revision ID: 3b1a337a1fe5
+Revises: aa860aba0a9f
+Create Date: 2026-02-27 19:18:10.668579
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel
+
+
+# revision identifiers, used by Alembic.
+revision: str = '3b1a337a1fe5'
+down_revision: Union[str, Sequence[str], None] = 'aa860aba0a9f'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ """Upgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.drop_table('lastcleanup')
+ # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+ """Downgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.create_table('lastcleanup',
+ sa.Column('id', sa.INTEGER(), nullable=False),
+ sa.Column('timestamp', sa.INTEGER(), nullable=False),
+ sa.PrimaryKeyConstraint('id')
+ )
+ # ### end Alembic commands ###
diff --git a/api/models.py b/api/models.py
index 2ffd105..dcc60a1 100644
--- a/api/models.py
+++ b/api/models.py
@@ -2,7 +2,7 @@
import time
from enum import Enum
-from typing import Mapping, final, override
+from typing import final, override
from pydantic import BaseModel as PydanticBaseModel
from rapidfuzz import fuzz, process, utils
@@ -530,16 +530,6 @@ class FinishedScrapingSemester(BaseModel, table=True):
semkez: str = Field(primary_key=True)
-class LastCleanup(BaseModel, table=True):
- """Keeps track of when the last cleanup of the scrapy cache was performed."""
-
- id: int | None = Field(default=None, primary_key=True)
- timestamp: int = Field(
- default_factory=lambda: int(time.time()),
- sa_column=Column(INTEGER, nullable=False),
- )
-
-
class Rating(BaseModel, table=True):
"""Course ratings scraped from the CourseReview site"""
@@ -612,3 +602,13 @@ class HTTPCache(MetadataModel, table=True):
default_factory=lambda: int(time.time()),
sa_column=Column(INTEGER, nullable=False),
)
+
+
+class LastCleanup(MetadataModel, table=True):
+ """Keeps track of when the last cleanup of the scrapy cache was performed."""
+
+ id: int | None = Field(default=None, primary_key=True)
+ timestamp: int = Field(
+ default_factory=lambda: int(time.time()),
+ sa_column=Column(INTEGER, nullable=False),
+ )
diff --git a/api/util/db.py b/api/util/db.py
index abd4ab9..7900edf 100644
--- a/api/util/db.py
+++ b/api/util/db.py
@@ -27,6 +27,7 @@ def get_session():
async def aget_session():
async with AsyncSession(aengine) as session:
await session.execute(text("pragma mmap_size=30000000000"))
+ await session.execute(text("PRAGMA foreign_keys=ON"))
yield session
@@ -42,13 +43,14 @@ async def aget_session():
)
-def meta_get_session():
+def get_meta_session():
with Session(meta_engine) as session:
session.execute(text("PRAGMA foreign_keys=ON"))
yield session
-async def ameta_get_session():
+async def aget_meta_session():
async with AsyncSession(ameta_engine) as session:
await session.execute(text("pragma mmap_size=30000000000"))
+ await session.execute(text("PRAGMA foreign_keys=ON"))
yield session
diff --git a/scraper/env.py b/scraper/env.py
index d2555ea..86e1a9c 100644
--- a/scraper/env.py
+++ b/scraper/env.py
@@ -10,16 +10,13 @@ class Settings(BaseSettings):
env_file=".env", env_file_encoding="utf-8", extra="ignore"
)
- refresh_html: bool = False
- """If html files, that are already cached locally, should be refetched"""
-
# Semester settings only apply to newly scraped semesters
- # RESCRAPE_AMOUNT will overwrite this and cause only the last
+ # ENABLE_RESCRAPE will overwrite this and cause only the last
# two already scraped semesters to be rescraped
- start_year: int = date.today().year
+ start_year: int = date.today().year - 1
# automatically include next year (if it exists)
end_year: int = date.today().year + 1
- semester: str = "W"
+ semester: str = "W,S"
delay: float = 5.0
"""Amount of seconds to at least wait between requests"""
diff --git a/scraper/main.py b/scraper/main.py
index 186dc08..e903ab8 100644
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -8,16 +8,18 @@
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
-from sqlmodel import text
+from sqlmodel import col, distinct, select, text
from api.env import Settings as APISettings
+from api.models import LearningUnit
from api.util.db import get_session
from api.util.materialize import update_materialized_views
from scraper.env import Settings as EnvSettings
from scraper.spiders.lecturers import LecturersSpider
from scraper.spiders.ratings import RatingsSpider
from scraper.spiders.units import UnitsSpider
-from scraper.util.delete_cached import delete_cached
+
+logger = logging.getLogger(__name__)
def add_stdout_logging(settings: Settings):
@@ -36,49 +38,66 @@ def add_stdout_logging(settings: Settings):
root.addHandler(sh)
-settings = get_project_settings()
-add_stdout_logging(settings)
+def crawl():
+ settings = get_project_settings()
+ add_stdout_logging(settings)
-process = CrawlerProcess(settings)
+ process = CrawlerProcess(settings)
-# cleanup cache if required
-if EnvSettings().enable_rescrape:
- semkezs = delete_cached()
- process.crawl(UnitsSpider, semkezs=semkezs)
- process.crawl(LecturersSpider, semkezs=semkezs)
- process.crawl(RatingsSpider)
-else:
- process.crawl(UnitsSpider)
- process.crawl(LecturersSpider)
- process.crawl(RatingsSpider)
-process.start()
+ # cleanup cache if required
+ if EnvSettings().enable_rescrape:
+ with next(get_session()) as session:
+ semkezs = session.exec(
+ select(distinct(LearningUnit.semkez))
+ .order_by(col(LearningUnit.semkez).desc())
+ .limit(2)
+ ).all()
+ if not semkezs:
+ logger.info("No semesters found in database, scraping all semesters.")
+ semkezs = None
+ process.crawl(UnitsSpider, semkezs=semkezs)
+ # process.crawl(LecturersSpider, semkezs=semkezs)
+ # process.crawl(RatingsSpider)
+ else:
+ process.crawl(UnitsSpider)
+ # process.crawl(LecturersSpider)
+ # process.crawl(RatingsSpider)
+ process.start()
-logger = logging.getLogger(__name__)
-logger.info("Finished scraping data, updating materialized tables")
-with next(get_session()) as session:
- update_materialized_views(session)
-
-# vacuum/zip db
-logger.info(f"Vacuuming database into {APISettings().vacuum_path}")
-if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work
- Path(APISettings().vacuum_path).unlink()
-with next(get_session()) as session:
- session.execute(
- text("VACUUM INTO :vacuum_path"),
- {"vacuum_path": f"{APISettings().vacuum_path}"},
+def update_materialized_view():
+ logger.info("Finished scraping data, updating materialized tables")
+ with next(get_session()) as session:
+ update_materialized_views(session)
+
+
+def vacuum():
+ # vacuum/zip db
+ logger.info(f"Vacuuming database into {APISettings().vacuum_path}")
+ if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work
+ Path(APISettings().vacuum_path).unlink()
+ with next(get_session()) as session:
+ session.execute(
+ text("VACUUM INTO :vacuum_path"),
+ {"vacuum_path": f"{APISettings().vacuum_path}"},
+ )
+ logger.info("Finished vacuuming database")
+ logger.info(f"Creating database zip file at {APISettings().zip_path}")
+ with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z:
+ z.write(APISettings().vacuum_path, arcname="database.db")
+ logger.info("Finished creating database zip file")
+ db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024)
+ vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024)
+ zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024)
+ logger.info(
+ f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB"
)
-logger.info("Finished vacuuming database")
-logger.info(f"Creating database zip file at {APISettings().zip_path}")
-with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z:
- z.write(APISettings().vacuum_path, arcname="database.db")
-logger.info("Finished creating database zip file")
-db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024)
-vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024)
-zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024)
-logger.info(
- f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB"
-)
-logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}")
-Path(APISettings().vacuum_path).unlink(missing_ok=True)
-logger.info("Finished deleting vacuum file.")
+ logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}")
+ Path(APISettings().vacuum_path).unlink(missing_ok=True)
+ logger.info("Finished deleting vacuum file.")
+
+
+if __name__ == "__main__":
+ crawl()
+ update_materialized_view()
+ vacuum()
diff --git a/scraper/pipelines.py b/scraper/pipelines.py
index ebd853c..17a8271 100644
--- a/scraper/pipelines.py
+++ b/scraper/pipelines.py
@@ -27,8 +27,8 @@
)
from api.util import db
from scraper.spiders.units import UnitsSpider
+from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.difference import find_unit_differences
-from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.scrapercache import CACHE_PATH
DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl"
diff --git a/scraper/settings.py b/scraper/settings.py
index afc614d..f8a5ffa 100644
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -89,9 +89,7 @@
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-HTTPCACHE_ENABLED = not Settings().refresh_html
-HTTPCACHE_EXPIRATION_SECS = 0
-HTTPCACHE_DIR = "httpcache"
+HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache"
diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py
index 07667ed..1eac4ca 100644
--- a/scraper/spiders/units.py
+++ b/scraper/spiders/units.py
@@ -38,8 +38,8 @@
WeekdayEnum,
)
from scraper.env import Settings
+from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.logging import KeywordLoggerSpider
-from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.regex_rules import (
RE_ABSCHNITTID,
RE_DATE,
diff --git a/scraper/util/mappings.py b/scraper/types/mappings.py
similarity index 100%
rename from scraper/util/mappings.py
rename to scraper/types/mappings.py
diff --git a/scraper/util/cleanup_scrapy.py b/scraper/util/cleanup_scrapy.py
deleted file mode 100644
index 0920b28..0000000
--- a/scraper/util/cleanup_scrapy.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from datetime import datetime
-from pathlib import Path
-import argparse
-import re
-from shutil import rmtree
-import time
-from typing import TypedDict, cast
-
-import yaml
-
-
-HTTP_CACHE_PATH = Path(".scrapy/httpcache")
-
-re_units_en = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/lerneinheit\.view\?ansicht=ALLE&lang=en&lerneinheitId=\d+&semkez=\d{4}\w"
-re_root_units = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheLehrangebot\.view\?semkez=\d{4}\w&ansicht=2&seite=0(&deptId=\d+)?(&studiengangTyp=\w+)?&lang=\w\w"
-re_legends = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/legendeStudienplanangaben\.view\?abschnittId=\d+&lang=en&semkez=\d{4}\w"
-
-re_lecturers_root = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheDozierende\.view\?lang=de&semkez=\d{4}\w&seite=0"
-
-
-class FileMetadata(TypedDict):
- url: str
- timestamp: int
-
-
-def get_files(path: Path):
- if not path.is_dir():
- return
- for top in path.iterdir():
- if top.is_dir():
- for bot in top.iterdir():
- if bot.is_dir():
- meta = bot / "meta"
- if not meta.exists():
- yield "", bot, 0
- with open(meta, "r") as f:
- # yaml allows us to open the invalid formatted json file
- data = cast(
- FileMetadata,
- yaml.load(f, Loader=yaml.SafeLoader),
- )
- yield data.get("url", ""), bot, data.get("timestamp", 0)
-
-
-def cleanup_scrapy(
- dry_run: bool = False,
- delete_cached_semesters: list[str] | None = None,
- amount: int = 100,
- age_seconds: int = 0,
-):
- if delete_cached_semesters is None:
- delete_cached_semesters = []
-
- unts, lecrs = 0, 0
- cached_unts = 0
- units = HTTP_CACHE_PATH / "units"
- lecturers = HTTP_CACHE_PATH / "lecturers"
- now = time.time()
-
- for url, dir, timestamp in get_files(units):
- dt = datetime.fromtimestamp(timestamp)
- # delete files that we do not use anymore
- if (
- not re.match(re_units_en, url)
- and not re.match(re_root_units, url)
- and not re.match(re_legends, url)
- ):
- print(f"Delete unit: {dir}: URL mismatch {url}: {dt}")
- unts += 1
- if not dry_run:
- rmtree(dir)
- # delete files from cached semesters
- if cached_unts < amount:
- for sem in delete_cached_semesters:
- if f"semkez={sem}" in url and (now - timestamp) > age_seconds:
- print(f"Delete unit: {dir}: Cached semester {sem} {url}: {dt}")
- unts += 1
- cached_unts += 1
- if not dry_run:
- rmtree(dir)
- break
-
- for url, dir, timestamp in get_files(lecturers):
- dt = datetime.fromtimestamp(timestamp)
- if not re.match(re_lecturers_root, url):
- print(f"Delete lecturer: {dir}: URL mismatch {url}: {dt}")
- lecrs += 1
- if not dry_run:
- rmtree(dir)
-
- print(
- f"===============\nDeleted {unts} files in lecturers dir\nDeleted {lecrs} files in lecturers dir"
- )
-
-
-if __name__ == "__main__":
-
- class Arguments(argparse.Namespace):
- dry_run: bool
- delete_cached_semesters: list[str]
- amount: int
- age_seconds: int
-
- parser = argparse.ArgumentParser(description="Cleanup scrapy cache")
- parser.add_argument(
- "--dry-run",
- action="store_true",
- help="Show what would be deleted without actually deleting",
- default=False,
- )
- parser.add_argument(
- "-d",
- "--delete-cached-semesters",
- nargs="+",
- help="List of semesters to delete from cache, e.g., 2023W 2024S",
- default=[],
- )
- parser.add_argument(
- "-n",
- "--amount",
- type=int,
- help="Amount of cached semesters to delete",
- default=100,
- )
- parser.add_argument(
- "--age-seconds",
- type=int,
- help="Delete cached files older than this many seconds",
- default=0,
- )
- parser.add_argument(
- "--age-seconds",
- type=int,
- help="Delete cached files older than this many seconds",
- default=0,
- )
- args = parser.parse_args(namespace=Arguments())
- cleanup_scrapy(
- dry_run=args.dry_run,
- delete_cached_semesters=args.delete_cached_semesters,
- amount=args.amount,
- age_seconds=args.age_seconds,
- )
diff --git a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py
index 445c2cd..ec4360a 100644
--- a/scraper/util/db_httpcache.py
+++ b/scraper/util/db_httpcache.py
@@ -84,6 +84,8 @@ def _normalize_url(self, url: str) -> str:
@final
class Migrator:
+ """Used to migrate httpcache to dbhttpcache above"""
+
def __init__(self, cachedir: str) -> None:
self.cachedir = cachedir
self.cache = DBHTTPCache(None)
diff --git a/scraper/util/delete_cached.py b/scraper/util/delete_cached.py
deleted file mode 100644
index e42caa6..0000000
--- a/scraper/util/delete_cached.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import time
-from typing import Sequence
-from sqlmodel import col, distinct, select
-from api.models import LastCleanup, LearningUnit
-from api.util.db import get_session
-from scraper.env import Settings
-from scraper.util.cleanup_scrapy import cleanup_scrapy
-
-
-def delete_cached() -> Sequence[str]:
- print("Checking if cached files should be deleted...")
- with next(get_session()) as session:
- last_cleanup = session.exec(
- select(LastCleanup).order_by(col(LastCleanup.timestamp).desc()).limit(1)
- ).first()
- last_cleanup_time = last_cleanup.timestamp if last_cleanup else 0
-
- # prevent cleaning up if pod is crash-looping
- now = int(time.time())
- if now - last_cleanup_time < 22 * 3600:
- print(
- "Last cleanup was performed less than 22 hours ago, skipping cleanup."
- )
- return []
-
- last_semesters = session.exec(
- select(distinct(LearningUnit.semkez))
- .order_by(col(LearningUnit.semkez).desc())
- .limit(2)
- ).all()
- if not last_semesters:
- print("No semesters found in database, skipping cleanup.")
- return []
-
- print(f"Performing cleanup of cached files for semesters: {last_semesters}")
-
- cleanup_scrapy(
- dry_run=True,
- delete_cached_semesters=list(last_semesters),
- amount=Settings().rescrape_amount,
- age_seconds=Settings().rescrape_age_seconds,
- )
-
- last_cleanup = LastCleanup(timestamp=now)
- session.add(last_cleanup)
- session.commit()
-
- return last_semesters
diff --git a/scraper/util/logging.py b/scraper/util/logging.py
index 6024f76..24e6e52 100644
--- a/scraper/util/logging.py
+++ b/scraper/util/logging.py
@@ -1,12 +1,13 @@
# pyright: reportExplicitAny=false,reportAny=false
from typing import Any, override
+
from scrapy.spiders import CrawlSpider
from scrapy.utils.log import SpiderLoggerAdapter
class KeywordLoggerAdapter(SpiderLoggerAdapter):
- """A logger adapter which adds the 'keyword' attribute to log records."""
+ """A logger adapter which adds the 'extra' attributes to log records."""
@override
def process(self, msg: str, kwargs: Any):
diff --git a/scraper/util/regex_rules.py b/scraper/util/regex_rules.py
index 628d75a..107597b 100644
--- a/scraper/util/regex_rules.py
+++ b/scraper/util/regex_rules.py
@@ -8,7 +8,7 @@
- 17-412 1L (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=13629&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de)
- 10-824 (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=6467&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de)
"""
-RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # 31.12.2023
+RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # Format: 31.12.2023
RE_SEMKEZ = r"semkez=(\w+)"
RE_UNITID = r"lerneinheitId=(\d+)"
RE_DOZIDE = r"dozide=(\d+)"
diff --git a/scraper/util/table.py b/scraper/util/table.py
index 1b82f6e..89767fe 100644
--- a/scraper/util/table.py
+++ b/scraper/util/table.py
@@ -1,13 +1,14 @@
from re import Pattern
-from scrapy.http import Response
+
from parsel import Selector, SelectorList
+from scrapy.http import Response
from scraper.util.keymap import TranslationKey, translations
class Table:
"""
- Takes a page and throws all table rows into a list of (key, columns) tuples.
+ Takes a page and transforms all table rows into a list of (key, columns) tuples.
"""
def __init__(
From 1f9ec0ee66617bfb3f5741f197851ee7b0b85c89 Mon Sep 17 00:00:00 2001
From: Mark
Date: Fri, 27 Feb 2026 22:24:59 +0100
Subject: [PATCH 3/5] rescrape and add option to flag pages
---
.../meta_versions/357b241a4250_add_flagged.py | 43 +++++++++++
api/models.py | 2 +
scraper/env.py | 4 +-
scraper/main.py | 24 +-----
scraper/settings.py | 2 +-
scraper/spiders/lecturers.py | 14 +++-
scraper/spiders/units.py | 32 +++++++-
scraper/util/{ => caching}/db_httpcache.py | 11 +++
scraper/util/caching/rescrape.py | 73 +++++++++++++++++++
9 files changed, 176 insertions(+), 29 deletions(-)
create mode 100644 alembic/meta_versions/357b241a4250_add_flagged.py
rename scraper/util/{ => caching}/db_httpcache.py (93%)
create mode 100644 scraper/util/caching/rescrape.py
diff --git a/alembic/meta_versions/357b241a4250_add_flagged.py b/alembic/meta_versions/357b241a4250_add_flagged.py
new file mode 100644
index 0000000..34946b5
--- /dev/null
+++ b/alembic/meta_versions/357b241a4250_add_flagged.py
@@ -0,0 +1,43 @@
+"""add flagged
+
+Revision ID: 357b241a4250
+Revises: c28cde0a90db
+Create Date: 2026-02-27 22:17:37.706745
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "357b241a4250"
+down_revision: Union[str, Sequence[str], None] = "c28cde0a90db"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ """Upgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ with op.batch_alter_table("httpcache", schema=None) as batch_op:
+ batch_op.add_column(
+ sa.Column(
+ "flagged", sa.Boolean(), nullable=False, server_default=sa.false()
+ )
+ )
+
+ with op.batch_alter_table("httpcache", schema=None) as batch_op:
+ batch_op.alter_column("flagged", server_default=None)
+ # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+ """Downgrade schema."""
+ # ### commands auto generated by Alembic - please adjust! ###
+ with op.batch_alter_table("httpcache", schema=None) as batch_op:
+ batch_op.drop_column("flagged")
+
+ # ### end Alembic commands ###
diff --git a/api/models.py b/api/models.py
index dcc60a1..6961a62 100644
--- a/api/models.py
+++ b/api/models.py
@@ -598,6 +598,8 @@ class HTTPCache(MetadataModel, table=True):
status_code: int
body: bytes | None = Field(default=None)
headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON))
+ flagged: bool = Field(default=False)
+ """if set, the entry will be rescraped the next time it's accessed"""
scraped_at: int = Field(
default_factory=lambda: int(time.time()),
sa_column=Column(INTEGER, nullable=False),
diff --git a/scraper/env.py b/scraper/env.py
index 86e1a9c..a1f4f66 100644
--- a/scraper/env.py
+++ b/scraper/env.py
@@ -24,10 +24,10 @@ class Settings(BaseSettings):
log_append: bool = True
disable_log_file: bool = False
- # delete valid cached files
+ # rescrapes the full course pages, while only rescraping
+ # the given amount of the oldest cached unit pages
enable_rescrape: bool = False
rescrape_amount: int = 500
- rescrape_age_seconds: int = 24 * 3600 * 14 # 14 days
def read_semesters(self) -> list[Literal["W", "S"]]:
semesters: list[Literal["W", "S"]] = []
diff --git a/scraper/main.py b/scraper/main.py
index e903ab8..40809f6 100644
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -18,6 +18,7 @@
from scraper.spiders.lecturers import LecturersSpider
from scraper.spiders.ratings import RatingsSpider
from scraper.spiders.units import UnitsSpider
+from scraper.util.caching.rescrape import get_last_semesters
logger = logging.getLogger(__name__)
@@ -41,27 +42,10 @@ def add_stdout_logging(settings: Settings):
def crawl():
settings = get_project_settings()
add_stdout_logging(settings)
-
process = CrawlerProcess(settings)
-
- # cleanup cache if required
- if EnvSettings().enable_rescrape:
- with next(get_session()) as session:
- semkezs = session.exec(
- select(distinct(LearningUnit.semkez))
- .order_by(col(LearningUnit.semkez).desc())
- .limit(2)
- ).all()
- if not semkezs:
- logger.info("No semesters found in database, scraping all semesters.")
- semkezs = None
- process.crawl(UnitsSpider, semkezs=semkezs)
- # process.crawl(LecturersSpider, semkezs=semkezs)
- # process.crawl(RatingsSpider)
- else:
- process.crawl(UnitsSpider)
- # process.crawl(LecturersSpider)
- # process.crawl(RatingsSpider)
+ process.crawl(UnitsSpider)
+ # process.crawl(LecturersSpider)
+ # process.crawl(RatingsSpider)
process.start()
diff --git a/scraper/settings.py b/scraper/settings.py
index f8a5ffa..fb2c50a 100644
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -91,7 +91,7 @@
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_HTTP_CODES = []
-HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache"
+HTTPCACHE_STORAGE = "scraper.util.caching.db_httpcache.DBHTTPCache"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
diff --git a/scraper/spiders/lecturers.py b/scraper/spiders/lecturers.py
index 59d1229..2382a2d 100644
--- a/scraper/spiders/lecturers.py
+++ b/scraper/spiders/lecturers.py
@@ -5,6 +5,7 @@
from api.models import Lecturer
from scraper.env import Settings
+from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS
from scraper.util.logging import KeywordLoggerSpider
from scraper.util.regex_rules import RE_DOZIDE, RE_SEMKEZ
@@ -18,11 +19,11 @@ def get_urls(year: int, semester: Literal["W", "S"]):
class LecturersSpider(KeywordLoggerSpider):
name: str = "lecturers"
- def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
- if semkezs is not None:
+ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
+ if RESCRAPE_SEMKEZS is not None:
self.start_urls: list[str] = [
url
- for semkez in semkezs
+ for semkez in RESCRAPE_SEMKEZS
for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W")
]
else:
@@ -36,6 +37,13 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr
@override
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "Skipping cached lecturers page due to rescrape settings",
+ extra={"url": response.url},
+ )
+ return
+
semkez = re.search(RE_SEMKEZ, response.url)
if not semkez:
self.logger.error(
diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py
index 1eac4ca..48be03f 100644
--- a/scraper/spiders/units.py
+++ b/scraper/spiders/units.py
@@ -39,6 +39,7 @@
)
from scraper.env import Settings
from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
+from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS
from scraper.util.logging import KeywordLoggerSpider
from scraper.util.regex_rules import (
RE_ABSCHNITTID,
@@ -131,11 +132,11 @@ class UnitsSpider(KeywordLoggerSpider):
)
course_ids: dict[str, set[int]] = defaultdict(set)
- def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
- if semkezs is not None:
+ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
+ if RESCRAPE_SEMKEZS is not None:
self.start_urls: list[str] = [
url
- for semkez in semkezs
+ for semkez in RESCRAPE_SEMKEZS
for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W")
]
else:
@@ -149,6 +150,16 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr
@override
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "Skipping cached catalogue page due to rescrape settings",
+ extra={
+ "url": response.url,
+ "request_url": response.request.url if response.request else None,
+ },
+ )
+ return
+
try:
catalog_semkez = re.search(RE_SEMKEZ, response.url)
if not catalog_semkez:
@@ -361,6 +372,11 @@ def parse_unit(
Example url: https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?semkez=2025W&ansicht=ALLE&lerneinheitId=192945&lang=en
"""
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ # the http cache will automatically refetch a set of pages
+ # if there are explicit semkezs to rescrape
+ return
+
try:
if "red9.ethz.ch" in response.url:
self.logger.info(
@@ -561,6 +577,16 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]:
"""
Example: www.vvz.ethz.ch/Vorlesungsverzeichnis/legendeStudienplanangaben.view?abschnittId=117361&semkez=2025W&lang=en
"""
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "Skipping cached legend page due to rescrape settings",
+ extra={
+ "url": response.url,
+ "request_url": response.request.url if response.request else None,
+ },
+ )
+ return
+
try:
semkez = re.search(RE_SEMKEZ, response.url)
id = re.search(RE_ABSCHNITTID, response.url)
diff --git a/scraper/util/db_httpcache.py b/scraper/util/caching/db_httpcache.py
similarity index 93%
rename from scraper/util/db_httpcache.py
rename to scraper/util/caching/db_httpcache.py
index ec4360a..284b16b 100644
--- a/scraper/util/db_httpcache.py
+++ b/scraper/util/caching/db_httpcache.py
@@ -13,6 +13,7 @@
from api.models import HTTPCache
from api.util.db import meta_engine
+from scraper.util.caching.rescrape import should_rescrape
from scraper.util.url import normalized_url
@@ -33,10 +34,20 @@ def close_spider(self, spider: Spider) -> None:
@override
def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
url = self._normalize_url(request.url)
+
+ if should_rescrape(url):
+ self.logger.info(
+ "URL marked for rescraping, skipping cache",
+ extra={"url": url},
+ )
+ return None
+
with Session(meta_engine.connect()) as session:
entry = session.get(HTTPCache, url)
if not entry:
return None
+ if entry.flagged:
+ return None
headers = (
{k.encode(): v.encode() for k, v in entry.headers.items()}
diff --git a/scraper/util/caching/rescrape.py b/scraper/util/caching/rescrape.py
new file mode 100644
index 0000000..7af5126
--- /dev/null
+++ b/scraper/util/caching/rescrape.py
@@ -0,0 +1,73 @@
+"""
+Cache layout
+
+All pages are cached in an sqlite DB with DBHTTPCache.
+
+If enable_rescrape is enabled:
+- At most rescrape_amount of the oldest cached unit
+pages from the most recent semester are rescraped.
+- seite=0 pages are rescraped if older than an hour
+"""
+
+from time import time
+
+from sqlmodel import col, distinct, or_, select
+
+from api.models import HTTPCache, LearningUnit
+from api.util.db import get_meta_session, get_session
+from scraper.env import Settings
+
+settings = Settings()
+enable_rescrape = Settings().enable_rescrape
+rescrape_amount = Settings().rescrape_amount
+
+
+def get_last_semesters(n: int) -> list[str]:
+ with next(get_session()) as session:
+ semkezs = session.exec(
+ select(distinct(LearningUnit.semkez))
+ .order_by(col(LearningUnit.semkez).desc())
+ .limit(n)
+ ).all()
+ return list(semkezs)
+
+
+RESCRAPE_SEMKEZS = get_last_semesters(1) if enable_rescrape else None
+
+# gets the outdated urls and any seite=0 urls
+clauses = []
+oldest_urls = set[str]()
+flagged = set[str]()
+if RESCRAPE_SEMKEZS is not None:
+ clauses = or_(
+ *[
+ col(HTTPCache.url).contains(f"semkez={semkez}")
+ for semkez in RESCRAPE_SEMKEZS
+ ]
+ )
+ with next(get_meta_session()) as session:
+ oldest_urls = set(
+ session.exec(
+ select(HTTPCache.url)
+ .order_by(col(HTTPCache.scraped_at))
+ .where(clauses)
+ .limit(rescrape_amount)
+ ).all()
+ )
+
+ # seite=0 pages
+ seite0_urls = session.exec(
+ select(HTTPCache.url)
+ .where(
+ clauses,
+ col(HTTPCache.url).contains("seite=0"),
+ col(HTTPCache.scraped_at) < int(time()) - 3600, # older than an hour
+ )
+ .order_by(col(HTTPCache.scraped_at))
+ .limit(50)
+ ).all()
+ oldest_urls.update(seite0_urls)
+
+
+def should_rescrape(url: str):
+ return enable_rescrape and url in oldest_urls
From 060abd838b288c8eaea1147f64fdfc83594497f9 Mon Sep 17 00:00:00 2001
From: Mark
Date: Fri, 27 Feb 2026 22:30:24 +0100
Subject: [PATCH 4/5] partially remove unit changes
---
api/models.py | 2 ++
api/routers/v1/units.py | 4 ++-
scraper/pipelines.py | 41 +----------------------
scraper/util/difference.py | 66 --------------------------------------
4 files changed, 6 insertions(+), 107 deletions(-)
delete mode 100644 scraper/util/difference.py
diff --git a/api/models.py b/api/models.py
index 6961a62..ba63621 100644
--- a/api/models.py
+++ b/api/models.py
@@ -514,6 +514,7 @@ def search_query(self) -> str:
"""
+# TODO: REMOVE
class UnitChanges(BaseModel, table=True):
"""We keep track of changes that get applied to learning units"""
@@ -524,6 +525,7 @@ class UnitChanges(BaseModel, table=True):
"""The scraped_at before the changes were applied"""
+# TODO: move to metadata db
class FinishedScrapingSemester(BaseModel, table=True):
"""Keeps track of which semesters have been fully scraped already."""
diff --git a/api/routers/v1/units.py b/api/routers/v1/units.py
index 50616e2..88af1fd 100644
--- a/api/routers/v1/units.py
+++ b/api/routers/v1/units.py
@@ -70,9 +70,11 @@ async def get_unit_lecturers(
@router.get(
"/{unit_id}/changes",
response_model=Sequence[UnitChanges],
- description="Get a list of changes that the course details have undergone. "
+ description="WILL BE REMOVED BEGINNING OF MARCH 2026. It's too broken.\n"
+ + "Get a list of changes that the course details have undergone. "
+ "Changes are a JSON object that describe what the values were before they "
+ "got updated to either the next change or whatever the model currently has.",
+ deprecated=True,
)
async def get_unit_changes(
unit_id: int,
diff --git a/scraper/pipelines.py b/scraper/pipelines.py
index 17a8271..22ee58e 100644
--- a/scraper/pipelines.py
+++ b/scraper/pipelines.py
@@ -6,7 +6,7 @@
from pydantic import BaseModel
from scrapy import Spider
from scrapy.utils.log import SpiderLoggerAdapter
-from sqlmodel import Session, col, select
+from sqlmodel import Session, select
from api.models import (
Course,
@@ -16,10 +16,8 @@
LearningUnit,
Lecturer,
Level,
- Overwriteable,
Rating,
Section,
- UnitChanges,
UnitExaminerLink,
UnitLecturerLink,
UnitSectionLink,
@@ -28,7 +26,6 @@
from api.util import db
from scraper.spiders.units import UnitsSpider
from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
-from scraper.util.difference import find_unit_differences
from scraper.util.scrapercache import CACHE_PATH
DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl"
@@ -143,42 +140,6 @@ def process_item(self, item: object, spider: Spider):
if not old:
self.session.add(item)
self.session.commit()
- elif isinstance(old, Overwriteable):
- if isinstance(old, LearningUnit) and isinstance(item, LearningUnit):
- # determine if there are any differences
- if differences := find_unit_differences(old, item):
- old_changes = self.session.exec(
- select(UnitChanges)
- .where(
- UnitChanges.changes == differences.changes,
- UnitChanges.unit_id == differences.unit_id,
- )
- .order_by(col(UnitChanges.scraped_at).desc())
- ).one_or_none()
- if old_changes:
- self.logger.warning(
- "Detecting duplicate changes. Only updating scraped_at",
- extra={
- "unit_id": old.id,
- "changes": differences.changes,
- "changes_id": old_changes.id,
- },
- )
- old_changes.scraped_at = differences.scraped_at
- else:
- self.logger.info(
- "LearningUnit changes detected",
- extra={
- "unit_id": old.id,
- "changes": differences.changes,
- },
- )
- self.session.add(differences)
-
- old.overwrite_with(item)
- old.scraped_at = int(time.time())
- self.session.add(old)
- self.session.commit()
return item
except Exception as e:
diff --git a/scraper/util/difference.py b/scraper/util/difference.py
deleted file mode 100644
index e6aff57..0000000
--- a/scraper/util/difference.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# pyright: reportExplicitAny=false,reportAny=false
-
-
-from typing import Any, Literal
-from api.models import LearningUnit, UnitChanges
-
-
-def _determine_lang(unit: LearningUnit) -> Literal["en", "de"]:
- """
- Determines if a new unit model is added as an english or german part.
- Defaults to "de" if no English fields are set.
- """
- for field, value in unit:
- if field.endswith("_english") and value is not None:
- return "en"
- return "de"
-
-
-def _has_language_key(unit: LearningUnit, lang: Literal["en", "de"]):
- """Determines if a model already has keys of a language"""
- for field, value in unit:
- if field.endswith("_english"):
- if lang == "en" and value is not None:
- return True
- elif lang == "de" and getattr(unit, field[:-8]) is not None:
- return True
- return False
-
-
-def find_unit_differences(old: LearningUnit, new: LearningUnit) -> UnitChanges | None:
- """
- Determines if there are any differences between an already existing model (from the DB)
- and from a newly yielded item. The new item is either an English or German unit, meaning
- either the English or German catalogue data is filled out, while the other language fields
- are None. In a scraping run we'll always get both a "German" and "English" catalogue data
- unit as well as an "English" unit with the additional data.
- By determining the language of a model we avoid the issue where `old` is "English", while
- `new` is "German", so all the English fields are incorrectly identified as having been
- removed because they're not present in the new model anymore.
- """
-
- if old.id != new.id:
- raise ValueError("Can only compare LearningUnits with the same unit_id")
-
- new_lang = _determine_lang(new)
- if not _has_language_key(old, new_lang):
- # There are no differences to check, since the old model does not have any language
- # specific values of the same language as the new item.
- return None
-
- diffs: dict[str, Any] = {}
- # only iterate over explicitly set fields to avoid checking default/None values
- for field in new.model_fields_set:
- val_old = getattr(old, field)
- val_new = getattr(new, field)
- if val_old != val_new:
- diffs[field] = val_old
-
- if not diffs:
- return None
-
- return UnitChanges(
- unit_id=old.id,
- changes=diffs,
- scraped_at=old.scraped_at,
- )
From 889fd2d886ea339006d4e55accaf43e5104888ca Mon Sep 17 00:00:00 2001
From: Mark
Date: Fri, 27 Feb 2026 23:04:13 +0100
Subject: [PATCH 5/5] add last-updated text on unit page
---
.github/workflows/build.yml | 15 ++++++++++-----
api/models.py | 3 +++
api/templates/pages/Unit/Index.jinja | 6 +++++-
justfile | 4 ++++
scraper/main.py | 9 +++------
scraper/pipelines.py | 6 ++++++
scraper/spiders/lecturers.py | 2 +-
scraper/spiders/units.py | 4 ++--
scraper/util/caching/db_httpcache.py | 7 +++++++
9 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 21fa01d..776cb38 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -43,11 +43,16 @@ jobs:
- name: Check SQLite Alembic Migrations
run: |
- uv run alembic upgrade heads
- uv run alembic check
- uv run alembic downgrade base
- uv run alembic upgrade heads
- uv run alembic check
+ uv run alembic -n data_db upgrade heads
+ uv run alembic -n meta_db upgrade heads
+ uv run alembic -n data_db check
+ uv run alembic -n meta_db check
+ uv run alembic -n data_db downgrade base
+ uv run alembic -n meta_db downgrade base
+ uv run alembic -n data_db upgrade heads
+ uv run alembic -n meta_db upgrade heads
+ uv run alembic -n data_db check
+ uv run alembic -n meta_db check
build:
needs: test
diff --git a/api/models.py b/api/models.py
index ba63621..56e7861 100644
--- a/api/models.py
+++ b/api/models.py
@@ -368,6 +368,9 @@ def departments_as_short_str(self) -> str:
def levels_as_str(self) -> str:
return ", ".join([str(level) for level in self.levels])
+ def last_updated(self) -> str:
+ return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.scraped_at))
+
"""
diff --git a/api/templates/pages/Unit/Index.jinja b/api/templates/pages/Unit/Index.jinja
index 213bae2..0f80980 100644
--- a/api/templates/pages/Unit/Index.jinja
+++ b/api/templates/pages/Unit/Index.jinja
@@ -34,7 +34,7 @@
-
+
@@ -157,7 +157,11 @@
+
+
+ Last Updated: {{ unit.last_updated() }}
+
diff --git a/justfile b/justfile
index 18f799a..fc2cf46 100644
--- a/justfile
+++ b/justfile
@@ -11,6 +11,10 @@ migrate:
uv run alembic -n data_db upgrade heads
uv run alembic -n meta_db upgrade heads
+check:
+ uv run alembic -n data_db check
+ uv run alembic -n meta_db check
+
alias s := scrape
scrape:
diff --git a/scraper/main.py b/scraper/main.py
index 40809f6..f753d16 100644
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -8,17 +8,14 @@
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
-from sqlmodel import col, distinct, select, text
+from sqlmodel import text
from api.env import Settings as APISettings
-from api.models import LearningUnit
from api.util.db import get_session
from api.util.materialize import update_materialized_views
-from scraper.env import Settings as EnvSettings
from scraper.spiders.lecturers import LecturersSpider
from scraper.spiders.ratings import RatingsSpider
from scraper.spiders.units import UnitsSpider
-from scraper.util.caching.rescrape import get_last_semesters
logger = logging.getLogger(__name__)
@@ -44,8 +41,8 @@ def crawl():
add_stdout_logging(settings)
process = CrawlerProcess(settings)
process.crawl(UnitsSpider)
- # process.crawl(LecturersSpider)
- # process.crawl(RatingsSpider)
+ process.crawl(LecturersSpider)
+ process.crawl(RatingsSpider)
process.start()
diff --git a/scraper/pipelines.py b/scraper/pipelines.py
index 22ee58e..54a1a10 100644
--- a/scraper/pipelines.py
+++ b/scraper/pipelines.py
@@ -16,6 +16,7 @@
LearningUnit,
Lecturer,
Level,
+ Overwriteable,
Rating,
Section,
UnitExaminerLink,
@@ -140,6 +141,11 @@ def process_item(self, item: object, spider: Spider):
if not old:
self.session.add(item)
self.session.commit()
+ elif isinstance(old, Overwriteable):
+ old.overwrite_with(item)
+ old.scraped_at = int(time.time())
+ self.session.add(old)
+ self.session.commit()
return item
except Exception as e:
diff --git a/scraper/spiders/lecturers.py b/scraper/spiders/lecturers.py
index 2382a2d..4c866a0 100644
--- a/scraper/spiders/lecturers.py
+++ b/scraper/spiders/lecturers.py
@@ -39,7 +39,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
if RESCRAPE_SEMKEZS and "cached" in response.flags:
self.logger.info(
- "Skipping cached lecturers page due to rescrape settings",
+ "RESCRAPE is on. Not implicitly rescraping lecturers page.",
extra={"url": response.url},
)
return
diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py
index 48be03f..5fc4df8 100644
--- a/scraper/spiders/units.py
+++ b/scraper/spiders/units.py
@@ -152,7 +152,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
if RESCRAPE_SEMKEZS and "cached" in response.flags:
self.logger.info(
- "Skipping cached catalogue page due to rescrape settings",
+ "RESCRAPE is on. Not implicitly rescraping catalogue page.",
extra={
"url": response.url,
"request_url": response.request.url if response.request else None,
@@ -579,7 +579,7 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]:
"""
if RESCRAPE_SEMKEZS and "cached" in response.flags:
self.logger.info(
- "Skipping cached legend page due to rescrape settings",
+ "RESCRAPE is on. Not implicitly rescraping legend page.",
extra={
"url": response.url,
"request_url": response.request.url if response.request else None,
diff --git a/scraper/util/caching/db_httpcache.py b/scraper/util/caching/db_httpcache.py
index 284b16b..dcd7b8a 100644
--- a/scraper/util/caching/db_httpcache.py
+++ b/scraper/util/caching/db_httpcache.py
@@ -47,6 +47,10 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None
if not entry:
return None
if entry.flagged:
+ self.logger.info(
+ "URL flagged for rescraping, skipping cache",
+ extra={"url": url},
+ )
return None
headers = (
@@ -74,6 +78,9 @@ def store_response(
self.store(request.url, response, None)
def store(self, url: str, response: Response, timestamp: float | None):
+ if response.status == 302:
+ return
+
url = self._normalize_url(url)
headers: dict[str, str] = dict(response.headers.to_unicode_dict())
with Session(meta_engine.connect()) as session: