From 4a786d9e23e9a4954088141a8b17253e0ac1b982 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 16:43:01 +0100 Subject: [PATCH 1/5] refactor(scraper): cache pages in sqlite db --- alembic.ini | 12 +- alembic/env.py | 121 ++++++++++++++-- .../fd345d2b7d78_httpcache_table.py | 42 ++++++ api/env.py | 1 + api/models.py | 28 +++- api/util/db.py | 24 +++ justfile | 3 +- scraper/settings.py | 2 +- scraper/util/db_httpcache.py | 137 ++++++++++++++++++ scraper/util/url.py | 6 + 10 files changed, 355 insertions(+), 21 deletions(-) create mode 100644 alembic/meta_versions/fd345d2b7d78_httpcache_table.py create mode 100644 scraper/util/db_httpcache.py diff --git a/alembic.ini b/alembic.ini index 1b03b05..8dbe1ab 100644 --- a/alembic.ini +++ b/alembic.ini @@ -1,6 +1,6 @@ # A generic, single database configuration. -[alembic] +[DEFAULT] # path to migration scripts. # this is typically a path given in POSIX (e.g. forward slashes) # format, relative to the token %(here)s which refers to the location of this @@ -19,6 +19,7 @@ script_location = %(here)s/alembic prepend_sys_path = . + # timezone to use when rendering the date within the migration file # as well as the filename. # If specified, requires the tzdata library which can be installed by adding @@ -44,7 +45,6 @@ prepend_sys_path = . # directories, initial revisions must be specified with --version-path. # The path separator used here should be the separator specified by "path_separator" # below. -# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions # path_separator; This indicates what character is used to split lists of file # paths, including version_locations and prepend_sys_path within configparser @@ -81,11 +81,11 @@ path_separator = os # are written from script.py.mako # output_encoding = utf-8 -# database URL. This is consumed by the user-maintained env.py script only. -# other means of configuring database URLs may be customized within the env.py -# file. 
-sqlalchemy.url = driver://user:pass@localhost/dbname +[data_db] +version_locations = %(here)s/alembic/versions +[meta_db] +version_locations = %(here)s/alembic/meta_versions [post_write_hooks] # post_write_hooks defines scripts or Python functions that are run diff --git a/alembic/env.py b/alembic/env.py index c1ff233..a509feb 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,16 @@ -import pathlib +from collections.abc import MutableMapping from logging.config import fileConfig +from pathlib import Path +from typing import Literal + +from sqlalchemy import Table +from sqlalchemy.sql.schema import SchemaItem +from sqlmodel import SQLModel from alembic import context -from api.util.db import engine from api.env import Settings -from api.models import BaseModel +from api.models import BaseModel, MetadataModel +from api.util.db import engine, meta_engine # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -14,18 +20,109 @@ fileConfig(config.config_file_name) +def _get_table_names(base_cls: type) -> set[str]: + """Recursively collect __tablename__ from all table-model subclasses.""" + names: set[str] = set() + for cls in base_cls.__subclasses__(): + tablename = getattr(cls, "__tablename__", None) + if isinstance(tablename, str) and hasattr(cls, "__table__"): + names.add(tablename) + names |= _get_table_names(cls) + return names + + +# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata. +# We distinguish which tables belong to which DB by walking subclasses instead. 
+_base_table_names = _get_table_names(BaseModel) +_meta_table_names = _get_table_names(MetadataModel) + +IncludeNameType = Literal[ + "schema", + "table", + "column", + "index", + "unique_constraint", + "foreign_key_constraint", +] +ParentNamesType = MutableMapping[ + Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None +] + + +def _include_name_base( + name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType +) -> bool: + if type_ == "table": + return name in _base_table_names + return True + + +def _include_name_meta( + name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType +) -> bool: + if type_ == "table": + return name in _meta_table_names + return True + + +def _include_object_base( + object_: SchemaItem, + name: str | None, + type_: str, + _reflected: bool, + _compare_to: SchemaItem | None, +) -> bool: + if type_ == "table": + if isinstance(object_, Table): + return object_.name in _base_table_names + return name in _base_table_names if name else False + return True + + +def _include_object_meta( + object_: SchemaItem, + name: str | None, + type_: str, + _reflected: bool, + _compare_to: SchemaItem | None, +) -> bool: + if type_ == "table": + if isinstance(object_, Table): + return object_.name in _meta_table_names + return name in _meta_table_names if name else False + return True + + def run_migrations() -> None: - pathlib.Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True) + if "".join(config.get_version_locations_list() or "").endswith("meta_versions"): + # metadata db + Path(Settings().meta_db_path).parent.mkdir(parents=True, exist_ok=True) + with meta_engine.connect() as connection: + context.configure( + connection=connection, + target_metadata=SQLModel.metadata, + render_as_batch=True, + include_name=_include_name_meta, + include_object=_include_object_meta, + ) + + with context.begin_transaction(): + context.run_migrations() - with engine.connect() as connection: - 
context.configure( - connection=connection, - target_metadata=BaseModel.metadata, - render_as_batch=True, - ) + else: + # data db + Path(Settings().db_path).parent.mkdir(parents=True, exist_ok=True) + with engine.connect() as connection: + context.configure( + connection=connection, + target_metadata=SQLModel.metadata, + render_as_batch=True, + include_name=_include_name_base, + include_object=_include_object_base, + ) - with context.begin_transaction(): - context.run_migrations() + with context.begin_transaction(): + context.run_migrations() run_migrations() diff --git a/alembic/meta_versions/fd345d2b7d78_httpcache_table.py b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py new file mode 100644 index 0000000..cb67731 --- /dev/null +++ b/alembic/meta_versions/fd345d2b7d78_httpcache_table.py @@ -0,0 +1,42 @@ +"""httpcache table + +Revision ID: fd345d2b7d78 +Revises: +Create Date: 2026-02-27 17:45:49.644585 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +import sqlmodel + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "fd345d2b7d78" +down_revision: Union[str, Sequence[str], None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "httpcache", + sa.Column("url", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("status_code", sa.Integer(), nullable=False), + sa.Column("body", sa.LargeBinary(), nullable=True), + sa.Column("headers", sa.JSON(), nullable=True), + sa.Column("scraped_at", sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint("url"), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("httpcache") + # ### end Alembic commands ### diff --git a/api/env.py b/api/env.py index 66f2d60..5574641 100644 --- a/api/env.py +++ b/api/env.py @@ -10,6 +10,7 @@ class Settings(BaseSettings): ) db_path: str = "data/db.sqlite" + meta_db_path: str = "data/meta_db.sqlite" cache_expiry: int = 60 * 60 * 24 * 30 # in seconds (30 days) sitemap_expiry: int = 86400 # in seconds plausible_url: str | None = None diff --git a/api/models.py b/api/models.py index 64dba0f..2ffd105 100644 --- a/api/models.py +++ b/api/models.py @@ -2,7 +2,7 @@ import time from enum import Enum -from typing import final, override +from typing import Mapping, final, override from pydantic import BaseModel as PydanticBaseModel from rapidfuzz import fuzz, process, utils @@ -586,3 +586,29 @@ class SectionPathView(BaseModel, table=True): class UnitDepartmentView(BaseModel, table=True): unit_id: int = Field(primary_key=True) department_id: int = Field(primary_key=True, index=True) + + +""" + + +METADATA DATABASE +(separate db) + + + +""" + + +class MetadataModel(SQLModel): + pass + + +class HTTPCache(MetadataModel, table=True): + url: str = Field(primary_key=True) + status_code: int + body: bytes | None = Field(default=None) + headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON)) + scraped_at: int = Field( + default_factory=lambda: int(time.time()), + sa_column=Column(INTEGER, nullable=False), + ) diff --git a/api/util/db.py b/api/util/db.py index d02f20c..abd4ab9 100644 --- a/api/util/db.py +++ b/api/util/db.py @@ -28,3 +28,27 @@ async def aget_session(): async with AsyncSession(aengine) as session: await session.execute(text("pragma mmap_size=30000000000")) yield session + + +meta_engine = create_engine( + f"sqlite+pysqlite:///{Settings().meta_db_path}", json_serializer=json_serializer +) + +ameta_engine = create_async_engine( + f"sqlite+aiosqlite:///{Settings().meta_db_path}", + json_serializer=json_serializer, + pool_size=20, + max_overflow=30, +) + + 
+def meta_get_session(): + with Session(meta_engine) as session: + session.execute(text("PRAGMA foreign_keys=ON")) + yield session + + +async def ameta_get_session(): + async with AsyncSession(ameta_engine) as session: + await session.execute(text("pragma mmap_size=30000000000")) + yield session diff --git a/justfile b/justfile index 1e9c18f..18f799a 100644 --- a/justfile +++ b/justfile @@ -8,7 +8,8 @@ dev: alias m := migrate migrate: - uv run alembic upgrade heads + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads alias s := scrape diff --git a/scraper/settings.py b/scraper/settings.py index a64c1c8..afc614d 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -93,7 +93,7 @@ HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_DIR = "httpcache" HTTPCACHE_IGNORE_HTTP_CODES = [] -HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" +HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" # Set settings whose default value is deprecated to a future-proof value FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py new file mode 100644 index 0000000..445c2cd --- /dev/null +++ b/scraper/util/db_httpcache.py @@ -0,0 +1,137 @@ +from email.parser import Parser +from pathlib import Path +from typing import final, override + +import yaml +from rich import print +from scrapy import Request, Spider +from scrapy.extensions import httpcache +from scrapy.http import Response +from scrapy.responsetypes import responsetypes +from scrapy.settings import BaseSettings +from sqlmodel import Session + +from api.models import HTTPCache +from api.util.db import meta_engine +from scraper.util.url import normalized_url + + +@final +class DBHTTPCache(httpcache.FilesystemCacheStorage): + def __init__(self, settings: BaseSettings | None): + if settings: + super().__init__(settings) + + @override + def open_spider(self, spider: Spider) -> None: + self.logger = spider.logger + + @override + def 
close_spider(self, spider: Spider) -> None: + pass + + @override + def retrieve_response(self, spider: Spider, request: Request) -> Response | None: + url = self._normalize_url(request.url) + with Session(meta_engine.connect()) as session: + entry = session.get(HTTPCache, url) + if not entry: + return None + + headers = ( + {k.encode(): v.encode() for k, v in entry.headers.items()} + if entry.headers + else {} + ) + + respcls = responsetypes.from_args( + headers=headers, + url=url, + body=entry.body, + ) + return respcls( + url=url, + headers=headers, + status=entry.status_code, + body=entry.body or b"", + ) + + @override + def store_response( + self, spider: Spider, request: Request, response: Response + ) -> None: + self.store(request.url, response, None) + + def store(self, url: str, response: Response, timestamp: float | None): + url = self._normalize_url(url) + headers: dict[str, str] = dict(response.headers.to_unicode_dict()) + with Session(meta_engine.connect()) as session: + entry = HTTPCache( + url=url, + status_code=response.status, + headers=headers, + body=response.body, + ) + if timestamp is not None: + entry.scraped_at = int(timestamp) + + session.merge(entry) + session.commit() + + def _normalize_url(self, url: str) -> str: + return normalized_url(url) + + +@final +class Migrator: + def __init__(self, cachedir: str) -> None: + self.cachedir = cachedir + self.cache = DBHTTPCache(None) + + def migrate(self): + for dir in self._walk(): + try: + with open(dir / "meta", "r") as f: + # yaml allows us to open the invalid formatted json file + data = yaml.load(f, Loader=yaml.SafeLoader) # pyright: ignore[reportAny] + url: str = data.get("url", "") # pyright: ignore[reportAny] + timestamp: float | None = data.get("timestamp") # pyright: ignore[reportAny] + status: int = data.get("status", 0) # pyright: ignore[reportAny] + with open(dir / "response_body", "rb") as f: + body = f.read() + with open(dir / "response_headers", "r") as f: + parsed = 
Parser().parse(f) + headers = dict(parsed.items()) + self._add(url, timestamp, status, headers, body) + print(f"Migrated {url} from {dir}") + except Exception as e: + print(f"[red]Failed to migrate from {dir}: {e}[/red]") + + def _add( + self, + url: str, + timestamp: float | None, + status: int, + headers: dict[str, str], + body: bytes, + ): + response = Response(url=url, status=status, headers=headers, body=body) + self.cache.store(url, response, timestamp) + + def _walk(self): + cachedir = Path(self.cachedir) + for spiderdir in cachedir.iterdir(): + if not spiderdir.is_dir(): + continue + for shortdir in spiderdir.iterdir(): + if not shortdir.is_dir(): + continue + for requestdir in shortdir.iterdir(): + if not requestdir.is_dir(): + continue + yield requestdir + + +if __name__ == "__main__": + migrator = Migrator(".scrapy/httpcache") + migrator.migrate() diff --git a/scraper/util/url.py b/scraper/util/url.py index 7556efe..d55d8b5 100644 --- a/scraper/util/url.py +++ b/scraper/util/url.py @@ -28,3 +28,9 @@ def sort_url_params(url: str) -> str: sorted_query = dict(sorted(query.items())) url_res = url_res._replace(query=urlencode(sorted_query, True)) return urlunparse(url_res) + + +def normalized_url(url: str) -> str: + sorted = sort_url_params(url) + sorted = sorted.replace(".vorlesungen.", ".vvz.").replace("http://", "https://") + return sorted.strip("/") From db0aad8ca45027984c890be52cf46b2600ec65d3 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 19:08:16 +0100 Subject: [PATCH 2/5] some more cleanup --- alembic/env.py | 89 +++-------- .../c28cde0a90db_move_over_cleanup_table.py | 37 +++++ .../3b1a337a1fe5_move_over_cleanup_table.py | 37 +++++ api/models.py | 22 +-- api/util/db.py | 6 +- scraper/env.py | 9 +- scraper/main.py | 105 +++++++------ scraper/pipelines.py | 2 +- scraper/settings.py | 4 +- scraper/spiders/units.py | 2 +- scraper/{util => types}/mappings.py | 0 scraper/util/cleanup_scrapy.py | 143 ------------------ 
scraper/util/db_httpcache.py | 2 + scraper/util/delete_cached.py | 48 ------ scraper/util/logging.py | 3 +- scraper/util/regex_rules.py | 2 +- scraper/util/table.py | 5 +- 17 files changed, 187 insertions(+), 329 deletions(-) create mode 100644 alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py create mode 100644 alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py rename scraper/{util => types}/mappings.py (100%) delete mode 100644 scraper/util/cleanup_scrapy.py delete mode 100644 scraper/util/delete_cached.py diff --git a/alembic/env.py b/alembic/env.py index a509feb..6b10ec6 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,7 @@ -from collections.abc import MutableMapping from logging.config import fileConfig from pathlib import Path -from typing import Literal -from sqlalchemy import Table -from sqlalchemy.sql.schema import SchemaItem +from sqlalchemy import MetaData from sqlmodel import SQLModel from alembic import context @@ -31,66 +28,28 @@ def _get_table_names(base_cls: type) -> set[str]: return names -# SQLModel ignores the metadata= kwarg and puts all tables into SQLModel.metadata. -# We distinguish which tables belong to which DB by walking subclasses instead. +def _build_filtered_metadata(table_names: set[str]) -> MetaData: + """Build a new MetaData containing only the specified tables from SQLModel.metadata. + + SQLModel ignores the metadata= kwarg and registers all tables into a single + shared SQLModel.metadata. To make Alembic correctly detect additions, changes, + AND removals per-database, we construct a filtered MetaData that only contains + the tables belonging to that database. This way Alembic sees exactly which + tables should exist and can generate drops for any that are missing. + """ + filtered = MetaData() + for name, table in SQLModel.metadata.tables.items(): + if name in table_names: + table.to_metadata(filtered) + return filtered + + +# Distinguish which tables belong to which DB by walking model subclasses. 
_base_table_names = _get_table_names(BaseModel) _meta_table_names = _get_table_names(MetadataModel) -IncludeNameType = Literal[ - "schema", - "table", - "column", - "index", - "unique_constraint", - "foreign_key_constraint", -] -ParentNamesType = MutableMapping[ - Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None -] - - -def _include_name_base( - name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType -) -> bool: - if type_ == "table": - return name in _base_table_names - return True - - -def _include_name_meta( - name: str | None, type_: IncludeNameType, _parent_names: ParentNamesType -) -> bool: - if type_ == "table": - return name in _meta_table_names - return True - - -def _include_object_base( - object_: SchemaItem, - name: str | None, - type_: str, - _reflected: bool, - _compare_to: SchemaItem | None, -) -> bool: - if type_ == "table": - if isinstance(object_, Table): - return object_.name in _base_table_names - return name in _base_table_names if name else False - return True - - -def _include_object_meta( - object_: SchemaItem, - name: str | None, - type_: str, - _reflected: bool, - _compare_to: SchemaItem | None, -) -> bool: - if type_ == "table": - if isinstance(object_, Table): - return object_.name in _meta_table_names - return name in _meta_table_names if name else False - return True +_base_metadata = _build_filtered_metadata(_base_table_names) +_meta_metadata = _build_filtered_metadata(_meta_table_names) def run_migrations() -> None: @@ -100,10 +59,8 @@ def run_migrations() -> None: with meta_engine.connect() as connection: context.configure( connection=connection, - target_metadata=SQLModel.metadata, + target_metadata=_meta_metadata, render_as_batch=True, - include_name=_include_name_meta, - include_object=_include_object_meta, ) with context.begin_transaction(): @@ -115,10 +72,8 @@ def run_migrations() -> None: with engine.connect() as connection: context.configure( connection=connection, - 
target_metadata=SQLModel.metadata, + target_metadata=_base_metadata, render_as_batch=True, - include_name=_include_name_base, - include_object=_include_object_base, ) with context.begin_transaction(): diff --git a/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py new file mode 100644 index 0000000..7777b31 --- /dev/null +++ b/alembic/meta_versions/c28cde0a90db_move_over_cleanup_table.py @@ -0,0 +1,37 @@ +"""move over cleanup table + +Revision ID: c28cde0a90db +Revises: fd345d2b7d78 +Create Date: 2026-02-27 19:08:43.207368 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = 'c28cde0a90db' +down_revision: Union[str, Sequence[str], None] = 'fd345d2b7d78' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('lastcleanup', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('timestamp', sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('lastcleanup') + # ### end Alembic commands ### diff --git a/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py new file mode 100644 index 0000000..b6b2efe --- /dev/null +++ b/alembic/versions/3b1a337a1fe5_move_over_cleanup_table.py @@ -0,0 +1,37 @@ +"""move over cleanup table + +Revision ID: 3b1a337a1fe5 +Revises: aa860aba0a9f +Create Date: 2026-02-27 19:18:10.668579 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = '3b1a337a1fe5' +down_revision: Union[str, Sequence[str], None] = 'aa860aba0a9f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('lastcleanup') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('lastcleanup', + sa.Column('id', sa.INTEGER(), nullable=False), + sa.Column('timestamp', sa.INTEGER(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### diff --git a/api/models.py b/api/models.py index 2ffd105..dcc60a1 100644 --- a/api/models.py +++ b/api/models.py @@ -2,7 +2,7 @@ import time from enum import Enum -from typing import Mapping, final, override +from typing import final, override from pydantic import BaseModel as PydanticBaseModel from rapidfuzz import fuzz, process, utils @@ -530,16 +530,6 @@ class FinishedScrapingSemester(BaseModel, table=True): semkez: str = Field(primary_key=True) -class LastCleanup(BaseModel, table=True): - """Keeps track of when the last cleanup of the scrapy cache was performed.""" - - id: int | None = Field(default=None, primary_key=True) - timestamp: int = Field( - default_factory=lambda: int(time.time()), - sa_column=Column(INTEGER, nullable=False), - ) - - class Rating(BaseModel, table=True): """Course ratings scraped from the CourseReview site""" @@ -612,3 +602,13 @@ class HTTPCache(MetadataModel, table=True): default_factory=lambda: int(time.time()), sa_column=Column(INTEGER, nullable=False), ) + + +class LastCleanup(MetadataModel, table=True): + """Keeps track of when the last cleanup of the scrapy cache was performed.""" + + id: int | None = Field(default=None, primary_key=True) + timestamp: int = Field( + default_factory=lambda: int(time.time()), + sa_column=Column(INTEGER, nullable=False), + ) diff --git a/api/util/db.py b/api/util/db.py index abd4ab9..7900edf 100644 --- a/api/util/db.py +++ b/api/util/db.py @@ -27,6 +27,7 @@ def get_session(): async def aget_session(): async with AsyncSession(aengine) as session: await session.execute(text("pragma mmap_size=30000000000")) + await session.execute(text("PRAGMA foreign_keys=ON")) yield session @@ -42,13 +43,14 @@ async def aget_session(): ) -def meta_get_session(): +def get_meta_session(): with 
Session(meta_engine) as session: session.execute(text("PRAGMA foreign_keys=ON")) yield session -async def ameta_get_session(): +async def aget_meta_session(): async with AsyncSession(ameta_engine) as session: await session.execute(text("pragma mmap_size=30000000000")) + await session.execute(text("PRAGMA foreign_keys=ON")) yield session diff --git a/scraper/env.py b/scraper/env.py index d2555ea..86e1a9c 100644 --- a/scraper/env.py +++ b/scraper/env.py @@ -10,16 +10,13 @@ class Settings(BaseSettings): env_file=".env", env_file_encoding="utf-8", extra="ignore" ) - refresh_html: bool = False - """If html files, that are already cached locally, should be refetched""" - # Semester settings only apply to newly scraped semesters - # RESCRAPE_AMOUNT will overwrite this and cause only the last + # ENABLE_RESCRAPE will overwrite this and cause only the last # two already scraped semesters to be rescraped - start_year: int = date.today().year + start_year: int = date.today().year - 1 # automatically include next year (if it exists) end_year: int = date.today().year + 1 - semester: str = "W" + semester: str = "W,S" delay: float = 5.0 """Amount of seconds to at least wait between requests""" diff --git a/scraper/main.py b/scraper/main.py index 186dc08..e903ab8 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -8,16 +8,18 @@ from scrapy.crawler import CrawlerProcess from scrapy.settings import Settings from scrapy.utils.project import get_project_settings -from sqlmodel import text +from sqlmodel import col, distinct, select, text from api.env import Settings as APISettings +from api.models import LearningUnit from api.util.db import get_session from api.util.materialize import update_materialized_views from scraper.env import Settings as EnvSettings from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider -from scraper.util.delete_cached import delete_cached + +logger = 
logging.getLogger(__name__) def add_stdout_logging(settings: Settings): @@ -36,49 +38,66 @@ def add_stdout_logging(settings: Settings): root.addHandler(sh) -settings = get_project_settings() -add_stdout_logging(settings) +def crawl(): + settings = get_project_settings() + add_stdout_logging(settings) -process = CrawlerProcess(settings) + process = CrawlerProcess(settings) -# cleanup cache if required -if EnvSettings().enable_rescrape: - semkezs = delete_cached() - process.crawl(UnitsSpider, semkezs=semkezs) - process.crawl(LecturersSpider, semkezs=semkezs) - process.crawl(RatingsSpider) -else: - process.crawl(UnitsSpider) - process.crawl(LecturersSpider) - process.crawl(RatingsSpider) -process.start() + # cleanup cache if required + if EnvSettings().enable_rescrape: + with next(get_session()) as session: + semkezs = session.exec( + select(distinct(LearningUnit.semkez)) + .order_by(col(LearningUnit.semkez).desc()) + .limit(2) + ).all() + if not semkezs: + logger.info("No semesters found in database, scraping all semesters.") + semkezs = None + process.crawl(UnitsSpider, semkezs=semkezs) + # process.crawl(LecturersSpider, semkezs=semkezs) + # process.crawl(RatingsSpider) + else: + process.crawl(UnitsSpider) + # process.crawl(LecturersSpider) + # process.crawl(RatingsSpider) + process.start() -logger = logging.getLogger(__name__) -logger.info("Finished scraping data, updating materialized tables") -with next(get_session()) as session: - update_materialized_views(session) - -# vacuum/zip db -logger.info(f"Vacuuming database into {APISettings().vacuum_path}") -if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work - Path(APISettings().vacuum_path).unlink() -with next(get_session()) as session: - session.execute( - text("VACUUM INTO :vacuum_path"), - {"vacuum_path": f"{APISettings().vacuum_path}"}, +def update_materialized_view(): + logger.info("Finished scraping data, updating materialized tables") + with next(get_session()) as session: + 
update_materialized_views(session) + + +def vacuum(): + # vacuum/zip db + logger.info(f"Vacuuming database into {APISettings().vacuum_path}") + if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work + Path(APISettings().vacuum_path).unlink() + with next(get_session()) as session: + session.execute( + text("VACUUM INTO :vacuum_path"), + {"vacuum_path": f"{APISettings().vacuum_path}"}, + ) + logger.info("Finished vacuuming database") + logger.info(f"Creating database zip file at {APISettings().zip_path}") + with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z: + z.write(APISettings().vacuum_path, arcname="database.db") + logger.info("Finished creating database zip file") + db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024) + vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024) + zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024) + logger.info( + f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB" ) -logger.info("Finished vacuuming database") -logger.info(f"Creating database zip file at {APISettings().zip_path}") -with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z: - z.write(APISettings().vacuum_path, arcname="database.db") -logger.info("Finished creating database zip file") -db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024) -vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024) -zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024) -logger.info( - f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB" -) -logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}") -Path(APISettings().vacuum_path).unlink(missing_ok=True) -logger.info("Finished deleting vacuum file.") + logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}") + 
Path(APISettings().vacuum_path).unlink(missing_ok=True) + logger.info("Finished deleting vacuum file.") + + +if __name__ == "__main__": + crawl() + update_materialized_view() + vacuum() diff --git a/scraper/pipelines.py b/scraper/pipelines.py index ebd853c..17a8271 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -27,8 +27,8 @@ ) from api.util import db from scraper.spiders.units import UnitsSpider +from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.difference import find_unit_differences -from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.scrapercache import CACHE_PATH DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl" diff --git a/scraper/settings.py b/scraper/settings.py index afc614d..f8a5ffa 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -89,9 +89,7 @@ # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -HTTPCACHE_ENABLED = not Settings().refresh_html -HTTPCACHE_EXPIRATION_SECS = 0 -HTTPCACHE_DIR = "httpcache" +HTTPCACHE_ENABLED = True HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 07667ed..1eac4ca 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -38,8 +38,8 @@ WeekdayEnum, ) from scraper.env import Settings +from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.logging import KeywordLoggerSpider -from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping from scraper.util.regex_rules import ( RE_ABSCHNITTID, RE_DATE, diff --git a/scraper/util/mappings.py b/scraper/types/mappings.py similarity index 100% rename from scraper/util/mappings.py rename to scraper/types/mappings.py diff --git a/scraper/util/cleanup_scrapy.py b/scraper/util/cleanup_scrapy.py 
deleted file mode 100644 index 0920b28..0000000 --- a/scraper/util/cleanup_scrapy.py +++ /dev/null @@ -1,143 +0,0 @@ -from datetime import datetime -from pathlib import Path -import argparse -import re -from shutil import rmtree -import time -from typing import TypedDict, cast - -import yaml - - -HTTP_CACHE_PATH = Path(".scrapy/httpcache") - -re_units_en = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/lerneinheit\.view\?ansicht=ALLE&lang=en&lerneinheitId=\d+&semkez=\d{4}\w" -re_root_units = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheLehrangebot\.view\?semkez=\d{4}\w&ansicht=2&seite=0(&deptId=\d+)?(&studiengangTyp=\w+)?&lang=\w\w" -re_legends = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/legendeStudienplanangaben\.view\?abschnittId=\d+&lang=en&semkez=\d{4}\w" - -re_lecturers_root = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheDozierende\.view\?lang=de&semkez=\d{4}\w&seite=0" - - -class FileMetadata(TypedDict): - url: str - timestamp: int - - -def get_files(path: Path): - if not path.is_dir(): - return - for top in path.iterdir(): - if top.is_dir(): - for bot in top.iterdir(): - if bot.is_dir(): - meta = bot / "meta" - if not meta.exists(): - yield "", bot, 0 - with open(meta, "r") as f: - # yaml allows us to open the invalid formatted json file - data = cast( - FileMetadata, - yaml.load(f, Loader=yaml.SafeLoader), - ) - yield data.get("url", ""), bot, data.get("timestamp", 0) - - -def cleanup_scrapy( - dry_run: bool = False, - delete_cached_semesters: list[str] | None = None, - amount: int = 100, - age_seconds: int = 0, -): - if delete_cached_semesters is None: - delete_cached_semesters = [] - - unts, lecrs = 0, 0 - cached_unts = 0 - units = HTTP_CACHE_PATH / "units" - lecturers = HTTP_CACHE_PATH / "lecturers" - now = time.time() - - for url, dir, timestamp in get_files(units): - dt = datetime.fromtimestamp(timestamp) - # delete files that we do not use anymore - if ( - not re.match(re_units_en, url) - and not 
re.match(re_root_units, url) - and not re.match(re_legends, url) - ): - print(f"Delete unit: {dir}: URL mismatch {url}: {dt}") - unts += 1 - if not dry_run: - rmtree(dir) - # delete files from cached semesters - if cached_unts < amount: - for sem in delete_cached_semesters: - if f"semkez={sem}" in url and (now - timestamp) > age_seconds: - print(f"Delete unit: {dir}: Cached semester {sem} {url}: {dt}") - unts += 1 - cached_unts += 1 - if not dry_run: - rmtree(dir) - break - - for url, dir, timestamp in get_files(lecturers): - dt = datetime.fromtimestamp(timestamp) - if not re.match(re_lecturers_root, url): - print(f"Delete lecturer: {dir}: URL mismatch {url}: {dt}") - lecrs += 1 - if not dry_run: - rmtree(dir) - - print( - f"===============\nDeleted {unts} files in lecturers dir\nDeleted {lecrs} files in lecturers dir" - ) - - -if __name__ == "__main__": - - class Arguments(argparse.Namespace): - dry_run: bool - delete_cached_semesters: list[str] - amount: int - age_seconds: int - - parser = argparse.ArgumentParser(description="Cleanup scrapy cache") - parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be deleted without actually deleting", - default=False, - ) - parser.add_argument( - "-d", - "--delete-cached-semesters", - nargs="+", - help="List of semesters to delete from cache, e.g., 2023W 2024S", - default=[], - ) - parser.add_argument( - "-n", - "--amount", - type=int, - help="Amount of cached semesters to delete", - default=100, - ) - parser.add_argument( - "--age-seconds", - type=int, - help="Delete cached files older than this many seconds", - default=0, - ) - parser.add_argument( - "--age-seconds", - type=int, - help="Delete cached files older than this many seconds", - default=0, - ) - args = parser.parse_args(namespace=Arguments()) - cleanup_scrapy( - dry_run=args.dry_run, - delete_cached_semesters=args.delete_cached_semesters, - amount=args.amount, - age_seconds=args.age_seconds, - ) diff --git 
a/scraper/util/db_httpcache.py b/scraper/util/db_httpcache.py index 445c2cd..ec4360a 100644 --- a/scraper/util/db_httpcache.py +++ b/scraper/util/db_httpcache.py @@ -84,6 +84,8 @@ def _normalize_url(self, url: str) -> str: @final class Migrator: + """Used to migrate httpcache to dbhttpcache above""" + def __init__(self, cachedir: str) -> None: self.cachedir = cachedir self.cache = DBHTTPCache(None) diff --git a/scraper/util/delete_cached.py b/scraper/util/delete_cached.py deleted file mode 100644 index e42caa6..0000000 --- a/scraper/util/delete_cached.py +++ /dev/null @@ -1,48 +0,0 @@ -import time -from typing import Sequence -from sqlmodel import col, distinct, select -from api.models import LastCleanup, LearningUnit -from api.util.db import get_session -from scraper.env import Settings -from scraper.util.cleanup_scrapy import cleanup_scrapy - - -def delete_cached() -> Sequence[str]: - print("Checking if cached files should be deleted...") - with next(get_session()) as session: - last_cleanup = session.exec( - select(LastCleanup).order_by(col(LastCleanup.timestamp).desc()).limit(1) - ).first() - last_cleanup_time = last_cleanup.timestamp if last_cleanup else 0 - - # prevent cleaning up if pod is crash-looping - now = int(time.time()) - if now - last_cleanup_time < 22 * 3600: - print( - "Last cleanup was performed less than 22 hours ago, skipping cleanup." 
- ) - return [] - - last_semesters = session.exec( - select(distinct(LearningUnit.semkez)) - .order_by(col(LearningUnit.semkez).desc()) - .limit(2) - ).all() - if not last_semesters: - print("No semesters found in database, skipping cleanup.") - return [] - - print(f"Performing cleanup of cached files for semesters: {last_semesters}") - - cleanup_scrapy( - dry_run=True, - delete_cached_semesters=list(last_semesters), - amount=Settings().rescrape_amount, - age_seconds=Settings().rescrape_age_seconds, - ) - - last_cleanup = LastCleanup(timestamp=now) - session.add(last_cleanup) - session.commit() - - return last_semesters diff --git a/scraper/util/logging.py b/scraper/util/logging.py index 6024f76..24e6e52 100644 --- a/scraper/util/logging.py +++ b/scraper/util/logging.py @@ -1,12 +1,13 @@ # pyright: reportExplicitAny=false,reportAny=false from typing import Any, override + from scrapy.spiders import CrawlSpider from scrapy.utils.log import SpiderLoggerAdapter class KeywordLoggerAdapter(SpiderLoggerAdapter): - """A logger adapter which adds the 'keyword' attribute to log records.""" + """A logger adapter which adds the 'extra' attributes to log records.""" @override def process(self, msg: str, kwargs: Any): diff --git a/scraper/util/regex_rules.py b/scraper/util/regex_rules.py index 628d75a..107597b 100644 --- a/scraper/util/regex_rules.py +++ b/scraper/util/regex_rules.py @@ -8,7 +8,7 @@ - 17-412 1L (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=13629&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de) - 10-824 (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=6467&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de) """ -RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # 31.12.2023 +RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # Format: 31.12.2023 RE_SEMKEZ = r"semkez=(\w+)" RE_UNITID = r"lerneinheitId=(\d+)" RE_DOZIDE = r"dozide=(\d+)" diff --git a/scraper/util/table.py b/scraper/util/table.py index 1b82f6e..89767fe 100644 --- 
a/scraper/util/table.py +++ b/scraper/util/table.py @@ -1,13 +1,14 @@ from re import Pattern -from scrapy.http import Response + from parsel import Selector, SelectorList +from scrapy.http import Response from scraper.util.keymap import TranslationKey, translations class Table: """ - Takes a page and throws all table rows into a list of (key, columns) tuples. + Takes a page and transforms all table rows into a list of (key, columns) tuples. """ def __init__( From 1f9ec0ee66617bfb3f5741f197851ee7b0b85c89 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 22:24:59 +0100 Subject: [PATCH 3/5] rescrape and add option to flag pages --- .../meta_versions/357b241a4250_add_flagged.py | 43 +++++++++++ api/models.py | 2 + scraper/env.py | 4 +- scraper/main.py | 24 +----- scraper/settings.py | 2 +- scraper/spiders/lecturers.py | 14 +++- scraper/spiders/units.py | 32 +++++++- scraper/util/{ => caching}/db_httpcache.py | 11 +++ scraper/util/caching/rescrape.py | 73 +++++++++++++++++++ 9 files changed, 176 insertions(+), 29 deletions(-) create mode 100644 alembic/meta_versions/357b241a4250_add_flagged.py rename scraper/util/{ => caching}/db_httpcache.py (93%) create mode 100644 scraper/util/caching/rescrape.py diff --git a/alembic/meta_versions/357b241a4250_add_flagged.py b/alembic/meta_versions/357b241a4250_add_flagged.py new file mode 100644 index 0000000..34946b5 --- /dev/null +++ b/alembic/meta_versions/357b241a4250_add_flagged.py @@ -0,0 +1,43 @@ +"""add flagged + +Revision ID: 357b241a4250 +Revises: c28cde0a90db +Create Date: 2026-02-27 22:17:37.706745 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "357b241a4250" +down_revision: Union[str, Sequence[str], None] = "c28cde0a90db" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "flagged", sa.Boolean(), nullable=False, server_default=sa.false() + ) + ) + + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.alter_column("flagged", server_default=None) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("httpcache", schema=None) as batch_op: + batch_op.drop_column("flagged") + + # ### end Alembic commands ### diff --git a/api/models.py b/api/models.py index dcc60a1..6961a62 100644 --- a/api/models.py +++ b/api/models.py @@ -598,6 +598,8 @@ class HTTPCache(MetadataModel, table=True): status_code: int body: bytes | None = Field(default=None) headers: dict[str, str] | None = Field(default=None, sa_column=Column(JSON)) + flagged: bool = Field(default=False) + """if set, the entry be rescraped the next time it's accessed""" scraped_at: int = Field( default_factory=lambda: int(time.time()), sa_column=Column(INTEGER, nullable=False), diff --git a/scraper/env.py b/scraper/env.py index 86e1a9c..a1f4f66 100644 --- a/scraper/env.py +++ b/scraper/env.py @@ -24,10 +24,10 @@ class Settings(BaseSettings): log_append: bool = True disable_log_file: bool = False - # delete valid cached files + # rescrapes the full course pages, while only rescraping + # the given amount of the oldest cached unit pages enable_rescrape: bool = False rescrape_amount: int = 500 - rescrape_age_seconds: int = 24 * 3600 * 14 # 14 days def read_semesters(self) -> list[Literal["W", "S"]]: semesters: list[Literal["W", 
"S"]] = [] diff --git a/scraper/main.py b/scraper/main.py index e903ab8..40809f6 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -18,6 +18,7 @@ from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider +from scraper.util.caching.rescrape import get_last_semesters logger = logging.getLogger(__name__) @@ -41,27 +42,10 @@ def add_stdout_logging(settings: Settings): def crawl(): settings = get_project_settings() add_stdout_logging(settings) - process = CrawlerProcess(settings) - - # cleanup cache if required - if EnvSettings().enable_rescrape: - with next(get_session()) as session: - semkezs = session.exec( - select(distinct(LearningUnit.semkez)) - .order_by(col(LearningUnit.semkez).desc()) - .limit(2) - ).all() - if not semkezs: - logger.info("No semesters found in database, scraping all semesters.") - semkezs = None - process.crawl(UnitsSpider, semkezs=semkezs) - # process.crawl(LecturersSpider, semkezs=semkezs) - # process.crawl(RatingsSpider) - else: - process.crawl(UnitsSpider) - # process.crawl(LecturersSpider) - # process.crawl(RatingsSpider) + process.crawl(UnitsSpider) + # process.crawl(LecturersSpider) + # process.crawl(RatingsSpider) process.start() diff --git a/scraper/settings.py b/scraper/settings.py index f8a5ffa..fb2c50a 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -91,7 +91,7 @@ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings HTTPCACHE_ENABLED = True HTTPCACHE_IGNORE_HTTP_CODES = [] -HTTPCACHE_STORAGE = "scraper.util.db_httpcache.DBHTTPCache" +HTTPCACHE_STORAGE = "scraper.util.caching.db_httpcache.DBHTTPCache" # Set settings whose default value is deprecated to a future-proof value FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraper/spiders/lecturers.py b/scraper/spiders/lecturers.py index 59d1229..2382a2d 100644 --- a/scraper/spiders/lecturers.py +++ b/scraper/spiders/lecturers.py @@ 
-5,6 +5,7 @@ from api.models import Lecturer from scraper.env import Settings +from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS from scraper.util.logging import KeywordLoggerSpider from scraper.util.regex_rules import RE_DOZIDE, RE_SEMKEZ @@ -18,11 +19,11 @@ def get_urls(year: int, semester: Literal["W", "S"]): class LecturersSpider(KeywordLoggerSpider): name: str = "lecturers" - def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] - if semkezs is not None: + def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] + if RESCRAPE_SEMKEZS is not None: self.start_urls: list[str] = [ url - for semkez in semkezs + for semkez in RESCRAPE_SEMKEZS for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W") ] else: @@ -36,6 +37,13 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr @override def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached lecturers page due to rescrape settings", + extra={"url": response.url}, + ) + return + semkez = re.search(RE_SEMKEZ, response.url) if not semkez: self.logger.error( diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 1eac4ca..48be03f 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -39,6 +39,7 @@ ) from scraper.env import Settings from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping +from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS from scraper.util.logging import KeywordLoggerSpider from scraper.util.regex_rules import ( RE_ABSCHNITTID, @@ -131,11 +132,11 @@ class UnitsSpider(KeywordLoggerSpider): ) course_ids: dict[str, set[int]] = defaultdict(set) - def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] - if 
semkezs is not None: + def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny] + if RESCRAPE_SEMKEZS is not None: self.start_urls: list[str] = [ url - for semkez in semkezs + for semkez in RESCRAPE_SEMKEZS for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W") ] else: @@ -149,6 +150,16 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr @override def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached catalogue page due to rescrape settings", + extra={ + "url": response.url, + "request_url": response.request.url if response.request else None, + }, + ) + return + try: catalog_semkez = re.search(RE_SEMKEZ, response.url) if not catalog_semkez: @@ -361,6 +372,11 @@ def parse_unit( Example url: https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?semkez=2025W&ansicht=ALLE&lerneinheitId=192945&lang=en """ + if RESCRAPE_SEMKEZS and "cached" in response.flags: + # the http cache will automatically refetch a set of pages + # if there are explicit semkezs to rescrape + return + try: if "red9.ethz.ch" in response.url: self.logger.info( @@ -561,6 +577,16 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]: """ Example: www.vvz.ethz.ch/Vorlesungsverzeichnis/legendeStudienplanangaben.view?abschnittId=117361&semkez=2025W&lang=en """ + if RESCRAPE_SEMKEZS and "cached" in response.flags: + self.logger.info( + "Skipping cached legend page due to rescrape settings", + extra={ + "url": response.url, + "request_url": response.request.url if response.request else None, + }, + ) + return + try: semkez = re.search(RE_SEMKEZ, response.url) id = re.search(RE_ABSCHNITTID, response.url) diff --git a/scraper/util/db_httpcache.py b/scraper/util/caching/db_httpcache.py similarity index 93% rename from scraper/util/db_httpcache.py rename to 
scraper/util/caching/db_httpcache.py index ec4360a..284b16b 100644 --- a/scraper/util/db_httpcache.py +++ b/scraper/util/caching/db_httpcache.py @@ -13,6 +13,7 @@ from api.models import HTTPCache from api.util.db import meta_engine +from scraper.util.caching.rescrape import should_rescrape from scraper.util.url import normalized_url @@ -33,10 +34,20 @@ def close_spider(self, spider: Spider) -> None: @override def retrieve_response(self, spider: Spider, request: Request) -> Response | None: url = self._normalize_url(request.url) + + if should_rescrape(url): + self.logger.info( + "URL marked for rescraping, skipping cache", + extra={"url": url}, + ) + return None + with Session(meta_engine.connect()) as session: entry = session.get(HTTPCache, url) if not entry: return None + if entry.flagged: + return None headers = ( {k.encode(): v.encode() for k, v in entry.headers.items()} diff --git a/scraper/util/caching/rescrape.py b/scraper/util/caching/rescrape.py new file mode 100644 index 0000000..7af5126 --- /dev/null +++ b/scraper/util/caching/rescrape.py @@ -0,0 +1,73 @@ +""" +Cache layout + +All pages are cached in an sqlite DB with DBHTTPCache. + +If enable_rescrape is enabled: +- At most rescrape_amount unit +pages of the last two semesters are rescraped. 
+- seite=0 pages are rescraped if older than an hour +""" + +from time import time + +from sqlmodel import col, distinct, or_, select + +from api.models import HTTPCache, LearningUnit +from api.util.db import get_meta_session, get_session +from scraper.env import Settings + +settings = Settings() +enable_rescrape = Settings().enable_rescrape +rescrape_amount = Settings().rescrape_amount + + +def get_last_semesters(n: int) -> list[str]: + with next(get_session()) as session: + semkezs = session.exec( + select(distinct(LearningUnit.semkez)) + .order_by(col(LearningUnit.semkez).desc()) + .limit(n) + ).all() + return list(semkezs) + + +RESCRAPE_SEMKEZS = get_last_semesters(1) if enable_rescrape else None + +# gets the outdated urls and any seite=0 urls +clauses = [] +oldest_urls = set[str]() +flagged = set[str]() +if RESCRAPE_SEMKEZS is not None: + clauses = or_( + *[ + col(HTTPCache.url).contains(f"semkez={semkez}") + for semkez in RESCRAPE_SEMKEZS + ] + ) + with next(get_meta_session()) as session: + oldest_urls = set( + session.exec( + select(HTTPCache.url) + .order_by(col(HTTPCache.scraped_at)) + .where(clauses) + .limit(rescrape_amount) + ).all() + ) + + # seite=0 pages + seite0_urls = session.exec( + select(HTTPCache.url) + .where( + clauses, + col(HTTPCache.url).contains("seite=0"), + col(HTTPCache.scraped_at) < int(time()) - 3600, # older than an hour + ) + .order_by(col(HTTPCache.scraped_at)) + .limit(50) + ).all() + oldest_urls.update(seite0_urls) + + +def should_rescrape(url: str): + return enable_rescrape and url in oldest_urls From 060abd838b288c8eaea1147f64fdfc83594497f9 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 22:30:24 +0100 Subject: [PATCH 4/5] partially remove unit changes --- api/models.py | 2 ++ api/routers/v1/units.py | 4 ++- scraper/pipelines.py | 41 +---------------------- scraper/util/difference.py | 66 -------------------------------------- 4 files changed, 6 insertions(+), 107 deletions(-) delete mode 100644 
scraper/util/difference.py diff --git a/api/models.py b/api/models.py index 6961a62..ba63621 100644 --- a/api/models.py +++ b/api/models.py @@ -514,6 +514,7 @@ def search_query(self) -> str: """ +# TODO: REMOVE class UnitChanges(BaseModel, table=True): """We keep track of changes that get applied to learning units""" @@ -524,6 +525,7 @@ class UnitChanges(BaseModel, table=True): """The scraped_at before the changes were applied""" +# TODO: move to metadata db class FinishedScrapingSemester(BaseModel, table=True): """Keeps track of which semesters have been fully scraped already.""" diff --git a/api/routers/v1/units.py b/api/routers/v1/units.py index 50616e2..88af1fd 100644 --- a/api/routers/v1/units.py +++ b/api/routers/v1/units.py @@ -70,9 +70,11 @@ async def get_unit_lecturers( @router.get( "/{unit_id}/changes", response_model=Sequence[UnitChanges], - description="Get a list of changes that the course details have undergone. " + description="WILL BE REMOVED BEGINNING OF MARCH 2026. It's too broken.\n" + + "Get a list of changes that the course details have undergone. 
" + "Changes are a JSON object that describe what the values were before they " + "got updated to either the next change or whatever the model currently has.", + deprecated=True, ) async def get_unit_changes( unit_id: int, diff --git a/scraper/pipelines.py b/scraper/pipelines.py index 17a8271..22ee58e 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from scrapy import Spider from scrapy.utils.log import SpiderLoggerAdapter -from sqlmodel import Session, col, select +from sqlmodel import Session, select from api.models import ( Course, @@ -16,10 +16,8 @@ LearningUnit, Lecturer, Level, - Overwriteable, Rating, Section, - UnitChanges, UnitExaminerLink, UnitLecturerLink, UnitSectionLink, @@ -28,7 +26,6 @@ from api.util import db from scraper.spiders.units import UnitsSpider from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping -from scraper.util.difference import find_unit_differences from scraper.util.scrapercache import CACHE_PATH DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl" @@ -143,42 +140,6 @@ def process_item(self, item: object, spider: Spider): if not old: self.session.add(item) self.session.commit() - elif isinstance(old, Overwriteable): - if isinstance(old, LearningUnit) and isinstance(item, LearningUnit): - # determine if there are any differences - if differences := find_unit_differences(old, item): - old_changes = self.session.exec( - select(UnitChanges) - .where( - UnitChanges.changes == differences.changes, - UnitChanges.unit_id == differences.unit_id, - ) - .order_by(col(UnitChanges.scraped_at).desc()) - ).one_or_none() - if old_changes: - self.logger.warning( - "Detecting duplicate changes. 
Only updating scraped_at", - extra={ - "unit_id": old.id, - "changes": differences.changes, - "changes_id": old_changes.id, - }, - ) - old_changes.scraped_at = differences.scraped_at - else: - self.logger.info( - "LearningUnit changes detected", - extra={ - "unit_id": old.id, - "changes": differences.changes, - }, - ) - self.session.add(differences) - - old.overwrite_with(item) - old.scraped_at = int(time.time()) - self.session.add(old) - self.session.commit() return item except Exception as e: diff --git a/scraper/util/difference.py b/scraper/util/difference.py deleted file mode 100644 index e6aff57..0000000 --- a/scraper/util/difference.py +++ /dev/null @@ -1,66 +0,0 @@ -# pyright: reportExplicitAny=false,reportAny=false - - -from typing import Any, Literal -from api.models import LearningUnit, UnitChanges - - -def _determine_lang(unit: LearningUnit) -> Literal["en", "de"]: - """ - Determines if a new unit model is added as an english or german part. - Defaults to "de" if no English fields are set. - """ - for field, value in unit: - if field.endswith("_english") and value is not None: - return "en" - return "de" - - -def _has_language_key(unit: LearningUnit, lang: Literal["en", "de"]): - """Determines if a model already has keys of a language""" - for field, value in unit: - if field.endswith("_english"): - if lang == "en" and value is not None: - return True - elif lang == "de" and getattr(unit, field[:-8]) is not None: - return True - return False - - -def find_unit_differences(old: LearningUnit, new: LearningUnit) -> UnitChanges | None: - """ - Determines if there are any differences between an already existing model (from the DB) - and from a newly yielded item. The new item is either an English or German unit, meaning - either the English or German catalogue data is filled out, while the other language fields - are None. 
In a scraping run we'll always get both a "German" and "English" catalogue data - unit as well as an "English" unit with the additional data. - By determining the language of a model we avoid the issue where `old` is "English", while - `new` is "German", so all the English fields are incorrectly identified as having been - removed because they're not present in the new model anymore. - """ - - if old.id != new.id: - raise ValueError("Can only compare LearningUnits with the same unit_id") - - new_lang = _determine_lang(new) - if not _has_language_key(old, new_lang): - # There are no differences to check, since the old model does not have any language - # specific values of the same language as the new item. - return None - - diffs: dict[str, Any] = {} - # only iterate over explicitly set fields to avoid checking default/None values - for field in new.model_fields_set: - val_old = getattr(old, field) - val_new = getattr(new, field) - if val_old != val_new: - diffs[field] = val_old - - if not diffs: - return None - - return UnitChanges( - unit_id=old.id, - changes=diffs, - scraped_at=old.scraped_at, - ) From 889fd2d886ea339006d4e55accaf43e5104888ca Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 27 Feb 2026 23:04:13 +0100 Subject: [PATCH 5/5] add last-updated text on unit page --- .github/workflows/build.yml | 15 ++++++++++----- api/models.py | 3 +++ api/templates/pages/Unit/Index.jinja | 6 +++++- justfile | 4 ++++ scraper/main.py | 9 +++------ scraper/pipelines.py | 6 ++++++ scraper/spiders/lecturers.py | 2 +- scraper/spiders/units.py | 4 ++-- scraper/util/caching/db_httpcache.py | 7 +++++++ 9 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 21fa01d..776cb38 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -43,11 +43,16 @@ jobs: - name: Check SQLite Alembic Migrations run: | - uv run alembic upgrade heads - uv run alembic check - uv run alembic downgrade base - uv 
run alembic upgrade heads - uv run alembic check + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads + uv run alembic -n data_db check + uv run alembic -n meta_db check + uv run alembic -n data_db downgrade base + uv run alembic -n meta_db downgrade base + uv run alembic -n data_db upgrade heads + uv run alembic -n meta_db upgrade heads + uv run alembic -n data_db check + uv run alembic -n meta_db check build: needs: test diff --git a/api/models.py b/api/models.py index ba63621..56e7861 100644 --- a/api/models.py +++ b/api/models.py @@ -368,6 +368,9 @@ def departments_as_short_str(self) -> str: def levels_as_str(self) -> str: return ", ".join([str(level) for level in self.levels]) + def last_updated(self) -> str: + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.scraped_at)) + """ diff --git a/api/templates/pages/Unit/Index.jinja b/api/templates/pages/Unit/Index.jinja index 213bae2..0f80980 100644 --- a/api/templates/pages/Unit/Index.jinja +++ b/api/templates/pages/Unit/Index.jinja @@ -34,7 +34,7 @@

-
+
@@ -157,7 +157,11 @@ +
+

+ Last Updated: {{ unit.last_updated() }} +

diff --git a/justfile b/justfile index 18f799a..fc2cf46 100644 --- a/justfile +++ b/justfile @@ -11,6 +11,10 @@ migrate: uv run alembic -n data_db upgrade heads uv run alembic -n meta_db upgrade heads +check: + uv run alembic -n data_db check + uv run alembic -n meta_db check + alias s := scrape scrape: diff --git a/scraper/main.py b/scraper/main.py index 40809f6..f753d16 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -8,17 +8,14 @@ from scrapy.crawler import CrawlerProcess from scrapy.settings import Settings from scrapy.utils.project import get_project_settings -from sqlmodel import col, distinct, select, text +from sqlmodel import text from api.env import Settings as APISettings -from api.models import LearningUnit from api.util.db import get_session from api.util.materialize import update_materialized_views -from scraper.env import Settings as EnvSettings from scraper.spiders.lecturers import LecturersSpider from scraper.spiders.ratings import RatingsSpider from scraper.spiders.units import UnitsSpider -from scraper.util.caching.rescrape import get_last_semesters logger = logging.getLogger(__name__) @@ -44,8 +41,8 @@ def crawl(): add_stdout_logging(settings) process = CrawlerProcess(settings) process.crawl(UnitsSpider) - # process.crawl(LecturersSpider) - # process.crawl(RatingsSpider) + process.crawl(LecturersSpider) + process.crawl(RatingsSpider) process.start() diff --git a/scraper/pipelines.py b/scraper/pipelines.py index 22ee58e..54a1a10 100644 --- a/scraper/pipelines.py +++ b/scraper/pipelines.py @@ -16,6 +16,7 @@ LearningUnit, Lecturer, Level, + Overwriteable, Rating, Section, UnitExaminerLink, @@ -140,6 +141,11 @@ def process_item(self, item: object, spider: Spider): if not old: self.session.add(item) self.session.commit() + elif isinstance(old, Overwriteable): + old.overwrite_with(item) + old.scraped_at = int(time.time()) + self.session.add(old) + self.session.commit() return item except Exception as e: diff --git a/scraper/spiders/lecturers.py 
b/scraper/spiders/lecturers.py index 2382a2d..4c866a0 100644 --- a/scraper/spiders/lecturers.py +++ b/scraper/spiders/lecturers.py @@ -39,7 +39,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached lecturers page due to rescrape settings", + "RESCRAPE is on. Not implicitly rescraping lecturers page.", extra={"url": response.url}, ) return diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py index 48be03f..5fc4df8 100644 --- a/scraper/spiders/units.py +++ b/scraper/spiders/units.py @@ -152,7 +152,7 @@ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExpli def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny] if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached catalogue page due to rescrape settings", + "RESCRAPE is on. Not implicitly rescraping catalogue page.", extra={ "url": response.url, "request_url": response.request.url if response.request else None, @@ -579,7 +579,7 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]: """ if RESCRAPE_SEMKEZS and "cached" in response.flags: self.logger.info( - "Skipping cached legend page due to rescrape settings", + "RESCRAPE is on. 
Not implicitly rescraping legend page.", extra={ "url": response.url, "request_url": response.request.url if response.request else None, diff --git a/scraper/util/caching/db_httpcache.py b/scraper/util/caching/db_httpcache.py index 284b16b..dcd7b8a 100644 --- a/scraper/util/caching/db_httpcache.py +++ b/scraper/util/caching/db_httpcache.py @@ -47,6 +47,10 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None if not entry: return None if entry.flagged: + self.logger.info( + "URL flagged for rescraping, skipping cache", + extra={"url": url}, + ) return None headers = ( @@ -74,6 +78,9 @@ def store_response( self.store(request.url, response, None) def store(self, url: str, response: Response, timestamp: float | None): + if response.status == 302: + return + url = self._normalize_url(url) headers: dict[str, str] = dict(response.headers.to_unicode_dict()) with Session(meta_engine.connect()) as session: