@@ -157,7 +157,11 @@
+
+
+ Last Updated: {{ unit.last_updated() }}
+
diff --git a/api/util/db.py b/api/util/db.py
index d02f20c..7900edf 100644
--- a/api/util/db.py
+++ b/api/util/db.py
@@ -27,4 +27,30 @@ def get_session():
async def aget_session():
async with AsyncSession(aengine) as session:
await session.execute(text("pragma mmap_size=30000000000"))
+ await session.execute(text("PRAGMA foreign_keys=ON"))
+ yield session
+
+
+meta_engine = create_engine(
+ f"sqlite+pysqlite:///{Settings().meta_db_path}", json_serializer=json_serializer
+)
+
+ameta_engine = create_async_engine(
+ f"sqlite+aiosqlite:///{Settings().meta_db_path}",
+ json_serializer=json_serializer,
+ pool_size=20,
+ max_overflow=30,
+)
+
+
+def get_meta_session():
+ with Session(meta_engine) as session:
+ session.execute(text("PRAGMA foreign_keys=ON"))
+ yield session
+
+
+async def aget_meta_session():
+ async with AsyncSession(ameta_engine) as session:
+ await session.execute(text("pragma mmap_size=30000000000"))
+ await session.execute(text("PRAGMA foreign_keys=ON"))
yield session
diff --git a/justfile b/justfile
index 1e9c18f..fc2cf46 100644
--- a/justfile
+++ b/justfile
@@ -8,7 +8,12 @@ dev:
alias m := migrate
migrate:
- uv run alembic upgrade heads
+ uv run alembic -n data_db upgrade heads
+ uv run alembic -n meta_db upgrade heads
+
+check:
+ uv run alembic -n data_db check
+ uv run alembic -n meta_db check
alias s := scrape
diff --git a/scraper/env.py b/scraper/env.py
index d2555ea..a1f4f66 100644
--- a/scraper/env.py
+++ b/scraper/env.py
@@ -10,16 +10,13 @@ class Settings(BaseSettings):
env_file=".env", env_file_encoding="utf-8", extra="ignore"
)
- refresh_html: bool = False
- """If html files, that are already cached locally, should be refetched"""
-
# Semester settings only apply to newly scraped semesters
- # RESCRAPE_AMOUNT will overwrite this and cause only the last
+ # ENABLE_RESCRAPE will overwrite this and cause only the last
# two already scraped semesters to be rescraped
- start_year: int = date.today().year
+ start_year: int = date.today().year - 1
# automatically include next year (if it exists)
end_year: int = date.today().year + 1
- semester: str = "W"
+ semester: str = "W,S"
delay: float = 5.0
"""Amount of seconds to at least wait between requests"""
@@ -27,10 +24,10 @@ class Settings(BaseSettings):
log_append: bool = True
disable_log_file: bool = False
- # delete valid cached files
+ # rescrapes the full course pages, while only rescraping
+ # the given amount of the oldest cached unit pages
enable_rescrape: bool = False
rescrape_amount: int = 500
- rescrape_age_seconds: int = 24 * 3600 * 14 # 14 days
def read_semesters(self) -> list[Literal["W", "S"]]:
semesters: list[Literal["W", "S"]] = []
diff --git a/scraper/main.py b/scraper/main.py
index 186dc08..f753d16 100644
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -13,11 +13,11 @@
from api.env import Settings as APISettings
from api.util.db import get_session
from api.util.materialize import update_materialized_views
-from scraper.env import Settings as EnvSettings
from scraper.spiders.lecturers import LecturersSpider
from scraper.spiders.ratings import RatingsSpider
from scraper.spiders.units import UnitsSpider
-from scraper.util.delete_cached import delete_cached
+
+logger = logging.getLogger(__name__)
def add_stdout_logging(settings: Settings):
@@ -36,49 +36,49 @@ def add_stdout_logging(settings: Settings):
root.addHandler(sh)
-settings = get_project_settings()
-add_stdout_logging(settings)
-
-process = CrawlerProcess(settings)
-
-# cleanup cache if required
-if EnvSettings().enable_rescrape:
- semkezs = delete_cached()
- process.crawl(UnitsSpider, semkezs=semkezs)
- process.crawl(LecturersSpider, semkezs=semkezs)
- process.crawl(RatingsSpider)
-else:
+def crawl():
+ settings = get_project_settings()
+ add_stdout_logging(settings)
+ process = CrawlerProcess(settings)
process.crawl(UnitsSpider)
process.crawl(LecturersSpider)
process.crawl(RatingsSpider)
-process.start()
+ process.start()
+
+
+def update_materialized_view():
+ logger.info("Finished scraping data, updating materialized tables")
+ with next(get_session()) as session:
+ update_materialized_views(session)
-logger = logging.getLogger(__name__)
-logger.info("Finished scraping data, updating materialized tables")
-with next(get_session()) as session:
- update_materialized_views(session)
-
-# vacuum/zip db
-logger.info(f"Vacuuming database into {APISettings().vacuum_path}")
-if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work
- Path(APISettings().vacuum_path).unlink()
-with next(get_session()) as session:
- session.execute(
- text("VACUUM INTO :vacuum_path"),
- {"vacuum_path": f"{APISettings().vacuum_path}"},
+def vacuum():
+ # vacuum/zip db
+ logger.info(f"Vacuuming database into {APISettings().vacuum_path}")
+ if Path(APISettings().vacuum_path).exists(): # required for VACUUM INTO to work
+ Path(APISettings().vacuum_path).unlink()
+ with next(get_session()) as session:
+ session.execute(
+ text("VACUUM INTO :vacuum_path"),
+ {"vacuum_path": f"{APISettings().vacuum_path}"},
+ )
+ logger.info("Finished vacuuming database")
+ logger.info(f"Creating database zip file at {APISettings().zip_path}")
+ with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z:
+ z.write(APISettings().vacuum_path, arcname="database.db")
+ logger.info("Finished creating database zip file")
+ db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024)
+ vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024)
+ zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024)
+ logger.info(
+ f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB"
)
-logger.info("Finished vacuuming database")
-logger.info(f"Creating database zip file at {APISettings().zip_path}")
-with zipfile.ZipFile(APISettings().zip_path, "w", zipfile.ZIP_DEFLATED) as z:
- z.write(APISettings().vacuum_path, arcname="database.db")
-logger.info("Finished creating database zip file")
-db_size = Path(APISettings().db_path).stat().st_size / (1024 * 1024)
-vacuum_size = Path(APISettings().vacuum_path).stat().st_size / (1024 * 1024)
-zip_size = Path(APISettings().zip_path).stat().st_size / (1024 * 1024)
-logger.info(
- f"Database size: {db_size:.2f} MB, vacuum size: {vacuum_size:.2f} MB, zipped size: {zip_size:.2f} MB"
-)
-logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}")
-Path(APISettings().vacuum_path).unlink(missing_ok=True)
-logger.info("Finished deleting vacuum file.")
+ logger.info(f"Deleting vacuum file at {APISettings().vacuum_path}")
+ Path(APISettings().vacuum_path).unlink(missing_ok=True)
+ logger.info("Finished deleting vacuum file.")
+
+
+if __name__ == "__main__":
+ crawl()
+ update_materialized_view()
+ vacuum()
diff --git a/scraper/pipelines.py b/scraper/pipelines.py
index ebd853c..54a1a10 100644
--- a/scraper/pipelines.py
+++ b/scraper/pipelines.py
@@ -6,7 +6,7 @@
from pydantic import BaseModel
from scrapy import Spider
from scrapy.utils.log import SpiderLoggerAdapter
-from sqlmodel import Session, col, select
+from sqlmodel import Session, select
from api.models import (
Course,
@@ -19,7 +19,6 @@
Overwriteable,
Rating,
Section,
- UnitChanges,
UnitExaminerLink,
UnitLecturerLink,
UnitSectionLink,
@@ -27,8 +26,7 @@
)
from api.util import db
from scraper.spiders.units import UnitsSpider
-from scraper.util.difference import find_unit_differences
-from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping
+from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.scrapercache import CACHE_PATH
DEP_LINK = CACHE_PATH / "unit_dep_link.jsonl"
@@ -144,37 +142,6 @@ def process_item(self, item: object, spider: Spider):
self.session.add(item)
self.session.commit()
elif isinstance(old, Overwriteable):
- if isinstance(old, LearningUnit) and isinstance(item, LearningUnit):
- # determine if there are any differences
- if differences := find_unit_differences(old, item):
- old_changes = self.session.exec(
- select(UnitChanges)
- .where(
- UnitChanges.changes == differences.changes,
- UnitChanges.unit_id == differences.unit_id,
- )
- .order_by(col(UnitChanges.scraped_at).desc())
- ).one_or_none()
- if old_changes:
- self.logger.warning(
- "Detecting duplicate changes. Only updating scraped_at",
- extra={
- "unit_id": old.id,
- "changes": differences.changes,
- "changes_id": old_changes.id,
- },
- )
- old_changes.scraped_at = differences.scraped_at
- else:
- self.logger.info(
- "LearningUnit changes detected",
- extra={
- "unit_id": old.id,
- "changes": differences.changes,
- },
- )
- self.session.add(differences)
-
old.overwrite_with(item)
old.scraped_at = int(time.time())
self.session.add(old)
diff --git a/scraper/settings.py b/scraper/settings.py
index a64c1c8..fb2c50a 100644
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -89,11 +89,9 @@
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-HTTPCACHE_ENABLED = not Settings().refresh_html
-HTTPCACHE_EXPIRATION_SECS = 0
-HTTPCACHE_DIR = "httpcache"
+HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_HTTP_CODES = []
-HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+HTTPCACHE_STORAGE = "scraper.util.caching.db_httpcache.DBHTTPCache"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
diff --git a/scraper/spiders/lecturers.py b/scraper/spiders/lecturers.py
index 59d1229..4c866a0 100644
--- a/scraper/spiders/lecturers.py
+++ b/scraper/spiders/lecturers.py
@@ -5,6 +5,7 @@
from api.models import Lecturer
from scraper.env import Settings
+from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS
from scraper.util.logging import KeywordLoggerSpider
from scraper.util.regex_rules import RE_DOZIDE, RE_SEMKEZ
@@ -18,11 +19,11 @@ def get_urls(year: int, semester: Literal["W", "S"]):
class LecturersSpider(KeywordLoggerSpider):
name: str = "lecturers"
- def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
- if semkezs is not None:
+ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
+        if RESCRAPE_SEMKEZS:
self.start_urls: list[str] = [
url
- for semkez in semkezs
+ for semkez in RESCRAPE_SEMKEZS
for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W")
]
else:
@@ -36,6 +37,13 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr
@override
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "RESCRAPE is on. Not implicitly rescraping lecturers page.",
+ extra={"url": response.url},
+ )
+ return
+
semkez = re.search(RE_SEMKEZ, response.url)
if not semkez:
self.logger.error(
diff --git a/scraper/spiders/units.py b/scraper/spiders/units.py
index 07667ed..5fc4df8 100644
--- a/scraper/spiders/units.py
+++ b/scraper/spiders/units.py
@@ -38,8 +38,9 @@
WeekdayEnum,
)
from scraper.env import Settings
+from scraper.types.mappings import UnitDepartmentMapping, UnitLevelMapping
+from scraper.util.caching.rescrape import RESCRAPE_SEMKEZS
from scraper.util.logging import KeywordLoggerSpider
-from scraper.util.mappings import UnitDepartmentMapping, UnitLevelMapping
from scraper.util.regex_rules import (
RE_ABSCHNITTID,
RE_DATE,
@@ -131,11 +132,11 @@ class UnitsSpider(KeywordLoggerSpider):
)
course_ids: dict[str, set[int]] = defaultdict(set)
- def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
- if semkezs is not None:
+ def __init__(self, *a: Any, **kw: Any): # pyright: ignore[reportAny,reportExplicitAny]
+        if RESCRAPE_SEMKEZS:
self.start_urls: list[str] = [
url
- for semkez in semkezs
+ for semkez in RESCRAPE_SEMKEZS
for url in get_urls(int(semkez[:-1]), "S" if semkez[-1] == "S" else "W")
]
else:
@@ -149,6 +150,16 @@ def __init__(self, semkezs: list[str] | None = None, *a: Any, **kw: Any): # pyr
@override
def parse_start_url(self, response: Response, **_: Any): # pyright: ignore[reportExplicitAny]
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "RESCRAPE is on. Not implicitly rescraping catalogue page.",
+ extra={
+ "url": response.url,
+ "request_url": response.request.url if response.request else None,
+ },
+ )
+ return
+
try:
catalog_semkez = re.search(RE_SEMKEZ, response.url)
if not catalog_semkez:
@@ -361,6 +372,11 @@ def parse_unit(
Example url: https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?semkez=2025W&ansicht=ALLE&lerneinheitId=192945&lang=en
"""
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ # the http cache will automatically refetch a set of pages
+ # if there are explicit semkezs to rescrape
+ return
+
try:
if "red9.ethz.ch" in response.url:
self.logger.info(
@@ -561,6 +577,16 @@ def parse_legend(self, response: Response) -> Generator[UnitTypeLegends]:
"""
Example: www.vvz.ethz.ch/Vorlesungsverzeichnis/legendeStudienplanangaben.view?abschnittId=117361&semkez=2025W&lang=en
"""
+ if RESCRAPE_SEMKEZS and "cached" in response.flags:
+ self.logger.info(
+ "RESCRAPE is on. Not implicitly rescraping legend page.",
+ extra={
+ "url": response.url,
+ "request_url": response.request.url if response.request else None,
+ },
+ )
+ return
+
try:
semkez = re.search(RE_SEMKEZ, response.url)
id = re.search(RE_ABSCHNITTID, response.url)
diff --git a/scraper/util/mappings.py b/scraper/types/mappings.py
similarity index 100%
rename from scraper/util/mappings.py
rename to scraper/types/mappings.py
diff --git a/scraper/util/caching/db_httpcache.py b/scraper/util/caching/db_httpcache.py
new file mode 100644
index 0000000..dcd7b8a
--- /dev/null
+++ b/scraper/util/caching/db_httpcache.py
@@ -0,0 +1,157 @@
+from email.parser import Parser
+from pathlib import Path
+from typing import final, override
+
+import yaml
+from rich import print
+from scrapy import Request, Spider
+from scrapy.extensions import httpcache
+from scrapy.http import Response
+from scrapy.responsetypes import responsetypes
+from scrapy.settings import BaseSettings
+from sqlmodel import Session
+
+from api.models import HTTPCache
+from api.util.db import meta_engine
+from scraper.util.caching.rescrape import should_rescrape
+from scraper.util.url import normalized_url
+
+
+@final
+class DBHTTPCache(httpcache.FilesystemCacheStorage):
+ def __init__(self, settings: BaseSettings | None):
+ if settings:
+ super().__init__(settings)
+
+ @override
+ def open_spider(self, spider: Spider) -> None:
+ self.logger = spider.logger
+
+ @override
+ def close_spider(self, spider: Spider) -> None:
+ pass
+
+ @override
+ def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
+ url = self._normalize_url(request.url)
+
+ if should_rescrape(url):
+ self.logger.info(
+ "URL marked for rescraping, skipping cache",
+ extra={"url": url},
+ )
+ return None
+
+        with Session(meta_engine) as session:
+ entry = session.get(HTTPCache, url)
+ if not entry:
+ return None
+ if entry.flagged:
+ self.logger.info(
+ "URL flagged for rescraping, skipping cache",
+ extra={"url": url},
+ )
+ return None
+
+ headers = (
+ {k.encode(): v.encode() for k, v in entry.headers.items()}
+ if entry.headers
+ else {}
+ )
+
+ respcls = responsetypes.from_args(
+ headers=headers,
+ url=url,
+ body=entry.body,
+ )
+ return respcls(
+ url=url,
+ headers=headers,
+ status=entry.status_code,
+ body=entry.body or b"",
+ )
+
+ @override
+ def store_response(
+ self, spider: Spider, request: Request, response: Response
+ ) -> None:
+ self.store(request.url, response, None)
+
+ def store(self, url: str, response: Response, timestamp: float | None):
+ if response.status == 302:
+ return
+
+ url = self._normalize_url(url)
+ headers: dict[str, str] = dict(response.headers.to_unicode_dict())
+        with Session(meta_engine) as session:
+ entry = HTTPCache(
+ url=url,
+ status_code=response.status,
+ headers=headers,
+ body=response.body,
+ )
+ if timestamp is not None:
+ entry.scraped_at = int(timestamp)
+
+ session.merge(entry)
+ session.commit()
+
+ def _normalize_url(self, url: str) -> str:
+ return normalized_url(url)
+
+
+@final
+class Migrator:
+ """Used to migrate httpcache to dbhttpcache above"""
+
+ def __init__(self, cachedir: str) -> None:
+ self.cachedir = cachedir
+ self.cache = DBHTTPCache(None)
+
+ def migrate(self):
+ for dir in self._walk():
+ try:
+ with open(dir / "meta", "r") as f:
+ # yaml allows us to open the invalid formatted json file
+ data = yaml.load(f, Loader=yaml.SafeLoader) # pyright: ignore[reportAny]
+ url: str = data.get("url", "") # pyright: ignore[reportAny]
+ timestamp: float | None = data.get("timestamp") # pyright: ignore[reportAny]
+ status: int = data.get("status", 0) # pyright: ignore[reportAny]
+ with open(dir / "response_body", "rb") as f:
+ body = f.read()
+ with open(dir / "response_headers", "r") as f:
+ parsed = Parser().parse(f)
+ headers = dict(parsed.items())
+ self._add(url, timestamp, status, headers, body)
+ print(f"Migrated {url} from {dir}")
+ except Exception as e:
+ print(f"[red]Failed to migrate from {dir}: {e}[/red]")
+
+ def _add(
+ self,
+ url: str,
+ timestamp: float | None,
+ status: int,
+ headers: dict[str, str],
+ body: bytes,
+ ):
+ response = Response(url=url, status=status, headers=headers, body=body)
+ self.cache.store(url, response, timestamp)
+
+ def _walk(self):
+ cachedir = Path(self.cachedir)
+ for spiderdir in cachedir.iterdir():
+ if not spiderdir.is_dir():
+ continue
+ for shortdir in spiderdir.iterdir():
+ if not shortdir.is_dir():
+ continue
+ for requestdir in shortdir.iterdir():
+ if not requestdir.is_dir():
+ continue
+ yield requestdir
+
+
+if __name__ == "__main__":
+ migrator = Migrator(".scrapy/httpcache")
+ migrator.migrate()
diff --git a/scraper/util/caching/rescrape.py b/scraper/util/caching/rescrape.py
new file mode 100644
index 0000000..7af5126
--- /dev/null
+++ b/scraper/util/caching/rescrape.py
@@ -0,0 +1,73 @@
+"""
+Cache layout
+
+All pages are cached in an sqlite DB with DBHTTPCache.
+
+If enable_rescrape is enabled:
+- At most rescrape_amount unit pages of the
+  last two semesters are rescraped.
+- seite=0 pages are rescraped if older than an hour
+"""
+
+from time import time
+
+from sqlmodel import col, distinct, or_, select
+
+from api.models import HTTPCache, LearningUnit
+from api.util.db import get_meta_session, get_session
+from scraper.env import Settings
+
+settings = Settings()
+enable_rescrape = settings.enable_rescrape
+rescrape_amount = settings.rescrape_amount
+
+
+def get_last_semesters(n: int) -> list[str]:
+ with next(get_session()) as session:
+ semkezs = session.exec(
+ select(distinct(LearningUnit.semkez))
+ .order_by(col(LearningUnit.semkez).desc())
+ .limit(n)
+ ).all()
+ return list(semkezs)
+
+
+RESCRAPE_SEMKEZS = get_last_semesters(2) if enable_rescrape else None
+
+# gets the outdated urls and any seite=0 urls
+clauses = []
+oldest_urls = set[str]()
+flagged = set[str]()
+if RESCRAPE_SEMKEZS:
+ clauses = or_(
+ *[
+ col(HTTPCache.url).contains(f"semkez={semkez}")
+ for semkez in RESCRAPE_SEMKEZS
+ ]
+ )
+ with next(get_meta_session()) as session:
+ oldest_urls = set(
+ session.exec(
+ select(HTTPCache.url)
+ .order_by(col(HTTPCache.scraped_at))
+ .where(clauses)
+ .limit(rescrape_amount)
+ ).all()
+ )
+
+ # seite=0 pages
+ seite0_urls = session.exec(
+ select(HTTPCache.url)
+ .where(
+ clauses,
+ col(HTTPCache.url).contains("seite=0"),
+ col(HTTPCache.scraped_at) < int(time()) - 3600, # older than an hour
+ )
+ .order_by(col(HTTPCache.scraped_at))
+ .limit(50)
+ ).all()
+ oldest_urls.update(seite0_urls)
+
+
+def should_rescrape(url: str):
+ return enable_rescrape and url in oldest_urls
diff --git a/scraper/util/cleanup_scrapy.py b/scraper/util/cleanup_scrapy.py
deleted file mode 100644
index 0920b28..0000000
--- a/scraper/util/cleanup_scrapy.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from datetime import datetime
-from pathlib import Path
-import argparse
-import re
-from shutil import rmtree
-import time
-from typing import TypedDict, cast
-
-import yaml
-
-
-HTTP_CACHE_PATH = Path(".scrapy/httpcache")
-
-re_units_en = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/lerneinheit\.view\?ansicht=ALLE&lang=en&lerneinheitId=\d+&semkez=\d{4}\w"
-re_root_units = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheLehrangebot\.view\?semkez=\d{4}\w&ansicht=2&seite=0(&deptId=\d+)?(&studiengangTyp=\w+)?&lang=\w\w"
-re_legends = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/legendeStudienplanangaben\.view\?abschnittId=\d+&lang=en&semkez=\d{4}\w"
-
-re_lecturers_root = r"https://www\.vvz\.ethz\.ch/Vorlesungsverzeichnis/sucheDozierende\.view\?lang=de&semkez=\d{4}\w&seite=0"
-
-
-class FileMetadata(TypedDict):
- url: str
- timestamp: int
-
-
-def get_files(path: Path):
- if not path.is_dir():
- return
- for top in path.iterdir():
- if top.is_dir():
- for bot in top.iterdir():
- if bot.is_dir():
- meta = bot / "meta"
- if not meta.exists():
- yield "", bot, 0
- with open(meta, "r") as f:
- # yaml allows us to open the invalid formatted json file
- data = cast(
- FileMetadata,
- yaml.load(f, Loader=yaml.SafeLoader),
- )
- yield data.get("url", ""), bot, data.get("timestamp", 0)
-
-
-def cleanup_scrapy(
- dry_run: bool = False,
- delete_cached_semesters: list[str] | None = None,
- amount: int = 100,
- age_seconds: int = 0,
-):
- if delete_cached_semesters is None:
- delete_cached_semesters = []
-
- unts, lecrs = 0, 0
- cached_unts = 0
- units = HTTP_CACHE_PATH / "units"
- lecturers = HTTP_CACHE_PATH / "lecturers"
- now = time.time()
-
- for url, dir, timestamp in get_files(units):
- dt = datetime.fromtimestamp(timestamp)
- # delete files that we do not use anymore
- if (
- not re.match(re_units_en, url)
- and not re.match(re_root_units, url)
- and not re.match(re_legends, url)
- ):
- print(f"Delete unit: {dir}: URL mismatch {url}: {dt}")
- unts += 1
- if not dry_run:
- rmtree(dir)
- # delete files from cached semesters
- if cached_unts < amount:
- for sem in delete_cached_semesters:
- if f"semkez={sem}" in url and (now - timestamp) > age_seconds:
- print(f"Delete unit: {dir}: Cached semester {sem} {url}: {dt}")
- unts += 1
- cached_unts += 1
- if not dry_run:
- rmtree(dir)
- break
-
- for url, dir, timestamp in get_files(lecturers):
- dt = datetime.fromtimestamp(timestamp)
- if not re.match(re_lecturers_root, url):
- print(f"Delete lecturer: {dir}: URL mismatch {url}: {dt}")
- lecrs += 1
- if not dry_run:
- rmtree(dir)
-
- print(
- f"===============\nDeleted {unts} files in lecturers dir\nDeleted {lecrs} files in lecturers dir"
- )
-
-
-if __name__ == "__main__":
-
- class Arguments(argparse.Namespace):
- dry_run: bool
- delete_cached_semesters: list[str]
- amount: int
- age_seconds: int
-
- parser = argparse.ArgumentParser(description="Cleanup scrapy cache")
- parser.add_argument(
- "--dry-run",
- action="store_true",
- help="Show what would be deleted without actually deleting",
- default=False,
- )
- parser.add_argument(
- "-d",
- "--delete-cached-semesters",
- nargs="+",
- help="List of semesters to delete from cache, e.g., 2023W 2024S",
- default=[],
- )
- parser.add_argument(
- "-n",
- "--amount",
- type=int,
- help="Amount of cached semesters to delete",
- default=100,
- )
- parser.add_argument(
- "--age-seconds",
- type=int,
- help="Delete cached files older than this many seconds",
- default=0,
- )
- parser.add_argument(
- "--age-seconds",
- type=int,
- help="Delete cached files older than this many seconds",
- default=0,
- )
- args = parser.parse_args(namespace=Arguments())
- cleanup_scrapy(
- dry_run=args.dry_run,
- delete_cached_semesters=args.delete_cached_semesters,
- amount=args.amount,
- age_seconds=args.age_seconds,
- )
diff --git a/scraper/util/delete_cached.py b/scraper/util/delete_cached.py
deleted file mode 100644
index e42caa6..0000000
--- a/scraper/util/delete_cached.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import time
-from typing import Sequence
-from sqlmodel import col, distinct, select
-from api.models import LastCleanup, LearningUnit
-from api.util.db import get_session
-from scraper.env import Settings
-from scraper.util.cleanup_scrapy import cleanup_scrapy
-
-
-def delete_cached() -> Sequence[str]:
- print("Checking if cached files should be deleted...")
- with next(get_session()) as session:
- last_cleanup = session.exec(
- select(LastCleanup).order_by(col(LastCleanup.timestamp).desc()).limit(1)
- ).first()
- last_cleanup_time = last_cleanup.timestamp if last_cleanup else 0
-
- # prevent cleaning up if pod is crash-looping
- now = int(time.time())
- if now - last_cleanup_time < 22 * 3600:
- print(
- "Last cleanup was performed less than 22 hours ago, skipping cleanup."
- )
- return []
-
- last_semesters = session.exec(
- select(distinct(LearningUnit.semkez))
- .order_by(col(LearningUnit.semkez).desc())
- .limit(2)
- ).all()
- if not last_semesters:
- print("No semesters found in database, skipping cleanup.")
- return []
-
- print(f"Performing cleanup of cached files for semesters: {last_semesters}")
-
- cleanup_scrapy(
- dry_run=True,
- delete_cached_semesters=list(last_semesters),
- amount=Settings().rescrape_amount,
- age_seconds=Settings().rescrape_age_seconds,
- )
-
- last_cleanup = LastCleanup(timestamp=now)
- session.add(last_cleanup)
- session.commit()
-
- return last_semesters
diff --git a/scraper/util/difference.py b/scraper/util/difference.py
deleted file mode 100644
index e6aff57..0000000
--- a/scraper/util/difference.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# pyright: reportExplicitAny=false,reportAny=false
-
-
-from typing import Any, Literal
-from api.models import LearningUnit, UnitChanges
-
-
-def _determine_lang(unit: LearningUnit) -> Literal["en", "de"]:
- """
- Determines if a new unit model is added as an english or german part.
- Defaults to "de" if no English fields are set.
- """
- for field, value in unit:
- if field.endswith("_english") and value is not None:
- return "en"
- return "de"
-
-
-def _has_language_key(unit: LearningUnit, lang: Literal["en", "de"]):
- """Determines if a model already has keys of a language"""
- for field, value in unit:
- if field.endswith("_english"):
- if lang == "en" and value is not None:
- return True
- elif lang == "de" and getattr(unit, field[:-8]) is not None:
- return True
- return False
-
-
-def find_unit_differences(old: LearningUnit, new: LearningUnit) -> UnitChanges | None:
- """
- Determines if there are any differences between an already existing model (from the DB)
- and from a newly yielded item. The new item is either an English or German unit, meaning
- either the English or German catalogue data is filled out, while the other language fields
- are None. In a scraping run we'll always get both a "German" and "English" catalogue data
- unit as well as an "English" unit with the additional data.
- By determining the language of a model we avoid the issue where `old` is "English", while
- `new` is "German", so all the English fields are incorrectly identified as having been
- removed because they're not present in the new model anymore.
- """
-
- if old.id != new.id:
- raise ValueError("Can only compare LearningUnits with the same unit_id")
-
- new_lang = _determine_lang(new)
- if not _has_language_key(old, new_lang):
- # There are no differences to check, since the old model does not have any language
- # specific values of the same language as the new item.
- return None
-
- diffs: dict[str, Any] = {}
- # only iterate over explicitly set fields to avoid checking default/None values
- for field in new.model_fields_set:
- val_old = getattr(old, field)
- val_new = getattr(new, field)
- if val_old != val_new:
- diffs[field] = val_old
-
- if not diffs:
- return None
-
- return UnitChanges(
- unit_id=old.id,
- changes=diffs,
- scraped_at=old.scraped_at,
- )
diff --git a/scraper/util/logging.py b/scraper/util/logging.py
index 6024f76..24e6e52 100644
--- a/scraper/util/logging.py
+++ b/scraper/util/logging.py
@@ -1,12 +1,13 @@
# pyright: reportExplicitAny=false,reportAny=false
from typing import Any, override
+
from scrapy.spiders import CrawlSpider
from scrapy.utils.log import SpiderLoggerAdapter
class KeywordLoggerAdapter(SpiderLoggerAdapter):
- """A logger adapter which adds the 'keyword' attribute to log records."""
+ """A logger adapter which adds the 'extra' attributes to log records."""
@override
def process(self, msg: str, kwargs: Any):
diff --git a/scraper/util/regex_rules.py b/scraper/util/regex_rules.py
index 628d75a..107597b 100644
--- a/scraper/util/regex_rules.py
+++ b/scraper/util/regex_rules.py
@@ -8,7 +8,7 @@
- 17-412 1L (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=13629&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de)
- 10-824 (https://www.vvz.ethz.ch/Vorlesungsverzeichnis/lerneinheit.view?lerneinheitId=6467&semkez=2003S&ansicht=LEHRVERANSTALTUNGEN&lang=de)
"""
-RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # 31.12.2023
+RE_DATE = r"\d{2}\.\d{2}\.\d{4}" # Format: 31.12.2023
RE_SEMKEZ = r"semkez=(\w+)"
RE_UNITID = r"lerneinheitId=(\d+)"
RE_DOZIDE = r"dozide=(\d+)"
diff --git a/scraper/util/table.py b/scraper/util/table.py
index 1b82f6e..89767fe 100644
--- a/scraper/util/table.py
+++ b/scraper/util/table.py
@@ -1,13 +1,14 @@
from re import Pattern
-from scrapy.http import Response
+
from parsel import Selector, SelectorList
+from scrapy.http import Response
from scraper.util.keymap import TranslationKey, translations
class Table:
"""
- Takes a page and throws all table rows into a list of (key, columns) tuples.
+ Takes a page and transforms all table rows into a list of (key, columns) tuples.
"""
def __init__(
diff --git a/scraper/util/url.py b/scraper/util/url.py
index 7556efe..d55d8b5 100644
--- a/scraper/util/url.py
+++ b/scraper/util/url.py
@@ -28,3 +28,9 @@ def sort_url_params(url: str) -> str:
sorted_query = dict(sorted(query.items()))
url_res = url_res._replace(query=urlencode(sorted_query, True))
return urlunparse(url_res)
+
+
+def normalized_url(url: str) -> str:
+    result = sort_url_params(url)
+    result = result.replace(".vorlesungen.", ".vvz.").replace("http://", "https://")
+    return result.strip("/")