diff --git a/tests/test_definitions.py b/tests/test_definitions.py
index ee17c70..cabf260 100644
--- a/tests/test_definitions.py
+++ b/tests/test_definitions.py
@@ -322,6 +322,69 @@ def test_no_cache_dir_still_works(self):
         assert result is not None
         assert result["definition"] == "A building"
 
+    def test_kaikki_fallback_when_llm_returns_none(self):
+        """Kaikki is used as fallback when LLM returns None."""
+        kaikki_result = {
+            "definition": "A native definition",
+            "part_of_speech": None,
+            "source": "kaikki",
+            "url": None,
+        }
+        with patch("definitions._call_llm_definition", return_value=None):
+            with patch("definitions.lookup_kaikki_native", return_value=kaikki_result):
+                result = fetch_definition("word", "nl", cache_dir=None)
+
+        assert result is not None
+        assert result["source"] == "kaikki"
+        assert result["definition"] == "A native definition"
+
+    def test_kaikki_english_fallback_when_native_missing(self):
+        """Kaikki English is used when both LLM and kaikki native return None."""
+        kaikki_en_result = {
+            "definition": "An English gloss",
+            "part_of_speech": None,
+            "source": "kaikki-en",
+            "url": None,
+        }
+        with patch("definitions._call_llm_definition", return_value=None):
+            with patch("definitions.lookup_kaikki_native", return_value=None):
+                with patch("definitions.lookup_kaikki_english", return_value=kaikki_en_result):
+                    result = fetch_definition("word", "ro", cache_dir=None)
+
+        assert result is not None
+        assert result["source"] == "kaikki-en"
+
+    def test_kaikki_fallback_is_cached(self, tmp_path):
+        """Kaikki fallback results get written to disk cache."""
+        cache_dir = str(tmp_path)
+        kaikki_result = {
+            "definition": "A native definition",
+            "part_of_speech": None,
+            "source": "kaikki",
+            "url": None,
+        }
+        with patch("definitions._call_llm_definition", return_value=None):
+            with patch("definitions.lookup_kaikki_native", return_value=kaikki_result):
+                fetch_definition("word", "nl", cache_dir=cache_dir)
+
+        cache_file = tmp_path / "nl" / "word.json"
+        assert cache_file.exists()
+        cached = json.loads(cache_file.read_text())
+        assert cached["source"] == "kaikki"
+
+    def test_negative_cache_only_when_all_tiers_fail(self, tmp_path):
+        """Negative cache is written only when LLM AND kaikki both fail."""
+        cache_dir = str(tmp_path)
+        with patch("definitions._call_llm_definition", return_value=None):
+            with patch("definitions.lookup_kaikki_native", return_value=None):
+                with patch("definitions.lookup_kaikki_english", return_value=None):
+                    fetch_definition("xyzzy", "zz", cache_dir=cache_dir)
+
+        cache_file = tmp_path / "zz" / "xyzzy.json"
+        assert cache_file.exists()
+        cached = json.loads(cache_file.read_text())
+        assert cached["not_found"] is True
+
 
 # ---------------------------------------------------------------------------
 # LLM_LANG_NAMES coverage
diff --git a/webapp/definitions.py b/webapp/definitions.py
index d599acc..a06e9bf 100644
--- a/webapp/definitions.py
+++ b/webapp/definitions.py
@@ -1,14 +1,13 @@
 """
 Definition fetching for Wordle Global.
 
-Simple 2-tier system: disk cache → LLM (GPT-5.2).
+3-tier system: disk cache → LLM (GPT-5.2) → kaikki (offline Wiktionary).
 Definitions are pre-generated daily via scripts/pregenerate_definitions.py.
 """
 
 import json
 import logging
 import os
-import re
 import time
 import urllib.parse
 import urllib.request as urlreq
@@ -41,36 +40,41 @@ def _load_kaikki_file(cache_key, file_path):
     return _kaikki_cache[cache_key]
 
 
-def lookup_kaikki_native(word, lang_code):
-    """Look up a word in native-language kaikki definitions."""
-    defs = _load_kaikki_file(
-        f"{lang_code}_native", os.path.join(_DEFINITIONS_DIR, f"{lang_code}.json")
-    )
+def _lookup_kaikki(word, lang_code, variant):
+    """Look up a word in kaikki definitions.
+
+    Args:
+        variant: "native" for native-language defs, "en" for English glosses.
+    """
+    if variant == "native":
+        cache_key = f"{lang_code}_native"
+        file_name = f"{lang_code}.json"
+        source = "kaikki"
+    else:
+        cache_key = f"{lang_code}_en"
+        file_name = f"{lang_code}_en.json"
+        source = "kaikki-en"
+
+    defs = _load_kaikki_file(cache_key, os.path.join(_DEFINITIONS_DIR, file_name))
     definition = defs.get(word.lower())
     if definition:
         return {
             "definition": definition,
             "part_of_speech": None,
-            "source": "kaikki",
-            "url": None,
+            "source": source,
+            "url": _wiktionary_url(word, lang_code),
         }
     return None
 
 
+def lookup_kaikki_native(word, lang_code):
+    """Look up a word in native-language kaikki definitions."""
+    return _lookup_kaikki(word, lang_code, "native")
+
+
 def lookup_kaikki_english(word, lang_code):
     """Look up a word in English-gloss kaikki definitions."""
-    defs = _load_kaikki_file(
-        f"{lang_code}_en", os.path.join(_DEFINITIONS_DIR, f"{lang_code}_en.json")
-    )
-    definition = defs.get(word.lower())
-    if definition:
-        return {
-            "definition": definition,
-            "part_of_speech": None,
-            "source": "kaikki-en",
-            "url": None,
-        }
-    return None
+    return _lookup_kaikki(word, lang_code, "en")
 
 
 # ---------------------------------------------------------------------------
@@ -87,11 +91,6 @@ def _wiktionary_url(word, lang_code):
     return f"https://{wikt_lang}.wiktionary.org/wiki/{urllib.parse.quote(word)}"
 
 
-def strip_html(text):
-    """Strip HTML tags from a string."""
-    return re.sub(r"<[^>]+>", "", text).strip()
-
-
 # ---------------------------------------------------------------------------
 # LLM definition generation (GPT-5.2)
 # ---------------------------------------------------------------------------
@@ -235,14 +234,14 @@ def _call_llm_definition(word, lang_code):
             )
             return None
 
+        def_en = definition_en[:300]
+        def_native = (definition_native or definition_en)[:300]
         wikt_url = _wiktionary_url(word, lang_code)
         return {
-            # New fields
-            "definition_native": (definition_native or definition_en)[:300],
-            "definition_en": definition_en[:300],
+            "definition_native": def_native,
+            "definition_en": def_en,
+            "definition": def_en,  # backward compat
             "confidence": confidence,
-            # Backward-compatible fields
-            "definition": definition_en[:300],
             "part_of_speech": result.get("part_of_speech"),
             "source": "llm",
             "url": wikt_url,
@@ -265,7 +264,7 @@ def _call_llm_definition(word, lang_code):
 
 
 def fetch_definition(word, lang_code, cache_dir=None, skip_negative_cache=False):
-    """Fetch a word definition. 2-tier: disk cache → LLM.
+    """Fetch a word definition. 3-tier: disk cache → LLM → kaikki.
 
     Args:
         word: The word to define.
@@ -305,6 +304,10 @@ def fetch_definition(word, lang_code, cache_dir=None, skip_negative_cache=False)
     # --- Tier 2: LLM ---
     result = _call_llm_definition(word, lang_code)
 
+    # --- Tier 3: Kaikki fallback (offline Wiktionary data) ---
+    if not result:
+        result = lookup_kaikki_native(word, lang_code) or lookup_kaikki_english(word, lang_code)
+
     # Cache result (including negative results)
     if lang_cache_dir:
         try: