Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions tests/test_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,69 @@ def test_no_cache_dir_still_works(self):
assert result is not None
assert result["definition"] == "A building"

def test_kaikki_fallback_when_llm_returns_none(self):
    """A kaikki native entry is returned when the LLM tier yields None."""
    native_entry = {
        "definition": "A native definition",
        "part_of_speech": None,
        "source": "kaikki",
        "url": None,
    }
    # Build the patchers up front; they are entered in the same order as
    # the nested form (LLM first, then kaikki native).
    llm_miss = patch("definitions._call_llm_definition", return_value=None)
    native_hit = patch("definitions.lookup_kaikki_native", return_value=native_entry)
    with llm_miss, native_hit:
        found = fetch_definition("word", "nl", cache_dir=None)

    assert found is not None
    assert found["source"] == "kaikki"
    assert found["definition"] == "A native definition"

def test_kaikki_english_fallback_when_native_missing(self):
    """The kaikki English gloss is returned when the LLM and kaikki native both yield None."""
    english_entry = {
        "definition": "An English gloss",
        "part_of_speech": None,
        "source": "kaikki-en",
        "url": None,
    }
    # Same entry order as the nested form: LLM, kaikki native, kaikki English.
    llm_miss = patch("definitions._call_llm_definition", return_value=None)
    native_miss = patch("definitions.lookup_kaikki_native", return_value=None)
    english_hit = patch("definitions.lookup_kaikki_english", return_value=english_entry)
    with llm_miss, native_miss, english_hit:
        found = fetch_definition("word", "ro", cache_dir=None)

    assert found is not None
    assert found["source"] == "kaikki-en"

def test_kaikki_fallback_is_cached(self, tmp_path):
    """A result served by the kaikki fallback is persisted to the disk cache."""
    native_entry = {
        "definition": "A native definition",
        "part_of_speech": None,
        "source": "kaikki",
        "url": None,
    }
    llm_miss = patch("definitions._call_llm_definition", return_value=None)
    native_hit = patch("definitions.lookup_kaikki_native", return_value=native_entry)
    with llm_miss, native_hit:
        fetch_definition("word", "nl", cache_dir=str(tmp_path))

    # The test expects the cache entry at <cache_dir>/<lang>/<word>.json.
    expected_path = tmp_path / "nl" / "word.json"
    assert expected_path.exists()
    stored = json.loads(expected_path.read_text())
    assert stored["source"] == "kaikki"

def test_negative_cache_only_when_all_tiers_fail(self, tmp_path):
    """A negative cache entry is written when the LLM and both kaikki lookups fail."""
    # Every tier after the disk cache is forced to miss.
    llm_miss = patch("definitions._call_llm_definition", return_value=None)
    native_miss = patch("definitions.lookup_kaikki_native", return_value=None)
    english_miss = patch("definitions.lookup_kaikki_english", return_value=None)
    with llm_miss, native_miss, english_miss:
        fetch_definition("xyzzy", "zz", cache_dir=str(tmp_path))

    expected_path = tmp_path / "zz" / "xyzzy.json"
    assert expected_path.exists()
    stored = json.loads(expected_path.read_text())
    assert stored["not_found"] is True


# ---------------------------------------------------------------------------
# LLM_LANG_NAMES coverage
Expand Down
67 changes: 35 additions & 32 deletions webapp/definitions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
"""
Definition fetching for Wordle Global.

Simple 2-tier system: disk cache → LLM (GPT-5.2).
3-tier system: disk cache → LLM (GPT-5.2) → kaikki (offline Wiktionary).
Definitions are pre-generated daily via scripts/pregenerate_definitions.py.
"""

import json
import logging
import os
import re
import time
import urllib.parse
import urllib.request as urlreq
Expand Down Expand Up @@ -41,36 +40,41 @@ def _load_kaikki_file(cache_key, file_path):
return _kaikki_cache[cache_key]


def lookup_kaikki_native(word, lang_code):
"""Look up a word in native-language kaikki definitions."""
defs = _load_kaikki_file(
f"{lang_code}_native", os.path.join(_DEFINITIONS_DIR, f"{lang_code}.json")
)
def _lookup_kaikki(word, lang_code, variant):
"""Look up a word in kaikki definitions.

Args:
variant: "native" for native-language defs, "en" for English glosses.
"""
if variant == "native":
cache_key = f"{lang_code}_native"
file_name = f"{lang_code}.json"
source = "kaikki"
else:
cache_key = f"{lang_code}_en"
file_name = f"{lang_code}_en.json"
source = "kaikki-en"

defs = _load_kaikki_file(cache_key, os.path.join(_DEFINITIONS_DIR, file_name))
definition = defs.get(word.lower())
if definition:
return {
"definition": definition,
"part_of_speech": None,
"source": "kaikki",
"url": None,
"source": source,
"url": _wiktionary_url(word, lang_code),
}
return None


def lookup_kaikki_native(word, lang_code):
    """Return the native-language kaikki lookup for ``word`` via ``_lookup_kaikki``."""
    return _lookup_kaikki(word, lang_code, variant="native")


def lookup_kaikki_english(word, lang_code):
"""Look up a word in English-gloss kaikki definitions."""
defs = _load_kaikki_file(
f"{lang_code}_en", os.path.join(_DEFINITIONS_DIR, f"{lang_code}_en.json")
)
definition = defs.get(word.lower())
if definition:
return {
"definition": definition,
"part_of_speech": None,
"source": "kaikki-en",
"url": None,
}
return None
return _lookup_kaikki(word, lang_code, "en")


# ---------------------------------------------------------------------------
Expand All @@ -87,11 +91,6 @@ def _wiktionary_url(word, lang_code):
return f"https://{wikt_lang}.wiktionary.org/wiki/{urllib.parse.quote(word)}"


def strip_html(text):
    """Remove every ``<...>`` tag from ``text`` and trim surrounding whitespace."""
    without_tags = re.sub(r"<[^>]+>", "", text)
    return without_tags.strip()


# ---------------------------------------------------------------------------
# LLM definition generation (GPT-5.2)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -235,14 +234,14 @@ def _call_llm_definition(word, lang_code):
)
return None

def_en = definition_en[:300]
def_native = ((definition_native or definition_en))[:300]
wikt_url = _wiktionary_url(word, lang_code)
return {
# New fields
"definition_native": (definition_native or definition_en)[:300],
"definition_en": definition_en[:300],
"definition_native": def_native,
"definition_en": def_en,
"definition": def_en, # backward compat
"confidence": confidence,
# Backward-compatible fields
"definition": definition_en[:300],
"part_of_speech": result.get("part_of_speech"),
"source": "llm",
"url": wikt_url,
Expand All @@ -265,7 +264,7 @@ def _call_llm_definition(word, lang_code):


def fetch_definition(word, lang_code, cache_dir=None, skip_negative_cache=False):
"""Fetch a word definition. 2-tier: disk cache → LLM.
"""Fetch a word definition. 3-tier: disk cache → LLM → kaikki.

Args:
word: The word to define.
Expand Down Expand Up @@ -305,6 +304,10 @@ def fetch_definition(word, lang_code, cache_dir=None, skip_negative_cache=False)
# --- Tier 2: LLM ---
result = _call_llm_definition(word, lang_code)

# --- Tier 3: Kaikki fallback (offline Wiktionary data) ---
if not result:
result = lookup_kaikki_native(word, lang_code) or lookup_kaikki_english(word, lang_code)

# Cache result (including negative results)
if lang_cache_dir:
try:
Expand Down