Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions frontend/src/definitions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ function escapeHtml(str: string): string {

/**
* Fetch a word definition from our backend API.
* The backend tries native Wiktionary first, then English Wiktionary,
* and caches results to disk.
* The backend uses pre-generated LLM definitions with disk caching.
*/
export async function fetchDefinition(word: string, lang: string): Promise<WordDefinition> {
try {
Expand All @@ -26,8 +25,11 @@ export async function fetchDefinition(word: string, lang: string): Promise<WordD
word,
partOfSpeech: data.part_of_speech || undefined,
definition: data.definition || '',
source: data.source || 'english',
url: data.url || '',
definitionNative: data.definition_native || undefined,
definitionEn: data.definition_en || undefined,
confidence: data.confidence,
source: data.source || 'llm',
url: data.url || data.wiktionary_url || '',
};
}
} catch {
Expand Down
5 changes: 4 additions & 1 deletion frontend/src/types/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ export interface WordDefinition {
word: string;
partOfSpeech?: string;
definition: string;
source: 'native' | 'english' | 'link';
definitionNative?: string;
definitionEn?: string;
confidence?: number;
source: 'native' | 'english' | 'link' | 'llm' | 'kaikki' | 'kaikki-en' | 'ai';
url: string;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,10 @@ def extract_best_gloss(senses, word=None):
def process_jsonl_gz(gz_path, target_words, lang_code_filter=None):
"""Process a gzipped JSONL file and extract definitions for target words.

Returns dict: {word: {"definition": str, "pos": str, "priority": int}}
Returns dict: {word: {"definition": str, "pos": str, "n_senses": int, "priority": int}}

Selection strategy: prefer the entry with the most senses (proxy for
primary meaning), using POS priority only as a tiebreaker.
"""
results = {}

Expand All @@ -458,19 +461,25 @@ def process_jsonl_gz(gz_path, target_words, lang_code_filter=None):
pos = entry.get("pos", "unknown")
priority = POS_PRIORITY.get(pos, 50)
senses = entry.get("senses", [])
n_senses = len(senses)
gloss = extract_best_gloss(senses, word=word)

if not gloss:
continue

# Keep the entry with best POS priority
# Keep the entry with the most senses (proxy for primary meaning).
# Use POS priority as tiebreaker when sense counts are equal.
if word in results:
if priority >= results[word]["priority"]:
existing = results[word]
if n_senses < existing["n_senses"]:
continue
if n_senses == existing["n_senses"] and priority >= existing["priority"]:
continue

results[word] = {
"definition": gloss,
"pos": pos,
"n_senses": n_senses,
"priority": priority,
}

Expand Down
145 changes: 145 additions & 0 deletions scripts/pregenerate_definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Pre-generate LLM definitions for upcoming daily words.

Run daily via cron to ensure definitions are cached before players see them.
Generates definitions for today and tomorrow across all languages.

Usage:
uv run python scripts/pregenerate_definitions.py # all langs, today + tomorrow
uv run python scripts/pregenerate_definitions.py --days 3 # today + 3 days ahead
uv run python scripts/pregenerate_definitions.py --lang en # single language
uv run python scripts/pregenerate_definitions.py --backfill 30 # past 30 days
uv run python scripts/pregenerate_definitions.py --dry-run # show what would be generated

Requires OPENAI_API_KEY in .env or environment.
"""

import argparse
import json
import os
import sys
import time

# Add project root to path so we can import from webapp
_script_dir = os.path.dirname(os.path.abspath(__file__))
_project_root = os.path.join(_script_dir, "..")
sys.path.insert(0, _project_root)
# NOTE(review): chdir into webapp/ before importing it — presumably the app
# resolves cache/config paths relative to the CWD; confirm against webapp.app.
os.chdir(os.path.join(_project_root, "webapp"))

# Project imports must happen after the sys.path/chdir setup above.
from webapp.app import (
    WORD_DEFS_DIR,
    get_todays_idx,
    get_word_for_day,
    language_codes,
    language_configs,
)
from webapp.definitions import _call_llm_definition


# Definitions with confidence below this threshold are flagged in the output.
LOW_CONFIDENCE_THRESHOLD = 0.7


def _write_definition_cache(cache_dir, cache_path, payload):
    """Write *payload* as JSON to the per-word cache file, creating the dir if needed."""
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "w") as f:
        json.dump(payload, f)


def main():
    """CLI entry point: pre-generate and cache LLM definitions for daily words.

    Iterates over every (day, language) pair in the requested range, skips
    words that already have a cache file, and writes either the LLM result or
    a negative-cache marker for each remaining word. Exits with status 1 when
    the API key is missing (unless --dry-run) or the language is unknown.
    """
    parser = argparse.ArgumentParser(description="Pre-generate LLM definitions for daily words")
    parser.add_argument("--lang", type=str, help="Generate for a single language")
    parser.add_argument(
        "--days", type=int, default=1, help="Days ahead to generate (default: 1 = today + tomorrow)"
    )
    parser.add_argument(
        "--backfill",
        type=int,
        default=0,
        help="Days in the past to also generate (e.g. --backfill 30)",
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    args = parser.parse_args()

    # The LLM call needs an API key; a dry run only inspects the cache.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: OPENAI_API_KEY not set in environment or .env")
        sys.exit(1)

    if args.lang:
        if args.lang not in language_codes:
            print(f"Error: Unknown language '{args.lang}'")
            sys.exit(1)
        langs = [args.lang]
    else:
        langs = list(language_codes)

    # Day indices appear to be 1-based; clamp backfill so we never go below day 1.
    todays_idx = get_todays_idx()
    start_idx = max(1, todays_idx - args.backfill)
    day_range = range(start_idx, todays_idx + args.days + 1)

    total = len(langs) * len(day_range)
    generated = 0
    cached = 0
    errors = 0
    low_confidence = 0

    print(
        f"Generating definitions for {len(langs)} languages, "
        f"days {day_range.start}-{day_range.stop - 1}"
    )
    print(f"Total words to process: {total}\n")

    for day_idx in day_range:
        for lang in langs:
            word = get_word_for_day(lang, day_idx)
            lang_name = language_configs[lang].get("name", lang)

            cache_dir = os.path.join(WORD_DEFS_DIR, lang)
            cache_path = os.path.join(cache_dir, f"{word.lower()}.json")

            if args.dry_run:
                status = "cached" if os.path.exists(cache_path) else "pending"
                print(f" [{status}] {lang} #{day_idx}: {word} ({lang_name})")
                continue

            if os.path.exists(cache_path):
                cached += 1
                continue

            # Generate definition via LLM
            start = time.time()
            result = _call_llm_definition(word, lang)
            elapsed = time.time() - start

            if result:
                confidence = result.get("confidence", 0)
                pos = result.get("part_of_speech", "?")
                _write_definition_cache(cache_dir, cache_path, result)

                generated += 1
                flag = ""
                if confidence < LOW_CONFIDENCE_THRESHOLD:
                    low_confidence += 1
                    flag = " (LOW)"
                print(
                    f" [generated] {lang} #{day_idx}: {word} ({lang_name})"
                    f" — confidence={confidence}, pos={pos}, {elapsed:.1f}s{flag}"
                )
            else:
                errors += 1
                # Negative cache so repeated failures don't re-hit the API.
                _write_definition_cache(
                    cache_dir, cache_path, {"not_found": True, "ts": int(time.time())}
                )
                print(f" [error] {lang} #{day_idx}: {word} ({lang_name}) — {elapsed:.1f}s")

            # Rate limit: 0.3s between calls
            time.sleep(0.3)

    if not args.dry_run:
        print(
            f"\nDone: {generated} generated, {cached} cached, "
            f"{errors} errors, {low_confidence} low-confidence"
        )


if __name__ == "__main__":
    main()
12 changes: 7 additions & 5 deletions scripts/pregenerate_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from webapp.app import (
IMAGE_LANGUAGES,
WORD_IMAGES_DIR,
fetch_definition_cached,
fetch_definition,
generate_word_image,
get_todays_idx,
get_word_for_day,
Expand Down Expand Up @@ -100,11 +100,13 @@ def main():
print(f" [cached] {lang} #{day_idx}: {word}")
continue

# Fetch and cache definition
defn = fetch_definition_cached(word, lang)
# Fetch and cache definition — prefer English for image generation
defn = fetch_definition(word, lang)
definition_hint = ""
if defn and defn.get("definition"):
definition_hint = f", which means {defn['definition']}"
if defn:
en_def = defn.get("definition_en") or defn.get("definition", "")
if en_def:
definition_hint = f", which means {en_def}"

# Generate image
start = time.time()
Expand Down
114 changes: 114 additions & 0 deletions scripts/refresh_definition_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Refresh stale definition caches on production.

After changing the definition tier hierarchy, existing disk-cached definitions
may be from kaikki (lower quality) instead of the Wiktionary parser. This script
hits the API with ?refresh=1 to force re-resolution for past daily words.

Usage:
uv run scripts/refresh_definition_cache.py [--langs en,de,...] [--days 30] [--base-url https://wordle.global]
"""

import argparse
import json
import time
import urllib.request


def get_past_words(base_url, lang_code, n_words):
    """Get recent daily words by fetching word pages from the hub."""
    import re

    headers = {"User-Agent": "WordleGlobal-CacheRefresh/1.0"}

    def _fetch_html(url, timeout):
        # GET a page and decode the body as UTF-8.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")

    collected = []
    try:
        # The hub page links each past day as /<lang>/word/<idx>.
        hub_html = _fetch_html(f"{base_url}/{lang_code}/words", 15)
        day_indices = re.findall(rf"/{lang_code}/word/(\d+)", hub_html)

        # Visit up to n_words word pages and scrape the word itself.
        for day_idx in day_indices[:n_words]:
            try:
                page_html = _fetch_html(f"{base_url}/{lang_code}/word/{day_idx}", 10)
                match = re.search(r"uppercase\">(\w+)</strong>", page_html)
                if match:
                    collected.append(match.group(1).lower())
                time.sleep(0.2)
            except Exception:
                # Best-effort scrape: a page that fails to load is skipped.
                pass
    except Exception as e:
        print(f" Error fetching word list: {e}")
    return collected


def refresh_definitions(lang_codes, days, base_url, dry_run=False):
    """Refresh cached definitions for recent daily words."""
    ua_headers = {"User-Agent": "WordleGlobal-CacheRefresh/1.0"}

    for lang_code in lang_codes:
        print(f"[{lang_code}] Fetching recent words...")
        recent_words = get_past_words(base_url, lang_code, days)
        if not recent_words:
            print(f" [{lang_code}] No words found — skipping")
            continue

        print(f" [{lang_code}] Found {len(recent_words)} words")
        ok_count = 0
        err_count = 0

        for word in recent_words:
            # ?refresh=1 forces the backend to re-resolve instead of serving cache.
            endpoint = f"{base_url}/{lang_code}/api/definition/{word}?refresh=1"

            if dry_run:
                print(f" [{lang_code}] Would refresh: {word}")
                ok_count += 1
                continue

            try:
                req = urllib.request.Request(endpoint, headers=ua_headers)
                with urllib.request.urlopen(req, timeout=15) as resp:
                    payload = json.loads(resp.read())
                source = payload.get("source", "?")
                defn = payload.get("definition", "")[:50]
                print(f" [{lang_code}] {word}: source={source} → {defn}")
                ok_count += 1
            except Exception as e:
                print(f" [{lang_code}] {word}: ERROR {e}")
                err_count += 1

            time.sleep(0.5)  # Be polite to production

        print(f" [{lang_code}] Done: {ok_count} refreshed, {err_count} errors\n")


def main():
    """Parse CLI options and kick off the cache refresh."""
    arg_parser = argparse.ArgumentParser(description="Refresh definition caches on production")
    arg_parser.add_argument(
        "--langs", type=str, default="en", help="Comma-separated language codes (default: en)"
    )
    arg_parser.add_argument(
        "--days", type=int, default=30, help="Number of recent words to refresh (default: 30)"
    )
    arg_parser.add_argument("--base-url", type=str, default="https://wordle.global", help="Base URL")
    arg_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be refreshed without making requests",
    )
    opts = arg_parser.parse_args()

    # Split the comma-separated language list, tolerating stray whitespace.
    codes = [code.strip() for code in opts.langs.split(",")]

    print(f"Refreshing {len(codes)} language(s), last {opts.days} words")
    print(f"Base URL: {opts.base_url}\n")

    refresh_definitions(codes, opts.days, opts.base_url, dry_run=opts.dry_run)


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import json
from pathlib import Path

# Exclude deprecated tests from collection
collect_ignore_glob = ["deprecated/*"]


def pytest_addoption(parser):
parser.addoption(
Expand Down
1 change: 1 addition & 0 deletions tests/deprecated/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Deprecated tests for old Wiktionary parser system
Loading