Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions frontend/src/definitions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ function escapeHtml(str: string): string {

/**
* Fetch a word definition from our backend API.
* The backend tries native Wiktionary first, then English Wiktionary,
* and caches results to disk.
* The backend uses pre-generated LLM definitions with disk caching.
*/
export async function fetchDefinition(word: string, lang: string): Promise<WordDefinition> {
try {
Expand All @@ -26,8 +25,11 @@ export async function fetchDefinition(word: string, lang: string): Promise<WordD
word,
partOfSpeech: data.part_of_speech || undefined,
definition: data.definition || '',
source: data.source || 'english',
url: data.url || '',
definitionNative: data.definition_native || undefined,
definitionEn: data.definition_en || undefined,
confidence: data.confidence,
source: data.source || 'llm',
url: data.url || data.wiktionary_url || '',
};
}
} catch {
Expand Down
5 changes: 4 additions & 1 deletion frontend/src/types/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ export interface WordDefinition {
word: string;
partOfSpeech?: string;
definition: string;
source: 'native' | 'english' | 'link';
definitionNative?: string;
definitionEn?: string;
confidence?: number;
source: 'native' | 'english' | 'link' | 'llm' | 'kaikki' | 'kaikki-en' | 'ai';
url: string;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,10 @@ def extract_best_gloss(senses, word=None):
def process_jsonl_gz(gz_path, target_words, lang_code_filter=None):
"""Process a gzipped JSONL file and extract definitions for target words.

Returns dict: {word: {"definition": str, "pos": str, "priority": int}}
Returns dict: {word: {"definition": str, "pos": str, "n_senses": int, "priority": int}}

Selection strategy: prefer the entry with the most senses (proxy for
primary meaning), using POS priority only as a tiebreaker.
"""
results = {}

Expand All @@ -458,19 +461,25 @@ def process_jsonl_gz(gz_path, target_words, lang_code_filter=None):
pos = entry.get("pos", "unknown")
priority = POS_PRIORITY.get(pos, 50)
senses = entry.get("senses", [])
n_senses = len(senses)
gloss = extract_best_gloss(senses, word=word)

if not gloss:
continue

# Keep the entry with best POS priority
# Keep the entry with the most senses (proxy for primary meaning).
# Use POS priority as tiebreaker when sense counts are equal.
if word in results:
if priority >= results[word]["priority"]:
existing = results[word]
if n_senses < existing["n_senses"]:
continue
if n_senses == existing["n_senses"] and priority >= existing["priority"]:
continue

results[word] = {
"definition": gloss,
"pos": pos,
"n_senses": n_senses,
"priority": priority,
}

Expand Down
145 changes: 145 additions & 0 deletions scripts/pregenerate_definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Pre-generate LLM definitions for upcoming daily words.

Run daily via cron to ensure definitions are cached before players see them.
Generates definitions for today and tomorrow across all languages.

Usage:
uv run python scripts/pregenerate_definitions.py # all langs, today + tomorrow
uv run python scripts/pregenerate_definitions.py --days 3 # today + 3 days ahead
uv run python scripts/pregenerate_definitions.py --lang en # single language
uv run python scripts/pregenerate_definitions.py --backfill 30 # past 30 days
uv run python scripts/pregenerate_definitions.py --dry-run # show what would be generated

Requires OPENAI_API_KEY in .env or environment.
"""

import argparse
import json
import os
import sys
import time

# Add project root to path so we can import from webapp
_script_dir = os.path.dirname(os.path.abspath(__file__))
_project_root = os.path.join(_script_dir, "..")
sys.path.insert(0, _project_root)
# NOTE(review): chdir into webapp/ before importing it — presumably the app
# resolves cache/config paths relative to the CWD; confirm against webapp.app.
os.chdir(os.path.join(_project_root, "webapp"))

# Project imports must happen after the sys.path/chdir setup above.
from webapp.app import (
    WORD_DEFS_DIR,
    get_todays_idx,
    get_word_for_day,
    language_codes,
    language_configs,
)
from webapp.definitions import _call_llm_definition


# Definitions with confidence below this threshold are flagged in the output.
LOW_CONFIDENCE_THRESHOLD = 0.7


def _write_definition_cache(cache_dir, cache_path, payload):
    """Write *payload* as JSON to the per-word cache file, creating the dir if needed."""
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "w") as f:
        json.dump(payload, f)


def main():
    """CLI entry point: pre-generate and cache LLM definitions for daily words.

    Iterates over every (day, language) pair in the requested range, skips
    words that already have a cache file, and writes either the LLM result or
    a negative-cache marker for each remaining word. Exits with status 1 when
    the API key is missing (unless --dry-run) or the language is unknown.
    """
    parser = argparse.ArgumentParser(description="Pre-generate LLM definitions for daily words")
    parser.add_argument("--lang", type=str, help="Generate for a single language")
    parser.add_argument(
        "--days", type=int, default=1, help="Days ahead to generate (default: 1 = today + tomorrow)"
    )
    parser.add_argument(
        "--backfill",
        type=int,
        default=0,
        help="Days in the past to also generate (e.g. --backfill 30)",
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    args = parser.parse_args()

    # The LLM call needs an API key; a dry run only inspects the cache.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: OPENAI_API_KEY not set in environment or .env")
        sys.exit(1)

    if args.lang:
        if args.lang not in language_codes:
            print(f"Error: Unknown language '{args.lang}'")
            sys.exit(1)
        langs = [args.lang]
    else:
        langs = list(language_codes)

    # Day indices appear to be 1-based; clamp backfill so we never go below day 1.
    todays_idx = get_todays_idx()
    start_idx = max(1, todays_idx - args.backfill)
    day_range = range(start_idx, todays_idx + args.days + 1)

    total = len(langs) * len(day_range)
    generated = 0
    cached = 0
    errors = 0
    low_confidence = 0

    print(
        f"Generating definitions for {len(langs)} languages, "
        f"days {day_range.start}-{day_range.stop - 1}"
    )
    print(f"Total words to process: {total}\n")

    for day_idx in day_range:
        for lang in langs:
            word = get_word_for_day(lang, day_idx)
            lang_name = language_configs[lang].get("name", lang)

            cache_dir = os.path.join(WORD_DEFS_DIR, lang)
            cache_path = os.path.join(cache_dir, f"{word.lower()}.json")

            if args.dry_run:
                status = "cached" if os.path.exists(cache_path) else "pending"
                print(f" [{status}] {lang} #{day_idx}: {word} ({lang_name})")
                continue

            if os.path.exists(cache_path):
                cached += 1
                continue

            # Generate definition via LLM
            start = time.time()
            result = _call_llm_definition(word, lang)
            elapsed = time.time() - start

            if result:
                confidence = result.get("confidence", 0)
                pos = result.get("part_of_speech", "?")
                _write_definition_cache(cache_dir, cache_path, result)

                generated += 1
                flag = ""
                if confidence < LOW_CONFIDENCE_THRESHOLD:
                    low_confidence += 1
                    flag = " (LOW)"
                print(
                    f" [generated] {lang} #{day_idx}: {word} ({lang_name})"
                    f" — confidence={confidence}, pos={pos}, {elapsed:.1f}s{flag}"
                )
            else:
                errors += 1
                # Negative cache so repeated failures don't re-hit the API.
                _write_definition_cache(
                    cache_dir, cache_path, {"not_found": True, "ts": int(time.time())}
                )
                print(f" [error] {lang} #{day_idx}: {word} ({lang_name}) — {elapsed:.1f}s")

            # Rate limit: 0.3s between calls
            time.sleep(0.3)

    if not args.dry_run:
        print(
            f"\nDone: {generated} generated, {cached} cached, "
            f"{errors} errors, {low_confidence} low-confidence"
        )


if __name__ == "__main__":
    main()
12 changes: 7 additions & 5 deletions scripts/pregenerate_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from webapp.app import (
IMAGE_LANGUAGES,
WORD_IMAGES_DIR,
fetch_definition_cached,
fetch_definition,
generate_word_image,
get_todays_idx,
get_word_for_day,
Expand Down Expand Up @@ -100,11 +100,13 @@ def main():
print(f" [cached] {lang} #{day_idx}: {word}")
continue

# Fetch and cache definition
defn = fetch_definition_cached(word, lang)
# Fetch and cache definition — prefer English for image generation
defn = fetch_definition(word, lang)
definition_hint = ""
if defn and defn.get("definition"):
definition_hint = f", which means {defn['definition']}"
if defn:
en_def = defn.get("definition_en") or defn.get("definition", "")
if en_def:
definition_hint = f", which means {en_def}"

# Generate image
start = time.time()
Expand Down
114 changes: 114 additions & 0 deletions scripts/refresh_definition_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Refresh stale definition caches on production.

After changing the definition tier hierarchy, existing disk-cached definitions
may be from kaikki (lower quality) instead of the Wiktionary parser. This script
hits the API with ?refresh=1 to force re-resolution for past daily words.

Usage:
uv run scripts/refresh_definition_cache.py [--langs en,de,...] [--days 30] [--base-url https://wordle.global]
"""

import argparse
import json
import time
import urllib.request


def get_past_words(base_url, lang_code, n_words):
    """Get recent daily words by fetching word pages from the hub."""
    import re

    headers = {"User-Agent": "WordleGlobal-CacheRefresh/1.0"}

    def _fetch_html(url, timeout):
        # GET a page and decode the body as UTF-8.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")

    collected = []
    try:
        # The hub page links each past day as /<lang>/word/<idx>.
        hub_html = _fetch_html(f"{base_url}/{lang_code}/words", 15)
        day_indices = re.findall(rf"/{lang_code}/word/(\d+)", hub_html)

        # Visit up to n_words word pages and scrape the word itself.
        for day_idx in day_indices[:n_words]:
            try:
                page_html = _fetch_html(f"{base_url}/{lang_code}/word/{day_idx}", 10)
                match = re.search(r"uppercase\">(\w+)</strong>", page_html)
                if match:
                    collected.append(match.group(1).lower())
                time.sleep(0.2)
            except Exception:
                # Best-effort scrape: a page that fails to load is skipped.
                pass
    except Exception as e:
        print(f" Error fetching word list: {e}")
    return collected


def refresh_definitions(lang_codes, days, base_url, dry_run=False):
    """Refresh cached definitions for recent daily words."""
    ua_headers = {"User-Agent": "WordleGlobal-CacheRefresh/1.0"}

    for lang_code in lang_codes:
        print(f"[{lang_code}] Fetching recent words...")
        recent_words = get_past_words(base_url, lang_code, days)
        if not recent_words:
            print(f" [{lang_code}] No words found — skipping")
            continue

        print(f" [{lang_code}] Found {len(recent_words)} words")
        ok_count = 0
        err_count = 0

        for word in recent_words:
            # ?refresh=1 forces the backend to re-resolve instead of serving cache.
            endpoint = f"{base_url}/{lang_code}/api/definition/{word}?refresh=1"

            if dry_run:
                print(f" [{lang_code}] Would refresh: {word}")
                ok_count += 1
                continue

            try:
                req = urllib.request.Request(endpoint, headers=ua_headers)
                with urllib.request.urlopen(req, timeout=15) as resp:
                    payload = json.loads(resp.read())
                source = payload.get("source", "?")
                defn = payload.get("definition", "")[:50]
                print(f" [{lang_code}] {word}: source={source} → {defn}")
                ok_count += 1
            except Exception as e:
                print(f" [{lang_code}] {word}: ERROR {e}")
                err_count += 1

            time.sleep(0.5)  # Be polite to production

        print(f" [{lang_code}] Done: {ok_count} refreshed, {err_count} errors\n")


def main():
    """Parse CLI options and kick off the cache refresh."""
    arg_parser = argparse.ArgumentParser(description="Refresh definition caches on production")
    arg_parser.add_argument(
        "--langs", type=str, default="en", help="Comma-separated language codes (default: en)"
    )
    arg_parser.add_argument(
        "--days", type=int, default=30, help="Number of recent words to refresh (default: 30)"
    )
    arg_parser.add_argument("--base-url", type=str, default="https://wordle.global", help="Base URL")
    arg_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be refreshed without making requests",
    )
    opts = arg_parser.parse_args()

    # Split the comma-separated language list, tolerating stray whitespace.
    codes = [code.strip() for code in opts.langs.split(",")]

    print(f"Refreshing {len(codes)} language(s), last {opts.days} words")
    print(f"Base URL: {opts.base_url}\n")

    refresh_definitions(codes, opts.days, opts.base_url, dry_run=opts.dry_run)


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import json
from pathlib import Path

# Exclude deprecated tests from collection
collect_ignore_glob = ["deprecated/*"]


def pytest_addoption(parser):
parser.addoption(
Expand Down
1 change: 1 addition & 0 deletions tests/deprecated/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Deprecated tests for old Wiktionary parser system
Loading