
Commit 6066875

Dawn/patch llms scripts (#1197)
* Adds check to delete old files from /.ai/ before writing updated one to keep docs and llm files in sync
* adds word count and token estimates to per page files
* adds word count and token estimate to category files
* update token function per feedback from Copilot
* removes output of token and word counts to pages to avoid frequent merge conflicts. Functions for determining counts left in place.
* fresh llms without word and token counts
1 parent 991cf5f commit 6066875

3 files changed, +47 -15 lines changed


scripts/generate_ai_pages.py

Lines changed: 41 additions & 1 deletion
@@ -12,6 +12,7 @@
 import argparse
 import textwrap
 import requests
+import shutil
 from pathlib import Path
 
 # -------------- CLI flags (module-level toggles) --------------
@@ -309,6 +310,29 @@ def build_raw_url(config: dict, slug: str) -> str:
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     return f"https://raw.githubusercontent.com/{org}/{repo}/{branch}/{public_root}/{pages_dirname}/{slug}"
 
+# ----------------------------
+# Word count, token estimate
+# ----------------------------
+
+def word_count(s: str) -> int:
+    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))
+
+def _heuristic_token_count(s: str) -> int:
+    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
+
+def _cl100k_token_count(s: str) -> int:
+    try:
+        import tiktoken  # type: ignore
+        enc = tiktoken.get_encoding("cl100k_base")
+        return len(enc.encode(s))
+    except Exception:
+        return _heuristic_token_count(s)
+
+def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
+    if estimator == "cl100k":
+        return _cl100k_token_count(text)
+    return _heuristic_token_count(text)
+
 # ----------------------------
 # Writer
 # ----------------------------
@@ -333,6 +357,21 @@ def write_ai_page(ai_pages_dir: Path, slug: str, header: dict, body: str):
 # Main
 # ----------------------------
 
+def reset_ai_pages_dir(ai_pages_dir: Path):
+    """Remove all existing AI pages so runs always reflect current docs."""
+    if DRY_RUN:
+        print(f"[dry-run] Would remove existing files under {ai_pages_dir}")
+        return
+
+    if not ai_pages_dir.exists():
+        return
+
+    for entry in ai_pages_dir.iterdir():
+        if entry.is_dir():
+            shutil.rmtree(entry)
+        else:
+            entry.unlink()
+
 def main():
     global ALLOW_REMOTE, DRY_RUN
 
@@ -365,7 +404,6 @@ def main():
     variables = load_yaml(str(variables_path))
 
     # Config bits
-    fm_flag = config.get("content", {}).get("exclusions", {}).get("frontmatter_flag", "ai_exclude")
     skip_basenames = set(config.get("content", {}).get("exclusions", {}).get("skip_basenames", []))
     skip_parts = set(config.get("content", {}).get("exclusions", {}).get("skip_paths", []))
     docs_base_url = config.get("project", {}).get("docs_base_url", "").rstrip("/") + "/"
@@ -374,6 +412,8 @@
     public_root = config.get("outputs", {}).get("public_root", "/.ai/").strip("/")
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     ai_pages_dir = (repo_root / public_root / pages_dirname).resolve()
+    ai_pages_dir.mkdir(parents=True, exist_ok=True)
+    reset_ai_pages_dir(ai_pages_dir)
 
     # Collect files
     files = get_all_markdown_files(str(docs_dir), skip_basenames, skip_parts)
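For illustration only (not part of the commit), a minimal sketch of how the new counters behave on a sample string. The sample text and printed values are my own, and the estimate_tokens shown here is a trimmed-down copy of the heuristic path; the real function only consults tiktoken when the estimator is "cl100k".

import re

def word_count(s: str) -> int:
    # words only, Unicode-aware
    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))

def estimate_tokens(s: str) -> int:
    # heuristic path only: words plus standalone punctuation
    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))

sample = "Generate LLM-friendly pages from docs/."
print(word_count(sample))       # 6  (Generate, LLM, friendly, pages, from, docs)
print(estimate_tokens(sample))  # 9  (the 6 words plus "-", "/", ".")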

scripts/generate_category_bundles.py

Lines changed: 5 additions & 13 deletions
@@ -147,22 +147,13 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
 
 
 # ----------------------------
-# Token estimation
+# Token estimation, word count
 # ----------------------------
 
 def _heuristic_token_count(s: str) -> int:
-    """
-    Dependency-free token estimate:
-    - counts words and standalone punctuation
-    - decent for prose and code; model-agnostic
-    """
     return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
 
 def _cl100k_token_count(s: str) -> int:
-    """
-    Optional: if tiktoken is installed and estimator name is 'cl100k',
-    compute tokens via cl100k_base; otherwise fall back to heuristic.
-    """
     try:
         import tiktoken  # type: ignore
         enc = tiktoken.get_encoding("cl100k_base")
@@ -171,13 +162,13 @@ def _cl100k_token_count(s: str) -> int:
         return _heuristic_token_count(s)
 
 def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
-    if estimator == "heuristic-v1":
-        return _heuristic_token_count(text)
     if estimator == "cl100k":
         return _cl100k_token_count(text)
-    # Unknown/custom estimator name → compute via heuristic but keep the label in outputs.
     return _heuristic_token_count(text)
 
+def word_count(text: str) -> int:
+    return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))
+
 
 # ----------------------------
 # Category logic
@@ -260,6 +251,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
 
     # Precompute token counts once per page
    page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
+    page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
 
     out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
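The rollup below is my own illustration of what the precomputed page_tokens/page_words maps enable (per-category totals in the bundle files, per the commit message). The AiPage dataclass here is a simplified stand-in; the real bundle-writing code is not shown in this diff.

from dataclasses import dataclass
from typing import Dict, List
import re

@dataclass
class AiPage:  # simplified stand-in for the script's AiPage
    slug: str
    body: str

def word_count(text: str) -> int:
    return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))

pages: List[AiPage] = [
    AiPage("install", "Install the CLI, then run it."),
    AiPage("faq", "Common questions and answers."),
]
page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
print(sum(page_words.values()))  # total word count for a category bundle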

scripts/llms_config.json

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
       "terms-of-use.md",
       "privacy-policy.md"
     ],
-    "skip_paths": [".snippets", ".github", ".ai"]
+    "skip_paths": [".snippets", ".github", ".ai", ".venv", "venv"]
   }
 },
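The helper below is hypothetical; the real exclusion logic lives in get_all_markdown_files, which this commit does not touch. It only illustrates why adding ".venv" and "venv" to skip_paths keeps virtual-environment files out of the generated AI pages.

from pathlib import Path

def is_excluded(path: Path, skip_basenames: set, skip_parts: set) -> bool:
    # hypothetical filter: drop a file if its name or any directory component is excluded
    return path.name in skip_basenames or any(part in skip_parts for part in path.parts)

skip_parts = {".snippets", ".github", ".ai", ".venv", "venv"}
print(is_excluded(Path("docs/.venv/lib/README.md"), set(), skip_parts))  # True
print(is_excluded(Path("docs/guides/setup.md"), set(), skip_parts))      # False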

0 commit comments
