
Commit 6066875

Dawn/patch llms scripts (#1197)
* Adds check to delete old files from /.ai/ before writing updated one to keep docs and llm files in sync
* adds word count and token estimates to per page files
* adds word count and token estimate to category files
* update token function per feedback from Copilot
* removes output of token and word counts to pages to avoid frequent merge conflicts. Functions for determining counts left in place.
* fresh llms without word and token counts
1 parent 991cf5f commit 6066875

3 files changed, +47 -15 lines changed


scripts/generate_ai_pages.py

Lines changed: 41 additions & 1 deletion
@@ -12,6 +12,7 @@
 import argparse
 import textwrap
 import requests
+import shutil
 from pathlib import Path
 
 # -------------- CLI flags (module-level toggles) --------------
@@ -309,6 +310,29 @@ def build_raw_url(config: dict, slug: str) -> str:
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     return f"https://raw.githubusercontent.com/{org}/{repo}/{branch}/{public_root}/{pages_dirname}/{slug}"
 
+# ----------------------------
+# Word count, token estimate
+# ----------------------------
+
+def word_count(s: str) -> int:
+    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))
+
+def _heuristic_token_count(s: str) -> int:
+    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
+
+def _cl100k_token_count(s: str) -> int:
+    try:
+        import tiktoken  # type: ignore
+        enc = tiktoken.get_encoding("cl100k_base")
+        return len(enc.encode(s))
+    except Exception:
+        return _heuristic_token_count(s)
+
+def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
+    if estimator == "cl100k":
+        return _cl100k_token_count(text)
+    return _heuristic_token_count(text)
+
 # ----------------------------
 # Writer
 # ----------------------------
@@ -333,6 +357,21 @@ def write_ai_page(ai_pages_dir: Path, slug: str, header: dict, body: str):
 # Main
 # ----------------------------
 
+def reset_ai_pages_dir(ai_pages_dir: Path):
+    """Remove all existing AI pages so runs always reflect current docs."""
+    if DRY_RUN:
+        print(f"[dry-run] Would remove existing files under {ai_pages_dir}")
+        return
+
+    if not ai_pages_dir.exists():
+        return
+
+    for entry in ai_pages_dir.iterdir():
+        if entry.is_dir():
+            shutil.rmtree(entry)
+        else:
+            entry.unlink()
+
 def main():
     global ALLOW_REMOTE, DRY_RUN
 
@@ -365,7 +404,6 @@ def main():
     variables = load_yaml(str(variables_path))
 
     # Config bits
-    fm_flag = config.get("content", {}).get("exclusions", {}).get("frontmatter_flag", "ai_exclude")
     skip_basenames = set(config.get("content", {}).get("exclusions", {}).get("skip_basenames", []))
     skip_parts = set(config.get("content", {}).get("exclusions", {}).get("skip_paths", []))
     docs_base_url = config.get("project", {}).get("docs_base_url", "").rstrip("/") + "/"
@@ -374,6 +412,8 @@
     public_root = config.get("outputs", {}).get("public_root", "/.ai/").strip("/")
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     ai_pages_dir = (repo_root / public_root / pages_dirname).resolve()
+    ai_pages_dir.mkdir(parents=True, exist_ok=True)
+    reset_ai_pages_dir(ai_pages_dir)
 
     # Collect files
     files = get_all_markdown_files(str(docs_dir), skip_basenames, skip_parts)
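For illustration only (not part of the commit), a minimal sketch of how the new counters behave on a sample string. The sample text and printed values are my own, and the estimate_tokens shown here is a trimmed-down copy of the heuristic path; the real function only consults tiktoken when the estimator is "cl100k".

import re

def word_count(s: str) -> int:
    # words only, Unicode-aware
    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))

def estimate_tokens(s: str) -> int:
    # heuristic path only: words plus standalone punctuation
    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))

sample = "Generate LLM-friendly pages from docs/."
print(word_count(sample))       # 6  (Generate, LLM, friendly, pages, from, docs)
print(estimate_tokens(sample))  # 9  (the 6 words plus "-", "/", ".")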

scripts/generate_category_bundles.py

Lines changed: 5 additions & 13 deletions
@@ -147,22 +147,13 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
 
 
 # ----------------------------
-# Token estimation
+# Token estimation, word count
 # ----------------------------
 
 def _heuristic_token_count(s: str) -> int:
-    """
-    Dependency-free token estimate:
-    - counts words and standalone punctuation
-    - decent for prose and code; model-agnostic
-    """
     return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
 
 def _cl100k_token_count(s: str) -> int:
-    """
-    Optional: if tiktoken is installed and estimator name is 'cl100k',
-    compute tokens via cl100k_base; otherwise fall back to heuristic.
-    """
     try:
         import tiktoken  # type: ignore
         enc = tiktoken.get_encoding("cl100k_base")
@@ -171,13 +162,13 @@ def _cl100k_token_count(s: str) -> int:
         return _heuristic_token_count(s)
 
 def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
-    if estimator == "heuristic-v1":
-        return _heuristic_token_count(text)
     if estimator == "cl100k":
         return _cl100k_token_count(text)
-    # Unknown/custom estimator name → compute via heuristic but keep the label in outputs.
     return _heuristic_token_count(text)
 
+def word_count(text: str) -> int:
+    return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))
+
 
 # ----------------------------
 # Category logic
@@ -260,6 +251,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
 
     # Precompute token counts once per page
    page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
+    page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
 
     out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
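The rollup below is my own illustration of what the precomputed page_tokens/page_words maps enable (per-category totals in the bundle files, per the commit message). The AiPage dataclass here is a simplified stand-in; the real bundle-writing code is not shown in this diff.

from dataclasses import dataclass
from typing import Dict, List
import re

@dataclass
class AiPage:  # simplified stand-in for the script's AiPage
    slug: str
    body: str

def word_count(text: str) -> int:
    return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))

pages: List[AiPage] = [
    AiPage("install", "Install the CLI, then run it."),
    AiPage("faq", "Common questions and answers."),
]
page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
print(sum(page_words.values()))  # total word count for a category bundle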

scripts/llms_config.json

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
       "terms-of-use.md",
       "privacy-policy.md"
     ],
-    "skip_paths": [".snippets", ".github", ".ai"]
+    "skip_paths": [".snippets", ".github", ".ai", ".venv", "venv"]
   }
 },
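The helper below is hypothetical; the real exclusion logic lives in get_all_markdown_files, which this commit does not touch. It only illustrates why adding ".venv" and "venv" to skip_paths keeps virtual-environment files out of the generated AI pages.

from pathlib import Path

def is_excluded(path: Path, skip_basenames: set, skip_parts: set) -> bool:
    # hypothetical filter: drop a file if its name or any directory component is excluded
    return path.name in skip_basenames or any(part in skip_parts for part in path.parts)

skip_parts = {".snippets", ".github", ".ai", ".venv", "venv"}
print(is_excluded(Path("docs/.venv/lib/README.md"), set(), skip_parts))  # True
print(is_excluded(Path("docs/guides/setup.md"), set(), skip_parts))      # False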

0 commit comments
