diff --git a/scripts/generate_ai_pages.py b/scripts/generate_ai_pages.py
index fb2119123..34249acff 100644
--- a/scripts/generate_ai_pages.py
+++ b/scripts/generate_ai_pages.py
@@ -12,6 +12,7 @@
 import argparse
 import textwrap
 import requests
+import shutil
 from pathlib import Path
 
 # -------------- CLI flags (module-level toggles) --------------
@@ -309,6 +310,29 @@ def build_raw_url(config: dict, slug: str) -> str:
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     return f"https://raw.githubusercontent.com/{org}/{repo}/{branch}/{public_root}/{pages_dirname}/{slug}"
 
+# ----------------------------
+# Word count, token estimate
+# ----------------------------
+
+def word_count(s: str) -> int:
+    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))
+
+def _heuristic_token_count(s: str) -> int:
+    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
+
+def _cl100k_token_count(s: str) -> int:
+    try:
+        import tiktoken  # type: ignore
+        enc = tiktoken.get_encoding("cl100k_base")
+        return len(enc.encode(s))
+    except Exception:
+        return _heuristic_token_count(s)
+
+def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
+    if estimator == "cl100k":
+        return _cl100k_token_count(text)
+    return _heuristic_token_count(text)
+
 # ----------------------------
 # Writer
 # ----------------------------
@@ -333,6 +357,21 @@ def write_ai_page(ai_pages_dir: Path, slug: str, header: dict, body: str):
 # Main
 # ----------------------------
 
+def reset_ai_pages_dir(ai_pages_dir: Path):
+    """Remove all existing AI pages so runs always reflect current docs."""
+    if DRY_RUN:
+        print(f"[dry-run] Would remove existing files under {ai_pages_dir}")
+        return
+
+    if not ai_pages_dir.exists():
+        return
+
+    for entry in ai_pages_dir.iterdir():
+        if entry.is_dir():
+            shutil.rmtree(entry)
+        else:
+            entry.unlink()
+
 def main():
     global ALLOW_REMOTE, DRY_RUN
@@ -365,7 +404,6 @@ def main():
     variables = load_yaml(str(variables_path))
 
     # Config bits
-    fm_flag = config.get("content", {}).get("exclusions", {}).get("frontmatter_flag", "ai_exclude")
     skip_basenames = set(config.get("content", {}).get("exclusions", {}).get("skip_basenames", []))
     skip_parts = set(config.get("content", {}).get("exclusions", {}).get("skip_paths", []))
     docs_base_url = config.get("project", {}).get("docs_base_url", "").rstrip("/") + "/"
@@ -374,6 +412,8 @@ def main():
     public_root = config.get("outputs", {}).get("public_root", "/.ai/").strip("/")
     pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
     ai_pages_dir = (repo_root / public_root / pages_dirname).resolve()
+    ai_pages_dir.mkdir(parents=True, exist_ok=True)
+    reset_ai_pages_dir(ai_pages_dir)
 
     # Collect files
     files = get_all_markdown_files(str(docs_dir), skip_basenames, skip_parts)
diff --git a/scripts/generate_category_bundles.py b/scripts/generate_category_bundles.py
index 461406346..5ec0f90bc 100644
--- a/scripts/generate_category_bundles.py
+++ b/scripts/generate_category_bundles.py
@@ -147,22 +147,13 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
 
 
 # ----------------------------
-# Token estimation
+# Token estimation, word count
 # ----------------------------
 
 def _heuristic_token_count(s: str) -> int:
-    """
-    Dependency-free token estimate:
-    - counts words and standalone punctuation
-    - decent for prose and code; model-agnostic
-    """
     return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
 
 def _cl100k_token_count(s: str) -> int:
-    """
-    Optional: if tiktoken is installed and estimator name is 'cl100k',
-    compute tokens via cl100k_base; otherwise fall back to heuristic.
-    """
     try:
         import tiktoken  # type: ignore
         enc = tiktoken.get_encoding("cl100k_base")
@@ -171,13 +162,13 @@ def _cl100k_token_count(s: str) -> int:
         return _heuristic_token_count(s)
 
 def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
-    if estimator == "heuristic-v1":
-        return _heuristic_token_count(text)
     if estimator == "cl100k":
         return _cl100k_token_count(text)
-    # Unknown/custom estimator name → compute via heuristic but keep the label in outputs.
     return _heuristic_token_count(text)
 
+def word_count(text: str) -> int:
+    return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))
+
 
 # ----------------------------
 # Category logic
@@ -260,6 +251,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
 
     # Precompute token counts once per page
     page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
+    page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
 
     out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
 
diff --git a/scripts/llms_config.json b/scripts/llms_config.json
index 78a0a694f..27bee59c4 100644
--- a/scripts/llms_config.json
+++ b/scripts/llms_config.json
@@ -42,7 +42,7 @@
         "terms-of-use.md",
         "privacy-policy.md"
       ],
-      "skip_paths": [".snippets", ".github", ".ai"]
+      "skip_paths": [".snippets", ".github", ".ai", ".venv", "venv"]
     }
   },
 