42 changes: 41 additions & 1 deletion scripts/generate_ai_pages.py
@@ -12,6 +12,7 @@
import argparse
import textwrap
import requests
import shutil
from pathlib import Path

# -------------- CLI flags (module-level toggles) --------------
@@ -309,6 +310,29 @@ def build_raw_url(config: dict, slug: str) -> str:
pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
return f"https://raw.githubusercontent.com/{org}/{repo}/{branch}/{public_root}/{pages_dirname}/{slug}"

# ----------------------------
# Word count, token estimate
# ----------------------------

def word_count(s: str) -> int:
return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))

def _heuristic_token_count(s: str) -> int:
return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))

def _cl100k_token_count(s: str) -> int:
try:
import tiktoken # type: ignore
enc = tiktoken.get_encoding("cl100k_base")
return len(enc.encode(s))
except Exception:
return _heuristic_token_count(s)

def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
if estimator == "cl100k":
return _cl100k_token_count(text)
return _heuristic_token_count(text)

# ----------------------------
# Writer
# ----------------------------
@@ -333,6 +357,21 @@ def write_ai_page(ai_pages_dir: Path, slug: str, header: dict, body: str):
# Main
# ----------------------------

def reset_ai_pages_dir(ai_pages_dir: Path):
"""Remove all existing AI pages so runs always reflect current docs."""
if DRY_RUN:
print(f"[dry-run] Would remove existing files under {ai_pages_dir}")
return

if not ai_pages_dir.exists():
return

for entry in ai_pages_dir.iterdir():
if entry.is_dir():
shutil.rmtree(entry)
else:
entry.unlink()

def main():
global ALLOW_REMOTE, DRY_RUN

@@ -365,7 +404,6 @@ def main():
variables = load_yaml(str(variables_path))

# Config bits
fm_flag = config.get("content", {}).get("exclusions", {}).get("frontmatter_flag", "ai_exclude")
skip_basenames = set(config.get("content", {}).get("exclusions", {}).get("skip_basenames", []))
skip_parts = set(config.get("content", {}).get("exclusions", {}).get("skip_paths", []))
docs_base_url = config.get("project", {}).get("docs_base_url", "").rstrip("/") + "/"
@@ -374,6 +412,8 @@
public_root = config.get("outputs", {}).get("public_root", "/.ai/").strip("/")
pages_dirname = config.get("outputs", {}).get("files", {}).get("pages_dir", "pages")
ai_pages_dir = (repo_root / public_root / pages_dirname).resolve()
ai_pages_dir.mkdir(parents=True, exist_ok=True)
reset_ai_pages_dir(ai_pages_dir)

# Collect files
files = get_all_markdown_files(str(docs_dir), skip_basenames, skip_parts)
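The counting helpers added here mirror the ones already living in generate_category_bundles.py. A minimal, self-contained sketch of what the two counters return (the sample string and numbers are illustrative, not from the PR):

import re

def word_count(s: str) -> int:
    # words only: "guide.md" contributes two words ("guide" and "md")
    return len(re.findall(r"\b\w+\b", s, flags=re.UNICODE))

def _heuristic_token_count(s: str) -> int:
    # words plus standalone punctuation, so punctuation-heavy text scores higher
    return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))

sample = "Hello, world! See docs/guide.md"
print(word_count(sample))              # 6: Hello, world, See, docs, guide, md
print(_heuristic_token_count(sample))  # 10: the comma, bang, slash, and dot count too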
18 changes: 5 additions & 13 deletions scripts/generate_category_bundles.py
@@ -147,22 +147,13 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:


# ----------------------------
# Token estimation
# Token estimation, word count
# ----------------------------

def _heuristic_token_count(s: str) -> int:
"""
Dependency-free token estimate:
- counts words and standalone punctuation
- decent for prose and code; model-agnostic
"""
return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))

def _cl100k_token_count(s: str) -> int:
"""
Optional: if tiktoken is installed and estimator name is 'cl100k',
compute tokens via cl100k_base; otherwise fall back to heuristic.
"""
try:
import tiktoken # type: ignore
enc = tiktoken.get_encoding("cl100k_base")
@@ -171,13 +162,13 @@ def _cl100k_token_count(s: str) -> int:
return _heuristic_token_count(s)

def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
if estimator == "heuristic-v1":
return _heuristic_token_count(text)
if estimator == "cl100k":
return _cl100k_token_count(text)
# Unknown/custom estimator name → compute via heuristic but keep the label in outputs.
return _heuristic_token_count(text)

def word_count(text: str) -> int:
return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))


# ----------------------------
# Category logic
@@ -260,6 +251,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int

# Precompute token counts once per page
page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}

out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()

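A behavioral note on the slimmed-down estimate_tokens: "cl100k" uses tiktoken when it is installed, and every other label (including unknown custom names) falls back to the heuristic while the label itself is kept in the outputs. A sketch of the per-page precompute pattern, assuming the functions in this file are importable (slugs and bodies are made up):

pages = {
    "getting-started": "Install it. Run it.",
    "api-reference": "GET /v1/items returns 200 OK.",
}
token_estimator = "heuristic-v1"  # "cl100k" would try tiktoken first

# one pass per page, matching the dict comprehensions above
page_tokens = {slug: estimate_tokens(body, token_estimator) for slug, body in pages.items()}
page_words = {slug: word_count(body) for slug, body in pages.items()}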
2 changes: 1 addition & 1 deletion scripts/llms_config.json
@@ -42,7 +42,7 @@
"terms-of-use.md",
"privacy-policy.md"
],
"skip_paths": [".snippets", ".github", ".ai"]
"skip_paths": [".snippets", ".github", ".ai", ".venv", "venv"]
}
},
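Adding ".venv" and "venv" keeps virtualenv contents (which often ship their own README.md files) out of the generated pages. get_all_markdown_files is not shown in this diff; a hypothetical part-based filter consistent with how skip_paths is read in main() might look like:

from pathlib import Path

def is_skipped(path: Path, skip_parts: set) -> bool:
    # exclude any file that sits under one of the skipped directory names
    return any(part in skip_parts for part in path.parts)

skip_parts = {".snippets", ".github", ".ai", ".venv", "venv"}
print(is_skipped(Path("docs/.venv/lib/README.md"), skip_parts))  # True
print(is_skipped(Path("docs/guides/start.md"), skip_parts))      # False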
