1212import argparse
1313import textwrap
1414import requests
15+ import shutil
1516from pathlib import Path
1617
1718# -------------- CLI flags (module-level toggles) --------------
@@ -309,6 +310,29 @@ def build_raw_url(config: dict, slug: str) -> str:
309310 pages_dirname = config .get ("outputs" , {}).get ("files" , {}).get ("pages_dir" , "pages" )
310311 return f"https://raw.githubusercontent.com/{ org } /{ repo } /{ branch } /{ public_root } /{ pages_dirname } /{ slug } "
311312
313+ # ----------------------------
314+ # Word count, token estimate
315+ # ----------------------------
316+
def word_count(s: str) -> int:
    """Return the number of words in ``s``.

    A word is a maximal run of word characters (letters, digits, underscore)
    bounded by word boundaries, so punctuation and whitespace split words;
    for example ``"don't"`` counts as two.
    """
    word_matches = re.finditer(r"\b\w+\b", s, flags=re.UNICODE)
    return sum(1 for _ in word_matches)
319+
320+ def _heuristic_token_count (s : str ) -> int :
321+ return len (re .findall (r"\w+|[^\s\w]" , s , flags = re .UNICODE ))
322+
def _cl100k_token_count(s: str) -> int:
    """Count tokens in ``s`` with tiktoken's ``cl100k_base`` encoding.

    This is best-effort: if tiktoken cannot be imported, the encoding cannot
    be loaded, or encoding fails for any reason, the count falls back to
    ``_heuristic_token_count`` so callers always get an integer.

    Args:
        s: Text to tokenize.

    Returns:
        The number of cl100k tokens, or the heuristic estimate on failure.
    """
    try:
        import tiktoken  # type: ignore

        enc = tiktoken.get_encoding("cl100k_base")
        # By default tiktoken raises ValueError when the text contains a
        # special-token literal such as "<|endoftext|>". Documentation pages
        # can legitimately mention those, and the broad fallback below would
        # then silently switch estimators for just those pages. Encoding them
        # as ordinary text keeps the count on the requested tokenizer.
        return len(enc.encode(s, disallowed_special=()))
    except Exception:
        # Deliberate best-effort: any failure degrades to the heuristic.
        return _heuristic_token_count(s)
330+
def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
    """Estimate how many tokens ``text`` contains.

    Args:
        text: The text to measure.
        estimator: ``"cl100k"`` selects the tiktoken-backed counter; any
            other value (including the default ``"heuristic-v1"``) selects
            the regex-based heuristic.

    Returns:
        The estimated token count.
    """
    use_cl100k = estimator == "cl100k"
    counter = _cl100k_token_count if use_cl100k else _heuristic_token_count
    return counter(text)
335+
312336# ----------------------------
313337# Writer
314338# ----------------------------
@@ -333,6 +357,21 @@ def write_ai_page(ai_pages_dir: Path, slug: str, header: dict, body: str):
333357# Main
334358# ----------------------------
335359
def reset_ai_pages_dir(ai_pages_dir: Path) -> None:
    """Remove all existing AI pages so runs always reflect current docs.

    Every entry directly under ``ai_pages_dir`` is deleted; the directory
    itself is kept. When the module-level ``DRY_RUN`` flag is set, nothing is
    deleted and a message describing the intended action is printed instead.
    A missing directory is treated as already clean.

    Args:
        ai_pages_dir: Directory whose contents should be cleared.
    """
    if DRY_RUN:
        print(f"[dry-run] Would remove existing files under {ai_pages_dir}")
        return

    if not ai_pages_dir.exists():
        return

    for entry in ai_pages_dir.iterdir():
        # Path.is_dir() follows symlinks, but shutil.rmtree() refuses to
        # operate on a symlink and raises OSError. Unlink symlinks (leaving
        # their targets untouched) and only recurse into real directories.
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()
374+
336375def main ():
337376 global ALLOW_REMOTE , DRY_RUN
338377
@@ -365,7 +404,6 @@ def main():
365404 variables = load_yaml (str (variables_path ))
366405
367406 # Config bits
368- fm_flag = config .get ("content" , {}).get ("exclusions" , {}).get ("frontmatter_flag" , "ai_exclude" )
369407 skip_basenames = set (config .get ("content" , {}).get ("exclusions" , {}).get ("skip_basenames" , []))
370408 skip_parts = set (config .get ("content" , {}).get ("exclusions" , {}).get ("skip_paths" , []))
371409 docs_base_url = config .get ("project" , {}).get ("docs_base_url" , "" ).rstrip ("/" ) + "/"
@@ -374,6 +412,8 @@ def main():
374412 public_root = config .get ("outputs" , {}).get ("public_root" , "/.ai/" ).strip ("/" )
375413 pages_dirname = config .get ("outputs" , {}).get ("files" , {}).get ("pages_dir" , "pages" )
376414 ai_pages_dir = (repo_root / public_root / pages_dirname ).resolve ()
415+ ai_pages_dir .mkdir (parents = True , exist_ok = True )
416+ reset_ai_pages_dir (ai_pages_dir )
377417
378418 # Collect files
379419 files = get_all_markdown_files (str (docs_dir ), skip_basenames , skip_parts )
0 commit comments