From eb1a01672249f9f28c5c38ccaac2df12be3e9d96 Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 14:51:48 +0100 Subject: [PATCH 1/6] Add populate-search-engine command and workflow - Add new markdown splitting logic based on headings (h1-h6) - Implement process_hf_docs.py to fetch and process docs from HF dataset - Add populate-search-engine CLI command (replaces build-embeddings) - Add GitHub Actions workflow for automated doc processing - Support downloading pre-built docs from hf-doc-build/doc-build dataset - Handle markdown chunking with heading hierarchy preservation - Add skip-embeddings flag for testing without embedding generation --- .github/workflows/populate_search_engine.yml | 89 ++++ src/doc_builder/build_embeddings.py | 157 +++++++ src/doc_builder/commands/embeddings.py | 225 +++++----- src/doc_builder/process_hf_docs.py | 433 +++++++++++++++++++ 4 files changed, 807 insertions(+), 97 deletions(-) create mode 100644 .github/workflows/populate_search_engine.yml create mode 100644 src/doc_builder/process_hf_docs.py diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml new file mode 100644 index 00000000..ebcd1c0a --- /dev/null +++ b/.github/workflows/populate_search_engine.yml @@ -0,0 +1,89 @@ +name: Populate Search Engine + +on: + schedule: + - cron: "5 7 * * *" # every day at 07:05 + # to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + libraries: + description: 'Specific libraries to process (space-separated, e.g., "accelerate diffusers"). Leave empty for all.' + required: false + default: '' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +jobs: + process-docs: + runs-on: ubuntu-latest + timeout-minutes: 360 # Set timeout to 6 hours + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Populate search engine from HF doc-build dataset + shell: bash + run: | + echo "Processing documentation from hf-doc-build/doc-build dataset..." + + # Build command + CMD="uv run doc-builder populate-search-engine" + + # Add library filter if specified + if [ ! 
-z "${{ github.event.inputs.libraries }}" ]; then + CMD="$CMD --libraries ${{ github.event.inputs.libraries }}" + fi + + # Add skip embeddings flag + CMD="$CMD --skip-embeddings" + + # Add credentials + CMD="$CMD --hf_ie_name docs-embed-bge-base-en-v1-5" + CMD="$CMD --hf_ie_namespace huggingface" + CMD="$CMD --hf_ie_token ${{ secrets.HF_IE_TOKEN }}" + CMD="$CMD --meilisearch_key ${{ secrets.MEILISEARCH_KEY }}" + + # Execute + echo "Running: $CMD" + $CMD + + cleanup-job: + needs: process-docs + runs-on: ubuntu-latest + if: always() # This ensures that the cleanup job runs regardless of the result + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Success Cleanup + if: needs.process-docs.result == 'success' # Runs if job succeeded + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap + + - name: Failure Cleanup + if: needs.process-docs.result == 'failure' # Runs if job failed + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + diff --git a/src/doc_builder/build_embeddings.py b/src/doc_builder/build_embeddings.py index bb50adc3..bbd4c52c 100644 --- a/src/doc_builder/build_embeddings.py +++ b/src/doc_builder/build_embeddings.py @@ -408,6 +408,163 @@ def clean_md(text): return text.strip() +def split_into_excerpts(text: str, max_length: int) -> list[str]: + """ + Split text into excerpts of approximately max_length characters. + + Args: + text: The text to split + max_length: Maximum length of each excerpt in characters + + Returns: + List of text excerpts + """ + if not text: + return [] + + excerpts = [] + current_index = 0 + + while current_index < len(text): + end_index = current_index + max_length + + # If we're at the end of the text, just take the rest + if end_index >= len(text): + excerpts.append(text[current_index:].strip()) + break + + # Look for the next word boundary after "max_length" characters + remaining_text = text[end_index:] + word_boundary_match = re.search(r'\b', remaining_text) + + margin = 50 + + if word_boundary_match and word_boundary_match.start() <= margin: + # Found a word boundary within 50 characters, use it + end_index = end_index + word_boundary_match.start() + else: + # No word boundary within 50 chars, fall back to finding good breaking points + break_point = end_index + for i in range(end_index, max(current_index + max_length - margin, current_index), -1): + if i < len(text): + char = text[i] + if char in [' ', '\n', '.', ',', ';', '!', '?']: + break_point = i + 1 + break + end_index = break_point + + excerpts.append(text[current_index:end_index].strip()) + current_index = end_index + + return [excerpt for excerpt in excerpts if len(excerpt) > 0] + + +def build_headings_object(heading_stack: list[str]) -> dict: + """ + Build headings dictionary from heading stack. 
+ + Args: + heading_stack: List of heading strings in format "## Heading Text" + + Returns: + Dictionary with heading1 through heading6 keys + """ + headings = {} + + for heading in heading_stack: + match = re.match(r'^(#{1,6})\s+(.+)$', heading) + if match: + level = len(match.group(1)) + text = match.group(2).strip() + headings[f'heading{level}'] = text + + return headings + + +def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = 1000) -> list[dict]: + """ + Split markdown content by headings and create sections with excerpts. + Similar to the TypeScript implementation for consistent chunking. + + Args: + markdown_content: The markdown text to split + excerpts_max_length: Maximum length of each excerpt in characters (default: 1000) + + Returns: + List of dictionaries with 'excerpts' (list of text chunks) and 'headings' (dict) keys + """ + lines = markdown_content.split("\n") + sections = [] + + current_section = "" + heading_stack = [] + line_index = 0 + + while line_index < len(lines): + line = lines[line_index] + heading_match = re.match(r'^(#{1,6})\s+(.+)$', line) + + if heading_match: + # Save the previous section if it has content + if current_section.strip(): + sections.append({ + 'excerpts': split_into_excerpts(current_section.strip(), excerpts_max_length), + 'headings': build_headings_object(heading_stack) + }) + + # Parse the heading + heading_level = len(heading_match.group(1)) + heading_text = heading_match.group(2).strip() + full_heading = f"{heading_match.group(1)} {heading_text}" + + # Update heading stack based on level + # Keep only headings with lower level than current + new_stack = [] + for h in heading_stack: + h_match = re.match(r'^(#{1,6})', h) + if h_match: + existing_level = len(h_match.group(1)) + if existing_level < heading_level: + new_stack.append(h) + heading_stack = new_stack + + # Add current heading + heading_stack.append(full_heading) + + # Start new section with the heading + current_section = line + + # Look ahead to include content after heading + line_index += 1 + while line_index < len(lines): + next_line = lines[line_index] + next_heading_match = re.match(r'^(#{1,6})\s+(.+)$', next_line) + + if next_heading_match: + # Found next heading, break to process it + break + else: + # Add line to current section + current_section += "\n" + next_line + line_index += 1 + + # Don't increment line_index here since we either reached end or found next heading + continue + else: + # Add line to current section + current_section += ("\n" if current_section else "") + line + line_index += 1 + + # Add the last section + if current_section.strip(): + sections.append({ + 'excerpts': split_into_excerpts(current_section.strip(), excerpts_max_length), + 'headings': build_headings_object(heading_stack) + }) + + return sections + + def get_page_title(path: str): """ Given a path to doc page, generate doc page title. 
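[Editor's note: a minimal usage sketch of the heading-based splitter introduced in the build_embeddings.py hunk above. The sample markdown is illustrative; the import path matches how process_hf_docs.py consumes the function later in this patch, and the returned shape (a list of dicts with "excerpts" and "headings" keys) is taken directly from the added code.]

# Sketch: split a tiny markdown document and inspect the resulting sections.
from doc_builder.build_embeddings import split_markdown_by_headings

md = """# Install
Run pip install accelerate.

## From source
Clone the repo and run pip install -e .
"""

sections = split_markdown_by_headings(md, excerpts_max_length=1000)
for section in sections:
    # Each section carries the active heading hierarchy plus one or more excerpts.
    # Expected: {'heading1': 'Install'} for the first section,
    # then {'heading1': 'Install', 'heading2': 'From source'} for the second.
    print(section["headings"])
    for excerpt in section["excerpts"]:
        print("  ", excerpt[:60])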
diff --git a/src/doc_builder/commands/embeddings.py b/src/doc_builder/commands/embeddings.py index fed7eb21..0dda56ff 100644 --- a/src/doc_builder/commands/embeddings.py +++ b/src/doc_builder/commands/embeddings.py @@ -14,111 +14,79 @@ import argparse -import importlib +from pathlib import Path + +from doc_builder import clean_meilisearch +from doc_builder.build_embeddings import add_gradio_docs, call_embedding_inference +from doc_builder.meilisearch_helper import add_embeddings_to_db +from doc_builder.process_hf_docs import process_all_libraries +from doc_builder.utils import chunk_list + + +def process_hf_docs_command(args): + """ + Process documentation from HF doc-build dataset. + Downloads pre-built docs and generates embeddings. + """ + import meilisearch + from tqdm import tqdm + + print("Processing documentation from HF doc-build dataset...") + + # Process all or specific libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries if args.libraries else None, + skip_download=args.skip_download + ) -from doc_builder import build_embeddings, clean_meilisearch -from doc_builder.build_embeddings import add_gradio_docs -from doc_builder.utils import get_default_branch_name, get_doc_config, read_doc_config + # If embeddings are requested + if not args.skip_embeddings: + print("\n" + "=" * 80) + print("šŸ”¢ GENERATING EMBEDDINGS") + print("=" * 80) + # Collect all chunks + all_chunks = [] + for _library_name, chunks in results.items(): + all_chunks.extend(chunks) -def embeddings_command(args): - read_doc_config(args.path_to_docs) + print(f"\nTotal chunks to embed: {len(all_chunks)}") - default_version = get_default_branch_name(args.path_to_docs) - if args.not_python_module and args.version is None: - version = default_version - elif args.version is None: - module = importlib.import_module(args.library_name) - version = module.__version__ + # Generate embeddings + from doc_builder.build_embeddings import MEILI_INDEX_TEMP - if "dev" in version: - version = default_version - else: - version = f"v{version}" - else: - version = args.version - - # `version` will always start with prefix `v` - # `version_tag` does not have to start with prefix `v` (see: https://github.com/huggingface/datasets/tags) - version_tag = version - if version != default_version: - doc_config = get_doc_config() - version_prefix = getattr(doc_config, "version_prefix", "v") - version_ = version[1:] # v2.1.0 -> 2.1.0 - version_tag = f"{version_prefix}{version_}" - - # Disable notebook building for non-master version - if version != default_version: - args.notebook_dir = None - - print("Building embeddings for", args.library_name, args.path_to_docs) - build_embeddings( - args.library_name, - args.path_to_docs, - args.hf_ie_name, - args.hf_ie_namespace, - args.hf_ie_token, - args.meilisearch_key, - version=version, - version_tag=version_tag, - language=args.language, - is_python_module=not args.not_python_module, - version_tag_suffix=args.version_tag_suffix, - repo_owner=args.repo_owner, - repo_name=args.repo_name, - ) + embeddings = call_embedding_inference( + all_chunks, + args.hf_ie_name, + args.hf_ie_namespace, + args.hf_ie_token, + is_python_module=False # Pre-built docs are not Python modules + ) + # Push to Meilisearch + print("\n" + "=" * 80) + print("šŸ“¤ UPLOADING TO MEILISEARCH") + print("=" * 80) -def embeddings_command_parser(subparsers=None): - if subparsers is not None: - parser = 
subparsers.add_parser("embeddings") - else: - parser = argparse.ArgumentParser("Doc Builder embeddings command") + client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key) + ITEMS_PER_CHUNK = 5000 - parser.add_argument("library_name", type=str, help="Library name") - parser.add_argument( - "path_to_docs", - type=str, - help="Local path to library documentation. The library should be cloned, and the folder containing the " - "documentation files should be indicated here.", - ) - parser.add_argument("--hf_ie_name", type=str, help="Inference Endpoints name.", required=True) - parser.add_argument("--hf_ie_namespace", type=str, help="Inference Endpoints namespace.", required=True) - parser.add_argument("--hf_ie_token", type=str, help="Hugging Face token.", required=True) - parser.add_argument("--meilisearch_key", type=str, help="Meilisearch key.", required=True) - parser.add_argument("--language", type=str, help="Language of the documentation to generate", default="en") - parser.add_argument( - "--version", - type=str, - help="Version of the documentation to generate. Will default to the version of the package module (using " - "`main` for a version containing dev).", - ) - parser.add_argument( - "--not_python_module", - action="store_true", - help="Whether docs files do NOT have corresponding python module (like HF course & hub docs).", - ) - parser.add_argument( - "--version_tag_suffix", - type=str, - default="src/", - help="Suffix to add after the version tag (e.g. 1.3.0 or main) in the documentation links. For example, the default `src/` suffix will result in a base link as `https://github.com/huggingface/{package_name}/blob/{version_tag}/src/`.", - ) - parser.add_argument( - "--repo_owner", - type=str, - default="huggingface", - help="Owner of the repo (e.g. huggingface, rwightman, etc.).", - ) - parser.add_argument( - "--repo_name", - type=str, - default=None, - help="Name of the repo (e.g. transformers, pytorch-image-models, etc.). By default, this is the same as the library_name.", - ) - if subparsers is not None: - parser.set_defaults(func=embeddings_command) + for chunk_embeddings in tqdm( + chunk_list(embeddings, ITEMS_PER_CHUNK), + desc="Uploading to meilisearch" + ): + add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings) + + print(f"\nāœ… Successfully uploaded {len(embeddings)} embeddings to Meilisearch") + print("\n" + "=" * 80) + print("āœ… PROCESSING COMPLETE") + print("=" * 80) + + +def embeddings_command_parser(subparsers=None): # meilsiearch clean: swap & delete the temp index if subparsers is not None: parser_meilisearch_clean = subparsers.add_parser("meilisearch-clean") @@ -154,4 +122,67 @@ def embeddings_command_parser(subparsers=None): ) ) - return parser + # populate-search-engine: process documentation from HF doc-build dataset and populate search engine + if subparsers is not None: + parser_process_hf_docs = subparsers.add_parser("populate-search-engine") + else: + parser_process_hf_docs = argparse.ArgumentParser( + "Doc Builder populate-search-engine command. Process pre-built documentation from HF doc-build dataset and populate search engine." + ) + + parser_process_hf_docs.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)" + ) + parser_process_hf_docs.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers). If not specified, processes all libraries." 
+ ) + parser_process_hf_docs.add_argument( + "--excerpt-length", + type=int, + default=1000, + help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser_process_hf_docs.add_argument( + "--skip-download", + action="store_true", + help="Skip download if files already exist in output-dir" + ) + parser_process_hf_docs.add_argument( + "--skip-embeddings", + action="store_true", + help="Skip embedding generation and meilisearch upload (useful for testing)" + ) + parser_process_hf_docs.add_argument( + "--hf_ie_name", + type=str, + help="Inference Endpoints name (required unless --skip-embeddings is set)", + required=False + ) + parser_process_hf_docs.add_argument( + "--hf_ie_namespace", + type=str, + help="Inference Endpoints namespace (required unless --skip-embeddings is set)", + required=False + ) + parser_process_hf_docs.add_argument( + "--hf_ie_token", + type=str, + help="Hugging Face token (required unless --skip-embeddings is set)", + required=False + ) + parser_process_hf_docs.add_argument( + "--meilisearch_key", + type=str, + help="Meilisearch key (required unless --skip-embeddings is set)", + required=False + ) + + if subparsers is not None: + parser_process_hf_docs.set_defaults(func=process_hf_docs_command) diff --git a/src/doc_builder/process_hf_docs.py b/src/doc_builder/process_hf_docs.py new file mode 100644 index 00000000..4f295aa0 --- /dev/null +++ b/src/doc_builder/process_hf_docs.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Process documentation from HuggingFace doc-build dataset. +Downloads and processes pre-built documentation markdown files. +""" + +import io +import json +import os +import tempfile +import zipfile +from collections import namedtuple +from pathlib import Path + +import requests +from tqdm import tqdm + +from .build_embeddings import split_markdown_by_headings + +Chunk = namedtuple("Chunk", "text source_page_url source_page_title package_name headings") + +HF_DATASET_REPO = "hf-doc-build/doc-build" +HF_DATASET_API_URL = f"https://huggingface.co/api/datasets/{HF_DATASET_REPO}/tree/main" +HF_DATASET_BASE_URL = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main" + + +def fetch_library_directories() -> list[dict]: + """ + Fetch the list of library directories from the HF doc-build dataset. + + Returns: + List of directory metadata dictionaries with 'path' and 'oid' keys + """ + print(f"Fetching library directories from {HF_DATASET_API_URL}...") + response = requests.get(HF_DATASET_API_URL) + response.raise_for_status() + + data = response.json() + + # Filter only directories + directories = [item for item in data if item.get("type") == "directory"] + + print(f"Found {len(directories)} library directories") + return directories + + +def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None: + """ + Download and extract the main.zip file for a library. 
+ + Args: + library_name: Name of the library (e.g., 'accelerate') + output_dir: Directory to extract files to + + Returns: + Path to extracted directory, or None if download failed + """ + zip_url = f"{HF_DATASET_BASE_URL}/{library_name}/main.zip" + + try: + print(f" Downloading {zip_url}...") + response = requests.get(zip_url, stream=True) + response.raise_for_status() + + # Get total size for progress bar + total_size = int(response.headers.get('content-length', 0)) + + # Download to memory + zip_content = io.BytesIO() + with tqdm(total=total_size, unit='B', unit_scale=True, desc=f" {library_name}") as pbar: + for chunk in response.iter_content(chunk_size=8192): + zip_content.write(chunk) + pbar.update(len(chunk)) + + # Extract zip + zip_content.seek(0) + extract_path = output_dir / library_name + extract_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(zip_content) as zip_ref: + zip_ref.extractall(extract_path) + + print(f" Extracted to {extract_path}") + return extract_path + + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + print(f" āš ļø No main.zip found for {library_name}, skipping...") + return None + raise + except Exception as e: + print(f" āŒ Error processing {library_name}: {e}") + return None + + +def find_markdown_files(directory: Path) -> list[Path]: + """ + Recursively find all markdown files in a directory. + + Args: + directory: Root directory to search + + Returns: + List of paths to markdown files + """ + markdown_files = [] + for file_path in directory.rglob("*"): + if file_path.is_file() and file_path.suffix in [".md", ".mdx"]: + markdown_files.append(file_path) + return markdown_files + + +def markdown_file_to_url(file_path: Path, library_name: str, base_dir: Path) -> str: + """ + Convert a file path to a HuggingFace docs URL. + + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory (the extracted library folder) + + Returns: + URL string + """ + # Get relative path from base_dir + relative_path = file_path.relative_to(base_dir) + + # Remove file extension + path_without_ext = relative_path.with_suffix("") + + # Convert to URL format + url_path = str(path_without_ext).replace(os.sep, "/") + + # Build URL + url = f"https://huggingface.co/docs/{library_name}/{url_path}" + + return url + + +def get_page_title(file_path: Path) -> str: + """ + Generate a page title from file path. + + Args: + file_path: Path to the file + + Returns: + Formatted page title + """ + # Use the filename without extension + name = file_path.stem + # Replace underscores and hyphens with spaces + formatted = name.replace("_", " ").replace("-", " ") + # Capitalize + return formatted.title() + + +def process_markdown_file( + file_path: Path, + library_name: str, + base_dir: Path, + excerpts_max_length: int = 1000 +) -> list[Chunk]: + """ + Process a single markdown file into chunks. 
+ + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory for URL generation + excerpts_max_length: Maximum length of each excerpt + + Returns: + List of Chunk objects + """ + try: + with open(file_path, encoding='utf-8') as f: + content = f.read() + + # Split markdown by headings + sections = split_markdown_by_headings(content, excerpts_max_length) + + # Generate base URL for this file + base_url = markdown_file_to_url(file_path, library_name, base_dir) + page_title = get_page_title(file_path) + + # Convert sections to Chunks + chunks = [] + for section in sections: + headings_dict = section['headings'] + + # Create heading list from the dictionary + heading_list = [] + for i in range(1, 7): + heading_key = f'heading{i}' + if heading_key in headings_dict: + # Reconstruct the heading with # marks + heading_text = headings_dict[heading_key] + heading_list.append('#' * i + ' ' + heading_text) + + # Generate URL with anchor (use first heading as anchor) + url = base_url + if headings_dict: + # Use the deepest heading for anchor + last_heading = None + for i in range(6, 0, -1): + if f'heading{i}' in headings_dict: + last_heading = headings_dict[f'heading{i}'] + break + + if last_heading: + # Create anchor from heading (lowercase, replace spaces with hyphens) + anchor = last_heading.lower().replace(' ', '-') + # Remove special characters + anchor = ''.join(c for c in anchor if c.isalnum() or c == '-') + url = f"{base_url}#{anchor}" + + # Create a chunk for each excerpt + for excerpt in section['excerpts']: + chunk = Chunk( + text=excerpt, + source_page_url=url, + source_page_title=page_title, + package_name=library_name, + headings=heading_list + ) + chunks.append(chunk) + + return chunks + + except Exception as e: + print(f" āš ļø Error processing {file_path.name}: {e}") + return [] + + +def process_library( + library_name: str, + output_dir: Path, + excerpts_max_length: int = 1000, + skip_download: bool = False +) -> list[Chunk]: + """ + Process a single library: download, extract, and chunk all markdown files. 
+ + Args: + library_name: Name of the library + output_dir: Directory for temporary files + excerpts_max_length: Maximum length of each excerpt + skip_download: Skip download if files already exist + + Returns: + List of all chunks for this library + """ + print(f"\nšŸ“š Processing library: {library_name}") + + # Check if already extracted + extract_path = output_dir / library_name + + if skip_download and extract_path.exists(): + print(f" ā„¹ļø Using existing files at {extract_path}") + else: + # Download and extract + extract_path = download_and_extract_zip(library_name, output_dir) + if extract_path is None: + return [] + + # Find all markdown files + markdown_files = find_markdown_files(extract_path) + print(f" Found {len(markdown_files)} markdown files") + + if not markdown_files: + print(f" āš ļø No markdown files found for {library_name}") + return [] + + # Process each markdown file + all_chunks = [] + print(" Processing markdown files...") + for md_file in tqdm(markdown_files, desc=f" {library_name}", unit="file"): + chunks = process_markdown_file(md_file, library_name, extract_path, excerpts_max_length) + all_chunks.extend(chunks) + + print(f" āœ… Generated {len(all_chunks)} chunks from {len(markdown_files)} files") + + return all_chunks + + +def process_all_libraries( + output_dir: Path | None = None, + excerpts_max_length: int = 1000, + libraries: list[str] | None = None, + skip_download: bool = False +) -> dict: + """ + Process all libraries from the HF doc-build dataset. + + Args: + output_dir: Directory for temporary files (uses temp dir if None) + excerpts_max_length: Maximum length of each excerpt + libraries: List of specific libraries to process (None = all) + skip_download: Skip download if files already exist + + Returns: + Dictionary mapping library names to their chunks + """ + if output_dir is None: + output_dir = Path(tempfile.mkdtemp(prefix="hf_docs_")) + print(f"Using temporary directory: {output_dir}") + else: + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Fetch library directories + directories = fetch_library_directories() + + # Filter if specific libraries requested + if libraries: + directories = [d for d in directories if d['path'] in libraries] + print(f"Processing {len(directories)} requested libraries: {libraries}") + + # Process each library + results = {} + for directory in directories: + library_name = directory['path'] + chunks = process_library(library_name, output_dir, excerpts_max_length, skip_download) + results[library_name] = chunks + + # Summary + print("\n" + "=" * 80) + print("šŸ“Š SUMMARY") + print("=" * 80) + total_chunks = 0 + for library_name, chunks in results.items(): + print(f" {library_name}: {len(chunks)} chunks") + total_chunks += len(chunks) + print(f"\n Total: {total_chunks} chunks across {len(results)} libraries") + print("=" * 80) + + return results + + +def save_chunks_to_json(chunks: list[Chunk], output_file: Path): + """ + Save chunks to a JSON file. 
+ + Args: + chunks: List of Chunk objects + output_file: Path to output JSON file + """ + # Convert chunks to dictionaries + chunks_data = [ + { + 'text': chunk.text, + 'source_page_url': chunk.source_page_url, + 'source_page_title': chunk.source_page_title, + 'package_name': chunk.package_name, + 'headings': chunk.headings + } + for chunk in chunks + ] + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(chunks_data, f, indent=2, ensure_ascii=False) + + print(f"Saved {len(chunks)} chunks to {output_file}") + + +if __name__ == "__main__": + # Example usage + import argparse + + parser = argparse.ArgumentParser(description="Process HuggingFace documentation from doc-build dataset") + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)" + ) + parser.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers)" + ) + parser.add_argument( + "--excerpt-length", + type=int, + default=1000, + help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser.add_argument( + "--skip-download", + action="store_true", + help="Skip download if files already exist in output-dir" + ) + parser.add_argument( + "--save-json", + type=str, + default=None, + help="Save all chunks to a JSON file" + ) + + args = parser.parse_args() + + # Process libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries, + skip_download=args.skip_download + ) + + # Save to JSON if requested + if args.save_json: + all_chunks = [] + for chunks in results.values(): + all_chunks.extend(chunks) + save_chunks_to_json(all_chunks, Path(args.save_json)) + From e471eebd1c124ff36afe7be9bba3d4f0be27fcd3 Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 14:53:05 +0100 Subject: [PATCH 2/6] format --- src/doc_builder/build_embeddings.py | 34 ++++++------ src/doc_builder/commands/embeddings.py | 35 +++++------- src/doc_builder/process_hf_docs.py | 75 ++++++++++---------------- 3 files changed, 60 insertions(+), 84 deletions(-) diff --git a/src/doc_builder/build_embeddings.py b/src/doc_builder/build_embeddings.py index bbd4c52c..77a12f02 100644 --- a/src/doc_builder/build_embeddings.py +++ b/src/doc_builder/build_embeddings.py @@ -435,7 +435,7 @@ def split_into_excerpts(text: str, max_length: int) -> list[str]: # Look for the next word boundary after "max_length" characters remaining_text = text[end_index:] - word_boundary_match = re.search(r'\b', remaining_text) + word_boundary_match = re.search(r"\b", remaining_text) margin = 50 @@ -448,7 +448,7 @@ def split_into_excerpts(text: str, max_length: int) -> list[str]: for i in range(end_index, max(current_index + max_length - margin, current_index), -1): if i < len(text): char = text[i] - if char in [' ', '\n', '.', ',', ';', '!', '?']: + if char in [" ", "\n", ".", ",", ";", "!", "?"]: break_point = i + 1 break end_index = break_point @@ -472,11 +472,11 @@ def build_headings_object(heading_stack: list[str]) -> dict: headings = {} for heading in heading_stack: - match = re.match(r'^(#{1,6})\s+(.+)$', heading) + match = re.match(r"^(#{1,6})\s+(.+)$", heading) if match: level = len(match.group(1)) text = match.group(2).strip() - headings[f'heading{level}'] = text + headings[f"heading{level}"] = text return headings @@ -502,15 +502,17 @@ def 
split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = while line_index < len(lines): line = lines[line_index] - heading_match = re.match(r'^(#{1,6})\s+(.+)$', line) + heading_match = re.match(r"^(#{1,6})\s+(.+)$", line) if heading_match: # Save the previous section if it has content if current_section.strip(): - sections.append({ - 'excerpts': split_into_excerpts(current_section.strip(), excerpts_max_length), - 'headings': build_headings_object(heading_stack) - }) + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) # Parse the heading heading_level = len(heading_match.group(1)) @@ -521,7 +523,7 @@ def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = # Keep only headings with lower level than current new_stack = [] for h in heading_stack: - h_match = re.match(r'^(#{1,6})', h) + h_match = re.match(r"^(#{1,6})", h) if h_match: existing_level = len(h_match.group(1)) if existing_level < heading_level: @@ -538,7 +540,7 @@ def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = line_index += 1 while line_index < len(lines): next_line = lines[line_index] - next_heading_match = re.match(r'^(#{1,6})\s+(.+)$', next_line) + next_heading_match = re.match(r"^(#{1,6})\s+(.+)$", next_line) if next_heading_match: # Found next heading, break to process it @@ -557,10 +559,12 @@ def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = # Add the last section if current_section.strip(): - sections.append({ - 'excerpts': split_into_excerpts(current_section.strip(), excerpts_max_length), - 'headings': build_headings_object(heading_stack) - }) + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) return sections diff --git a/src/doc_builder/commands/embeddings.py b/src/doc_builder/commands/embeddings.py index 0dda56ff..e85539a6 100644 --- a/src/doc_builder/commands/embeddings.py +++ b/src/doc_builder/commands/embeddings.py @@ -38,7 +38,7 @@ def process_hf_docs_command(args): output_dir=Path(args.output_dir) if args.output_dir else None, excerpts_max_length=args.excerpt_length, libraries=args.libraries if args.libraries else None, - skip_download=args.skip_download + skip_download=args.skip_download, ) # If embeddings are requested @@ -62,7 +62,7 @@ def process_hf_docs_command(args): args.hf_ie_name, args.hf_ie_namespace, args.hf_ie_token, - is_python_module=False # Pre-built docs are not Python modules + is_python_module=False, # Pre-built docs are not Python modules ) # Push to Meilisearch @@ -73,10 +73,7 @@ def process_hf_docs_command(args): client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key) ITEMS_PER_CHUNK = 5000 - for chunk_embeddings in tqdm( - chunk_list(embeddings, ITEMS_PER_CHUNK), - desc="Uploading to meilisearch" - ): + for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading to meilisearch"): add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings) print(f"\nāœ… Successfully uploaded {len(embeddings)} embeddings to Meilisearch") @@ -134,54 +131,46 @@ def embeddings_command_parser(subparsers=None): "--output-dir", type=str, default=None, - help="Directory for downloaded/extracted files (uses temp dir if not specified)" + help="Directory for downloaded/extracted files (uses temp dir if not specified)", ) 
parser_process_hf_docs.add_argument( "--libraries", type=str, nargs="+", default=None, - help="Specific libraries to process (e.g., accelerate diffusers). If not specified, processes all libraries." + help="Specific libraries to process (e.g., accelerate diffusers). If not specified, processes all libraries.", ) parser_process_hf_docs.add_argument( - "--excerpt-length", - type=int, - default=1000, - help="Maximum length of each excerpt in characters (default: 1000)" + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" ) parser_process_hf_docs.add_argument( - "--skip-download", - action="store_true", - help="Skip download if files already exist in output-dir" + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" ) parser_process_hf_docs.add_argument( "--skip-embeddings", action="store_true", - help="Skip embedding generation and meilisearch upload (useful for testing)" + help="Skip embedding generation and meilisearch upload (useful for testing)", ) parser_process_hf_docs.add_argument( "--hf_ie_name", type=str, help="Inference Endpoints name (required unless --skip-embeddings is set)", - required=False + required=False, ) parser_process_hf_docs.add_argument( "--hf_ie_namespace", type=str, help="Inference Endpoints namespace (required unless --skip-embeddings is set)", - required=False + required=False, ) parser_process_hf_docs.add_argument( - "--hf_ie_token", - type=str, - help="Hugging Face token (required unless --skip-embeddings is set)", - required=False + "--hf_ie_token", type=str, help="Hugging Face token (required unless --skip-embeddings is set)", required=False ) parser_process_hf_docs.add_argument( "--meilisearch_key", type=str, help="Meilisearch key (required unless --skip-embeddings is set)", - required=False + required=False, ) if subparsers is not None: diff --git a/src/doc_builder/process_hf_docs.py b/src/doc_builder/process_hf_docs.py index 4f295aa0..1e002c4a 100644 --- a/src/doc_builder/process_hf_docs.py +++ b/src/doc_builder/process_hf_docs.py @@ -77,11 +77,11 @@ def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None response.raise_for_status() # Get total size for progress bar - total_size = int(response.headers.get('content-length', 0)) + total_size = int(response.headers.get("content-length", 0)) # Download to memory zip_content = io.BytesIO() - with tqdm(total=total_size, unit='B', unit_scale=True, desc=f" {library_name}") as pbar: + with tqdm(total=total_size, unit="B", unit_scale=True, desc=f" {library_name}") as pbar: for chunk in response.iter_content(chunk_size=8192): zip_content.write(chunk) pbar.update(len(chunk)) @@ -170,10 +170,7 @@ def get_page_title(file_path: Path) -> str: def process_markdown_file( - file_path: Path, - library_name: str, - base_dir: Path, - excerpts_max_length: int = 1000 + file_path: Path, library_name: str, base_dir: Path, excerpts_max_length: int = 1000 ) -> list[Chunk]: """ Process a single markdown file into chunks. 
@@ -188,7 +185,7 @@ def process_markdown_file( List of Chunk objects """ try: - with open(file_path, encoding='utf-8') as f: + with open(file_path, encoding="utf-8") as f: content = f.read() # Split markdown by headings @@ -201,16 +198,16 @@ def process_markdown_file( # Convert sections to Chunks chunks = [] for section in sections: - headings_dict = section['headings'] + headings_dict = section["headings"] # Create heading list from the dictionary heading_list = [] for i in range(1, 7): - heading_key = f'heading{i}' + heading_key = f"heading{i}" if heading_key in headings_dict: # Reconstruct the heading with # marks heading_text = headings_dict[heading_key] - heading_list.append('#' * i + ' ' + heading_text) + heading_list.append("#" * i + " " + heading_text) # Generate URL with anchor (use first heading as anchor) url = base_url @@ -218,25 +215,25 @@ def process_markdown_file( # Use the deepest heading for anchor last_heading = None for i in range(6, 0, -1): - if f'heading{i}' in headings_dict: - last_heading = headings_dict[f'heading{i}'] + if f"heading{i}" in headings_dict: + last_heading = headings_dict[f"heading{i}"] break if last_heading: # Create anchor from heading (lowercase, replace spaces with hyphens) - anchor = last_heading.lower().replace(' ', '-') + anchor = last_heading.lower().replace(" ", "-") # Remove special characters - anchor = ''.join(c for c in anchor if c.isalnum() or c == '-') + anchor = "".join(c for c in anchor if c.isalnum() or c == "-") url = f"{base_url}#{anchor}" # Create a chunk for each excerpt - for excerpt in section['excerpts']: + for excerpt in section["excerpts"]: chunk = Chunk( text=excerpt, source_page_url=url, source_page_title=page_title, package_name=library_name, - headings=heading_list + headings=heading_list, ) chunks.append(chunk) @@ -248,10 +245,7 @@ def process_markdown_file( def process_library( - library_name: str, - output_dir: Path, - excerpts_max_length: int = 1000, - skip_download: bool = False + library_name: str, output_dir: Path, excerpts_max_length: int = 1000, skip_download: bool = False ) -> list[Chunk]: """ Process a single library: download, extract, and chunk all markdown files. @@ -302,7 +296,7 @@ def process_all_libraries( output_dir: Path | None = None, excerpts_max_length: int = 1000, libraries: list[str] | None = None, - skip_download: bool = False + skip_download: bool = False, ) -> dict: """ Process all libraries from the HF doc-build dataset. 
@@ -328,13 +322,13 @@ def process_all_libraries( # Filter if specific libraries requested if libraries: - directories = [d for d in directories if d['path'] in libraries] + directories = [d for d in directories if d["path"] in libraries] print(f"Processing {len(directories)} requested libraries: {libraries}") # Process each library results = {} for directory in directories: - library_name = directory['path'] + library_name = directory["path"] chunks = process_library(library_name, output_dir, excerpts_max_length, skip_download) results[library_name] = chunks @@ -363,16 +357,16 @@ def save_chunks_to_json(chunks: list[Chunk], output_file: Path): # Convert chunks to dictionaries chunks_data = [ { - 'text': chunk.text, - 'source_page_url': chunk.source_page_url, - 'source_page_title': chunk.source_page_title, - 'package_name': chunk.package_name, - 'headings': chunk.headings + "text": chunk.text, + "source_page_url": chunk.source_page_url, + "source_page_title": chunk.source_page_title, + "package_name": chunk.package_name, + "headings": chunk.headings, } for chunk in chunks ] - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(chunks_data, f, indent=2, ensure_ascii=False) print(f"Saved {len(chunks)} chunks to {output_file}") @@ -387,32 +381,22 @@ def save_chunks_to_json(chunks: list[Chunk], output_file: Path): "--output-dir", type=str, default=None, - help="Directory for downloaded/extracted files (uses temp dir if not specified)" + help="Directory for downloaded/extracted files (uses temp dir if not specified)", ) parser.add_argument( "--libraries", type=str, nargs="+", default=None, - help="Specific libraries to process (e.g., accelerate diffusers)" - ) - parser.add_argument( - "--excerpt-length", - type=int, - default=1000, - help="Maximum length of each excerpt in characters (default: 1000)" + help="Specific libraries to process (e.g., accelerate diffusers)", ) parser.add_argument( - "--skip-download", - action="store_true", - help="Skip download if files already exist in output-dir" + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" ) parser.add_argument( - "--save-json", - type=str, - default=None, - help="Save all chunks to a JSON file" + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" ) + parser.add_argument("--save-json", type=str, default=None, help="Save all chunks to a JSON file") args = parser.parse_args() @@ -421,7 +405,7 @@ def save_chunks_to_json(chunks: list[Chunk], output_file: Path): output_dir=Path(args.output_dir) if args.output_dir else None, excerpts_max_length=args.excerpt_length, libraries=args.libraries, - skip_download=args.skip_download + skip_download=args.skip_download, ) # Save to JSON if requested @@ -430,4 +414,3 @@ def save_chunks_to_json(chunks: list[Chunk], output_file: Path): for chunks in results.values(): all_chunks.extend(chunks) save_chunks_to_json(all_chunks, Path(args.save_json)) - From d70c8a9c8b09c1aaeae2495d8b36bc33d90c617f Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 14:54:07 +0100 Subject: [PATCH 3/6] Remove the Daily Build Embeddings workflow from GitHub Actions. This workflow was responsible for building embeddings for various Hugging Face repositories and has been deprecated in favor of the new populate-search-engine command and workflow. 
--- .github/workflows/build_embeddings.yml | 179 ------------------------- 1 file changed, 179 deletions(-) delete mode 100644 .github/workflows/build_embeddings.yml diff --git a/.github/workflows/build_embeddings.yml b/.github/workflows/build_embeddings.yml deleted file mode 100644 index 3ceb0b1a..00000000 --- a/.github/workflows/build_embeddings.yml +++ /dev/null @@ -1,179 +0,0 @@ -name: Daily Build Embeddings - -env: - DIFFUSERS_SLOW_IMPORT: yes - -on: - schedule: - - cron: "5 7 * * *" # every day at 07:05 - # to run this workflow manually from the Actions tab - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: false - -jobs: - matrix-job: - runs-on: ubuntu-latest - container: huggingface/transformers-doc-builder - strategy: - max-parallel: 1 # run sequentially - matrix: - include: - - repo_id: huggingface/tokenizers - doc_folder: docs/source-doc-builder - package_path: bindings/python - - repo_id: huggingface/diffusers - doc_folder: docs/source/en - - repo_id: huggingface/accelerate - doc_folder: docs/source - - repo_id: huggingface/huggingface_hub - doc_folder: docs/source/en - - repo_id: huggingface/transformers - doc_folder: docs/source/en - - repo_id: huggingface/hub-docs - doc_folder: docs/hub - package_name: hub - is_not_python_module: true - - repo_id: huggingface/huggingface.js - doc_folder: docs - is_not_python_module: true - pre_command: npm install -g corepack@latest && corepack enable && cd huggingface.js && pnpm install && pnpm -r build && pnpm --filter doc-internal start - - repo_id: huggingface/transformers.js - doc_folder: docs/source - is_not_python_module: true - - repo_id: huggingface/smolagents - doc_folder: docs/source/en - - repo_id: huggingface/peft - doc_folder: docs/source - - repo_id: huggingface/trl - doc_folder: docs/source - - repo_id: bitsandbytes-foundation/bitsandbytes - doc_folder: docs/source - - repo_id: huggingface/lerobot - doc_folder: docs/source - - repo_id: huggingface/pytorch-image-models - doc_folder: hfdocs/source - package_name: timm - - repo_id: huggingface/hub-docs - doc_folder: docs/inference-providers - package_name: inference-providers - is_not_python_module: true - - repo_id: huggingface/safetensors - doc_folder: docs/source - package_path: bindings/python - - repo_id: huggingface/hf-endpoints-documentation - doc_folder: docs/source - package_name: inference-endpoints - is_not_python_module: true - - repo_id: huggingface/dataset-viewer - doc_folder: docs/source - package_name: dataset-viewer - is_not_python_module: true - timeout-minutes: 360 # Set timeout to 6 hours - steps: - - name: Setup REPO_NAME - shell: bash - run: | - current_path=$(pwd) - repo_id="${{ matrix.repo_id }}" - repo_name="${repo_id#*/}" - echo "REPO_NAME=${repo_name}" >> $GITHUB_ENV - - - name: Checkout repository - uses: actions/checkout@v2 - with: - repository: ${{ matrix.repo_id }} - path: ${{ github.workspace }}/${{ env.REPO_NAME }} - - - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Install libgl1 - run: apt-get install -y libgl1 - - - name: Export PIP_OR_UV ('pip' or 'uv pip') - run: | - if [ -z "${{ job.container }}" ] - then - echo "PIP_OR_UV=uv pip" >> $GITHUB_ENV - else - echo "PIP_OR_UV=pip" >> $GITHUB_ENV - fi - - - name: Setup environment - shell: bash - run: | - if [[ "${{ matrix.is_not_python_module }}" != "true" ]]; then - current_path=$(pwd) - cd ${{ env.REPO_NAME }} - if [[ -n "${{ matrix.package_path }}" ]]; then - cd ${{ matrix.package_path }} - $PIP_OR_UV install 
.[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - cd $current_path - else - $PIP_OR_UV install .[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - cd $current_path - fi - fi - - rm -rf doc-builder - rm -rf .git - git clone https://github.com/huggingface/doc-builder.git - cd doc-builder - git fetch - git checkout main - $PIP_OR_UV install . - - - name: Run pre-command - shell: bash - run: | - if [ ! -z "${{ matrix.pre_command }}" ] - then - bash -c "${{ matrix.pre_command }}" - fi - - - name: Build embeddings - shell: bash - run: | - echo Building docs for ${{ matrix.package_name || env.REPO_NAME }} - FLAGS="" - if [[ "${{ matrix.is_not_python_module }}" == "true" ]]; then - FLAGS="--not_python_module" - fi - doc-builder embeddings ${{ matrix.package_name || env.REPO_NAME }} ${{ env.REPO_NAME }}/${{ matrix.doc_folder }} --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} $FLAGS - - gradio-job: - runs-on: ubuntu-latest - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Add gradio docs to meilisearch - run: doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} - - cleanup-job: - needs: matrix-job - runs-on: ubuntu-latest - if: always() # This ensures that the cleanup job runs regardless of the result of matrix-job - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Success Cleanup - if: needs.matrix-job.result == 'success' # Runs if all matrix jobs succeeded - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap - - - name: Failure Cleanup - if: needs.matrix-job.result == 'failure' # Runs if any matrix job failed - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} From 989ef7dc2b3093fd9fc66d38f21bd4f4ae3a9c0a Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 14:57:06 +0100 Subject: [PATCH 4/6] wip --- .github/workflows/populate_search_engine.yml | 42 ++++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml index ebcd1c0a..44f32af8 100644 --- a/.github/workflows/populate_search_engine.yml +++ b/.github/workflows/populate_search_engine.yml @@ -60,30 +60,30 @@ jobs: echo "Running: $CMD" $CMD - cleanup-job: - needs: process-docs - runs-on: ubuntu-latest - if: always() # This ensures that the cleanup job runs regardless of the result - steps: - - name: Checkout doc-builder - uses: actions/checkout@v4 + # cleanup-job: + # needs: process-docs + # runs-on: ubuntu-latest + # if: always() # This ensures that the cleanup job runs regardless of the result + # steps: + # - name: Checkout doc-builder + # uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v4 - with: - version: "latest" + # - name: Install uv + # uses: astral-sh/setup-uv@v4 + # with: + # version: "latest" - - name: Set up Python 3.10 - run: uv python install 3.10 + # - name: Set up Python 3.10 + # run: uv python install 3.10 - - name: Install doc-builder - run: uv sync --extra dev + # - name: Install doc-builder + # run: uv sync --extra dev - - name: Success Cleanup - if: 
needs.process-docs.result == 'success' # Runs if job succeeded - run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap + # - name: Success Cleanup + # if: needs.process-docs.result == 'success' # Runs if job succeeded + # run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap - - name: Failure Cleanup - if: needs.process-docs.result == 'failure' # Runs if job failed - run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + # - name: Failure Cleanup + # if: needs.process-docs.result == 'failure' # Runs if job failed + # run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} From ce8681c0e257ebe5a56516570918ffc537f2fa6e Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 14:59:46 +0100 Subject: [PATCH 5/6] wip --- .github/workflows/populate_search_engine.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml index 44f32af8..68464293 100644 --- a/.github/workflows/populate_search_engine.yml +++ b/.github/workflows/populate_search_engine.yml @@ -1,6 +1,7 @@ name: Populate Search Engine on: + push: schedule: - cron: "5 7 * * *" # every day at 07:05 # to run this workflow manually from the Actions tab From 80954a9745df3a43132457758306c08c032cb0e6 Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Thu, 13 Nov 2025 15:10:11 +0100 Subject: [PATCH 6/6] Add gradio-job to preserve Gradio docs special handling - Gradio docs use separate JSON format from gradio/docs dataset - Gradio is not in hf-doc-build dataset, needs separate processing - Uncomment cleanup-job and update to depend on both jobs - Remove accidental 'on: push' trigger --- .github/workflows/populate_search_engine.yml | 73 ++++++++++++-------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml index 68464293..05c1cf14 100644 --- a/.github/workflows/populate_search_engine.yml +++ b/.github/workflows/populate_search_engine.yml @@ -1,7 +1,6 @@ name: Populate Search Engine on: - push: schedule: - cron: "5 7 * * *" # every day at 07:05 # to run this workflow manually from the Actions tab @@ -61,30 +60,50 @@ jobs: echo "Running: $CMD" $CMD - # cleanup-job: - # needs: process-docs - # runs-on: ubuntu-latest - # if: always() # This ensures that the cleanup job runs regardless of the result - # steps: - # - name: Checkout doc-builder - # uses: actions/checkout@v4 - - # - name: Install uv - # uses: astral-sh/setup-uv@v4 - # with: - # version: "latest" - - # - name: Set up Python 3.10 - # run: uv python install 3.10 - - # - name: Install doc-builder - # run: uv sync --extra dev - - # - name: Success Cleanup - # if: needs.process-docs.result == 'success' # Runs if job succeeded - # run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap - - # - name: Failure Cleanup - # if: needs.process-docs.result == 'failure' # Runs if job failed - # run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + gradio-job: + runs-on: ubuntu-latest + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: 
Add gradio docs to meilisearch + run: uv run doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + + cleanup-job: + needs: [process-docs, gradio-job] + runs-on: ubuntu-latest + if: always() # This ensures that the cleanup job runs regardless of the result + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Success Cleanup + if: needs.process-docs.result == 'success' # Runs if job succeeded + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap + + - name: Failure Cleanup + if: needs.process-docs.result == 'failure' # Runs if job failed + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }}
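
[Editor's note: a hedged sketch of driving the new process_hf_docs module directly from Python, mirroring what `doc-builder populate-search-engine --skip-embeddings` does in the workflow above. Only functions added by this patch are used (process_all_libraries, save_chunks_to_json); the output paths and the library subset are illustrative, not part of the patch.]

# Sketch: chunk a couple of libraries locally and dump the chunks to JSON for
# inspection, without calling the inference endpoint or Meilisearch.
from pathlib import Path

from doc_builder.process_hf_docs import process_all_libraries, save_chunks_to_json

results = process_all_libraries(
    output_dir=Path("./hf_docs_cache"),     # illustrative cache directory
    excerpts_max_length=1000,
    libraries=["accelerate", "diffusers"],  # small subset for a quick test
    skip_download=False,
)

# Flatten the per-library chunk lists and save them for manual review.
all_chunks = [chunk for chunks in results.values() for chunk in chunks]
save_chunks_to_json(all_chunks, Path("./chunks.json"))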