diff --git a/.github/workflows/build_embeddings.yml b/.github/workflows/build_embeddings.yml deleted file mode 100644 index 3ceb0b1a..00000000 --- a/.github/workflows/build_embeddings.yml +++ /dev/null @@ -1,179 +0,0 @@ -name: Daily Build Embeddings - -env: - DIFFUSERS_SLOW_IMPORT: yes - -on: - schedule: - - cron: "5 7 * * *" # every day at 07:05 - # to run this workflow manually from the Actions tab - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: false - -jobs: - matrix-job: - runs-on: ubuntu-latest - container: huggingface/transformers-doc-builder - strategy: - max-parallel: 1 # run sequentially - matrix: - include: - - repo_id: huggingface/tokenizers - doc_folder: docs/source-doc-builder - package_path: bindings/python - - repo_id: huggingface/diffusers - doc_folder: docs/source/en - - repo_id: huggingface/accelerate - doc_folder: docs/source - - repo_id: huggingface/huggingface_hub - doc_folder: docs/source/en - - repo_id: huggingface/transformers - doc_folder: docs/source/en - - repo_id: huggingface/hub-docs - doc_folder: docs/hub - package_name: hub - is_not_python_module: true - - repo_id: huggingface/huggingface.js - doc_folder: docs - is_not_python_module: true - pre_command: npm install -g corepack@latest && corepack enable && cd huggingface.js && pnpm install && pnpm -r build && pnpm --filter doc-internal start - - repo_id: huggingface/transformers.js - doc_folder: docs/source - is_not_python_module: true - - repo_id: huggingface/smolagents - doc_folder: docs/source/en - - repo_id: huggingface/peft - doc_folder: docs/source - - repo_id: huggingface/trl - doc_folder: docs/source - - repo_id: bitsandbytes-foundation/bitsandbytes - doc_folder: docs/source - - repo_id: huggingface/lerobot - doc_folder: docs/source - - repo_id: huggingface/pytorch-image-models - doc_folder: hfdocs/source - package_name: timm - - repo_id: huggingface/hub-docs - doc_folder: docs/inference-providers - package_name: inference-providers - is_not_python_module: true - - repo_id: huggingface/safetensors - doc_folder: docs/source - package_path: bindings/python - - repo_id: huggingface/hf-endpoints-documentation - doc_folder: docs/source - package_name: inference-endpoints - is_not_python_module: true - - repo_id: huggingface/dataset-viewer - doc_folder: docs/source - package_name: dataset-viewer - is_not_python_module: true - timeout-minutes: 360 # Set timeout to 6 hours - steps: - - name: Setup REPO_NAME - shell: bash - run: | - current_path=$(pwd) - repo_id="${{ matrix.repo_id }}" - repo_name="${repo_id#*/}" - echo "REPO_NAME=${repo_name}" >> $GITHUB_ENV - - - name: Checkout repository - uses: actions/checkout@v2 - with: - repository: ${{ matrix.repo_id }} - path: ${{ github.workspace }}/${{ env.REPO_NAME }} - - - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Install libgl1 - run: apt-get install -y libgl1 - - - name: Export PIP_OR_UV ('pip' or 'uv pip') - run: | - if [ -z "${{ job.container }}" ] - then - echo "PIP_OR_UV=uv pip" >> $GITHUB_ENV - else - echo "PIP_OR_UV=pip" >> $GITHUB_ENV - fi - - - name: Setup environment - shell: bash - run: | - if [[ "${{ matrix.is_not_python_module }}" != "true" ]]; then - current_path=$(pwd) - cd ${{ env.REPO_NAME }} - if [[ -n "${{ matrix.package_path }}" ]]; then - cd ${{ matrix.package_path }} - $PIP_OR_UV install .[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - cd $current_path - else - $PIP_OR_UV install .[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - 
cd $current_path - fi - fi - - rm -rf doc-builder - rm -rf .git - git clone https://github.com/huggingface/doc-builder.git - cd doc-builder - git fetch - git checkout main - $PIP_OR_UV install . - - - name: Run pre-command - shell: bash - run: | - if [ ! -z "${{ matrix.pre_command }}" ] - then - bash -c "${{ matrix.pre_command }}" - fi - - - name: Build embeddings - shell: bash - run: | - echo Building docs for ${{ matrix.package_name || env.REPO_NAME }} - FLAGS="" - if [[ "${{ matrix.is_not_python_module }}" == "true" ]]; then - FLAGS="--not_python_module" - fi - doc-builder embeddings ${{ matrix.package_name || env.REPO_NAME }} ${{ env.REPO_NAME }}/${{ matrix.doc_folder }} --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} $FLAGS - - gradio-job: - runs-on: ubuntu-latest - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Add gradio docs to meilisearch - run: doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} - - cleanup-job: - needs: matrix-job - runs-on: ubuntu-latest - if: always() # This ensures that the cleanup job runs regardless of the result of matrix-job - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Success Cleanup - if: needs.matrix-job.result == 'success' # Runs if all matrix jobs succeeded - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap - - - name: Failure Cleanup - if: needs.matrix-job.result == 'failure' # Runs if any matrix job failed - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml new file mode 100644 index 00000000..05c1cf14 --- /dev/null +++ b/.github/workflows/populate_search_engine.yml @@ -0,0 +1,109 @@ +name: Populate Search Engine + +on: + schedule: + - cron: "5 7 * * *" # every day at 07:05 + # to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + libraries: + description: 'Specific libraries to process (space-separated, e.g., "accelerate diffusers"). Leave empty for all.' + required: false + default: '' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +jobs: + process-docs: + runs-on: ubuntu-latest + timeout-minutes: 360 # Set timeout to 6 hours + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Populate search engine from HF doc-build dataset + shell: bash + run: | + echo "Processing documentation from hf-doc-build/doc-build dataset..." + + # Build command + CMD="uv run doc-builder populate-search-engine" + + # Add library filter if specified + if [ ! 
-z "${{ github.event.inputs.libraries }}" ]; then + CMD="$CMD --libraries ${{ github.event.inputs.libraries }}" + fi + + # Add skip embeddings flag + CMD="$CMD --skip-embeddings" + + # Add credentials + CMD="$CMD --hf_ie_name docs-embed-bge-base-en-v1-5" + CMD="$CMD --hf_ie_namespace huggingface" + CMD="$CMD --hf_ie_token ${{ secrets.HF_IE_TOKEN }}" + CMD="$CMD --meilisearch_key ${{ secrets.MEILISEARCH_KEY }}" + + # Execute + echo "Running: $CMD" + $CMD + + gradio-job: + runs-on: ubuntu-latest + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Add gradio docs to meilisearch + run: uv run doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + + cleanup-job: + needs: [process-docs, gradio-job] + runs-on: ubuntu-latest + if: always() # This ensures that the cleanup job runs regardless of the result + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Success Cleanup + if: needs.process-docs.result == 'success' # Runs if job succeeded + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap + + - name: Failure Cleanup + if: needs.process-docs.result == 'failure' # Runs if job failed + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + diff --git a/src/doc_builder/build_embeddings.py b/src/doc_builder/build_embeddings.py index bb50adc3..77a12f02 100644 --- a/src/doc_builder/build_embeddings.py +++ b/src/doc_builder/build_embeddings.py @@ -408,6 +408,167 @@ def clean_md(text): return text.strip() +def split_into_excerpts(text: str, max_length: int) -> list[str]: + """ + Split text into excerpts of approximately max_length characters. 
+ + Args: + text: The text to split + max_length: Maximum length of each excerpt in characters + + Returns: + List of text excerpts + """ + if not text: + return [] + + excerpts = [] + current_index = 0 + + while current_index < len(text): + end_index = current_index + max_length + + # If we're at the end of the text, just take the rest + if end_index >= len(text): + excerpts.append(text[current_index:].strip()) + break + + # Look for the next word boundary after "max_length" characters + remaining_text = text[end_index:] + word_boundary_match = re.search(r"\b", remaining_text) + + margin = 50 + + if word_boundary_match and word_boundary_match.start() <= margin: + # Found a word boundary within 50 characters, use it + end_index = end_index + word_boundary_match.start() + else: + # No word boundary within 50 chars, fall back to finding good breaking points + break_point = end_index + for i in range(end_index, max(current_index + max_length - margin, current_index), -1): + if i < len(text): + char = text[i] + if char in [" ", "\n", ".", ",", ";", "!", "?"]: + break_point = i + 1 + break + end_index = break_point + + excerpts.append(text[current_index:end_index].strip()) + current_index = end_index + + return [excerpt for excerpt in excerpts if len(excerpt) > 0] + + +def build_headings_object(heading_stack: list[str]) -> dict: + """ + Build headings dictionary from heading stack. + + Args: + heading_stack: List of heading strings in format "## Heading Text" + + Returns: + Dictionary with heading1 through heading6 keys + """ + headings = {} + + for heading in heading_stack: + match = re.match(r"^(#{1,6})\s+(.+)$", heading) + if match: + level = len(match.group(1)) + text = match.group(2).strip() + headings[f"heading{level}"] = text + + return headings + + +def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = 1000) -> list[dict]: + """ + Split markdown content by headings and create sections with excerpts. + Similar to the TypeScript implementation for consistent chunking. 
+ + Args: + markdown_content: The markdown text to split + excerpts_max_length: Maximum length of each excerpt in characters (default: 1000) + + Returns: + List of dictionaries with 'excerpts' (list of text chunks) and 'headings' (dict) keys + """ + lines = markdown_content.split("\n") + sections = [] + + current_section = "" + heading_stack = [] + line_index = 0 + + while line_index < len(lines): + line = lines[line_index] + heading_match = re.match(r"^(#{1,6})\s+(.+)$", line) + + if heading_match: + # Save the previous section if it has content + if current_section.strip(): + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) + + # Parse the heading + heading_level = len(heading_match.group(1)) + heading_text = heading_match.group(2).strip() + full_heading = f"{heading_match.group(1)} {heading_text}" + + # Update heading stack based on level + # Keep only headings with lower level than current + new_stack = [] + for h in heading_stack: + h_match = re.match(r"^(#{1,6})", h) + if h_match: + existing_level = len(h_match.group(1)) + if existing_level < heading_level: + new_stack.append(h) + heading_stack = new_stack + + # Add current heading + heading_stack.append(full_heading) + + # Start new section with the heading + current_section = line + + # Look ahead to include content after heading + line_index += 1 + while line_index < len(lines): + next_line = lines[line_index] + next_heading_match = re.match(r"^(#{1,6})\s+(.+)$", next_line) + + if next_heading_match: + # Found next heading, break to process it + break + else: + # Add line to current section + current_section += "\n" + next_line + line_index += 1 + + # Don't increment line_index here since we either reached end or found next heading + continue + else: + # Add line to current section + current_section += ("\n" if current_section else "") + line + line_index += 1 + + # Add the last section + if current_section.strip(): + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) + + return sections + + def get_page_title(path: str): """ Given a path to doc page, generate doc page title. diff --git a/src/doc_builder/commands/embeddings.py b/src/doc_builder/commands/embeddings.py index fed7eb21..e85539a6 100644 --- a/src/doc_builder/commands/embeddings.py +++ b/src/doc_builder/commands/embeddings.py @@ -14,111 +14,76 @@ import argparse -import importlib +from pathlib import Path + +from doc_builder import clean_meilisearch +from doc_builder.build_embeddings import add_gradio_docs, call_embedding_inference +from doc_builder.meilisearch_helper import add_embeddings_to_db +from doc_builder.process_hf_docs import process_all_libraries +from doc_builder.utils import chunk_list + + +def process_hf_docs_command(args): + """ + Process documentation from HF doc-build dataset. + Downloads pre-built docs and generates embeddings. 
+ """ + import meilisearch + from tqdm import tqdm + + print("Processing documentation from HF doc-build dataset...") + + # Process all or specific libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries if args.libraries else None, + skip_download=args.skip_download, + ) -from doc_builder import build_embeddings, clean_meilisearch -from doc_builder.build_embeddings import add_gradio_docs -from doc_builder.utils import get_default_branch_name, get_doc_config, read_doc_config + # If embeddings are requested + if not args.skip_embeddings: + print("\n" + "=" * 80) + print("šŸ”¢ GENERATING EMBEDDINGS") + print("=" * 80) + # Collect all chunks + all_chunks = [] + for _library_name, chunks in results.items(): + all_chunks.extend(chunks) -def embeddings_command(args): - read_doc_config(args.path_to_docs) + print(f"\nTotal chunks to embed: {len(all_chunks)}") - default_version = get_default_branch_name(args.path_to_docs) - if args.not_python_module and args.version is None: - version = default_version - elif args.version is None: - module = importlib.import_module(args.library_name) - version = module.__version__ + # Generate embeddings + from doc_builder.build_embeddings import MEILI_INDEX_TEMP - if "dev" in version: - version = default_version - else: - version = f"v{version}" - else: - version = args.version - - # `version` will always start with prefix `v` - # `version_tag` does not have to start with prefix `v` (see: https://github.com/huggingface/datasets/tags) - version_tag = version - if version != default_version: - doc_config = get_doc_config() - version_prefix = getattr(doc_config, "version_prefix", "v") - version_ = version[1:] # v2.1.0 -> 2.1.0 - version_tag = f"{version_prefix}{version_}" - - # Disable notebook building for non-master version - if version != default_version: - args.notebook_dir = None - - print("Building embeddings for", args.library_name, args.path_to_docs) - build_embeddings( - args.library_name, - args.path_to_docs, - args.hf_ie_name, - args.hf_ie_namespace, - args.hf_ie_token, - args.meilisearch_key, - version=version, - version_tag=version_tag, - language=args.language, - is_python_module=not args.not_python_module, - version_tag_suffix=args.version_tag_suffix, - repo_owner=args.repo_owner, - repo_name=args.repo_name, - ) + embeddings = call_embedding_inference( + all_chunks, + args.hf_ie_name, + args.hf_ie_namespace, + args.hf_ie_token, + is_python_module=False, # Pre-built docs are not Python modules + ) + # Push to Meilisearch + print("\n" + "=" * 80) + print("šŸ“¤ UPLOADING TO MEILISEARCH") + print("=" * 80) -def embeddings_command_parser(subparsers=None): - if subparsers is not None: - parser = subparsers.add_parser("embeddings") - else: - parser = argparse.ArgumentParser("Doc Builder embeddings command") + client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key) + ITEMS_PER_CHUNK = 5000 + + for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading to meilisearch"): + add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings) + + print(f"\nāœ… Successfully uploaded {len(embeddings)} embeddings to Meilisearch") + + print("\n" + "=" * 80) + print("āœ… PROCESSING COMPLETE") + print("=" * 80) - parser.add_argument("library_name", type=str, help="Library name") - parser.add_argument( - "path_to_docs", - type=str, - help="Local path to library documentation. 
The library should be cloned, and the folder containing the " - "documentation files should be indicated here.", - ) - parser.add_argument("--hf_ie_name", type=str, help="Inference Endpoints name.", required=True) - parser.add_argument("--hf_ie_namespace", type=str, help="Inference Endpoints namespace.", required=True) - parser.add_argument("--hf_ie_token", type=str, help="Hugging Face token.", required=True) - parser.add_argument("--meilisearch_key", type=str, help="Meilisearch key.", required=True) - parser.add_argument("--language", type=str, help="Language of the documentation to generate", default="en") - parser.add_argument( - "--version", - type=str, - help="Version of the documentation to generate. Will default to the version of the package module (using " - "`main` for a version containing dev).", - ) - parser.add_argument( - "--not_python_module", - action="store_true", - help="Whether docs files do NOT have corresponding python module (like HF course & hub docs).", - ) - parser.add_argument( - "--version_tag_suffix", - type=str, - default="src/", - help="Suffix to add after the version tag (e.g. 1.3.0 or main) in the documentation links. For example, the default `src/` suffix will result in a base link as `https://github.com/huggingface/{package_name}/blob/{version_tag}/src/`.", - ) - parser.add_argument( - "--repo_owner", - type=str, - default="huggingface", - help="Owner of the repo (e.g. huggingface, rwightman, etc.).", - ) - parser.add_argument( - "--repo_name", - type=str, - default=None, - help="Name of the repo (e.g. transformers, pytorch-image-models, etc.). By default, this is the same as the library_name.", - ) - if subparsers is not None: - parser.set_defaults(func=embeddings_command) +def embeddings_command_parser(subparsers=None): # meilsiearch clean: swap & delete the temp index if subparsers is not None: parser_meilisearch_clean = subparsers.add_parser("meilisearch-clean") @@ -154,4 +119,59 @@ def embeddings_command_parser(subparsers=None): ) ) - return parser + # populate-search-engine: process documentation from HF doc-build dataset and populate search engine + if subparsers is not None: + parser_process_hf_docs = subparsers.add_parser("populate-search-engine") + else: + parser_process_hf_docs = argparse.ArgumentParser( + "Doc Builder populate-search-engine command. Process pre-built documentation from HF doc-build dataset and populate search engine." + ) + + parser_process_hf_docs.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)", + ) + parser_process_hf_docs.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers). 
If not specified, processes all libraries.", + ) + parser_process_hf_docs.add_argument( + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser_process_hf_docs.add_argument( + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" + ) + parser_process_hf_docs.add_argument( + "--skip-embeddings", + action="store_true", + help="Skip embedding generation and meilisearch upload (useful for testing)", + ) + parser_process_hf_docs.add_argument( + "--hf_ie_name", + type=str, + help="Inference Endpoints name (required unless --skip-embeddings is set)", + required=False, + ) + parser_process_hf_docs.add_argument( + "--hf_ie_namespace", + type=str, + help="Inference Endpoints namespace (required unless --skip-embeddings is set)", + required=False, + ) + parser_process_hf_docs.add_argument( + "--hf_ie_token", type=str, help="Hugging Face token (required unless --skip-embeddings is set)", required=False + ) + parser_process_hf_docs.add_argument( + "--meilisearch_key", + type=str, + help="Meilisearch key (required unless --skip-embeddings is set)", + required=False, + ) + + if subparsers is not None: + parser_process_hf_docs.set_defaults(func=process_hf_docs_command) diff --git a/src/doc_builder/process_hf_docs.py b/src/doc_builder/process_hf_docs.py new file mode 100644 index 00000000..1e002c4a --- /dev/null +++ b/src/doc_builder/process_hf_docs.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Process documentation from HuggingFace doc-build dataset. +Downloads and processes pre-built documentation markdown files. +""" + +import io +import json +import os +import tempfile +import zipfile +from collections import namedtuple +from pathlib import Path + +import requests +from tqdm import tqdm + +from .build_embeddings import split_markdown_by_headings + +Chunk = namedtuple("Chunk", "text source_page_url source_page_title package_name headings") + +HF_DATASET_REPO = "hf-doc-build/doc-build" +HF_DATASET_API_URL = f"https://huggingface.co/api/datasets/{HF_DATASET_REPO}/tree/main" +HF_DATASET_BASE_URL = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main" + + +def fetch_library_directories() -> list[dict]: + """ + Fetch the list of library directories from the HF doc-build dataset. + + Returns: + List of directory metadata dictionaries with 'path' and 'oid' keys + """ + print(f"Fetching library directories from {HF_DATASET_API_URL}...") + response = requests.get(HF_DATASET_API_URL) + response.raise_for_status() + + data = response.json() + + # Filter only directories + directories = [item for item in data if item.get("type") == "directory"] + + print(f"Found {len(directories)} library directories") + return directories + + +def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None: + """ + Download and extract the main.zip file for a library. 
+ + Args: + library_name: Name of the library (e.g., 'accelerate') + output_dir: Directory to extract files to + + Returns: + Path to extracted directory, or None if download failed + """ + zip_url = f"{HF_DATASET_BASE_URL}/{library_name}/main.zip" + + try: + print(f" Downloading {zip_url}...") + response = requests.get(zip_url, stream=True) + response.raise_for_status() + + # Get total size for progress bar + total_size = int(response.headers.get("content-length", 0)) + + # Download to memory + zip_content = io.BytesIO() + with tqdm(total=total_size, unit="B", unit_scale=True, desc=f" {library_name}") as pbar: + for chunk in response.iter_content(chunk_size=8192): + zip_content.write(chunk) + pbar.update(len(chunk)) + + # Extract zip + zip_content.seek(0) + extract_path = output_dir / library_name + extract_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(zip_content) as zip_ref: + zip_ref.extractall(extract_path) + + print(f" Extracted to {extract_path}") + return extract_path + + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + print(f" āš ļø No main.zip found for {library_name}, skipping...") + return None + raise + except Exception as e: + print(f" āŒ Error processing {library_name}: {e}") + return None + + +def find_markdown_files(directory: Path) -> list[Path]: + """ + Recursively find all markdown files in a directory. + + Args: + directory: Root directory to search + + Returns: + List of paths to markdown files + """ + markdown_files = [] + for file_path in directory.rglob("*"): + if file_path.is_file() and file_path.suffix in [".md", ".mdx"]: + markdown_files.append(file_path) + return markdown_files + + +def markdown_file_to_url(file_path: Path, library_name: str, base_dir: Path) -> str: + """ + Convert a file path to a HuggingFace docs URL. + + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory (the extracted library folder) + + Returns: + URL string + """ + # Get relative path from base_dir + relative_path = file_path.relative_to(base_dir) + + # Remove file extension + path_without_ext = relative_path.with_suffix("") + + # Convert to URL format + url_path = str(path_without_ext).replace(os.sep, "/") + + # Build URL + url = f"https://huggingface.co/docs/{library_name}/{url_path}" + + return url + + +def get_page_title(file_path: Path) -> str: + """ + Generate a page title from file path. + + Args: + file_path: Path to the file + + Returns: + Formatted page title + """ + # Use the filename without extension + name = file_path.stem + # Replace underscores and hyphens with spaces + formatted = name.replace("_", " ").replace("-", " ") + # Capitalize + return formatted.title() + + +def process_markdown_file( + file_path: Path, library_name: str, base_dir: Path, excerpts_max_length: int = 1000 +) -> list[Chunk]: + """ + Process a single markdown file into chunks. 
+ + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory for URL generation + excerpts_max_length: Maximum length of each excerpt + + Returns: + List of Chunk objects + """ + try: + with open(file_path, encoding="utf-8") as f: + content = f.read() + + # Split markdown by headings + sections = split_markdown_by_headings(content, excerpts_max_length) + + # Generate base URL for this file + base_url = markdown_file_to_url(file_path, library_name, base_dir) + page_title = get_page_title(file_path) + + # Convert sections to Chunks + chunks = [] + for section in sections: + headings_dict = section["headings"] + + # Create heading list from the dictionary + heading_list = [] + for i in range(1, 7): + heading_key = f"heading{i}" + if heading_key in headings_dict: + # Reconstruct the heading with # marks + heading_text = headings_dict[heading_key] + heading_list.append("#" * i + " " + heading_text) + + # Generate URL with anchor (use first heading as anchor) + url = base_url + if headings_dict: + # Use the deepest heading for anchor + last_heading = None + for i in range(6, 0, -1): + if f"heading{i}" in headings_dict: + last_heading = headings_dict[f"heading{i}"] + break + + if last_heading: + # Create anchor from heading (lowercase, replace spaces with hyphens) + anchor = last_heading.lower().replace(" ", "-") + # Remove special characters + anchor = "".join(c for c in anchor if c.isalnum() or c == "-") + url = f"{base_url}#{anchor}" + + # Create a chunk for each excerpt + for excerpt in section["excerpts"]: + chunk = Chunk( + text=excerpt, + source_page_url=url, + source_page_title=page_title, + package_name=library_name, + headings=heading_list, + ) + chunks.append(chunk) + + return chunks + + except Exception as e: + print(f" āš ļø Error processing {file_path.name}: {e}") + return [] + + +def process_library( + library_name: str, output_dir: Path, excerpts_max_length: int = 1000, skip_download: bool = False +) -> list[Chunk]: + """ + Process a single library: download, extract, and chunk all markdown files. 
+ + Args: + library_name: Name of the library + output_dir: Directory for temporary files + excerpts_max_length: Maximum length of each excerpt + skip_download: Skip download if files already exist + + Returns: + List of all chunks for this library + """ + print(f"\nšŸ“š Processing library: {library_name}") + + # Check if already extracted + extract_path = output_dir / library_name + + if skip_download and extract_path.exists(): + print(f" ā„¹ļø Using existing files at {extract_path}") + else: + # Download and extract + extract_path = download_and_extract_zip(library_name, output_dir) + if extract_path is None: + return [] + + # Find all markdown files + markdown_files = find_markdown_files(extract_path) + print(f" Found {len(markdown_files)} markdown files") + + if not markdown_files: + print(f" āš ļø No markdown files found for {library_name}") + return [] + + # Process each markdown file + all_chunks = [] + print(" Processing markdown files...") + for md_file in tqdm(markdown_files, desc=f" {library_name}", unit="file"): + chunks = process_markdown_file(md_file, library_name, extract_path, excerpts_max_length) + all_chunks.extend(chunks) + + print(f" āœ… Generated {len(all_chunks)} chunks from {len(markdown_files)} files") + + return all_chunks + + +def process_all_libraries( + output_dir: Path | None = None, + excerpts_max_length: int = 1000, + libraries: list[str] | None = None, + skip_download: bool = False, +) -> dict: + """ + Process all libraries from the HF doc-build dataset. + + Args: + output_dir: Directory for temporary files (uses temp dir if None) + excerpts_max_length: Maximum length of each excerpt + libraries: List of specific libraries to process (None = all) + skip_download: Skip download if files already exist + + Returns: + Dictionary mapping library names to their chunks + """ + if output_dir is None: + output_dir = Path(tempfile.mkdtemp(prefix="hf_docs_")) + print(f"Using temporary directory: {output_dir}") + else: + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Fetch library directories + directories = fetch_library_directories() + + # Filter if specific libraries requested + if libraries: + directories = [d for d in directories if d["path"] in libraries] + print(f"Processing {len(directories)} requested libraries: {libraries}") + + # Process each library + results = {} + for directory in directories: + library_name = directory["path"] + chunks = process_library(library_name, output_dir, excerpts_max_length, skip_download) + results[library_name] = chunks + + # Summary + print("\n" + "=" * 80) + print("šŸ“Š SUMMARY") + print("=" * 80) + total_chunks = 0 + for library_name, chunks in results.items(): + print(f" {library_name}: {len(chunks)} chunks") + total_chunks += len(chunks) + print(f"\n Total: {total_chunks} chunks across {len(results)} libraries") + print("=" * 80) + + return results + + +def save_chunks_to_json(chunks: list[Chunk], output_file: Path): + """ + Save chunks to a JSON file. 
+ + Args: + chunks: List of Chunk objects + output_file: Path to output JSON file + """ + # Convert chunks to dictionaries + chunks_data = [ + { + "text": chunk.text, + "source_page_url": chunk.source_page_url, + "source_page_title": chunk.source_page_title, + "package_name": chunk.package_name, + "headings": chunk.headings, + } + for chunk in chunks + ] + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(chunks_data, f, indent=2, ensure_ascii=False) + + print(f"Saved {len(chunks)} chunks to {output_file}") + + +if __name__ == "__main__": + # Example usage + import argparse + + parser = argparse.ArgumentParser(description="Process HuggingFace documentation from doc-build dataset") + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)", + ) + parser.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers)", + ) + parser.add_argument( + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser.add_argument( + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" + ) + parser.add_argument("--save-json", type=str, default=None, help="Save all chunks to a JSON file") + + args = parser.parse_args() + + # Process libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries, + skip_download=args.skip_download, + ) + + # Save to JSON if requested + if args.save_json: + all_chunks = [] + for chunks in results.values(): + all_chunks.extend(chunks) + save_chunks_to_json(all_chunks, Path(args.save_json))
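Usage sketch for the chunking helpers added to src/doc_builder/build_embeddings.py above. The sketches below are not part of the patch; they only exercise the names introduced in it, and everything else (sample text, paths, tokens, keys) is made up for illustration. Running them assumes doc-builder is installed from this branch.

# Chunk a small markdown page with split_markdown_by_headings().
from doc_builder.build_embeddings import split_markdown_by_headings

sample = """# Quicktour

Intro paragraph that explains the library at a high level.

## Installation

Install with pip, then verify the install in a Python shell.
"""

sections = split_markdown_by_headings(sample, excerpts_max_length=80)
for section in sections:
    # 'headings' maps heading1..heading6 to the headings currently on the stack,
    # e.g. {"heading1": "Quicktour", "heading2": "Installation"} for the second
    # section; 'excerpts' holds text chunks of roughly excerpts_max_length characters.
    print(section["headings"])
    for excerpt in section["excerpts"]:
        print("  -", excerpt)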
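The two lower-level helpers can also be exercised directly; this sketch only illustrates the boundary-aware splitting and the heading1..heading6 mapping, with arbitrary inputs.

from doc_builder.build_embeddings import build_headings_object, split_into_excerpts

# split_into_excerpts() cuts text into pieces of about max_length characters,
# preferring a word boundary or whitespace/punctuation within a 50-character margin.
pieces = split_into_excerpts("word " * 100, max_length=120)
print([len(p) for p in pieces])  # a few ~119-character pieces plus a short tail

# build_headings_object() turns a stack of markdown headings into the
# heading1..heading6 fields stored alongside each section.
print(build_headings_object(["# Quicktour", "## Installation"]))
# {'heading1': 'Quicktour', 'heading2': 'Installation'}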
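Sketch of chunking a single extracted markdown file with the new src/doc_builder/process_hf_docs.py helpers. The local paths are hypothetical stand-ins for the files that download_and_extract_zip() produces.

from pathlib import Path

from doc_builder.process_hf_docs import process_markdown_file

base_dir = Path("/tmp/hf_docs/accelerate")  # hypothetical extracted main.zip
md_file = base_dir / "quicktour.md"         # one file found by find_markdown_files()

chunks = process_markdown_file(md_file, "accelerate", base_dir, excerpts_max_length=1000)
for chunk in chunks[:3]:
    # Each Chunk is a namedtuple (text, source_page_url, source_page_title,
    # package_name, headings); the URL is https://huggingface.co/docs/accelerate/quicktour
    # plus an anchor derived from the deepest heading of the section.
    print(chunk.source_page_url, chunk.headings)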
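Sketch of a local end-to-end run for one library, mirroring what the module's __main__ block does for all of them; the cache directory and output file name are arbitrary.

from pathlib import Path

from doc_builder.process_hf_docs import process_library, save_chunks_to_json

output_dir = Path("./hf_docs_cache")

# Downloads accelerate/main.zip from hf-doc-build/doc-build, extracts it, and
# chunks every .md/.mdx file found; pass skip_download=True to reuse an
# existing extraction under output_dir.
chunks = process_library("accelerate", output_dir, excerpts_max_length=1000)
save_chunks_to_json(chunks, output_dir / "accelerate_chunks.json")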
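Sketch of the embedding-and-upload path that `doc-builder populate-search-engine` takes when --skip-embeddings is not set, mirroring process_hf_docs_command above. The endpoint name and namespace are the ones used in the workflow; the token and Meilisearch key are placeholders, and a live Inference Endpoint is assumed.

import meilisearch

from doc_builder.build_embeddings import MEILI_INDEX_TEMP, call_embedding_inference
from doc_builder.meilisearch_helper import add_embeddings_to_db
from doc_builder.process_hf_docs import process_all_libraries
from doc_builder.utils import chunk_list

# Chunk one library (or omit `libraries` to process everything in the dataset).
results = process_all_libraries(libraries=["accelerate"])
all_chunks = [chunk for chunks in results.values() for chunk in chunks]

embeddings = call_embedding_inference(
    all_chunks,
    "docs-embed-bge-base-en-v1-5",  # --hf_ie_name
    "huggingface",                  # --hf_ie_namespace
    "hf_xxx",                       # --hf_ie_token (placeholder)
    is_python_module=False,         # pre-built docs are not Python modules
)

# Upload in batches of 5000 documents to the temporary index; the daily workflow
# later swaps it into production via `doc-builder meilisearch-clean --swap`.
client = meilisearch.Client("https://edge.meilisearch.com", "MEILISEARCH_KEY_PLACEHOLDER")
for batch in chunk_list(embeddings, 5000):
    add_embeddings_to_db(client, MEILI_INDEX_TEMP, batch)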