diff --git a/.github/workflows/build_embeddings.yml b/.github/workflows/build_embeddings.yml deleted file mode 100644 index 3ceb0b1a..00000000 --- a/.github/workflows/build_embeddings.yml +++ /dev/null @@ -1,179 +0,0 @@ -name: Daily Build Embeddings - -env: - DIFFUSERS_SLOW_IMPORT: yes - -on: - schedule: - - cron: "5 7 * * *" # every day at 07:05 - # to run this workflow manually from the Actions tab - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: false - -jobs: - matrix-job: - runs-on: ubuntu-latest - container: huggingface/transformers-doc-builder - strategy: - max-parallel: 1 # run sequentially - matrix: - include: - - repo_id: huggingface/tokenizers - doc_folder: docs/source-doc-builder - package_path: bindings/python - - repo_id: huggingface/diffusers - doc_folder: docs/source/en - - repo_id: huggingface/accelerate - doc_folder: docs/source - - repo_id: huggingface/huggingface_hub - doc_folder: docs/source/en - - repo_id: huggingface/transformers - doc_folder: docs/source/en - - repo_id: huggingface/hub-docs - doc_folder: docs/hub - package_name: hub - is_not_python_module: true - - repo_id: huggingface/huggingface.js - doc_folder: docs - is_not_python_module: true - pre_command: npm install -g corepack@latest && corepack enable && cd huggingface.js && pnpm install && pnpm -r build && pnpm --filter doc-internal start - - repo_id: huggingface/transformers.js - doc_folder: docs/source - is_not_python_module: true - - repo_id: huggingface/smolagents - doc_folder: docs/source/en - - repo_id: huggingface/peft - doc_folder: docs/source - - repo_id: huggingface/trl - doc_folder: docs/source - - repo_id: bitsandbytes-foundation/bitsandbytes - doc_folder: docs/source - - repo_id: huggingface/lerobot - doc_folder: docs/source - - repo_id: huggingface/pytorch-image-models - doc_folder: hfdocs/source - package_name: timm - - repo_id: huggingface/hub-docs - doc_folder: docs/inference-providers - package_name: inference-providers - is_not_python_module: true - - repo_id: huggingface/safetensors - doc_folder: docs/source - package_path: bindings/python - - repo_id: huggingface/hf-endpoints-documentation - doc_folder: docs/source - package_name: inference-endpoints - is_not_python_module: true - - repo_id: huggingface/dataset-viewer - doc_folder: docs/source - package_name: dataset-viewer - is_not_python_module: true - timeout-minutes: 360 # Set timeout to 6 hours - steps: - - name: Setup REPO_NAME - shell: bash - run: | - current_path=$(pwd) - repo_id="${{ matrix.repo_id }}" - repo_name="${repo_id#*/}" - echo "REPO_NAME=${repo_name}" >> $GITHUB_ENV - - - name: Checkout repository - uses: actions/checkout@v2 - with: - repository: ${{ matrix.repo_id }} - path: ${{ github.workspace }}/${{ env.REPO_NAME }} - - - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Install libgl1 - run: apt-get install -y libgl1 - - - name: Export PIP_OR_UV ('pip' or 'uv pip') - run: | - if [ -z "${{ job.container }}" ] - then - echo "PIP_OR_UV=uv pip" >> $GITHUB_ENV - else - echo "PIP_OR_UV=pip" >> $GITHUB_ENV - fi - - - name: Setup environment - shell: bash - run: | - if [[ "${{ matrix.is_not_python_module }}" != "true" ]]; then - current_path=$(pwd) - cd ${{ env.REPO_NAME }} - if [[ -n "${{ matrix.package_path }}" ]]; then - cd ${{ matrix.package_path }} - $PIP_OR_UV install .[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - cd $current_path - else - $PIP_OR_UV install .[dev] - $PIP_OR_UV install --force-reinstall numpy==1.26.4 - 
cd $current_path - fi - fi - - rm -rf doc-builder - rm -rf .git - git clone https://github.com/huggingface/doc-builder.git - cd doc-builder - git fetch - git checkout main - $PIP_OR_UV install . - - - name: Run pre-command - shell: bash - run: | - if [ ! -z "${{ matrix.pre_command }}" ] - then - bash -c "${{ matrix.pre_command }}" - fi - - - name: Build embeddings - shell: bash - run: | - echo Building docs for ${{ matrix.package_name || env.REPO_NAME }} - FLAGS="" - if [[ "${{ matrix.is_not_python_module }}" == "true" ]]; then - FLAGS="--not_python_module" - fi - doc-builder embeddings ${{ matrix.package_name || env.REPO_NAME }} ${{ env.REPO_NAME }}/${{ matrix.doc_folder }} --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} $FLAGS - - gradio-job: - runs-on: ubuntu-latest - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Add gradio docs to meilisearch - run: doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} - - cleanup-job: - needs: matrix-job - runs-on: ubuntu-latest - if: always() # This ensures that the cleanup job runs regardless of the result of matrix-job - steps: - - name: Checkout doc-builder - uses: actions/checkout@v2 - - - name: Install doc-builder - run: pip install .[dev] - - - name: Success Cleanup - if: needs.matrix-job.result == 'success' # Runs if all matrix jobs succeeded - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap - - - name: Failure Cleanup - if: needs.matrix-job.result == 'failure' # Runs if any matrix job failed - run: doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} diff --git a/.github/workflows/populate_search_engine.yml b/.github/workflows/populate_search_engine.yml new file mode 100644 index 00000000..05c1cf14 --- /dev/null +++ b/.github/workflows/populate_search_engine.yml @@ -0,0 +1,109 @@ +name: Populate Search Engine + +on: + schedule: + - cron: "5 7 * * *" # every day at 07:05 + # to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + libraries: + description: 'Specific libraries to process (space-separated, e.g., "accelerate diffusers"). Leave empty for all.' + required: false + default: '' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +jobs: + process-docs: + runs-on: ubuntu-latest + timeout-minutes: 360 # Set timeout to 6 hours + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Populate search engine from HF doc-build dataset + shell: bash + run: | + echo "Processing documentation from hf-doc-build/doc-build dataset..." + + # Build command + CMD="uv run doc-builder populate-search-engine" + + # Add library filter if specified + if [ ! 
-z "${{ github.event.inputs.libraries }}" ]; then + CMD="$CMD --libraries ${{ github.event.inputs.libraries }}" + fi + + # Add skip embeddings flag + CMD="$CMD --skip-embeddings" + + # Add credentials + CMD="$CMD --hf_ie_name docs-embed-bge-base-en-v1-5" + CMD="$CMD --hf_ie_namespace huggingface" + CMD="$CMD --hf_ie_token ${{ secrets.HF_IE_TOKEN }}" + CMD="$CMD --meilisearch_key ${{ secrets.MEILISEARCH_KEY }}" + + # Execute + echo "Running: $CMD" + $CMD + + gradio-job: + runs-on: ubuntu-latest + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Add gradio docs to meilisearch + run: uv run doc-builder add-gradio-docs --hf_ie_name docs-embed-bge-base-en-v1-5 --hf_ie_namespace huggingface --hf_ie_token ${{ secrets.HF_IE_TOKEN }} --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + + cleanup-job: + needs: [process-docs, gradio-job] + runs-on: ubuntu-latest + if: always() # This ensures that the cleanup job runs regardless of the result + steps: + - name: Checkout doc-builder + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python 3.10 + run: uv python install 3.10 + + - name: Install doc-builder + run: uv sync --extra dev + + - name: Success Cleanup + if: needs.process-docs.result == 'success' # Runs if job succeeded + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} --swap + + - name: Failure Cleanup + if: needs.process-docs.result == 'failure' # Runs if job failed + run: uv run doc-builder meilisearch-clean --meilisearch_key ${{ secrets.MEILISEARCH_KEY }} + diff --git a/src/doc_builder/build_embeddings.py b/src/doc_builder/build_embeddings.py index bb50adc3..77a12f02 100644 --- a/src/doc_builder/build_embeddings.py +++ b/src/doc_builder/build_embeddings.py @@ -408,6 +408,167 @@ def clean_md(text): return text.strip() +def split_into_excerpts(text: str, max_length: int) -> list[str]: + """ + Split text into excerpts of approximately max_length characters. 
+ + Args: + text: The text to split + max_length: Maximum length of each excerpt in characters + + Returns: + List of text excerpts + """ + if not text: + return [] + + excerpts = [] + current_index = 0 + + while current_index < len(text): + end_index = current_index + max_length + + # If we're at the end of the text, just take the rest + if end_index >= len(text): + excerpts.append(text[current_index:].strip()) + break + + # Look for the next word boundary after "max_length" characters + remaining_text = text[end_index:] + word_boundary_match = re.search(r"\b", remaining_text) + + margin = 50 + + if word_boundary_match and word_boundary_match.start() <= margin: + # Found a word boundary within 50 characters, use it + end_index = end_index + word_boundary_match.start() + else: + # No word boundary within 50 chars, fall back to finding good breaking points + break_point = end_index + for i in range(end_index, max(current_index + max_length - margin, current_index), -1): + if i < len(text): + char = text[i] + if char in [" ", "\n", ".", ",", ";", "!", "?"]: + break_point = i + 1 + break + end_index = break_point + + excerpts.append(text[current_index:end_index].strip()) + current_index = end_index + + return [excerpt for excerpt in excerpts if len(excerpt) > 0] + + +def build_headings_object(heading_stack: list[str]) -> dict: + """ + Build headings dictionary from heading stack. + + Args: + heading_stack: List of heading strings in format "## Heading Text" + + Returns: + Dictionary with heading1 through heading6 keys + """ + headings = {} + + for heading in heading_stack: + match = re.match(r"^(#{1,6})\s+(.+)$", heading) + if match: + level = len(match.group(1)) + text = match.group(2).strip() + headings[f"heading{level}"] = text + + return headings + + +def split_markdown_by_headings(markdown_content: str, excerpts_max_length: int = 1000) -> list[dict]: + """ + Split markdown content by headings and create sections with excerpts. + Similar to the TypeScript implementation for consistent chunking. 
+ + Args: + markdown_content: The markdown text to split + excerpts_max_length: Maximum length of each excerpt in characters (default: 1000) + + Returns: + List of dictionaries with 'excerpts' (list of text chunks) and 'headings' (dict) keys + """ + lines = markdown_content.split("\n") + sections = [] + + current_section = "" + heading_stack = [] + line_index = 0 + + while line_index < len(lines): + line = lines[line_index] + heading_match = re.match(r"^(#{1,6})\s+(.+)$", line) + + if heading_match: + # Save the previous section if it has content + if current_section.strip(): + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) + + # Parse the heading + heading_level = len(heading_match.group(1)) + heading_text = heading_match.group(2).strip() + full_heading = f"{heading_match.group(1)} {heading_text}" + + # Update heading stack based on level + # Keep only headings with lower level than current + new_stack = [] + for h in heading_stack: + h_match = re.match(r"^(#{1,6})", h) + if h_match: + existing_level = len(h_match.group(1)) + if existing_level < heading_level: + new_stack.append(h) + heading_stack = new_stack + + # Add current heading + heading_stack.append(full_heading) + + # Start new section with the heading + current_section = line + + # Look ahead to include content after heading + line_index += 1 + while line_index < len(lines): + next_line = lines[line_index] + next_heading_match = re.match(r"^(#{1,6})\s+(.+)$", next_line) + + if next_heading_match: + # Found next heading, break to process it + break + else: + # Add line to current section + current_section += "\n" + next_line + line_index += 1 + + # Don't increment line_index here since we either reached end or found next heading + continue + else: + # Add line to current section + current_section += ("\n" if current_section else "") + line + line_index += 1 + + # Add the last section + if current_section.strip(): + sections.append( + { + "excerpts": split_into_excerpts(current_section.strip(), excerpts_max_length), + "headings": build_headings_object(heading_stack), + } + ) + + return sections + + def get_page_title(path: str): """ Given a path to doc page, generate doc page title. diff --git a/src/doc_builder/commands/embeddings.py b/src/doc_builder/commands/embeddings.py index fed7eb21..e85539a6 100644 --- a/src/doc_builder/commands/embeddings.py +++ b/src/doc_builder/commands/embeddings.py @@ -14,111 +14,76 @@ import argparse -import importlib +from pathlib import Path + +from doc_builder import clean_meilisearch +from doc_builder.build_embeddings import add_gradio_docs, call_embedding_inference +from doc_builder.meilisearch_helper import add_embeddings_to_db +from doc_builder.process_hf_docs import process_all_libraries +from doc_builder.utils import chunk_list + + +def process_hf_docs_command(args): + """ + Process documentation from HF doc-build dataset. + Downloads pre-built docs and generates embeddings. 
+ """ + import meilisearch + from tqdm import tqdm + + print("Processing documentation from HF doc-build dataset...") + + # Process all or specific libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries if args.libraries else None, + skip_download=args.skip_download, + ) -from doc_builder import build_embeddings, clean_meilisearch -from doc_builder.build_embeddings import add_gradio_docs -from doc_builder.utils import get_default_branch_name, get_doc_config, read_doc_config + # If embeddings are requested + if not args.skip_embeddings: + print("\n" + "=" * 80) + print("šŸ”¢ GENERATING EMBEDDINGS") + print("=" * 80) + # Collect all chunks + all_chunks = [] + for _library_name, chunks in results.items(): + all_chunks.extend(chunks) -def embeddings_command(args): - read_doc_config(args.path_to_docs) + print(f"\nTotal chunks to embed: {len(all_chunks)}") - default_version = get_default_branch_name(args.path_to_docs) - if args.not_python_module and args.version is None: - version = default_version - elif args.version is None: - module = importlib.import_module(args.library_name) - version = module.__version__ + # Generate embeddings + from doc_builder.build_embeddings import MEILI_INDEX_TEMP - if "dev" in version: - version = default_version - else: - version = f"v{version}" - else: - version = args.version - - # `version` will always start with prefix `v` - # `version_tag` does not have to start with prefix `v` (see: https://github.com/huggingface/datasets/tags) - version_tag = version - if version != default_version: - doc_config = get_doc_config() - version_prefix = getattr(doc_config, "version_prefix", "v") - version_ = version[1:] # v2.1.0 -> 2.1.0 - version_tag = f"{version_prefix}{version_}" - - # Disable notebook building for non-master version - if version != default_version: - args.notebook_dir = None - - print("Building embeddings for", args.library_name, args.path_to_docs) - build_embeddings( - args.library_name, - args.path_to_docs, - args.hf_ie_name, - args.hf_ie_namespace, - args.hf_ie_token, - args.meilisearch_key, - version=version, - version_tag=version_tag, - language=args.language, - is_python_module=not args.not_python_module, - version_tag_suffix=args.version_tag_suffix, - repo_owner=args.repo_owner, - repo_name=args.repo_name, - ) + embeddings = call_embedding_inference( + all_chunks, + args.hf_ie_name, + args.hf_ie_namespace, + args.hf_ie_token, + is_python_module=False, # Pre-built docs are not Python modules + ) + # Push to Meilisearch + print("\n" + "=" * 80) + print("šŸ“¤ UPLOADING TO MEILISEARCH") + print("=" * 80) -def embeddings_command_parser(subparsers=None): - if subparsers is not None: - parser = subparsers.add_parser("embeddings") - else: - parser = argparse.ArgumentParser("Doc Builder embeddings command") + client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key) + ITEMS_PER_CHUNK = 5000 + + for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading to meilisearch"): + add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings) + + print(f"\nāœ… Successfully uploaded {len(embeddings)} embeddings to Meilisearch") + + print("\n" + "=" * 80) + print("āœ… PROCESSING COMPLETE") + print("=" * 80) - parser.add_argument("library_name", type=str, help="Library name") - parser.add_argument( - "path_to_docs", - type=str, - help="Local path to library documentation. 
The library should be cloned, and the folder containing the " - "documentation files should be indicated here.", - ) - parser.add_argument("--hf_ie_name", type=str, help="Inference Endpoints name.", required=True) - parser.add_argument("--hf_ie_namespace", type=str, help="Inference Endpoints namespace.", required=True) - parser.add_argument("--hf_ie_token", type=str, help="Hugging Face token.", required=True) - parser.add_argument("--meilisearch_key", type=str, help="Meilisearch key.", required=True) - parser.add_argument("--language", type=str, help="Language of the documentation to generate", default="en") - parser.add_argument( - "--version", - type=str, - help="Version of the documentation to generate. Will default to the version of the package module (using " - "`main` for a version containing dev).", - ) - parser.add_argument( - "--not_python_module", - action="store_true", - help="Whether docs files do NOT have corresponding python module (like HF course & hub docs).", - ) - parser.add_argument( - "--version_tag_suffix", - type=str, - default="src/", - help="Suffix to add after the version tag (e.g. 1.3.0 or main) in the documentation links. For example, the default `src/` suffix will result in a base link as `https://github.com/huggingface/{package_name}/blob/{version_tag}/src/`.", - ) - parser.add_argument( - "--repo_owner", - type=str, - default="huggingface", - help="Owner of the repo (e.g. huggingface, rwightman, etc.).", - ) - parser.add_argument( - "--repo_name", - type=str, - default=None, - help="Name of the repo (e.g. transformers, pytorch-image-models, etc.). By default, this is the same as the library_name.", - ) - if subparsers is not None: - parser.set_defaults(func=embeddings_command) +def embeddings_command_parser(subparsers=None): # meilsiearch clean: swap & delete the temp index if subparsers is not None: parser_meilisearch_clean = subparsers.add_parser("meilisearch-clean") @@ -154,4 +119,59 @@ def embeddings_command_parser(subparsers=None): ) ) - return parser + # populate-search-engine: process documentation from HF doc-build dataset and populate search engine + if subparsers is not None: + parser_process_hf_docs = subparsers.add_parser("populate-search-engine") + else: + parser_process_hf_docs = argparse.ArgumentParser( + "Doc Builder populate-search-engine command. Process pre-built documentation from HF doc-build dataset and populate search engine." + ) + + parser_process_hf_docs.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)", + ) + parser_process_hf_docs.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers). 
If not specified, processes all libraries.", + ) + parser_process_hf_docs.add_argument( + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser_process_hf_docs.add_argument( + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" + ) + parser_process_hf_docs.add_argument( + "--skip-embeddings", + action="store_true", + help="Skip embedding generation and meilisearch upload (useful for testing)", + ) + parser_process_hf_docs.add_argument( + "--hf_ie_name", + type=str, + help="Inference Endpoints name (required unless --skip-embeddings is set)", + required=False, + ) + parser_process_hf_docs.add_argument( + "--hf_ie_namespace", + type=str, + help="Inference Endpoints namespace (required unless --skip-embeddings is set)", + required=False, + ) + parser_process_hf_docs.add_argument( + "--hf_ie_token", type=str, help="Hugging Face token (required unless --skip-embeddings is set)", required=False + ) + parser_process_hf_docs.add_argument( + "--meilisearch_key", + type=str, + help="Meilisearch key (required unless --skip-embeddings is set)", + required=False, + ) + + if subparsers is not None: + parser_process_hf_docs.set_defaults(func=process_hf_docs_command) diff --git a/src/doc_builder/process_hf_docs.py b/src/doc_builder/process_hf_docs.py new file mode 100644 index 00000000..1e002c4a --- /dev/null +++ b/src/doc_builder/process_hf_docs.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Process documentation from HuggingFace doc-build dataset. +Downloads and processes pre-built documentation markdown files. +""" + +import io +import json +import os +import tempfile +import zipfile +from collections import namedtuple +from pathlib import Path + +import requests +from tqdm import tqdm + +from .build_embeddings import split_markdown_by_headings + +Chunk = namedtuple("Chunk", "text source_page_url source_page_title package_name headings") + +HF_DATASET_REPO = "hf-doc-build/doc-build" +HF_DATASET_API_URL = f"https://huggingface.co/api/datasets/{HF_DATASET_REPO}/tree/main" +HF_DATASET_BASE_URL = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main" + + +def fetch_library_directories() -> list[dict]: + """ + Fetch the list of library directories from the HF doc-build dataset. + + Returns: + List of directory metadata dictionaries with 'path' and 'oid' keys + """ + print(f"Fetching library directories from {HF_DATASET_API_URL}...") + response = requests.get(HF_DATASET_API_URL) + response.raise_for_status() + + data = response.json() + + # Filter only directories + directories = [item for item in data if item.get("type") == "directory"] + + print(f"Found {len(directories)} library directories") + return directories + + +def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None: + """ + Download and extract the main.zip file for a library. 
+ + Args: + library_name: Name of the library (e.g., 'accelerate') + output_dir: Directory to extract files to + + Returns: + Path to extracted directory, or None if download failed + """ + zip_url = f"{HF_DATASET_BASE_URL}/{library_name}/main.zip" + + try: + print(f" Downloading {zip_url}...") + response = requests.get(zip_url, stream=True) + response.raise_for_status() + + # Get total size for progress bar + total_size = int(response.headers.get("content-length", 0)) + + # Download to memory + zip_content = io.BytesIO() + with tqdm(total=total_size, unit="B", unit_scale=True, desc=f" {library_name}") as pbar: + for chunk in response.iter_content(chunk_size=8192): + zip_content.write(chunk) + pbar.update(len(chunk)) + + # Extract zip + zip_content.seek(0) + extract_path = output_dir / library_name + extract_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(zip_content) as zip_ref: + zip_ref.extractall(extract_path) + + print(f" Extracted to {extract_path}") + return extract_path + + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + print(f" āš ļø No main.zip found for {library_name}, skipping...") + return None + raise + except Exception as e: + print(f" āŒ Error processing {library_name}: {e}") + return None + + +def find_markdown_files(directory: Path) -> list[Path]: + """ + Recursively find all markdown files in a directory. + + Args: + directory: Root directory to search + + Returns: + List of paths to markdown files + """ + markdown_files = [] + for file_path in directory.rglob("*"): + if file_path.is_file() and file_path.suffix in [".md", ".mdx"]: + markdown_files.append(file_path) + return markdown_files + + +def markdown_file_to_url(file_path: Path, library_name: str, base_dir: Path) -> str: + """ + Convert a file path to a HuggingFace docs URL. + + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory (the extracted library folder) + + Returns: + URL string + """ + # Get relative path from base_dir + relative_path = file_path.relative_to(base_dir) + + # Remove file extension + path_without_ext = relative_path.with_suffix("") + + # Convert to URL format + url_path = str(path_without_ext).replace(os.sep, "/") + + # Build URL + url = f"https://huggingface.co/docs/{library_name}/{url_path}" + + return url + + +def get_page_title(file_path: Path) -> str: + """ + Generate a page title from file path. + + Args: + file_path: Path to the file + + Returns: + Formatted page title + """ + # Use the filename without extension + name = file_path.stem + # Replace underscores and hyphens with spaces + formatted = name.replace("_", " ").replace("-", " ") + # Capitalize + return formatted.title() + + +def process_markdown_file( + file_path: Path, library_name: str, base_dir: Path, excerpts_max_length: int = 1000 +) -> list[Chunk]: + """ + Process a single markdown file into chunks. 
+ + Args: + file_path: Path to the markdown file + library_name: Name of the library + base_dir: Base directory for URL generation + excerpts_max_length: Maximum length of each excerpt + + Returns: + List of Chunk objects + """ + try: + with open(file_path, encoding="utf-8") as f: + content = f.read() + + # Split markdown by headings + sections = split_markdown_by_headings(content, excerpts_max_length) + + # Generate base URL for this file + base_url = markdown_file_to_url(file_path, library_name, base_dir) + page_title = get_page_title(file_path) + + # Convert sections to Chunks + chunks = [] + for section in sections: + headings_dict = section["headings"] + + # Create heading list from the dictionary + heading_list = [] + for i in range(1, 7): + heading_key = f"heading{i}" + if heading_key in headings_dict: + # Reconstruct the heading with # marks + heading_text = headings_dict[heading_key] + heading_list.append("#" * i + " " + heading_text) + + # Generate URL with anchor (use first heading as anchor) + url = base_url + if headings_dict: + # Use the deepest heading for anchor + last_heading = None + for i in range(6, 0, -1): + if f"heading{i}" in headings_dict: + last_heading = headings_dict[f"heading{i}"] + break + + if last_heading: + # Create anchor from heading (lowercase, replace spaces with hyphens) + anchor = last_heading.lower().replace(" ", "-") + # Remove special characters + anchor = "".join(c for c in anchor if c.isalnum() or c == "-") + url = f"{base_url}#{anchor}" + + # Create a chunk for each excerpt + for excerpt in section["excerpts"]: + chunk = Chunk( + text=excerpt, + source_page_url=url, + source_page_title=page_title, + package_name=library_name, + headings=heading_list, + ) + chunks.append(chunk) + + return chunks + + except Exception as e: + print(f" āš ļø Error processing {file_path.name}: {e}") + return [] + + +def process_library( + library_name: str, output_dir: Path, excerpts_max_length: int = 1000, skip_download: bool = False +) -> list[Chunk]: + """ + Process a single library: download, extract, and chunk all markdown files. 
+ + Args: + library_name: Name of the library + output_dir: Directory for temporary files + excerpts_max_length: Maximum length of each excerpt + skip_download: Skip download if files already exist + + Returns: + List of all chunks for this library + """ + print(f"\nšŸ“š Processing library: {library_name}") + + # Check if already extracted + extract_path = output_dir / library_name + + if skip_download and extract_path.exists(): + print(f" ā„¹ļø Using existing files at {extract_path}") + else: + # Download and extract + extract_path = download_and_extract_zip(library_name, output_dir) + if extract_path is None: + return [] + + # Find all markdown files + markdown_files = find_markdown_files(extract_path) + print(f" Found {len(markdown_files)} markdown files") + + if not markdown_files: + print(f" āš ļø No markdown files found for {library_name}") + return [] + + # Process each markdown file + all_chunks = [] + print(" Processing markdown files...") + for md_file in tqdm(markdown_files, desc=f" {library_name}", unit="file"): + chunks = process_markdown_file(md_file, library_name, extract_path, excerpts_max_length) + all_chunks.extend(chunks) + + print(f" āœ… Generated {len(all_chunks)} chunks from {len(markdown_files)} files") + + return all_chunks + + +def process_all_libraries( + output_dir: Path | None = None, + excerpts_max_length: int = 1000, + libraries: list[str] | None = None, + skip_download: bool = False, +) -> dict: + """ + Process all libraries from the HF doc-build dataset. + + Args: + output_dir: Directory for temporary files (uses temp dir if None) + excerpts_max_length: Maximum length of each excerpt + libraries: List of specific libraries to process (None = all) + skip_download: Skip download if files already exist + + Returns: + Dictionary mapping library names to their chunks + """ + if output_dir is None: + output_dir = Path(tempfile.mkdtemp(prefix="hf_docs_")) + print(f"Using temporary directory: {output_dir}") + else: + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Fetch library directories + directories = fetch_library_directories() + + # Filter if specific libraries requested + if libraries: + directories = [d for d in directories if d["path"] in libraries] + print(f"Processing {len(directories)} requested libraries: {libraries}") + + # Process each library + results = {} + for directory in directories: + library_name = directory["path"] + chunks = process_library(library_name, output_dir, excerpts_max_length, skip_download) + results[library_name] = chunks + + # Summary + print("\n" + "=" * 80) + print("šŸ“Š SUMMARY") + print("=" * 80) + total_chunks = 0 + for library_name, chunks in results.items(): + print(f" {library_name}: {len(chunks)} chunks") + total_chunks += len(chunks) + print(f"\n Total: {total_chunks} chunks across {len(results)} libraries") + print("=" * 80) + + return results + + +def save_chunks_to_json(chunks: list[Chunk], output_file: Path): + """ + Save chunks to a JSON file. 
+ + Args: + chunks: List of Chunk objects + output_file: Path to output JSON file + """ + # Convert chunks to dictionaries + chunks_data = [ + { + "text": chunk.text, + "source_page_url": chunk.source_page_url, + "source_page_title": chunk.source_page_title, + "package_name": chunk.package_name, + "headings": chunk.headings, + } + for chunk in chunks + ] + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(chunks_data, f, indent=2, ensure_ascii=False) + + print(f"Saved {len(chunks)} chunks to {output_file}") + + +if __name__ == "__main__": + # Example usage + import argparse + + parser = argparse.ArgumentParser(description="Process HuggingFace documentation from doc-build dataset") + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for downloaded/extracted files (uses temp dir if not specified)", + ) + parser.add_argument( + "--libraries", + type=str, + nargs="+", + default=None, + help="Specific libraries to process (e.g., accelerate diffusers)", + ) + parser.add_argument( + "--excerpt-length", type=int, default=1000, help="Maximum length of each excerpt in characters (default: 1000)" + ) + parser.add_argument( + "--skip-download", action="store_true", help="Skip download if files already exist in output-dir" + ) + parser.add_argument("--save-json", type=str, default=None, help="Save all chunks to a JSON file") + + args = parser.parse_args() + + # Process libraries + results = process_all_libraries( + output_dir=Path(args.output_dir) if args.output_dir else None, + excerpts_max_length=args.excerpt_length, + libraries=args.libraries, + skip_download=args.skip_download, + ) + + # Save to JSON if requested + if args.save_json: + all_chunks = [] + for chunks in results.values(): + all_chunks.extend(chunks) + save_chunks_to_json(all_chunks, Path(args.save_json))
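Usage sketch for the chunking helpers added to src/doc_builder/build_embeddings.py above. The sketches below are not part of the patch; they only exercise the names introduced in it, and everything else (sample text, paths, tokens, keys) is made up for illustration. Running them assumes doc-builder is installed from this branch.

# Chunk a small markdown page with split_markdown_by_headings().
from doc_builder.build_embeddings import split_markdown_by_headings

sample = """# Quicktour

Intro paragraph that explains the library at a high level.

## Installation

Install with pip, then verify the install in a Python shell.
"""

sections = split_markdown_by_headings(sample, excerpts_max_length=80)
for section in sections:
    # 'headings' maps heading1..heading6 to the headings currently on the stack,
    # e.g. {"heading1": "Quicktour", "heading2": "Installation"} for the second
    # section; 'excerpts' holds text chunks of roughly excerpts_max_length characters.
    print(section["headings"])
    for excerpt in section["excerpts"]:
        print("  -", excerpt)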
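The two lower-level helpers can also be exercised directly; this sketch only illustrates the boundary-aware splitting and the heading1..heading6 mapping, with arbitrary inputs.

from doc_builder.build_embeddings import build_headings_object, split_into_excerpts

# split_into_excerpts() cuts text into pieces of about max_length characters,
# preferring a word boundary or whitespace/punctuation within a 50-character margin.
pieces = split_into_excerpts("word " * 100, max_length=120)
print([len(p) for p in pieces])  # a few ~119-character pieces plus a short tail

# build_headings_object() turns a stack of markdown headings into the
# heading1..heading6 fields stored alongside each section.
print(build_headings_object(["# Quicktour", "## Installation"]))
# {'heading1': 'Quicktour', 'heading2': 'Installation'}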
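Sketch of chunking a single extracted markdown file with the new src/doc_builder/process_hf_docs.py helpers. The local paths are hypothetical stand-ins for the files that download_and_extract_zip() produces.

from pathlib import Path

from doc_builder.process_hf_docs import process_markdown_file

base_dir = Path("/tmp/hf_docs/accelerate")  # hypothetical extracted main.zip
md_file = base_dir / "quicktour.md"         # one file found by find_markdown_files()

chunks = process_markdown_file(md_file, "accelerate", base_dir, excerpts_max_length=1000)
for chunk in chunks[:3]:
    # Each Chunk is a namedtuple (text, source_page_url, source_page_title,
    # package_name, headings); the URL is https://huggingface.co/docs/accelerate/quicktour
    # plus an anchor derived from the deepest heading of the section.
    print(chunk.source_page_url, chunk.headings)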
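Sketch of a local end-to-end run for one library, mirroring what the module's __main__ block does for all of them; the cache directory and output file name are arbitrary.

from pathlib import Path

from doc_builder.process_hf_docs import process_library, save_chunks_to_json

output_dir = Path("./hf_docs_cache")

# Downloads accelerate/main.zip from hf-doc-build/doc-build, extracts it, and
# chunks every .md/.mdx file found; pass skip_download=True to reuse an
# existing extraction under output_dir.
chunks = process_library("accelerate", output_dir, excerpts_max_length=1000)
save_chunks_to_json(chunks, output_dir / "accelerate_chunks.json")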
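Sketch of the embedding-and-upload path that `doc-builder populate-search-engine` takes when --skip-embeddings is not set, mirroring process_hf_docs_command above. The endpoint name and namespace are the ones used in the workflow; the token and Meilisearch key are placeholders, and a live Inference Endpoint is assumed.

import meilisearch

from doc_builder.build_embeddings import MEILI_INDEX_TEMP, call_embedding_inference
from doc_builder.meilisearch_helper import add_embeddings_to_db
from doc_builder.process_hf_docs import process_all_libraries
from doc_builder.utils import chunk_list

# Chunk one library (or omit `libraries` to process everything in the dataset).
results = process_all_libraries(libraries=["accelerate"])
all_chunks = [chunk for chunks in results.values() for chunk in chunks]

embeddings = call_embedding_inference(
    all_chunks,
    "docs-embed-bge-base-en-v1-5",  # --hf_ie_name
    "huggingface",                  # --hf_ie_namespace
    "hf_xxx",                       # --hf_ie_token (placeholder)
    is_python_module=False,         # pre-built docs are not Python modules
)

# Upload in batches of 5000 documents to the temporary index; the daily workflow
# later swaps it into production via `doc-builder meilisearch-clean --swap`.
client = meilisearch.Client("https://edge.meilisearch.com", "MEILISEARCH_KEY_PLACEHOLDER")
for batch in chunk_list(embeddings, 5000):
    add_embeddings_to_db(client, MEILI_INDEX_TEMP, batch)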