From f53fd8e209a384d5c7f321f16dcd5348153ad54e Mon Sep 17 00:00:00 2001
From: Mishig Davaadorj <dmishig@gmail.com>
Date: Wed, 12 Nov 2025 14:38:53 +0100
Subject: [PATCH 1/4] Add HTML stripper script for markdown documentation

- Add strip_html_from_md.py script to convert HTML-heavy docs to clean markdown
- Strip HTML tags from docstrings and convert to level 4 headers
- Format class/function names with anchors: #### ClassName[[anchor]]
- Remove 'class' and 'def' prefixes from names
- Format parameters without bullets, using : separator with blank lines
- Support processing single files or directories recursively
- Add comprehensive README with usage examples and documentation
---
 README_strip_html.md              |  99 +++++++++
 src/scripts/strip_html_from_md.py | 350 ++++++++++++++++++++++++++++++
 2 files changed, 449 insertions(+)
 create mode 100644 README_strip_html.md
 create mode 100644 src/scripts/strip_html_from_md.py
diff --git a/README_strip_html.md b/README_strip_html.md
new file mode 100644
index 00000000..68e48887
--- /dev/null
+++ b/README_strip_html.md
@@ -0,0 +1,99 @@
+# HTML Stripper for Markdown Documentation
+
+This script strips HTML tags from markdown files and converts docstrings to clean markdown format.
+
+## Features
+
+- Removes all HTML tags while preserving content
+- Converts docstring blocks to clean markdown with level 4 headers (`####`)
+- Extracts and formats:
+  - Class/function names as headers (removes `class` and `def` prefixes)
+  - Anchors in double square brackets format: `[[anchor]]`
+  - Source links
+  - Parameter descriptions (removes bullets and bold, uses `:` separator, adds blank lines between params)
+  - Return types and descriptions
+- Preserves markdown structure and code fences (tag-like `<...>` text inside code blocks is still stripped)
+- Can process single files or entire directories
+
+## Usage
+
+### Single File
+
+```bash
+python3 src/scripts/strip_html_from_md.py input.md -o output.md
+```
+
+Or omit `-o` to overwrite the input file in place:
+
+```bash
+python3 src/scripts/strip_html_from_md.py input.md
+```
+
+### Directory Processing
+
+Process the markdown files directly inside a directory (subdirectories are skipped):
+
+```bash
+python3 src/scripts/strip_html_from_md.py docs/ -o clean_docs/
+```
+
+Process recursively:
+
+```bash
+python3 src/scripts/strip_html_from_md.py docs/ -o clean_docs/ --recursive
+```
+
+## Examples
+
+### Before (with HTML)
+
+```markdown
+<div class="docstring border-l-2 border-t-2 pl-4 pt-3.5">
+
+<docstring><name>class transformers.BertConfig</name><anchor>transformers.BertConfig</anchor><source>https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/bert/configuration_bert.py#L29</source><paramsdesc>- **vocab_size** (`int`, *optional*, defaults to 30522) -- Vocabulary size of the BERT model.
+- **hidden_size** (`int`, *optional*, defaults to 768) -- Dimensionality of the encoder layers and the pooler layer.</paramsdesc></docstring>
+
+This is the configuration class to store the configuration of a BertModel.
+
+</div>
+```
+
+### After (clean markdown)
+
+```markdown
+#### transformers.BertConfig[[transformers.BertConfig]]
+
+[Source](https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/bert/configuration_bert.py#L29)
+
+This is the configuration class to store the configuration of a BertModel.
+
+**Parameters:**
+
+vocab_size (`int`, *optional*, defaults to 30522) : Vocabulary size of the BERT model.
+
+hidden_size (`int`, *optional*, defaults to 768) : Dimensionality of the encoder layers and the pooler layer.
+```
+
+## Command-line Options
+
+- `input` - Input markdown file or directory (required)
+- `-o, --output` - Output file or directory (optional, defaults to overwriting input)
+- `-r, --recursive` - Process directory recursively (optional)
+
+## What Gets Stripped
+
+The script removes:
+- `<div>` tags and their attributes
+- `<docstring>` and nested tags (`<name>`, `<anchor>`, `<source>`, etc.)
+- Component tags: `<Tip>`, `<ExampleCodeBlock>`, `<hfoptions>`, `<hfoption>`, etc.
+- `<EditOnGithub>` links
+- HTML comments
+- Any other HTML tags
+
+## What Gets Preserved
+
+- Markdown syntax (headers, lists, code blocks, links, etc.)
+- Text content from within HTML tags
+- Code block fences and their text content (tag-like `<...>` sequences inside them are stripped like any other tag)
+- Markdown link URLs and formatting (`[text](url)`; angle-bracket autolinks such as `<https://...>` are removed as tags)
+
diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py
new file mode 100644
index 00000000..0643f145
--- /dev/null
+++ b/src/scripts/strip_html_from_md.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+"""
+Script to strip HTML from markdown files and convert docstrings to clean markdown.
+
+Specifically handles docstring blocks by:
+- Removing all HTML tags
+- Converting class/function names to level 4 headers (####)
+- Preserving the text content
+"""
+
+import re
+import argparse
+from pathlib import Path
+from html.parser import HTMLParser
+from typing import Optional
+
+
+class HTMLStripper(HTMLParser):
+    """Helper class to strip HTML tags while preserving text content."""
+    
+    def __init__(self):
+        super().__init__()
+        self.reset()
+        self.strict = False
+        self.convert_charrefs = True
+        self.text = []
+    
+    def handle_data(self, data):
+        self.text.append(data)
+    
+    def get_data(self):
+        return ''.join(self.text)
+
+
+def strip_html_tags(text: str) -> str:
+    """Strip HTML tags from text while preserving content."""
+    stripper = HTMLStripper()
+    stripper.feed(text)
+    return stripper.get_data()
+
+
+def extract_docstring_info(docstring_block: str) -> dict:
+    """Extract information from a docstring block."""
+    info = {
+        'name': None,
+        'anchor': None,
+        'source': None,
+        'parameters': None,
+        'paramsdesc': None,
+        'rettype': None,
+        'retdesc': None,
+        'description': None
+    }
+    
+    # Extract name
+    name_match = re.search(r'<name>(.*?)</name>', docstring_block, re.DOTALL)
+    if name_match:
+        raw_name = name_match.group(1).strip()
+        # Remove "class " or "def " prefix if present
+        cleaned_name = re.sub(r'^(class|def)\s+', '', raw_name)
+        info['name'] = cleaned_name
+    
+    # Extract anchor
+    anchor_match = re.search(r'<anchor>(.*?)</anchor>', docstring_block, re.DOTALL)
+    if anchor_match:
+        info['anchor'] = anchor_match.group(1).strip()
+    
+    # Extract source
+    source_match = re.search(r'<source>(.*?)</source>', docstring_block, re.DOTALL)
+    if source_match:
+        info['source'] = source_match.group(1).strip()
+    
+    # Extract parameters description
+    paramsdesc_match = re.search(r'<paramsdesc>(.*?)</paramsdesc>', docstring_block, re.DOTALL)
+    if paramsdesc_match:
+        info['paramsdesc'] = paramsdesc_match.group(1).strip()
+    
+    # Extract return type
+    rettype_match = re.search(r'<rettype>(.*?)</rettype>', docstring_block, re.DOTALL)
+    if rettype_match:
+        info['rettype'] = rettype_match.group(1).strip()
+    
+    # Extract return description
+    retdesc_match = re.search(r'<retdesc>(.*?)</retdesc>', docstring_block, re.DOTALL)
+    if retdesc_match:
+        info['retdesc'] = retdesc_match.group(1).strip()
+    
+    # Extract text outside docstring tags but inside the div
+    # This is the description text
+    description_match = re.search(r'</docstring>(.*?)(?:</div>|$)', docstring_block, re.DOTALL)
+    if description_match:
+        desc_text = description_match.group(1).strip()
+        # Remove any remaining HTML tags
+        desc_text = re.sub(r'<[^>]+>', '', desc_text)
+        if desc_text:
+            info['description'] = desc_text
+    
+    return info
+
+
+def format_parameters(paramsdesc: str) -> str:
+    """
+    Format parameter descriptions by:
+    - Removing bullets (-)
+    - Removing bold formatting (**)
+    - Changing -- to :
+    - Adding blank lines between parameters
+    """
+    lines = paramsdesc.split('\n')
+    formatted_params = []
+    current_param = []
+    
+    for line in lines:
+        # Check if this is a new parameter line (starts with "- **")
+        if re.match(r'^\s*-\s+\*\*', line):
+            # Save the previous parameter if exists
+            if current_param:
+                param_text = ' '.join(current_param)
+                # Remove - and ** formatting
+                param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text)
+                # Change -- to :
+                param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1)
+                formatted_params.append(param_text)
+                formatted_params.append('')  # Add blank line between parameters
+                current_param = []
+            
+            # Start new parameter
+            current_param.append(line)
+        elif current_param:
+            # Continuation of current parameter description
+            current_param.append(line.strip())
+    
+    # Don't forget the last parameter
+    if current_param:
+        param_text = ' '.join(current_param)
+        param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text)
+        param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1)
+        formatted_params.append(param_text)
+    
+    return '\n'.join(formatted_params)
+
+
+def process_docstring_block(docstring_block: str) -> str:
+    """
+    Process a docstring block by:
+    1. Extracting the class/function name and relevant info
+    2. Stripping all HTML tags
+    3. Converting to clean markdown with level 4 header
+    """
+    # Extract structured information from the docstring
+    info = extract_docstring_info(docstring_block)
+    
+    # Build the cleaned markdown
+    parts = []
+    
+    # Add the name as level 4 header with anchor
+    if info['name']:
+        if info['anchor']:
+            parts.append(f"#### {info['name']}[[{info['anchor']}]]")
+        else:
+            parts.append(f"#### {info['name']}")
+        parts.append("")
+    
+    # Add source link if available
+    if info['source']:
+        # Strip any HTML from source
+        source_clean = strip_html_tags(info['source'])
+        parts.append(f"[Source]({source_clean})")
+        parts.append("")
+    
+    # Add description
+    if info['description']:
+        parts.append(info['description'])
+        parts.append("")
+    
+    # Add parameters description
+    if info['paramsdesc']:
+        parts.append("**Parameters:**")
+        parts.append("")
+        # Format parameters: remove bullets and bold, change -- to :, add blank lines
+        formatted_params = format_parameters(info['paramsdesc'])
+        parts.append(formatted_params)
+        parts.append("")
+    
+    # Add return type
+    if info['rettype']:
+        parts.append("**Returns:**")
+        parts.append("")
+        # Strip HTML tags from return type
+        rettype_clean = strip_html_tags(info['rettype'])
+        parts.append(f"`{rettype_clean}`")
+        parts.append("")
+    
+    # Add return description
+    if info['retdesc']:
+        if not info['rettype']:
+            parts.append("**Returns:**")
+            parts.append("")
+        parts.append(info['retdesc'])
+        parts.append("")
+    
+    result = '\n'.join(parts)
+    
+    # Clean up excessive newlines
+    result = re.sub(r'\n{3,}', '\n\n', result)
+    
+    return result.strip()
+
+
+def strip_html_from_markdown(content: str) -> str:
+    """
+    Strip HTML from markdown content.
+    
+    Handles:
+    - Docstring blocks wrapped in <div class="docstring...">...</div>
+    - Other HTML tags throughout the document
+    """
+    result = content
+    
+    # Process docstring blocks with their wrapping divs
+    # Pattern to match: <div class="docstring...">...<docstring>...</docstring>...</div>
+    docstring_pattern = r'<div[^>]*class="docstring[^"]*"[^>]*>.*?<docstring>.*?</docstring>.*?</div>'
+    
+    def replace_docstring(match):
+        block = match.group(0)
+        return process_docstring_block(block)
+    
+    result = re.sub(docstring_pattern, replace_docstring, result, flags=re.DOTALL)
+    
+    # Strip remaining HTML tags (like <Tip>, </Tip>, <ExampleCodeBlock>, etc.)
+    # But preserve markdown code blocks
+    result = strip_remaining_html(result)
+    
+    return result
+
+
+def strip_remaining_html(content: str) -> str:
+    """
+    Strip remaining HTML tags while preserving markdown structure.
+    Handles tags like <Tip>, <ExampleCodeBlock>, etc.
+    """
+    # Remove HTML comments
+    content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+    
+    # Remove common component tags while preserving their content
+    # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.)
+    tags_to_remove = [
+        'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption',
+        'EditOnGithub', 'div', 'span', 'anchor'
+    ]
+    
+    for tag in tags_to_remove:
+        # Remove opening tags with any attributes
+        content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE)
+        # Remove closing tags
+        content = re.sub(rf'</{tag}>', '', content, flags=re.IGNORECASE)
+    
+    # Remove any remaining HTML tags (generic cleanup)
+    # This is more aggressive but preserves text content
+    content = re.sub(r'<[^>]+>', '', content)
+    
+    # Clean up multiple consecutive blank lines
+    content = re.sub(r'\n{3,}', '\n\n', content)
+    
+    return content
+
+
+def process_file(input_path: Path, output_path: Optional[Path] = None) -> None:
+    """
+    Process a markdown file to strip HTML.
+    
+    Args:
+        input_path: Path to input markdown file
+        output_path: Path to output file (if None, overwrites input)
+    """
+    # Read the input file
+    content = input_path.read_text(encoding='utf-8')
+    
+    # Process the content
+    cleaned_content = strip_html_from_markdown(content)
+    
+    # Write to output file
+    if output_path is None:
+        output_path = input_path
+    
+    output_path.write_text(cleaned_content, encoding='utf-8')
+    print(f"Processed: {input_path} -> {output_path}")
+
+
+def main():
+    """Main entry point for the script."""
+    parser = argparse.ArgumentParser(
+        description='Strip HTML from markdown files and convert docstrings to clean markdown.'
+    )
+    parser.add_argument(
+        'input',
+        type=str,
+        help='Input markdown file or directory'
+    )
+    parser.add_argument(
+        '-o', '--output',
+        type=str,
+        default=None,
+        help='Output file or directory (defaults to overwriting input)'
+    )
+    parser.add_argument(
+        '-r', '--recursive',
+        action='store_true',
+        help='Process directory recursively'
+    )
+    
+    args = parser.parse_args()
+    
+    input_path = Path(args.input)
+    output_path = Path(args.output) if args.output else None
+    
+    if input_path.is_file():
+        # Process single file
+        process_file(input_path, output_path)
+    elif input_path.is_dir():
+        # Process directory
+        pattern = '**/*.md' if args.recursive else '*.md'
+        md_files = list(input_path.glob(pattern))
+        
+        if not md_files:
+            print(f"No markdown files found in {input_path}")
+            return
+        
+        for md_file in md_files:
+            if output_path:
+                # Preserve directory structure in output
+                relative_path = md_file.relative_to(input_path)
+                out_file = output_path / relative_path
+                out_file.parent.mkdir(parents=True, exist_ok=True)
+            else:
+                out_file = None
+            
+            process_file(md_file, out_file)
+        
+        print(f"\nProcessed {len(md_files)} file(s)")
+    else:
+        print(f"Error: {input_path} is not a valid file or directory")
+        return 1
+    
+    return 0
+
+
+if __name__ == '__main__':
+    exit(main())
+

From d8e0cd1986d1dfcaaa43f56405b1a4d289bcd6ea Mon Sep 17 00:00:00 2001
From: Mishig Davaadorj <dmishig@gmail.com>
Date: Wed, 12 Nov 2025 14:40:43 +0100
Subject: [PATCH 2/4] ruff fix

---
 src/scripts/strip_html_from_md.py | 103 +++++++++++++++---------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py
index 0643f145..2ea5e7df 100644
--- a/src/scripts/strip_html_from_md.py
+++ b/src/scripts/strip_html_from_md.py
@@ -8,26 +8,25 @@
 - Preserving the text content
 """
 
-import re
 import argparse
-from pathlib import Path
+import re
 from html.parser import HTMLParser
-from typing import Optional
+from pathlib import Path
 
 
 class HTMLStripper(HTMLParser):
     """Helper class to strip HTML tags while preserving text content."""
-    
+
     def __init__(self):
         super().__init__()
         self.reset()
         self.strict = False
         self.convert_charrefs = True
         self.text = []
-    
+
     def handle_data(self, data):
         self.text.append(data)
-    
+
     def get_data(self):
         return ''.join(self.text)
 
@@ -51,7 +50,7 @@ def extract_docstring_info(docstring_block: str) -> dict:
         'retdesc': None,
         'description': None
     }
-    
+
     # Extract name
     name_match = re.search(r'<name>(.*?)</name>', docstring_block, re.DOTALL)
     if name_match:
@@ -59,32 +58,32 @@ def extract_docstring_info(docstring_block: str) -> dict:
         # Remove "class " or "def " prefix if present
         cleaned_name = re.sub(r'^(class|def)\s+', '', raw_name)
         info['name'] = cleaned_name
-    
+
     # Extract anchor
     anchor_match = re.search(r'<anchor>(.*?)</anchor>', docstring_block, re.DOTALL)
     if anchor_match:
         info['anchor'] = anchor_match.group(1).strip()
-    
+
     # Extract source
     source_match = re.search(r'<source>(.*?)</source>', docstring_block, re.DOTALL)
     if source_match:
         info['source'] = source_match.group(1).strip()
-    
+
     # Extract parameters description
     paramsdesc_match = re.search(r'<paramsdesc>(.*?)</paramsdesc>', docstring_block, re.DOTALL)
     if paramsdesc_match:
         info['paramsdesc'] = paramsdesc_match.group(1).strip()
-    
+
     # Extract return type
     rettype_match = re.search(r'<rettype>(.*?)</rettype>', docstring_block, re.DOTALL)
     if rettype_match:
         info['rettype'] = rettype_match.group(1).strip()
-    
+
     # Extract return description
     retdesc_match = re.search(r'<retdesc>(.*?)</retdesc>', docstring_block, re.DOTALL)
     if retdesc_match:
         info['retdesc'] = retdesc_match.group(1).strip()
-    
+
     # Extract text outside docstring tags but inside the div
     # This is the description text
     description_match = re.search(r'</docstring>(.*?)(?:</div>|$)', docstring_block, re.DOTALL)
@@ -94,7 +93,7 @@ def extract_docstring_info(docstring_block: str) -> dict:
         desc_text = re.sub(r'<[^>]+>', '', desc_text)
         if desc_text:
             info['description'] = desc_text
-    
+
     return info
 
 
@@ -109,7 +108,7 @@ def format_parameters(paramsdesc: str) -> str:
     lines = paramsdesc.split('\n')
     formatted_params = []
     current_param = []
-    
+
     for line in lines:
         # Check if this is a new parameter line (starts with "- **")
         if re.match(r'^\s*-\s+\*\*', line):
@@ -123,20 +122,20 @@ def format_parameters(paramsdesc: str) -> str:
                 formatted_params.append(param_text)
                 formatted_params.append('')  # Add blank line between parameters
                 current_param = []
-            
+
             # Start new parameter
             current_param.append(line)
         elif current_param:
             # Continuation of current parameter description
             current_param.append(line.strip())
-    
+
     # Don't forget the last parameter
     if current_param:
         param_text = ' '.join(current_param)
         param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text)
         param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1)
         formatted_params.append(param_text)
-    
+
     return '\n'.join(formatted_params)
 
 
@@ -149,10 +148,10 @@ def process_docstring_block(docstring_block: str) -> str:
     """
     # Extract structured information from the docstring
     info = extract_docstring_info(docstring_block)
-    
+
     # Build the cleaned markdown
     parts = []
-    
+
     # Add the name as level 4 header with anchor
     if info['name']:
         if info['anchor']:
@@ -160,19 +159,19 @@ def process_docstring_block(docstring_block: str) -> str:
         else:
             parts.append(f"#### {info['name']}")
         parts.append("")
-    
+
     # Add source link if available
     if info['source']:
         # Strip any HTML from source
         source_clean = strip_html_tags(info['source'])
         parts.append(f"[Source]({source_clean})")
         parts.append("")
-    
+
     # Add description
     if info['description']:
         parts.append(info['description'])
         parts.append("")
-    
+
     # Add parameters description
     if info['paramsdesc']:
         parts.append("**Parameters:**")
@@ -181,7 +180,7 @@ def process_docstring_block(docstring_block: str) -> str:
         formatted_params = format_parameters(info['paramsdesc'])
         parts.append(formatted_params)
         parts.append("")
-    
+
     # Add return type
     if info['rettype']:
         parts.append("**Returns:**")
@@ -190,7 +189,7 @@ def process_docstring_block(docstring_block: str) -> str:
         rettype_clean = strip_html_tags(info['rettype'])
         parts.append(f"`{rettype_clean}`")
         parts.append("")
-    
+
     # Add return description
     if info['retdesc']:
         if not info['rettype']:
@@ -198,39 +197,39 @@ def process_docstring_block(docstring_block: str) -> str:
             parts.append("")
         parts.append(info['retdesc'])
         parts.append("")
-    
+
     result = '\n'.join(parts)
-    
+
     # Clean up excessive newlines
     result = re.sub(r'\n{3,}', '\n\n', result)
-    
+
     return result.strip()
 
 
 def strip_html_from_markdown(content: str) -> str:
     """
     Strip HTML from markdown content.
-    
+
     Handles:
     - Docstring blocks wrapped in <div class="docstring...">...</div>
     - Other HTML tags throughout the document
     """
     result = content
-    
+
     # Process docstring blocks with their wrapping divs
     # Pattern to match: <div class="docstring...">...<docstring>...</docstring>...</div>
     docstring_pattern = r'<div[^>]*class="docstring[^"]*"[^>]*>.*?<docstring>.*?</docstring>.*?</div>'
-    
+
     def replace_docstring(match):
         block = match.group(0)
         return process_docstring_block(block)
-    
+
     result = re.sub(docstring_pattern, replace_docstring, result, flags=re.DOTALL)
-    
+
     # Strip remaining HTML tags (like <Tip>, </Tip>, <ExampleCodeBlock>, etc.)
     # But preserve markdown code blocks
     result = strip_remaining_html(result)
-    
+
     return result
 
 
@@ -241,48 +240,48 @@ def strip_remaining_html(content: str) -> str:
     """
     # Remove HTML comments
     content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
-    
+
     # Remove common component tags while preserving their content
     # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.)
     tags_to_remove = [
         'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption',
         'EditOnGithub', 'div', 'span', 'anchor'
     ]
-    
+
     for tag in tags_to_remove:
         # Remove opening tags with any attributes
         content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE)
         # Remove closing tags
         content = re.sub(rf'</{tag}>', '', content, flags=re.IGNORECASE)
-    
+
     # Remove any remaining HTML tags (generic cleanup)
     # This is more aggressive but preserves text content
     content = re.sub(r'<[^>]+>', '', content)
-    
+
     # Clean up multiple consecutive blank lines
     content = re.sub(r'\n{3,}', '\n\n', content)
-    
+
     return content
 
 
-def process_file(input_path: Path, output_path: Optional[Path] = None) -> None:
+def process_file(input_path: Path, output_path: Path | None = None) -> None:
     """
     Process a markdown file to strip HTML.
-    
+
     Args:
         input_path: Path to input markdown file
         output_path: Path to output file (if None, overwrites input)
     """
     # Read the input file
     content = input_path.read_text(encoding='utf-8')
-    
+
     # Process the content
     cleaned_content = strip_html_from_markdown(content)
-    
+
     # Write to output file
     if output_path is None:
         output_path = input_path
-    
+
     output_path.write_text(cleaned_content, encoding='utf-8')
     print(f"Processed: {input_path} -> {output_path}")
 
@@ -308,12 +307,12 @@ def main():
         action='store_true',
         help='Process directory recursively'
     )
-    
+
     args = parser.parse_args()
-    
+
     input_path = Path(args.input)
     output_path = Path(args.output) if args.output else None
-    
+
     if input_path.is_file():
         # Process single file
         process_file(input_path, output_path)
@@ -321,11 +320,11 @@ def main():
         # Process directory
         pattern = '**/*.md' if args.recursive else '*.md'
         md_files = list(input_path.glob(pattern))
-        
+
         if not md_files:
             print(f"No markdown files found in {input_path}")
             return
-        
+
         for md_file in md_files:
             if output_path:
                 # Preserve directory structure in output
@@ -334,14 +333,14 @@ def main():
                 out_file.parent.mkdir(parents=True, exist_ok=True)
             else:
                 out_file = None
-            
+
             process_file(md_file, out_file)
-        
+
         print(f"\nProcessed {len(md_files)} file(s)")
     else:
         print(f"Error: {input_path} is not a valid file or directory")
         return 1
-    
+
     return 0
 
 

From 71cfc5220424a18b2a3820acae6b0ce4335fdb62 Mon Sep 17 00:00:00 2001
From: Mishig Davaadorj <dmishig@gmail.com>
Date: Wed, 12 Nov 2025 14:42:05 +0100
Subject: [PATCH 3/4] format

---
 src/scripts/strip_html_from_md.py | 149 +++++++++++++++---------------
 1 file changed, 72 insertions(+), 77 deletions(-)

diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py
index 2ea5e7df..43a7f4fe 100644
--- a/src/scripts/strip_html_from_md.py
+++ b/src/scripts/strip_html_from_md.py
@@ -28,7 +28,7 @@ def handle_data(self, data):
         self.text.append(data)
 
     def get_data(self):
-        return ''.join(self.text)
+        return "".join(self.text)
 
 
 def strip_html_tags(text: str) -> str:
@@ -41,58 +41,58 @@ def strip_html_tags(text: str) -> str:
 def extract_docstring_info(docstring_block: str) -> dict:
     """Extract information from a docstring block."""
     info = {
-        'name': None,
-        'anchor': None,
-        'source': None,
-        'parameters': None,
-        'paramsdesc': None,
-        'rettype': None,
-        'retdesc': None,
-        'description': None
+        "name": None,
+        "anchor": None,
+        "source": None,
+        "parameters": None,
+        "paramsdesc": None,
+        "rettype": None,
+        "retdesc": None,
+        "description": None,
     }
 
     # Extract name
-    name_match = re.search(r'<name>(.*?)</name>', docstring_block, re.DOTALL)
+    name_match = re.search(r"<name>(.*?)</name>", docstring_block, re.DOTALL)
     if name_match:
         raw_name = name_match.group(1).strip()
         # Remove "class " or "def " prefix if present
-        cleaned_name = re.sub(r'^(class|def)\s+', '', raw_name)
-        info['name'] = cleaned_name
+        cleaned_name = re.sub(r"^(class|def)\s+", "", raw_name)
+        info["name"] = cleaned_name
 
     # Extract anchor
-    anchor_match = re.search(r'<anchor>(.*?)</anchor>', docstring_block, re.DOTALL)
+    anchor_match = re.search(r"<anchor>(.*?)</anchor>", docstring_block, re.DOTALL)
     if anchor_match:
-        info['anchor'] = anchor_match.group(1).strip()
+        info["anchor"] = anchor_match.group(1).strip()
 
     # Extract source
-    source_match = re.search(r'<source>(.*?)</source>', docstring_block, re.DOTALL)
+    source_match = re.search(r"<source>(.*?)</source>", docstring_block, re.DOTALL)
     if source_match:
-        info['source'] = source_match.group(1).strip()
+        info["source"] = source_match.group(1).strip()
 
     # Extract parameters description
-    paramsdesc_match = re.search(r'<paramsdesc>(.*?)</paramsdesc>', docstring_block, re.DOTALL)
+    paramsdesc_match = re.search(r"<paramsdesc>(.*?)</paramsdesc>", docstring_block, re.DOTALL)
     if paramsdesc_match:
-        info['paramsdesc'] = paramsdesc_match.group(1).strip()
+        info["paramsdesc"] = paramsdesc_match.group(1).strip()
 
     # Extract return type
-    rettype_match = re.search(r'<rettype>(.*?)</rettype>', docstring_block, re.DOTALL)
+    rettype_match = re.search(r"<rettype>(.*?)</rettype>", docstring_block, re.DOTALL)
     if rettype_match:
-        info['rettype'] = rettype_match.group(1).strip()
+        info["rettype"] = rettype_match.group(1).strip()
 
     # Extract return description
-    retdesc_match = re.search(r'<retdesc>(.*?)</retdesc>', docstring_block, re.DOTALL)
+    retdesc_match = re.search(r"<retdesc>(.*?)</retdesc>", docstring_block, re.DOTALL)
     if retdesc_match:
-        info['retdesc'] = retdesc_match.group(1).strip()
+        info["retdesc"] = retdesc_match.group(1).strip()
 
     # Extract text outside docstring tags but inside the div
     # This is the description text
-    description_match = re.search(r'</docstring>(.*?)(?:</div>|$)', docstring_block, re.DOTALL)
+    description_match = re.search(r"</docstring>(.*?)(?:</div>|$)", docstring_block, re.DOTALL)
     if description_match:
         desc_text = description_match.group(1).strip()
         # Remove any remaining HTML tags
-        desc_text = re.sub(r'<[^>]+>', '', desc_text)
+        desc_text = re.sub(r"<[^>]+>", "", desc_text)
         if desc_text:
-            info['description'] = desc_text
+            info["description"] = desc_text
 
     return info
 
@@ -105,22 +105,22 @@ def format_parameters(paramsdesc: str) -> str:
     - Changing -- to :
     - Adding blank lines between parameters
     """
-    lines = paramsdesc.split('\n')
+    lines = paramsdesc.split("\n")
     formatted_params = []
     current_param = []
 
     for line in lines:
         # Check if this is a new parameter line (starts with "- **")
-        if re.match(r'^\s*-\s+\*\*', line):
+        if re.match(r"^\s*-\s+\*\*", line):
             # Save the previous parameter if exists
             if current_param:
-                param_text = ' '.join(current_param)
+                param_text = " ".join(current_param)
                 # Remove - and ** formatting
-                param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text)
+                param_text = re.sub(r"^\s*-\s+\*\*([^*]+)\*\*", r"\1", param_text)
                 # Change -- to :
-                param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1)
+                param_text = re.sub(r"\s+--\s+", " : ", param_text, count=1)
                 formatted_params.append(param_text)
-                formatted_params.append('')  # Add blank line between parameters
+                formatted_params.append("")  # Add blank line between parameters
                 current_param = []
 
             # Start new parameter
@@ -131,12 +131,12 @@ def format_parameters(paramsdesc: str) -> str:
 
     # Don't forget the last parameter
     if current_param:
-        param_text = ' '.join(current_param)
-        param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text)
-        param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1)
+        param_text = " ".join(current_param)
+        param_text = re.sub(r"^\s*-\s+\*\*([^*]+)\*\*", r"\1", param_text)
+        param_text = re.sub(r"\s+--\s+", " : ", param_text, count=1)
         formatted_params.append(param_text)
 
-    return '\n'.join(formatted_params)
+    return "\n".join(formatted_params)
 
 
 def process_docstring_block(docstring_block: str) -> str:
@@ -153,55 +153,55 @@ def process_docstring_block(docstring_block: str) -> str:
     parts = []
 
     # Add the name as level 4 header with anchor
-    if info['name']:
-        if info['anchor']:
+    if info["name"]:
+        if info["anchor"]:
             parts.append(f"#### {info['name']}[[{info['anchor']}]]")
         else:
             parts.append(f"#### {info['name']}")
         parts.append("")
 
     # Add source link if available
-    if info['source']:
+    if info["source"]:
         # Strip any HTML from source
-        source_clean = strip_html_tags(info['source'])
+        source_clean = strip_html_tags(info["source"])
         parts.append(f"[Source]({source_clean})")
         parts.append("")
 
     # Add description
-    if info['description']:
-        parts.append(info['description'])
+    if info["description"]:
+        parts.append(info["description"])
         parts.append("")
 
     # Add parameters description
-    if info['paramsdesc']:
+    if info["paramsdesc"]:
         parts.append("**Parameters:**")
         parts.append("")
         # Format parameters: remove bullets and bold, change -- to :, add blank lines
-        formatted_params = format_parameters(info['paramsdesc'])
+        formatted_params = format_parameters(info["paramsdesc"])
         parts.append(formatted_params)
         parts.append("")
 
     # Add return type
-    if info['rettype']:
+    if info["rettype"]:
         parts.append("**Returns:**")
         parts.append("")
         # Strip HTML tags from return type
-        rettype_clean = strip_html_tags(info['rettype'])
+        rettype_clean = strip_html_tags(info["rettype"])
         parts.append(f"`{rettype_clean}`")
         parts.append("")
 
     # Add return description
-    if info['retdesc']:
-        if not info['rettype']:
+    if info["retdesc"]:
+        if not info["rettype"]:
             parts.append("**Returns:**")
             parts.append("")
-        parts.append(info['retdesc'])
+        parts.append(info["retdesc"])
         parts.append("")
 
-    result = '\n'.join(parts)
+    result = "\n".join(parts)
 
     # Clean up excessive newlines
-    result = re.sub(r'\n{3,}', '\n\n', result)
+    result = re.sub(r"\n{3,}", "\n\n", result)
 
     return result.strip()
 
@@ -239,27 +239,34 @@ def strip_remaining_html(content: str) -> str:
     Handles tags like <Tip>, <ExampleCodeBlock>, etc.
     """
     # Remove HTML comments
-    content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+    content = re.sub(r"<!--.*?-->", "", content, flags=re.DOTALL)
 
     # Remove common component tags while preserving their content
     # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.)
     tags_to_remove = [
-        'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption',
-        'EditOnGithub', 'div', 'span', 'anchor'
+        "Tip",
+        "TipEnd",
+        "ExampleCodeBlock",
+        "hfoptions",
+        "hfoption",
+        "EditOnGithub",
+        "div",
+        "span",
+        "anchor",
     ]
 
     for tag in tags_to_remove:
         # Remove opening tags with any attributes
-        content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE)
+        content = re.sub(rf"<{tag}[^>]*>", "", content, flags=re.IGNORECASE)
         # Remove closing tags
-        content = re.sub(rf'</{tag}>', '', content, flags=re.IGNORECASE)
+        content = re.sub(rf"</{tag}>", "", content, flags=re.IGNORECASE)
 
     # Remove any remaining HTML tags (generic cleanup)
     # This is more aggressive but preserves text content
-    content = re.sub(r'<[^>]+>', '', content)
+    content = re.sub(r"<[^>]+>", "", content)
 
     # Clean up multiple consecutive blank lines
-    content = re.sub(r'\n{3,}', '\n\n', content)
+    content = re.sub(r"\n{3,}", "\n\n", content)
 
     return content
 
@@ -273,7 +280,7 @@ def process_file(input_path: Path, output_path: Path | None = None) -> None:
         output_path: Path to output file (if None, overwrites input)
     """
     # Read the input file
-    content = input_path.read_text(encoding='utf-8')
+    content = input_path.read_text(encoding="utf-8")
 
     # Process the content
     cleaned_content = strip_html_from_markdown(content)
@@ -282,31 +289,20 @@ def process_file(input_path: Path, output_path: Path | None = None) -> None:
     if output_path is None:
         output_path = input_path
 
-    output_path.write_text(cleaned_content, encoding='utf-8')
+    output_path.write_text(cleaned_content, encoding="utf-8")
     print(f"Processed: {input_path} -> {output_path}")
 
 
 def main():
     """Main entry point for the script."""
     parser = argparse.ArgumentParser(
-        description='Strip HTML from markdown files and convert docstrings to clean markdown.'
+        description="Strip HTML from markdown files and convert docstrings to clean markdown."
     )
+    parser.add_argument("input", type=str, help="Input markdown file or directory")
     parser.add_argument(
-        'input',
-        type=str,
-        help='Input markdown file or directory'
-    )
-    parser.add_argument(
-        '-o', '--output',
-        type=str,
-        default=None,
-        help='Output file or directory (defaults to overwriting input)'
-    )
-    parser.add_argument(
-        '-r', '--recursive',
-        action='store_true',
-        help='Process directory recursively'
+        "-o", "--output", type=str, default=None, help="Output file or directory (defaults to overwriting input)"
     )
+    parser.add_argument("-r", "--recursive", action="store_true", help="Process directory recursively")
 
     args = parser.parse_args()
 
@@ -318,7 +314,7 @@ def main():
         process_file(input_path, output_path)
     elif input_path.is_dir():
         # Process directory
-        pattern = '**/*.md' if args.recursive else '*.md'
+        pattern = "**/*.md" if args.recursive else "*.md"
         md_files = list(input_path.glob(pattern))
 
         if not md_files:
@@ -344,6 +340,5 @@ def main():
     return 0
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
-

From 7ddde3f6a04aa55d8b970bb24f99bb4db8b6593b Mon Sep 17 00:00:00 2001
From: Mishig Davaadorj <dmishig@gmail.com>
Date: Wed, 12 Nov 2025 14:50:18 +0100
Subject: [PATCH 4/4] Add HTML stripping step to documentation build jobs

---
 .github/workflows/build_main_documentation.yml | 7 +++++++
 .github/workflows/build_pr_documentation.yml   | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml
index 20870971..5005de82 100644
--- a/.github/workflows/build_main_documentation.yml
+++ b/.github/workflows/build_main_documentation.yml
@@ -234,6 +234,13 @@ jobs:
 
           cd ..
 
+      - name: Strip HTML from built markdown files
+        run: |
+          source .venv/bin/activate
+          echo "Stripping HTML from markdown files in build_dir"
+          python3 src/scripts/strip_html_from_md.py build_dir/ --recursive
+          echo "HTML stripping complete"
+
       - name: Push to repositories
         run: |
           source .venv/bin/activate
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index ce0e06d2..09bc9747 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -219,6 +219,13 @@ jobs:
           fi
           cd ..
 
+      - name: Strip HTML from built markdown files
+        run: |
+          source .venv/bin/activate
+          echo "Stripping HTML from markdown files in build_dir"
+          python3 src/scripts/strip_html_from_md.py build_dir/ --recursive
+          echo "HTML stripping complete"
+
       - name: Save commit_sha & pr_number
         run: |
           echo ${{ inputs.commit_sha }} > ./build_dir/commit_sha