From f53fd8e209a384d5c7f321f16dcd5348153ad54e Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Wed, 12 Nov 2025 14:38:53 +0100 Subject: [PATCH 1/4] Add HTML stripper script for markdown documentation - Add strip_html_from_md.py script to convert HTML-heavy docs to clean markdown - Strip HTML tags from docstrings and convert to level 4 headers - Format class/function names with anchors: #### ClassName[[anchor]] - Remove 'class' and 'def' prefixes from names - Format parameters without bullets, using : separator with blank lines - Support processing single files or directories recursively - Add comprehensive README with usage examples and documentation --- README_strip_html.md | 99 +++++++++ src/scripts/strip_html_from_md.py | 350 ++++++++++++++++++++++++++++++ 2 files changed, 449 insertions(+) create mode 100644 README_strip_html.md create mode 100644 src/scripts/strip_html_from_md.py diff --git a/README_strip_html.md b/README_strip_html.md new file mode 100644 index 00000000..68e48887 --- /dev/null +++ b/README_strip_html.md @@ -0,0 +1,99 @@ +# HTML Stripper for Markdown Documentation + +This script strips HTML tags from markdown files and converts docstrings to clean markdown format. 
+ +## Features + +- Removes all HTML tags while preserving content +- Converts docstring blocks to clean markdown with level 4 headers (`####`) +- Extracts and formats: + - Class/function names as headers (removes `class` and `def` prefixes) + - Anchors in double square brackets format: `[[anchor]]` + - Source links + - Parameter descriptions (removes bullets and bold, uses `:` separator, adds blank lines between params) + - Return types and descriptions +- Preserves markdown code blocks and structure +- Can process single files or entire directories + +## Usage + +### Single File + +```bash +python3 src/scripts/strip_html_from_md.py input.md -o output.md +``` + +Or overwrite the input file: + +```bash +python3 src/scripts/strip_html_from_md.py input.md +``` + +### Directory Processing + +Process all markdown files in a directory: + +```bash +python3 src/scripts/strip_html_from_md.py docs/ -o clean_docs/ +``` + +Process recursively: + +```bash +python3 src/scripts/strip_html_from_md.py docs/ -o clean_docs/ --recursive +``` + +## Examples + +### Before (with HTML) + +```markdown +
<div class="docstring"> + +<docstring><name>class transformers.BertConfig</name><anchor>transformers.BertConfig</anchor><source>https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/bert/configuration_bert.py#L29</source><paramsdesc>- **vocab_size** (`int`, *optional*, defaults to 30522) -- + Vocabulary size of the BERT model.</paramsdesc></docstring> + +This is the configuration class to store the configuration of a BertModel. + +</div>
+``` + +### After (clean markdown) + +```markdown +#### transformers.BertConfig[[transformers.BertConfig]] + +[Source](https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/bert/configuration_bert.py#L29) + +This is the configuration class to store the configuration of a BertModel. + +**Parameters:** + +vocab_size (`int`, *optional*, defaults to 30522) : Vocabulary size of the BERT model. + +hidden_size (`int`, *optional*, defaults to 768) : Dimensionality of the encoder layers and the pooler layer. +``` + +## Command-line Options + +- `input` - Input markdown file or directory (required) +- `-o, --output` - Output file or directory (optional, defaults to overwriting input) +- `-r, --recursive` - Process directory recursively (optional) + +## What Gets Stripped + +The script removes: +- `
` tags and their attributes +- `` and nested tags (``, ``, ``, etc.) +- Component tags: ``, ``, ``, ``, etc. +- `` links +- HTML comments +- Any other HTML tags + +## What Gets Preserved + +- Markdown syntax (headers, lists, code blocks, links, etc.) +- Text content from within HTML tags +- Code blocks (backtick-fenced) +- Link URLs and formatting + diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py new file mode 100644 index 00000000..0643f145 --- /dev/null +++ b/src/scripts/strip_html_from_md.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +""" +Script to strip HTML from markdown files and convert docstrings to clean markdown. + +Specifically handles docstring blocks by: +- Removing all HTML tags +- Converting class/function names to level 4 headers (####) +- Preserving the text content +""" + +import re +import argparse +from pathlib import Path +from html.parser import HTMLParser +from typing import Optional + + +class HTMLStripper(HTMLParser): + """Helper class to strip HTML tags while preserving text content.""" + + def __init__(self): + super().__init__() + self.reset() + self.strict = False + self.convert_charrefs = True + self.text = [] + + def handle_data(self, data): + self.text.append(data) + + def get_data(self): + return ''.join(self.text) + + +def strip_html_tags(text: str) -> str: + """Strip HTML tags from text while preserving content.""" + stripper = HTMLStripper() + stripper.feed(text) + return stripper.get_data() + + +def extract_docstring_info(docstring_block: str) -> dict: + """Extract information from a docstring block.""" + info = { + 'name': None, + 'anchor': None, + 'source': None, + 'parameters': None, + 'paramsdesc': None, + 'rettype': None, + 'retdesc': None, + 'description': None + } + + # Extract name + name_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if name_match: + raw_name = name_match.group(1).strip() + # Remove "class " or "def " prefix if present + cleaned_name = 
re.sub(r'^(class|def)\s+', '', raw_name) + info['name'] = cleaned_name + + # Extract anchor + anchor_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if anchor_match: + info['anchor'] = anchor_match.group(1).strip() + + # Extract source + source_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if source_match: + info['source'] = source_match.group(1).strip() + + # Extract parameters description + paramsdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if paramsdesc_match: + info['paramsdesc'] = paramsdesc_match.group(1).strip() + + # Extract return type + rettype_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if rettype_match: + info['rettype'] = rettype_match.group(1).strip() + + # Extract return description + retdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + if retdesc_match: + info['retdesc'] = retdesc_match.group(1).strip() + + # Extract text outside docstring tags but inside the div + # This is the description text + description_match = re.search(r'(.*?)(?:
|$)', docstring_block, re.DOTALL) + if description_match: + desc_text = description_match.group(1).strip() + # Remove any remaining HTML tags + desc_text = re.sub(r'<[^>]+>', '', desc_text) + if desc_text: + info['description'] = desc_text + + return info + + +def format_parameters(paramsdesc: str) -> str: + """ + Format parameter descriptions by: + - Removing bullets (-) + - Removing bold formatting (**) + - Changing -- to : + - Adding blank lines between parameters + """ + lines = paramsdesc.split('\n') + formatted_params = [] + current_param = [] + + for line in lines: + # Check if this is a new parameter line (starts with "- **") + if re.match(r'^\s*-\s+\*\*', line): + # Save the previous parameter if exists + if current_param: + param_text = ' '.join(current_param) + # Remove - and ** formatting + param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text) + # Change -- to : + param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1) + formatted_params.append(param_text) + formatted_params.append('') # Add blank line between parameters + current_param = [] + + # Start new parameter + current_param.append(line) + elif current_param: + # Continuation of current parameter description + current_param.append(line.strip()) + + # Don't forget the last parameter + if current_param: + param_text = ' '.join(current_param) + param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text) + param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1) + formatted_params.append(param_text) + + return '\n'.join(formatted_params) + + +def process_docstring_block(docstring_block: str) -> str: + """ + Process a docstring block by: + 1. Extracting the class/function name and relevant info + 2. Stripping all HTML tags + 3. 
Converting to clean markdown with level 4 header + """ + # Extract structured information from the docstring + info = extract_docstring_info(docstring_block) + + # Build the cleaned markdown + parts = [] + + # Add the name as level 4 header with anchor + if info['name']: + if info['anchor']: + parts.append(f"#### {info['name']}[[{info['anchor']}]]") + else: + parts.append(f"#### {info['name']}") + parts.append("") + + # Add source link if available + if info['source']: + # Strip any HTML from source + source_clean = strip_html_tags(info['source']) + parts.append(f"[Source]({source_clean})") + parts.append("") + + # Add description + if info['description']: + parts.append(info['description']) + parts.append("") + + # Add parameters description + if info['paramsdesc']: + parts.append("**Parameters:**") + parts.append("") + # Format parameters: remove bullets and bold, change -- to :, add blank lines + formatted_params = format_parameters(info['paramsdesc']) + parts.append(formatted_params) + parts.append("") + + # Add return type + if info['rettype']: + parts.append("**Returns:**") + parts.append("") + # Strip HTML tags from return type + rettype_clean = strip_html_tags(info['rettype']) + parts.append(f"`{rettype_clean}`") + parts.append("") + + # Add return description + if info['retdesc']: + if not info['rettype']: + parts.append("**Returns:**") + parts.append("") + parts.append(info['retdesc']) + parts.append("") + + result = '\n'.join(parts) + + # Clean up excessive newlines + result = re.sub(r'\n{3,}', '\n\n', result) + + return result.strip() + + +def strip_html_from_markdown(content: str) -> str: + """ + Strip HTML from markdown content. + + Handles: + - Docstring blocks wrapped in
<div class="docstring">...</div>
+ - Other HTML tags throughout the document + """ + result = content + + # Process docstring blocks with their wrapping divs + # Pattern to match:
<div class="docstring">...<docstring>...</docstring>...</div>
+ docstring_pattern = r']*class="docstring[^"]*"[^>]*>.*?.*?.*?' + + def replace_docstring(match): + block = match.group(0) + return process_docstring_block(block) + + result = re.sub(docstring_pattern, replace_docstring, result, flags=re.DOTALL) + + # Strip remaining HTML tags (like , , , etc.) + # But preserve markdown code blocks + result = strip_remaining_html(result) + + return result + + +def strip_remaining_html(content: str) -> str: + """ + Strip remaining HTML tags while preserving markdown structure. + Handles tags like , , etc. + """ + # Remove HTML comments + content = re.sub(r'', '', content, flags=re.DOTALL) + + # Remove common component tags while preserving their content + # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.) + tags_to_remove = [ + 'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption', + 'EditOnGithub', 'div', 'span', 'anchor' + ] + + for tag in tags_to_remove: + # Remove opening tags with any attributes + content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE) + # Remove closing tags + content = re.sub(rf'', '', content, flags=re.IGNORECASE) + + # Remove any remaining HTML tags (generic cleanup) + # This is more aggressive but preserves text content + content = re.sub(r'<[^>]+>', '', content) + + # Clean up multiple consecutive blank lines + content = re.sub(r'\n{3,}', '\n\n', content) + + return content + + +def process_file(input_path: Path, output_path: Optional[Path] = None) -> None: + """ + Process a markdown file to strip HTML. 
+ + Args: + input_path: Path to input markdown file + output_path: Path to output file (if None, overwrites input) + """ + # Read the input file + content = input_path.read_text(encoding='utf-8') + + # Process the content + cleaned_content = strip_html_from_markdown(content) + + # Write to output file + if output_path is None: + output_path = input_path + + output_path.write_text(cleaned_content, encoding='utf-8') + print(f"Processed: {input_path} -> {output_path}") + + +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description='Strip HTML from markdown files and convert docstrings to clean markdown.' + ) + parser.add_argument( + 'input', + type=str, + help='Input markdown file or directory' + ) + parser.add_argument( + '-o', '--output', + type=str, + default=None, + help='Output file or directory (defaults to overwriting input)' + ) + parser.add_argument( + '-r', '--recursive', + action='store_true', + help='Process directory recursively' + ) + + args = parser.parse_args() + + input_path = Path(args.input) + output_path = Path(args.output) if args.output else None + + if input_path.is_file(): + # Process single file + process_file(input_path, output_path) + elif input_path.is_dir(): + # Process directory + pattern = '**/*.md' if args.recursive else '*.md' + md_files = list(input_path.glob(pattern)) + + if not md_files: + print(f"No markdown files found in {input_path}") + return + + for md_file in md_files: + if output_path: + # Preserve directory structure in output + relative_path = md_file.relative_to(input_path) + out_file = output_path / relative_path + out_file.parent.mkdir(parents=True, exist_ok=True) + else: + out_file = None + + process_file(md_file, out_file) + + print(f"\nProcessed {len(md_files)} file(s)") + else: + print(f"Error: {input_path} is not a valid file or directory") + return 1 + + return 0 + + +if __name__ == '__main__': + exit(main()) + From d8e0cd1986d1dfcaaa43f56405b1a4d289bcd6ea Mon Sep 17 
00:00:00 2001 From: Mishig Davaadorj Date: Wed, 12 Nov 2025 14:40:43 +0100 Subject: [PATCH 2/4] ruff fix --- src/scripts/strip_html_from_md.py | 103 +++++++++++++++--------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py index 0643f145..2ea5e7df 100644 --- a/src/scripts/strip_html_from_md.py +++ b/src/scripts/strip_html_from_md.py @@ -8,26 +8,25 @@ - Preserving the text content """ -import re import argparse -from pathlib import Path +import re from html.parser import HTMLParser -from typing import Optional +from pathlib import Path class HTMLStripper(HTMLParser): """Helper class to strip HTML tags while preserving text content.""" - + def __init__(self): super().__init__() self.reset() self.strict = False self.convert_charrefs = True self.text = [] - + def handle_data(self, data): self.text.append(data) - + def get_data(self): return ''.join(self.text) @@ -51,7 +50,7 @@ def extract_docstring_info(docstring_block: str) -> dict: 'retdesc': None, 'description': None } - + # Extract name name_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if name_match: @@ -59,32 +58,32 @@ def extract_docstring_info(docstring_block: str) -> dict: # Remove "class " or "def " prefix if present cleaned_name = re.sub(r'^(class|def)\s+', '', raw_name) info['name'] = cleaned_name - + # Extract anchor anchor_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if anchor_match: info['anchor'] = anchor_match.group(1).strip() - + # Extract source source_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if source_match: info['source'] = source_match.group(1).strip() - + # Extract parameters description paramsdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if paramsdesc_match: info['paramsdesc'] = paramsdesc_match.group(1).strip() - + # Extract return type rettype_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if rettype_match: info['rettype'] = 
rettype_match.group(1).strip() - + # Extract return description retdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) if retdesc_match: info['retdesc'] = retdesc_match.group(1).strip() - + # Extract text outside docstring tags but inside the div # This is the description text description_match = re.search(r'(.*?)(?:|$)', docstring_block, re.DOTALL) @@ -94,7 +93,7 @@ def extract_docstring_info(docstring_block: str) -> dict: desc_text = re.sub(r'<[^>]+>', '', desc_text) if desc_text: info['description'] = desc_text - + return info @@ -109,7 +108,7 @@ def format_parameters(paramsdesc: str) -> str: lines = paramsdesc.split('\n') formatted_params = [] current_param = [] - + for line in lines: # Check if this is a new parameter line (starts with "- **") if re.match(r'^\s*-\s+\*\*', line): @@ -123,20 +122,20 @@ def format_parameters(paramsdesc: str) -> str: formatted_params.append(param_text) formatted_params.append('') # Add blank line between parameters current_param = [] - + # Start new parameter current_param.append(line) elif current_param: # Continuation of current parameter description current_param.append(line.strip()) - + # Don't forget the last parameter if current_param: param_text = ' '.join(current_param) param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text) param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1) formatted_params.append(param_text) - + return '\n'.join(formatted_params) @@ -149,10 +148,10 @@ def process_docstring_block(docstring_block: str) -> str: """ # Extract structured information from the docstring info = extract_docstring_info(docstring_block) - + # Build the cleaned markdown parts = [] - + # Add the name as level 4 header with anchor if info['name']: if info['anchor']: @@ -160,19 +159,19 @@ def process_docstring_block(docstring_block: str) -> str: else: parts.append(f"#### {info['name']}") parts.append("") - + # Add source link if available if info['source']: # Strip any HTML from source source_clean = 
strip_html_tags(info['source']) parts.append(f"[Source]({source_clean})") parts.append("") - + # Add description if info['description']: parts.append(info['description']) parts.append("") - + # Add parameters description if info['paramsdesc']: parts.append("**Parameters:**") @@ -181,7 +180,7 @@ def process_docstring_block(docstring_block: str) -> str: formatted_params = format_parameters(info['paramsdesc']) parts.append(formatted_params) parts.append("") - + # Add return type if info['rettype']: parts.append("**Returns:**") @@ -190,7 +189,7 @@ def process_docstring_block(docstring_block: str) -> str: rettype_clean = strip_html_tags(info['rettype']) parts.append(f"`{rettype_clean}`") parts.append("") - + # Add return description if info['retdesc']: if not info['rettype']: @@ -198,39 +197,39 @@ def process_docstring_block(docstring_block: str) -> str: parts.append("") parts.append(info['retdesc']) parts.append("") - + result = '\n'.join(parts) - + # Clean up excessive newlines result = re.sub(r'\n{3,}', '\n\n', result) - + return result.strip() def strip_html_from_markdown(content: str) -> str: """ Strip HTML from markdown content. - + Handles: - Docstring blocks wrapped in
<div class="docstring">...</div>
- Other HTML tags throughout the document """ result = content - + # Process docstring blocks with their wrapping divs # Pattern to match:
<div class="docstring">...<docstring>...</docstring>...</div>
docstring_pattern = r']*class="docstring[^"]*"[^>]*>.*?.*?.*?' - + def replace_docstring(match): block = match.group(0) return process_docstring_block(block) - + result = re.sub(docstring_pattern, replace_docstring, result, flags=re.DOTALL) - + # Strip remaining HTML tags (like , , , etc.) # But preserve markdown code blocks result = strip_remaining_html(result) - + return result @@ -241,48 +240,48 @@ def strip_remaining_html(content: str) -> str: """ # Remove HTML comments content = re.sub(r'', '', content, flags=re.DOTALL) - + # Remove common component tags while preserving their content # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.) tags_to_remove = [ 'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption', 'EditOnGithub', 'div', 'span', 'anchor' ] - + for tag in tags_to_remove: # Remove opening tags with any attributes content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE) # Remove closing tags content = re.sub(rf'', '', content, flags=re.IGNORECASE) - + # Remove any remaining HTML tags (generic cleanup) # This is more aggressive but preserves text content content = re.sub(r'<[^>]+>', '', content) - + # Clean up multiple consecutive blank lines content = re.sub(r'\n{3,}', '\n\n', content) - + return content -def process_file(input_path: Path, output_path: Optional[Path] = None) -> None: +def process_file(input_path: Path, output_path: Path | None = None) -> None: """ Process a markdown file to strip HTML. 
- + Args: input_path: Path to input markdown file output_path: Path to output file (if None, overwrites input) """ # Read the input file content = input_path.read_text(encoding='utf-8') - + # Process the content cleaned_content = strip_html_from_markdown(content) - + # Write to output file if output_path is None: output_path = input_path - + output_path.write_text(cleaned_content, encoding='utf-8') print(f"Processed: {input_path} -> {output_path}") @@ -308,12 +307,12 @@ def main(): action='store_true', help='Process directory recursively' ) - + args = parser.parse_args() - + input_path = Path(args.input) output_path = Path(args.output) if args.output else None - + if input_path.is_file(): # Process single file process_file(input_path, output_path) @@ -321,11 +320,11 @@ def main(): # Process directory pattern = '**/*.md' if args.recursive else '*.md' md_files = list(input_path.glob(pattern)) - + if not md_files: print(f"No markdown files found in {input_path}") return - + for md_file in md_files: if output_path: # Preserve directory structure in output @@ -334,14 +333,14 @@ def main(): out_file.parent.mkdir(parents=True, exist_ok=True) else: out_file = None - + process_file(md_file, out_file) - + print(f"\nProcessed {len(md_files)} file(s)") else: print(f"Error: {input_path} is not a valid file or directory") return 1 - + return 0 From 71cfc5220424a18b2a3820acae6b0ce4335fdb62 Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Wed, 12 Nov 2025 14:42:05 +0100 Subject: [PATCH 3/4] format --- src/scripts/strip_html_from_md.py | 149 +++++++++++++++--------------- 1 file changed, 72 insertions(+), 77 deletions(-) diff --git a/src/scripts/strip_html_from_md.py b/src/scripts/strip_html_from_md.py index 2ea5e7df..43a7f4fe 100644 --- a/src/scripts/strip_html_from_md.py +++ b/src/scripts/strip_html_from_md.py @@ -28,7 +28,7 @@ def handle_data(self, data): self.text.append(data) def get_data(self): - return ''.join(self.text) + return "".join(self.text) def 
strip_html_tags(text: str) -> str: @@ -41,58 +41,58 @@ def strip_html_tags(text: str) -> str: def extract_docstring_info(docstring_block: str) -> dict: """Extract information from a docstring block.""" info = { - 'name': None, - 'anchor': None, - 'source': None, - 'parameters': None, - 'paramsdesc': None, - 'rettype': None, - 'retdesc': None, - 'description': None + "name": None, + "anchor": None, + "source": None, + "parameters": None, + "paramsdesc": None, + "rettype": None, + "retdesc": None, + "description": None, } # Extract name - name_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + name_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if name_match: raw_name = name_match.group(1).strip() # Remove "class " or "def " prefix if present - cleaned_name = re.sub(r'^(class|def)\s+', '', raw_name) - info['name'] = cleaned_name + cleaned_name = re.sub(r"^(class|def)\s+", "", raw_name) + info["name"] = cleaned_name # Extract anchor - anchor_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + anchor_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if anchor_match: - info['anchor'] = anchor_match.group(1).strip() + info["anchor"] = anchor_match.group(1).strip() # Extract source - source_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + source_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if source_match: - info['source'] = source_match.group(1).strip() + info["source"] = source_match.group(1).strip() # Extract parameters description - paramsdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + paramsdesc_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if paramsdesc_match: - info['paramsdesc'] = paramsdesc_match.group(1).strip() + info["paramsdesc"] = paramsdesc_match.group(1).strip() # Extract return type - rettype_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + rettype_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if rettype_match: - info['rettype'] = rettype_match.group(1).strip() + 
info["rettype"] = rettype_match.group(1).strip() # Extract return description - retdesc_match = re.search(r'(.*?)', docstring_block, re.DOTALL) + retdesc_match = re.search(r"(.*?)", docstring_block, re.DOTALL) if retdesc_match: - info['retdesc'] = retdesc_match.group(1).strip() + info["retdesc"] = retdesc_match.group(1).strip() # Extract text outside docstring tags but inside the div # This is the description text - description_match = re.search(r'(.*?)(?:|$)', docstring_block, re.DOTALL) + description_match = re.search(r"(.*?)(?:|$)", docstring_block, re.DOTALL) if description_match: desc_text = description_match.group(1).strip() # Remove any remaining HTML tags - desc_text = re.sub(r'<[^>]+>', '', desc_text) + desc_text = re.sub(r"<[^>]+>", "", desc_text) if desc_text: - info['description'] = desc_text + info["description"] = desc_text return info @@ -105,22 +105,22 @@ def format_parameters(paramsdesc: str) -> str: - Changing -- to : - Adding blank lines between parameters """ - lines = paramsdesc.split('\n') + lines = paramsdesc.split("\n") formatted_params = [] current_param = [] for line in lines: # Check if this is a new parameter line (starts with "- **") - if re.match(r'^\s*-\s+\*\*', line): + if re.match(r"^\s*-\s+\*\*", line): # Save the previous parameter if exists if current_param: - param_text = ' '.join(current_param) + param_text = " ".join(current_param) # Remove - and ** formatting - param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text) + param_text = re.sub(r"^\s*-\s+\*\*([^*]+)\*\*", r"\1", param_text) # Change -- to : - param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1) + param_text = re.sub(r"\s+--\s+", " : ", param_text, count=1) formatted_params.append(param_text) - formatted_params.append('') # Add blank line between parameters + formatted_params.append("") # Add blank line between parameters current_param = [] # Start new parameter @@ -131,12 +131,12 @@ def format_parameters(paramsdesc: str) -> str: # Don't forget the 
last parameter if current_param: - param_text = ' '.join(current_param) - param_text = re.sub(r'^\s*-\s+\*\*([^*]+)\*\*', r'\1', param_text) - param_text = re.sub(r'\s+--\s+', ' : ', param_text, count=1) + param_text = " ".join(current_param) + param_text = re.sub(r"^\s*-\s+\*\*([^*]+)\*\*", r"\1", param_text) + param_text = re.sub(r"\s+--\s+", " : ", param_text, count=1) formatted_params.append(param_text) - return '\n'.join(formatted_params) + return "\n".join(formatted_params) def process_docstring_block(docstring_block: str) -> str: @@ -153,55 +153,55 @@ def process_docstring_block(docstring_block: str) -> str: parts = [] # Add the name as level 4 header with anchor - if info['name']: - if info['anchor']: + if info["name"]: + if info["anchor"]: parts.append(f"#### {info['name']}[[{info['anchor']}]]") else: parts.append(f"#### {info['name']}") parts.append("") # Add source link if available - if info['source']: + if info["source"]: # Strip any HTML from source - source_clean = strip_html_tags(info['source']) + source_clean = strip_html_tags(info["source"]) parts.append(f"[Source]({source_clean})") parts.append("") # Add description - if info['description']: - parts.append(info['description']) + if info["description"]: + parts.append(info["description"]) parts.append("") # Add parameters description - if info['paramsdesc']: + if info["paramsdesc"]: parts.append("**Parameters:**") parts.append("") # Format parameters: remove bullets and bold, change -- to :, add blank lines - formatted_params = format_parameters(info['paramsdesc']) + formatted_params = format_parameters(info["paramsdesc"]) parts.append(formatted_params) parts.append("") # Add return type - if info['rettype']: + if info["rettype"]: parts.append("**Returns:**") parts.append("") # Strip HTML tags from return type - rettype_clean = strip_html_tags(info['rettype']) + rettype_clean = strip_html_tags(info["rettype"]) parts.append(f"`{rettype_clean}`") parts.append("") # Add return description - if 
info['retdesc']: - if not info['rettype']: + if info["retdesc"]: + if not info["rettype"]: parts.append("**Returns:**") parts.append("") - parts.append(info['retdesc']) + parts.append(info["retdesc"]) parts.append("") - result = '\n'.join(parts) + result = "\n".join(parts) # Clean up excessive newlines - result = re.sub(r'\n{3,}', '\n\n', result) + result = re.sub(r"\n{3,}", "\n\n", result) return result.strip() @@ -239,27 +239,34 @@ def strip_remaining_html(content: str) -> str: Handles tags like , , etc. """ # Remove HTML comments - content = re.sub(r'', '', content, flags=re.DOTALL) + content = re.sub(r"", "", content, flags=re.DOTALL) # Remove common component tags while preserving their content # (Tip, TipEnd, ExampleCodeBlock, hfoptions, hfoption, etc.) tags_to_remove = [ - 'Tip', 'TipEnd', 'ExampleCodeBlock', 'hfoptions', 'hfoption', - 'EditOnGithub', 'div', 'span', 'anchor' + "Tip", + "TipEnd", + "ExampleCodeBlock", + "hfoptions", + "hfoption", + "EditOnGithub", + "div", + "span", + "anchor", ] for tag in tags_to_remove: # Remove opening tags with any attributes - content = re.sub(rf'<{tag}[^>]*>', '', content, flags=re.IGNORECASE) + content = re.sub(rf"<{tag}[^>]*>", "", content, flags=re.IGNORECASE) # Remove closing tags - content = re.sub(rf'', '', content, flags=re.IGNORECASE) + content = re.sub(rf"", "", content, flags=re.IGNORECASE) # Remove any remaining HTML tags (generic cleanup) # This is more aggressive but preserves text content - content = re.sub(r'<[^>]+>', '', content) + content = re.sub(r"<[^>]+>", "", content) # Clean up multiple consecutive blank lines - content = re.sub(r'\n{3,}', '\n\n', content) + content = re.sub(r"\n{3,}", "\n\n", content) return content @@ -273,7 +280,7 @@ def process_file(input_path: Path, output_path: Path | None = None) -> None: output_path: Path to output file (if None, overwrites input) """ # Read the input file - content = input_path.read_text(encoding='utf-8') + content = input_path.read_text(encoding="utf-8") 
# Process the content cleaned_content = strip_html_from_markdown(content) @@ -282,31 +289,20 @@ def process_file(input_path: Path, output_path: Path | None = None) -> None: if output_path is None: output_path = input_path - output_path.write_text(cleaned_content, encoding='utf-8') + output_path.write_text(cleaned_content, encoding="utf-8") print(f"Processed: {input_path} -> {output_path}") def main(): """Main entry point for the script.""" parser = argparse.ArgumentParser( - description='Strip HTML from markdown files and convert docstrings to clean markdown.' + description="Strip HTML from markdown files and convert docstrings to clean markdown." ) + parser.add_argument("input", type=str, help="Input markdown file or directory") parser.add_argument( - 'input', - type=str, - help='Input markdown file or directory' - ) - parser.add_argument( - '-o', '--output', - type=str, - default=None, - help='Output file or directory (defaults to overwriting input)' - ) - parser.add_argument( - '-r', '--recursive', - action='store_true', - help='Process directory recursively' + "-o", "--output", type=str, default=None, help="Output file or directory (defaults to overwriting input)" ) + parser.add_argument("-r", "--recursive", action="store_true", help="Process directory recursively") args = parser.parse_args() @@ -318,7 +314,7 @@ def main(): process_file(input_path, output_path) elif input_path.is_dir(): # Process directory - pattern = '**/*.md' if args.recursive else '*.md' + pattern = "**/*.md" if args.recursive else "*.md" md_files = list(input_path.glob(pattern)) if not md_files: @@ -344,6 +340,5 @@ def main(): return 0 -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) - From 7ddde3f6a04aa55d8b970bb24f99bb4db8b6593b Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Wed, 12 Nov 2025 14:50:18 +0100 Subject: [PATCH 4/4] add to jobs --- .github/workflows/build_main_documentation.yml | 7 +++++++ .github/workflows/build_pr_documentation.yml | 7 +++++++ 2 
files changed, 14 insertions(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 20870971..5005de82 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -234,6 +234,13 @@ jobs: cd .. + - name: Strip HTML from built markdown files + run: | + source .venv/bin/activate + echo "Stripping HTML from markdown files in build_dir" + python3 src/scripts/strip_html_from_md.py build_dir/ --recursive + echo "HTML stripping complete" + - name: Push to repositories run: | source .venv/bin/activate diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index ce0e06d2..09bc9747 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -219,6 +219,13 @@ jobs: fi cd .. + - name: Strip HTML from built markdown files + run: | + source .venv/bin/activate + echo "Stripping HTML from markdown files in build_dir" + python3 src/scripts/strip_html_from_md.py build_dir/ --recursive + echo "HTML stripping complete" + - name: Save commit_sha & pr_number run: | echo ${{ inputs.commit_sha }} > ./build_dir/commit_sha