-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathhtml2md.py
More file actions
67 lines (50 loc) · 1.79 KB
/
html2md.py
File metadata and controls
67 lines (50 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os

from bs4 import BeautifulSoup
#! Install beautifulsoup4 by running the following command:
# pip install beautifulsoup4
def html_table_to_md(text: str) -> str:
    """Convert HTML tables to Markdown tables in the given text.

    Every ``<table>`` found in ``text`` is replaced by a Markdown pipe table.
    The first ``<tr>`` becomes the header row and is followed by a ``---``
    delimiter row. A cell with ``colspan=N`` is written N times, and rows
    narrower than the widest row are padded with empty cells so that every
    row has the same number of columns.

    Cell text is collapsed onto a single line and literal ``|`` characters
    are backslash-escaped, because a raw newline or pipe inside a cell would
    break the Markdown table layout.

    Tables without any ``<tr>``, or whose rows contain no ``<td>``/``<th>``
    cells, are left unchanged. The whole input is parsed and re-serialised
    by BeautifulSoup, so the returned text is ``str(soup)`` rather than the
    input with only the tables spliced out.

    Args:
        text (str): Text containing HTML tables

    Returns:
        str: Text with HTML tables converted to Markdown format
    """
    soup = BeautifulSoup(text, "html.parser")

    for table in soup.find_all("table"):
        # NOTE: find_all searches all descendants, so the rows of a table
        # nested inside this one are picked up here as well.
        rows = table.find_all("tr")
        if not rows:
            continue

        # Expand every row once (colspan already applied) so the table width
        # and the rendered lines are derived from the same data.
        expanded_rows = [_expand_row(row) for row in rows]
        max_cols = max(len(cells) for cells in expanded_rows)
        if max_cols == 0:
            # Rows without any cells cannot form a Markdown table; keep the
            # original HTML instead of emitting empty "|  |" lines.
            continue

        md_table = []
        for index, cells in enumerate(expanded_rows):
            padded = cells + [""] * (max_cols - len(cells))
            md_table.append(_format_row(padded))
            if index == 0:
                # Markdown requires a delimiter row right after the header.
                md_table.append(_format_row(["---"] * max_cols))

        # Replace the HTML table with the Markdown table.
        table.replace_with("\n".join(md_table))

    return str(soup)


def _colspan(cell) -> int:
    """Return the cell's ``colspan`` as an int >= 1, defaulting to 1 when missing or invalid."""
    try:
        span = int(cell.get("colspan", 1))
    except (TypeError, ValueError):
        return 1
    # A span of 0 or less would silently drop the cell's content.
    return max(span, 1)


def _cell_text(cell) -> str:
    """Return the cell's text on one line with ``|`` escaped for a Markdown table."""
    single_line = " ".join(cell.get_text().split())
    return single_line.replace("|", "\\|")


def _expand_row(row) -> list:
    """Return the row's cell texts, each repeated ``colspan`` times."""
    expanded = []
    for cell in row.find_all(["td", "th"]):
        expanded.extend([_cell_text(cell)] * _colspan(cell))
    return expanded


def _format_row(cells) -> str:
    """Join already-escaped cell strings into one Markdown table line."""
    return "| " + " | ".join(cells) + " |"
# Script entry: read old.md, convert its HTML tables to Markdown and write
# the result to Output/new.md. Both paths are relative to the current
# working directory.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("old.md", "r", encoding="utf-8") as f:
    html = f.read()

md = html_table_to_md(html)

# open(..., "w") does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("Output", exist_ok=True)
with open("Output/new.md", "w", encoding="utf-8") as f:
    f.write(md)