Skip to content

Commit 6e6350f

Browse files
committed
async
Signed-off-by: Randolph Sapp <rs@ti.com>
1 parent e53744d commit 6e6350f

File tree

1 file changed

+44
-31
lines changed

1 file changed

+44
-31
lines changed

bin/dedupe.py

Lines changed: 44 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,29 @@
66
Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com
77
"""
88

9+
import argparse
910
import logging
11+
from multiprocessing import Pool
1012

1113
from lxml import html
1214
from root_index import get_root_index, BUILD_PATH
1315

1416
COMMON_PATHS = {"_images", "_downloads", "_static"}
1517

1618

17-
def _rewrite_wrapper(document, old_rel_path, new_rel_path, check_list):
19+
def _rewrite_path(html_path, common_dir, check_list):
1820
"""Wrapper to replace links using lxml rewrite_links. Defines a throwaway function to make
1921
things faster.
2022
21-
:param document: lxml html document to operate on
22-
:param old_rel_path: Pathlib path to the document root directory
23-
:param new_rel_path: Pathlib path to the new common directory
23+
:param html_path: Pathlib path to the HTML file to rewrite in place
24+
:param common_dir: Pathlib path to the new common directory
2425
:param check_list: Iterable of pathlib paths to check
2526
"""
27+
with html_path.open("r", encoding="utf-8") as file:
28+
document = html.fromstring(file.read())
29+
30+
old_rel_path = html_path.parent.resolve()
31+
new_rel_path = common_dir.resolve()
2632

2733
def _update_link(link):
2834
"""Function to interact with lxml's rewrite_links
@@ -36,20 +42,31 @@ def _update_link(link):
3642
link_path = old_rel_path.joinpath(clean_link).resolve()
3743
for check_path in check_list:
3844
if link_path.is_relative_to(check_path):
39-
logging.info("old link path: %s", link_path)
45+
logging.info("rewriting link in: %s", html_path)
46+
logging.debug("old link path: %s", link_path)
4047
new_path = new_rel_path.joinpath(
4148
link_path.relative_to(check_path.parent)
4249
)
43-
logging.info("new link path: %s", new_path)
50+
logging.debug("new link path: %s", new_path)
4451
rel_path = new_path.relative_to(old_rel_path, walk_up=True)
45-
logging.info("new rel path: %s", rel_path)
46-
logging.info("---")
52+
logging.debug("new rel path: %s", rel_path)
53+
logging.debug("---")
4754
return rel_path.as_posix()
4855

4956
return link
5057

5158
document.rewrite_links(_update_link, resolve_base_href=False)
5259

60+
with html_path.open("wb") as file:
61+
file.write(
62+
html.tostring(
63+
document,
64+
encoding="utf-8",
65+
include_meta_content_type=True,
66+
doctype="<!DOCTYPE html>",
67+
)
68+
)
69+
5370

5471
def _move_files(old_rel_path, new_rel_path, check_list):
5572
"""Move the files that match the check_list from the old_rel_path root into new_rel_path.
@@ -66,16 +83,15 @@ def _move_files(old_rel_path, new_rel_path, check_list):
6683
rel = path.relative_to(old_rel_path)
6784
logging.info("moving file: %s", rel)
6885
new = new_rel_path.joinpath(rel)
69-
logging.info("destination: %s", new)
86+
logging.debug("destination: %s", new)
7087
new.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
7188
path.replace(new)
72-
logging.info("---")
7389

7490
for empty_dir in sorted(operating_dir.glob("**/*"), reverse=True):
7591
empty_dir.rmdir()
7692

7793

78-
def rewrite_paths(root_dir, common_dir):
94+
def rewrite_paths(root_dir, common_dir, jobs):
7995
"""Rewrite the paths to move assets into a common_dir directory. This assumes:
8096
8197
1. Paths are already relative to the given root_dir
@@ -86,31 +102,28 @@ def rewrite_paths(root_dir, common_dir):
86102
"""
87103
check_list = {root_dir.joinpath(x).resolve() for x in COMMON_PATHS}
88104
logging.info("rewriting paths")
89-
for html_path in root_dir.glob("**/*.html"):
90-
with html_path.open("r", encoding="utf-8") as file:
91-
document = html.fromstring(file.read())
92-
93-
_rewrite_wrapper(
94-
document, html_path.parent.resolve(), common_dir.resolve(), check_list
95-
)
96-
97-
with html_path.open("wb") as file:
98-
file.write(
99-
html.tostring(
100-
document,
101-
encoding="utf-8",
102-
include_meta_content_type=True,
103-
doctype="<!DOCTYPE html>",
104-
)
105-
)
106-
105+
starmap_iterable = [
106+
(path, common_dir, check_list) for path in root_dir.glob("**/*.html")
107+
]
108+
with Pool(jobs) as pool:
109+
pool.starmap(_rewrite_path, starmap_iterable)
107110
logging.info("moving the files")
108111
_move_files(root_dir.resolve(), common_dir.resolve(), check_list)
109112

110113

111114
def main():
112115
"""Main processing loop"""
113-
logging.basicConfig(level=logging.INFO)
116+
parser = argparse.ArgumentParser(
117+
prog="dedupe.py",
118+
description="Tool to deduplicate HTML assets for GitHub pages deployments",
119+
)
120+
121+
parser.add_argument("-v", "--verbose", action="store_true")
122+
parser.add_argument("-j", "--jobs", type=int, default=8)
123+
124+
args = parser.parse_args()
125+
126+
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
114127

115128
root_list = []
116129
for path in BUILD_PATH.glob("*/"):
@@ -121,7 +134,7 @@ def main():
121134

122135
for path in root_list:
123136
logging.info("working on the following document dir: %s", path)
124-
rewrite_paths(path, BUILD_PATH)
137+
rewrite_paths(path, BUILD_PATH, args.jobs)
125138

126139

127140
if __name__ == "__main__":

0 commit comments

Comments
 (0)