66Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com
77"""
88
9+ import argparse
910import logging
11+ from multiprocessing import Pool
1012
1113from lxml import html
1214from root_index import get_root_index , BUILD_PATH
1315
1416COMMON_PATHS = {"_images" , "_downloads" , "_static" }
1517
1618
17- def _rewrite_wrapper ( document , old_rel_path , new_rel_path , check_list ):
19+ def _rewrite_path ( html_path , common_dir , check_list ):
1820 """Wrapper to replace links using lxml rewrite_links. Defines a throwaway function to make
1921 things faster.
2022
21- :param document: lxml html document to operate on
22- :param old_rel_path: Pathlib path to the document root directory
23- :param new_rel_path: Pathlib path to the new common directory
23+ :param html_path: Pathlib path to the HTML file to rewrite in place
24+ :param common_dir: Pathlib path to the new common directory
2425 :param check_list: Iterable of pathlib paths to check
2526 """
27+ with html_path .open ("r" , encoding = "utf-8" ) as file :
28+ document = html .fromstring (file .read ())
29+
30+ old_rel_path = html_path .parent .resolve ()
31+ new_rel_path = common_dir .resolve ()
2632
2733 def _update_link (link ):
2834 """Function to interact with lxml's rewrite_links
@@ -36,20 +42,31 @@ def _update_link(link):
3642 link_path = old_rel_path .joinpath (clean_link ).resolve ()
3743 for check_path in check_list :
3844 if link_path .is_relative_to (check_path ):
39- logging .info ("old link path: %s" , link_path )
45+ logging .info ("rewriting link in: %s" , html_path )
46+ logging .debug ("old link path: %s" , link_path )
4047 new_path = new_rel_path .joinpath (
4148 link_path .relative_to (check_path .parent )
4249 )
43- logging .info ("new link path: %s" , new_path )
50+ logging .debug ("new link path: %s" , new_path )
4451 rel_path = new_path .relative_to (old_rel_path , walk_up = True )
45- logging .info ("new rel path: %s" , rel_path )
46- logging .info ("---" )
52+ logging .debug ("new rel path: %s" , rel_path )
53+ logging .debug ("---" )
4754 return rel_path .as_posix ()
4855
4956 return link
5057
5158 document .rewrite_links (_update_link , resolve_base_href = False )
5259
60+ with html_path .open ("wb" ) as file :
61+ file .write (
62+ html .tostring (
63+ document ,
64+ encoding = "utf-8" ,
65+ include_meta_content_type = True ,
66+ doctype = "<!DOCTYPE html>" ,
67+ )
68+ )
69+
5370
5471def _move_files (old_rel_path , new_rel_path , check_list ):
5572 """Move the files that match the check_list from the old_rel_path root into new_rel_path.
@@ -66,16 +83,15 @@ def _move_files(old_rel_path, new_rel_path, check_list):
6683 rel = path .relative_to (old_rel_path )
6784 logging .info ("moving file: %s" , rel )
6885 new = new_rel_path .joinpath (rel )
69- logging .info ("destination: %s" , new )
86+ logging .debug ("destination: %s" , new )
7087 new .parent .mkdir (mode = 0o755 , parents = True , exist_ok = True )
7188 path .replace (new )
72- logging .info ("---" )
7389
7490 for empty_dir in sorted (operating_dir .glob ("**/*" ), reverse = True ):
7591 empty_dir .rmdir ()
7692
7793
78- def rewrite_paths (root_dir , common_dir ):
94+ def rewrite_paths (root_dir , common_dir , jobs ):
7995 """Rewrite the paths to move assets into a common_dir directory. This assumes:
8096
8197 1. Paths are already relative to the given root_dir
@@ -86,31 +102,28 @@ def rewrite_paths(root_dir, common_dir):
86102 """
87103 check_list = {root_dir .joinpath (x ).resolve () for x in COMMON_PATHS }
88104 logging .info ("rewriting paths" )
89- for html_path in root_dir .glob ("**/*.html" ):
90- with html_path .open ("r" , encoding = "utf-8" ) as file :
91- document = html .fromstring (file .read ())
92-
93- _rewrite_wrapper (
94- document , html_path .parent .resolve (), common_dir .resolve (), check_list
95- )
96-
97- with html_path .open ("wb" ) as file :
98- file .write (
99- html .tostring (
100- document ,
101- encoding = "utf-8" ,
102- include_meta_content_type = True ,
103- doctype = "<!DOCTYPE html>" ,
104- )
105- )
106-
105+ starmap_iterable = [
106+ (path , common_dir , check_list ) for path in root_dir .glob ("**/*.html" )
107+ ]
108+ with Pool (jobs ) as pool :
109+ pool .starmap (_rewrite_path , starmap_iterable )
107110 logging .info ("moving the files" )
108111 _move_files (root_dir .resolve (), common_dir .resolve (), check_list )
109112
110113
111114def main ():
112115 """Main processing loop"""
113- logging .basicConfig (level = logging .INFO )
116+ parser = argparse .ArgumentParser (
117+ prog = "dedupe.py" ,
118+ description = "Tool to deduplicate HTML assets for GitHub pages deployments" ,
119+ )
120+
121+ parser .add_argument ("-v" , "--verbose" , action = "store_true" )
122+ parser .add_argument ("-j" , "--jobs" , type = int , default = 8 )
123+
124+ args = parser .parse_args ()
125+
126+ logging .basicConfig (level = logging .DEBUG if args .verbose else logging .INFO )
114127
115128 root_list = []
116129 for path in BUILD_PATH .glob ("*/" ):
@@ -121,7 +134,7 @@ def main():
121134
122135 for path in root_list :
123136 logging .info ("working on the following document dir: %s" , path )
124- rewrite_paths (path , BUILD_PATH )
137+ rewrite_paths (path , BUILD_PATH , args . jobs )
125138
126139
127140if __name__ == "__main__" :
0 commit comments