1919
2020import fnmatch
2121import io
22- from lxml import etree
2322import re
2423import os
25- import sys
2624import shutil
2725import urllib .parse
28- from xml_utils import xml_escape , xml_unescape
26+ from lxml import etree
27+
2928
def rmtree_if_exists(dir):
    """Recursively delete the directory ``dir`` if it exists.

    Nothing happens when ``dir`` does not exist or is not a directory.
    """
    # NOTE: ``dir`` shadows the builtin of the same name; the parameter name
    # is kept so that existing keyword callers keep working.
    if os.path.isdir(dir):
        shutil.rmtree(dir)
3332
33+
def move_dir_contents_to_dir(srcdir, dstdir):
    """Move every entry found directly inside ``srcdir`` into ``dstdir``.

    Each entry keeps its name. ``srcdir`` itself is left in place (empty
    afterwards); ``dstdir`` is expected to exist already.
    """
    for fn in os.listdir(srcdir):
        shutil.move(os.path.join(srcdir, fn),
                    os.path.join(dstdir, fn))
3838
39+
3940def rearrange_archive (root ):
4041 # rearrange the archive. {root} here is output/reference
4142
@@ -71,8 +72,10 @@ def rearrange_archive(root):
7172 move_dir_contents_to_dir (src_data_path , data_path )
7273
7374 # also copy the custom fonts
74- shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed60.ttf' ), data_path )
75- shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed75.ttf' ), data_path )
75+ shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed60.ttf' ),
76+ data_path )
77+ shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed75.ttf' ),
78+ data_path )
7679
7780 # remove what's left
7881 shutil .rmtree (path )
@@ -81,20 +84,23 @@ def rearrange_archive(root):
8184 for fn in fnmatch .filter (os .listdir (root ), 'cppreference-export*.xml' ):
8285 os .remove (os .path .join (root , fn ))
8386
84- # Converts complex URL to resources supplied by MediaWiki loader to a simplified name
87+
def convert_loader_name(fn):
    """Map a MediaWiki resource-loader URL to a simplified file name.

    ``fn`` is searched for known ``modules=...&only=...`` query fragments.
    The checks run in order and the first match wins.

    Raises:
        Exception: if ``fn`` matches none of the known loader resources.
    """
    if "modules=site&only=scripts" in fn:
        return "site_scripts.js"
    if "modules=site&only=styles" in fn:
        return "site_modules.css"
    if "modules=startup&only=scripts" in fn:
        return "startup_scripts.js"
    if re.search("modules=skins.*&only=scripts", fn):
        return "skin_scripts.js"
    if re.search("modules=.*ext.*&only=styles", fn):
        return "ext.css"
    msg = 'Loader file {0} does not match any known files'.format(fn)
    raise Exception(msg)
103+
98104
99105def build_rename_map (root ):
100106 # Returns a rename map: a map from old to new file name
@@ -122,13 +128,17 @@ def build_rename_map(root):
122128 if num > 0 :
123129 name , ext = os .path .splitext (fn )
124130 # add file with its path -> only rename that occurrence
125- result [os .path .join (dir , fn )] = "{}.{}{}" .format (name , num + 1 , ext )
131+ result [os .path .join (dir , fn )] = "{}.{}{}" .format (name , num + 1 ,
132+ ext )
126133 seen [low ] += 1
127134
128135 return result
129136
137+
130138def rename_files (root , rename_map ):
131- for dir , old_fn in ((dir , fn ) for dir , _ , filenames in os .walk (root ) for fn in filenames ):
139+ for dir , old_fn in ((dir , fn )
140+ for dir , _ , filenames in os .walk (root )
141+ for fn in filenames ):
132142 src_path = os .path .join (dir , old_fn )
133143
134144 new_fn = rename_map .get (old_fn )
@@ -144,6 +154,7 @@ def rename_files(root, rename_map):
144154 print ("Renaming {0}\n to {1}" .format (src_path , dst_path ))
145155 shutil .move (src_path , dst_path )
146156
157+
147158def find_html_files (root ):
148159 # find files that need to be preprocessed
149160 html_files = []
@@ -152,21 +163,25 @@ def find_html_files(root):
152163 html_files .append (os .path .join (dir , filename ))
153164 return html_files
154165
166+
def is_loader_link(target):
    """Return True if ``target`` is an absolute MediaWiki ``load.php`` URL.

    The URL must start with ``http://`` or ``https://`` followed by a
    lowercase subdomain of cppreference.com and the ``/mwiki/load.php``
    path; anything after that prefix is accepted.
    """
    pattern = r'https?://[a-z]+\.cppreference\.com/mwiki/load\.php'
    return re.match(pattern, target) is not None
159171
172+
def transform_loader_link(target, file, root):
    """Rewrite an absolute load.php link as a path relative to ``file``.

    The loader URL ``target`` is mapped to its simplified name (see
    ``convert_loader_name``) inside ``<root>/common``; the result is that
    location expressed relative to the directory containing ``file``.
    """
    # Absolute load.php links need to be made relative
    abstarget = os.path.join(root, "common", convert_loader_name(target))
    return os.path.relpath(abstarget, os.path.dirname(file))
164177
178+
def is_ranges_placeholder(target):
    """Return True if ``target`` is an absolute ranges-placeholder URL.

    Matches ``.../w/cpp/ranges-placeholder/<rest>`` as well as the
    ``.../w/cpp/ranges-<name>-placeholder/<rest>`` form; ``<rest>`` must be
    non-empty.
    """
    # The pattern is split across two literals only to keep lines short.
    pattern = (r'https?://[a-z]+\.cppreference\.com'
               r'/w/cpp/ranges(-[a-z]+)?-placeholder/.+')
    return re.match(pattern, target) is not None
169183
184+
170185def transform_ranges_placeholder (target , file , root ):
171186 # Placeholder link replacement is implemented in the MediaWiki site JS at
172187 # https://en.cppreference.com/w/MediaWiki:Common.js
@@ -175,9 +190,9 @@ def transform_ranges_placeholder(target, file, root):
175190 repl = (r'\1/cpp/experimental/ranges/\2' if ranges else r'\1/cpp/\2' )
176191
177192 if 'ranges-placeholder' in target :
178- match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)'
193+ match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)' # noqa
179194 else :
180- match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)'
195+ match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)' # noqa
181196 repl += (r'/\3' if ranges else r'/ranges/\3' )
182197
183198 # Turn absolute placeholder link into site-relative link
@@ -187,24 +202,27 @@ def transform_ranges_placeholder(target, file, root):
187202 abstarget = os .path .join (root , reltarget )
188203 return os .path .relpath (abstarget , os .path .dirname (file ))
189204
205+
def is_external_link(target):
    """Return True if ``target`` carries a URL scheme or a network location.

    Plain relative paths and fragment-only links parse with both parts
    empty and are therefore reported as not external.
    """
    url = urllib.parse.urlparse(target)
    return url.scheme != '' or url.netloc != ''
193209
210+
194211def trasform_relative_link (rename_map , target , file ):
195212 # urlparse returns (scheme, host, path, params, query, fragment)
196213 _ , _ , path , params , _ , fragment = urllib .parse .urlparse (target )
197214 assert params == ''
198215
199216 path = urllib .parse .unquote (path )
200- path = path .replace ('../../upload.cppreference.com/mwiki/' ,'../common/' )
201- path = path .replace ('../mwiki/' ,'../common/' )
217+ path = path .replace ('../../upload.cppreference.com/mwiki/' , '../common/' )
218+ path = path .replace ('../mwiki/' , '../common/' )
202219
203220 dir , fn = os .path .split (path )
204221 new_fn = rename_map .get (fn )
205222 if new_fn :
206223 # look for case conflict of the renamed file
207- abstarget = os .path .normpath (os .path .join (os .path .dirname (file ), dir , new_fn ))
224+ abstarget = os .path .normpath (os .path .join (os .path .dirname (file ),
225+ dir , new_fn ))
208226 new_fn = rename_map .get (abstarget , new_fn )
209227 else :
210228 # original filename unchanged, look for case conflict
@@ -216,11 +234,13 @@ def trasform_relative_link(rename_map, target, file):
216234 path = urllib .parse .quote (path )
217235 return urllib .parse .urlunparse (('' , '' , path , params , '' , fragment ))
218236
237+
219238# Transforms a link in the given file according to rename map.
220239# target is the link to transform.
221240# file is the path of the file the link came from.
222241# root is the path to the root of the archive.
223242def transform_link (rename_map , target , file , root ):
243+
224244 if is_loader_link (target ):
225245 return transform_loader_link (target , file , root )
226246
@@ -232,6 +252,7 @@ def transform_link(rename_map, target, file, root):
232252
233253 return trasform_relative_link (rename_map , target , file )
234254
255+
235256def has_class (el , * classes_to_check ):
236257 value = el .get ('class' )
237258 if value is None :
@@ -242,6 +263,7 @@ def has_class(el, *classes_to_check):
242263 return True
243264 return False
244265
266+
245267# remove non-printable elements
246268def remove_noprint (html ):
247269 for el in html .xpath ('//*' ):
@@ -250,14 +272,16 @@ def remove_noprint(html):
250272 elif el .get ('id' ) in ['toc' , 'catlinks' ]:
251273 el .getparent ().remove (el )
252274
275+
253276# remove see also links between C and C++ documentations
254277def remove_see_also (html ):
255278 for el in html .xpath ('//tr[@class]' ):
256279 if not has_class (el , 't-dcl-list-item' , 't-dsc' ):
257280 continue
258281
259282 child_tds = el .xpath ('.//td/div[@class]' )
260- if not any (has_class (td , 't-dcl-list-see' , 't-dsc-see' ) for td in child_tds ):
283+ if not any (has_class (td , 't-dcl-list-see' , 't-dsc-see' )
284+ for td in child_tds ):
261285 continue
262286
263287 # remove preceding separator, if any
@@ -276,17 +300,23 @@ def remove_see_also(html):
276300 next = el .getnext ()
277301 if next is None :
278302 el .getparent ().remove (el )
279- elif next .tag == 'table' and has_class (next , 't-dcl-list-begin' ) and len (next .xpath ('.//tr' )) == 0 :
303+ elif next .tag == 'table' and has_class (next , 't-dcl-list-begin' ) and \
304+ len (next .xpath ('.//tr' )) == 0 :
280305 el .getparent ().remove (el )
281306 next .getparent ().remove (next )
282307
308+
# remove Google Analytics scripts
def remove_google_analytics(html):
    """Drop Google Analytics ``<script>`` elements found directly in body.

    A script is removed when its ``src`` attribute references ``ga.js``,
    or when its inline text references ``ga.js`` or ``pageTracker``.
    """
    for el in html.xpath('/html/body/script'):
        # Treat a missing attribute / empty element as an empty string so
        # both checks always run. A script whose ``src`` does not match is
        # still inspected for inline tracker code.
        src = el.get('src') or ''
        text = el.text or ''
        if ('google-analytics.com/ga.js' in src
                or 'google-analytics.com/ga.js' in text
                or 'pageTracker' in text):
            el.getparent().remove(el)
319+
290320
291321# remove Carbon ads
292322def remove_ads (html ):
@@ -297,13 +327,15 @@ def remove_ads(html):
297327 if el .text is not None and '#carbonads' in el .text :
298328 el .getparent ().remove (el )
299329
330+
# remove links to file info pages (e.g. on images)
def remove_fileinfo(html):
    """Remove the parent element of every link to a ``File:`` wiki page.

    Anchors are selected by testing ``href`` against a regular expression
    through the EXSLT ``re`` namespace; the selected node is the anchor's
    parent, which is detached from the tree together with the anchor.
    """
    # The XPath literal is split only to keep lines short; the joined
    # string is the same single expression.
    info = etree.XPath(
        r"//a[re:test(@href, "
        r"'https?://[a-z]+\.cppreference\.com/w/File:')]/..",
        namespaces={'re': 'http://exslt.org/regular-expressions'})
    for el in info(html):
        el.getparent().remove(el)
306337
338+
307339# remove external links to unused resources
308340def remove_unused_external (html ):
309341 for el in html .xpath ('/html/head/link' ):
@@ -313,6 +345,7 @@ def remove_unused_external(html):
313345 (head , tail ) = os .path .split (el .get ('href' ))
314346 el .set ('href' , os .path .join (head , 'common' , tail ))
315347
348+
316349def preprocess_html_file (root , fn , rename_map ):
317350 parser = etree .HTMLParser ()
318351 html = etree .parse (fn , parser )
@@ -331,23 +364,27 @@ def preprocess_html_file(root, fn, rename_map):
331364 for el in html .xpath ('//*[@href]' ):
332365 el .set ('href' , transform_link (rename_map , el .get ('href' ), fn , root ))
333366
334- for err in parser .error_log :
367+ for err in list ( parser .error_log ) :
335368 print ("HTML WARN: {0}" .format (err ), file = output )
336369
337370 html .write (fn , encoding = 'utf-8' , method = 'html' )
338371 return output .getvalue ()
339372
373+
340374def preprocess_css_file (fn ):
341375 f = open (fn , "r" , encoding = 'utf-8' )
342376 text = f .read ()
343377 f .close ()
344378
345379 # note that query string is not used in css files
346380
347- text = text .replace ('../DejaVuSansMonoCondensed60.ttf' , 'DejaVuSansMonoCondensed60.ttf' )
348- text = text .replace ('../DejaVuSansMonoCondensed75.ttf' , 'DejaVuSansMonoCondensed75.ttf' )
381+ text = text .replace ('../DejaVuSansMonoCondensed60.ttf' ,
382+ 'DejaVuSansMonoCondensed60.ttf' )
383+ text = text .replace ('../DejaVuSansMonoCondensed75.ttf' ,
384+ 'DejaVuSansMonoCondensed75.ttf' )
349385
350- text = text .replace ('../../upload.cppreference.com/mwiki/images/' , 'images/' )
386+ text = text .replace ('../../upload.cppreference.com/mwiki/images/' ,
387+ 'images/' )
351388
352389 # QT Help viewer doesn't understand nth-child
353390 text = text .replace ('nth-child(1)' , 'first-child' )
@@ -356,6 +393,7 @@ def preprocess_css_file(fn):
356393 f .write (text )
357394 f .close ()
358395
396+
359397def preprocess_startup_script (fn ):
360398 with open (fn , "r" , encoding = 'utf-8' ) as f :
361399 text = f .read ()
0 commit comments