Skip to content

Commit 619f6d6

Browse files
authored
Merge pull request p12tic#55 from mokibit/master
Refactor scripts for testability, add several tests
2 parents 006e391 + 47acbe3 commit 619f6d6

19 files changed

+1027
-725
lines changed

commands/preprocess.py

Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
#!/usr/bin/env python3
2+
3+
# Copyright (C) 2011, 2012 Povilas Kanapickas <povilas@radix.lt>
4+
#
5+
# This file is part of cppreference-doc
6+
#
7+
# This program is free software: you can redistribute it and/or modify
8+
# it under the terms of the GNU General Public License as published by
9+
# the Free Software Foundation, either version 3 of the License, or
10+
# (at your option) any later version.
11+
#
12+
# This program is distributed in the hope that it will be useful,
13+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
# GNU General Public License for more details.
16+
#
17+
# You should have received a copy of the GNU General Public License
18+
# along with this program. If not, see http://www.gnu.org/licenses/.
19+
20+
import fnmatch
21+
from lxml import etree
22+
import re
23+
import os
24+
import sys
25+
import shutil
26+
import urllib.parse
27+
from xml_utils import xml_escape, xml_unescape
28+
29+
def rmtree_if_exists(dir):
    """Recursively delete ``dir``; do nothing when it is not an existing directory."""
    if not os.path.isdir(dir):
        return
    shutil.rmtree(dir)
32+
33+
def move_dir_contents_to_dir(srcdir, dstdir):
    """Move every entry found directly inside ``srcdir`` into ``dstdir``, keeping its name."""
    for entry in os.listdir(srcdir):
        source = os.path.join(srcdir, entry)
        destination = os.path.join(dstdir, entry)
        shutil.move(source, destination)
37+
38+
def rearrange_archive(root):
    """Rearrange the mirrored archive under ``root`` (output/reference) in place.

    Layout before::

        {root}/en.cppreference.com/w/           : html
        {root}/en.cppreference.com/mwiki/       : data
        {root}/en.cppreference.com/             : data
        ... (other languages)
        {root}/upload.cppreference.com/mwiki/   : data

    Layout after::

        {root}/common/  : all common data
        {root}/en/      : html for en
        ... (other languages)
    """
    common_dir = os.path.join(root, 'common')

    # start from a fresh common/ built out of the upload server's mwiki data
    rmtree_if_exists(common_dir)
    shutil.move(os.path.join(root, 'upload.cppreference.com/mwiki'), common_dir)
    shutil.rmtree(os.path.join(root, 'upload.cppreference.com'))

    font_files = (
        'DejaVuSansMonoCondensed60.ttf',
        'DejaVuSansMonoCondensed75.ttf',
    )

    for lang in ["en"]:
        site_dir = os.path.join(root, lang + ".cppreference.com/")
        html_src = site_dir + "w/"
        data_src = site_dir + "mwiki/"

        if os.path.isdir(html_src):
            shutil.move(html_src, os.path.join(root, lang))

        if os.path.isdir(data_src):
            # the skin files should be the same for all languages thus we
            # can merge everything
            move_dir_contents_to_dir(data_src, common_dir)

        # also copy the custom fonts
        for font in font_files:
            shutil.copy(os.path.join(site_dir, font), common_dir)

        # remove what's left
        shutil.rmtree(site_dir)

    # remove the XML source file
    for leftover in fnmatch.filter(os.listdir(root), 'cppreference-export*.xml'):
        os.remove(os.path.join(root, leftover))
82+
83+
def add_file_to_rename_map(rename_map, dir, fn, new_fn):
    """Append ``(dir, fn, new_fn)`` to ``rename_map`` when ``dir/fn`` is an existing file.

    If the file does not exist an error is printed and the map is left
    untouched.
    """
    candidate = os.path.join(dir, fn)
    if os.path.isfile(candidate):
        rename_map.append((dir, fn, new_fn))
    else:
        print("ERROR: Not renaming '{0}' because path does not exist".format(candidate))
89+
90+
def convert_loader_name(fn):
    """Convert a complex URL of a resource supplied by the MediaWiki loader
    to a simplified file name.

    The patterns are tried in order and the first one found in ``fn`` wins.
    Raises ``Exception`` when ``fn`` matches none of the known loader files.
    """
    known_loaders = (
        ("modules=site&only=scripts", "site_scripts.js"),
        ("modules=site&only=styles", "site_modules.css"),
        ("modules=skins.*&only=scripts", "skin_scripts.js"),
        ("modules=startup&only=scripts", "startup_scripts.js"),
        ("modules=.*ext.*&only=styles", "ext.css"),
    )
    for pattern, simple_name in known_loaders:
        if re.search(pattern, fn):
            return simple_name
    raise Exception('Loader file {0} does not match any known files'.format(fn))
106+
107+
def find_files_to_be_renamed(root):
    """Build the rename map for the archive below ``root``.

    Returns a list of ``(dir, fn, new_fn)`` tuples: the directory the file
    resides in, the source file name and the destination file name.

    The rename map specifies files to be renamed in order to support them on
    windows filesystems which don't support certain characters in file names.
    Entries are only added for files that exist (see
    ``add_file_to_rename_map``).
    """
    rename_map = []

    files_rename = []   # general files to be renamed
    files_loader = []   # files served by load.php. These should map to
                        # consistent and short file names because we
                        # modify some of them later in the pipeline

    for dirpath, _dirnames, filenames in os.walk(root):
        filenames_loader = set(fnmatch.filter(filenames, 'load.php[?]*'))
        # match any filenames with '?"*' characters
        filenames_rename = set(fnmatch.filter(filenames, '*[?"*]*'))

        # don't process load.php files in general rename handler
        filenames_rename -= filenames_loader

        # sets have no stable iteration order; sort so that the resulting
        # map (and the log output) is reproducible between runs
        files_loader.extend((dirpath, fn) for fn in sorted(filenames_loader))
        files_rename.extend((dirpath, fn) for fn in sorted(filenames_rename))

    for dirpath, orig_fn in files_rename:
        # drop the query string, then replace the remaining forbidden chars
        fn = re.sub(r'\?.*', '', orig_fn)
        fn = fn.replace('"', '_q_')
        fn = fn.replace('*', '_star_')
        add_file_to_rename_map(rename_map, dirpath, orig_fn, fn)

    # map loader names to more recognizable names
    for dirpath, fn in files_loader:
        new_fn = convert_loader_name(fn)
        add_file_to_rename_map(rename_map, dirpath, fn, new_fn)

    # rename filenames that conflict on case-insensitive filesystems
    # TODO: perform this automatically
    add_file_to_rename_map(rename_map, os.path.join(root, 'en/cpp/numeric/math'), 'NAN.html', 'NAN.2.html')
    add_file_to_rename_map(rename_map, os.path.join(root, 'en/c/numeric/math'), 'NAN.html', 'NAN.2.html')
    return rename_map
150+
151+
def rename_files(rename_map):
    """Apply every ``(dir, old_fn, new_fn)`` entry of ``rename_map`` on disk.

    Each rename is announced on stdout before the file is moved.
    """
    for entry in rename_map:
        directory, old_name, new_name = entry
        source = os.path.join(directory, old_name)
        destination = os.path.join(directory, new_name)
        print("Renaming '{0}' to \n '{1}'".format(source, destination))
        shutil.move(source, destination)
157+
158+
def find_html_files(root):
    """Return the paths of all ``*.html`` files below ``root``.

    These are the files that need to be preprocessed.
    """
    found = []
    for current_dir, _subdirs, names in os.walk(root):
        found.extend(os.path.join(current_dir, name)
                     for name in fnmatch.filter(names, '*.html'))
    return found
165+
166+
def is_loader_link(target):
    """Return True if ``target`` is an absolute http(s) URL of a
    ``*.cppreference.com`` MediaWiki ``load.php`` resource, False otherwise.
    """
    # raw string: '\.' is not a valid escape in a normal string literal
    pattern = r'https?://[a-z]+\.cppreference\.com/mwiki/load\.php'
    return re.match(pattern, target) is not None
170+
171+
def transform_loader_link(target, file, root):
    """Make an absolute load.php link relative.

    The link ``target`` is mapped to its simplified name inside
    ``{root}/common`` and returned as a path relative to the directory that
    contains ``file``.
    """
    simple_name = convert_loader_name(target)
    abstarget = os.path.join(root, "common/" + simple_name)
    containing_dir = os.path.dirname(file)
    return os.path.relpath(abstarget, containing_dir)
175+
176+
def is_external_link(target):
    """Return True when ``target`` starts with an http, https or ftp scheme."""
    external_prefixes = ('http://', 'https://', 'ftp://')
    return target.startswith(external_prefixes)
186+
187+
def trasform_relative_link(rename_map, target):
    """Rewrite the relative link ``target`` to match the rearranged archive.

    The link is percent-decoded, every renamed file name from ``rename_map``
    (tuples of ``(dir, fn, new_fn)``) is substituted, references to the
    ``mwiki`` data directories are pointed at ``../common/``, query strings
    after ``.php``/``.css`` are dropped, and the result is percent-encoded
    again with the fragment separator ``#`` kept literal.
    """
    target = urllib.parse.unquote(target)
    for _dir, fn, new_fn in rename_map:
        target = target.replace(fn, new_fn)
    target = target.replace('../../upload.cppreference.com/mwiki/', '../common/')
    target = target.replace('../mwiki/', '../common/')
    # raw strings: '\.' and '\?' are not valid escapes in normal literals
    target = re.sub(r'(\.php|\.css)\?.*', r'\1', target)
    target = urllib.parse.quote(target)
    # quote() also encodes '#'; restore it so fragments keep working
    target = target.replace('%23', '#')
    return target
197+
198+
def transform_link(rename_map, target, file, root):
    """Transform a link in the given file according to the rename map.

    ``target`` is the link to transform.
    ``file`` is the path of the file the link came from.
    ``root`` is the path to the root of the archive.

    Loader links are made relative, other external links are returned
    unchanged and everything else is treated as a relative link.
    """
    if is_loader_link(target):
        result = transform_loader_link(target, file, root)
    elif is_external_link(target):
        result = target
    else:
        result = trasform_relative_link(rename_map, target)
    return result
210+
211+
def has_class(el, classes_to_check):
    """Return True if element ``el`` carries at least one of the given classes.

    ``el`` is any object with a ``get(name)`` method returning the attribute
    value or None. ``classes_to_check`` is an iterable of class names; a
    single class name given as a plain string is accepted too (it would
    otherwise be iterated character by character and never match).
    """
    value = el.get('class')
    if value is None:
        return False
    if isinstance(classes_to_check, str):
        classes_to_check = [classes_to_check]
    # split() without arguments handles any whitespace between class tokens
    # and never yields empty strings
    classes = set(value.split())
    return any(cl in classes for cl in classes_to_check)
220+
221+
def preprocess_html_file(root, fn, rename_map):
    """Clean up the HTML file ``fn`` in place.

    root       -- path to the root of the archive (used to relativize links)
    fn         -- path of the HTML file; it is parsed and then overwritten
    rename_map -- list of (dir, fn, new_fn) tuples describing renamed files

    Non-printable elements, "see also" rows, unused head links and Google
    Analytics scripts are removed and all src/href links are rewritten via
    ``transform_link``. Parser errors are printed as warnings.
    """
    parser = etree.HTMLParser()
    html = etree.parse(fn, parser)

    _remove_noprint(html)
    _remove_see_also(html)
    _remove_unused_head_links(html)
    _remove_google_analytics(html)
    _apply_link_renames(html, rename_map, fn, root)

    for err in parser.error_log:
        print("HTML WARN: {0}".format(err))

    html.write(fn, encoding='utf-8', method='html')


def _detach(el):
    """Remove ``el`` from its parent; no-op if it has no parent (any more)."""
    parent = el.getparent()
    if parent is not None:
        parent.remove(el)


def _remove_noprint(html):
    """Remove non-printable elements: noprint/editsection classes and the toc."""
    for el in html.xpath('//*'):
        # a single combined check so an element matching both conditions is
        # not removed twice
        if has_class(el, ['noprint', 'editsection']) or el.get('id') == 'toc':
            _detach(el)


def _remove_see_also(html):
    """Remove see also links between C and C++ documentations."""
    for el in html.xpath('//tr[@class]'):
        if not has_class(el, ['t-dcl-list-item']):
            continue

        child_divs = el.xpath('.//td/div[@class]')
        if not any(has_class(div, ['t-dcl-list-see']) for div in child_divs):
            continue

        # remove preceding separator, if any
        prev = el.getprevious()
        if prev is not None:
            sep_tds = prev.xpath('.//td[@class]')
            if any(has_class(td, ['t-dcl-list-sep']) for td in sep_tds):
                _detach(prev)

        _detach(el)

    # remove "See also" headings that have nothing left below them
    for el in html.xpath('//h3'):
        if len(el.xpath(".//span[@id = 'See_also']")) == 0:
            continue

        next_el = el.getnext()
        if next_el is None:
            _detach(el)
            continue

        if next_el.tag != 'table':
            continue

        if not has_class(next_el, ['t-dcl-list-begin']):
            continue

        # the table still has rows: keep the section
        if len(next_el.xpath('.//tr')) > 0:
            continue

        _detach(el)
        _detach(next_el)


def _remove_unused_head_links(html):
    """Remove external links to unused resources from the document head."""
    for el in html.xpath('/html/head/link'):
        if el.get('rel') in ['alternate', 'search', 'edit', 'EditURI']:
            _detach(el)


def _remove_google_analytics(html):
    """Remove Google Analytics scripts from the document body."""
    for el in html.xpath('/html/body/script'):
        src = el.get('src')
        if src is not None and 'google-analytics.com/ga.js' in src:
            _detach(el)
        elif el.text is not None and ('google-analytics.com/ga.js' in el.text or 'pageTracker' in el.text):
            _detach(el)


def _apply_link_renames(html, rename_map, fn, root):
    """Apply changes to links caused by file renames."""
    for el in html.xpath('//*[@src or @href]'):
        # src takes precedence; href is only rewritten when there is no src
        if el.get('src') is not None:
            el.set('src', transform_link(rename_map, el.get('src'), fn, root))
        elif el.get('href') is not None:
            el.set('href', transform_link(rename_map, el.get('href'), fn, root))
295+
296+
def preprocess_css_file(fn):
    """Rewrite the CSS file ``fn`` in place for use in the offline archive.

    Font URLs are pointed at the fonts copied next to the stylesheet and
    ``nth-child(1)`` is replaced by ``first-child``. The file is read and
    written as UTF-8.
    """
    # context managers make sure the file is closed even if reading or
    # writing raises
    with open(fn, "r", encoding='utf-8') as f:
        text = f.read()

    # note that query string is not used in css files

    text = text.replace('../DejaVuSansMonoCondensed60.ttf', 'DejaVuSansMonoCondensed60.ttf')
    text = text.replace('../DejaVuSansMonoCondensed75.ttf', 'DejaVuSansMonoCondensed75.ttf')

    # QT Help viewer doesn't understand nth-child
    text = text.replace('nth-child(1)', 'first-child')

    with open(fn, "w", encoding='utf-8') as f:
        f.write(text)

0 commit comments

Comments
 (0)