|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# Copyright (C) 2011, 2012 Povilas Kanapickas <povilas@radix.lt> |
| 4 | +# |
| 5 | +# This file is part of cppreference-doc |
| 6 | +# |
| 7 | +# This program is free software: you can redistribute it and/or modify |
| 8 | +# it under the terms of the GNU General Public License as published by |
| 9 | +# the Free Software Foundation, either version 3 of the License, or |
| 10 | +# (at your option) any later version. |
| 11 | +# |
| 12 | +# This program is distributed in the hope that it will be useful, |
| 13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | +# GNU General Public License for more details. |
| 16 | +# |
| 17 | +# You should have received a copy of the GNU General Public License |
| 18 | +# along with this program. If not, see http://www.gnu.org/licenses/. |
| 19 | + |
| 20 | +import fnmatch |
| 21 | +from lxml import etree |
| 22 | +import re |
| 23 | +import os |
| 24 | +import sys |
| 25 | +import shutil |
| 26 | +import urllib.parse |
| 27 | +from xml_utils import xml_escape, xml_unescape |
| 28 | + |
def rmtree_if_exists(dir):
    """Recursively delete the directory ``dir`` when it exists.

    Nothing happens when ``dir`` is missing or is not a directory.
    """
    if not os.path.isdir(dir):
        return
    shutil.rmtree(dir)
| 32 | + |
def move_dir_contents_to_dir(srcdir, dstdir):
    """Move every entry found directly inside ``srcdir`` into ``dstdir``.

    Entries keep their names. ``srcdir`` itself is not removed.
    """
    for entry in os.listdir(srcdir):
        source = os.path.join(srcdir, entry)
        destination = os.path.join(dstdir, entry)
        shutil.move(source, destination)
| 37 | + |
def rearrange_archive(root):
    """Rearrange the downloaded archive. ``root`` here is output/reference.

    Layout before:
        {root}/en.cppreference.com/w/         : html
        {root}/en.cppreference.com/mwiki/     : data
        {root}/en.cppreference.com/           : data
        ... (other languages)
        {root}/upload.cppreference.com/mwiki/ : data

    Layout after:
        {root}/common/  : all common data
        {root}/en/      : html for en
        ... (other languages)
    """
    common_dir = os.path.join(root, 'common')

    # Start from a clean common directory, seeded with the uploaded data.
    rmtree_if_exists(common_dir)
    shutil.move(os.path.join(root, 'upload.cppreference.com/mwiki'), common_dir)
    shutil.rmtree(os.path.join(root, 'upload.cppreference.com'))

    font_files = (
        'DejaVuSansMonoCondensed60.ttf',
        'DejaVuSansMonoCondensed75.ttf',
    )

    for lang in ["en"]:
        site_dir = os.path.join(root, lang + ".cppreference.com/")
        html_src = site_dir + "w/"
        data_src = site_dir + "mwiki/"
        html_dst = os.path.join(root, lang)

        if os.path.isdir(html_src):
            shutil.move(html_src, html_dst)

        if os.path.isdir(data_src):
            # the skin files should be the same for all languages, so
            # everything can be merged into the common directory
            move_dir_contents_to_dir(data_src, common_dir)

        # the custom fonts live next to the html and are copied as well
        for font in font_files:
            shutil.copy(os.path.join(site_dir, font), common_dir)

        # whatever is still left in the per-language directory is dropped
        shutil.rmtree(site_dir)

    # the XML source file is not part of the final archive
    for entry in os.listdir(root):
        if fnmatch.fnmatch(entry, 'cppreference-export*.xml'):
            os.remove(os.path.join(root, entry))
| 82 | + |
def add_file_to_rename_map(rename_map, dir, fn, new_fn):
    """Record the rename of ``fn`` to ``new_fn`` inside directory ``dir``.

    The tuple ``(dir, fn, new_fn)`` is appended to ``rename_map`` only when
    ``dir/fn`` is an existing regular file; otherwise an error line is
    printed and the map is left untouched.
    """
    candidate = os.path.join(dir, fn)
    if os.path.isfile(candidate):
        rename_map.append((dir, fn, new_fn))
    else:
        print("ERROR: Not renaming '{0}' because path does not exist".format(candidate))
| 89 | + |
def convert_loader_name(fn):
    """Map a complex MediaWiki loader URL/filename to a simplified name.

    ``fn`` is searched against a fixed list of regular expressions, in
    order; the simplified name of the first match is returned. An
    ``Exception`` is raised when nothing matches.
    """
    # Order matters: the first matching pattern wins.
    known_loaders = (
        ("modules=site&only=scripts", "site_scripts.js"),
        ("modules=site&only=styles", "site_modules.css"),
        ("modules=skins.*&only=scripts", "skin_scripts.js"),
        ("modules=startup&only=scripts", "startup_scripts.js"),
        ("modules=.*ext.*&only=styles", "ext.css"),
    )
    for pattern, simple_name in known_loaders:
        if re.search(pattern, fn):
            return simple_name

    raise Exception('Loader file {0} does not match any known files'.format(fn))
| 106 | + |
def find_files_to_be_renamed(root):
    """Build the rename map for the archive rooted at ``root``.

    Returns a list of ``(dir, old_fn, new_fn)`` tuples: the directory the
    file resides in, then the source and destination filenames.

    The rename map specifies files to be renamed in order to support them
    on Windows filesystems, which don't support certain characters in file
    names. Entries whose source file does not exist are skipped by
    ``add_file_to_rename_map`` (which prints an error instead).
    """
    rename_map = []

    files_rename = []   # general files to be renamed
    files_loader = []   # files served by load.php. These should map to
                        # consistent and short file names because we
                        # modify some of them later in the pipeline

    for dirpath, dirnames, filenames in os.walk(root):
        filenames_loader = set(fnmatch.filter(filenames, 'load.php[?]*'))
        # match any filenames with '?"*' characters
        filenames_rename = set(fnmatch.filter(filenames, '*[?"*]*'))

        # don't process load.php files in general rename handler
        filenames_rename -= filenames_loader

        # Sorted so the order of entries within a directory does not depend
        # on set iteration order, which varies between runs.
        files_loader.extend((dirpath, fn) for fn in sorted(filenames_loader))
        files_rename.extend((dirpath, fn) for fn in sorted(filenames_rename))

    for dirpath, orig_fn in files_rename:
        # drop the query string, then replace the remaining bad characters
        fn = re.sub(r'\?.*', '', orig_fn)
        fn = fn.replace('"', '_q_')
        fn = fn.replace('*', '_star_')
        add_file_to_rename_map(rename_map, dirpath, orig_fn, fn)

    # map loader names to more recognizable names
    for dirpath, fn in files_loader:
        new_fn = convert_loader_name(fn)
        add_file_to_rename_map(rename_map, dirpath, fn, new_fn)

    # rename filenames that conflict on case-insensitive filesystems
    # TODO: perform this automatically
    for subdir in ('en/cpp/numeric/math', 'en/c/numeric/math'):
        add_file_to_rename_map(rename_map, os.path.join(root, subdir),
                               'NAN.html', 'NAN.2.html')
    return rename_map
| 150 | + |
def rename_files(rename_map):
    """Apply every ``(dir, old_fn, new_fn)`` entry of ``rename_map``.

    Each rename is announced on stdout before the file is moved.
    """
    for entry in rename_map:
        directory, old_fn, new_fn = entry
        source = os.path.join(directory, old_fn)
        destination = os.path.join(directory, new_fn)
        print("Renaming '{0}' to \n '{1}'".format(source, destination))
        shutil.move(source, destination)
| 157 | + |
def find_html_files(root):
    """Return the paths of all ``*.html`` files found under ``root``.

    These are the files that need to be preprocessed. The tree is walked
    with ``os.walk`` and each result is the directory joined with the
    file name.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(root)
        for name in fnmatch.filter(names, '*.html')
    ]
| 165 | + |
def is_loader_link(target):
    """Return True when ``target`` is an absolute link to MediaWiki's load.php.

    The link must start with ``http://`` or ``https://``, followed by a
    lowercase-letter subdomain of cppreference.com and the path
    ``/mwiki/load.php``. Anything after that (the query string) is ignored.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    pattern = r'https?://[a-z]+\.cppreference\.com/mwiki/load\.php'
    return re.match(pattern, target) is not None
| 170 | + |
def transform_loader_link(target, file, root):
    """Turn an absolute load.php link into a path relative to ``file``.

    The loader URL ``target`` is mapped to its simplified name inside
    ``{root}/common/``; the result is that location expressed relative to
    the directory containing ``file``.
    """
    simple_name = convert_loader_name(target)
    absolute_target = os.path.join(root, "common/" + simple_name)
    containing_dir = os.path.dirname(file)
    return os.path.relpath(absolute_target, containing_dir)
| 175 | + |
def is_external_link(target):
    """Return True when ``target`` starts with an http, https or ftp scheme."""
    external_prefixes = ('http://', 'https://', 'ftp://')
    return target.startswith(external_prefixes)
| 186 | + |
def trasform_relative_link(rename_map, target):
    """Rewrite a relative link so it matches the rearranged archive.

    ``rename_map`` is a list of ``(dir, fn, new_fn)`` tuples and ``target``
    is the (possibly percent-encoded) link. The link is decoded, every
    occurrence of a renamed file name is replaced by its new name, paths
    into the old ``mwiki`` data directories are redirected to
    ``../common/``, and the query string following ``.php`` or ``.css`` is
    dropped. The result is percent-encoded again with ``#`` left intact.
    """
    target = urllib.parse.unquote(target)
    for _dir, fn, new_fn in rename_map:
        # plain substring replacement; the directory part is not consulted
        target = target.replace(fn, new_fn)
    target = target.replace('../../upload.cppreference.com/mwiki/', '../common/')
    target = target.replace('../mwiki/', '../common/')
    # Raw string: '\.' and '\?' are invalid escapes in a normal literal.
    target = re.sub(r'(\.php|\.css)\?.*', r'\1', target)
    target = urllib.parse.quote(target)
    # quote() also encodes the fragment separator; restore it
    target = target.replace('%23', '#')
    return target
| 197 | + |
def transform_link(rename_map, target, file, root):
    """Transform a link found in ``file`` according to ``rename_map``.

    ``target`` is the link to transform, ``file`` is the path of the file
    the link came from and ``root`` is the path to the root of the archive.

    Loader links are made relative to ``file``, other external links are
    returned unchanged, and everything else is treated as a relative link.
    """
    # Loader links are checked first: they are absolute URLs too, so they
    # would otherwise be classified as external.
    if is_loader_link(target):
        result = transform_loader_link(target, file, root)
    elif is_external_link(target):
        result = target
    else:
        result = trasform_relative_link(rename_map, target)
    return result
| 210 | + |
def has_class(el, classes_to_check):
    """Return True when ``el`` carries at least one of the given classes.

    ``el`` is any object whose ``get('class')`` returns the class attribute
    string, or None when the attribute is absent. ``classes_to_check`` is an
    iterable of class names; a single class name given as a plain string is
    accepted too. Empty names never match.
    """
    value = el.get('class')
    if value is None:
        return False

    # A bare string would otherwise be iterated character by character and
    # silently never match a real class name.
    if isinstance(classes_to_check, str):
        classes_to_check = [classes_to_check]

    # split() without arguments handles any whitespace separator (spaces,
    # tabs, newlines) and never yields empty strings.
    present = set(value.split())
    return any(cl in present for cl in classes_to_check)
| 220 | + |
def preprocess_html_file(root, fn, rename_map):
    """Clean up the HTML file ``fn`` in place for offline use.

    ``root`` is the root of the archive and ``rename_map`` is the list of
    ``(dir, old_fn, new_fn)`` renames; both are passed to ``transform_link``
    when rewriting links.

    The file is parsed with lxml's HTML parser, then:
      * elements with class ``noprint`` or ``editsection`` and the element
        with id ``toc`` are removed;
      * "see also" rows (and a preceding separator row) are removed, along
        with "See also" headings that are left without content;
      * unused ``<link>`` elements and Google Analytics scripts are removed;
      * ``src`` and ``href`` attributes are rewritten via ``transform_link``.
    Parser diagnostics are printed and the result is written back to ``fn``
    as UTF-8 HTML.
    """
    parser = etree.HTMLParser()
    html = etree.parse(fn, parser)

    # remove non-printable elements
    # NOTE(review): lxml's remove() also drops the element's tail text;
    # confirm that no meaningful text follows these elements.
    for el in html.xpath('//*'):
        # A single combined test: an element matching both conditions must
        # only be removed once, since it has no parent after the removal.
        if has_class(el, ['noprint', 'editsection']) or el.get('id') == 'toc':
            el.getparent().remove(el)

    # remove see also links between C and C++ documentations
    for el in html.xpath('//tr[@class]'):
        if not has_class(el, ['t-dcl-list-item']):
            continue

        child_divs = el.xpath('.//td/div[@class]')
        if not any(has_class(div, ['t-dcl-list-see']) for div in child_divs):
            continue

        # remove preceding separator, if any
        prev = el.getprevious()
        if prev is not None:
            prev_tds = prev.xpath('.//td[@class]')
            if any(has_class(td, ['t-dcl-list-sep']) for td in prev_tds):
                prev.getparent().remove(prev)

        el.getparent().remove(el)

    # remove "See also" headings that no longer have any content
    for el in html.xpath('//h3'):
        if len(el.xpath(".//span[@id = 'See_also']")) == 0:
            continue

        following = el.getnext()
        if following is None:
            el.getparent().remove(el)
            continue

        if following.tag != 'table':
            continue

        if not has_class(following, ['t-dcl-list-begin']):
            continue

        # the table still has rows, so the section is kept
        if len(following.xpath('.//tr')) > 0:
            continue

        el.getparent().remove(el)
        following.getparent().remove(following)

    # remove external links to unused resources
    for el in html.xpath('/html/head/link'):
        if el.get('rel') in ['alternate', 'search', 'edit', 'EditURI']:
            el.getparent().remove(el)

    # remove Google Analytics scripts
    for el in html.xpath('/html/body/script'):
        src = el.get('src')
        text = el.text
        if src is not None and 'google-analytics.com/ga.js' in src:
            el.getparent().remove(el)
        elif text is not None and ('google-analytics.com/ga.js' in text
                                   or 'pageTracker' in text):
            el.getparent().remove(el)

    # apply changes to links caused by file renames; both attributes are
    # rewritten when an element happens to carry both
    for el in html.xpath('//*[@src or @href]'):
        for attr in ('src', 'href'):
            link = el.get(attr)
            if link is not None:
                el.set(attr, transform_link(rename_map, link, fn, root))

    for err in parser.error_log:
        print("HTML WARN: {0}".format(err))

    html.write(fn, encoding='utf-8', method='html')
| 295 | + |
def preprocess_css_file(fn):
    """Adjust the CSS file ``fn`` in place for the rearranged archive.

    The file is read and written as UTF-8. References to the two DejaVu
    font files are changed to point at the same directory as the CSS file,
    and ``nth-child(1)`` is replaced by ``first-child``.
    """
    # context managers close the file even when reading or writing fails
    with open(fn, "r", encoding='utf-8') as f:
        text = f.read()

    # note that query string is not used in css files

    text = text.replace('../DejaVuSansMonoCondensed60.ttf', 'DejaVuSansMonoCondensed60.ttf')
    text = text.replace('../DejaVuSansMonoCondensed75.ttf', 'DejaVuSansMonoCondensed75.ttf')

    # QT Help viewer doesn't understand nth-child
    text = text.replace('nth-child(1)', 'first-child')

    with open(fn, "w", encoding='utf-8') as f:
        f.write(text)
0 commit comments