diff --git a/src/modm_data/__init__.py b/src/modm_data/__init__.py index 62d6876..0c0cd49 100644 --- a/src/modm_data/__init__.py +++ b/src/modm_data/__init__.py @@ -12,23 +12,6 @@ except PackageNotFoundError: __version__ = "0.0.1" - -from . import ( - cubehal, - cubemx, - cube2owl, - dl, - header2svd, - html, - html2owl, - html2svd, - owl, - pdf, - pdf2html, - svd, - utils, -) - __all__ = [ "cube2owl", "cubehal", diff --git a/src/modm_data/dl/__init__.py b/src/modm_data/dl/__init__.py index 6abdc3d..d0766e4 100644 --- a/src/modm_data/dl/__init__.py +++ b/src/modm_data/dl/__init__.py @@ -1,7 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -from . import stmicro from .store import download_data, download_file __all__ = [ diff --git a/src/modm_data/header2svd/__init__.py b/src/modm_data/header2svd/__init__.py index 041edaa..8292d07 100644 --- a/src/modm_data/header2svd/__init__.py +++ b/src/modm_data/header2svd/__init__.py @@ -5,7 +5,6 @@ # CMSIS Header to SVD Pipeline """ -from . import stmicro from .header import Header __all__ = [ diff --git a/src/modm_data/header2svd/stmicro/__init__.py b/src/modm_data/header2svd/stmicro/__init__.py index fc925f9..beca289 100644 --- a/src/modm_data/header2svd/stmicro/__init__.py +++ b/src/modm_data/header2svd/stmicro/__init__.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: MPL-2.0 from .header import Header, getDefineForDevice - from .tree import normalize_memory_map __all__ = [ diff --git a/src/modm_data/html/__init__.py b/src/modm_data/html/__init__.py index ddf16c6..39f618f 100644 --- a/src/modm_data/html/__init__.py +++ b/src/modm_data/html/__init__.py @@ -1,7 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -from . 
import stmicro from .document import Document from .chapter import Chapter from .table import Table diff --git a/src/modm_data/html2owl/__init__.py b/src/modm_data/html2owl/__init__.py index 6ff35ff..b0130eb 100644 --- a/src/modm_data/html2owl/__init__.py +++ b/src/modm_data/html2owl/__init__.py @@ -5,6 +5,4 @@ # HTML to OWL Pipeline """ -from . import stmicro - __all__ = ["stmicro"] diff --git a/src/modm_data/html2svd/__init__.py b/src/modm_data/html2svd/__init__.py index 8518d90..1112e46 100644 --- a/src/modm_data/html2svd/__init__.py +++ b/src/modm_data/html2svd/__init__.py @@ -5,6 +5,4 @@ # HTML to SVD Pipeline """ -from . import stmicro - __all__ = ["stmicro"] diff --git a/src/modm_data/owl/__init__.py b/src/modm_data/owl/__init__.py index 21d0205..68e6e4a 100644 --- a/src/modm_data/owl/__init__.py +++ b/src/modm_data/owl/__init__.py @@ -3,7 +3,6 @@ from .store import Store from .identifier import DeviceIdentifier -from . import stmicro __all__ = [ "stmicro", diff --git a/src/modm_data/pdf/__init__.py b/src/modm_data/pdf/__init__.py index ac442fd..48e261b 100644 --- a/src/modm_data/pdf/__init__.py +++ b/src/modm_data/pdf/__init__.py @@ -18,17 +18,17 @@ from .link import ObjLink, WebLink from .path import Path from .image import Image -from .render import render_page_pdf +from .render import annotate_debug_info from .structure import Structure __all__ = [ + "annotate_debug_info", "Document", "Page", "Character", - "ObjLink", - "WebLink", "Path", "Image", + "ObjLink", + "WebLink", "Structure", - "render_page_pdf", ] diff --git a/src/modm_data/pdf/character.py b/src/modm_data/pdf/character.py index 2140a0e..e99cd1f 100644 --- a/src/modm_data/pdf/character.py +++ b/src/modm_data/pdf/character.py @@ -1,21 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# PDF Characters - -Each character on the PDF page is represented by a character object, describing -exactly where and how to render the associated glyph. 
- -While there are font flags, PDF files typically use entirely different fonts to -render normal, bold, and italic characters. - -The character's loose bounding box may not always be available, since it must be -explicitly provided by the font. The tight bounding box is only available as -long as the glyph is renderable, so a space character may have a loose, but not -a tight bounding box, or none at all. -""" - import math import ctypes from functools import cached_property @@ -26,8 +11,16 @@ class Character: """ - This class contains all information about a single character in the PDF - page. + Each character on the PDF page is represented by a character object, + describing exactly where and how to render the associated glyph. + + While there are font flags, PDF files typically use entirely different fonts + to render normal, bold, and italic characters. + + The character's loose bounding box may not always be available, since it + must be explicitly provided by the font. The tight bounding box is only + available as long as the glyph is renderable, so a space character may have + a loose, but not a tight bounding box, or none at all. """ class RenderMode(Enum): diff --git a/src/modm_data/pdf/document.py b/src/modm_data/pdf/document.py index 3e6c7f7..a5b725b 100644 --- a/src/modm_data/pdf/document.py +++ b/src/modm_data/pdf/document.py @@ -1,16 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# PDF Documents - -The PDF document is the root of the entire data structure and provides access to -PDF metadata, the table of contents, as well as individual pages. - -You should extend from this class for a specific vendor to provide the -correct page class from `page()` function. 
-""" - import ctypes import logging import pypdfium2 as pp @@ -39,6 +29,13 @@ def __repr__(self) -> str: class Document(pp.PdfDocument): """ + The PDF document is the root of the entire data structure and provides + access to PDF metadata, the table of contents, as well as individual + pages. + + You should extend from this class for a specific vendor to provide the + correct page class from `page()` function. + This class is a convenience wrapper with caching around the high-level APIs of pypdfium. """ diff --git a/src/modm_data/pdf/image.py b/src/modm_data/pdf/image.py index 4c88b81..a01b24d 100644 --- a/src/modm_data/pdf/image.py +++ b/src/modm_data/pdf/image.py @@ -1,12 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# PDF Images - -Images support bitmap data. -""" - from functools import cached_property import pypdfium2 as pp from ..utils import Point, Rectangle, Line diff --git a/src/modm_data/pdf/link.py b/src/modm_data/pdf/link.py index f4d43df..ce1f06c 100644 --- a/src/modm_data/pdf/link.py +++ b/src/modm_data/pdf/link.py @@ -1,17 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# Inter-PDF References and External Links - -PDF contains two types of links: -1. Internal references to other objects by identifier: `ObjLink`. -2. External links to URLs: `WebLink`. - -Both types can be extracted by calling the `modm_data.pdf.page.Page.objlinks` -and `modm_data.pdf.page.Page.weblinks` properties. -""" - import ctypes from functools import cached_property import pypdfium2 as pp @@ -19,7 +8,11 @@ class ObjLink: - """A link to a PDF object giving the bounding box and destination page.""" + """ + An internal reference to other objects by an identifier giving the bounding + box and destination page. These links can be extracted by calling the + `modm_data.pdf.page.Page.objlinks` property. 
+ """ def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK): # noqa: F821 """ @@ -47,7 +40,11 @@ def __repr__(self) -> str: class WebLink: - """A weblink object giving the bounding box and destination URL.""" + """ + An external reference to URLs giving the bounding box and destination URL. + These links can be extracted by calling the + `modm_data.pdf.page.Page.weblinks` property. + """ def __init__(self, page: "modm_data.pdf.Page", index: int): # noqa: F821 """ diff --git a/src/modm_data/pdf/page.py b/src/modm_data/pdf/page.py index a363bce..1caafda 100644 --- a/src/modm_data/pdf/page.py +++ b/src/modm_data/pdf/page.py @@ -1,12 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# PDF Pages - - -""" - import ctypes import logging import weakref diff --git a/src/modm_data/pdf/path.py b/src/modm_data/pdf/path.py index 1dee9a9..fefbbc8 100644 --- a/src/modm_data/pdf/path.py +++ b/src/modm_data/pdf/path.py @@ -1,14 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# PDF Graphics - -PDF uses a subset of the PostScript graphics language, which draws vector paths -with various rendering options. We are only interested in the basic properties, -in particular, for recognizing table cell borders. -""" - import ctypes from functools import cached_property from enum import Enum @@ -18,6 +10,10 @@ class Path(pp.PdfObject): """ + PDF uses a subset of the PostScript graphics language, which draws vector + paths with various rendering options. We are only interested in the basic + properties, in particular, for recognizing table cell borders. + This class specializes `pypdfium2.PdfObject` to add accessors for graphics containing vector paths of various configurations. 
diff --git a/src/modm_data/pdf/render.py b/src/modm_data/pdf/render.py index f4cf967..f719e2b 100644 --- a/src/modm_data/pdf/render.py +++ b/src/modm_data/pdf/render.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: MPL-2.0 from ..utils import VLine, HLine +from .page import Page import pypdfium2 as pp @@ -47,13 +48,26 @@ def _rect(pageobj, rotation, rect, **kw): pp.raw.FPDFPage_InsertObject(pageobj, obj) -def render_page_pdf(doc, page, new_doc=None, index=0): +def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument: + """ + Copies each page into a new or existing PDF document and overlays the internal information on top of the content. + - Renders the bounding boxes in RED and origins in BLACK of all characters. + - Renders the bounding boxes of web links in BLUE GREEN. + - Renders the bounding boxes of object links in YELLOW GREEN. + - Renders all graphics paths in BLUE. + - Renders the bounding boxes of computed graphics clusters in CYAN. + + :param page: The page to be annotated. + :param new_doc: The PDF document to copy the page to. If not provided, a new document is created. + :param index: The index of the page in the new document. + :return: The new document with the annotated page added. 
+ """ _, height = page.width, page.height if new_doc is None: new_doc = pp.raw.FPDF_CreateNewDocument() # copy page over to new doc - assert pp.raw.FPDF_ImportPages(new_doc, doc, str(page.number).encode("ascii"), index) + assert pp.raw.FPDF_ImportPages(new_doc, page.pdf, str(page.number).encode("ascii"), index) new_page = pp.raw.FPDF_LoadPage(new_doc, index) rotation = page.rotation diff --git a/src/modm_data/pdf/structure.py b/src/modm_data/pdf/structure.py index 00199bd..f8290ec 100644 --- a/src/modm_data/pdf/structure.py +++ b/src/modm_data/pdf/structure.py @@ -1,19 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -""" -# Tagged PDFs - -A tagged PDF/UA (Universal Accessibility) contains the structure of content as a -tree data structure with similar semantics to HTML. Sadly, the quality of the -tags depends heavily on the PDF creation software. See [Overview of PDF tags]( -https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/). - -An example of an accessible pdf that can be inspected via these classes: -[Rock On, D.C. Music Festival]( -https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf). -""" - import ctypes from functools import cached_property, cache import pypdfium2 as pp @@ -22,8 +9,13 @@ class Structure: """ - A PDF/UA ("tagged PDF") contains the structure of content as a tree data - structure with similar semantics to HTML. + A tagged PDF/UA (Universal Accessibility) contains the structure of content + as a tree data structure with similar semantics to HTML. Sadly, the quality + of the tags depends heavily on the PDF creation software. See + [Overview of PDF tags](https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/). + + An example of an accessible pdf that can be inspected via these classes: + [Rock On, D.C. Music Festival](https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf). 
This class is a convenience wrapper around [the pdfium structtree methods]( https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h). diff --git a/src/modm_data/pdf2html/__init__.py b/src/modm_data/pdf2html/__init__.py index 16fcea6..f2d0da0 100644 --- a/src/modm_data/pdf2html/__init__.py +++ b/src/modm_data/pdf2html/__init__.py @@ -5,25 +5,18 @@ # PDF to HTML Pipeline """ -from . import stmicro -from .render import render_page_pdf +from .render import annotate_debug_info from .convert import convert, patch from .html import format_document, write_html -from . import ast -from . import cell -from . import figure -from . import line -from . import page -from . import table - __all__ = [ "stmicro", - "render_page_pdf", + "ti", "convert", - "patch", + "annotate_debug_info", "format_document", "write_html", + "patch", "ast", "cell", "figure", diff --git a/src/modm_data/pdf2html/convert.py b/src/modm_data/pdf2html/convert.py index c65d95e..8197ede 100644 --- a/src/modm_data/pdf2html/convert.py +++ b/src/modm_data/pdf2html/convert.py @@ -2,9 +2,10 @@ # SPDX-License-Identifier: MPL-2.0 from anytree import RenderTree +from typing import Iterable from .html import format_document, write_html -from .render import render_page_pdf +from .render import annotate_debug_info from ..utils import pkg_apply_patch, pkg_file_exists, apply_patch from .ast import merge_area from pathlib import Path @@ -12,17 +13,17 @@ def convert( - doc, - page_range, - output_path, - format_chapters=False, - pretty=True, - render_html=True, - render_pdf=False, - render_all=False, - show_ast=False, - show_tree=False, - show_tags=False, + doc: pp.PdfDocument, + page_range: Iterable[int], + output_path: Path, + format_chapters: bool = False, + pretty: bool = True, + render_html: bool = True, + render_pdf: bool = False, + render_all: bool = False, + show_ast: bool = False, + show_tree: bool = False, + show_tags: bool = False, ) -> bool: document = None debug_doc = None @@ -47,7 +48,7 @@ def 
convert( document = merge_area(document, area) if render_pdf: - debug_doc = render_page_pdf(doc, page, debug_doc, debug_index) + debug_doc = annotate_debug_info(page, debug_doc, debug_index) debug_index += 1 if render_pdf: diff --git a/src/modm_data/pdf2html/render.py b/src/modm_data/pdf2html/render.py index efbbee5..07fa490 100644 --- a/src/modm_data/pdf2html/render.py +++ b/src/modm_data/pdf2html/render.py @@ -2,19 +2,25 @@ # SPDX-License-Identifier: MPL-2.0 import pypdfium2 as pp -from ..pdf.render import render_page_pdf as pdf_render_page_pdf +from ..pdf.render import annotate_debug_info as pdf_annotate_debug_info from ..pdf.render import _vline, _hline, _line, _rect +from .page import Page -def render_page_pdf(doc, page, new_doc=None, index=0): +def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument: """ + Copies each page into a new or existing PDF document and overlays the internal information on top of the content. + In addition to the information overlayed in `modm_data.pdf.annotate_debug_info`, this function: + - renders all content areas in ORANGE. + - renders all graphic cluster in content areas in GREEN. + - renders all tables in content areas in BLUE. - - :param doc: PDF document - :param page: PDF page - :param new_doc: Empty PDF document to copy debug renders to + :param page: The page to be annotated. + :param new_doc: The PDF document to copy the page to. If not provided, a new document is created. + :param index: The index of the page in the new document. + :return: The new document with the annotated page added. 
""" - new_doc = pdf_render_page_pdf(doc, page, new_doc, index) + new_doc = pdf_annotate_debug_info(page, new_doc, index) # return new_doc new_page = pp.raw.FPDF_LoadPage(new_doc, index) rotation = page.rotation @@ -58,33 +64,33 @@ def render_page_pdf(doc, page, new_doc=None, index=0): for line in cell.lines: for cluster in line.clusters(): _rect(new_page, rotation, cluster.bbox, width=0.33, stroke=0x808080) - if cell.b.l: + if cell.borders.left: _vline( - new_page, rotation, cell.bbox.left, cell.bbox.bottom, cell.bbox.top, width=cell.b.l, stroke=0xFF0000 + new_page, rotation, cell.bbox.left, cell.bbox.bottom, cell.bbox.top, width=cell.borders.left, stroke=0xFF0000 ) - if cell.b.r: + if cell.borders.right: _vline( new_page, rotation, cell.bbox.right, cell.bbox.bottom, cell.bbox.top, - width=cell.b.r, + width=cell.borders.right, stroke=0x0000FF, ) - if cell.b.b: + if cell.borders.bottom: _hline( new_page, rotation, cell.bbox.bottom, cell.bbox.left, cell.bbox.right, - width=cell.b.b, + width=cell.borders.bottom, stroke=0x00FF00, ) - if cell.b.t: + if cell.borders.top: _hline( - new_page, rotation, cell.bbox.top, cell.bbox.left, cell.bbox.right, width=cell.b.t, stroke=0x808080 + new_page, rotation, cell.bbox.top, cell.bbox.left, cell.bbox.right, width=cell.borders.top, stroke=0x808080 ) assert pp.raw.FPDFPage_GenerateContent(new_page) diff --git a/src/modm_data/pdf2html/stmicro/document.py b/src/modm_data/pdf2html/stmicro/document.py index 97654c4..ba4ac88 100644 --- a/src/modm_data/pdf2html/stmicro/document.py +++ b/src/modm_data/pdf2html/stmicro/document.py @@ -5,9 +5,15 @@ from anytree import RenderTree from .page import Page as StmPage from ...pdf import Document as PdfDocument -from ..ast import normalize_lines, normalize_captions, normalize_lists -from ..ast import normalize_paragraphs, normalize_headings, normalize_registers -from ..ast import normalize_tables +from ..ast import ( + normalize_lines, + normalize_captions, + normalize_lists, + 
normalize_paragraphs, + normalize_headings, + normalize_registers, + normalize_tables, +) _LOGGER = logging.getLogger(__name__) diff --git a/src/modm_data/pdf2html/ti/__init__.py b/src/modm_data/pdf2html/ti/__init__.py new file mode 100644 index 0000000..6b20557 --- /dev/null +++ b/src/modm_data/pdf2html/ti/__init__.py @@ -0,0 +1,8 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + + +from .document import Document +from .page import Page + +__all__ = ["Document", "Page"] diff --git a/src/modm_data/pdf2html/ti/__main__.py b/src/modm_data/pdf2html/ti/__main__.py new file mode 100644 index 0000000..0ef8471 --- /dev/null +++ b/src/modm_data/pdf2html/ti/__main__.py @@ -0,0 +1,108 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +import re +import tqdm +import logging +import argparse +import subprocess +from pathlib import Path +from multiprocessing.pool import ThreadPool + +from .. import convert, patch + + +def main(): + import modm_data + + parser = argparse.ArgumentParser() + parser.add_argument("--document", type=Path) + parser.add_argument("--output", type=str, default="") + parser.add_argument("--page", type=int, action="append") + parser.add_argument("--range", action="append") + parser.add_argument("--pdf", action="store_true") + parser.add_argument("--ast", action="store_true") + parser.add_argument("--tree", action="store_true") + parser.add_argument("--html", action="store_true") + parser.add_argument("--parallel", action="store_true") + parser.add_argument("--chapters", action="store_true") + parser.add_argument("--tags", action="store_true") + parser.add_argument("--all", action="store_true") + parser.add_argument("-v", dest="verbose", action="count", default=0) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + doc = modm_data.pdf2html.ti.Document(args.document) + if doc.page_count == 0 or not doc.page(1).width: + print("Corrupt PDF!") + exit(1) + + 
if args.page or args.range: + page_range = list(map(lambda p: p - 1, args.page or [])) + if args.range: + for arange in args.range: + start, stop = arange.split(":") + arange = range(int(start or 0), int(stop or doc.page_count - 1) + 1) + page_range.extend([p - 1 for p in arange]) + page_range.sort() + else: + page_range = range(doc.page_count) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if args.parallel: + log = Path(f"log/ti/html/{doc.name}.txt") + log.parent.mkdir(exist_ok=True, parents=True) + with log.open("w") as logfile: + print(doc.page_count, doc.metadata, doc.is_tagged, file=logfile) + output_dir = output_path.parent / output_path.stem + output_dir.mkdir(parents=True, exist_ok=True) + dests = [(0, "introduction")] + for toc in doc.toc: + if toc.level == 0 and not toc.title.startswith("Table"): + title = toc.title.lower().strip("0123456789").strip() + title = re.sub(r"[\(\)/®&\n\r,;:™]", "", title) + title = re.sub(r"[ -]", "_", title) + title = re.sub(r"_+", "_", title) + title = title.replace("²", "2") + if not any(c in toc.title for c in {"Contents", "List of ", "Index"}): + dests.append((toc.page_index, title)) + print(toc.page_index, toc.title, file=logfile) + dests.append((doc.page_count, None)) + ranges = [(p0, p1, t0) for (p0, t0), (p1, t1) in zip(dests, dests[1:]) if p0 != p1] + calls = [] + for ii, (p0, p1, title) in enumerate(ranges): + call = ( + f"python3 -m modm_data.pdf2html.ti " + f"--document {args.document} --range {p0 + 1}:{p1} --html " + f"--output {output_dir}/chapter_{ii}_{title}.html" + ) + calls.append(call + f" >> {log} 2>&1") + print(call, file=logfile) + with ThreadPool() as pool: + retvals = list(tqdm.tqdm(pool.imap(lambda c: subprocess.run(c, shell=True), calls), total=len(calls))) + for retval, call in zip(retvals, calls): + if retval.returncode != 0: + print(call) + if all(r.returncode == 0 for r in retvals): + from . 
def _debug(func, indata, debug=0):
    """
    Run one normalization pass and optionally log the tree around it.

    :param func: The normalization function; called as `func(indata)`.
    :param indata: The tree handed to `func`.
    :param debug: `-1` logs the rendered input tree before the call, `1` logs
        the rendered output tree after the call, any other value logs only the
        name of `func`.
    :return: Whatever `func(indata)` returns.
    """
    _LOGGER.debug(func.__name__)
    if debug == -1:
        _LOGGER.debug(RenderTree(indata))
        # `Logger.debug()` requires a message argument; calling it without one
        # raises a TypeError. Log an empty message as the separator instead.
        _LOGGER.debug("")
    outdata = func(indata)
    if debug == 1:
        _LOGGER.debug(RenderTree(outdata))
        _LOGGER.debug("")
    return outdata
def _areas_black_white(page) -> dict:
    """
    Compute the named layout areas of a page for the black/white template.

    All rectangles are first declared as fractions of the page size and are
    converted to page coordinates by `_scale()` right before returning.

    :param page: The page to compute the areas for. Reads `rotation`, `width`,
        `height`, `index`, `pdf.name` and `paths`, and calls `text_in_area()`.
    :return: A dictionary of area name to scaled `Rectangle`. The `"content"`
        entry is a list of scaled rectangles instead of a single one.
    """

    def _scale(r):
        # Convert a relative rectangle into page coordinates. Rotated pages
        # swap the axes and mirror the horizontal one.
        if page.rotation:
            return Rectangle(
                r.bottom * page.width, (1 - r.right) * page.height, r.top * page.width, (1 - r.left) * page.height
            )
        return Rectangle(r.left * page.width, r.bottom * page.height, r.right * page.width, r.top * page.height)

    bottom_left = Rectangle(0.05, 0.02, 0.11, 0.07)
    bottom_right = Rectangle(0.89, 0.02, 0.95, 0.07)
    top_left = Rectangle(0.05, 0.9175, 0.5, 0.94)
    top_right = Rectangle(0.5, 0.9175, 0.95, 0.94)
    content = Rectangle(0.05, 0.07, 0.95, 0.9175)
    all_content = [content]
    areas = {"id": top_left if page.index % 2 else top_right}
    if page.index == 0:
        # Publish date on the bottom left on first page
        areas["date"] = bottom_left
        # number on the bottom right on first page
        areas["number"] = bottom_right
        # Add top areas
        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
    else:
        # Page number on bottom
        areas["number"] = bottom_left if page.index % 2 else bottom_right
        # Chapter name on top
        # areas["top"] = top

    # Recognize the two column design of the Datasheets with a big table underneath
    if page.index < 3 and "DS" in page.pdf.name:
        # Find a wide path that would denote the beginning of a table
        scaled_content = _scale(content)
        top_rect = [
            p.bbox.top / page.height
            for p in page.paths
            if scaled_content.contains(p.bbox) and p.bbox.width > page.width * 0.75
        ]
        if top_rect:
            # offset for table label just above it
            # NOTE: max() must receive the list itself: unpacking a list with
            # a single element calls max(<float>), which raises a TypeError.
            ybottom = max(top_rect) + 0.0175
        else:
            ybottom = content.bottom
        # Try to find list or sublists in these areas
        mr = Rectangle(0.49, ybottom, 0.51, content.top)
        br = Rectangle(0.51, ybottom, 0.5325, content.top)
        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
        text_middle = page.text_in_area(_scale(mr))
        text_bullets = page.text_in_area(_scale(br))
        text_hyphens = page.text_in_area(_scale(hr))
        has_bullets = any(c in text_bullets for c in {"•", chr(61623)})
        has_hyphens = "-" in text_hyphens
        if not text_middle and (has_bullets or has_hyphens):
            areas["middle_bullets"] = br
            areas["middle_hyphens"] = hr
            # Replace the full-width content area (always the last entry) by
            # a left and a right column.
            all_content = all_content[:-1]
            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
            if top_rect:
                # The area below the columns, starting at the wide path.
                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))

    areas["content"] = all_content
    return {
        name: [_scale(r) for r in area] if isinstance(area, list) else _scale(area)
        for name, area in areas.items()
    }
page.rotation: + content = 0.14 + spacing.update( + { + "x_em": 0.01 * page.height, + "y_em": 0.01 * page.width, + "x_left": content * page.width, + "x_right": (1 - content) * page.width, + "x_content": 0.2075 * page.width, + "y_tline": 0.005 * page.width, + "lh": 1.2, + "sc": 0.4, + } + ) + return spacing | _spacing_special(page) + + +def _spacing_special(page) -> dict: + return {} + + +def _linesize_black_white(line: CharLine) -> str: + rsize = line.height + # print(rsize, line.content) + if rsize >= 11.9: + return "h1" + elif rsize >= 10.9: + return "h2" + elif rsize >= 9.9: + return "n" + else: + return "fn" + + +def _colors_black_white(color: int) -> str: + if 0xFF <= color <= 0xFF: + return "black" + if 0xFFFFFFFF <= color <= 0xFFFFFFFF: + return "white" + if 0xB9C4CAFF <= color <= 0xB9C4CAFF: + return "gray" + if 0x1F81AFFF <= color <= 0x1F81AFFF: + return "lightblue" + if 0x2052FF <= color <= 0x2052FF: + return "darkblue" + if 0x39A9DCFF <= color <= 0x39A9DCFF: + return "blue" + return "unknown" + + +class Page(BasePage): + def __init__(self, document, index: int): + super().__init__(document, index) + producer = self.pdf.metadata.get("Producer", "").lower() + self._template = "black_white" + if "itext" not in producer: + _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. 
'{producer}'") + + if "black_white" in self._template: + self._areas = _areas_black_white(self) + self._spacing = _spacing_black_white(self) + self._colors = _colors_black_white + self._line_size = _linesize_black_white + + def _unicode_filter(self, code: int) -> int: + # Ignore Carriage Return characters and ® (superscript issues) + if code in {0xD, ord("®")}: + return None + # Correct some weird unicode stuffing choices + if code in {2}: + return ord("-") + if code in {61623, 61664}: + return ord("•") + return code + + @cached_property + def identifier(self) -> str: + return self.text_in_named_area("id", check_length=False) + + @cached_property + def top(self) -> str: + if self.index == 0: + return "Cover" + return self.text_in_named_area("top", check_length=False) + + @cached_property + def is_relevant(self) -> bool: + # if any(c in self.top for c in {"Contents", "List of ", "Index"}): + # return False + return True + + @property + def content_ast(self) -> list: + ast = [] + with_graphics = True + if "DS" in self.pdf.name: + # FIXME: Terrible hack to get the ordering information table fixed + # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable + order_page = next( + ( + item.page_index + for item in self.pdf.toc + if item.level == 0 and re.search("ordering +information|part +numbering", item.title, re.IGNORECASE) + ), + -1, + ) + with_graphics = order_page != self.index + for area in self._areas["content"]: + ast.append(self.ast_in_area(area, with_graphics=with_graphics)) + # Add a page node to the first leaf to keep track of where a page starts + first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0]) + Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number) + return ast + + def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]: + # Find all graphic clusters in this area + em = self._spacing["y_em"] + large_area = area.offset_x(em / 2) + graphic_clusters = 
self.graphic_clusters(lambda p: large_area.contains(p.bbox), em / 2) + # for bbox, paths in raw_graphic_clusters: + # # Some docs have large DRAFT chars in the background + # if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths): + # continue + # graphic_clusters.append((bbox, paths)) + + # Find the captions and group them by y origin to catch side-by-side figures + ycaptions = defaultdict(list) + for line in self.charlines_in_area(area, lambda c: "Bold" in c.font): + for cluster in line.clusters(): + for phrase in [r"Figure \d+\.", r"Table \d+\."]: + if re.match(phrase, cluster.content): + ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars)) + ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)] + + # Now associate these captions with the graphics bboxes + categories = [] + for captions in ycaptions: + width = area.width / len(captions) + for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)): + left, right = area.left + ii * width, area.left + (ii + 1) * width + bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height + + # Find the graphic associated with this caption + graphic = next( + ((b, p) for b, p in graphic_clusters if b.bottom <= bottom and left <= b.left and b.right <= right), + None, + ) + if graphic is None: + _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}") + continue + + if self._template == "blue_gray": + # Search for all lines of the current caption with the same properties + cbbox = Rectangle(left, bottom, right, top) + cchars = self.chars_in_area(cbbox) + while True: + nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top) + nchars = self.chars_in_area(nbbox) + if len(cchars) >= len(nchars): + break + cbbox = nbbox + cchars = nchars + else: + cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top) + + otype = phrase.split(" ")[0].lower() + if "Figure" 
in phrase: + # Find all other graphics in the bounding box + gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom) + graphics = [] + for b, p in graphic_clusters: + if gbbox.overlaps(b): + graphics.append((b, p)) + for g in graphics: + graphic_clusters.remove(g) + gbbox = [cluster[0] for cluster in graphics] + gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox) + paths = [p for cluster in graphics for p in cluster[1]] + + if self._template == "blue_gray": + # Search for characters below the graphics bbox, max 1 y_em + gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom) + while True: + gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom) + if not self.chars_in_area(gbbox): + break + # Generate the new bounding box which includes the caption + gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom) + elif "Table" in phrase: + graphic_clusters.remove(graphic) + gbbox, paths = graphic + if ( + self._template == "black_white" + and sum(1 for path in paths if path.count == 2) >= len(paths) / 2 + ): + otype += "_lines" + categories.append((otype, cbbox, gbbox, paths)) + + # Deal with the remaining graphic categories + for gbbox, paths in graphic_clusters: + if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]: + continue + category = "" + if any(isinstance(p, Image) for p in paths): + category = "figure" + elif self._template == "blue_gray": + if all(self._colors(path.stroke) == "gray" or self._colors(path.fill) == "darkblue" for path in paths): + category = "table" + else: + category = "figure" + elif self._template == "black_white": + # Some tables are rendered explicitly with filled rectangular + # shapes while others are implicitly rendered with stroked lines + stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2 + is_table = stroked_table_lines or all( + [any(p.isclose(pp) for pp in path.bbox.points) for p in path.points].count(True) + >= len(path.points) * 2 / 3 
+ for path in paths + ) + if len(paths) > 1 and is_table: + category = "table" + if stroked_table_lines: + category += "_lines" + else: + category = "figure" + + if "table" in category: + # Check if there are only numbers on top of the table + cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"]) + nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xA, 0xD}] + + if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3: + # This is a register table with invisible top borders! + cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars)) + gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top) + name = "register_" + category + else: + cbbox = None + name = category + categories.append((name, cbbox, gbbox, paths)) + else: + categories.append(("figure", None, gbbox, paths)) + + # Convert the objects into specialized classes + categories.sort(key=lambda o: (-o[2].y, o[2].x)) + objects = [] + for otype, caption_bbox, graphics_bbox, graphics_paths in categories: + if "figure" in otype: + figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths) + objects.append(figure) + elif "table" in otype: + xlines, ylines, yhlines = [], [], [] + for path in graphics_paths: + if self._template == "blue_gray" or "_lines" in otype: + if self._colors(path.stroke) == "gray" or "_lines" in otype: + # Intercell paths in gray + if len(path.lines) == 1: + line = path.lines[0] + if line.direction == line.Direction.VERTICAL: + xlines.append(line.specialize()) + elif line.direction == line.Direction.HORIZONTAL: + ylines.append(line.specialize()) + else: + _LOGGER.warn(f"Line not vertical or horizontal: {line}") + else: + _LOGGER.warn(f"Path too long: {path}") + elif self._colors(path.fill) == "darkblue": + # Add the bottom line of the dark blue header box as a very thick line + line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5) + 
yhlines.append(line) + + elif self._template == "black_white": + bbox = path.bbox + is_vertical = bbox.width < bbox.height + width = bbox.width if is_vertical else bbox.height + length = bbox.height if is_vertical else bbox.width + if width <= self._spacing["x_em"] / 2: + if length >= self._spacing["y_em"] / 2: + if is_vertical: + line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width) + xlines.append(line) + else: + line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height) + ylines.append(line) + else: + # Split the rectangle into its outline + xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1)) + xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1)) + ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1)) + ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1)) + if yhlines: + yhlines.sort(key=lambda line: line.p0.y) + ylines.append(yhlines[0]) + if not xlines or not ylines: + continue + table = Table(self, graphics_bbox, xlines, ylines, caption_bbox, is_register="register" in otype) + objects.append(table) + + return objects + + def ast_in_area( + self, + area: Rectangle, + with_graphics: bool = True, + ignore_xpos: bool = False, + with_bits: bool = True, + with_notes: bool = True, + ) -> Node: + x_em = self._spacing["x_em"] + spacing_content = self._spacing["x_content"] + lh_factor = self._spacing["lh"] + # spacing_y = self._spacing["y_em"] + root = Node("area", obj=area, xpos=int(area.left), page=self) + + def unindent(_xpos, _current, _newlines=1): + current = _current + # Check if we need to unindent the current node + while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos: + current = current.parent + if _newlines >= 2 and current.name == "para": + current = current.parent + return current + + def parent_name(current): + return "" if current.parent is None else current.parent.name + + current = root + ypos = area.top + for obj in self.objects_in_area(area, with_graphics): + xpos = 
round(obj.bbox.left) + + # Tables should remain in their current hierarchy regardless of indentation + if isinstance(obj, (Table, Figure)): + current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root) + name = "figure" if isinstance(obj, Figure) else "table" + Node( + name, + parent=current, + obj=obj, + xpos=xpos, + number=-1, + _width=obj.bbox.width / area.width, + _type=obj._type, + ) + ypos = obj.bbox.bottom + + # Lines of text need to be carefully checked for indentation + elif isinstance(obj, CharLine): + newlines = round((ypos - obj.origin) / (lh_factor * obj.height)) + content = obj.content + lcontent = content.lstrip() + content_start = 0 + linesize = self._line_size(obj) + + # Check when the note has finished (=> paragraphs without italic) + if parent_name(current) == "note" and ( + (current.parent.type == "note" and not obj.contains_font(current.parent._font)) + or (current.parent.type in {"caution", "warning"} and newlines >= 2) + ): + current = current.parent.parent + + # Check when the list ends into something indented far too right + elif parent_name(current).startswith("list") and (xpos - current.xpos) >= 2 * x_em: + current = current.parent.parent + + # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content) + + # Check if line is a heading, which may be multi-line, so we must + # be careful not to nest them, but group them properly + # Headings are always inserted into the root note! + if linesize.startswith("h1") or ( + linesize.startswith("h") and xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font + ): + if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) 
*", content)) is not None: + start = min(len(match.group(0)), len(obj.chars) - 1) + marker = match.group(1) + size = marker.count(".") + 2 + else: + start = 0 + marker = None + size = linesize[1] + name = f"head{size}" + # Check if we're already parsing a heading, do not split into two + if parent_name(current) != name or newlines > 2: + content_start = start + xpos = round(obj.chars[content_start].bbox.left) + current = Node(name, parent=root, obj=obj, xpos=xpos, size=size, marker=marker) + current = Node("para", parent=current, obj=obj, xpos=current.xpos) + + # Check if the line is a note and deal with the indentation correctly + elif ( + with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None + ): + content_start = min(len(match.group(0)), len(obj.chars) - 1) + # print(obj.fonts) + # Correct xposition only if the Note: string is very far left + if xpos + 4 * x_em <= current.xpos: + xpos = round(obj.chars[content_start].bbox.left) + # Prevent nesting of notes, they should only be listed + if parent_name(current) == "note": + current = current.parent.parent + current = unindent(xpos, current, 2) + current = Node( + "note", + parent=current, + obj=obj, + xpos=xpos, + type=match.group(1).lower(), + _font=obj.chars[content_start].font, + ) + current = Node("para", parent=current, obj=obj, xpos=current.xpos) + + # Check if line is Table or Figure caption + elif with_graphics and ( + (match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? 
?", content)) is not None + and "Bold" in obj.chars[0].font + ): + content_start = min(len(match.group(0)), len(obj.chars) - 1) + current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root) + current = Node( + "caption", + parent=current, + obj=obj, + xpos=xpos, + _type=match.group(1).lower(), + number=int(match.group(2)), + ) + current = Node("para", parent=current, obj=obj, xpos=current.xpos) + + # Check if line is list and group them according to indentation + elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None: + current = unindent(xpos, current, newlines) + content_start = len(match.group(0)) - 2 + xpos = round(obj.chars[content_start].bbox.left) + name = "listb" + value = lcontent[0] + if value in {"–", "-"}: + name = "lists" + elif value.isalpha(): + name = "lista" + elif value.isnumeric(): + name = "listn" + value = int(match.group(2)) + current = Node(name, parent=current, obj=obj, xpos=xpos, value=value) + current = Node("para", parent=current, obj=obj, xpos=current.xpos) + + # Check if line is a register bit definition + elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None: + if obj.contains_font("Bold"): + # Use the bold character as delimiter + content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font) + else: + # Default back to the regex + if "Reserved" not in content: + _LOGGER.warning( + f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}" + ) + content_start = re.match( + r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content + ) + if content_start is None: + _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!") + content_start = 0 + else: + content_start = len(content_start.group(0)) + if not content_start: + _LOGGER.error(f"Missing content start (=0)! 
'{content}'!") + content_start = min(content_start, len(obj.chars) - 1) + + current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root) + middle = obj.chars[content_start].bbox.left + xpos = round(middle) + current = Node( + "bit", + parent=current, + obj=obj, + xpos=xpos, + _page=self, + _middle=middle, + _left=area.left, + _right=area.right, + ) + current = Node("para", parent=current, obj=obj, xpos=current.xpos) + + # Check if this is a new paragraph + elif newlines >= 2 or current.name not in {"para"}: + # Fix issues where notes are reflowing back left of Note: text + if parent_name(current) in {"note"}: + if xpos < current.parent.xpos: + xpos = current.parent.xpos + # Prevent multiline + current = unindent(xpos, current, newlines) + current = Node("para", parent=current, obj=obj, xpos=xpos if current.is_root else current.xpos) + + elif parent_name(current) not in {"caption", "bit", "area"}: + current = unindent(xpos, current, newlines) + + # Add the actual line + Node("line", parent=current, obj=obj, xpos=xpos, start=content_start, str=content[content_start:50]) + + ypos = obj.origin + + return root + + def __repr__(self) -> str: + return f"StmPage({self.number})" diff --git a/src/modm_data/svd/__init__.py b/src/modm_data/svd/__init__.py index af5d163..8129788 100644 --- a/src/modm_data/svd/__init__.py +++ b/src/modm_data/svd/__init__.py @@ -1,7 +1,6 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -from . import stmicro from .model import Device, PeripheralType, Peripheral, Register, BitField, compare_device_trees from .write import format_svd, write_svd from .read import read_svd