Skip to content

这个有点overkill了,我简化成一个脚本 #36

@vincentaxhe

Description

@vincentaxhe

我用的是之前的版本:想要建立下级目录时,需要构造很精巧的正则表达式,这就有点 overkill 了。我更希望用 tab 缩进来指示层级。不知道现在的版本是不是仍然会忽略行首的 tab。

#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader


class Pdf(object):
    """A PDF whose outline (bookmarks) is rebuilt from scratch.

    The source document is appended to a fresh ``PdfWriter`` and its
    ``/Outlines`` entry is removed from the writer's root object. The result
    is saved next to the source file with ``_new`` inserted before the
    extension.
    """

    def __init__(self, path):
        """Load the PDF at ``path`` and discard its existing outline.

        Args:
            path: Filesystem path of the source PDF.
        """
        self.path = path
        # Pass the path rather than ``open(path, "rb")``: the previous code
        # created a file object that was never closed. Given a path, pypdf
        # takes care of opening and closing the file itself.
        reader = PdfReader(path, strict=False)
        self.writer = PdfWriter()
        self.writer.append(reader)
        # Remove the outline tree so only the new bookmarks remain.
        self.writer._root_object.pop("/Outlines", None)

    @property
    def _new_path(self):
        """Output path: the source path with ``_new`` before the extension."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item and return the reference pypdf gives back.

        Args:
            title: Text of the bookmark.
            pagenum: Page index forwarded unchanged to
                ``PdfWriter.add_outline_item``.
            parent: Value returned by an earlier call to nest under, or
                ``None`` for a top-level item.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the document to ``_new_path`` and return that path.

        Any existing file at the output path is removed first.
        """
        if os.path.exists(self._new_path):
            os.remove(self._new_path)
        with open(self._new_path, 'wb') as out:
            self.writer.write(out)
        return self._new_path

def _add_bookmark(pdf, index_dict):
    if not index_dict:
        return None
    m = max(index_dict.keys())
    parent_dict = {}  # {parent index:IndirectObject}
    for i in range(m+1):
        value = index_dict[i]
        inobject = pdf.add_bookmark(value['title'], 
                                    value['pagenum'] - 1, 
                                    parent_dict.get(value.get('parent')))
        parent_dict[i] = inobject

def add_bookmark(path, index_dict):
    """Write a copy of the PDF at ``path`` carrying the given bookmarks.

    Args:
        path: Path of the source PDF.
        index_dict: Bookmark entries, passed on to ``_add_bookmark``.

    Returns:
        Whatever ``Pdf.save_pdf`` returns for the written document.
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    output_path = document.save_pdf()
    return output_path

def toc_reader(path, gap):
    """Parse a tab-indented table-of-contents file into bookmark entries.

    Each non-blank line must look like ``<tabs><title><tab><page number>``.
    The number of leading tabs is the nesting depth; an entry at depth ``d``
    becomes a child of the closest preceding entry at depth ``d - 1``.
    Blank lines are skipped.

    Args:
        path: Path of the table-of-contents text file.
        gap: Offset (int or numeric string) added to every page number.

    Returns:
        dict mapping a running entry index (0, 1, 2, ...) to a dict with
        ``'title'``, ``'pagenum'`` (page number plus ``gap``) and, for
        nested entries, ``'parent'`` (the index of the parent entry).

    Raises:
        AssertionError: If a line does not match the expected format, if
            page numbers decrease, or if a line is indented more than one
            level deeper than the previous entry.
        ValueError: If ``gap`` cannot be converted to ``int``.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(\d+)$')
    offset = int(gap)
    tocdict = {}
    # ancestors[d] is the key of the entry currently open at depth d.
    ancestors = []
    lastpagenum = 0
    with open(path, 'r') as toc:
        for line, item in enumerate(toc):
            if not item.strip():
                # A stray empty line should not abort the whole run.
                continue
            content = pattern.search(item)
            # Raised explicitly (same type and message as before) so the
            # checks are still enforced when Python runs with -O.
            if content is None:
                raise AssertionError(f"line {line}:{item} line ill-formatted")
            indent, title, pagenum = content.group(1, 2, 3)
            depth = len(indent)
            pagenum = int(pagenum) + offset
            if pagenum < lastpagenum:
                raise AssertionError(f"line {line}:{item} pagenum wrong")
            if depth > len(ancestors):
                # Previously this either crashed with a bare IndexError or
                # silently attached the entry to a stale parent belonging
                # to an earlier branch.
                raise AssertionError(
                    f"line {line}:{item} indent skips a level")
            key = len(tocdict)
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                entry['parent'] = ancestors[depth - 1]
            tocdict[key] = entry
            # Close everything at this depth or deeper, then open this entry.
            del ancestors[depth:]
            ancestors.append(key)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Command line: <pdf file> <toc file> <page offset>.
    # Check the argument count up front so a wrong invocation prints a
    # usage line instead of an unpacking ValueError traceback.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} FILE.pdf TOC_FILE GAP")
    file, toc, gap = sys.argv[1:]
    index_dict = toc_reader(toc, gap)
    add_bookmark(file, index_dict)

使用 `pdfbookmark.py xxx.pdf toc 10` 来运行它:`toc` 是目录文件,每行的格式为“标题<tab>页码”,行首用 tab 缩进来表示层级;最后的 `10` 是页码偏移量,会加到目录中的每个页码上。

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions