Skip to content

实际页码可能因为pdf中没包含空白页而需要中途校正,另附简化后无gui脚本 #42

@vincentaxhe

Description

@vincentaxhe

有很多书新章是在奇数页的,上一章最后页可能会成空白,但扫描后的pdf没有包含进空白页,就造成ocr目录后的页码加上个数字还是会有错位,错位还会变大,只有写入bookmark后尝试点击才会发现是否有错位。
解决方式也简单,就是在出现错位的地方加上一个校正数字,在校正值发生变化的地方再写上新的数字,这样不用去目录文件中更新很多页码,目录文件中的页码还能与实际目录中的页码对应。我在简化后的脚本中增加了该功能。

#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader, PageObject
from pypdf.generic import Destination

class Pdf(object):
    """Wrap one PDF file for reading its outline and writing a bookmarked copy.

    The file is read through a ``PdfReader``. A ``PdfWriter`` holding a copy
    of the document, with the original ``/Outlines`` entry removed, is built
    lazily the first time :attr:`writer` is accessed.
    """

    def __init__(self, path):
        """Open the PDF at *path* for reading.

        Args:
            path: Filesystem path of the source PDF. It is also used to
                derive the output path (see :meth:`save_pdf`).
        """
        self.path = path
        # Give pypdf the path instead of an ``open()`` file object: a file
        # object created here would never be closed, whereas pypdf opens and
        # closes the file itself when it is handed a path.
        self.reader = PdfReader(path, strict=False)
        # Maps each page's indirect-object id to its zero-based page number.
        self.pages_num = self._get_pages_num(self.reader.pages)
        self._writer = None

    @property
    def _new_path(self):
        """Return the output path: the source path with ``_new`` before the extension."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    @property
    def writer(self):
        """Return the lazily created ``PdfWriter`` holding a copy of the document."""
        if self._writer is None:
            writer = PdfWriter()
            self.copy_reader_to_writer(self.reader, writer)
            # Drop any outline that survived the copy so that new bookmarks
            # start from an empty outline tree.
            writer._root_object.pop("/Outlines", None)
            self._writer = writer
        return self._writer

    @staticmethod
    def copy_reader_to_writer(reader, writer):
        """Copy *reader*'s content into *writer*, falling back on failure.

        Three strategies are tried in order: a plain ``append`` without the
        outline, ``append`` that additionally excludes ``/Annots`` and ``/B``,
        and finally ``append_pages_from_reader``. An exception raised by the
        last strategy propagates to the caller.
        """
        try:
            writer.append(reader, import_outline=False)
        except Exception as e:
            print("Copy pdf failed, {}, try to exclude /Annots and /B".format(e))
            try:
                writer.append(reader, import_outline=False, excluded_fields=["/Annots", "/B"])
            except Exception as e:
                print("Copy pdf failed again, {}, try to use append_pages_from_reader".format(e))
                writer.append_pages_from_reader(reader)

    @staticmethod
    def _get_pages_num(pages):
        """Build a mapping from page indirect-object id to page number.

        Args:
            pages: Iterable of page objects.

        Returns:
            dict mapping ``page.indirect_ref.idnum`` to ``page.page_number``
            for every ``PageObject``. Other items, and pages that raise while
            being inspected, are reported on stdout and skipped.
        """
        pages_num = {}
        for page in pages:
            try:
                if isinstance(page, PageObject):
                    pages_num[page.indirect_ref.idnum] = page.page_number
                else:
                    print("Unknown page type {} for {}".format(type(page), page.page_number))
            except Exception as e:
                # Best effort: one unreadable page must not abort the scan.
                print(e)
        return pages_num

    def _outlines_to_bookmarks(self, outlines, current_level=0):
        """Flatten a nested outline into tab-indented ``title<TAB>page`` lines.

        Args:
            outlines: List whose items are ``Destination`` objects or nested
                lists holding the children of the preceding item.
            current_level: Nesting depth of *outlines*; each level adds one
                leading tab to the title.

        Returns:
            list of strings, one per resolvable destination, with one-based
            page numbers. Destinations that fail to resolve are reported on
            stdout and skipped.
        """
        index_list = []
        for o in outlines:
            if isinstance(o, Destination):
                try:
                    idnum = o.page if isinstance(o.page, int) else o.page.idnum
                    title = "\t" * current_level + o.title.strip()
                    # pages_num holds zero-based numbers; the text form is one-based.
                    page_num = self.pages_num[idnum] + 1
                    index_list.append("{title}\t{page_num}".format(title=title, page_num=page_num))
                except Exception as e:
                    print(e)
            elif isinstance(o, list):
                index_list += self._outlines_to_bookmarks(o, current_level + 1)
            else:
                print("Unknown outline type: {} in {}".format(type(o), o))
        return index_list

    def exist_bookmarks(self):
        """Return the source document's outline as a list of text lines."""
        return self._outlines_to_bookmarks(self.reader.outline)

    def add_bookmark(self, title, pagenum, parent=None):
        """Add an outline item to the writer and return the created item.

        Args:
            title: Bookmark text.
            pagenum: Page index handed to ``PdfWriter.add_outline_item``.
            parent: Item returned by an earlier call, or None for a
                top-level bookmark.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the document to :attr:`_new_path`, replacing any existing file.

        Returns:
            The path of the written file.
        """
        if os.path.exists(self._new_path):
            os.remove(self._new_path)
        with open(self._new_path, 'wb') as out:
            self.writer.write(out)
        return self._new_path

def _add_bookmark(pdf, index_dict):
    if not index_dict:
        return None
    m = max(index_dict.keys())
    parent_dict = {}
    max_page_num = len(pdf.writer.pages) - 1
    for i in range(m+1):
        value = index_dict[i]
        inobject = pdf.add_bookmark(value.get('title', ''),
                                    min(value.get('pagenum', 1) - 1, max_page_num),
                                    parent_dict.get(value.get('parent')))
        parent_dict[i] = inobject

def add_bookmark(path, index_dict):
    """Write the bookmarks in *index_dict* into a copy of the PDF at *path*.

    Returns whatever ``Pdf.save_pdf`` returns for the saved copy.
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    saved_path = document.save_pdf()
    return saved_path

def get_bookmarks(path):
    """Return the existing bookmarks of the PDF at *path* as text lines.

    A falsy *path* yields an empty list. Any exception raised while reading
    the PDF is reported on stdout and also yields an empty list.
    """
    if path:
        try:
            bookmarks = Pdf(path).exist_bookmarks()
        except Exception as err:
            print("Read pdf %s failed! %s" % (path, err))
        else:
            return bookmarks
    return []

def toc_reader(path, gap):
    """Parse a tab-indented TOC text file into a bookmark index dictionary.

    Every non-blank line consists of optional leading TAB characters (one per
    nesting level), a title, a TAB, a page number and, optionally, another TAB
    followed by a signed correction such as ``+2`` or ``-1``. A correction
    stays in effect for all following lines until another one replaces it.
    The resulting page number is ``page + gap + correction``.

    The file is opened in text mode with the platform default encoding.

    Args:
        path: Path of the TOC text file.
        gap: Constant offset added to every page number; anything ``int()``
            accepts (for example a command-line string).

    Returns:
        dict mapping a zero-based entry index to
        ``{'title': str, 'pagenum': int}``; nested entries additionally carry
        ``'parent'``, the index of the closest preceding entry one level up.

    Raises:
        AssertionError: If a line is ill-formatted, a page number is smaller
            than the previous one, or an indented line has no parent entry.
        ValueError: If *gap* cannot be converted to an integer.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(-?\d+)(?:\t([-+]\d+))?$')
    base_offset = int(gap)
    tocdict = {}
    levels = defaultdict(list)  # nesting depth -> indices of entries seen at that depth
    lastpagenum = 0
    fix = 0
    with open(path, 'r') as toc:
        for lineno, item in enumerate(toc, start=1):
            # Blank lines (a trailing newline at the end of the file is
            # common) carry no entry and must not abort the whole parse.
            if not item.strip():
                continue
            content = pattern.search(item)
            # Raised explicitly rather than via ``assert`` so the validation
            # still runs when Python is started with -O.
            if content is None:
                raise AssertionError(f"line {lineno}:{item} line ill-formatted")
            indent, title, pagenum, fixpagenum = content.group(1, 2, 3, 4)
            if fixpagenum:
                fix = int(fixpagenum)
            pagenum = int(pagenum) + base_offset + fix
            if pagenum < lastpagenum:
                raise AssertionError(f"line {lineno}:{item} pagenum wrong")
            depth = len(indent)
            # Index entries by a running counter instead of the file line
            # number so the keys stay contiguous when blank lines are skipped.
            key = len(tocdict)
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                candidates = levels.get(depth - 1)
                if not candidates:
                    raise AssertionError(f"line {lineno}:{item} has no parent entry")
                entry['parent'] = candidates[-1]
            tocdict[key] = entry
            levels[depth].append(key)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Command-line entry point.
    #   <script> book.pdf              print the existing outline, one entry per line
    #   <script> book.pdf tocfile gap  write the TOC into a "_new" copy of book.pdf
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        for item in get_bookmarks(cli_args[0]):
            print(item)
    elif len(cli_args) == 3:
        file, toc, gap = cli_args
        tocdict = toc_reader(toc, gap)
        add_bookmark(file, tocdict)
    else:
        # Any other argument count used to do nothing and exit successfully;
        # tell the user how to call the script and signal the error instead.
        prog = os.path.basename(sys.argv[0])
        print("usage: {0} FILE.pdf              print existing bookmarks\n"
              "       {0} FILE.pdf TOCFILE GAP  write bookmarks into a new pdf".format(prog),
              file=sys.stderr)
        sys.exit(2)

根据不是很近的api更新的,页码严格增加。
pdfbookmark.py xxx.pdf 输出toc
pdfbookmark.py xxx.pdf tocfile 10 写入toc,生成新pdf
我自己觉得用脚本更方便,不敢藏私,借用了api才很好用。

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions