# PDF_replace/main.py at main · Rendscode/PDF_replace · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

import os
import re
import csv
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import pytesseract
import io
import matplotlib.pyplot as plt

def extract_pages(
    article_directory: str,
    articlename_search_string: str,
    logfile: str,
    first_page_number: int = 16,
    tolerance: int = 4,
) -> None:
    """
    Collect printed page numbers from matching PDFs via OCR and log them to a CSV file.

    Every file in ``article_directory`` whose name ends in ``.pdf`` (case-insensitive)
    and contains ``articlename_search_string`` is rendered page by page with PyMuPDF.
    The bottom 10 % of each rendered page is split into three zones (left, middle,
    right); each zone is passed to pytesseract and scanned for 1-3 digit numbers.
    A number is kept only if it is plausible for the page, i.e. within ``tolerance``
    of ``page_index + first_page_number`` (``page_index`` is zero-based). The largest
    plausible number of each page is recorded.

    Side effects: prints per-zone OCR debug output, writes the cropped zones as
    ``debug_seite_<page>_<zone>.png`` into the current working directory, shows a
    matplotlib figure for the last page of every processed PDF, and writes the CSV.
    Errors while processing a single PDF or while writing the CSV are printed and
    not raised.

    :param article_directory: directory containing the PDF files
    :param articlename_search_string: substring that a file name must contain
    :param logfile: path of the CSV output file
    :param first_page_number: printed page number expected on the first PDF page
    :param tolerance: maximum allowed deviation from the expected page number
    """

    def is_plausible(candidate: str, page_index: int) -> bool:
        # Compare against the expected printed number for this page; the tolerance
        # keeps numbers on later pages from being rejected.
        try:
            return abs(int(candidate) - (page_index + first_page_number)) <= tolerance
        except (TypeError, ValueError):
            return False

    # Compiled once: a 1-3 digit number without leading zero, not preceded by a digit.
    number_pattern = re.compile(r"(?<!\d)([1-9][0-9]{0,2})\b")

    results = []

    # Sorted so that the CSV row order does not depend on the file system.
    for filename in sorted(os.listdir(article_directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        if articlename_search_string not in filename:
            continue

        filepath = os.path.join(article_directory, filename)

        try:
            # The context manager closes the document even if OCR fails midway.
            with fitz.open(filepath) as doc:
                page_count = len(doc)
                page_numbers_found = set()

                for i, page in enumerate(doc):
                    pix = page.get_pixmap(dpi=300)
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    width, height = img.size

                    # Three zones along the bottom edge: left, middle, right.
                    top = int(height * 0.90)
                    zones = {
                        "left":   (0, top, int(width * 0.33), height),
                        "middle": (int(width * 0.33), top, int(width * 0.66), height),
                        "right":  (int(width * 0.66), top, width, height),
                    }

                    debug_img = img.copy()
                    draw = ImageDraw.Draw(debug_img)
                    plausible_candidates = []

                    for zone_name, crop_area in zones.items():
                        draw.rectangle(crop_area, outline="red", width=2)
                        crop = img.crop(crop_area)
                        text = pytesseract.image_to_string(crop, config='--psm 11')
                        numbers = number_pattern.findall(text)
                        plausible = [n for n in numbers if is_plausible(n, i)]
                        plausible_candidates.extend(plausible)

                        # Debug output of the OCR result per zone.
                        print(f"Seite {i+1}, Zone {zone_name}: OCR-Zahlen = {numbers}, plausibel = {plausible}")

                        # Save the debug crop. NOTE: the file name does not include
                        # the PDF name, so crops of different PDFs overwrite each other.
                        crop.save(f"debug_seite_{i+1}_{zone_name}.png")

                    # Show the annotated zones for the last page only.
                    if i == page_count - 1:
                        plt.figure(figsize=(8, 10))
                        plt.title(f"OCR-Zonen auf Seite {i+1} von {filename}")
                        plt.imshow(debug_img)
                        plt.axis('off')
                        plt.show()

                    if plausible_candidates:
                        # Take the largest plausible page number of this page.
                        best_guess = max(map(int, plausible_candidates))
                        page_numbers_found.add(str(best_guess))

            results.append((filename, ", ".join(sorted(page_numbers_found, key=int))))

        except Exception as e:
            # Best effort: a broken PDF must not abort the remaining files.
            print(f"Fehler beim Verarbeiten von {filename}: {e}")

    # Write the CSV file.
    try:
        with open(logfile, mode="w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Dateiname", "Seitenzahlen"])
            writer.writerows(results)
        print(f"Ergebnisse in {logfile} gespeichert.")
    except Exception as e:
        print(f"Fehler beim Schreiben der CSV-Datei: {e}")


def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    greeting = f'Hi, {name}'
    print(greeting)


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

    # Example parameters for a local run.
    # Alternative search string: "c't 2024-05 Python Argpare"
    article_directory = "/home/hhhans/Dokumente/Kostbarkeiten"
    logfile = "/home/hhhans/Dokumente/Kostbarkeiten/artikelseiten.csv"
    articlename_search_string = "c't 2020-04 Social Media Vergessenwerden"

    # Scan the directory and write the detected page numbers to the CSV log.
    extract_pages(
        article_directory=article_directory,
        articlename_search_string=articlename_search_string,
        logfile=logfile,
    )

# See PyCharm help at https://www.jetbrains.com/help/pycharm/