qurator-spk · mikegerber · Apr 24, 2025 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/.gitignore b/.gitignore
@@ -25,6 +25,7 @@ dmypy.json
 
 # User-specific stuff
 .idea
+.*.swp
 
 # Build artifacts
 /build

diff --git a/README.md b/README.md
@@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:
 
-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
+
+The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
+directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
+in this case.
 
 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
@@ -114,15 +114,20 @@ def process(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     """Check OCR result against GT.
 
     The @click decorators change the signature of the decorated functions, so we keep
     this undecorated version and use Click on a wrapper.
     """
 
-    gt_text = extract(gt, textequiv_level=textequiv_level)
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_text = extract(
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    ocr_text = extract(
+        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
     gt_words: List[str] = list(words_normalized(gt_text))
     ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -195,6 +200,7 @@ def process_dir(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
@@ -209,6 +215,7 @@ def process_dir(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
         else:
             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -233,6 +240,11 @@ def process_dir(
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 @click.version_option()
 def main(
@@ -243,6 +255,7 @@ def main(
     metrics,
     differences,
     textequiv_level,
+    plain_encoding,
     progress,
 ):
     """
@@ -280,6 +293,7 @@ def main(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
     else:
         process(
@@ -290,6 +304,7 @@ def main(
             metrics=metrics,
             differences=differences,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
 
 

diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py
@@ -12,7 +12,12 @@
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(input_file, textequiv_level, plain_encoding):
     """
     Extract the text of the given INPUT_FILE.
 
@@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
     initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
+        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    ).text
     print(input_text)
 
 

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
@@ -1,5 +1,6 @@
 import itertools
 import os
+from typing import Callable, Iterator, List, Optional, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -12,6 +13,41 @@
 from .word_error_rate import word_error_rate_n, words_normalized
 
 
+def removesuffix(text, suffix):
+    """
+    Remove suffix from text.
+
+    Can be replaced with str.removesuffix when we only support Python >= 3.9.
+    """
+    if suffix and text.endswith(suffix):
+        return text[: -len(suffix)]
+    return text
+
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+
+def find_all_files(
+    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
+) -> Iterator[str]:
+    """
+    Find all files below dir_ (including subdirectories), yielding their paths.
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
 def all_equal(iterable):
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
@@ -25,15 +61,63 @@ def common_suffix(its):
     return reversed(common_prefix(reversed(it) for it in its))
 
 
-def removesuffix(text, suffix):
-    if suffix and text.endswith(suffix):
-        return text[: -len(suffix)]
-    return text
+def find_gt_and_ocr_files(
+    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
+) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
+
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn
+
+
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (or ".ocr.txt" for OCR files).
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(
+            f"Files in GT directory {gt_dir} do not have a common suffix"
+        )
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(
+            f"Files in OCR directory {ocr_dir} do not have a common suffix"
+        )
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
-    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
-    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
+def process(
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
+):
 
     cer = None
     n_characters = None
@@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_words = None
     word_diff_report = ""
 
-    for k, gt in enumerate(os.listdir(gt_dir)):
-        # Find a match by replacing the suffix
-        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+    if gt_suffix is not None and ocr_suffix is not None:
+        gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+    else:
+        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
-        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
+    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
         ocr_text = plain_extract(
-            os.path.join(ocr_dir, ocr), include_filename_in_id=True
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
         )
-        gt_words = words_normalized(gt_text)
-        ocr_words = words_normalized(ocr_text)
+        gt_words: List[str] = list(words_normalized(gt_text))
+        ocr_words: List[str] = list(words_normalized(ocr_text))
 
         # Compute CER
         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@@ -81,15 +169,15 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             joiner="",
             none="·",
             score_hint=score_hint(l_cer, l_n_characters),
-        )
+        )[0]
         word_diff_report += gen_diff_report(
             gt_words,
             ocr_words,
             css_prefix="l{0}-w".format(k),
             joiner=" ",
             none="⋯",
             score_hint=score_hint(l_wer, l_n_words),
-        )
+        )[0]
 
     env = Environment(
         loader=FileSystemLoader(
@@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 @click.option(
     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
-def main(gt, ocr, report_prefix, metrics):
+@click.option("--gt-suffix", help="Suffix of GT line text files")
+@click.option("--ocr-suffix", help="Suffix of OCR line text files")
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """
     Compare the GT line text directory against the OCR line text directory.
 
     This assumes that the GT line text directory contains textfiles with a common
     suffix like ".gt.txt", and the OCR line text directory contains textfiles with
     a common suffix like ".some-ocr.txt". The text files also need to be paired,
-    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
-    in the OCT lines directory.
+    i.e. the GT filename "line001.gt.txt" needs to match a filename
+    "line001.some-ocr.txt" in the OCR lines directory.
+
+    GT and OCR directories may contain line text files in matching subdirectories,
+    e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
 
-    The GT and OCR directories are usually round truth line texts and the results of
+    GT and OCR directories can also be the same directory, but in this case you need
+    to give --gt-suffix and --ocr-suffix explicitly.
+
+    The GT and OCR directories are usually ground truth line texts and the results of
     an OCR software, but you may use dinglehopper to compare two OCR results. In
     that case, use --no-metrics to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
@@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
 
+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
     """
     initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics)
+    process(
+        gt,
+        ocr,
+        report_prefix,
+        metrics=metrics,
+        gt_suffix=gt_suffix,
+        ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
+    )
 
 
 if __name__ == "__main__":
-Original file line number
+Diff line change
@@ Expand Up / @@ -25,6 +25,7 @@ dmypy.json @@
     # User-specific stuff
     .idea
+    .*.swp
     # Build artifacts
     /build
@@ Expand Down @@