Skip to content

Commit b336f98

Browse files
committed
🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, the trailing newline of each line was not removed when reading plain text files. Fix this by stripping each line as it is read, as suggested in the issue. Fixes gh-107.
1 parent 41a0fad commit b336f98

File tree

2 files changed, with 4 additions (+4) and 4 deletions (-4).

2 files changed

+4
-4
lines changed

src/dinglehopper/ocr_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
3636
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
3737
line_id = line.attrib.get("ID")
3838
line_text = " ".join(
39-
string.attrib.get("CONTENT")
39+
string.attrib.get("CONTENT", "")
4040
for string in line.iterfind("alto:String", namespaces=nsmap)
4141
)
4242
normalized_text = normalize_sbb(line_text)
@@ -167,7 +167,7 @@ def make_segment(no, line):
167167
with open(filename, "r", encoding=fileencoding) as f:
168168
return ExtractedText(
169169
None,
170-
[make_segment(no, line) for no, line in enumerate(f.readlines())],
170+
[make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
171171
"\n",
172172
None,
173173
None,

src/dinglehopper/tests/test_ocr_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,8 @@ def test_text():
177177
def test_plain(tmp_path):
178178
with working_directory(tmp_path):
179179
with open("ocr.txt", "w") as ocrf:
180-
ocrf.write("AAAAB")
180+
ocrf.write("First, a line.\nAnd a second line.\n")
181181

182182
result = plain_text("ocr.txt")
183-
expected = "AAAAB"
183+
expected = "First, a line.\nAnd a second line."
184184
assert result == expected

0 commit comments

Comments
 (0)