-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtextparser.lua
More file actions
662 lines (582 loc) · 21.7 KB
/
textparser.lua
File metadata and controls
662 lines (582 loc) · 21.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
--[[--
Text Parser Module
Splits text into words and sentences with position tracking.
@module textparser
--]]
local logger = require("logger")
-- Shared utility modules (DRY: countSyllables)
local _utils_dir = debug.getinfo(1, "S").source:match("^@(.*/)[^/]*$") or "./"
local Utils = dofile(_utils_dir .. "utils.lua")
local TextParser = {
-- Sentence ending punctuation
SENTENCE_ENDINGS = "[%.%?!]",
-- Word separators
WORD_SEPARATORS = "[%s%p]",
}
-- Long-sentence splitting thresholds (see benchmark/RESULTS_LONG.md).
-- Piper on ARM OOMs above ~900 chars; 300 keeps us in the efficient window.
-- Chunks below 80 chars waste 90%+ of synthesis time on per-request overhead.
local MAX_CHUNK_CHARS = 300
local MIN_CHUNK_CHARS = 80
--[[--
Construct a TextParser instance (prototype pattern).
@param o table|nil Optional pre-populated table to promote to an instance
@return table The new instance
--]]
function TextParser:new(o)
    local instance = o or {}
    -- Route missing-key lookups back to the prototype.
    self.__index = self
    return setmetatable(instance, self)
end
--[[--
Parse text into structured data with words and sentences.
@param text string The input text to parse
@return table Parsed structure with sentences and words
--]]
function TextParser:parse(text)
    -- Empty or missing input yields an empty-but-well-formed structure.
    if not text or text == "" then
        return { sentences = {}, words = {}, text = "" }
    end
    -- Clean up whitespace/control characters before any splitting.
    local normalized = self:normalizeText(text)
    local parsed = {
        text = normalized,
        sentences = self:parseSentences(normalized),
        words = self:parseWords(normalized),
    }
    -- Attach each word to the sentence whose span contains it.
    self:linkWordsToSentences(parsed)
    logger.dbg("TextParser: Parsed", #parsed.sentences, "sentences,", #parsed.words, "words")
    return parsed
end
--[[--
Normalize text by cleaning up whitespace and special characters.
NOTE(review): the order of the gsub passes below is significant —
control characters are stripped BEFORE the multi-byte sentinel is
introduced (so the sentinel bytes can never pre-exist in the input),
and hyphen re-joining runs BEFORE single newlines are collapsed
into spaces.
@param text string Input text
@return string Normalized text
--]]
function TextParser:normalizeText(text)
-- Normalize line endings to \n
text = text:gsub("\r\n", "\n")
text = text:gsub("\r", "\n")
-- Issue #15: PocketBook PDF/EPUB text extractors sometimes emit control
-- characters (NUL, SOH, STX, etc.) between words or characters. Left in
-- place they can split words, create phantom paragraphs, or collide with
-- the paragraph sentinel we use below. Strip the whole C0 control range
-- except the whitespace we actually need (\t, \n).
text = text:gsub("[%z\x01-\x08\x0b\x0c\x0e-\x1f]", "")
-- Issue #21: PDFs (and PDF-derived EPUBs) wrap each visual line with a
-- literal newline. parseSentences treats every newline as a paragraph
-- break, which inserts a TTS pause at every line wrap and synthesises
-- hyphen-split words ("re-" + "duce") as two utterances.
--
-- We preserve real paragraph breaks (blank lines) with a sentinel, turn
-- remaining single newlines into spaces, then restore the sentinels.
-- v0.1.5.80 used a single NUL byte ("\0") as the sentinel; v0.1.5.81
-- stripped NULs from the input but PB632/PB700c still reported char-by-char
-- reading, so we now use a multi-byte sentinel that can never collide.
-- (The \x01-\x03 bytes were removed by the C0 strip above, so this
-- three-byte sequence is guaranteed absent from the input.)
local PARA_SENTINEL = "\x01\x02\x03"
text = text:gsub("\n[ \t]*\n+", PARA_SENTINEL)
-- Re-join words hyphenated across a line break ("re-\nduce" -> "reduce").
text = text:gsub("([^%s%p])%-\n([^%s%p])", "%1%2")
text = text:gsub("\n", " ")
-- Restore real paragraph breaks from the sentinel.
text = text:gsub(PARA_SENTINEL, "\n")
-- Replace runs of spaces/tabs (but NOT newlines) with single space
text = text:gsub("[ \t]+", " ")
-- Trim leading/trailing whitespace
text = text:match("^%s*(.-)%s*$")
-- Defensive guard: if the extractor emits paragraph breaks between every
-- character or word, parseSentences will create one-sentence-per-character
-- and TTS reads the page one letter at a time. Detect an abnormally high
-- proportion of very short lines and join them into a single paragraph.
local line_count = 0
local short_line_count = 0
for line in (text .. "\n"):gmatch("([^\n]*)\n") do
if line ~= "" then
line_count = line_count + 1
-- "Very short" = two characters or fewer on the line.
if #line <= 2 then
short_line_count = short_line_count + 1
end
end
end
-- More than half of 6+ lines being tiny: treat the whole page as one
-- paragraph and re-collapse the whitespace we just introduced.
if line_count > 5 and short_line_count / line_count > 0.5 then
text = text:gsub("\n", " ")
text = text:gsub("[ \t]+", " ")
end
return text
end
--[[--
Parse text into sentences.
Three passes: split on newlines (paragraphs), split each line on
sentence-ending punctuation, then split any sentence longer than
MAX_CHUNK_CHARS for TTS safety, and finally recompute character
positions against the original text.
@param text string Input text
@return table Array of sentence objects
--]]
function TextParser:parseSentences(text)
local sentences = {}
local sentence_index = 1
-- Helper: add a sentence if non-empty
-- @param s string Trimmed sentence text
-- @param end_type string "paragraph" = last segment before a newline,
-- "sentence" = split by .?! mid-line (";" and ":" are deliberately
-- NOT sentence endings — see the note in the loop below)
local function addSentence(s, end_type)
s = s:match("^%s*(.-)%s*$") -- trim
if s and s ~= "" then
table.insert(sentences, {
index = sentence_index,
text = s,
start_pos = 0,
end_pos = 0,
words = {},
end_type = end_type or "sentence",
})
sentence_index = sentence_index + 1
end
end
-- Step 1: split on newlines (each line is at least one sentence)
for line in (text .. "\n"):gmatch("([^\n]+)\n") do
line = line:match("^%s*(.-)%s*$")
if line and line ~= "" then
-- Step 2: split each line on sentence-ending punctuation (.?!)
-- followed by a space or end-of-string.
-- NOTE: semicolons (;) and colons (:) are NOT treated as sentence
-- endings — they are mid-sentence punctuation that should not
-- interrupt the reading flow.
local pos = 1
local segments_in_line = {}
while pos <= #line do
-- Find .?! that is followed by a space (or is at end of line)
local pstart, pend = line:find("[%.%?!]+%s", pos)
if not pstart then
-- Check for .?! at very end of line (no trailing space)
pstart, pend = line:find("[%.%?!]+$", pos)
end
if pstart then
-- Include the punctuation but not the trailing space
local seg_end = pend
-- If the match ended with a space, don't include the space
if line:sub(pend, pend):match("%s") then
seg_end = pend - 1
end
table.insert(segments_in_line, line:sub(pos, seg_end))
pos = seg_end + 1
-- Skip whitespace
while pos <= #line and line:sub(pos, pos):match("%s") do
pos = pos + 1
end
else
-- No more sentence-ending punctuation: rest is one segment
table.insert(segments_in_line, line:sub(pos))
break
end
end
-- Tag: last segment in line → "paragraph", others → "sentence"
for i, seg in ipairs(segments_in_line) do
local etype = (i == #segments_in_line) and "paragraph" or "sentence"
addSentence(seg, etype)
end
end
end
-- Step 3: split long sentences for TTS safety.
-- Piper on ARM OOMs above ~900 chars; split anything over MAX_CHUNK_CHARS
-- at clause boundaries, then cap at word boundaries if still too long.
local expanded = {}
local new_index = 1
for _, sentence in ipairs(sentences) do
if #sentence.text > MAX_CHUNK_CHARS then
local chunks = self:splitLongSentence(sentence.text)
for j, chunk in ipairs(chunks) do
-- Only the final chunk inherits the original end_type, so the
-- paragraph pause (if any) stays at the true paragraph end.
local etype = (j == #chunks) and sentence.end_type or "sentence"
table.insert(expanded, {
index = new_index,
text = chunk,
start_pos = 0,
end_pos = 0,
words = {},
end_type = etype,
})
new_index = new_index + 1
end
else
sentence.index = new_index
table.insert(expanded, sentence)
new_index = new_index + 1
end
end
sentences = expanded
-- Recalculate start/end positions relative to original text.
-- Plain (non-pattern) find, advancing search_from so repeated sentences
-- map to successive occurrences. If a chunk cannot be located (should
-- not happen after normalization) its positions remain 0 and no words
-- will be linked to it.
local search_from = 1
for _, sentence in ipairs(sentences) do
local found = text:find(sentence.text, search_from, true) -- plain search
if found then
sentence.start_pos = found
sentence.end_pos = found + #sentence.text - 1
search_from = sentence.end_pos + 1
end
end
return sentences
end
--[[--
Split a long sentence into clause-aware chunks for Piper TTS.
Piper on ARM has a hard ceiling at ~900 chars (OOM above ~1000). Even below
that, throughput is best with 100-300 char chunks. This function:
1. Splits at natural clause boundaries (; : " - " and ", <conjunction>")
2. Merges tiny fragments (< MIN_CHUNK_CHARS) with their neighbours
3. Re-splits anything still over max_chars at word boundaries
See benchmark/RESULTS_LONG.md for the data behind these thresholds.
@param text string The sentence text to split
@param max_chars number Maximum chunk size (default MAX_CHUNK_CHARS)
@return table Array of chunk strings
--]]
function TextParser:splitLongSentence(text, max_chars)
    local limit = max_chars or MAX_CHUNK_CHARS
    -- Already short enough: return as a single chunk.
    if #text <= limit then
        return { text }
    end
    -- Clause-boundary split followed by small-fragment merging.
    local merged = self:_mergeSmallChunks(self:_splitAtClauses(text), MIN_CHUNK_CHARS)
    -- Anything still over the limit gets a hard word-boundary split.
    local out = {}
    for _, piece in ipairs(merged) do
        if #piece > limit then
            for _, sub_piece in ipairs(self:_splitAtWordBoundary(piece, limit)) do
                out[#out + 1] = sub_piece
            end
        else
            out[#out + 1] = piece
        end
    end
    return out
end
--[[--
Split text at clause boundaries.
Recognised boundaries (kept at the end of the preceding chunk):
- semicolons: "; "
- colons: ": "
- dashes: " - "
- conjunctions: ", and/but/or/nor/for/yet/so/which/who/that/where/when/
  while/although/because/since/unless/if/after/before"
Implementation note: the previous version accumulated the current chunk
one character at a time ("current = current .. ch"), which is O(n^2) in
Lua because strings are immutable. We now track the start index of the
current chunk and slice it out with string.sub when a boundary is hit —
same output, linear time.
@param text string
@return table Array of trimmed non-empty strings
--]]
function TextParser:_splitAtClauses(text)
    local conjunctions = {
        "and", "but", "or", "nor", "for", "yet", "so",
        "which", "who", "that", "where", "when", "while",
        "although", "because", "since", "unless", "if",
        "after", "before",
    }
    local chunks = {}
    local chunk_start = 1 -- first char of the chunk being scanned
    local pos = 1
    local len = #text
    while pos <= len do
        local ch = text:sub(pos, pos)
        if (ch == ";" or ch == ":") and text:sub(pos + 1, pos + 1) == " " then
            -- "; " or ": " — keep the punctuation, skip the trailing space.
            table.insert(chunks, text:sub(chunk_start, pos))
            pos = pos + 2
            chunk_start = pos
        elseif text:sub(pos, pos + 2) == " - " then
            -- " - " — keep " -" at the end of the chunk, skip the trailing space.
            table.insert(chunks, text:sub(chunk_start, pos + 1))
            pos = pos + 3
            chunk_start = pos
        elseif ch == "," and text:sub(pos + 1, pos + 1) == " " then
            -- ", <conjunction>" — keep the comma; the conjunction starts
            -- the next chunk.
            local rest = text:sub(pos + 2)
            local found_conj = false
            for _, conj in ipairs(conjunctions) do
                if rest:find("^" .. conj .. "%s") or rest:find("^" .. conj .. "$") then
                    found_conj = true
                    break
                end
            end
            if found_conj then
                table.insert(chunks, text:sub(chunk_start, pos))
                pos = pos + 2 -- skip ", "
                chunk_start = pos
            else
                pos = pos + 1
            end
        else
            pos = pos + 1
        end
    end
    -- Flush the trailing chunk, if any text remains after the last boundary.
    if chunk_start <= len then
        table.insert(chunks, text:sub(chunk_start))
    end
    -- Trim and drop empties
    local result = {}
    for _, chunk in ipairs(chunks) do
        chunk = chunk:match("^%s*(.-)%s*$")
        if chunk and chunk ~= "" then
            table.insert(result, chunk)
        end
    end
    return result
end
--[[--
Merge chunks shorter than min_chars with a neighbour.
Prefers merging with the previous chunk (so we build up the leading chunk).
Falls back to merging forward when there is no previous chunk.
Note: the previous version had an elseif branch whose body was identical
to the else branch (both just inserted the chunk); the two have been
collapsed into one — behavior is unchanged.
@param chunks table Array of chunk strings
@param min_chars number Minimum acceptable chunk length
@return table Merged array
--]]
function TextParser:_mergeSmallChunks(chunks, min_chars)
    -- Zero or one chunk: nothing to merge.
    if #chunks <= 1 then return chunks end
    local merged = {}
    for _, chunk in ipairs(chunks) do
        if #chunk < min_chars and #merged > 0 then
            -- Too small: fold into the previous chunk.
            merged[#merged] = merged[#merged] .. " " .. chunk
        else
            -- Normal-sized chunk, or a tiny leading chunk (handled by the
            -- forward-merge pass below).
            table.insert(merged, chunk)
        end
    end
    -- Second pass: if the first chunk is still too small, merge it forward
    if #merged > 1 and #merged[1] < min_chars then
        merged[2] = merged[1] .. " " .. merged[2]
        table.remove(merged, 1)
    end
    return merged
end
--[[--
Split text at word boundaries so every chunk is <= max_chars.
Finds the last space at or before the limit and splits there.
Falls back to a hard cut when a single word exceeds max_chars.
BUGFIX: the old hard-cut path emitted remaining:sub(1, max_chars - 1)
and then resumed at max_chars + 1, silently dropping the character at
position max_chars on every hard cut. The hard cut now keeps all
characters: chunk is sub(1, max_chars), remainder starts at max_chars + 1.
@param text string
@param max_chars number
@return table Array of chunk strings
--]]
function TextParser:_splitAtWordBoundary(text, max_chars)
    local chunks = {}
    local remaining = text
    while #remaining > max_chars do
        -- Walk backwards from the limit to find a space.
        local split_pos = max_chars
        while split_pos > 0 and remaining:sub(split_pos, split_pos) ~= " " do
            split_pos = split_pos - 1
        end
        if split_pos == 0 then
            -- No space found - hard cut (extremely unlikely with natural
            -- text). Keep every character: no space to drop here.
            table.insert(chunks, remaining:sub(1, max_chars))
            remaining = remaining:sub(max_chars + 1)
        else
            -- Split at the space, excluding it from both chunks.
            table.insert(chunks, remaining:sub(1, split_pos - 1))
            remaining = remaining:sub(split_pos + 1) -- skip the space
        end
    end
    if #remaining > 0 then
        -- If the trailing fragment is shorter than MIN_CHUNK_CHARS, merge it
        -- back into the previous chunk. Slightly exceeding max_chars is far
        -- cheaper than the ~4-5 s fixed overhead of a tiny extra request.
        if #remaining < MIN_CHUNK_CHARS and #chunks > 0 then
            chunks[#chunks] = chunks[#chunks] .. " " .. remaining
        else
            table.insert(chunks, remaining)
        end
    end
    return chunks
end
--[[--
Parse text into words with positions.
Each maximal run of non-whitespace characters is one token. Tokens whose
punctuation-stripped form is empty (pure punctuation) are skipped.
Implementation note: the previous version scanned character by character
with repeated text:sub(pos, pos):match(...) calls, allocating a one-char
string per character. string.find("%S+", pos) locates the same tokens
directly and returns their positions — same output, far less garbage.
@param text string Input text
@return table Array of word objects
--]]
function TextParser:parseWords(text)
    local words = {}
    local word_index = 1
    local pos = 1
    while true do
        -- Next maximal non-whitespace run starting at or after pos.
        local word_start, word_end = text:find("%S+", pos)
        if not word_start then
            break
        end
        local word_text = text:sub(word_start, word_end)
        -- Strip punctuation for clean word (but keep position of full token)
        local clean_word = word_text:gsub("^[%p]*", ""):gsub("[%p]*$", "")
        if clean_word ~= "" then
            table.insert(words, {
                index = word_index,
                text = word_text, -- Original with punctuation
                clean_text = clean_word, -- Without punctuation
                start_pos = word_start,
                end_pos = word_end,
                sentence_index = nil, -- Will be set later
                duration = nil, -- Will be set by TTS timing
                start_time = nil, -- Will be set by TTS timing
                end_time = nil, -- Will be set by TTS timing
            })
            word_index = word_index + 1
        end
        pos = word_end + 1
    end
    return words
end
--[[--
Link words to their containing sentences.
A word belongs to the first sentence whose [start_pos, end_pos] span
fully contains the word's span. Sentences with unresolved positions
(start_pos == 0) never match.
@param parsed_data table The parsed data structure
--]]
function TextParser:linkWordsToSentences(parsed_data)
    local sentences = parsed_data.sentences
    for _, word in ipairs(parsed_data.words) do
        for _, sentence in ipairs(sentences) do
            local contained = word.start_pos >= sentence.start_pos
                and word.end_pos <= sentence.end_pos
            if contained then
                word.sentence_index = sentence.index
                sentence.words[#sentence.words + 1] = word
                break
            end
        end
    end
end
--[[--
Get word at specific character position.
@param parsed_data table The parsed data structure
@param position number Character position in text
@return table|nil Word object or nil when no word covers the position
--]]
function TextParser:getWordAtPosition(parsed_data, position)
    for _, word in ipairs(parsed_data.words) do
        -- Inclusive on both ends of the word's span.
        local covers = position >= word.start_pos and position <= word.end_pos
        if covers then
            return word
        end
    end
    return nil
end
--[[--
Get sentence at specific character position.
@param parsed_data table The parsed data structure
@param position number Character position in text
@return table|nil Sentence object or nil when no sentence covers the position
--]]
function TextParser:getSentenceAtPosition(parsed_data, position)
    for _, sentence in ipairs(parsed_data.sentences) do
        -- Inclusive on both ends of the sentence's span.
        local covers = position >= sentence.start_pos and position <= sentence.end_pos
        if covers then
            return sentence
        end
    end
    return nil
end
--[[--
Get word by index.
@param parsed_data table The parsed data structure
@param index number Word index (1-based)
@return table|nil Word object, or nil when the index is out of range
--]]
function TextParser:getWordByIndex(parsed_data, index)
    local words = parsed_data.words
    return words[index]
end
--[[--
Get sentence by index.
@param parsed_data table The parsed data structure
@param index number Sentence index (1-based)
@return table|nil Sentence object, or nil when the index is out of range
--]]
function TextParser:getSentenceByIndex(parsed_data, index)
    local sentences = parsed_data.sentences
    return sentences[index]
end
--[[--
Estimate word timing based on syllable count and speech rate.
Robustness: the old "rate = rate or 1.0" only defended against nil;
a zero or negative rate slipped through and produced a division by
zero (inf/nan duration). Non-positive rates now fall back to 1.0.
@param word table Word object (clean_text field is used)
@param rate number Speech rate multiplier (defaults to 1.0; must be > 0)
@return number Estimated duration in milliseconds
--]]
function TextParser:estimateWordDuration(word, rate)
    -- Guard against nil, zero, and negative rates.
    if not rate or rate <= 0 then
        rate = 1.0
    end
    local syllables = self:countSyllables(word.clean_text)
    -- Average syllable duration is about 200ms at normal rate
    local base_duration = syllables * 200
    return math.floor(base_duration / rate)
end
--[[--
Count syllables in a word (simple heuristic).
Delegates to the shared Utils module so the heuristic lives in exactly
one place (DRY with other modules that need syllable counts).
@param word string The word to analyze
@return number Estimated syllable count
--]]
function TextParser:countSyllables(word)
return Utils.countSyllables(word)
end
--[[--
Apply timing information to parsed words.
Pairs words with timing entries one-to-one until either list runs out;
words beyond the timing data are left untimed. Falls back to estimated
timing when no timing data is supplied.
@param parsed_data table The parsed data structure
@param timing_data table Array of timing info from TTS engine
--]]
function TextParser:applyTimingData(parsed_data, timing_data)
    if not timing_data or #timing_data == 0 then
        logger.dbg("TextParser: No timing data provided, using estimates")
        self:applyEstimatedTiming(parsed_data)
        return
    end
    -- One timing entry per word, stopping at the shorter of the two lists.
    local count = math.min(#parsed_data.words, #timing_data)
    for i = 1, count do
        local word = parsed_data.words[i]
        local timing = timing_data[i]
        word.start_time = timing.start_time
        word.end_time = timing.end_time
        word.duration = timing.end_time - timing.start_time
    end
    logger.dbg("TextParser: Applied timing data to", count, "words")
end
--[[--
Apply estimated timing when real timing is not available.
Assigns each word a syllable-based duration and lays words out on a
running clock with a 50ms gap between consecutive words.
@param parsed_data table The parsed data structure
@param rate number Speech rate (default 1.0)
--]]
function TextParser:applyEstimatedTiming(parsed_data, rate)
    rate = rate or 1.0
    local clock_ms = 0
    for _, word in ipairs(parsed_data.words) do
        local duration = self:estimateWordDuration(word, rate)
        word.start_time = clock_ms
        word.end_time = clock_ms + duration
        word.duration = duration
        -- Advance past the word plus a 50ms inter-word gap.
        clock_ms = clock_ms + duration + 50
    end
    logger.dbg("TextParser: Applied estimated timing, total duration:", clock_ms, "ms")
end
--[[--
Get the word that should be highlighted at a given time.
Interval check is half-open: [start_time, end_time). Words without
timing information are skipped.
@param parsed_data table The parsed data structure
@param time_ms number Current playback time in milliseconds
@return table|nil Word object or nil
--]]
function TextParser:getWordAtTime(parsed_data, time_ms)
    for _, word in ipairs(parsed_data.words) do
        local timed = word.start_time and word.end_time
        if timed and time_ms >= word.start_time and time_ms < word.end_time then
            return word
        end
    end
    return nil
end
--[[--
Get the sentence that should be highlighted at a given time.
Resolves the word at time_ms first, then maps it to its sentence.
@param parsed_data table The parsed data structure
@param time_ms number Current playback time in milliseconds
@return table|nil Sentence object or nil
--]]
function TextParser:getSentenceAtTime(parsed_data, time_ms)
    local word = self:getWordAtTime(parsed_data, time_ms)
    -- No word playing, or the word was never linked to a sentence.
    if not word or not word.sentence_index then
        return nil
    end
    return self:getSentenceByIndex(parsed_data, word.sentence_index)
end
return TextParser