-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathentry_processor.py
More file actions
742 lines (688 loc) · 47.1 KB
/
entry_processor.py
File metadata and controls
742 lines (688 loc) · 47.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
import re
import html as html_module
class EntryProcessor:
"""Encapsulates all HTML cleaning and processing for a single dictionary entry."""
def __init__(self, html: str, headword: str) -> None:
    """Store the raw entry HTML and its headword for later processing.

    Args:
        html: Raw dictionary-entry HTML fragment to be cleaned by process().
        headword: The entry's headword; used by headword-specific fixes
            (e.g. the abbreviation-dot handling in process()).
    """
    self.html = html
    self.headword = headword
@staticmethod
def _process_pos_forms_section(html: str) -> str:
    """Finds a 'forms' section demarcated by <span class="pos"> markers
    and wraps any unstyled sense/subsense blockquotes within that range.

    Parameters
    ----------
    html : str
        Entry HTML fragment that already contains ``<span class="pos">``
        markers (caller checks for ``class="pos"`` before invoking).

    Returns
    -------
    str
        The HTML with qualifying blockquotes wrapped in
        ``<div class="forms">``, or the input unchanged when the expected
        two-marker structure is not present.
    """
    from bs4 import BeautifulSoup, Tag, FeatureNotFound # Fix macOS fork-safety deadlock by lazy-loading BeautifulSoup imports
    try:
        soup = BeautifulSoup(html, 'lxml')
    except FeatureNotFound:
        # lxml not installed: fall back to the stdlib html.parser.
        soup = BeautifulSoup(html, 'html.parser')
    # Only the first two pos markers matter: they bound the candidate range.
    pos_spans = soup.find_all('span', class_='pos', limit=2)
    if len(pos_spans) < 2:
        return html
    start_node = pos_spans[0].find_parent('blockquote')
    # Check the trigger *before* doing more work.
    if start_node is None or 'forms' not in start_node.get_text(strip=True).lower():
        return html
    end_node = pos_spans[1].find_parent('blockquote')
    # Guard against invalid start/end nodes to prevent uncontrolled loops.
    if not end_node or start_node is end_node:
        return html
    # Collect all target nodes in a separate list before modifying the document,
    # we want to ensure we only wrap sections with actual forms in them.
    targets_to_wrap = []
    current_node = start_node
    while current_node and current_node != end_node:
        # Candidates: class-less sibling <blockquote>s that contain
        # sense/subsense markers and are not already inside a forms wrapper.
        if (isinstance(current_node, Tag) and
            current_node.name == 'blockquote' and not current_node.has_attr('class') and
            current_node.find('span', class_=['senses', 'subsenses'])):
            if not current_node.find_parent(class_='forms'):
                targets_to_wrap.append(current_node)
        current_node = current_node.find_next_sibling()
    # Safely iterate over the collected list to modify the soup object.
    for blockquote_node in targets_to_wrap:
        all_b_tags = blockquote_node.find_all('b')
        # Check for a 'content' <b> tag (one without a nested <span>).
        for b_tag in all_b_tags:
            if not b_tag.find('span'):
                # If found, wrap the blockquote and stop checking this node.
                wrapper_div = soup.new_tag('div', attrs={'class': 'forms'})
                blockquote_node.wrap(wrapper_div)
                break
    # BeautifulSoup with lxml automatically adds <html> and <body> tags to partial fragments.
    # We must strip these out to return just the modified HTML content. Otherwise processing_worker
    # will have a hard time detecting proper headwords (e.g., entry 'it')
    return str(soup).replace('<html><body>', '').replace('</body></html>', '')
def process(self) -> str:
"""Runs the full suite of cleaning and formatting operations on the HTML."""
html = self.html
html = re.sub(r'<img[^>]+>', '', html)
html = re.sub(r'\\t\\n', ' ', html)
html = re.sub(r'\\n', ' ', html)
html = re.sub(r'\\t', ' ', html)
html = re.sub(r'(</b>|/)(\[)', r'\1 \2', html)
if self.headword.endswith('.') and self.headword != 'No.':
# Escape the headword so dots are treated as literal dots, not regex wildcards
pattern = rf'<abr>({re.escape(self.headword)})</abr>'
html = re.sub(pattern, r'\1', html, count=1)
html = re.sub(r'(<span>[IVXL]+\.</span></span></b>)\s*(<blockquote>)?(<b>.*?</b>)(</blockquote>)?', r'\1 <span class="headword">\3</span>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>\(<span style="color:#2F4F4F">(.*?)</span>\)</blockquote>', r'(<span class="phonetic">\1</span>)', html, flags=re.DOTALL)
html = re.sub(r'<span style="color:#2F4F4F">(.*?)</span>', r'<span class="phonetic">\1</span>', html, flags=re.DOTALL)
html = html.replace('<blockquote><ex>', '<div class="quotations">')
html = html.replace('</ex></blockquote>', '</div>')
# Fix for cases where etymology bracket appears at the end of a quotation block (e.g. entry 'jefe')
# We must close the quotations div explicitly here so the bracket remains part of the outer etymology structure
html = html.replace('</ex>]</blockquote>', '</div>]</blockquote>')
html = re.sub(r'(<abr>†</abr>)\s', r'\1', html)
html = re.sub(r'(<abr>¶</abr>)\s', r'\1', html)
html = re.sub(r'(<abr>‖</abr>)\s', r'\1', html)
html = re.sub(r'<kref>(.*?)</kref>', r'<span class="kref">\1</span>', html)
html = html.replace('<abr>=</abr> (', '<abr>=</abr> (')
html = html.replace('</sub> (', '</sub> (')
html = html.replace('<abr>=</abr>', '<span class="same-as">=</span>')
html = re.sub(r'(<dtrn>(.*?)</dtrn>)\s*<dtrn>(.*?)</dtrn>', r' <br/>\1', html)
html = re.sub(r'<b>(<i>(?:Affix|Derivatives|Compounds)\.</i>)</b>', r'\1', html)
# This is a liberty I've taken, which will capture some false positives (relative to the original OED text, see entry "them" section II. 4),
# but as it is a very common pattern, it will be useful to have it regardless.
html = re.sub(r'(<span class="same-as">=</span>)\s+([a-zA-Z]+)', r'\1 <span class="kref">\2</span>', html)
# This should separate quotations blocks in cases like "weak" where there are continuous quotations for different subsenses
# or cases like "which" sense 14, subsense a. where there are sub-subsenses (a) and (b).
# or when greek letters are involved, see "fantastic". Finally combine all other blocks into one.
html = re.sub(r'(</div>)(<div class="quotations">)(<b>[a-z]\.</b>)', r'\1 \2\3', html)
html = re.sub(r'(</div>)(<div class="quotations">)(<i>\([a-z]\)</i>)', r'\1 \2\3', html)
html = re.sub(r'(</div>)(<div class="quotations">)(<i><abr>[a-zA-Z]+\.</abr></i>)', r'\1 \2\3', html) # weak 2.a
html = re.sub(r'(</div>)(<div class="quotations">)(<i>[a-zA-Z]+\.?(?:[-\s][a-zA-Z]+\.)?</i>)', r'\1 \2\3', html) # weak 5.a
html = re.sub(r'(</div>)(<div class="quotations">)([\u03b1-\u03c9](?:<sup>[0-9]</sup>)? ?\[?<[bi]>)', r'\1 \2\3', html) # greek letters
html = re.sub(r'(</div>)(<div class="quotations">)(<b>)', r'\1\2 \3', html)
html = html.replace('</div><div class="quotations">', '')
html = re.sub(r'(<b>)<span style="color:#8B008B">▪ <span>([IVXL]+)\.</span></span>(</b>)', r'\1<sup>\2</sup>\3', html)
# Fix dates, only match exactly 3 or 4 digit years. This should turn "c 1500" into "c1500" or "? a 1300" into "?a1300".
html = re.sub(r'<b>(\?)?\s?<i>([acp])</i> (\d{3,4})(\u2013\d{2})?</b>', r'<b>\1<i>\2</i>\3\4</b>', html)
html = re.sub(r'<b>(\?)?\s?(\d{3,4})(\u2013\d{2})?</b>', r'<b>\1\2\3</b>', html)
html = re.sub(r'<b>(\?)?(\d{3,4})(\u2013\d{2})?</b>([^\s])', r'<b>\1\2\3</b> \4', html)
# Handle anonymous "in Source" patterns first, we add a placeholder which will be removed later.
html = re.sub(
r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})(\u2013\d{2})?</b>)\s+((?:in\s+[^<]*|―\s+)<i>.*?</i>)',
r'\1 <ANON_IN_SOURCE>\4</ANON_IN_SOURCE>',
html
)
html = re.sub(
r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})(\u2013\d{2})?</b>)\s+([^\s<]+(?:\s+[^\s<]+)*?)\s+(?=in\s+<i>|<i>|in|\(\w)',
r'\1 <span class="author">\4</span> ',
html
)
html = re.sub(r'(<span class="author">[^<]*</span>)\s+((?:in\s+)?<i>[^<]*</i>)', r'\1 <span class="title">\2</span>', html)
html = re.sub( # Handle author + number reference pattern (like Ormin 9500)
r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})</b>)\s+([^\s<]+(?:\s+[^\s<]+)*)\s+(\d+)\s+<span style="color:#8B008B">',
r'\1 <span class="author">\3</span> <span class="reference">\4</span> <span style="color:#8B008B">',
html
)
html = re.sub( # Handle author + number line-number pattern (like Lay. 3014)
r'(<b>(?:\?)?(?:<i>[acp]</i>\s?)?(\d{3,4})</b>)\s+((?:[A-Z]+\.)?\s?<abr>[^<]+</abr>)\s+(\d+)\s+<span style="color:#8B008B">',
r'\1 <span class="author">\3</span> <span class="line-number">\4</span> <span style="color:#8B008B">',
html
)
html = re.sub( # Handle year-digit + author (ex: <b>1925–6</b> E. Hemingway in <i>)
r'(<b>(?:\?)?(?:<i>[acp]</i>\s?)?(\d{3,4})(?:[-–]\d{1,2})?</b>)\s+([A-Z]\.\s+[A-Z][a-z]+)\s+in\s+(<i>)',
r'\1 <span class="author">\3</span> in \4',
html
)
# html = re.sub(
# r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})(\u2013\d{2})?</b>)\s+([^\s<,]+(?:\s+[^\s<,]+)*),\s+<span class="quotes">',
# r'\1 <span class="author">\4</span>, <span class="quotes">',
# html
# )
# Replace the start of the etymology block, but only the first occurrence, just in case.
# We need to track WHERE this replacement happened to know where to start searching for the end.
# otherwise we might end up closing before the etymology actually starts (happened to entry 'Islamo-')
etymology_start_marker = '<div class="etymology"><blockquote><span class="etymology-main">['
html = re.sub(
r'<blockquote><span style="color:#808080">\[',
etymology_start_marker,
html,
count=1
)
# Locate the start of the etymology section we just created
ety_start_idx = html.find(etymology_start_marker)
if ety_start_idx != -1:
# Then let's try finding the correct closing tag for the etymology block. stop_pos is a point at which it will for sure have closed.
# We search starting from our current position to ensure we don't find a marker from a previous section.
stop_pos = html.find('<b><span style="color:#4B0082">', ety_start_idx)
# From the start of the etymology block -> to the stop_pos (or end of string)
search_start = ety_start_idx
search_end = stop_pos if stop_pos != -1 else len(html)
target_segment = html[search_start:search_end]
# Apply replacements only to this target segment
result, count = re.subn(r'\],?(</span>)\s*</blockquote>', ']</span></blockquote></div> ', target_segment, count=1)
if count == 0:
result = re.sub(r'\]\s*</blockquote>', ']</blockquote></div> ', target_segment, count=1)
# We add the notes class to all blockquotes inside this etymology block.
div_end = '</div> '
if div_end in result:
inside, after = result.split(div_end, 1)
inside = re.sub(r'(</blockquote>)<blockquote>', r'\1<blockquote class="etymology-notes">', inside)
result = inside + div_end + after
# else:
# result = re.sub(r'(</blockquote>)<blockquote>', r'\1<blockquote class="etymology-notes">', result)
# Reassemble the HTML: Pre-segment + Processed Segment + Post-segment
html = html[:search_start] + result + html[search_end:]
# Earlier we left the extra closing blockquote so it could be found through the previos etymology bit,
# we now remove it, so we don't have a single lingering closing tag. (see 'jefe')
html = re.sub(r'(</div>)(\])</blockquote>(</div>)', r'\2\1\3', html)
# some newer entries (like 'anime'), have a sligtly different pattern than </ex></blockquote>
html = re.sub(r'</ex>(.*?)</blockquote>', r'</div> <blockquote class="etymology-notes">\1</blockquote> ', html)
# Heuristic approach to wrap in the forms section. note: there are multiple variations here so other forms sections found deep
# into an entry might not be captured. HELP WANTED #fixme.
html = re.sub(r'<blockquote>(\(?Forms:?.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(?:<i>)?(Compared.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(\(?[Aa]lso (?:[0-9])?.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(\(?[0-9][-–]?[0-9]? <b>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL) # see '-y' suffix^2
html = re.sub(r'<blockquote>([0-9]–[0-9]?.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL) # see 'ABC'
html = re.sub(r'<blockquote>(<abr>Pa.</abr>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(Past and <abr>pple.</abr>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(Pl. <b>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
# this one gets reprocessed later on in _apply_headword_fix_outside_quotations
html = re.sub(r'<blockquote>(Pl. [,.;].*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(Usually in <abr>pl.</abr>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(commonly in (?:<i>)?<abr>pl.</abr>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(Inflected .*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>(\(?In [0-9](?:–[0-9])? .*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
# sometimes the 'forms' section is placed below its normal location and is preceded by a greek letter, e.g., "α", so we need to capture that too.
html = re.sub(r'<blockquote>(\(<i>[\u03b1-\u03c9]</i>\).*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL)
html = re.sub(r'<blockquote>([\u03b1-\u03c9]<sup>[0-9]</sup>.*?)</blockquote>', r'<div class="forms">\1</div>', html, flags=re.DOTALL) # greek letters
# These are mini etymologies found for specific senses (i.e., not the main at the top of the entry).
# Note: this needs rethinking, see issue #3
# html = html.replace('<blockquote>[', '<blockquote class="etymology">[')
# html = html.replace(']</blockquote>', ']</div>')
html = html.replace('<span style="color:#8B008B">', '<span class="quotes">')
html = re.sub(r'</span><b>(\??(<i>)?[acp0-9])', r'</span> <b>\1', html)
html = re.sub(r'(<span class="quotes">.*?</span>)(<[^>]+>)', r'\1 \2', html)
html = re.sub(r'\{sup([a-z])\}', r'<span class="small-cap-letter">\1</span>', html)
html = re.sub(r'(</blockquote>)(<blockquote><abr>†</abr>\s*<b><span style="color:#4B0082">)', r'\1 \2', html)
# Remove embedded styles and add classes to the spans
html = re.sub(r'(<span style="color:#4B0082">\[?)<abr>([a-z]\.)</abr>(\]?</span>)', r'\1\2\3', html) # stay [f.] in entry 'acid'
html = re.sub(r'<span style="color:#4B0082">(\[?[0-9]+\.\]?)</span>', r'<span class="senses">\1</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[a-z]+\.\]?)</span>', r'<span class="subsenses">\1</span>', html) # entry 'set' has double letters
html = re.sub(r'<span style="color:#4B0082"><abr>(\[?[fn]\.\]?)</abr></span>', r'<span class="subsenses">\1</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[0-9]+\.\]?) (\[?[a-z]\.\]?)</span>', r'<span class="senses">\1</span> <span class="subsenses">\2</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[0-9]+\.\]?) <abr>(\[?[a-z]\.\]?)</abr></span>', r'<span class="senses">\1</span> <span class="subsenses">\2</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[IVXL]+\.\]?) (\[?[0-9]+\.\]?)</span>', r'<span class="major-division">\1</span> <span class="senses">\2</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[IVXL]+\.\]?)</span>', r'<span class="major-division">\1</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[A-Z]\.\]?)</span>', r'<span class="pos">\1</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[A-Z]\.\]?) (\[?[IVXL]+\.\]?)</span>', r'<span class="pos">\1</span> <span class="major-division">\2</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[A-Z]\.\]?) (\[?[0-9]+\.\]?)</span>', r'<span class="pos">\1</span> <span class="senses">\2</span>', html)
html = re.sub(r'<span style="color:#4B0082">(\[?[A-Z]\.\]?) (\[?[a-z]\.\]?)</span>', r'<span class="pos">\1</span> <span class="subsenses">\2</span>', html)
# odd case (entry 'do'), must happen before _process_pos_forms_section
html = html.replace('<i><abr>pl.</abr></i>; .', '<i><abr>pl.</abr></i>; <b>do</b>.')
if 'class="pos"' in html:
html = self._process_pos_forms_section(html)
# it is crazy how the mind forgets, why one does things ¯\_(ツ)_/¯, i believe this is here so main sections don't get turned into usage-notes.
html = re.sub(r'</blockquote><blockquote>(\s*)(<b>)?(<span class=|<abr>)', r'</blockquote><blockquote class="definition-partial">\1\2\3', html)
html = re.sub(r'</blockquote><blockquote>(\s*)([0-9]\.)', r'</blockquote><blockquote class="definition-partial">\1\2', html)
html = re.sub(r'(_____</blockquote>)<blockquote>', r'\1<blockquote class="addendum">', html)
html = re.sub(r'<blockquote>\*', '<blockquote class="subheading">*', html)
html = re.sub(r'<blockquote>([a-z].*?)</blockquote><blockquote class="definition-partial">',
r'<blockquote class="definition-indent">\1</blockquote><blockquote class="definition-partial">', html) #see entry 'pneumono-'
# This seems to be introducing some false positives, see entry "in", but overall it follows the OED pattern,
# so keeping it for now, however it might need to be revisited. #fixme.
html = html.replace('</blockquote><blockquote>', '</blockquote><blockquote class="usage-note">')
html = re.sub(r'(</blockquote></div>)(<blockquote><b><span class="major-division">)', r'\1 \2', html)
html = html.replace('</blockquote></div><blockquote>', '</blockquote></div><blockquote class="usage-note">')
html = re.sub(r'(<blockquote class=")usage-note("><i><abr>)', r'\1phonetic\2', html)
html = html.replace('{two2n}', '2<sup>2<sup>n</sup></sup>') # Fermat number
html = html.replace('{ppp}', '\u2034') # ‴ (triple prime)
html = html.replace('{pp}', '\u02ba') # ʺ (modifier letter double prime)
html = html.replace('{p}', '\u02c8') # ˈ (primary stress marker) see entry flat adv and n^3 12.b year 1901.
html = html.replace('{ddd}', '...')
html = html.replace('{tittle}', '⋰')
html = html.replace('{bra}', '⟨ |') # refers to a vector in quantum mech.
html = html.replace('{vb}', '\u007C') # vertical bar |
html = html.replace('{ob}', '{')
html = html.replace('{cb}', '}')
html = html.replace('{oqq}', '\u201C') # Left double quotation mark
html = html.replace('{cqq}', '\u201D') # Right double quotation mark
html = html.replace('{pstlg}', '£')
html = html.replace('{pcnt}', '%')
html = html.replace('{cprt}', '©') # copyright
html = html.replace('{hash}', '#')
html = html.replace('{at}', '@')
html = html.replace('{cross}', '✠')
html = html.replace('{fatpara}', '\'¶\'') # liberty taken here, the qoutes should differedntiate it from one used by OED
html = html.replace('{revsc}', '\u061B') # reverse semi-colon
html = html.replace('{arzero}', '\u0660') # Arabic-Indic zero ٠
html = html.replace('{scruple}', '℈') # small unit of measure
html = html.replace('{smY}', 'ʏ') # small capital Y
html = html.replace('{smR}', 'ʀ') # small capital R - uvular trill, like French R
# chemistry stuff
html = html.replace('{pm}', '±') # plus or minus
html = html.replace('{equil}', '⇌') # equilibrium
html = html.replace('{b1}', '−') # single bond (minus sign U+2212)
html = html.replace('{b2}', '=') # double bond (equals sign U+003D)
html = html.replace('{b3}', '≡') # triple bond (identical to U+2261)
html = html.replace('{btl1}', '−') # top-line of manually constructed bond
html = html.replace('{bbl1}', '−') # bottom-line of manually constructed bond
html = html.replace('{btr1}', '−') # top-line right
html = html.replace('{bbr1}', '−') # bottom-line right
html = html.replace('{obigb}', '{') # large opening curly brackets
html = html.replace('{cbigb}', '}') # large closing curly brackets
html = html.replace('{obigpren}', '(') # large opening parens
html = html.replace('{cbigpren}', ')') # large closing parens
html = html.replace('{obigsb}', '[') # large opening square brackets
html = html.replace('{cbigsb}', ']') # large closing square brackets
html = html.replace('{elem}', '∈')
html = html.replace('{supg}', 'g') # odd one, seems to be just a regular 'g'
html = html.replace('{ddag}', '‡')
html = html.replace('{repetn}', ':||:') # repetition, musical notation (Morley 1597)
html = html.replace('{quaver}', '\u266A') # ♪
html = html.replace('{squaver}', '\u266C') # ♬ unicode does not seem to have a single semiquaver... OED shows [squaver] in entry 'hook'
html = html.replace('{semibr}', '\U0001D15D')
html = html.replace('{ruasper}', 'u\u0314') # u̔
html = html.replace('{roasper}', 'o\u0314') # o̓
html = html.replace('{nfasper}', '\u0314')
# these following ones, require being checked on a printed edition of the OED
html = html.replace('{egyasper}', '[egyasper]') # still needs revision, same as following line
html = html.replace('{ormg}', '[ormg]') # OED shows it like this, hard to tell what it actually is at the moment. tracked in #12
html = html.replace('{blb}', '[blb]') # there is no unicode, OED requires special font, shown as ""
html = html.replace('{Tse}', 'Ц') # cyrillic
html = html.replace('{wlenisisub}', 'ᾠ')
html = html.replace('{schwafrbl}', 'ə̯')
html = html.replace('{nfgra}', 'ˋ')
def format_fraction(match):
    """Render a known {NUMonDEN} placeholder as <sup>/<sub> HTML.

    Unrecognised placeholders are passed through untouched.
    """
    token = match.group(1)
    known_fractions = {
        'sixon8': ('6', '8'), 'sixon4': ('6', '4'), 'sixon2': ('6', '2'),
        'threeon4': ('3', '4'), 'threeon2': ('3', '2'), 'threeon8': ('3', '8'),
        'threeon16': ('3', '16'), 'twoon4': ('2', '4'), 'oneon4': ('1', '4'),
        'oneon3': ('1', '3'),
    }
    parts = known_fractions.get(token)
    if parts is None:
        return match.group(0)
    numerator, denominator = parts
    return f'<sup>{numerator}</sup>/<sub>{denominator}</sub>'
html = re.sub(r'\{(\w+on\d+)\}', format_fraction, html)
def replace_acute(match):
    """Translate an {Xacu} escape into its acute-accented character(s).

    ``match.group(1)`` is the key between '{' and 'acu}'. Unknown keys fall
    through and the whole original '{...acu}' token is returned unchanged.
    """
    letter = match.group(1)
    acute_map = {
        'a': '\u00e1', 'A': '\u00c1', # á
        'e': '\u00e9', 'E': '\u00c9', # é
        'i': '\u00ed', 'I': '\u00cd', # í
        'o': '\u00f3', 'O': '\u00d3', # ó
        'u': '\u00fa', 'U': '\u00da', # ú
        'y': '\u00fd', 'Y': '\u00dd', # ý
        'nf': '´', # ´ (acute alone)
        'g': '\u01F5', # ǵ
        'n': '\u0144', # ń
        't': 't\u0301', # t + acute
        'w': '\u1E83', # ẃ
        'r': '\u0155', # ŕ
        'z': '\u017A', # ź
        'Ae': '\u01FC', # Ǽ (precomposed)
        'uuml': '\u01D8', # ǘ (precomposed)
        'aisub': '\u1FB4', # ᾴ (precomposed Greek)
        'amac': '\u0101\u0301', # ā + acute
        'omac': '\u014D\u0301', # ō + acute
        'imac': '\u012B\u0301', # ī + acute
        'umac': '\u016B\u0301', # ū + acute
        'ymac': '\u0233\u0301', # ȳ + acute
        'euml': '\u00EB\u0301', # ë + acute
        'obar': '\u00F8\u0301', # ø + acute
        'edotab':'\u0117\u0301', # ė + acute
        'utilde':'\u0169\u0301', # ũ + acute
        'rdotbl':'\u1E5B\u0301', # ṛ + acute
        'mdotbl':'\u1E43\u0301', # ṃ + acute
        'gibreve':'\u011F\u0301', # ğ + acute
        'eundl': 'e\u0332\u0301', # e + underline + acute
        'giuml': '\u0390', # ΐ (precomposed, iota + diaeresis + acute)
        'wisub': '\u1FF4', # w with subscript?
        'alenisisub': '\u1F84', # ᾄ
        'hisub': '\u1FC4', # ῄ
        'ilenismac': '\u1F30\u0304\u0301', # ἰ + macron + acute
        'Ulenis': '\u03A5\u0313\u0301', # Υ + smooth breathing + acute
        'guuml': '\u03B0', # ΰ (precomposed, upsilon + diaeresis + acute)
    }
    return acute_map.get(letter, match.group(0))
html = re.sub(r'\{([^}]+)acu\}', replace_acute, html)
def replace_cedilla(match):
    """Resolve an {Xced} escape to its cedilla-marked character.

    Unknown keys yield the original token unchanged.
    """
    cedilla_forms = {
        'a': 'a\u0327',    # a + combining cedilla
        'c': '\u00e7',     # ç
        'C': '\u00c7',     # Ç
        'S': '\u015e',     # Ş
        'i': 'i\u0327',    # see "Lamba" or issue https://github.com/Commodore64user/oed_prettifier/issues/12
        'd': 'd\u0327',    # ḑ
        't': '\u0163',     # ţ
        'z': 'z\u0327',    # z + combining cedilla
        'nf': '\u00B8',    # cedilla [squiggly bit only, which technically is what a cedilla is ;)]
        'aacu': '\u00e1',  # verified by og quote, see "id-al-adha" or issue #12
    }
    try:
        return cedilla_forms[match.group(1)]
    except KeyError:
        return match.group(0)
html = re.sub(r'\{([^}]+)ced\}', replace_cedilla, html)
html = re.sub(r'⊇', 'e', html)
def replace_breve(match):
    """Translate an {Xbreve} escape into its breve-marked character(s).

    Unknown keys fall through: the original '{...breve}' token is returned
    whole.
    """
    letter = match.group(1)
    breve_map = {
        'c': 'c\u0306', 's': 's\u0306', # s̆
        'y': 'y\u0306', 'A': '\u0102', # Ă
        'z': 'z\u0306', 'G': '\u011e', # Ğ
        'r': 'r\u0306', 'S': 'S\u0306', # S̆
        'I': '\u012c', 'O': '\u014e', # Ŏ
        'j': 'j\u0306', 'n': 'n\u0306', # n̆
        'nf': '\u0306', 'ae': 'æ̆̆',
        # that 'sq' replacement IS supposed to be like that, don't undo it again silly...
        'go': '\u03bf\u0306', 'sq': '', # see issue https://github.com/Commodore64user/oed_prettifier/issues/12#issuecomment-3316062903
        'ymac': 'y\u0304\u0306', 'kmac': 'k\u0304\u0306',
        'oemac': '\u0153\u0304\u0306', #'gamac': 'FILLER_gamac_breve',
        'aemac': '\u00e6\u0304\u0306', 'ohook': '\u01eb\u0306',
    }
    return breve_map.get(letter, match.group(0))
html = re.sub(r'\{([^}]+)breve\}', replace_breve, html)
def replace_mac(match):
    """Translate an {Xmac} escape into its macron-marked character(s).

    Unknown keys fall through: the original '{...mac}' token is returned
    whole.
    """
    letter = match.group(1)
    mac_map = {
        'g': 'g\u0304', 'n': 'n\u0304',
        'S': 'S\u0304', 'I': 'I\u0304',
        'z': 'z\u0304', 'w': 'w\u0304',
        'nf': '\u0304', 'oe': '\u0153\u0304',
        'gh': '\u03b7\u0304', 'Ae': '\u00c6\u0304', # Ǣ
        'ope': '\u025b\u0304', 'revv': '\u028c\u0304',
        'revr': '\u0279\u0304', 'obar': '\u00f8\u0304',
        'ahook': 'ą̄̄̄', 'schwa': '\u0259\u0304',
        'rcircbl': 'r̥̄', 'shtsyll': '\u222A\u0304', # still needs verification
        'edotbl': 'e\u0323\u0304', 'odotbl': 'o\u0323\u0304',
        'alenis': '\u1F00\u0304', 'ilenis': '\u1F30\u0304', # still needs verification, greek chars used in entries sun and exipotic
        'ibreve': 'i\u0304\u0306', 'obreve': 'o\u0304\u0306',
    }
    return mac_map.get(letter, match.group(0))
html = re.sub(r'\{([^}]+)mac\}', replace_mac, html)
def replace_bar(match):
    """Resolve an {Xbar} escape to the corresponding barred/stroked letter.

    Unknown keys yield the original token unchanged.
    """
    barred = {
        'o': '\u00F8',   # ø
        'O': '\u00D8',   # Ø
        'L': '\u0141',   # Ł
        'i': '\u0268',   # ɨ
        'p': '\u1D7D',   # still needs visual confirmation
        'u': '\u0289',   # ʉ
        'th': '\uA765',  # still needs visual confirmation
        'Th': '\uA764',  # still needs visual confirmation
    }
    key = match.group(1)
    return barred[key] if key in barred else match.group(0)
html = re.sub(r'\{([^}]+)bar\}', replace_bar, html)
def replace_script(match):
    """Map a {scrX} escape to the Unicode mathematical-script form of X.

    Keys not in the table yield the original token (the surrounding regex
    already restricts which letters can appear).
    """
    script_forms = {
        'T': '\U0001D4AF',  # 𝒯
        'S': '\U0001D4AE',  # 𝒮
        'C': '\U0001D49E',  # 𝒞
        'E': '\U0001D4A0',  # 𝓔
        'Q': '\U0001D4AC',  # 𝒬
        'L': '\U0001D4A6',  # 𝓛
        'b': '\U0001D4B7',  # 𝒷
        'h': '\U0001D4BD',  # 𝒽
        'l': '\U0001D4C1',  # 𝓁
        'D': '\U0001D49F',  # 𝒟
        'A': '\U0001D49C',  # 𝒜
        'F': '\U0001D4A1',  # 𝓕
        'M': '\U0001D4A8',  # 𝓜
        'R': '\U0001D4A5',  # 𝓡
        'U': '\U0001D4B0',  # 𝒰
    }
    return script_forms.get(match.group(1), match.group(0))
html = re.sub(r'\{scr([TSCEQLbhlDAFMRU])\}', replace_script, html)
def replace_zodiac(match):
    """Swap a named zodiac placeholder for its Unicode sign character."""
    signs = {
        'aries': '\u2648',
        'virgo': '\u264D',
        'scorpio': '\u264F',
        'sagit': '\u2650',
        'capr': '\u2651',
        'aquar': '\u2652',
    }
    name = match.group(1)
    return signs[name] if name in signs else match.group(0)
html = re.sub(r'\{(aries|virgo|scorpio|sagit|capr|aquar)\}', replace_zodiac, html)
def replace_ring_below(match):
    """Attach a combining ring below (U+0325) to l, m, or n."""
    ringed = {
        'l': 'l\u0325',  # l̥
        'm': 'm\u0325',  # m̥
        'n': 'n\u0325',  # n̥
    }
    return ringed.get(match.group(1), match.group(0))
html = re.sub(r'\{([lmn])circbl\}', replace_ring_below, html)
def replace_circ(match):
    """Resolve an {Xcirc} escape to its circumflex-marked form.

    Unknown keys yield the original token unchanged.
    """
    circumflexed = {
        'c': '\u0109',              # ĉ
        'epsilon': '\u025b\u0302',  # ɛ̂
        'g': '\u011d',              # ĝ
        'n': 'n\u0302',             # n̂
        'nf': '\u0302',             # bare combining circumflex
        'ohg': '\u00f4',            # ô
        's': '\u015d',              # ŝ
        'uuml': '\u00fc\u0302',     # ü̂
        'w': '\u0175',              # ŵ
    }
    try:
        return circumflexed[match.group(1)]
    except KeyError:
        return match.group(0)
html = re.sub(r'\{([^}]+)circ\}', replace_circ, html)
def replace_uml(match):
    """Resolve an {Xuml} escape to its diaeresis-marked form.

    Unknown keys yield the original token unchanged.
    """
    umlauted = {
        'gi': '\u03ca',          # ϊ Greek iota with dialytika
        'gu': '\u03cb',          # ϋ Greek upsilon with dialytika
        'imac': '\u012b\u0308',  # i-macron + combining diaeresis
        'nf': '\u00a8',          # ¨ standalone diaeresis
        'v': 'v\u0308',          # v + combining diaeresis
    }
    key = match.group(1)
    return umlauted[key] if key in umlauted else match.group(0)
html = re.sub(r'\{([^}]+)uml\}', replace_uml, html)
def replace_frown(match):
    """Resolve an {Xfrown} escape to a form with an inverted breve (U+0311).

    Unknown keys yield the original token unchanged.
    """
    frowned = {
        'alenisisub': '\u1f80\u0311',  # ᾀ̑
        'Elenis': '\u1f18\u0311',      # Ἐ̑
        'hasperisub': '\u1f97\u0311',  # ᾗ̑
        'm': 'm\u0311',                # m̑
        'nf': '\u0311',                # bare inverted breve
        'o': 'o\u0311',                # ȏ
        'u': 'u\u0311',                # ȗ
    }
    key = match.group(1)
    return frowned[key] if key in frowned else match.group(0)
html = re.sub(r'\{([^}]+)frown\}', replace_frown, html)
# Absolutely no clue, whether this is correct or not. i am cheating here and using help from AI
def replace_arabic(match):
    """Resolve an {arXxx} escape to an Arabic letter / presentation form.

    NOTE(review): per the original author these mappings were AI-assisted
    guesses — several still need verification against a printed OED.
    Unknown keys yield the original token unchanged.
    """
    arabic_forms = {
        'dal': '\u062F', 'alif': '\u0627', # د ا
        'Ha': '\u062D', 'nun': '\uFEED', # ح ﻧ
        'ta': '\uFE97', 'tha': '\uFE9B', # ﺗ ﺛ
        'ba': '\uFE91', 'yafull': '\uFEEF', # ﺑ ﻳ
        'pa': '\uFB56', 'ha': '\u0647', # ﭘ ه
        'waw': '\u0648', 'ya': '\u06CC', # و ی
        'Dadfull': '\u0636', 'nunfull': '\u0646', # ض ن
    }
    key = match.group(1)
    return arabic_forms[key] if key in arabic_forms else match.group(0)
html = re.sub(r'\{ar([a-zA-Z]+)\}', replace_arabic, html)
def replace_dotbl(match):
    """Resolve an {Xdotbl} escape to a letter with a dot below (U+0323).

    Unknown keys yield the original token unchanged.
    """
    dotted_below = {
        'c': 'c\u0323',       # c + dot below
        'D': 'D\u0323',       # Ḍ
        'eacu': 'é\u0323',    # é + dot below
        'e': 'e\u0323',       # ẹ
        'E': 'E\u0323',       # Ẹ
        'K': 'K\u0323',       # Ḳ
        'l': 'l\u0323',       # ḷ
        'R': 'R\u0323',       # Ṛ
        'shacek': 'š\u0323',  # š + dot below
        'T': 'T\u0323',       # Ṭ
    }
    try:
        return dotted_below[match.group(1)]
    except KeyError:
        return match.group(0)
return dotbl_map.get(letter, match.group(0))
html = re.sub(r'\{([^}]+)dotbl\}', replace_dotbl, html)
def replace_tilde(match):
    """Translate an {Xtilde} escape into its tilde-marked character(s).

    Unknown keys fall through: the original '{...tilde}' token is returned
    whole.
    """
    letter = match.group(1)
    tilde_map = {
        'aisub': 'ᾷ', 'amac': '\u0101\u0303', # ā + combining tilde
        'd': 'd\u0303', 'edotab':'\u0117\u0303', # ė + combining tilde
        'e': '\u1EBD', 'i': '\u0129',
        'l': 'l\u0303', 'm': 'm\u0303',
        'nf': '\u02DC', 'omac': '\u014D\u0303', # ō + combining tilde
        'q': 'q\u0303', 'revv': '\u028C\u0303', # ʌ + combining tilde
        'r': 'r\u0303', 'schwa': '\u0259\u0303', # ə + combining tilde
        's': 's\u0303', 't': 't\u0303',
        'uang': 'ů̃', 'uda': '\u0250\u0303',
        'u': '\u0169', 'y': '\u1EF9',
    }
    return tilde_map.get(letter, match.group(0))
html = re.sub(r'\{([^}]+)tilde\}', replace_tilde, html)
html = re.sub(r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})</b>) (<abr>tr\.</abr>)(\s<i>)', r'\1 <span class="translator">tr.</span>\4', html)
# Handle "Author abbreviation." pattern (like "Francis tr.")
html = re.sub(r'(<b>(?:\?)?(?:<i>[acp]</i>)?(\d{3,4})</b>) ((?:[\w]\.)?\s?[\w]+)\s(<abr>[\w]+\.</abr>)(\s<i>)', r'\1 <span class="author">\3</span> \4\5', html)
# Handle specific "Initial Author (Source) Number" pattern
html = re.sub(
r'(<b>(?:\?)?(?:<i>[acp]</i>)?(?:\d{3,4})</b>) ([A-Z]\.)\s<abr>([\w]+\.)</abr>\s(\([^)]+\))\s([0-9]+)',
r'\1 <span class="author">\2 \3</span> \4 \5',
html
)
# handle authors with abbreviated names.
html = re.sub(r'(<b>(?:\?)?(?:<i>[acp]</i>)?(?:\d{3,4})</b>) (<abr>[\w]+\.</abr>)\s([0-9]+)', r'\1 <span class="author">\2</span> \3', html)
# matches: <b>1755</b> Johnson <span class="quotes">...
html = re.sub(r'(<b>(?:\?)?(?:<i>[acp]</i>)?(?:\d{3,4})</b>) ([A-Z][a-zA-Z]+) (<span class="quotes">)', r'\1 <span class="author">\2</span> \3', html)
# matches the following pattern: "<b>1855</b> <abr">Geo.</abr> Eliot in"
html = re.sub(r'(<b>(?:\?)?(?:<i>[acp]</i>)?(?:\d{3,4})</b>) (<abr>[\w]+\.</abr>)\s([\w]+)\s(in)', r'\1 <span class="author">\2 \3</span> \4', html)
# This grew out of control, but is seems to be held together by fairy dust, it works although this should have been done in a more structured way.
html = re.sub(
r'(<b>(?:\?)?(?:<i>[acp]</i>)?(?:\d{3,4})</b>) ([^<]*)?<abr>([\w]+\.)</abr>\s([\w]+)?\s?((<i>)?[0-9]?\s?)(<i>|<abr>)',
r'\1 <span class="author">\2\3 \4</span> \5\7',
html
)
# Finally, convert the placeholder back
html = re.sub(r'<ANON_IN_SOURCE>', '', html)
html = re.sub(r'</ANON_IN_SOURCE>', '', html)
# When wrapping authors earlier on, we may have overshot and captured incorrect strings, here we try to undo those mistakes.
def fix_author_tr(match):
    """Undo over-greedy author spans produced by the wrapping passes above.

    *match* is a re match over one ``<span class="author">...</span>``.
    Returns the span untouched when it looks clean; otherwise strips the
    span (false positive) or moves stray trailing text/numbers outside it.
    """
    span_html = match.group(0)
    # Trailing fragments that are not part of an author's name.
    bad_suffixes = (' tr.', ' quoted', ' [not', ' [impled', ' [implied', ' in<', ', etc.', ' [see')
    # Leading fragments that mean the whole span was a false positive.
    bad_prefixes = ('*', '[impled', '[implied', '[see', '―', ' ,', ', ', 'Steel fixer', ': implied')
    needs_fix = (
        any(suffix in span_html for suffix in bad_suffixes)
        or any('>' + prefix in span_html for prefix in bad_prefixes)
        or re.search(r' \d+[^<]*</span>$', span_html) is not None
    )
    if not needs_fix:
        return span_html
    prefix_alt = '|'.join(map(re.escape, bad_prefixes))
    suffix_alt = '|'.join(map(re.escape, bad_suffixes))
    # A recognised prefix: the span is not an author at all — drop the tags.
    span_html = re.sub(
        rf'<span class="author">({prefix_alt})(.*?)</span>',
        r'\1\2',
        span_html
    )
    # A recognised suffix: keep the name inside, move the rest out.
    span_html = re.sub(
        rf'<span class="author">(.*?)({suffix_alt})\s*</span>',
        r'<span class="author">\1</span>\2 ',
        span_html
    )
    # Trailing dates/page numbers likewise belong outside the span.
    span_html = re.sub(
        r'<span class="author">(.*?)( \d+[^<]*)</span>',
        r'<span class="author">\1</span>\2',
        span_html
    )
    return span_html
# When wrapping authors earlier on, we may have overshot and captured
# incorrect strings; run fix_author_tr over every author span to undo
# those mistakes.
html = re.sub(r'<span class="author">.*?</span>', fix_author_tr, html)
# single occurrence in entry "crumpet", doing it for Lady Bracknell...
html = re.sub(r'<span class="author">(a tender cake of o loof, spreynde with oile, paast sodun)</span>', r'\1', html)
# --- One-off repairs for specific entries where text was lost upstream ---
html = html.replace('</b>; β.</blockquote>', '</b>; β<b>otherwise</b>.</blockquote>')
html = html.replace('See also Early', 'See also <span class="kref">bushment</span>. Early') # entry 'embushment'
html = html.replace('See also as', 'See also C.B. as') # entry 'C'
html = html.replace('See also a.', 'See also <span class="kref">O.K.</span> a.') # entry 'O'
html = html.replace('see also <abr>', 'See also <span class="kref">Lit</span> <abr>') # entry 'sup'
html = html.replace('<abr>Mod.</abr>E. .', '<abr>Mod.</abr>E. <b>tell</b>.') # entry 'tell'
html = html.replace('; 6 6, 9', '; 6 <b>speare</b> 6, 9') # entry 'speare'
html = html.replace('>, ? <', '>, ? <b>athel</b> <') # entry 'athel'
html = html.replace(') ; <i>', ') <b>think</b>; <i>') # entry 'think'
html = re.sub(r'(<b>partridge p.</b>,) , (<b>rock p.</b>)', r'\1 <span class="kref">passanger-p.</span>, \2', html) # entry 'pigeon'
# Entries opening with "<b>[" are spurious; move the stray closing "]"
# into its own span.  Only the LAST occurrence of the first matching
# pattern is rewritten (rsplit with maxsplit=1), then we stop.
if html.startswith('<b>['):
    for old, new in [
        ('<div class="quotations">]</div>', '<span class="spurious-entry">]</span>'),
        (']</div>', '</div><span class="spurious-entry">]</span>'),
        (']</blockquote>', '</blockquote><span class="spurious-entry">]</span>'),
    ]:
        if old in html:
            html = new.join(html.rsplit(old, 1))
            break
# Author spans that captured only the word "in" were false positives.
html = re.sub(r'<span class="author">in</span>', 'in', html)
# Move an uncertain "? " out of the author span so only the name is tagged.
html = re.sub(r'(<span class="author">)(\? )(.*?)(</span>)', r'\2\1\3\4', html)
# Revert the temporary translator marker created earlier.
html = re.sub(r'<span class="translator">tr.</span>', '<abr>tr.</abr>', html)
# Normalise all remaining <abr> tags to styled spans.
html = html.replace('<abr>', '<span class="abbreviation">')
html = html.replace('</abr>', '</span>')
# HTML-escape the headword once; it is spliced into markup below.
escaped_hwd = html_module.escape(self.headword)
def _apply_headword_fix_outside_quotations(fragment: str) -> str:
    """Reinsert the elided headword everywhere except inside quotations.

    The source text abbreviates the headword to a dash or a blank in many
    positions ("Also —,", "(also )", "Hence —," ...).  Each substitution
    below targets one such elision shape and splices ``<b>{escaped_hwd}</b>``
    (the closure variable bound just above) back in.  Quotation text must
    stay untouched, so every ``<div class="quotations">...</div>`` block is
    first swapped for a numbered placeholder and restored at the end.
    """
    quote_blocks = []  # stashed quotation divs, in document order
    placeholder_prefix = '__OED_QUOTATION_BLOCK_'
    # Opening tag of any div whose class list contains "quotations".
    quote_start_pattern = re.compile(
        r'''<div\b[^>]*\bclass=(['"])[^'"]*\bquotations\b[^'"]*\1[^>]*>''',
        flags=re.IGNORECASE,
    )
    # Any div open/close token; used for depth-matching the closing tag.
    div_token_pattern = re.compile(r'<div\b[^>]*>|</div>', flags=re.IGNORECASE)
    def _stash_quotations(raw_fragment: str) -> str:
        # Replace each quotations div (nested inner divs included) with a
        # numbered placeholder, recording the original block text.
        parts = []
        cursor = 0
        while True:
            start_match = quote_start_pattern.search(raw_fragment, cursor)
            if not start_match:
                parts.append(raw_fragment[cursor:])
                break
            start_idx = start_match.start()
            parts.append(raw_fragment[cursor:start_idx])
            # Walk div tokens from the opening tag, tracking nesting depth
            # until the matching </div> brings it back to zero.
            depth = 0
            end_idx = None
            for token_match in div_token_pattern.finditer(raw_fragment, start_idx):
                token = token_match.group(0)
                if token.lower().startswith('<div'):
                    depth += 1
                else:
                    depth -= 1
                if depth == 0:
                    end_idx = token_match.end()
                    break
            if end_idx is None:
                # If HTML is malformed, keep the tail untouched rather than truncating content.
                parts.append(raw_fragment[start_idx:])
                break
            quote_blocks.append(raw_fragment[start_idx:end_idx])
            parts.append(f'{placeholder_prefix}{len(quote_blocks) - 1}__')
            cursor = end_idx
        return ''.join(parts)
    # Keep quotation HTML untouched, then apply the fixes to the rest.
    html_fragment = _stash_quotations(fragment)
    # ORDER MATTERS below: each pattern targets a specific elision shape,
    # and earlier insertions can change what later patterns see.
    # En dash (\u2013) directly before punctuation stands for the headword.
    html_fragment = re.sub(r'\u2013 ([,;.])', f'– <b>{escaped_hwd}</b>' + r'\1', html_fragment)
    html_fragment = re.sub(r'(\d)\u2013 (\(|</)', r'\1' + f'– <b>{escaped_hwd}</b> ' + r'\2', html_fragment) #hois
    # A digit followed by bare punctuation: the headword was dropped.
    html_fragment = re.sub(r'([0-9]) ([,;:.])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'Also ([., ])', f'Also <b>{escaped_hwd}</b>' + r'\1', html_fragment)
    html_fragment = re.sub(r'\(also \)', f'(also <b>{escaped_hwd}</b>)', html_fragment)
    html_fragment = re.sub(r'([Hh])ence ([,. ])', r'\1' + f'ence <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r', ([,.;])', f', <b>{escaped_hwd}</b>' + r'\1', html_fragment)
    html_fragment = re.sub(r'([lL]\.) ([,.;])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(\(\d) \)', r'\1' + f' <b>{escaped_hwd}</b>)', html_fragment)
    html_fragment = re.sub(r'>; \.', f'>; <b>{escaped_hwd}</b>.', html_fragment)
    html_fragment = re.sub(r'\) ([,])', f') <b>{escaped_hwd}</b>' + r'\1', html_fragment)
    html_fragment = re.sub(r'\(\),', f'(<b>{escaped_hwd}</b>),', html_fragment)
    html_fragment = re.sub(r'(\(\d\)) ([;,.])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(\d–\d) (\(\d)', r'\1' + f' <b>{escaped_hwd}</b> ' + r'\2', html_fragment)
    html_fragment = re.sub(r'(\d–(?:\d)?) </div', r'\1' + f' <b>{escaped_hwd}</b></div', html_fragment)
    # html_fragment = re.sub(r'(\d–) </div', r'\1' + f' <b>{escaped_hwd}</b></div', html_fragment)
    html_fragment = re.sub(r'\) \.</div', f') <b>{escaped_hwd}</b>.</div', html_fragment)
    # NOTE(review): the replacement below emits ';' with no space before
    # the digit — confirm the dropped space is intentional.
    html_fragment = re.sub(r'\) ; (\d)', f') <b>{escaped_hwd}</b>;' + r'\1', html_fragment)
    html_fragment = re.sub(r'(\d) \)\.</', r'\1' + f' <b>{escaped_hwd}</b>).</', html_fragment)
    # Grammatical-label spans (sing./Sc./pl./imp./dial./pple./inf.) followed
    # by bare punctuation: the headword between them was elided.
    html_fragment = re.sub(r'(sing\.</span></i>) ([,. ])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(Sc\.</span></i>) ([.,])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(pl\.</span></i>) ([.,])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(imp\.</span></i>) ([.,])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(dial\.</span></i>) ([.,; ])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(pple.</span></i>) ([,.;])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(pple.</span>) ([,.;])', r'\1' + f' <b>{escaped_hwd}</b>' + r'\2', html_fragment)
    html_fragment = re.sub(r'(inf\.</i>,) \(', r'\1' + f' <b>{escaped_hwd}</b> (', html_fragment)
    html_fragment = re.sub(r'([^\w][Ss]o) ([(<])', r'\1' + f' <b>{escaped_hwd}</b> ' + r'\2', html_fragment)
    html_fragment = re.sub(r'(†</span>)([,.])', r'\1' + f'<b>{escaped_hwd}</b>' + r'\2', html_fragment)
    # html_fragment = re.sub(r' \(', f' <b>{escaped_hwd}</b> (', html_fragment)
    # Restore the stashed quotation blocks in place of their placeholders.
    for idx, quote_block in enumerate(quote_blocks):
        html_fragment = html_fragment.replace(f'{placeholder_prefix}{idx}__', quote_block)
    return html_fragment
# Reinsert elided headwords everywhere except inside quotation blocks.
html = _apply_headword_fix_outside_quotations(html)
# Collapse all whitespace runs to single spaces and drop trailing space.
html = re.sub(r'\s+', ' ', html)
html = html.rstrip()
# this final step is for testing purposes only, to make spotting any
# remaining unexpanded "{...}" placeholders much easier
html = re.sub(r'\{([^\s{}]+)\}', r'<span class="unprocessed">{\1}</span>', html)
return html