# WriteBot/text_processor.py (ariedotcodotnz/WriteBot, main branch)
"""
Text processing module for handling paragraphs, word wrapping, and pagination.

This module provides the `TextProcessor` class and related configuration and
utility functions. It handles the complexities of formatting text for
handwriting synthesis, including breaking text into pages, preserving
paragraph structure, and wrapping text within line limits.
"""

from enum import Enum
from typing import List, Dict, Any, Tuple, Optional, Set
import re


class ParagraphStyle(Enum):
    """
    Enumeration of paragraph handling styles.

    Each member's value is the lowercase string form of its name. The notes
    on the members describe how ``TextProcessor`` in this module reacts to
    them.
    """
    # An empty line is emitted between paragraphs.
    PRESERVE_BREAKS = "preserve_breaks"
    # Single newlines are folded into spaces; paragraphs are split only
    # where a blank line separates them.
    SINGLE_SPACE = "single_space"
    # No dedicated branch exists for this member in this module: no
    # separator line is added between paragraphs.
    NO_BREAKS = "no_breaks"
    # The first line of every paragraph is prefixed with four spaces.
    INDENT_FIRST = "indent_first"


class TextProcessingConfig:
    """
    Plain settings container consumed by the text processor.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name; no validation is performed here.
    """

    def __init__(
        self,
        max_line_length: int = 75,
        lines_per_page: int = 24,
        paragraph_style: ParagraphStyle = ParagraphStyle.PRESERVE_BREAKS,
        preserve_empty_lines: bool = True,
        hyphenate_long_words: bool = False,
        normalize_whitespace: bool = True,
    ):
        """
        Store the supplied settings on the instance.

        Args:
            max_line_length: Upper bound on the number of characters in a line.
            lines_per_page: Upper bound on the number of lines in a page.
            paragraph_style: How paragraphs are separated and formatted.
            preserve_empty_lines: Keep empty input lines in the output.
            hyphenate_long_words: Whether to split long words (basic
                implementation).
            normalize_whitespace: Collapse runs of whitespace into one space.
        """
        # Page geometry.
        self.max_line_length, self.lines_per_page = max_line_length, lines_per_page

        # Paragraph handling.
        self.paragraph_style, self.preserve_empty_lines = (
            paragraph_style,
            preserve_empty_lines,
        )

        # Word and whitespace handling.
        self.hyphenate_long_words, self.normalize_whitespace = (
            hyphenate_long_words,
            normalize_whitespace,
        )


def create_alphabet_set(alphabet_list: Optional[List[str]]) -> Optional[Set[str]]:
    """
    Turn a list of allowed characters into a set.

    Args:
        alphabet_list: Allowed characters, or None when no alphabet is given.

    Returns:
        A set holding the distinct entries of ``alphabet_list``; None when
        ``alphabet_list`` is None.
    """
    # None is passed through untouched; an empty list still yields an empty set.
    return None if alphabet_list is None else set(alphabet_list)


class TextProcessor:
    """
    Handles text cleaning, wrapping, pagination, and formatting.

    All behaviour is driven by the configuration object given to the
    constructor. Its attributes are read each time a method runs, so changes
    made to the configuration after construction are honoured.
    """

    # Prefix placed in front of the first line of a paragraph when the
    # paragraph style is INDENT_FIRST.
    _FIRST_LINE_INDENT = "    "

    # Patterns are compiled once and shared by every instance.
    _BLANK_LINE_BREAK = re.compile(r'\n\s*\n')
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self, config: "TextProcessingConfig"):
        """
        Initialize the TextProcessor.

        Args:
            config: TextProcessingConfig object.
        """
        self.config = config

    def process_text(
        self,
        text: str,
        alphabet: Optional[Set[str]] = None
    ) -> Tuple[List[str], Dict[str, Any]]:
        """
        Process raw text into a list of formatted lines.

        Args:
            text: Input text string.
            alphabet: Optional set of allowed characters. When given, every
                non-whitespace character outside the set is dropped.

        Returns:
            Tuple containing:
            - List of formatted string lines.
            - Metadata dictionary with ``num_lines`` (length of the returned
              list) and ``num_paragraphs`` (number of paragraphs the input
              was split into, empty ones included).

        Raises:
            ValueError: If ``max_line_length`` is smaller than 1.
        """
        if not text:
            return [], {"num_lines": 0, "num_paragraphs": 0}

        style = self.config.paragraph_style
        indent = (
            self._FIRST_LINE_INDENT
            if style == ParagraphStyle.INDENT_FIRST
            else ""
        )

        # Step 1: Normalize text and split into paragraphs
        paragraphs = self._split_into_paragraphs(text)

        # Step 2: Process each paragraph
        processed_lines: List[str] = []
        for para in paragraphs:
            # Handle empty paragraphs (from preserved empty lines)
            if not para.strip():
                if self.config.preserve_empty_lines:
                    processed_lines.append("")
                continue

            # Filter characters if alphabet is provided
            cleaned_para = self._clean_text(para, alphabet)

            # Wrap paragraph into lines, keeping room on the first line for
            # the indent so the indented line still respects the limit.
            wrapped_lines = self._wrap_paragraph(
                cleaned_para, first_line_indent=len(indent)
            )
            if not wrapped_lines:
                # Every character was filtered out: nothing to render, and
                # no separator should be emitted for it either.
                continue

            # A separator belongs *between* paragraphs, so it is only added
            # when something has already been emitted.
            if processed_lines and style == ParagraphStyle.PRESERVE_BREAKS:
                processed_lines.append("")

            if indent:
                wrapped_lines[0] = indent + wrapped_lines[0]

            processed_lines.extend(wrapped_lines)

        return processed_lines, {
            "num_lines": len(processed_lines),
            "num_paragraphs": len(paragraphs)
        }

    def get_pages(
        self,
        text: str,
        alphabet: Optional[Set[str]] = None
    ) -> Tuple[List[List[str]], Dict[str, Any]]:
        """
        Process text and split it into pages.

        Args:
            text: Input text string.
            alphabet: Optional set of allowed characters.

        Returns:
            Tuple containing:
            - List of pages, where each page is a list of line strings.
            - Metadata dictionary from ``process_text`` with an additional
              ``num_pages`` entry.

        Raises:
            ValueError: If ``lines_per_page`` or ``max_line_length`` is
                smaller than 1.
        """
        lines, metadata = self.process_text(text, alphabet)
        pages = self._paginate(lines)
        metadata["num_pages"] = len(pages)
        return pages, metadata

    def _paginate(self, lines: List[str]) -> List[List[str]]:
        """
        Cut a list of lines into consecutive pages of ``lines_per_page`` lines.

        The last page may be shorter; an empty input yields no pages.

        Raises:
            ValueError: If ``lines_per_page`` is smaller than 1.
        """
        page_size = self.config.lines_per_page
        if page_size < 1:
            # A non-positive size previously produced an empty leading page.
            raise ValueError(
                f"lines_per_page must be at least 1, got {page_size!r}"
            )
        return [
            lines[start:start + page_size]
            for start in range(0, len(lines), page_size)
        ]

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """
        Split text into paragraphs based on newlines.

        With the SINGLE_SPACE style, paragraphs are separated by blank lines
        and the single newlines inside a paragraph become spaces. With every
        other style each input line is its own paragraph, so empty lines
        show up as empty paragraphs.

        Returns:
            List of stripped paragraphs; runs of whitespace are collapsed to
            one space when ``normalize_whitespace`` is set.
        """
        # Normalize line endings
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        if self.config.paragraph_style == ParagraphStyle.SINGLE_SPACE:
            paragraphs = [
                chunk.replace('\n', ' ')
                for chunk in self._BLANK_LINE_BREAK.split(text)
            ]
        else:
            # Respect explicitly provided structure
            paragraphs = text.split('\n')

        if self.config.normalize_whitespace:
            return [
                self._WHITESPACE_RUN.sub(' ', chunk).strip()
                for chunk in paragraphs
            ]
        return [chunk.strip() for chunk in paragraphs]

    def _clean_text(self, text: str, alphabet: Optional[Set[str]]) -> str:
        """
        Filter text to include only allowed characters.

        Whitespace is always kept. When ``alphabet`` is None the text is
        returned unchanged.
        """
        if alphabet is None:
            return text

        return "".join(c for c in text if c in alphabet or c.isspace())

    def _wrap_paragraph(self, text: str, first_line_indent: int = 0) -> List[str]:
        """
        Wrap a single paragraph into lines of max_line_length.

        Words are packed greedily, separated by single spaces. A word that
        is longer than the available width is hard-split into chunks of that
        width; ``hyphenate_long_words`` is not consulted here.

        Args:
            text: Paragraph text; it is split on whitespace.
            first_line_indent: Number of characters the caller will prepend
                to the first line. That much room is kept free on the first
                line. The reservation is ignored when it is not positive or
                would leave no room at all.

        Returns:
            List of wrapped lines (empty when ``text`` contains no words).

        Raises:
            ValueError: If ``max_line_length`` is smaller than 1.
        """
        max_len = self.config.max_line_length
        if max_len < 1:
            # With a non-positive width the hard-split loop below could
            # never make progress and would spin forever.
            raise ValueError(
                f"max_line_length must be at least 1, got {max_len!r}"
            )

        if 0 < first_line_indent < max_len:
            first_width = max_len - first_line_indent
        else:
            first_width = max_len

        lines: List[str] = []
        current: List[str] = []
        current_len = 0

        for word in text.split():
            # Only the very first emitted line is narrowed by the indent.
            width = max_len if lines else first_width

            # +1 for the separating space when the line already has words
            needed = len(word) + (1 if current else 0)
            if current_len + needed <= width:
                current.append(word)
                current_len += needed
                continue

            # Word doesn't fit: close the line in progress
            if current:
                lines.append(" ".join(current))
                width = max_len

            # Force-break a word that cannot fit on a line of its own
            while len(word) > width:
                lines.append(word[:width])
                word = word[width:]
                width = max_len

            current = [word]
            current_len = len(word)

        if current:
            lines.append(" ".join(current))

        return lines