diff --git a/CHANGES.rst b/CHANGES.rst index 64f94e14ec3..f63fc1bdeb3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -118,6 +118,8 @@ Bugs fixed for objects documented as ``:py:data:`` to be hyperlinked in function signatures. * #13858: doctest: doctest blocks are now correctly added to a group defined by the configuration variable ``doctest_test_doctest_blocks``. +* #13741: text: fix an infinite loop when processing CSV tables. + Patch by Bénédikt Tran. Testing diff --git a/sphinx/writers/text.py b/sphinx/writers/text.py index 1f14781fc19..cec73342dd8 100644 --- a/sphinx/writers/text.py +++ b/sphinx/writers/text.py @@ -292,18 +292,18 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: else: indent = self.initial_indent + # Note that column_width(x) > len(x) is possible, + # but _handle_long_word() handles negative widths. width = self.width - column_width(indent) if self.drop_whitespace and not chunks[-1].strip() and lines: del chunks[-1] while chunks: - l = column_width(chunks[-1]) - - if cur_len + l <= width: + chunk_width = column_width(chunks[-1]) + if cur_len + chunk_width <= width: cur_line.append(chunks.pop()) - cur_len += l - + cur_len += chunk_width else: break @@ -318,14 +318,21 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: return lines - def _break_word(self, word: str, space_left: int) -> tuple[str, str]: - """Break line by unicode width instead of len(word).""" + @staticmethod + def _find_break_end(word: str, space_left: int) -> int: + """Break word by unicode width instead of len(word). + + The returned position 'end' satisfies:: + + assert column_width(word[:end]) <= space_left + assert end == len(word) or column_width(word[:end+1]) > space_left + """ total = 0 - for i, c in enumerate(word): + for end, c in enumerate(word, 1): total += column_width(c) if total > space_left: - return word[: i - 1], word[i - 1 :] - return word, '' + return end - 1 + return len(word) def _split(self, text: str) -> list[str]: """Override original method that only split by 'wordsep_re'. @@ -333,14 +340,21 @@ def _split(self, text: str) -> list[str]: This '_split' splits wide-characters into chunks by one character. """ - def split(t: str) -> list[str]: - return super(TextWrapper, self)._split(t) + def column_width_safe(x: str) -> int: + # Handle characters that are 0-width. We should refine + # the grouping to prevent splitting a word at combining + # characters or in a group of combining characters with + # at most one non-combining character as the combining + # characters may act on the right or left character. + # + # See https://github.com/sphinx-doc/sphinx/issues/13741. + return max(1, column_width(x)) chunks: list[str] = [] - for chunk in split(text): - for w, g in groupby(chunk, column_width): - if w == 1: - chunks.extend(split(''.join(g))) + for chunk in super()._split(text): + for w, g in groupby(chunk, column_width_safe): + if w <= 1: + chunks.extend(super()._split(''.join(g))) else: chunks.extend(list(g)) return chunks @@ -348,13 +362,25 @@ def split(t: str) -> list[str]: def _handle_long_word( self, reversed_chunks: list[str], cur_line: list[str], cur_len: int, width: int ) -> None: - """Override original method for using self._break_word() instead of slice.""" - space_left = max(width - cur_len, 1) - if self.break_long_words: - l, r = self._break_word(reversed_chunks[-1], space_left) - cur_line.append(l) - reversed_chunks[-1] = r + """Override using self._find_break() instead of str.find().""" + # Make sure at least one character is stripped off on every pass. + # + # Do NOT use space_left = max(width - cur_len, 1) as corner cases + # with "self.drop_whitespace == False" and "self.width == 1" fail. + space_left = 1 if width < 1 else (width - cur_len) + if self.break_long_words: + # Some characters may have len(X) < space_left < column_width(X) + # so we should only wrap chunks for which len(X) > space_left. + end = space_left + chunk = reversed_chunks[-1] + if space_left > 0: + end = self._find_break_end(chunk, space_left) + if end == 0 and space_left: + # force processing at least one character + end = 1 + cur_line.append(chunk[:end]) + reversed_chunks[-1] = chunk[end:] elif not cur_line: cur_line.append(reversed_chunks.pop()) diff --git a/tests/test_writers/test_writer_text.py b/tests/test_writers/test_writer_text.py new file mode 100644 index 00000000000..66eb4f95f4e --- /dev/null +++ b/tests/test_writers/test_writer_text.py @@ -0,0 +1,183 @@ +"""Test the LaTeX writer""" + +from __future__ import annotations + +import pytest +from docutils.utils import column_width + +from sphinx.writers.text import TextWrapper + +find_break_end = TextWrapper._find_break_end + + +@pytest.mark.parametrize( + # glyph of column width 0 + 'glyph', + ['', '\N{COMBINING TILDE}'], +) +def test_text_wrapper_break_phantom_symbol(glyph: str) -> None: + assert column_width(glyph) == 0 + glyph_length = len(glyph) + + for n in range(1, 5): + # Since the glyph has length 0 and column width 0, + # we can always take the entire glpyh. + assert find_break_end(glyph, n) == glyph_length + for m in range(1, 5): + # The multiplied glyph may have non-zero length + # but its column width will always be 0, so we + # take the entire glyph again. + assert find_break_end(m * glyph, n) == m * glyph_length + + +@pytest.mark.parametrize( + ('text', 'colwidth'), + [ + # Glyph of length 1 and column width 1 + ('X', 1), + # Glyph of length 1 and column width 2 + ('\N{CJK UNIFIED IDEOGRAPH-65E5}', 2), + # Glyph of length 2 and column width 1 + ('\N{COMBINING TILDE}X', 1), + # Glyph of length 2 and column width 2 + ('\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}', 2), + # Glyph of length 3 and column width 1 + ('\N{COMBINING TILDE}\N{COMBINING BREVE}X', 1), + ], +) +def test_text_wrapper_break_visible_symbol(text: str, colwidth: int) -> None: + assert column_width(text) == colwidth + for n in range(1, 5): + end = find_break_end(text, n) + assert column_width(text[:end]) <= n + for m in range(2, 5): + m_text = m * text + end = find_break_end(m_text, n) + assert column_width(m_text[:end]) <= n + assert end == m * len(text) or column_width(m_text[: end + 1]) > n + + +def test_text_wrapper_break_stop_after_combining_symbols() -> None: + tilde = '\N{COMBINING TILDE}' + multi = '\N{CJK UNIFIED IDEOGRAPH-65E5}' + + head = tilde + tilde + '....' + tail = multi + tilde + tilde + text = head + tail + assert find_break_end(head + tail, column_width(head)) == len(head) + + +@pytest.mark.parametrize( + ('text', 'results'), + [ + ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}), + ( + 'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!', + { + 1: list('Helloa\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'), + 2: ['He', 'll', 'o', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'], + 3: ['Hel', 'lo', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'], + }, + ), + ( + 'ab c\N{COMBINING TILDE}def', + { + 1: ['a', 'b', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'], + 2: ['ab', 'c\N{COMBINING TILDE}d', 'ef'], + 3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'], + }, + ), + ( + 'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def', + { + 1: [ + 'a', + 'b', + 'c\N{COMBINING TILDE}', + '\N{CJK UNIFIED IDEOGRAPH-65E5}', + 'd', + 'e', + 'f', + ], + 2: [ + 'ab', + 'c\N{COMBINING TILDE}', + '\N{CJK UNIFIED IDEOGRAPH-65E5}', + 'de', + 'f', + ], + 3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'], + }, + ), + ( + 'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def', + { + 1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'], + 2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'], + 3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'], + }, + ), + ], +) +def test_text_wrapper(text: str, results: dict[int, list[str]]) -> None: + for width, expected in results.items(): + w = TextWrapper(width=width, drop_whitespace=True) + assert w.wrap(text) == expected + + +@pytest.mark.parametrize( + ('text', 'results'), + [ + ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}), + ( + 'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!', + { + 1: list('Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'), + 2: ['He', 'll', 'o ', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'], + 3: ['Hel', 'lo ', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'], + }, + ), + ( + 'ab c\N{COMBINING TILDE}def', + { + 1: ['a', 'b', ' ', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'], + 2: ['ab', ' c\N{COMBINING TILDE}', 'de', 'f'], + 3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'], + }, + ), + ( + 'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def', + { + 1: [ + 'a', + 'b', + 'c\N{COMBINING TILDE}', + '\N{CJK UNIFIED IDEOGRAPH-65E5}', + 'd', + 'e', + 'f', + ], + 2: [ + 'ab', + 'c\N{COMBINING TILDE}', + '\N{CJK UNIFIED IDEOGRAPH-65E5}', + 'de', + 'f', + ], + 3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'], + }, + ), + ( + 'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def', + { + 1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'], + 2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'], + 3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'], + }, + ), + ], +) +def test_text_wrapper_drop_ws(text: str, results: dict[int, list[str]]) -> None: + for width, expected in results.items(): + w = TextWrapper(width=width, drop_whitespace=False) + assert w.wrap(text) == expected