From a2cb29ebc12d555e925d7a266941d813cf7574ec Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Sun, 26 Oct 2025 21:14:07 +0400 Subject: [PATCH 1/3] Fix failing wcwidth "wide character" tests by reverting bad ANSI code --- tabulate/__init__.py | 52 +++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/tabulate/__init__.py b/tabulate/__init__.py index e100c09..3e4da13 100644 --- a/tabulate/__init__.py +++ b/tabulate/__init__.py @@ -2738,29 +2738,45 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. - if self.break_long_words: + + # Reverted the broken ANSI code handling stuff to fix wcwidth handling + # - Doesn't use self._lend, infinite loops + # - doesn't locate chunks correctly b/c could be split by ANSI codes + # + # if self.break_long_words and space_left > 0: + # # Tabulate Custom: Build the string up piece-by-piece in order to + # # take each charcter's width into account + # chunk = reversed_chunks[-1] + # # Only count printable characters, so strip_ansi first, index later. + # for i in range( 1, space_left + 1 ): + # if self._len(_strip_ansi(chunk)[:i]) > space_left: + # break + # + # # Consider escape codes when breaking words up + # total_escape_len = 0 + # last_group = 0 + # if _ansi_codes.search(chunk) is not None: + # for group, _, _, _ in _ansi_codes.findall(chunk): + # escape_len = len(group) + # if ( + # group + # in chunk[last_group : i + total_escape_len + escape_len - 1] + # ): + # total_escape_len += escape_len + # found = _ansi_codes.search(chunk[last_group:]) + # last_group += found.end() + # cur_line.append(chunk[: i + total_escape_len - 1]) + # reversed_chunks[-1] = chunk[i + total_escape_len - 1 :] + + if self.break_long_words: # and space_left > 0: # Tabulate Custom: Build the string up piece-by-piece in order to # take each charcter's width into account chunk = reversed_chunks[-1] i = 1 - # Only count printable characters, so strip_ansi first, index later. - while len(_strip_ansi(chunk)[:i]) <= space_left: + while self._len(chunk[:i]) <= space_left: i = i + 1 - # Consider escape codes when breaking words up - total_escape_len = 0 - last_group = 0 - if _ansi_codes.search(chunk) is not None: - for group, _, _, _ in _ansi_codes.findall(chunk): - escape_len = len(group) - if ( - group - in chunk[last_group : i + total_escape_len + escape_len - 1] - ): - total_escape_len += escape_len - found = _ansi_codes.search(chunk[last_group:]) - last_group += found.end() - cur_line.append(chunk[: i + total_escape_len - 1]) - reversed_chunks[-1] = chunk[i + total_escape_len - 1 :] + cur_line.append(chunk[: i - 1]) + reversed_chunks[-1] = chunk[i - 1 :] # Otherwise, we have to preserve the long word intact. Only add # it to the current line if there's nothing already there -- From 2692e1afb54caba9b668cbf8bc5efe75b9422843 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 27 Oct 2025 11:24:53 +0400 Subject: [PATCH 2/3] Fix and test long word wrapping o Tests pass with/without wcwidth module installed o Include some linting changes --- tabulate/__init__.py | 88 +++++++++++++++++++++------------------- test/test_textwrapper.py | 35 ++++++++++++++++ 2 files changed, 81 insertions(+), 42 deletions(-) diff --git a/tabulate/__init__.py b/tabulate/__init__.py index 3e4da13..909293c 100644 --- a/tabulate/__init__.py +++ b/tabulate/__init__.py @@ -1638,7 +1638,13 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): return rows, headers, headers_pad -def _wrap_text_to_colwidths(list_of_lists, colwidths, numparses=True, break_long_words=_BREAK_LONG_WORDS, break_on_hyphens=_BREAK_ON_HYPHENS): +def _wrap_text_to_colwidths( + list_of_lists, + colwidths, + numparses=True, + break_long_words=_BREAK_LONG_WORDS, + break_on_hyphens=_BREAK_ON_HYPHENS, +): if len(list_of_lists): num_cols = len(list_of_lists[0]) else: @@ -1655,7 +1661,11 @@ def _wrap_text_to_colwidths(list_of_lists, colwidths, numparses=True, break_long continue if width is not None: - wrapper = _CustomTextWrap(width=width, break_long_words=break_long_words, break_on_hyphens=break_on_hyphens) + wrapper = _CustomTextWrap( + width=width, + break_long_words=break_long_words, + break_on_hyphens=break_on_hyphens, + ) casted_cell = str(cell) wrapped = [ "\n".join(wrapper.wrap(line)) @@ -2258,7 +2268,11 @@ def tabulate( numparses = _expand_numparse(disable_numparse, num_cols) list_of_lists = _wrap_text_to_colwidths( - list_of_lists, maxcolwidths, numparses=numparses, break_long_words=break_long_words, break_on_hyphens=break_on_hyphens + list_of_lists, + maxcolwidths, + numparses=numparses, + break_long_words=break_long_words, + break_on_hyphens=break_on_hyphens, ) if maxheadercolwidths is not None: @@ -2272,7 +2286,11 @@ def tabulate( numparses = _expand_numparse(disable_numparse, num_cols) headers = _wrap_text_to_colwidths( - [headers], maxheadercolwidths, numparses=numparses, break_long_words=break_long_words, break_on_hyphens=break_on_hyphens + [headers], + maxheadercolwidths, + numparses=numparses, + break_long_words=break_long_words, + break_on_hyphens=break_on_hyphens, )[0] # empty values in the first column of RST tables should be escaped (issue #82) @@ -2737,46 +2755,32 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): space_left = width - cur_len # If we're allowed to break long words, then do so: put as much - # of the next chunk onto the current line as will fit. - - # Reverted the broken ANSI code handling stuff to fix wcwidth handling - # - Doesn't use self._lend, infinite loops - # - doesn't locate chunks correctly b/c could be split by ANSI codes - # - # if self.break_long_words and space_left > 0: - # # Tabulate Custom: Build the string up piece-by-piece in order to - # # take each charcter's width into account - # chunk = reversed_chunks[-1] - # # Only count printable characters, so strip_ansi first, index later. - # for i in range( 1, space_left + 1 ): - # if self._len(_strip_ansi(chunk)[:i]) > space_left: - # break - # - # # Consider escape codes when breaking words up - # total_escape_len = 0 - # last_group = 0 - # if _ansi_codes.search(chunk) is not None: - # for group, _, _, _ in _ansi_codes.findall(chunk): - # escape_len = len(group) - # if ( - # group - # in chunk[last_group : i + total_escape_len + escape_len - 1] - # ): - # total_escape_len += escape_len - # found = _ansi_codes.search(chunk[last_group:]) - # last_group += found.end() - # cur_line.append(chunk[: i + total_escape_len - 1]) - # reversed_chunks[-1] = chunk[i + total_escape_len - 1 :] - - if self.break_long_words: # and space_left > 0: + # of the next chunk onto the current line as will fit. Be careful + # of empty chunks after ANSI codes removed. + chunk = reversed_chunks[-1] + chunk_noansi = _strip_ansi(chunk) + if self.break_long_words and chunk_noansi: # Tabulate Custom: Build the string up piece-by-piece in order to # take each charcter's width into account - chunk = reversed_chunks[-1] - i = 1 - while self._len(chunk[:i]) <= space_left: - i = i + 1 - cur_line.append(chunk[: i - 1]) - reversed_chunks[-1] = chunk[i - 1 :] + # Only count printable characters, so strip_ansi first, index later. + for i in range(1, len(chunk_noansi) + 1): + if self._len(chunk_noansi[:i]) > space_left: + break + # Consider escape codes when breaking words up + total_escape_len = 0 + last_group = 0 + if _ansi_codes.search(chunk) is not None: + for group, _, _, _ in _ansi_codes.findall(chunk): + escape_len = len(group) + if ( + group + in chunk[last_group : i + total_escape_len + escape_len - 1] + ): + total_escape_len += escape_len + found = _ansi_codes.search(chunk[last_group:]) + last_group += found.end() + cur_line.append(chunk[: i + total_escape_len - 1]) + reversed_chunks[-1] = chunk[i + total_escape_len - 1 :] # Otherwise, we have to preserve the long word intact. Only add # it to the current line if there's nothing already there -- diff --git a/test/test_textwrapper.py b/test/test_textwrapper.py index 46dd818..ce1b75c 100644 --- a/test/test_textwrapper.py +++ b/test/test_textwrapper.py @@ -176,6 +176,41 @@ def test_wrap_color_line_longword(): assert_equal(expected, result) +def test_wrap_color_line_longword_zerowidth(): + """Lines with zero-width symbols (eg. accents) must include those symbols with the prior symbol. + Let's exercise the calculation where the available symbols never satisfy the available width, + and ensure chunk calculation succeeds and ANSI colors are maintained. + + Most combining marks combine with the preceding character (even in right-to-left alphabets): + - "e\u0301" → "é" (e + combining acute accent) + - "a\u0308" → "ä" (a + combining diaeresis) + - "n\u0303" → "ñ" (n + combining tilde) + Enclosing Marks: Some combining marks enclose the base character: + - "A\u20DD" → Ⓐ Combining enclosing circle + Multiple Combining Marks: You can stack multiple combining marks on a single base character: + - "e\u0301\u0308" → e with both acute accent and diaeresis + Zero width space → "ab" with a : + - "a\u200Bb" + + """ + try: + import wcwidth # noqa + except ImportError: + skip("test_wrap_wide_char is skipped") + + # Exactly filled, with a green zero-width segment at the end. + data = "This_is_A\u20DD_\033[31mte\u0301st_string_\u200bto_te\u0301\u0308st_a\u0308ccent\033[32m\u200b\033[0m" + + expected = [ + "This_is_A\u20DD_\033[31mte\u0301\033[0m", + "\033[31mst_string_\u200bto\033[0m", + "\033[31m_te\u0301\u0308st_a\u0308ccent\033[32m\u200b\033[0m", + ] + wrapper = CTW(width=12) + result = wrapper.wrap(data) + assert_equal(expected, result) + + def test_wrap_color_line_multiple_escapes(): data = "012345(\x1b[32ma\x1b[0mbc\x1b[32mdefghij\x1b[0m)" expected = [ From abe2989df1acfcae2ce1b3d0c92b05a17b6766eb Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 27 Oct 2025 11:36:19 +0400 Subject: [PATCH 3/3] Fix some line length issues --- test/test_textwrapper.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_textwrapper.py b/test/test_textwrapper.py index ce1b75c..c0aa4c6 100644 --- a/test/test_textwrapper.py +++ b/test/test_textwrapper.py @@ -177,7 +177,7 @@ def test_wrap_color_line_longword(): def test_wrap_color_line_longword_zerowidth(): - """Lines with zero-width symbols (eg. accents) must include those symbols with the prior symbol. + """Lines with zero-width symbols (accents) must include those symbols with the prior symbol. Let's exercise the calculation where the available symbols never satisfy the available width, and ensure chunk calculation succeeds and ANSI colors are maintained. @@ -199,7 +199,10 @@ def test_wrap_color_line_longword_zerowidth(): skip("test_wrap_wide_char is skipped") # Exactly filled, with a green zero-width segment at the end. - data = "This_is_A\u20DD_\033[31mte\u0301st_string_\u200bto_te\u0301\u0308st_a\u0308ccent\033[32m\u200b\033[0m" + data = ( + "This_is_A\u20DD_\033[31mte\u0301st_string_\u200b" + "to_te\u0301\u0308st_a\u0308ccent\033[32m\u200b\033[0m" + ) expected = [ "This_is_A\u20DD_\033[31mte\u0301\033[0m",