From c63b5a2edec9ca8b1aa9ca67a4af6949847380b2 Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Sat, 21 Mar 2026 13:34:35 +0800 Subject: [PATCH 1/3] fix(article): preserve inline hyperlinks in markdown export --- tests/test_client.py | 75 +++++++++++++++++++++++++++++++++++++++++++ twitter_cli/parser.py | 44 ++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/tests/test_client.py b/tests/test_client.py index cb92a14..7bb1d5d 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -31,6 +31,7 @@ _normalize_article_entity_map, _parse_article, _parse_int, + _render_article_text_block, parse_tweet_result, parse_user_result, ) @@ -475,6 +476,31 @@ def test_ignores_blank_markdown(self): assert _extract_atomic_markdown(block, entity_map) == [] +class TestRenderArticleTextBlock: + def test_renders_inline_link_entities_as_markdown(self): + block = { + "text": "Read the docs and the course.", + "entityRanges": [ + {"key": 0, "offset": 9, "length": 4}, + {"key": 1, "offset": 22, "length": 6}, + ], + } + entity_map = { + "0": {"type": "LINK", "data": {"url": "https://docs.example.com"}}, + "1": {"type": "LINK", "data": {"url": "https://course.example.com"}}, + } + + assert _render_article_text_block(block, entity_map) == ( + "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)." 
+ ) + + def test_ignores_non_link_entities(self): + block = {"text": "Intro", "entityRanges": [{"key": 4, "offset": 0, "length": 5}]} + entity_map = {"4": {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}}} + + assert _render_article_text_block(block, entity_map) == "Intro" + + class TestParseArticle: def test_preserves_atomic_markdown_between_text_blocks(self): result = { @@ -624,6 +650,55 @@ def test_preserves_markdown_and_images_in_mixed_atomic_blocks(self): ), } + def test_renders_inline_hyperlinks_from_article_entity_ranges(self): + result = { + "article": { + "article_results": { + "result": { + "title": "Linked article", + "content_state": { + "blocks": [ + { + "key": "a", + "type": "unstyled", + "text": "Read the docs and the course.", + "entityRanges": [ + {"key": 0, "offset": 9, "length": 4}, + {"key": 1, "offset": 22, "length": 6}, + ], + } + ], + "entityMap": [ + { + "key": "0", + "value": { + "type": "LINK", + "data": {"url": "https://docs.example.com"}, + }, + }, + { + "key": "1", + "value": { + "type": "LINK", + "data": {"url": "https://course.example.com"}, + }, + }, + ], + }, + } + } + } + } + + parsed = _parse_article(result) + + assert parsed == { + "article_title": "Linked article", + "article_text": ( + "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)." 
+ ), + } + # ── TwitterClient._parse_tweet_result ───────────────────────────────────── diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py index f0f62c8..d56627a 100644 --- a/twitter_cli/parser.py +++ b/twitter_cli/parser.py @@ -215,6 +215,48 @@ def _extract_atomic_markdown(block, entity_map): return parts +def _render_article_text_block(block, entity_map): + # type: (Dict[str, Any], Dict[str, Any]) -> str + """Render a Draft.js text block, converting inline hyperlinks to Markdown.""" + text = block.get("text", "") + if not isinstance(text, str) or not text: + return "" + + rendered = text + ranges = [] + for entity_range in block.get("entityRanges", []) or []: + if not isinstance(entity_range, dict): + continue + entity_key = entity_range.get("key") + entity = entity_map.get(str(entity_key)) if entity_key is not None else None + if not isinstance(entity, dict): + continue + if str(entity.get("type") or "").upper() != "LINK": + continue + offset = entity_range.get("offset") + length = entity_range.get("length") + if not isinstance(offset, int) or not isinstance(length, int) or length <= 0: + continue + url = _deep_get(entity, "data", "url") + if not isinstance(url, str) or not url.strip(): + continue + ranges.append((offset, length, url.strip())) + + for offset, length, url in sorted(ranges, reverse=True): + if offset < 0 or offset + length > len(rendered): + continue + label = rendered[offset:offset + length] + if not label: + continue + rendered = "%s[%s](%s)%s" % ( + rendered[:offset], + label, + url, + rendered[offset + length:], + ) + return rendered + + def _find_article_caption(value): # type: (Any) -> Optional[str] """Best-effort extraction of image caption/alt text from article entity data.""" @@ -289,7 +331,7 @@ def _parse_article(tweet_data): parts.extend(_extract_article_images(block, entity_map, media_url_map)) ordered_counter = 0 continue - text = block.get("text", "") # type: str + text = _render_article_text_block(block, entity_map) if not 
text: continue if block_type != "ordered-list-item": From b7c7ef826c5935b0f5ef915a07ea8d01964bf30f Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Sat, 21 Mar 2026 14:07:59 +0800 Subject: [PATCH 2/3] test(article): cover hyperlink parser edge cases --- tests/test_client.py | 49 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/test_client.py b/tests/test_client.py index 7bb1d5d..580d792 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -494,12 +494,61 @@ def test_renders_inline_link_entities_as_markdown(self): "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)." ) + def test_returns_empty_string_for_missing_text(self): + assert _render_article_text_block({"entityRanges": []}, {}) == "" + + def test_returns_empty_string_for_non_string_text(self): + assert _render_article_text_block({"text": None, "entityRanges": []}, {}) == "" + + def test_ignores_non_dict_entity_ranges(self): + block = {"text": "Intro", "entityRanges": [None, "bad", {"key": 0, "offset": 0, "length": 5}]} + entity_map = {"0": {"type": "LINK", "data": {"url": "https://example.com"}}} + + assert _render_article_text_block(block, entity_map) == "[Intro](https://example.com)" + + def test_ignores_missing_or_non_dict_entities(self): + block = { + "text": "Docs here", + "entityRanges": [ + {"key": 0, "offset": 0, "length": 4}, + {"key": 1, "offset": 5, "length": 4}, + ], + } + entity_map = {"1": "bad"} + + assert _render_article_text_block(block, entity_map) == "Docs here" + def test_ignores_non_link_entities(self): block = {"text": "Intro", "entityRanges": [{"key": 4, "offset": 0, "length": 5}]} entity_map = {"4": {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}}} assert _render_article_text_block(block, entity_map) == "Intro" + def test_ignores_invalid_offsets_lengths_and_blank_urls(self): + block = { + "text": "Read docs now", + "entityRanges": [ + {"key": 0, "offset": "bad", "length": 4}, 
+ {"key": 1, "offset": 5, "length": 0}, + {"key": 2, "offset": 5, "length": 4}, + {"key": 3, "offset": 20, "length": 3}, + ], + } + entity_map = { + "0": {"type": "LINK", "data": {"url": "https://bad-offset.example.com"}}, + "1": {"type": "LINK", "data": {"url": "https://zero-length.example.com"}}, + "2": {"type": "LINK", "data": {"url": " "}}, + "3": {"type": "LINK", "data": {"url": "https://out-of-bounds.example.com"}}, + } + + assert _render_article_text_block(block, entity_map) == "Read docs now" + + def test_ignores_range_with_empty_label(self): + block = {"text": "abc", "entityRanges": [{"key": 0, "offset": 1, "length": -1}]} + entity_map = {"0": {"type": "LINK", "data": {"url": "https://example.com"}}} + + assert _render_article_text_block(block, entity_map) == "abc" + class TestParseArticle: def test_preserves_atomic_markdown_between_text_blocks(self): From b752c31dfd3c3fcbc86f8bd9cc647c92be18b451 Mon Sep 17 00:00:00 2001 From: jackwener Date: Sat, 21 Mar 2026 17:48:02 +0800 Subject: [PATCH 3/3] improve: add early return, escape markdown brackets in labels and parentheses in URLs - Early return when no entityRanges (skip unnecessary processing) - Escape [ and ] in link labels to prevent nested bracket issues - Encode ) in URLs as %29 to prevent malformed markdown links (e.g. 
Wikipedia) - Add 3 new test cases for the above edge cases --- tests/test_client.py | 20 ++++++++++++++++++++ twitter_cli/parser.py | 13 ++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 580d792..66fbb5e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -549,6 +549,26 @@ def test_ignores_range_with_empty_label(self): assert _render_article_text_block(block, entity_map) == "abc" + def test_returns_plain_text_when_no_entity_ranges(self): + block = {"text": "Hello world"} + assert _render_article_text_block(block, {}) == "Hello world" + + def test_encodes_parentheses_in_url(self): + block = {"text": "see Wiki", "entityRanges": [{"key": 0, "offset": 4, "length": 4}]} + entity_map = {"0": {"type": "LINK", "data": {"url": "https://en.wikipedia.org/wiki/Rust_(programming_language)"}}} + + assert _render_article_text_block(block, entity_map) == ( + "see [Wiki](https://en.wikipedia.org/wiki/Rust_(programming_language%29)" + ) + + def test_escapes_brackets_in_label(self): + block = {"text": "see [docs] now", "entityRanges": [{"key": 0, "offset": 4, "length": 6}]} + entity_map = {"0": {"type": "LINK", "data": {"url": "https://example.com"}}} + + assert _render_article_text_block(block, entity_map) == ( + "see [\\[docs\\]](https://example.com) now" + ) + class TestParseArticle: def test_preserves_atomic_markdown_between_text_blocks(self): diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py index d56627a..a92374b 100644 --- a/twitter_cli/parser.py +++ b/twitter_cli/parser.py @@ -222,9 +222,13 @@ def _render_article_text_block(block, entity_map): if not isinstance(text, str) or not text: return "" + entity_ranges = block.get("entityRanges", []) or [] + if not entity_ranges: + return text + rendered = text ranges = [] - for entity_range in block.get("entityRanges", []) or []: + for entity_range in entity_ranges: if not isinstance(entity_range, dict): continue entity_key = 
entity_range.get("key")
@@ -248,10 +252,13 @@ def _render_article_text_block(block, entity_map):
         label = rendered[offset:offset + length]
         if not label:
             continue
+        # Escape markdown special chars: [ and ] in labels, ) in URLs
+        safe_label = label.replace("[", "\\[").replace("]", "\\]")
+        safe_url = url.replace(")", "%29")
         rendered = "%s[%s](%s)%s" % (
             rendered[:offset],
-            label,
-            url,
+            safe_label,
+            safe_url,
             rendered[offset + length:],
         )
     return rendered