class TestRenderArticleTextBlock:
    """Checks for ``_render_article_text_block`` on Draft.js text blocks."""

    @staticmethod
    def _link(url):
        """Return a LINK entity whose data carries ``url``."""
        return {"type": "LINK", "data": {"url": url}}

    @staticmethod
    def _span(key, offset, length):
        """Return one ``entityRanges`` item."""
        return {"key": key, "offset": offset, "length": length}

    def test_renders_inline_link_entities_as_markdown(self):
        spans = [self._span(0, 9, 4), self._span(1, 22, 6)]
        source = {"text": "Read the docs and the course.", "entityRanges": spans}
        entities = {
            "0": self._link("https://docs.example.com"),
            "1": self._link("https://course.example.com"),
        }

        output = _render_article_text_block(source, entities)

        assert output == (
            "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
        )

    def test_returns_empty_string_for_missing_text(self):
        output = _render_article_text_block({"entityRanges": []}, {})
        assert output == ""

    def test_returns_empty_string_for_non_string_text(self):
        output = _render_article_text_block({"text": None, "entityRanges": []}, {})
        assert output == ""

    def test_ignores_non_dict_entity_ranges(self):
        # Only the final, well-formed range should be honoured.
        source = {
            "text": "Intro",
            "entityRanges": [None, "bad", self._span(0, 0, 5)],
        }
        entities = {"0": self._link("https://example.com")}

        output = _render_article_text_block(source, entities)

        assert output == "[Intro](https://example.com)"

    def test_ignores_missing_or_non_dict_entities(self):
        # Key 0 is absent from the map and key 1 maps to a non-dict value.
        source = {
            "text": "Docs here",
            "entityRanges": [self._span(0, 0, 4), self._span(1, 5, 4)],
        }

        output = _render_article_text_block(source, {"1": "bad"})

        assert output == "Docs here"

    def test_ignores_non_link_entities(self):
        source = {"text": "Intro", "entityRanges": [self._span(4, 0, 5)]}
        entities = {
            "4": {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}},
        }

        output = _render_article_text_block(source, entities)

        assert output == "Intro"

    def test_ignores_invalid_offsets_lengths_and_blank_urls(self):
        # One broken range per failure mode; none of them may produce a link.
        spans = [
            self._span(0, "bad", 4),
            self._span(1, 5, 0),
            self._span(2, 5, 4),
            self._span(3, 20, 3),
        ]
        source = {"text": "Read docs now", "entityRanges": spans}
        entities = {
            "0": self._link("https://bad-offset.example.com"),
            "1": self._link("https://zero-length.example.com"),
            "2": self._link(" "),
            "3": self._link("https://out-of-bounds.example.com"),
        }

        output = _render_article_text_block(source, entities)

        assert output == "Read docs now"

    def test_ignores_range_with_empty_label(self):
        source = {"text": "abc", "entityRanges": [self._span(0, 1, -1)]}
        entities = {"0": self._link("https://example.com")}

        output = _render_article_text_block(source, entities)

        assert output == "abc"

    def test_returns_plain_text_when_no_entity_ranges(self):
        output = _render_article_text_block({"text": "Hello world"}, {})
        assert output == "Hello world"

    def test_encodes_parentheses_in_url(self):
        source = {"text": "see Wiki", "entityRanges": [self._span(0, 4, 4)]}
        entities = {
            "0": self._link("https://en.wikipedia.org/wiki/Rust_(programming_language)"),
        }

        output = _render_article_text_block(source, entities)

        assert output == (
            "see [Wiki](https://en.wikipedia.org/wiki/Rust_(programming_language%29)"
        )

    def test_escapes_brackets_in_label(self):
        source = {"text": "see [docs] now", "entityRanges": [self._span(0, 4, 6)]}
        entities = {"0": self._link("https://example.com")}

        output = _render_article_text_block(source, entities)

        assert output == "see [\\[docs\\]](https://example.com) now"
class TestParseArticle: def test_preserves_atomic_markdown_between_text_blocks(self): result = { @@ -624,6 +719,55 @@ def test_preserves_markdown_and_images_in_mixed_atomic_blocks(self): ), } + def test_renders_inline_hyperlinks_from_article_entity_ranges(self): + result = { + "article": { + "article_results": { + "result": { + "title": "Linked article", + "content_state": { + "blocks": [ + { + "key": "a", + "type": "unstyled", + "text": "Read the docs and the course.", + "entityRanges": [ + {"key": 0, "offset": 9, "length": 4}, + {"key": 1, "offset": 22, "length": 6}, + ], + } + ], + "entityMap": [ + { + "key": "0", + "value": { + "type": "LINK", + "data": {"url": "https://docs.example.com"}, + }, + }, + { + "key": "1", + "value": { + "type": "LINK", + "data": {"url": "https://course.example.com"}, + }, + }, + ], + }, + } + } + } + } + + parsed = _parse_article(result) + + assert parsed == { + "article_title": "Linked article", + "article_text": ( + "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)." 
+ ), + } + # ── TwitterClient._parse_tweet_result ───────────────────────────────────── diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py index f0f62c8..a92374b 100644 --- a/twitter_cli/parser.py +++ b/twitter_cli/parser.py @@ -215,6 +215,55 @@ def _extract_atomic_markdown(block, entity_map): return parts +def _render_article_text_block(block, entity_map): + # type: (Dict[str, Any], Dict[str, Any]) -> str + """Render a Draft.js text block, converting inline hyperlinks to Markdown.""" + text = block.get("text", "") + if not isinstance(text, str) or not text: + return "" + + entity_ranges = block.get("entityRanges", []) or [] + if not entity_ranges: + return text + + rendered = text + ranges = [] + for entity_range in entity_ranges: + if not isinstance(entity_range, dict): + continue + entity_key = entity_range.get("key") + entity = entity_map.get(str(entity_key)) if entity_key is not None else None + if not isinstance(entity, dict): + continue + if str(entity.get("type") or "").upper() != "LINK": + continue + offset = entity_range.get("offset") + length = entity_range.get("length") + if not isinstance(offset, int) or not isinstance(length, int) or length <= 0: + continue + url = _deep_get(entity, "data", "url") + if not isinstance(url, str) or not url.strip(): + continue + ranges.append((offset, length, url.strip())) + + for offset, length, url in sorted(ranges, reverse=True): + if offset < 0 or offset + length > len(rendered): + continue + label = rendered[offset:offset + length] + if not label: + continue + # Escape markdown special chars: ] in labels and ) in URLs + safe_label = label.replace("[", "\\[").replace("]", "\\]") + safe_url = url.replace(")", "%29") + rendered = "%s[%s](%s)%s" % ( + rendered[:offset], + safe_label, + safe_url, + rendered[offset + length:], + ) + return rendered + + def _find_article_caption(value): # type: (Any) -> Optional[str] """Best-effort extraction of image caption/alt text from article entity data.""" @@ -289,7 +338,7 
@@ def _parse_article(tweet_data): parts.extend(_extract_article_images(block, entity_map, media_url_map)) ordered_counter = 0 continue - text = block.get("text", "") # type: str + text = _render_article_text_block(block, entity_map) if not text: continue if block_type != "ordered-list-item":