Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
_normalize_article_entity_map,
_parse_article,
_parse_int,
_render_article_text_block,
parse_tweet_result,
parse_user_result,
)
Expand Down Expand Up @@ -475,6 +476,100 @@ def test_ignores_blank_markdown(self):
assert _extract_atomic_markdown(block, entity_map) == []


class TestRenderArticleTextBlock:
    """Unit tests for _render_article_text_block's inline-link rendering."""

    def test_renders_inline_link_entities_as_markdown(self):
        entities = {
            "0": {"type": "LINK", "data": {"url": "https://docs.example.com"}},
            "1": {"type": "LINK", "data": {"url": "https://course.example.com"}},
        }
        ranges = [
            {"key": 0, "offset": 9, "length": 4},
            {"key": 1, "offset": 22, "length": 6},
        ]
        rendered = _render_article_text_block(
            {"text": "Read the docs and the course.", "entityRanges": ranges},
            entities,
        )
        assert rendered == (
            "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
        )

    def test_returns_empty_string_for_missing_text(self):
        # No "text" key at all should degrade to an empty string.
        assert _render_article_text_block({"entityRanges": []}, {}) == ""

    def test_returns_empty_string_for_non_string_text(self):
        # A non-str "text" value should also degrade to an empty string.
        assert _render_article_text_block({"text": None, "entityRanges": []}, {}) == ""

    def test_ignores_non_dict_entity_ranges(self):
        rendered = _render_article_text_block(
            {"text": "Intro", "entityRanges": [None, "bad", {"key": 0, "offset": 0, "length": 5}]},
            {"0": {"type": "LINK", "data": {"url": "https://example.com"}}},
        )
        assert rendered == "[Intro](https://example.com)"

    def test_ignores_missing_or_non_dict_entities(self):
        # Key 0 is absent from the map; key 1 resolves to a non-dict value.
        ranges = [
            {"key": 0, "offset": 0, "length": 4},
            {"key": 1, "offset": 5, "length": 4},
        ]
        rendered = _render_article_text_block(
            {"text": "Docs here", "entityRanges": ranges},
            {"1": "bad"},
        )
        assert rendered == "Docs here"

    def test_ignores_non_link_entities(self):
        entities = {"4": {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}}}
        rendered = _render_article_text_block(
            {"text": "Intro", "entityRanges": [{"key": 4, "offset": 0, "length": 5}]},
            entities,
        )
        assert rendered == "Intro"

    def test_ignores_invalid_offsets_lengths_and_blank_urls(self):
        # Each range is defective in a different way: non-int offset,
        # zero length, whitespace-only URL, and an out-of-bounds span.
        ranges = [
            {"key": 0, "offset": "bad", "length": 4},
            {"key": 1, "offset": 5, "length": 0},
            {"key": 2, "offset": 5, "length": 4},
            {"key": 3, "offset": 20, "length": 3},
        ]
        entities = {
            "0": {"type": "LINK", "data": {"url": "https://bad-offset.example.com"}},
            "1": {"type": "LINK", "data": {"url": "https://zero-length.example.com"}},
            "2": {"type": "LINK", "data": {"url": " "}},
            "3": {"type": "LINK", "data": {"url": "https://out-of-bounds.example.com"}},
        }
        rendered = _render_article_text_block(
            {"text": "Read docs now", "entityRanges": ranges}, entities
        )
        assert rendered == "Read docs now"

    def test_ignores_range_with_empty_label(self):
        rendered = _render_article_text_block(
            {"text": "abc", "entityRanges": [{"key": 0, "offset": 1, "length": -1}]},
            {"0": {"type": "LINK", "data": {"url": "https://example.com"}}},
        )
        assert rendered == "abc"

    def test_returns_plain_text_when_no_entity_ranges(self):
        assert _render_article_text_block({"text": "Hello world"}, {}) == "Hello world"

    def test_encodes_parentheses_in_url(self):
        entities = {"0": {"type": "LINK", "data": {"url": "https://en.wikipedia.org/wiki/Rust_(programming_language)"}}}
        rendered = _render_article_text_block(
            {"text": "see Wiki", "entityRanges": [{"key": 0, "offset": 4, "length": 4}]},
            entities,
        )
        assert rendered == (
            "see [Wiki](https://en.wikipedia.org/wiki/Rust_(programming_language%29)"
        )

    def test_escapes_brackets_in_label(self):
        rendered = _render_article_text_block(
            {"text": "see [docs] now", "entityRanges": [{"key": 0, "offset": 4, "length": 6}]},
            {"0": {"type": "LINK", "data": {"url": "https://example.com"}}},
        )
        assert rendered == (
            "see [\\[docs\\]](https://example.com) now"
        )


class TestParseArticle:
def test_preserves_atomic_markdown_between_text_blocks(self):
result = {
Expand Down Expand Up @@ -624,6 +719,55 @@ def test_preserves_markdown_and_images_in_mixed_atomic_blocks(self):
),
}

def test_renders_inline_hyperlinks_from_article_entity_ranges(self):
result = {
"article": {
"article_results": {
"result": {
"title": "Linked article",
"content_state": {
"blocks": [
{
"key": "a",
"type": "unstyled",
"text": "Read the docs and the course.",
"entityRanges": [
{"key": 0, "offset": 9, "length": 4},
{"key": 1, "offset": 22, "length": 6},
],
}
],
"entityMap": [
{
"key": "0",
"value": {
"type": "LINK",
"data": {"url": "https://docs.example.com"},
},
},
{
"key": "1",
"value": {
"type": "LINK",
"data": {"url": "https://course.example.com"},
},
},
],
},
}
}
}
}

parsed = _parse_article(result)

assert parsed == {
"article_title": "Linked article",
"article_text": (
"Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
),
}


# ── TwitterClient._parse_tweet_result ─────────────────────────────────────

Expand Down
51 changes: 50 additions & 1 deletion twitter_cli/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,55 @@ def _extract_atomic_markdown(block, entity_map):
return parts


def _render_article_text_block(block, entity_map):
# type: (Dict[str, Any], Dict[str, Any]) -> str
"""Render a Draft.js text block, converting inline hyperlinks to Markdown."""
text = block.get("text", "")
if not isinstance(text, str) or not text:
return ""

entity_ranges = block.get("entityRanges", []) or []
if not entity_ranges:
return text

rendered = text
ranges = []
for entity_range in entity_ranges:
if not isinstance(entity_range, dict):
continue
entity_key = entity_range.get("key")
entity = entity_map.get(str(entity_key)) if entity_key is not None else None
if not isinstance(entity, dict):
continue
if str(entity.get("type") or "").upper() != "LINK":
continue
offset = entity_range.get("offset")
length = entity_range.get("length")
if not isinstance(offset, int) or not isinstance(length, int) or length <= 0:
continue
url = _deep_get(entity, "data", "url")
if not isinstance(url, str) or not url.strip():
continue
ranges.append((offset, length, url.strip()))

for offset, length, url in sorted(ranges, reverse=True):
if offset < 0 or offset + length > len(rendered):
continue
label = rendered[offset:offset + length]
if not label:
continue
# Escape markdown special chars: ] in labels and ) in URLs
safe_label = label.replace("[", "\\[").replace("]", "\\]")
safe_url = url.replace(")", "%29")
rendered = "%s[%s](%s)%s" % (
rendered[:offset],
safe_label,
safe_url,
rendered[offset + length:],
)
return rendered


def _find_article_caption(value):
# type: (Any) -> Optional[str]
"""Best-effort extraction of image caption/alt text from article entity data."""
Expand Down Expand Up @@ -289,7 +338,7 @@ def _parse_article(tweet_data):
parts.extend(_extract_article_images(block, entity_map, media_url_map))
ordered_counter = 0
continue
text = block.get("text", "") # type: str
text = _render_article_text_block(block, entity_map)
if not text:
continue
if block_type != "ordered-list-item":
Expand Down
Loading