Skip to content

Commit 10b69ff

Browse files
authored
Accounting for media in Text comparators (#1178)
1 parent 54eac6c commit 10b69ff

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

src/paperqa/types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,12 @@ def __eq__(self, other) -> bool:
167 167
return (
168 168
self.name == other.name
169 169
and self.text == other.text
170+
and self.media == other.media
170 171
and self.doc == other.doc
171 172
)
172 173

173 174
def __hash__(self) -> int:
174-
return hash((self.name, self.text))
175+
return hash((self.name, self.text, tuple(self.media)))
175 176

176 177
async def get_embeddable_text(self, with_enrichment: bool = False) -> str:
177 178
"""Get the text to embed, which may be different from the actual text content.

tests/test_paperqa.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3190,3 +3190,31 @@ async def test_parse_office_doc(stub_data_dir: Path, filename: str, query: str)
3190 3190
assert session.used_contexts
3191 3191
assert len(session.answer) > 10, "Expected an answer"
3192 3192
assert CANNOT_ANSWER_PHRASE not in session.answer, "Expected the system to be sure"
3193+
3194+
3195+
def test_text_comparison() -> None:
    """Check that ``Text`` equality and hashing take attached media into account.

    Every ``Text`` built here shares the same text, name, and doc, so any
    difference in comparison or hash results comes from the ``media`` argument.
    """
    doc = Doc(docname="test", citation="test", dockey="test")
    media1 = ParsedMedia(index=0, data=b"image_data_1")
    media2 = ParsedMedia(index=1, data=b"image_data_2")

    def build(**extra):
        # Forward ``media`` only when the caller supplies it, so the
        # no-media case omits the keyword entirely.
        return Text(text="Hello", name="chunk1", doc=doc, **extra)

    # Two media-free instances compare equal and hash alike.
    plain_a = build()
    plain_b = build()
    assert plain_a == plain_b
    assert hash(plain_a) == hash(plain_b)

    # Instances carrying the same media compare equal and hash alike.
    with_media_a = build(media=[media1])
    with_media_b = build(media=[media1])
    assert with_media_a == with_media_b
    assert hash(with_media_a) == hash(with_media_b)

    # Swapping in different media breaks both equality and hash agreement.
    other_media = build(media=[media2])
    assert with_media_a != other_media
    assert hash(with_media_a) != hash(other_media)

    # Having media vs. having none is also a distinguishing difference.
    assert with_media_a != plain_a
    assert hash(with_media_a) != hash(plain_a)

    # In a set, the two same-media instances collapse into one entry.
    assert len({with_media_a, with_media_b, other_media}) == 2

0 commit comments

Comments
 (0)