This repository was archived by the owner on Jan 29, 2024. It is now read-only.
17 changes: 15 additions & 2 deletions src/bluesearch/entrypoint/database/add_es.py
@@ -115,12 +115,25 @@ def bulk_paragraphs(
    for inp in inputs:
        serialized = inp.read_text("utf-8")
        article = Article.from_json(serialized)
+        # add abstract to paragraphs in order to be able to search for abstracts
+        for i, abstract in enumerate(article.abstract):
+            doc = {
+                "_index": "paragraphs",
+                "_source": {
+                    "article_id": article.uid,
+                    "section": "abstract",
+                    "text": abstract,
+                    "paragraph_id": i,
+                },
+            }
+            yield doc
+        # add body paragraphs
        for ppos, (section, text) in enumerate(article.section_paragraphs):
            doc = {
                "_index": "paragraphs",
                "_source": {
                    "article_id": article.uid,
-                    "section_name": section,
+                    "section": section,
                    "text": text,
                    "paragraph_id": ppos,
Contributor:
If I remember correctly, the goal of this paragraph_id is to hold a unique number (just an incrementing integer) for each paragraph. However, if I am not mistaken, after the introduction of the abstract paragraphs into the paragraphs table we might have collisions.

Contributor:
I agree, maybe we could put "paragraph_id": ppos + i + 1 to avoid any collisions. What do you think @drsantos89 @jankrepl?

                },
@@ -160,7 +173,7 @@ def run(
    if resp[0] == 0:
        raise RuntimeWarning(f"No articles were loaded to ES from '{parsed_path}'!")

-    logger.info("Uploading articles to the database...")
+    logger.info("Uploading paragraphs to the database...")
    progress = tqdm.tqdm(
        desc="Uploading paragraphs", total=len(inputs), unit="articles"
    )
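As a follow-up to the review thread above, here is a minimal sketch of how per-article paragraph IDs could stay unique by running a single counter over abstract and body paragraphs. The helper name iter_paragraph_docs is made up for illustration (it is not the PR's implementation); the article attributes uid, abstract, and section_paragraphs are assumed to behave as in the diff above.

def iter_paragraph_docs(article):
    # Illustrative sketch: one running counter per article, so that
    # (article_id, paragraph_id) never collides between abstract paragraphs
    # and body paragraphs.
    counter = 0
    for abstract in article.abstract:
        yield {
            "_index": "paragraphs",
            "_source": {
                "article_id": article.uid,
                "section": "abstract",
                "text": abstract,
                "paragraph_id": counter,
            },
        }
        counter += 1
    for section, text in article.section_paragraphs:
        yield {
            "_index": "paragraphs",
            "_source": {
                "article_id": article.uid,
                "section": section,
                "text": text,
                "paragraph_id": counter,
            },
        }
        counter += 1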
2 changes: 1 addition & 1 deletion src/bluesearch/k8s/create_indices.py
@@ -49,7 +49,7 @@
        "dynamic": "strict",
        "properties": {
            "article_id": {"type": "keyword"},
-            "section_name": {"type": "keyword"},
+            "section": {"type": "keyword"},
            "paragraph_id": {"type": "short"},
            "text": {"type": "text"},
            "is_bad": {"type": "boolean"},
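For context only (not part of this PR), a lookup that relies on the renamed "section" keyword field might look like the snippet below. client is assumed to be an elasticsearch.Elasticsearch instance, as in the test further down, and the article id "1" is a placeholder.

# Illustrative sketch: fetch all abstract paragraphs of one article via the
# "section" keyword field defined in the mapping above.
resp = client.search(
    index="paragraphs",
    query={
        "bool": {
            "filter": [
                {"term": {"article_id": "1"}},
                {"term": {"section": "abstract"}},
            ]
        }
    },
)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["paragraph_id"], hit["_source"]["text"])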
8 changes: 6 additions & 2 deletions tests/unit/entrypoint/database/test_add_es.py
@@ -72,22 +72,26 @@ def test(get_es_client: Elasticsearch, tmp_path: Path) -> None:

    # verify paragraphs
    resp = client.search(index="paragraphs", query={"match_all": {}})
-    assert resp["hits"]["total"]["value"] == 4
+    assert resp["hits"]["total"]["value"] == 8

    all_docs = set()
    for doc in resp["hits"]["hits"]:
        all_docs.add(
            (
                doc["_source"]["article_id"],
                doc["_source"]["paragraph_id"],
-                doc["_source"]["section_name"],
+                doc["_source"]["section"],
                doc["_source"]["text"],
            )
        )

    all_docs_expected = {
+        ("1", 0, "abstract", "some test abstract"),
+        ("1", 1, "abstract", "abcd"),
        ("1", 0, "intro", "some test section_paragraphs 1client"),
        ("1", 1, "summary", "some test section_paragraphs 2"),
+        ("2", 0, "abstract", "dsaklf"),
+        ("2", 1, "abstract", "abcd"),
        ("2", 0, "intro", "some TESTTT section_paragraphs 1client"),
        ("2", 1, "summary", "some other test section_paragraphs 2"),
    }