From 923ab9e573b524d25dae70310b26797ef78e0481 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:02:30 +0200
Subject: [PATCH 1/6] Improve query functions

* Add PubMed to the extracted document identifiers
* Add the ability to pass extra parameters to a query (request_query and
  request_paper). This is needed by the new functions that follow.
* Use the previously unused timeout parameter in request_query (sleep
  before each uncached request)
* Exclude "Too Many Requests" and "Endpoint request timed out" errors from
  raising an Exception; return the magic string "TIMEOUT" instead, so that
  search_semanticscholar can retry with an increasing interval until it
  succeeds or hits a different error.
---
 litstudy/sources/semanticscholar.py | 31 ++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 0ab60bd..a31379e 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -18,6 +18,7 @@ def extract_id(item):
         doi=item.get("doi"),
         arxivid=item.get("arxivId"),
         s2id=item.get("paperId"),
+        pubmed=item.get("pubmed"),
     )
 
 
@@ -96,26 +97,34 @@ def load(id):
 DEFAULT_TIMEOUT = 3.05  # 100 requests per 5 minutes
 
 
-def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
-    params = urlencode(dict(query=query, offset=offset, limit=limit))
-    url = f"{S2_QUERY_URL}?{params}"
+def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    params = dict(query=query, offset=offset, limit=limit)
+    params.update(extraParams)
+    encparams = urlencode(params)
+    url = f"{S2_QUERY_URL}?{encparams}"
 
     if url in cache:
         return cache[url]
 
+    sleep(timeout)
-    reply = session.get(url)
+    reply = session.get(url, timeout=60 * 10)  # generous ten-minute HTTP timeout
     response = reply.json()
 
     if "data" not in response:
         msg = response.get("error") or response.get("message") or "unknown"
-        raise Exception(f"error while fetching {reply.url}: {msg}")
+        if "Too Many Requests." in msg or "Endpoint request timed out" in msg:
+            logging.info(f"request_query: timeout error while fetching {reply.url}: {msg}")
+            return "TIMEOUT"
+        else:
+            raise Exception(f"error while fetching {reply.url}: {msg}")
 
     cache[url] = response
     return response
 
 
-def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
-    url = S2_PAPER_URL + quote_plus(key)
+def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    encparams = urlencode(extraParams)
+    url = S2_PAPER_URL + quote_plus(key) + "?" + encparams
 
     if url in cache:
         return cache[url]
 
@@ -224,6 +233,7 @@ def search_semanticscholar(
     with shelve.open(CACHE_FILE) as cache:
         paper_ids = []
+        retries = 0
 
         while True:
             offset = len(paper_ids)
 
@@ -231,6 +241,13 @@ def search_semanticscholar(
             response = request_query(query, offset, batch_size, cache, session)
             if not response:
                 break
+            if response == "TIMEOUT":
+                retries += 1
+                logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
+                sleep(DEFAULT_TIMEOUT * 4 * retries)
+                continue
+            else:
+                retries = 0
 
             records = response["data"]
             total = response["total"]
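Note for reviewers: request_query callers must now handle the "TIMEOUT"
sentinel themselves. The retry pattern this patch installs in
search_semanticscholar looks like this as standalone code (a sketch run
inside the module, so CACHE_FILE and DEFAULT_TIMEOUT are in scope; the
query string is illustrative):

    import requests
    import shelve
    from time import sleep

    session = requests.Session()
    with shelve.open(CACHE_FILE) as cache:
        retries = 0
        while True:
            response = request_query("deep learning", 0, 100, cache, session)
            if response != "TIMEOUT":
                break  # got a real response dict
            retries += 1
            sleep(DEFAULT_TIMEOUT * 4 * retries)  # back off: 12.2s, 24.4s, 36.6s, ...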
From ef491d3709e1d4f64a73a25864d078ffd6e451ae Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:06:28 +0200
Subject: [PATCH 2/6] Add function to load a json file exported from Semantic
 Scholar
---
 litstudy/__init__.py                |  3 +++
 litstudy/sources/__init__.py        |  3 ++-
 litstudy/sources/semanticscholar.py | 20 ++++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index d96ce64..c860ed0 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -7,6 +7,7 @@
     load_ieee_csv,
     load_ris_file,
     load_scopus_csv,
+    load_semanticscholar_json,
     load_springer_csv,
     refine_crossref,
     refine_scopus,
@@ -120,6 +121,8 @@
     "load_csv",
     "load_ieee_csv",
     "load_ris_file",
+    "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index a5eeccc..848c97b 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar, load_semanticscholar_json
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -19,6 +19,7 @@
     "load_ieee_csv",
     "load_ris_file",
     "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index a31379e..7a274a4 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -4,6 +4,8 @@
 import logging
 import requests
 import shelve
+from ..common import robust_open
+import json
 
 from ..common import progress_bar
 from ..types import Document, Author, DocumentSet, DocumentIdentifier
@@ -273,3 +275,21 @@ def search_semanticscholar(
                 logging.warn(f"could not find paper id {paper_id}")
 
     return DocumentSet(docs)
+
+def load_semanticscholar_json(path: str) -> DocumentSet:
+    """Import a JSON file exported from Semantic Scholar."""
+    docs = []
+    with robust_open(path) as f:
+        result = json.load(f)
+        data = result["data"]
+        for doc in data:
+            # Map the externalIds object (may be missing or null) onto flat keys
+            ids = doc.pop("externalIds", None) or {}
+            for i in ids:
+                if i == "DOI":
+                    doc["doi"] = ids.get("DOI").lower()
+                elif i == "ArXiv":
+                    doc["arxivId"] = ids.get("ArXiv")
+                elif i == "PubMed":
+                    doc["pubmed"] = ids.get("PubMed")
+            docs.append(ScholarDocument(doc))
+    return DocumentSet(docs)
\ No newline at end of file
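Usage sketch for the new loader (the file name is illustrative; the file is a
saved Semantic Scholar search response, i.e. a JSON object with a "data" list):

    import litstudy

    docs = litstudy.load_semanticscholar_json("s2_search_result.json")
    print(len(docs), "documents loaded")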
From 92a96922c22b8fcdfc76ed29eb4f58b928e8e03a Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:50:09 +0200
Subject: [PATCH 3/6] Add function to search 1,000 docs in 10 requests instead
 of 1,000+
---
 litstudy/__init__.py                |  2 +
 litstudy/sources/__init__.py        |  3 +-
 litstudy/sources/semanticscholar.py | 69 ++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index c860ed0..0db24ef 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -17,6 +17,7 @@
     search_dblp,
     search_scopus,
     search_semanticscholar,
+    fastsearch_semanticscholar,
 )
 from .stats import (
     compute_year_histogram,
@@ -132,6 +133,7 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
     "Affiliation",
     "Author",
     "Document",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index 848c97b..401ca27 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar, load_semanticscholar_json
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -29,4 +29,5 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
 ]
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 7a274a4..8797bc3 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -243,6 +243,8 @@ def search_semanticscholar(
             response = request_query(query, offset, batch_size, cache, session)
             if not response:
                 break
+
+            # Retry in case of timeout
             if response == "TIMEOUT":
                 retries += 1
                 logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
@@ -292,4 +294,69 @@ def load_semanticscholar_json(path: str) -> DocumentSet:
                 elif i == "PubMed":
                     doc["pubmed"] = ids.get("PubMed")
             docs.append(ScholarDocument(doc))
-    return DocumentSet(docs)
\ No newline at end of file
+    return DocumentSet(docs)
+
+def fastsearch_semanticscholar(
+    query: str, *, limit: int = 1000, batch_size: int = 100, session=None
+) -> DocumentSet:
+    """Submit the given query to the Semantic Scholar API and return the
+    results as a `DocumentSet`.
+
+    Unlike `search_semanticscholar`, the document metadata is requested
+    together with the search results (via the "fields" parameter), so no
+    additional request per individual paper is needed.
+
+    :param query: The search query to submit.
+    :param limit: The maximum number of results to return. Must be at most 1,000.
+    :param batch_size: The number of results to retrieve per request. Must be at most 100.
+    :param session: The `requests.Session` to use for HTTP requests.
+    """
+
+    if not query:
+        raise Exception("no query specified in `fastsearch_semanticscholar`")
+
+    if session is None:
+        session = requests.Session()
+
+    docs = []
+    fields = "title,authors,year,venue,abstract,citations,references,externalIds"
+
+    with shelve.open(CACHE_FILE) as cache:
+        retries = 0
+        while True:
+            offset = len(docs)
+
+            response = request_query(
+                query, offset, batch_size, cache, session, extraParams={"fields": fields}
+            )
+            if not response:
+                break
+
+            # Retry in case of timeout
+            if response == "TIMEOUT":
+                retries += 1
+                logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
+                sleep(DEFAULT_TIMEOUT * 4 * retries)
+                continue
+            else:
+                retries = 0
+
+            records = response["data"]
+            total = response["total"]
+            logging.info(f"total: {total}, offset: {offset}")
+            for record in records:
+                # Map the externalIds object onto the flat keys ScholarDocument expects
+                ids = record.pop("externalIds", None) or {}
+                for i in ids:
+                    if i == "DOI":
+                        record["doi"] = ids.get("DOI").lower()
+                    elif i == "ArXiv":
+                        record["arxivId"] = ids.get("ArXiv")
+                    elif i == "PubMed":
+                        record["pubmed"] = ids.get("PubMed")
+                docs.append(ScholarDocument(record))
+
+            # Check if we reached the total number of papers
+            if len(docs) >= total:
+                break
+
+            # Check if we exceeded the user-defined limit
+            if limit is not None and len(docs) >= limit:
+                docs = docs[:limit]
+                break
+
+    return DocumentSet(docs)
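Usage sketch (the query string is illustrative). Each request now carries the
"fields" parameter, so one round trip returns up to 100 complete records and
1,000 documents cost only 10 requests:

    import litstudy

    docs = litstudy.fastsearch_semanticscholar("graph neural networks", limit=1000)
    print(len(docs))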
From 0aedf85bb942856aba04a26ade4aaf29d759c866 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:54:46 +0200
Subject: [PATCH 4/6] Add functions to download metadata for referenced docs
---
 litstudy/__init__.py                |  4 +++
 litstudy/sources/__init__.py        |  4 ++-
 litstudy/sources/semanticscholar.py | 46 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index 0db24ef..ea41d37 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -18,6 +18,8 @@
     search_scopus,
     search_semanticscholar,
     fastsearch_semanticscholar,
+    generate_reference_list,
+    mass_fetch_semanticscholar,
 )
 from .stats import (
     compute_year_histogram,
@@ -134,6 +136,8 @@
     "search_scopus",
     "search_semanticscholar",
     "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
     "Affiliation",
     "Author",
     "Document",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index 401ca27..4c5dff6 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json, generate_reference_list, mass_fetch_semanticscholar
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -30,4 +30,6 @@
     "search_scopus",
     "search_semanticscholar",
     "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
 ]
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 8797bc3..82afdea 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -360,3 +360,49 @@ def fastsearch_semanticscholar(
             break
 
     return DocumentSet(docs)
+
+def generate_reference_list(docs: DocumentSet):
+    """Return a list of referenced documents formatted for a
+    fetch_semanticscholar request:
+        s2id:   <id>
+        PubMed: PMID:<id>
+        DOI:    DOI:<id>
+        ArXiv:  ARXIV:<id>
+    """
+    references = []
+    for doc in docs:
+        if doc.references is None:
+            continue
+        for ref in doc.references:
+            # Prefer the DOI, then the S2 id, PubMed id, and arXiv id
+            if ref.doi is not None:
+                references.append("DOI:" + ref.doi)
+            elif ref.s2id is not None:
+                references.append(ref.s2id)
+            elif ref.pubmed is not None:
+                references.append("PMID:" + ref.pubmed)
+            elif ref.arxivid is not None:
+                references.append("ARXIV:" + ref.arxivid)
+    return references
+
+def mass_fetch_semanticscholar(paper_ids: list, session=None) -> DocumentSet:
+    """Fetch the given paper ids (e.g. from generate_reference_list) from the
+    Semantic Scholar API and return the found documents as a `DocumentSet`."""
+    if session is None:
+        session = requests.Session()
+
+    # Remove duplicates
+    paper_ids = list(set(paper_ids))
+
+    docs = []
+
+    with shelve.open(CACHE_FILE) as cache:
+        for paper_id in progress_bar(paper_ids):
+            record = request_paper(paper_id, cache, session)
+            if record:
+                docs.append(ScholarDocument(record))
+            else:
+                logging.warn(f"could not find paper id {paper_id}")
+
+    return DocumentSet(docs)
\ No newline at end of file
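Combined with fastsearch_semanticscholar this enables one-hop backward
snowballing (a sketch; the query string and the example keys in the comments
are illustrative):

    import litstudy

    docs = litstudy.fastsearch_semanticscholar("literature mining", limit=100)
    ids = litstudy.generate_reference_list(docs)  # e.g. ["DOI:10.1000/x", "PMID:123", ...]
    refs = litstudy.mass_fetch_semanticscholar(ids)  # dedupes, then one request per id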
From f0c0bc609928317270b1a4198fed832238bb95b3 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 18:00:44 +0200
Subject: [PATCH 5/6] Bugfix: crossref field lookups

* an author entry may lack "name", so use .get() instead of indexing
* "language" was looked up on self instead of self.entry
---
 litstudy/sources/crossref.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/litstudy/sources/crossref.py b/litstudy/sources/crossref.py
index 7ef67b0..0ea8254 100644
--- a/litstudy/sources/crossref.py
+++ b/litstudy/sources/crossref.py
@@ -44,7 +44,7 @@ def __init__(self, entry):
 
     @property
     def name(self) -> str:
-        return self.entry["name"]
+        return self.entry.get("name")
 
 
 def _extract_title(entry):
@@ -80,7 +80,7 @@ def publisher(self):
 
     @property
     def language(self):
-        return self.get("language")
+        return self.entry.get("language")
 
     @property
     def publication_date(self):

From 253e9185d97734c570df72b4566a2c9000e630da Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 18:02:44 +0200
Subject: [PATCH 6/6] Ignore None values when matching identifiers
---
 litstudy/types.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litstudy/types.py b/litstudy/types.py
index 81f1b25..4337357 100644
--- a/litstudy/types.py
+++ b/litstudy/types.py
@@ -403,6 +403,8 @@ def matches(self, other: "DocumentIdentifier") -> bool:
 
         # Two identifiers match if all keys that they have in common are equal
         for key in self._attr:
+            if self._attr[key] is None:
+                continue
             if key in other._attr:
                 if self._attr[key] != other._attr[key]:
                     return False
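The effect of the last change, as a standalone sketch of the matching rule
(plain dicts stand in for the private _attr mapping of DocumentIdentifier;
the identifier values are illustrative):

    def identifiers_match(a: dict, b: dict) -> bool:
        # Keys the two identifiers have in common must be equal; a None
        # value (an identifier we simply do not know) no longer vetoes it.
        for key, value in a.items():
            if value is None:
                continue
            if key in b and b[key] != value:
                return False
        return True

    # Before this patch, the None arxivid made these two records non-matching:
    assert identifiers_match(
        {"doi": "10.1000/x", "arxivid": None},
        {"doi": "10.1000/x", "arxivid": "2105.00001"},
    )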