From 923ab9e573b524d25dae70310b26797ef78e0481 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:02:30 +0200
Subject: [PATCH 1/6] Improve query functions

* Add PubMed to the extracted document identifiers
* Add the ability to pass extra parameters to a query (request_query and
  request_paper). This is needed by the new functions that follow.
* Use the previously unused timeout parameter in request_query (sleep
  before each uncached request)
* Exclude "Too Many Requests" and "Endpoint request timed out" errors from
  raising an Exception; return the magic string "TIMEOUT" instead, so that
  search_semanticscholar can retry with an increasing interval until it
  succeeds or hits a different error.
---
 litstudy/sources/semanticscholar.py | 31 ++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 0ab60bd..a31379e 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -18,6 +18,7 @@ def extract_id(item):
         doi=item.get("doi"),
         arxivid=item.get("arxivId"),
         s2id=item.get("paperId"),
+        pubmed=item.get("pubmed"),
     )
 
 
@@ -96,26 +97,34 @@ def load(id):
 DEFAULT_TIMEOUT = 3.05  # 100 requests per 5 minutes
 
 
-def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
-    params = urlencode(dict(query=query, offset=offset, limit=limit))
-    url = f"{S2_QUERY_URL}?{params}"
+def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    params = dict(query=query, offset=offset, limit=limit)
+    params.update(extraParams)
+    encparams = urlencode(params)
+    url = f"{S2_QUERY_URL}?{encparams}"
 
     if url in cache:
         return cache[url]
 
+    sleep(timeout)
-    reply = session.get(url)
+    reply = session.get(url, timeout=60 * 10)  # generous ten-minute HTTP timeout
     response = reply.json()
 
     if "data" not in response:
         msg = response.get("error") or response.get("message") or "unknown"
-        raise Exception(f"error while fetching {reply.url}: {msg}")
+        if "Too Many Requests." in msg or "Endpoint request timed out" in msg:
+            logging.info(f"request_query: timeout error while fetching {reply.url}: {msg}")
+            return "TIMEOUT"
+        else:
+            raise Exception(f"error while fetching {reply.url}: {msg}")
 
     cache[url] = response
     return response
 
 
-def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
-    url = S2_PAPER_URL + quote_plus(key)
+def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    encparams = urlencode(extraParams)
+    url = S2_PAPER_URL + quote_plus(key) + "?" + encparams
 
     if url in cache:
         return cache[url]
 
@@ -224,6 +233,7 @@ def search_semanticscholar(
     with shelve.open(CACHE_FILE) as cache:
         paper_ids = []
+        retries = 0
 
         while True:
             offset = len(paper_ids)
 
@@ -231,6 +241,13 @@ def search_semanticscholar(
             response = request_query(query, offset, batch_size, cache, session)
             if not response:
                 break
+            if response == "TIMEOUT":
+                retries += 1
+                logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
+                sleep(DEFAULT_TIMEOUT * 4 * retries)
+                continue
+            else:
+                retries = 0
 
             records = response["data"]
             total = response["total"]
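Note for reviewers: request_query callers must now handle the "TIMEOUT"
sentinel themselves. The retry pattern this patch installs in
search_semanticscholar looks like this as standalone code (a sketch run
inside the module, so CACHE_FILE and DEFAULT_TIMEOUT are in scope; the
query string is illustrative):

    import requests
    import shelve
    from time import sleep

    session = requests.Session()
    with shelve.open(CACHE_FILE) as cache:
        retries = 0
        while True:
            response = request_query("deep learning", 0, 100, cache, session)
            if response != "TIMEOUT":
                break  # got a real response dict
            retries += 1
            sleep(DEFAULT_TIMEOUT * 4 * retries)  # back off: 12.2s, 24.4s, 36.6s, ...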
From ef491d3709e1d4f64a73a25864d078ffd6e451ae Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:06:28 +0200
Subject: [PATCH 2/6] Add function to load a json file exported from Semantic
 Scholar
---
 litstudy/__init__.py                |  3 +++
 litstudy/sources/__init__.py        |  3 ++-
 litstudy/sources/semanticscholar.py | 20 ++++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index d96ce64..c860ed0 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -7,6 +7,7 @@
     load_ieee_csv,
     load_ris_file,
     load_scopus_csv,
+    load_semanticscholar_json,
     load_springer_csv,
     refine_crossref,
     refine_scopus,
@@ -120,6 +121,8 @@
     "load_csv",
     "load_ieee_csv",
     "load_ris_file",
+    "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index a5eeccc..848c97b 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar, load_semanticscholar_json
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -19,6 +19,7 @@
     "load_ieee_csv",
     "load_ris_file",
     "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index a31379e..7a274a4 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -4,6 +4,8 @@
 import logging
 import requests
 import shelve
+from ..common import robust_open
+import json
 
 from ..common import progress_bar
 from ..types import Document, Author, DocumentSet, DocumentIdentifier
@@ -273,3 +275,21 @@ def search_semanticscholar(
                 logging.warn(f"could not find paper id {paper_id}")
 
     return DocumentSet(docs)
+
+def load_semanticscholar_json(path: str) -> DocumentSet:
+    """Import a JSON file exported from Semantic Scholar."""
+    docs = []
+    with robust_open(path) as f:
+        result = json.load(f)
+        data = result["data"]
+        for doc in data:
+            # Map the externalIds object (may be missing or null) onto flat keys
+            ids = doc.pop("externalIds", None) or {}
+            for i in ids:
+                if i == "DOI":
+                    doc["doi"] = ids.get("DOI").lower()
+                elif i == "ArXiv":
+                    doc["arxivId"] = ids.get("ArXiv")
+                elif i == "PubMed":
+                    doc["pubmed"] = ids.get("PubMed")
+            docs.append(ScholarDocument(doc))
+    return DocumentSet(docs)
\ No newline at end of file
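Usage sketch for the new loader (the file name is illustrative; the file is a
saved Semantic Scholar search response, i.e. a JSON object with a "data" list):

    import litstudy

    docs = litstudy.load_semanticscholar_json("s2_search_result.json")
    print(len(docs), "documents loaded")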
From 92a96922c22b8fcdfc76ed29eb4f58b928e8e03a Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:50:09 +0200
Subject: [PATCH 3/6] Add function to search 1,000 docs in 10 requests instead
 of 1,000+
---
 litstudy/__init__.py                |  2 +
 litstudy/sources/__init__.py        |  3 +-
 litstudy/sources/semanticscholar.py | 69 ++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index c860ed0..0db24ef 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -17,6 +17,7 @@
     search_dblp,
     search_scopus,
     search_semanticscholar,
+    fastsearch_semanticscholar,
 )
 from .stats import (
     compute_year_histogram,
@@ -132,6 +133,7 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
     "Affiliation",
     "Author",
     "Document",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index 848c97b..401ca27 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar, load_semanticscholar_json
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -29,4 +29,5 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
 ]
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 7a274a4..8797bc3 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -243,6 +243,8 @@ def search_semanticscholar(
             response = request_query(query, offset, batch_size, cache, session)
             if not response:
                 break
+
+            # Retry in case of timeout
             if response == "TIMEOUT":
                 retries += 1
                 logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
@@ -292,4 +294,69 @@ def load_semanticscholar_json(path: str) -> DocumentSet:
                 elif i == "PubMed":
                     doc["pubmed"] = ids.get("PubMed")
             docs.append(ScholarDocument(doc))
-    return DocumentSet(docs)
\ No newline at end of file
+    return DocumentSet(docs)
+
+def fastsearch_semanticscholar(
+    query: str, *, limit: int = 1000, batch_size: int = 100, session=None
+) -> DocumentSet:
+    """Submit the given query to the Semantic Scholar API and return the
+    results as a `DocumentSet`.
+
+    Unlike `search_semanticscholar`, the document metadata is requested
+    together with the search results (via the "fields" parameter), so no
+    additional request per individual paper is needed.
+
+    :param query: The search query to submit.
+    :param limit: The maximum number of results to return. Must be at most 1,000.
+    :param batch_size: The number of results to retrieve per request. Must be at most 100.
+    :param session: The `requests.Session` to use for HTTP requests.
+    """
+
+    if not query:
+        raise Exception("no query specified in `fastsearch_semanticscholar`")
+
+    if session is None:
+        session = requests.Session()
+
+    docs = []
+    fields = "title,authors,year,venue,abstract,citations,references,externalIds"
+
+    with shelve.open(CACHE_FILE) as cache:
+        retries = 0
+        while True:
+            offset = len(docs)
+
+            response = request_query(
+                query, offset, batch_size, cache, session, extraParams={"fields": fields}
+            )
+            if not response:
+                break
+
+            # Retry in case of timeout
+            if response == "TIMEOUT":
+                retries += 1
+                logging.info(f"Timeout, backing off for {DEFAULT_TIMEOUT * 4 * retries}s")
+                sleep(DEFAULT_TIMEOUT * 4 * retries)
+                continue
+            else:
+                retries = 0
+
+            records = response["data"]
+            total = response["total"]
+            logging.info(f"total: {total}, offset: {offset}")
+            for record in records:
+                # Map the externalIds object onto the flat keys ScholarDocument expects
+                ids = record.pop("externalIds", None) or {}
+                for i in ids:
+                    if i == "DOI":
+                        record["doi"] = ids.get("DOI").lower()
+                    elif i == "ArXiv":
+                        record["arxivId"] = ids.get("ArXiv")
+                    elif i == "PubMed":
+                        record["pubmed"] = ids.get("PubMed")
+                docs.append(ScholarDocument(record))
+
+            # Check if we reached the total number of papers
+            if len(docs) >= total:
+                break
+
+            # Check if we exceeded the user-defined limit
+            if limit is not None and len(docs) >= limit:
+                docs = docs[:limit]
+                break
+
+    return DocumentSet(docs)
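Usage sketch (the query string is illustrative). Each request now carries the
"fields" parameter, so one round trip returns up to 100 complete records and
1,000 documents cost only 10 requests:

    import litstudy

    docs = litstudy.fastsearch_semanticscholar("graph neural networks", limit=1000)
    print(len(docs))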
From 0aedf85bb942856aba04a26ade4aaf29d759c866 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 17:54:46 +0200
Subject: [PATCH 4/6] Add functions to download metadata for referenced docs
---
 litstudy/__init__.py                |  4 +++
 litstudy/sources/__init__.py        |  4 ++-
 litstudy/sources/semanticscholar.py | 46 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index 0db24ef..ea41d37 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -18,6 +18,8 @@
     search_scopus,
     search_semanticscholar,
     fastsearch_semanticscholar,
+    generate_reference_list,
+    mass_fetch_semanticscholar,
 )
 from .stats import (
     compute_year_histogram,
@@ -134,6 +136,8 @@
     "search_scopus",
     "search_semanticscholar",
     "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
     "Affiliation",
     "Author",
     "Document",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index 401ca27..4c5dff6 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json, generate_reference_list, mass_fetch_semanticscholar
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -30,4 +30,6 @@
     "search_scopus",
     "search_semanticscholar",
     "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
 ]
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 8797bc3..82afdea 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -360,3 +360,49 @@ def fastsearch_semanticscholar(
             break
 
     return DocumentSet(docs)
+
+def generate_reference_list(docs: DocumentSet):
+    """Return a list of referenced documents formatted for a
+    fetch_semanticscholar request:
+        s2id:   <id>
+        PubMed: PMID:<id>
+        DOI:    DOI:<id>
+        ArXiv:  ARXIV:<id>
+    """
+    references = []
+    for doc in docs:
+        if doc.references is None:
+            continue
+        for ref in doc.references:
+            # Prefer the DOI, then the S2 id, PubMed id, and arXiv id
+            if ref.doi is not None:
+                references.append("DOI:" + ref.doi)
+            elif ref.s2id is not None:
+                references.append(ref.s2id)
+            elif ref.pubmed is not None:
+                references.append("PMID:" + ref.pubmed)
+            elif ref.arxivid is not None:
+                references.append("ARXIV:" + ref.arxivid)
+    return references
+
+def mass_fetch_semanticscholar(paper_ids: list, session=None) -> DocumentSet:
+    """Fetch the given paper ids (e.g. from generate_reference_list) from the
+    Semantic Scholar API and return the found documents as a `DocumentSet`."""
+    if session is None:
+        session = requests.Session()
+
+    # Remove duplicates
+    paper_ids = list(set(paper_ids))
+
+    docs = []
+
+    with shelve.open(CACHE_FILE) as cache:
+        for paper_id in progress_bar(paper_ids):
+            record = request_paper(paper_id, cache, session)
+            if record:
+                docs.append(ScholarDocument(record))
+            else:
+                logging.warn(f"could not find paper id {paper_id}")
+
+    return DocumentSet(docs)
\ No newline at end of file
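Combined with fastsearch_semanticscholar this enables one-hop backward
snowballing (a sketch; the query string and the example keys in the comments
are illustrative):

    import litstudy

    docs = litstudy.fastsearch_semanticscholar("literature mining", limit=100)
    ids = litstudy.generate_reference_list(docs)  # e.g. ["DOI:10.1000/x", "PMID:123", ...]
    refs = litstudy.mass_fetch_semanticscholar(ids)  # dedupes, then one request per id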
From f0c0bc609928317270b1a4198fed832238bb95b3 Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 18:00:44 +0200
Subject: [PATCH 5/6] Bugfix: crossref field lookups

* an author entry may lack "name", so use .get() instead of indexing
* "language" was looked up on self instead of self.entry
---
 litstudy/sources/crossref.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/litstudy/sources/crossref.py b/litstudy/sources/crossref.py
index 7ef67b0..0ea8254 100644
--- a/litstudy/sources/crossref.py
+++ b/litstudy/sources/crossref.py
@@ -44,7 +44,7 @@ def __init__(self, entry):
 
     @property
     def name(self) -> str:
-        return self.entry["name"]
+        return self.entry.get("name")
 
 
 def _extract_title(entry):
@@ -80,7 +80,7 @@ def publisher(self):
 
     @property
     def language(self):
-        return self.get("language")
+        return self.entry.get("language")
 
     @property
     def publication_date(self):

From 253e9185d97734c570df72b4566a2c9000e630da Mon Sep 17 00:00:00 2001
From: Wrobel
Date: Fri, 3 May 2024 18:02:44 +0200
Subject: [PATCH 6/6] Ignore None values when matching identifiers
---
 litstudy/types.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litstudy/types.py b/litstudy/types.py
index 81f1b25..4337357 100644
--- a/litstudy/types.py
+++ b/litstudy/types.py
@@ -403,6 +403,8 @@ def matches(self, other: "DocumentIdentifier") -> bool:
 
         # Two identifiers match if all keys that they have in common are equal
         for key in self._attr:
+            if self._attr[key] is None:
+                continue
             if key in other._attr:
                 if self._attr[key] != other._attr[key]:
                     return False
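The effect of the last change, as a standalone sketch of the matching rule
(plain dicts stand in for the private _attr mapping of DocumentIdentifier;
the identifier values are illustrative):

    def identifiers_match(a: dict, b: dict) -> bool:
        # Keys the two identifiers have in common must be equal; a None
        # value (an identifier we simply do not know) no longer vetoes it.
        for key, value in a.items():
            if value is None:
                continue
            if key in b and b[key] != value:
                return False
        return True

    # Before this patch, the None arxivid made these two records non-matching:
    assert identifiers_match(
        {"doi": "10.1000/x", "arxivid": None},
        {"doi": "10.1000/x", "arxivid": "2105.00001"},
    )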