import click

from paperscraper._preprocess import (
    get_processed_db,
    get_unique_venues,
    get_extracted_data,
    get_processed_data,
)


@click.group()
def cli():
    """paperscraper command-line interface root group."""


@cli.group()
def process():
    """Commands that drive the preprocessing pipeline."""


@process.command()
@click.option("-f", "--force", help="Force run all steps", is_flag=True)
def run_all(force):
    """Run every preprocessing step in order.

    BUG FIX: the ``--force`` flag is documented as "Force run all steps",
    but it was previously only passed to the final step
    (``get_processed_data``) while the first three stages were hard-coded
    to ``force=False``.  The flag is now propagated to every stage.
    """
    get_processed_db(force=force)
    get_unique_venues(force=force)
    get_extracted_data(force=force)
    get_processed_data(force=force)
def _escape_ampersands(line: str) -> str:
    """Return *line* with every ``&`` replaced by the placeholder ``%26``.

    The previous implementation ran ``re.sub(r'(.*)&(.*)', r'\\1%26\\2', line)``
    in a fixed-point loop.  Because both ``.*`` groups are greedy, each pass
    rewrites exactly one ampersand, making the loop quadratic in the number of
    ampersands per line.  ``.*`` never crosses a newline and the line is a
    single physical line, so a single ``str.replace`` yields the identical
    result in one linear pass.
    """
    return line.replace("&", "%26")


def get_processed_db(force: bool = False) -> Path:
    """Produce a cleaned copy of the raw DBLP dump.

    Replaces every ``&`` in the raw XML with the placeholder ``%26`` (DBLP
    dumps contain bare ampersands that break XML parsing); the placeholder is
    converted back to ``&`` downstream when field text is extracted.

    Args:
        force: Re-run the cleaning even if the processed file already exists.

    Returns:
        Path to the processed DBLP XML file (``config.path_input``).
    """
    if force or not config.path_input.exists():
        logger.info(f"Cleaning data from {config.path_input_raw} into {config.path_input}")
        with open(config.path_input_raw, "r") as raw_dblp, \
                open(config.path_input, "w") as processed_dblp:
            for line in tqdm(raw_dblp, desc="Raw file line"):
                processed_dblp.write(_escape_ampersands(line))

    return config.path_input
def get_unique_venues(force: bool = False) -> pd.DataFrame:
    """Collect the unique publication venues found in the processed DBLP XML.

    Scans ``article``/``inproceedings``/``incollection`` entries and counts
    occurrences of each ``journal``/``booktitle`` value.  Results are cached
    to ``config.path_unique_venues`` and loaded from there on later calls.

    TODO: Re-run this if (1) the tag lists above change OR (2) there is a
    new DBLP snapshot.

    Args:
        force: Re-extract even if the cached TSV already exists.

    Returns:
        DataFrame indexed by venue name with ``count``, ``child_tag`` and
        ``elem_tag`` columns (note: when loaded from the cache the venue
        name comes back as an unnamed column rather than the index).
    """
    if force or not config.path_unique_venues.exists():
        logger.info(f"Extracting venues to {config.path_unique_venues}")
        unique_sources = dict()
        for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"):
            if elem.tag in ["article", "inproceedings", "incollection"]:
                # ``getchildren()`` is deprecated in lxml (and removed from the
                # stdlib ElementTree); iterating the element directly is the
                # supported equivalent.
                for child in elem:
                    if child.tag in ["journal", "booktitle"]:
                        if child.text not in unique_sources:
                            unique_sources[child.text] = {
                                "count": 0,
                                "child_tag": child.tag,
                                "elem_tag": elem.tag,
                            }
                        unique_sources[child.text]["count"] += 1

        # Create a Pandas DataFrame and cache it to disk.
        df_unique_sources = pd.DataFrame.from_dict(unique_sources, orient="index")

        logger.debug("Writing to disk")
        df_unique_sources.to_csv(config.path_unique_venues, header=True, sep='\t')
    else:
        logger.info(f"Loading data from {config.path_unique_venues}")
        df_unique_sources = pd.read_csv(config.path_unique_venues, header=0, sep='\t')

    return df_unique_sources
def get_extracted_data(force: bool = False) -> pd.DataFrame:
    """Filter the processed DBLP XML down to the venues of interest.

    Builds (or loads, when the output already exists and *force* is False) a
    DataFrame with one row per publication whose venue appears in
    ``config.interesting_venues``, and initialises the ``abstract``,
    ``keywords`` and ``citation_count`` columns that the scraping stage
    fills in later.

    TODO: Re-run this if (1) the venue list changes or (2) there is a new
    DBLP snapshot.
    """
    if force or not config.path_output.exists():
        logger.info(f"Extracting data to {config.path_output}")
        result_list = list()
        src_set = set()
        for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"):
            obj = dict()
            to_add = False
            # ``getchildren()`` is deprecated in lxml; iterate directly.
            for child in elem:
                if child.tag not in obj:
                    # author/ee/url may occur multiple times -> collect lists.
                    if child.tag in ["author", "ee", "url"]:
                        obj[child.tag] = list()
                    else:
                        obj[child.tag] = None

                if child.tag in ["author", "ee", "url"]:
                    if child.text is not None:
                        # Undo the %26 placeholder written by get_processed_db.
                        obj[child.tag].append(child.text.replace("%26", "&"))
                    else:
                        obj[child.tag].append(child.text)
                else:
                    obj[child.tag] = child.text  # title, year, pgs

                # Only consider adding entries from the sources defined above.
                if child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]:
                    obj["source"] = child.text
                    to_add = True
                    if child.text not in src_set:
                        src_set.add(child.text)
                        logger.debug(f"Adding source: {child.text}")

            if to_add:
                result_list.append(obj)

        # Create a DataFrame.
        df_result_list = pd.DataFrame(result_list)

        # Initialize the fields that the scraping stage will fill in.
        # TODO: Update these if more fields are added.
        df_result_list["abstract"] = "Not Scraped"
        df_result_list["keywords"] = "Not Scraped"
        df_result_list["citation_count"] = "Not Scraped"

        logger.debug("Writing to disk")
        df_result_list.to_csv(config.path_output, sep='\t', header=True)
    else:
        logger.info(f"Loading data from {config.path_output}")
        df_result_list = pd.read_csv(config.path_output, sep='\t', header=0)

    return df_result_list


def _get_webdriver_instance():
    """Return a new headless Chrome webdriver managed by webdriver-manager."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # ``chrome_options=`` is deprecated in Selenium 4; the supported keyword
    # is ``options=``.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=chrome_options)
    return driver


def get_processed_data(force: bool = False) -> pd.DataFrame:
    """Scrape abstracts, keywords and citation counts for every paper.

    Reads the TSV produced by :func:`get_extracted_data`, visits each paper's
    first URL with a headless Chrome instance, and fills the ``abstract``,
    ``keywords`` and ``citation_count`` columns via the publisher-specific
    scrapers.  Only rows whose current scrape state matches
    ``__scraper_filter`` and whose venue is in ``__publication_src`` are
    visited, so the function can be re-run to retry earlier failures.
    A per-venue error summary is written to ``config.path_logfile``.

    BUG FIX: the two ``logger.i(...)`` calls at the end raised
    ``AttributeError`` (loguru's logger has no ``i`` method), so the log
    summary TSV was never written; they are now ``logger.info(...)``.
    """
    if force or not config.path_output.exists():
        # Get a webdriver instance (headless Chrome).
        logger.info(f"Processing data to {config.path_output}")
        driver = _get_webdriver_instance()

        # Read the base datafile.
        # NOTE(review): this assumes get_extracted_data() already created the
        # file; with force=True on a clean checkout this read fails — confirm
        # the intended call order.
        df_papers = pd.read_csv(config.path_output, sep='\t', header=0)

        # Per-venue error counters, persisted at the end for run analysis.
        log_obj = dict()

        # Start scraping.
        for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]):

            # ToDo: Keep checking this high-level filter to minimize iterations.
            if (str(row["abstract"]) in __scraper_filter["abstract"] or
                    str(row["keywords"]) in __scraper_filter["keywords"] or
                    str(row["citation_count"]) in __scraper_filter["citation_count"]) \
                    and row["source"] in __publication_src:

                if row["source"] not in log_obj:
                    log_obj[row["source"]] = {
                        "papers": 0,
                        "abstract_parse_errors": 0,
                        "abstract_fetch_errors": 0,
                        "abstract_errors": 0,
                        "keyword_parse_errors": 0,
                        "keyword_fetch_errors": 0,
                        "keyword_errors": 0,
                        "no_of_citations_parse_errors": 0,
                        "no_of_citations_fetch_errors": 0,
                        "no_of_citations_errors": 0,
                    }

                # Increment number of papers seen for this venue.
                log_obj[row["source"]]["papers"] += 1

                # Get the URLs: "ee" is preferred.  "url" entries lack an
                # HTTP/HTTPS scheme (relative paths from an unknown base URL)
                # and fail almost every time.
                urls = []
                try:
                    urls = ast.literal_eval(row["ee"])
                except Exception:
                    try:
                        urls = ast.literal_eval(row["url"])
                    except Exception:
                        pass

                # No usable URL: empty list, or a relative dblp "db/" path.
                if len(urls) == 0 or urls[0].startswith("db/"):
                    df_papers.at[index, 'abstract'] = "No Url"
                    df_papers.at[index, 'keywords'] = "No Url"
                    df_papers.at[index, 'citation_count'] = "No Url"
                    logger.error(str(index) + " [No URL]: " + str(row["title"]))
                    continue

                # ABSTRACT
                abstract_soup = None
                try:
                    driver.get(urls[0])

                    # Delay to ensure routing is complete and the page renders.
                    time.sleep(1.5)

                    # Initialize the Soup object.
                    abstract_soup = BeautifulSoup(driver.page_source, 'lxml')

                except Exception as e:
                    logger.error('Abstract: ' + str(e))

                if abstract_soup is not None:
                    is_abstract = False
                    for publisher in config.interesting_venues[row["source"]]["publishers"]:
                        abstract = get_abstract(publisher, abstract_soup)
                        if abstract is not None:
                            df_papers.at[index, 'abstract'] = abstract
                            logger.info(str(index) + " [Success][Abstract] " + str(urls[0]) + " " + str(abstract)[:50])
                            is_abstract = True
                            break

                    if not is_abstract:
                        df_papers.at[index, 'abstract'] = "Error"
                        logger.error(str(index) + " [Abstract Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                        log_obj[row["source"]]["abstract_parse_errors"] += 1
                        log_obj[row["source"]]["abstract_errors"] += 1

                else:
                    df_papers.at[index, 'abstract'] = "Error"
                    logger.error(str(index) + " [Abstract URL Fetch]: " + str(row["source"]))
                    log_obj[row["source"]]["abstract_fetch_errors"] += 1
                    log_obj[row["source"]]["abstract_errors"] += 1

                # NUMBER OF CITATIONS — reuses the abstract page's soup.
                citation_soup = abstract_soup
                if citation_soup is not None:
                    is_citation = False
                    for publisher in config.interesting_venues[row["source"]]["publishers"]:
                        citation_count = get_citation_count(publisher, citation_soup)
                        if citation_count is not None:
                            df_papers.at[index, 'citation_count'] = citation_count
                            logger.info(str(index) + " [Success][Citation Count] " + str(urls[0]) + " " + str(citation_count))
                            is_citation = True
                            break

                    if not is_citation:
                        df_papers.at[index, 'citation_count'] = "Error"
                        logger.error(str(index) + " [Citation Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                        log_obj[row["source"]]["no_of_citations_parse_errors"] += 1
                        log_obj[row["source"]]["no_of_citations_errors"] += 1

                else:
                    df_papers.at[index, 'citation_count'] = "Error"
                    logger.error(str(index) + " [Citation Count URL Fetch]: " + str(row["source"]))
                    log_obj[row["source"]]["no_of_citations_fetch_errors"] += 1
                    log_obj[row["source"]]["no_of_citations_errors"] += 1

                # KEYWORDS — some publishers need a redirect to a different
                # URL to expose keywords.
                is_keyword = False
                current_url = driver.current_url
                for publisher in config.interesting_venues[row["source"]]["publishers"]:
                    try:
                        if publisher == "ieee_explore":
                            driver.get(current_url + "/keywords#keywords")
                        elif publisher == "eurographics_digital_library":
                            driver.get(current_url + "?show=full")
                        else:
                            driver.get(current_url)

                        # Delay to ensure routing is complete and the page renders.
                        time.sleep(1.5)

                        # Initialize the Soup object.
                        keyword_soup = BeautifulSoup(driver.page_source, 'lxml')

                        # NOTE(review): BeautifulSoup() never returns None, so
                        # the else branch below is unreachable; real fetch
                        # failures land in the except handler instead.
                        if keyword_soup is not None:
                            keywords_list = get_keywords(publisher, keyword_soup)
                            if keywords_list is not None:
                                df_papers.at[index, 'keywords'] = keywords_list
                                logger.info(str(index) + " [Success][Keywords] " + str(urls[0]) + " " + str(keywords_list))
                                is_keyword = True
                                break
                        else:
                            df_papers.at[index, 'keywords'] = "Error"
                            logger.error(str(index) + " [Keywords URL Fetch]: " + str(row["source"]))
                            log_obj[row["source"]]["keyword_fetch_errors"] += 1
                            log_obj[row["source"]]["keyword_errors"] += 1

                    except Exception as e:
                        # Best-effort: try the next publisher, but leave a trace.
                        logger.debug(f"Keywords attempt failed for {publisher}: {e}")

                if not is_keyword:
                    df_papers.at[index, 'keywords'] = "Error"
                    logger.error(str(index) + " [Error][Keywords Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                    log_obj[row["source"]]["keyword_parse_errors"] += 1
                    log_obj[row["source"]]["keyword_errors"] += 1

        # Persist the paper file.
        df_papers.to_csv(config.path_output, sep='\t', header=True, index=False)
        logger.info("scraped papers saved to disk.")

        # Persist the per-venue error summary.
        df_logs = pd.DataFrame.from_dict(log_obj, orient="index")
        logger.info(log_obj)
        df_logs.to_csv(config.path_logfile, sep='\t', header=True)
    else:
        logger.info(f"Loading processed data from {config.path_output}")
        df_papers = pd.read_csv(config.path_output, sep='\t', header=0)

    return df_papers
from pathlib import Path

# Repository root: this file lives at <root>/paperscraper/config.py.
_root_dir = Path(__file__).parent.parent

# TODO: [Update as required] Paths to important input/output files
# FIXME: automatically extract the latest
_data_dir = _root_dir / "assets" / "data"
_output_dir = _root_dir / "output"

path_input_raw = _data_dir / "dblp-2022-03-01.xml"
path_input = _data_dir / "dblp_processed.xml"
path_output = _output_dir / "output.tsv"
path_postprocessing_output = _output_dir / "output_processed.tsv"
path_unique_venues = _output_dir / "unique_venues.tsv"
path_unique_keywords = _output_dir / "unique_keywords.tsv"
path_unique_authors = _output_dir / "unique_authors.tsv"
path_logfile = _output_dir / "log.tsv"

# ChromeDriver
# TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH
# TODO Option 2: Install using brew: `brew cask install chromedriver`. It is generally saved to `/usr/local/bin/chromedriver`
# For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver`
# Set the chromedriver path below.
-path_chromedriver = os.path.join("..", "assets", "chromedriver") # /usr/local/bin/chromedriver +path_chromedriver = _root_dir / "assets" / "chromedriver" # /usr/local/bin/chromedriver # ChromeOptions binary # TODO: [Update this path depending on where it is located in your Operating System] -path_chromeoptions_binary = os.path.join("/", "Applications", "Google Chrome.app", "Contents", "MacOS", "Google Chrome") +path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" # List of Venues we target with their DBLP category. This information can be found in the path above. # TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..7291d15 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,706 @@ +[[package]] +name = "async-generator" +version = "1.10" +description = "Async generators and context managers for Python 3.5+" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] + +[[package]] +name = "beautifulsoup4" +version = "4.10.0" +description = "Screen-scraping library" +category = 
"main" +optional = false +python-versions = ">3.0.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "certifi" +version = "2021.10.8" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "cffi" +version = "1.15.0" +description = "Foreign Function Interface for Python calling C code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "2.0.12" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.5.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.0.4" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "cryptography" +version = "36.0.2" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +sdist = ["setuptools_rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] + +[[package]] +name = "h11" +version = "0.13.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "loguru" +version = "0.6.0" +description = "Python logging made (stupidly) simple" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} +win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} + +[package.extras] +dev = ["colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "tox (>=3.9.0)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "black (>=19.10b0)", "isort (>=5.1.1)", "Sphinx (>=4.1.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)"] + +[[package]] +name = "lxml" +version = "4.8.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] +name = "numpy" +version = "1.22.3" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "outcome" +version = "1.1.0" +description = "Capture the outcome of Python function calls." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +attrs = ">=19.2.0" + +[[package]] +name = "pandas" +version = "1.4.1" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.18.5", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, + {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyopenssl" +version = "22.0.0" +description = "Python wrapper module around the OpenSSL library" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cryptography = ">=35.0" + +[package.extras] +docs = ["sphinx", "sphinx-rtd-theme"] +test = ["flaky", "pretend", "pytest (>=3.0.1)"] + 
+[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2021.3" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "requests" +version = "2.27.1" +description = "Python HTTP for Humans." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} +idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] + +[[package]] +name = "selenium" +version = "4.1.3" +description = "" +category = "main" +optional = false +python-versions = "~=3.7" + +[package.dependencies] +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +urllib3 = {version = ">=1.26,<2.0", extras = ["secure", "socks"]} + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sniffio" +version = "1.2.0" +description = "Sniff out which async library your code is running under" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" 
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "soupsieve" +version = "2.3.1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "tqdm" +version = "4.63.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + +[[package]] +name = "trio" +version = "0.20.0" +description = "A friendly Python library for async concurrency and I/O" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +async-generator = ">=1.9" +attrs = ">=19.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +idna = "*" +outcome = "*" +sniffio = "*" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.9.2" +description = "WebSocket library for Trio" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +async-generator = ">=1.10" +trio = ">=0.11" +wsproto = ">=0.14" + +[[package]] +name = "urllib3" +version = "1.26.9" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.dependencies] +certifi = {version = "*", optional = true, markers = "extra == \"secure\""} +cryptography = {version = ">=1.3.4", optional = true, markers = "extra == \"secure\""} +idna = {version = ">=2.0.0", optional = true, markers = "extra == \"secure\""} +pyOpenSSL = {version = ">=0.14", optional = true, markers = "extra == \"secure\""} +PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "webdriver-manager" +version = "3.5.4" +description = "Library provides the way to automatically manage drivers for different browsers" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +requests = "*" + +[[package]] +name = "win32-setctime" +version = "1.1.0" +description = "A small Python utility to set file creation time on Windows" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.extras] +dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"] + +[[package]] +name = "wsproto" +version = "1.1.0" +description = "WebSockets state-machine based protocol implementation" +category = "main" +optional = false +python-versions = ">=3.7.0" + +[package.dependencies] +h11 = ">=0.9.0,<1" + +[metadata] +lock-version = "1.1" +python-versions = "~=3.8" +content-hash = "89d5de02738bcf3f4a31eca13e4759300c5312821679bf90a58809024885e1a2" + +[metadata.files] +async-generator = [ + {file = "async_generator-1.10-py3-none-any.whl", hash = "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b"}, + {file = "async_generator-1.10.tar.gz", hash = 
"sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, + {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, +] +certifi = [ + {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, + {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, +] +cffi = [ + {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"}, + {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"}, + {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"}, + {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"}, + {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"}, + {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"}, + {file = 
"cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"}, + {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"}, + {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"}, + {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"}, + {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"}, + {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"}, + {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"}, + {file = 
"cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"}, + {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"}, + {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"}, + {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"}, + {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"}, + {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, +] +click = [ + {file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"}, + {file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"}, +] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] +cryptography = [ + {file = "cryptography-36.0.2-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:4e2dddd38a5ba733be6a025a1475a9f45e4e41139d1321f412c6b360b19070b6"}, + {file = "cryptography-36.0.2-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:4881d09298cd0b669bb15b9cfe6166f16fc1277b4ed0d04a22f3d6430cb30f1d"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea634401ca02367c1567f012317502ef3437522e2fc44a3ea1844de028fa4b84"}, + {file = 
"cryptography-36.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:7be666cc4599b415f320839e36367b273db8501127b38316f3b9f22f17a0b815"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8241cac0aae90b82d6b5c443b853723bcc66963970c67e56e71a2609dc4b5eaf"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b2d54e787a884ffc6e187262823b6feb06c338084bbe80d45166a1cb1c6c5bf"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:c2c5250ff0d36fd58550252f54915776940e4e866f38f3a7866d92b32a654b86"}, + {file = "cryptography-36.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ec6597aa85ce03f3e507566b8bcdf9da2227ec86c4266bd5e6ab4d9e0cc8dab2"}, + {file = "cryptography-36.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ca9f686517ec2c4a4ce930207f75c00bf03d94e5063cbc00a1dc42531511b7eb"}, + {file = "cryptography-36.0.2-cp36-abi3-win32.whl", hash = "sha256:f64b232348ee82f13aac22856515ce0195837f6968aeaa94a3d0353ea2ec06a6"}, + {file = "cryptography-36.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:53e0285b49fd0ab6e604f4c5d9c5ddd98de77018542e88366923f152dbeb3c29"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:32db5cc49c73f39aac27574522cecd0a4bb7384e71198bc65a0d23f901e89bb7"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b3d199647468d410994dbeb8cec5816fb74feb9368aedf300af709ef507e3e"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:da73d095f8590ad437cd5e9faf6628a218aa7c387e1fdf67b888b47ba56a17f0"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:0a3bf09bb0b7a2c93ce7b98cb107e9170a90c51a0162a20af1c61c765b90e60b"}, + {file = 
"cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8897b7b7ec077c819187a123174b645eb680c13df68354ed99f9b40a50898f77"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82740818f2f240a5da8dfb8943b360e4f24022b093207160c77cadade47d7c85"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1f64a62b3b75e4005df19d3b5235abd43fa6358d5516cfc43d87aeba8d08dd51"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, + {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, +] +h11 = [ + {file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"}, + {file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +loguru = [ + {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, + {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, +] +lxml = [ + {file = "lxml-4.8.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b"}, + {file = "lxml-4.8.0-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430"}, + {file = "lxml-4.8.0-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a"}, + {file = "lxml-4.8.0-cp27-cp27m-win32.whl", hash = "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5"}, + {file = "lxml-4.8.0-cp27-cp27m-win_amd64.whl", hash = "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9"}, + {file = "lxml-4.8.0-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc"}, + {file = "lxml-4.8.0-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170"}, + {file = "lxml-4.8.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa"}, + {file = "lxml-4.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1"}, + {file = "lxml-4.8.0-cp310-cp310-win32.whl", hash = "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b"}, + {file = "lxml-4.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76"}, + {file = "lxml-4.8.0-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6"}, + {file = 
"lxml-4.8.0-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2"}, + {file = "lxml-4.8.0-cp35-cp35m-win32.whl", hash = "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150"}, + {file = "lxml-4.8.0-cp35-cp35m-win_amd64.whl", hash = "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654"}, + {file = "lxml-4.8.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613"}, + {file = "lxml-4.8.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33"}, + {file = "lxml-4.8.0-cp36-cp36m-win32.whl", hash = "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429"}, + {file = "lxml-4.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63"}, + {file = "lxml-4.8.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a"}, + {file = 
"lxml-4.8.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85"}, + {file = "lxml-4.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141"}, + {file = "lxml-4.8.0-cp37-cp37m-win32.whl", hash = "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63"}, + {file = "lxml-4.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8"}, + {file = "lxml-4.8.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f"}, + {file = 
"lxml-4.8.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9"}, + {file = "lxml-4.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68"}, + {file = "lxml-4.8.0-cp38-cp38-win32.whl", hash = "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696"}, + {file = "lxml-4.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939"}, + {file = "lxml-4.8.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87"}, + {file = "lxml-4.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9"}, + {file = "lxml-4.8.0-cp39-cp39-win32.whl", hash = "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea"}, + {file = 
"lxml-4.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-macosx_10_14_x86_64.whl", hash = "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-macosx_10_14_x86_64.whl", hash = "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93"}, + {file = "lxml-4.8.0.tar.gz", hash = "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23"}, +] +numpy = [ + {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"}, + {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4"}, + {file = "numpy-1.22.3-cp310-cp310-win32.whl", hash = 
"sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430"}, + {file = "numpy-1.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1"}, + {file = "numpy-1.22.3-cp38-cp38-win32.whl", hash = "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62"}, + {file = "numpy-1.22.3-cp38-cp38-win_amd64.whl", hash = "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168"}, + {file = "numpy-1.22.3-cp39-cp39-win32.whl", hash = "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"}, + {file = "numpy-1.22.3-cp39-cp39-win_amd64.whl", hash = "sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a"}, + {file = 
"numpy-1.22.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f"}, + {file = "numpy-1.22.3.zip", hash = "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18"}, +] +outcome = [ + {file = "outcome-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958"}, + {file = "outcome-1.1.0.tar.gz", hash = "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"}, +] +pandas = [ + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01"}, + {file = "pandas-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee"}, + {file = "pandas-1.4.1-cp38-cp38-win32.whl", hash = "sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb"}, + {file = "pandas-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf"}, + {file = "pandas-1.4.1-cp39-cp39-win32.whl", hash = "sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b"}, + {file = "pandas-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf"}, + {file = "pandas-1.4.1.tar.gz", hash = "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2"}, +] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] +pyopenssl = [ + {file = "pyOpenSSL-22.0.0-py2.py3-none-any.whl", hash = 
"sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"}, + {file = "pyOpenSSL-22.0.0.tar.gz", hash = "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf"}, +] +pysocks = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"}, + {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, +] +requests = [ + {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, + {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, +] +selenium = [ + {file = "selenium-4.1.3-py3-none-any.whl", hash = "sha256:14d28a628c831c105d38305c881c9c7847199bfd728ec84240c5e86fa1c9bd5a"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sniffio = [ + {file = "sniffio-1.2.0-py3-none-any.whl", hash = "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663"}, + {file = "sniffio-1.2.0.tar.gz", hash = 
"sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +soupsieve = [ + {file = "soupsieve-2.3.1-py3-none-any.whl", hash = "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb"}, + {file = "soupsieve-2.3.1.tar.gz", hash = "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"}, +] +tqdm = [ + {file = "tqdm-4.63.0-py2.py3-none-any.whl", hash = "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29"}, + {file = "tqdm-4.63.0.tar.gz", hash = "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd"}, +] +trio = [ + {file = "trio-0.20.0-py3-none-any.whl", hash = "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"}, + {file = "trio-0.20.0.tar.gz", hash = "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070"}, +] +trio-websocket = [ + {file = "trio-websocket-0.9.2.tar.gz", hash = "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe"}, + {file = "trio_websocket-0.9.2-py3-none-any.whl", hash = "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc"}, +] +urllib3 = [ + {file = "urllib3-1.26.9-py2.py3-none-any.whl", hash = "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14"}, + {file = "urllib3-1.26.9.tar.gz", hash = "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"}, +] +webdriver-manager = [ + {file = "webdriver_manager-3.5.4-py2.py3-none-any.whl", hash = "sha256:b5b91b5df83181e002263fe27296967a5b19cb1ebe8e4a63ee83538394037df4"}, + {file = "webdriver_manager-3.5.4.tar.gz", hash = "sha256:2eb7c2fe38ec5b06e2090164923e4dfb7c3ac4e7140333a3de9c7956f5047858"}, +] 
+win32-setctime = [ + {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, + {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, +] +wsproto = [ + {file = "wsproto-1.1.0-py3-none-any.whl", hash = "sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b"}, + {file = "wsproto-1.1.0.tar.gz", hash = "sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..74038a5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[tool.poetry] +name = "paperscraper" +version = "0.1.0-alpha.1" +description = "Scrape and provide interface for data from dblp" +authors = [] + +[tool.poetry.dependencies] +python = "~=3.8" +lxml = "^4.8.0" +pandas = "^1.4.1" +beautifulsoup4 = "^4.10.0" +selenium = "^4.1.3" +numpy = "^1.22.3" +click = "^8.0.4" +loguru = "^0.6.0" +tqdm = "^4.63.0" +webdriver-manager = "^3.5.4" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +paperscraper = "paperscraper._cli:cli" \ No newline at end of file From 413570eccbb82043c0efd24390f6c95681812af8 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 14:44:36 -0800 Subject: [PATCH 2/9] Cleanup code --- paperscraper/_preprocess.py | 56 ++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index 49eec4c..da06c8d 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -33,10 +33,16 @@ } -def get_processed_db(force:bool=False) -> Path: +def get_processed_db(force: bool = False) -> Path: + """ + Clean the raw file (set in config.path_input_raw) and writing it out to config.path_input. 
+ + Function is run only if config.path_input doesn't exsit or if `force` is True. + """ if force or not config.path_input.exists(): logger.info(f"Cleaning data from {config.path_input_raw} into {config.path_input}") - # This Regular Find+Replace replaces instances of & between tags with a SPECIAL TAG `%26`. This tag will be replaced back to `&` in the code later on. + # This Regular Find+Replace replaces instances of & between tags with a + # SPECIAL TAG `%26`. This tag will be replaced back to `&` in the code later on. regex_find = r'(.*)&(.*)' regex_replace = r'\1%26\2' @@ -51,18 +57,22 @@ def get_processed_db(force:bool=False) -> Path: line = re.sub(regex_find, regex_replace, line) processed_dblp.write(line) - + return config.path_input -# Find Unique venues from the DBLP xml looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. # TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(force:bool=False) -> pd.DataFrame: +def get_unique_venues(force: bool = False) -> pd.DataFrame: + """ + Find Unique venues from the DBLP xml. + + Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. + """ if force or not config.path_unique_venues.exists(): logger.info(f"Extracting venues to {config.path_unique_venues}") - unique_sources = dict() + unique_sources: dict = {} for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): - if elem.tag in ["article","inproceedings","incollection"]: + if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: if child.text not in unique_sources: @@ -85,15 +95,17 @@ def get_unique_venues(force:bool=False) -> pd.DataFrame: return df_unique_sources -# FILTER the huge dblp_processed.xml file to keep just the data that we are interested in. 
-# TODO: Re-run this if (1) The list has changed or (2) There is a NEW DBLP snapshot . -def get_extracted_data(force:bool=False) -> pd.DataFrame: +# TODO: Re-run this if +# (1) The list has changed or +# (2) There is a NEW DBLP snapshot. +def get_extracted_data(force: bool = False) -> pd.DataFrame: + """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" if force or not config.path_output.exists(): logger.info(f"Extracting data to {config.path_output}") result_list = list() src_set = set() for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"): - obj = dict() + obj: dict = {} to_add = False for child in elem.getchildren(): if child.tag not in obj: @@ -108,10 +120,10 @@ def get_extracted_data(force:bool=False) -> pd.DataFrame: else: obj[child.tag].append(child.text) else: - obj[child.tag] = child.text # title, year, pgs + obj[child.tag] = child.text # title, year, pgs # Only consider adding entries from the source defined above - if child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]: + if (child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]): obj["source"] = child.text to_add = True if child.text not in src_set: @@ -152,8 +164,8 @@ def _get_webdriver_instance(): return driver -# Scrap the Abstracts, Keywords, and Citations -def get_processed_data(force:bool=False) -> pd.DataFrame: +def get_processed_data(force: bool = False) -> pd.DataFrame: + """Scrap the Abstracts, Keywords, and Citations.""" if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) logger.info(f"Processing data to {config.path_output}") @@ -163,7 +175,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: df_papers = pd.read_csv(config.path_output, sep='\t', header=0) # Initialize a log object to analyze the summary of a particular run. 
- log_obj = dict() + log_obj: dict = {} # Start scraping for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]): @@ -194,13 +206,13 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: urls = [] try: urls = ast.literal_eval(row["ee"]) - except Exception as e: + except Exception: # If not ee, check url. - # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a BaseURL that is unknown. - # Hence, it will fail 99% of the times. + # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a + # BaseURL that is unknown. Hence, it will fail 99% of the times. try: urls = ast.literal_eval(row["url"]) - except: + except Exception: pass # If there is No url OR If the URL begins with a db/, continue. @@ -278,7 +290,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: try: if publisher == "ieee_explore": - driver.get(current_url+ "/keywords#keywords") + driver.get(current_url + "/keywords#keywords") elif publisher == "eurographics_digital_library": driver.get(current_url + "?show=full") else: @@ -303,7 +315,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 - except Exception as e: + except Exception: pass if not is_keyword: From db8ac254eda1fda4cb14076fc34f7603869a2491 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 16:01:06 -0800 Subject: [PATCH 3/9] Use sqlitedict _preprocess and related tests --- .gitignore | 3 + paperscraper/_cli.py | 8 +- paperscraper/_preprocess.py | 113 +++--- paperscraper/config.py | 587 +++++++++++++++------------- poetry.lock | 146 ++++++- pyproject.toml | 4 + test/assets/data/dblp_processed.xml | 89 +++++ test/test_preprocess.py | 56 +++ 8 files changed, 657 insertions(+), 349 deletions(-) create mode 100644 test/assets/data/dblp_processed.xml 
create mode 100644 test/test_preprocess.py diff --git a/.gitignore b/.gitignore index 4d1b36c..ad07f50 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ chromedriver .idea/ *.pyc +# include test files +!test/assets/data/*.xml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/paperscraper/_cli.py b/paperscraper/_cli.py index 7a55635..9bbcd28 100644 --- a/paperscraper/_cli.py +++ b/paperscraper/_cli.py @@ -1,6 +1,6 @@ import click from paperscraper._preprocess import (get_processed_db, get_unique_venues, get_extracted_data, get_processed_data) - +from paperscraper.config import config @click.group() def cli(): @@ -15,6 +15,6 @@ def process(): @click.option("-f", "--force", help="Force run all steps", is_flag=True) def run_all(force): get_processed_db(force=False) - get_unique_venues(force=False) - get_extracted_data(force=False) - get_processed_data(force=force) + get_unique_venues(config, force=False) + get_extracted_data(config, force=False) + get_processed_data(config, force=force) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index da06c8d..b284f2f 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -1,21 +1,23 @@ +import ast import re +import time from pathlib import Path + import lxml.etree as ET import pandas as pd -from loguru import logger from bs4 import BeautifulSoup +from loguru import logger from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager +from sqlitedict import SqliteDict from tqdm import tqdm -import ast -import time +from webdriver_manager.chrome import ChromeDriverManager -import paperscraper.config as config +from paperscraper.config import Config, config from paperscraper.scrapers.abstracts import get_abstract -from paperscraper.scrapers.keywords import get_keywords from paperscraper.scrapers.citations import 
get_citation_count +from paperscraper.scrapers.keywords import get_keywords logger.remove() logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) @@ -62,50 +64,58 @@ def get_processed_db(force: bool = False) -> Path: # TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(force: bool = False) -> pd.DataFrame: +def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: """ Find Unique venues from the DBLP xml. Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. """ if force or not config.path_unique_venues.exists(): + unique_sources = SqliteDict(config.path_unique_venues) + unique_sources.clear() # empty the db logger.info(f"Extracting venues to {config.path_unique_venues}") - unique_sources: dict = {} for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: if child.text not in unique_sources: - unique_sources[child.text] = dict() - unique_sources[child.text]["count"] = 0 - unique_sources[child.text]["child_tag"] = child.tag - unique_sources[child.text]["elem_tag"] = elem.tag - unique_sources[child.text]["count"] += 1 + child_dict = {} + child_dict["count"] = 0 + child_dict["child_tag"] = child.tag + child_dict["elem_tag"] = elem.tag + else: + child_dict = unique_sources[child.text] - # Create a Pandas DataFrame - df_unique_sources = pd.DataFrame.from_dict(unique_sources, orient="index") + child_dict["count"] += 1 + unique_sources[child.text] = child_dict logger.debug("Writing to disk") # Save it to disk - df_unique_sources.to_csv(config.path_unique_venues, header=True, sep='\t') + unique_sources.commit() else: logger.info(f"Loading data from {config.path_unique_venues}") - df_unique_sources = pd.read_csv(config.path_unique_venues, header=0, sep='\t') + unique_sources = 
SqliteDict(config.path_unique_venues) - return df_unique_sources + return unique_sources # TODO: Re-run this if # (1) The list has changed or # (2) There is a NEW DBLP snapshot. -def get_extracted_data(force: bool = False) -> pd.DataFrame: +def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" if force or not config.path_output.exists(): logger.info(f"Extracting data to {config.path_output}") - result_list = list() + result_list = SqliteDict(config.path_output) + result_list.clear() # empty the db src_set = set() - for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"): + for _idx, (event, elem) in tqdm(enumerate(ET.iterparse(config.path_input, encoding='UTF-8', recover=True)), desc="Entry"): obj: dict = {} + # Initialize the fields that we are going to scrape. + # TODO: Update these if more fields are added. + obj["abstract"] = "Not Scraped" + obj["keywords"] = "Not Scraped" + obj["citation_count"] = "Not Scraped" to_add = False for child in elem.getchildren(): if child.tag not in obj: @@ -131,25 +141,20 @@ def get_extracted_data(force: bool = False) -> pd.DataFrame: logger.debug(f"Adding source: {child.text}") if to_add: - result_list.append(obj) + result_list[_idx] = obj - # Create a DataFrame - df_result_list = pd.DataFrame(result_list) - - # Initialize the fields that we are going to scrape. - # TODO: Update these if more fields are added. 
- df_result_list["abstract"] = "Not Scraped" - df_result_list["keywords"] = "Not Scraped" - df_result_list["citation_count"] = "Not Scraped" + # Periodically commiting stuff + if _idx % 100 == 0: + result_list.commit() logger.debug("Writing to disk") # Save to disk - df_result_list.to_csv(config.path_output, sep='\t', header=True) + result_list.commit() else: logger.info(f"Loading data from {config.path_output}") - df_result_list = pd.read_csv(config.path_output, sep='\t', header=0) + result_list = SqliteDict(config.path_output) - return df_result_list + return result_list # get a new headless Chrome driver @@ -164,7 +169,7 @@ def _get_webdriver_instance(): return driver -def get_processed_data(force: bool = False) -> pd.DataFrame: +def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: """Scrap the Abstracts, Keywords, and Citations.""" if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) @@ -172,13 +177,13 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: driver = _get_webdriver_instance() # Read the base datafile - df_papers = pd.read_csv(config.path_output, sep='\t', header=0) + papers_db = SqliteDict(config.path_output) # Initialize a log object to analyze the summary of a particular run. log_obj: dict = {} # Start scraping - for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]): + for index, row in tqdm(papers_db.items(), desc="Papers", total=len(papers_db)): # ToDo: Keep Checking this high-level filter to minimize iterations. if (str(row["abstract"]) in __scraper_filter["abstract"] or @@ -217,9 +222,10 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: # If there is No url OR If the URL begins with a db/, continue. 
if len(urls) == 0 or urls[0].startswith("db/"): - df_papers.at[index, 'abstract'] = "No Url" - df_papers.at[index, 'keywords'] = "No Url" - df_papers.at[index, 'citation_count'] = "No Url" + row['abstract'] = "No Url" + row['abstract'] = "No Url" + row['abstract'] = "No Url" + papers_db[index] = row logger.error(str(index) + " [No URL]: " + str(row["title"])) continue @@ -242,19 +248,19 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: abstract = get_abstract(publisher, abstract_soup) if abstract is not None: - df_papers.at[index, 'abstract'] = abstract + row['abstract'] = abstract logger.info(str(index) + " [Success][Abstract] " + str(urls[0]) + " " + str(abstract)[:50]) is_abstract = True break if not is_abstract: - df_papers.at[index, 'abstract'] = "Error" + row['abstract'] = "Error" logger.error(str(index) + " [Abstract Parse]: " + str(urls[0]) + " : " + str(row["source"])) log_obj[row["source"]]["abstract_parse_errors"] += 1 log_obj[row["source"]]["abstract_errors"] += 1 else: - df_papers.at[index, 'abstract'] = "Error" + row['abstract'] = "Error" logger.error(str(index) + " [Abstract URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["abstract_fetch_errors"] += 1 log_obj[row["source"]]["abstract_errors"] += 1 @@ -266,19 +272,19 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: citation_count = get_citation_count(publisher, citation_soup) if citation_count is not None: - df_papers.at[index, 'citation_count'] = citation_count + row['citation_count'] = citation_count logger.info(str(index) + " [Success][Citation Count] " + str(urls[0]) + " " + str(citation_count)) is_citation = True break if not is_citation: - df_papers.at[index, 'citation_count'] = "Error" + row['citation_count'] = "Error" logger.error(str(index) + " [Citation Parse]: " + str(urls[0]) + " : " + str(row["source"])) 
log_obj[row["source"]]["no_of_citations_parse_errors"] += 1 log_obj[row["source"]]["no_of_citations_errors"] += 1 else: - df_papers.at[index, 'citation_count'] = "Error" + row['citation_count'] = "Error" logger.error(str(index) + " [Citation Count URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["no_of_citations_fetch_errors"] += 1 log_obj[row["source"]]["no_of_citations_errors"] += 1 @@ -305,12 +311,12 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: if keyword_soup is not None: keywords_list = get_keywords(publisher, keyword_soup) if keywords_list is not None: - df_papers.at[index, 'keywords'] = keywords_list + row['keywords'] = keywords_list logger.info(str(index) + " [Success][Keywords] " + str(urls[0]) + " " + str(keywords_list)) is_keyword = True break else: - df_papers.at[index, 'keywords'] = "Error" + row['keywords'] = "Error" logger.error(str(index) + " [Keywords URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 @@ -319,13 +325,18 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: pass if not is_keyword: - df_papers.at[index, 'keywords'] = "Error" + row['keywords'] = "Error" logger.error(str(index) + " [Error][Keywords Parse]: " + str(urls[0]) + " : " + str(row["source"])) log_obj[row["source"]]["keyword_parse_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 + papers_db[index] = row + + if index % 100 == 100: + papers_db.commit() + # Persist the paper file - df_papers.to_csv(config.path_output, sep='\t', header=True, index=False) + papers_db.commit() logger.i("scraped papers saved to disk.") # Persist Logs @@ -334,6 +345,6 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: df_logs.to_csv(config.path_logfile, sep='\t', header=True) else: logger.info(f"Loading processed data from {config.path_output}") - df_papers = pd.read_csv(config.path_output, sep='\t', header=0) + papers_db = SqliteDict(config.path_output) - 
return df_papers + return papers_db diff --git a/paperscraper/config.py b/paperscraper/config.py index 9a1d7c4..dacc934 100644 --- a/paperscraper/config.py +++ b/paperscraper/config.py @@ -1,288 +1,313 @@ from pathlib import Path +from typing import Union -_root_dir = Path(__file__).parent.parent -# TODO: [Update as required] Paths to important input/output files -# FIXME: automatically extract the latest -path_input_raw = _root_dir / "assets" / "data" / "dblp-2022-03-01.xml" -path_input = _root_dir / "assets" / "data" / "dblp_processed.xml" -path_output = _root_dir / "output" / "output.tsv" -path_postprocessing_output = _root_dir / "output" / "output_processed.tsv" -path_unique_venues = _root_dir / "output" / "unique_venues.tsv" -path_unique_keywords = _root_dir / "output" / "unique_keywords.tsv" -path_unique_authors = _root_dir / "output"/ "unique_authors.tsv" -path_logfile = _root_dir / "output" / "log.tsv" -# ChromeDriver -# TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH -# TODO Option 2: Install using brew: `brew cask install chromedriver`. It is generally saved to `/usr/local/bin/chromedriver` -# For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver` -# Set the chromedriver path below. 
-path_chromedriver = _root_dir / "assets" / "chromedriver" # /usr/local/bin/chromedriver +class Config: + """The main config object.""" + def __init__(self, root_dir: Union[str, Path] = None, + assets_dir: Union[str, Path] = None, + output_dir: Union[str, Path] = None): + """Initialize the config.""" + if root_dir is None: + _root_dir = Path(__file__).parent.parent + else: + _root_dir = Path(root_dir) -# ChromeOptions binary -# TODO: [Update this path depending on where it is located in your Operating System] -path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" + if assets_dir is None: + assets_dir = _root_dir / "assets" + elif not isinstance(assets_dir, Path): + assets_dir = Path(assets_dir) -# List of Venues we target with their DBLP category. This information can be found in the path above. -# TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. -interesting_venues = { - "ACM Trans. Comput. Hum. Interact.": { - "sourcetype": "journal", - "publishers": ["acm_digital_library"] - }, - "AVI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BCS HCI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "scienceopen", "springer_v2"] - }, - "BCS HCI (1)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BCS HCI (2)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BELIV": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "ieee_explore"] - }, - "BioVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "CHI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Cognitive Biases in Visualizations": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "CogSci": { - "sourcetype": "booktitle", - "publishers": ["cogsci"] - }, - "Comput. Graph. 
Forum": { - "sourcetype": "journal", - "publishers": ["wiley_online_library"] - }, - "Conference on Designing Interactive Systems": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Conference on Designing Interactive Systems (Companion Volume)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "CSCW": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Diagrams": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "Eurographics": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "eurographics_digital_library"] - }, - "Eurographics (Areas Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Posters)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Short Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Short Presentations)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library" ] - }, - "Eurographics (State of the Art Reports)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library" ] - }, - "EuroVAST@EuroVis": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Graphics Interface": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "graphics_interface_proceedings"] - }, - "ICDM": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "ieee_explore"] - }, - "IEEE Computer Graphics and Applications": { - "sourcetype": "journal", - "publishers": ["ieee_explore"] - }, - "IEEE Trans. Vis. Comput. 
Graph.": { - "sourcetype": "journal", - "publishers": ["ieee_explore"] - }, - "IEEE VAST": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IEEE Visualization": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IEEE VIS (Short Papers)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "Information Visualization": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "dagstuhl"] - }, - "INTERACT": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (1)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (2)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (3)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (4)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "International Conference on Supercomputing": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "IUI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "IV": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IV (1)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IV (2)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IVAPP": { - "sourcetype": "booktitle", - "publishers": ["scitepress"] - }, - "J. 
Vis.": { - "sourcetype": "journal", - "publishers": ["springer_v1"] - }, - "KDD": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "aaai"] - }, - "PacificVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SciVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SIBGRAPI": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SIGGRAPH": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "SIGGRAPH Asia": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "SIGMOD Conference": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "UbiComp": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "springer_v2"] - }, - "UIST": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "VAST": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library", "ieee_explore" ] - }, - "VAST (Short and Project Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library", "ieee_explore" ] - }, - "VCBM": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Vis. Comput.": { - "sourcetype": "journal", - "publishers": ["springer_v1"] - }, - "VMV": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - } -} + if output_dir is None: + output_dir = _root_dir / "output" + elif not isinstance(output_dir, Path): + assets_dir = Path(output_dir) -# Object to map different variations of a keyword to a consistent name. 
-keywords_to_merge = { - "cscw": "computer supported collaborative work", - "computer supported collaborative work": "computer supported collaborative work", - "data visualization": "data visualization", - "data visualisation": "data visualization", - "visualisation": "visualization", - "visualization": "visualization", - "hci": "human computer interaction", - "human computer interaction": "human computer interaction", - "human-computer-interaction": "human computer interaction", - "human-computer interaction": "human computer interaction", - "human computer interaction (hci)": "human computer interaction", - "human-computer interaction (hci)": "human computer interaction", - "human computer interactions": "human computer interaction", - "human-computer-interactions": "human computer interaction", - "human-computer interactions": "human computer interaction", -} + # TODO: [Update as required] Paths to important input/output files + # FIXME: automatically extract the latest + self.path_input_raw = assets_dir / "data" / "dblp-2022-11-02.xml" + self.path_input = assets_dir / "data" / "dblp_processed.xml" + self.path_output = output_dir / "output.db" + self.path_postprocessing_output = output_dir / "output_processed.tsv" + self.path_unique_venues = output_dir / "unique_venues.db" + self.path_unique_keywords = output_dir / "unique_keywords.tsv" + self.path_unique_authors = output_dir/ "unique_authors.tsv" + self.path_logfile = output_dir / "log.tsv" -keyword_patterns_to_remove = [ - r"\d+.\d+.\d+.", # e.g., 1.3.4. - r"\d+.\d+.\d+", # e.g., 1.3.4 - r"\w+.\d+.\d+.", # e.g., d.3.4. - r"\w+.\d+.\d+", # e.g., d.3.4 - r"according to", - r"acm ccs", - r"acmccs", - r"acma ccs", - r"\(\s*\)", - r"\/spl", - r"\/sup", - r"\/", - r"^-\s*" -] + # ChromeDriver + # TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH + # TODO Option 2: Install using brew: `brew cask install chromedriver`. 
It is generally saved to `/usr/local/bin/chromedriver` + # For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver` + # Set the chromedriver path below. + self.path_chromedriver = assets_dir / "chromedriver" # /usr/local/bin/chromedriver + + # ChromeOptions binary + # TODO: [Update this path depending on where it is located in your Operating System] + self.path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" + + # List of Venues we target with their DBLP category. This information can be found in the path above. + # TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. + self.interesting_venues = { + "ACM Trans. Comput. Hum. Interact.": { + "sourcetype": "journal", + "publishers": ["acm_digital_library"] + }, + "AVI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BCS HCI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "scienceopen", "springer_v2"] + }, + "BCS HCI (1)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BCS HCI (2)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BELIV": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "ieee_explore"] + }, + "BioVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "CHI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Cognitive Biases in Visualizations": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "CogSci": { + "sourcetype": "booktitle", + "publishers": ["cogsci"] + }, + "Comput. Graph. 
Forum": { + "sourcetype": "journal", + "publishers": ["wiley_online_library"] + }, + "Conference on Designing Interactive Systems": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Conference on Designing Interactive Systems (Companion Volume)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "CSCW": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Diagrams": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "Eurographics": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "eurographics_digital_library"] + }, + "Eurographics (Areas Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Posters)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Short Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Short Presentations)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library" ] + }, + "Eurographics (State of the Art Reports)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library" ] + }, + "EuroVAST@EuroVis": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Graphics Interface": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "graphics_interface_proceedings"] + }, + "ICDM": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "ieee_explore"] + }, + "IEEE Computer Graphics and Applications": { + "sourcetype": "journal", + "publishers": ["ieee_explore"] + }, + "IEEE Trans. Vis. Comput. 
Graph.": { + "sourcetype": "journal", + "publishers": ["ieee_explore"] + }, + "IEEE VAST": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IEEE Visualization": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IEEE VIS (Short Papers)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "Information Visualization": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "dagstuhl"] + }, + "INTERACT": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (1)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (2)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (3)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (4)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "International Conference on Supercomputing": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "IUI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "IV": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IV (1)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IV (2)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IVAPP": { + "sourcetype": "booktitle", + "publishers": ["scitepress"] + }, + "J. 
Vis.": { + "sourcetype": "journal", + "publishers": ["springer_v1"] + }, + "KDD": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "aaai"] + }, + "PacificVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SciVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SIBGRAPI": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SIGGRAPH": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "SIGGRAPH Asia": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "SIGMOD Conference": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "UbiComp": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "springer_v2"] + }, + "UIST": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "VAST": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library", "ieee_explore" ] + }, + "VAST (Short and Project Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library", "ieee_explore" ] + }, + "VCBM": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Vis. Comput.": { + "sourcetype": "journal", + "publishers": ["springer_v1"] + }, + "VMV": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + } + } + + # Object to map different variations of a keyword to a consistent name. 
+ self.keywords_to_merge = { + "cscw": "computer supported collaborative work", + "computer supported collaborative work": "computer supported collaborative work", + "data visualization": "data visualization", + "data visualisation": "data visualization", + "visualisation": "visualization", + "visualization": "visualization", + "hci": "human computer interaction", + "human computer interaction": "human computer interaction", + "human-computer-interaction": "human computer interaction", + "human-computer interaction": "human computer interaction", + "human computer interaction (hci)": "human computer interaction", + "human-computer interaction (hci)": "human computer interaction", + "human computer interactions": "human computer interaction", + "human-computer-interactions": "human computer interaction", + "human-computer interactions": "human computer interaction", + } + + self.keyword_patterns_to_remove = [ + r"\d+.\d+.\d+.", # e.g., 1.3.4. + r"\d+.\d+.\d+", # e.g., 1.3.4 + r"\w+.\d+.\d+.", # e.g., d.3.4. 
+ r"\w+.\d+.\d+", # e.g., d.3.4 + r"according to", + r"acm ccs", + r"acmccs", + r"acma ccs", + r"\(\s*\)", + r"\/spl", + r"\/sup", + r"\/", + r"^-\s*" + ] + + +config = Config() diff --git a/poetry.lock b/poetry.lock index 7291d15..dfe74ba 100644 --- a/poetry.lock +++ b/poetry.lock @@ -15,10 +15,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "beautifulsoup4" @@ -96,12 +96,23 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] -docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine 
(>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] + +[[package]] +name = "exceptiongroup" +version = "1.0.4" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] [[package]] name = "h11" @@ -119,6 +130,14 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "loguru" version = "0.6.0" @@ -132,7 +151,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "tox (>=3.9.0)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "black (>=19.10b0)", "isort (>=5.1.1)", "Sphinx (>=4.1.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)"] +dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "isort (>=5.1.1)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "tox (>=3.9.0)"] [[package]] name = "lxml" @@ -145,7 +164,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" [package.extras] cssselect = ["cssselect (>=0.7)"] html5 = ["html5lib"] -htmlsoup = ["beautifulsoup4"] +htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] [[package]] @@ -167,6 +186,17 @@ 
python-versions = ">=3.6" [package.dependencies] attrs = ">=19.2.0" +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + [[package]] name = "pandas" version = "1.4.1" @@ -188,6 +218,18 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "pycparser" version = "2.21" @@ -211,6 +253,17 @@ cryptography = ">=35.0" docs = ["sphinx", "sphinx-rtd-theme"] test = ["flaky", "pretend", "pytest (>=3.0.1)"] +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pysocks" version = "1.7.1" @@ -219,6 +272,26 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "pytest" +version = "7.2.0" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + 
[[package]] name = "python-dateutil" version = "2.8.2" @@ -301,6 +374,22 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "sqlitedict" +version = "2.0.0" +description = "Persistent dict in Python, backed up by sqlite3 and pickle, multithread-safe." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "tqdm" version = "4.63.0" @@ -363,8 +452,8 @@ pyOpenSSL = {version = ">=0.14", optional = true, markers = "extra == \"secure\" PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -387,7 +476,7 @@ optional = false python-versions = ">=3.5" [package.extras] -dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"] +dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [[package]] name = "wsproto" @@ -403,7 +492,7 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "1.1" python-versions = "~=3.8" -content-hash = "89d5de02738bcf3f4a31eca13e4759300c5312821679bf90a58809024885e1a2" +content-hash = "3527f8a60e9adf40ccd4edbc67ca03e2dd188ed70987df71a72b926bb1dc6aff" [metadata.files] async-generator = [ @@ -508,6 +597,10 @@ cryptography = [ {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, ] +exceptiongroup 
= [ + {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, + {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, +] h11 = [ {file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"}, {file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"}, @@ -516,6 +609,10 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] loguru = [ {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, @@ -609,6 +706,10 @@ outcome = [ {file = "outcome-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958"}, {file = "outcome-1.1.0.tar.gz", hash = "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"}, ] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] pandas = [ {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907"}, {file = 
"pandas-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34"}, @@ -632,6 +733,10 @@ pandas = [ {file = "pandas-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf"}, {file = "pandas-1.4.1.tar.gz", hash = "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2"}, ] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -640,11 +745,19 @@ pyopenssl = [ {file = "pyOpenSSL-22.0.0-py2.py3-none-any.whl", hash = "sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"}, {file = "pyOpenSSL-22.0.0.tar.gz", hash = "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf"}, ] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] pysocks = [ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, ] +pytest = [ + {file = "pytest-7.2.0-py3-none-any.whl", hash = 
"sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, + {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -676,6 +789,13 @@ soupsieve = [ {file = "soupsieve-2.3.1-py3-none-any.whl", hash = "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb"}, {file = "soupsieve-2.3.1.tar.gz", hash = "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"}, ] +sqlitedict = [ + {file = "sqlitedict-2.0.0.tar.gz", hash = "sha256:23a370416f4e1e962daa293382f3a8dbc4127e6a0abc06a5d4e58e6902f05d17"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] tqdm = [ {file = "tqdm-4.63.0-py2.py3-none-any.whl", hash = "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29"}, {file = "tqdm-4.63.0.tar.gz", hash = "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd"}, diff --git a/pyproject.toml b/pyproject.toml index 74038a5..7c4842b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,9 +15,13 @@ click = "^8.0.4" loguru = "^0.6.0" tqdm = "^4.63.0" webdriver-manager = "^3.5.4" +sqlitedict = "^2.0.0" [tool.poetry.dev-dependencies] +[tool.poetry.group.dev.dependencies] +pytest = "^7.2.0" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/test/assets/data/dblp_processed.xml b/test/assets/data/dblp_processed.xml new file mode 100644 index 0000000..acf70d7 --- /dev/null +++ 
b/test/assets/data/dblp_processed.xml @@ -0,0 +1,89 @@ + + + + + Iván Cantador + Ignacio Fernández-Tobías + Shlomo Berkovsky + Paolo Cremonesi + Cross-Domain Recommender Systems. + 919-959 + 2015 + Recommender Systems Handbook + https://doi.org/10.1007/978-1-4899-7637-6_27 + reference/sp/2015rsh + db/reference/sp/rsh2015.html#CantadorFBC15 + + + Jeffrey V. Nickerson + Human-Based Evolutionary Computing. + 641-648 + 2013 + Handbook of Human Computation + https://doi.org/10.1007/978-1-4614-8806-4_51 + https://www.wikidata.org/entity/Q105641856 + reference/sp/2013hc + db/reference/sp/hc2013.html#Nickerson13 + + + Luciana S. Buriol + Network Optimization. + 1123-1140 + 2018 + Handbook of Heuristics + https://doi.org/10.1007/978-3-319-07124-4_46 + reference/sp/2018heuristics + db/reference/sp/heuristics2018.html#Buriol18 + + + Alexander Felfernig + Gerhard Friedrich + Dietmar Jannach + Markus Zanker + Constraint-Based Recommender Systems. + 161-190 + 2015 + Recommender Systems Handbook + https://doi.org/10.1007/978-1-4899-7637-6_5 + reference/sp/2015rsh + db/reference/sp/rsh2015.html#FelfernigFJZ15 + + + Fernando Sandoya + Anna Martínez-Gavara + Ricardo Aceves + Abraham Duarte + Rafael Martí + Diversity and Equity Models. + 979-998 + 2018 + Handbook of Heuristics + https://doi.org/10.1007/978-3-319-07124-4_61 + reference/sp/2018heuristics + db/reference/sp/heuristics2018.html#SandoyaMADM18 + + + Liane Gabora + Cultural Evolution as Distributed Computation. + 447-461 + 2013 + Handbook of Human Computation + https://doi.org/10.1007/978-1-4614-8806-4_34 + https://www.wikidata.org/entity/Q105641836 + reference/sp/2013hc + db/reference/sp/hc2013.html#Gabora13 + + + Jameson L. Toole + Yves-Alexandre de Montjoye + Marta C. González + Alex 'Sandy' Pentland + Modeling and Understanding Intrinsic Characteristics of Human Mobility. 
+ 13-34 + 2018 + reference/sp/2018mdp + Handbook of Mobile Data Privacy + https://doi.org/10.1007/978-3-319-98161-1_2 + db/reference/sp/mdp2018.html#TooleMGP18 + + diff --git a/test/test_preprocess.py b/test/test_preprocess.py new file mode 100644 index 0000000..8a78026 --- /dev/null +++ b/test/test_preprocess.py @@ -0,0 +1,56 @@ +import pytest +import shutil +from pathlib import Path +from paperscraper._preprocess import get_unique_venues, get_extracted_data +from paperscraper.config import Config + + +@pytest.fixture(scope="class") +def test_config(request, tmp_path_factory): + root_dir = Path(__file__).parent + output_dir = tmp_path_factory.mktemp("output") + _config = Config(root_dir=root_dir, output_dir=output_dir) + _config.interesting_venues = { + "Handbook of Human Computation": { + "sourcetype": "booktitle", + "publishers": [] + }, + "Recommender Systems Handbook": { + "sourcetype": "booktitle", + "publishers": [] + }, + "Handbook of Heuristics": { + "sourcetype": "booktitle", + "publishers": [] + } + } + yield _config + shutil.rmtree(str(output_dir)) + + +class Test_get_unique_venues: + def test_get_unique_venues_first(self, test_config): + result = get_unique_venues(test_config, force=True) + _len = len(result) + result.close(force=False) + assert _len == 4 + + def test_get_unique_venues_second(self, test_config): + result = get_unique_venues(test_config, force=False) + _len = len(result) + result.close(force=False) + assert _len == 4 + + +class Test_get_extracted_data: + def test_get_extracted_data_first(self, test_config): + result = get_extracted_data(test_config, force=True) + _len = len(result) + result.close(force=False) + assert _len == 6 + + def test_get_extracted_data_second(self, test_config): + result = get_extracted_data(test_config, force=False) + _len = len(result) + result.close(force=False) + assert _len == 6 From 2e79acaa58f2d00630d8d024af2212f673431143 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 18:16:34 -0800 
Subject: [PATCH 4/9] Adding post-process module --- paperscraper/_postprocess.py | 196 +++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 paperscraper/_postprocess.py diff --git a/paperscraper/_postprocess.py b/paperscraper/_postprocess.py new file mode 100644 index 0000000..7f57abc --- /dev/null +++ b/paperscraper/_postprocess.py @@ -0,0 +1,196 @@ +# External packages +import ast +import re +import string +import unicodedata + +from sqlitedict import SqliteDict +from loguru import logger +from tqdm import tqdm + +# Internal modules +from paperscraper.config import config, Config + +regex = re.compile(r'[\n\r\t]') +set_punctuations = set(string.punctuation) +set_numbers = set("0123456789") + +logger.remove() +logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) + + +def _clean_string(_string): + _string_normalized = unicodedata.normalize("NFKD", _string) + _string_stripped = str(regex.sub("", _string_normalized)).strip() + _string_recoded = _string_stripped.encode('ascii', 'ignore').decode('UTF-8') + return _string_recoded + + +def process_title(title_string): + """Ensure that there aren't new lines and that the titles are between X and Y characters in length.""" + try: + if not (5 < len(title_string) < 250): + return None + + return " ".join(title_string.split()) + except Exception: + # print(e) + return None + + +def process_abstract(abstract_string): + """Ensure that there aren't new lines and that the abstracts are between X and Y characters in length.""" + try: + if abstract_string in ["Not Scraped", "Error", "No Url"]: + return None + + if not (50 < len(abstract_string) < 2500): + return None + + return " ".join(abstract_string.split()) + except Exception: + # print(e) + return None + + +def process_authors(author_string): + """ + Convert utf-8 characters to ascii so that they are searchable via a keyboard. 
+ + (will result in data loss but ignore errors) + """ + try: + author_list = ast.literal_eval(author_string) + if isinstance(author_list, list): + recoded_author_list = [string.capwords(_author.encode('ascii', 'ignore').decode('UTF-8')) for _author in author_list] + return str(recoded_author_list) + except Exception: + # print(e) + pass + return author_string + + +def process_citation_counts(citation_count_string): + """Ensure that this is always NONE or NUMERIC.""" + try: + if not citation_count_string.isnumeric(): + return None + else: + return citation_count_string + except Exception: + return None + + +def process_keywords(keywords_string): + """ + Convert utf-8 characters to ascii so that they are searchable via a keyboard. + + (will result in data loss but ignore errors) + """ + try: + keywords_list = ast.literal_eval(keywords_string) + if isinstance(keywords_list, list): + processed_keywords_list = list() + + for _keyword in keywords_list: + if "→" in _keyword: + kws = _keyword.split("→") + for kw in kws: + processed_keywords_list.append(kw) + elif "Key words: " in _keyword: + _keyword = re.sub("Key words: ","",_keyword) + kws = _keyword.split(" – ") + for kw in kws: + processed_keywords_list.append(kw) + else: + processed_keywords_list.append(_keyword) + + # Start with removing Nones. + processed_keywords_list = list(filter(None, processed_keywords_list)) + + # Make them all lower-case for case insensitive match to be successful. + processed_keywords_list = [str(kw).lower() for kw in processed_keywords_list] + + # Clean the Keyword String + processed_keywords_list = [_clean_string(kw) for kw in processed_keywords_list] + + # Remove weird phrases in the Keyword that sometimes happens based on how it's maintained on the Publisher's website. 
+ _interim_processed_list = [] + for kw in processed_keywords_list: + for regex in config.keyword_patterns_to_remove: + kw = re.sub(regex, "", kw) + _interim_processed_list.append(kw) + processed_keywords_list = _interim_processed_list + + # Remove keywords if it has Only keywords or Only punctuations + processed_keywords_list = [i for i in processed_keywords_list if not all(j in set_punctuations or j in set_numbers for j in i)] + + # Finally, Remove None's again. + processed_keywords_list = list(filter(None, processed_keywords_list)) + + # Merge Different Variations of the same Keyword + _interim_processed_list = [] + for kw in processed_keywords_list: + if kw in config.keywords_to_merge: + _interim_processed_list.append(config.keywords_to_merge[kw]) + else: + _interim_processed_list.append(kw) + processed_keywords_list = _interim_processed_list + + # And of course, de-duplicate if some have both HCI and Human-Computer Interaction initially. + processed_keywords_list = list(set(processed_keywords_list)) + + # Let's capitalize the keywords so that they look nice. 
+ processed_keywords_list = [string.capwords(kw) for kw in processed_keywords_list] + + return str(processed_keywords_list) + except Exception: + # print(e) + pass + return None + + +def get_post_processed_data(config: Config, force: bool = False) -> SqliteDict: + """Process fields and return them.""" + if force or not config.path_output.exists(): + # Read it + papers_db = SqliteDict(config.path_output) + + author_processed = [] + keywords_processed = [] + citation_count_processed = [] + abstract_processed = [] + title_processed = [] + + for index, row in tqdm(papers_db.items(), desc="Papers", total=len(papers_db)): + # Process authors + author_processed.append(process_authors(row["author"])) + + # Process keywords + keywords_processed.append(process_keywords(row["keywords"])) + + # Process citation counts + citation_count_processed.append(process_citation_counts(row["citation_count"])) + + # Process abstract + abstract_processed.append(process_abstract(row["abstract"])) + + # Process titles + title_processed.append(process_title(row["title"])) + + papers_db.close() + + # Commit all the data to db + scraped_input_db = SqliteDict(config.path_postprocessing_output) + scraped_input_db["author_processed"] = author_processed + scraped_input_db["keywords_processed"] = keywords_processed + scraped_input_db["citation_count_processed"] = citation_count_processed + scraped_input_db["abstract_processed"] = abstract_processed + scraped_input_db["title_processed"] = title_processed + + # Save POST-PROCESSED FILE + scraped_input_db.commit() + else: + scraped_input_db = SqliteDict(config.path_postprocessing_output) + + return scraped_input_db From 49321cfbb695d7983c2c8928492760a55dc1129f Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:48:34 -0800 Subject: [PATCH 5/9] Merge get_(unique_venues|extracted_data) & memory optimize xml iter --- paperscraper/_preprocess.py | 72 ++++++++++++++++++++----------------- test/test_preprocess.py | 36 +++++++------------ 2 
files changed, 52 insertions(+), 56 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index b284f2f..a1557e2 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -63,18 +63,30 @@ def get_processed_db(force: bool = False) -> Path: return config.path_input -# TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: +# TODO: Re-run this if +# (1) The list has changed or +# (2) There is a NEW DBLP snapshot. +def get_extracted_data(config: Config, force: bool = False) -> tuple[SqliteDict, SqliteDict]: """ - Find Unique venues from the DBLP xml. + FILTER the huge dblp_processed.xml file to keep just the data that we are interested in and Find Unique venues from the DBLP xml. - Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. + For unqiue venues looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. 
""" - if force or not config.path_unique_venues.exists(): + if force or not config.path_output.exists(): + logger.info(f"Extracting venues to {config.path_unique_venues}") unique_sources = SqliteDict(config.path_unique_venues) unique_sources.clear() # empty the db - logger.info(f"Extracting venues to {config.path_unique_venues}") - for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): + + logger.info(f"Extracting data to {config.path_output}") + result_list = SqliteDict(config.path_output) + result_list.clear() # empty the db + src_set = set() + + _idx: dict[int, int] = {0: 0} + + for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', events=("end", ) ,recover=True), desc="Entry"): + _idx[0] += 1 + if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: @@ -89,27 +101,6 @@ def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: child_dict["count"] += 1 unique_sources[child.text] = child_dict - logger.debug("Writing to disk") - # Save it to disk - unique_sources.commit() - else: - logger.info(f"Loading data from {config.path_unique_venues}") - unique_sources = SqliteDict(config.path_unique_venues) - - return unique_sources - - -# TODO: Re-run this if -# (1) The list has changed or -# (2) There is a NEW DBLP snapshot. -def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: - """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" - if force or not config.path_output.exists(): - logger.info(f"Extracting data to {config.path_output}") - result_list = SqliteDict(config.path_output) - result_list.clear() # empty the db - src_set = set() - for _idx, (event, elem) in tqdm(enumerate(ET.iterparse(config.path_input, encoding='UTF-8', recover=True)), desc="Entry"): obj: dict = {} # Initialize the fields that we are going to scrape. 
# TODO: Update these if more fields are added. @@ -141,20 +132,35 @@ def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: logger.debug(f"Adding source: {child.text}") if to_add: - result_list[_idx] = obj + result_list[_idx[0]] = obj # Periodically commiting stuff - if _idx % 100 == 0: + if _idx[0] % 200000 == 0: + unique_sources.commit() result_list.commit() + # from https://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files + # http://lxml.de/parsing.html#modifying-the-tree + # Based on Liza Daly's fast_iter + # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + # See also http://effbot.org/zone/element-iterparse.htm + # NOTE: deleting only the 2nd level nodes + if len(elem.getroottree().getpath(elem).split("/")) <= 3: + elem.clear() + while elem.getprevious() is not None: + del elem.getparent()[0] + logger.debug("Writing to disk") # Save to disk + unique_sources.commit() result_list.commit() else: logger.info(f"Loading data from {config.path_output}") result_list = SqliteDict(config.path_output) + logger.info(f"Loading data from {config.path_unique_venues}") + unique_sources = SqliteDict(config.path_unique_venues) - return result_list + return result_list, unique_sources # get a new headless Chrome driver @@ -337,11 +343,11 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: # Persist the paper file papers_db.commit() - logger.i("scraped papers saved to disk.") + logger.info("scraped papers saved to disk.") # Persist Logs df_logs = pd.DataFrame.from_dict(log_obj, orient="index") - logger.i(log_obj) + logger.info(log_obj) df_logs.to_csv(config.path_logfile, sep='\t', header=True) else: logger.info(f"Loading processed data from {config.path_output}") diff --git a/test/test_preprocess.py b/test/test_preprocess.py index 8a78026..c3d2530 100644 --- a/test/test_preprocess.py +++ b/test/test_preprocess.py @@ -1,7 +1,7 @@ import pytest import shutil from pathlib import Path -from 
paperscraper._preprocess import get_unique_venues, get_extracted_data +from paperscraper._preprocess import get_extracted_data from paperscraper.config import Config @@ -28,29 +28,19 @@ def test_config(request, tmp_path_factory): shutil.rmtree(str(output_dir)) -class Test_get_unique_venues: - def test_get_unique_venues_first(self, test_config): - result = get_unique_venues(test_config, force=True) - _len = len(result) - result.close(force=False) - assert _len == 4 - - def test_get_unique_venues_second(self, test_config): - result = get_unique_venues(test_config, force=False) - _len = len(result) - result.close(force=False) - assert _len == 4 - - class Test_get_extracted_data: + def _get_extracted_data_results(self, data, venues): + _len_data = len(data) + data.close(force=True) + _len_venues = len(venues) + venues.close(force=True) + assert _len_data == 6 + assert _len_venues == 4 + def test_get_extracted_data_first(self, test_config): - result = get_extracted_data(test_config, force=True) - _len = len(result) - result.close(force=False) - assert _len == 6 + data, venues = get_extracted_data(test_config, force=True) + self._get_extracted_data_results(data, venues) def test_get_extracted_data_second(self, test_config): - result = get_extracted_data(test_config, force=False) - _len = len(result) - result.close(force=False) - assert _len == 6 + data, venues = get_extracted_data(test_config, force=False) + self._get_extracted_data_results(data, venues) From 4b449ec13503f4dd657519bbb50461c7f6f81013 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:50:02 -0800 Subject: [PATCH 6/9] Update cli & adding tests --- paperscraper/_cli.py | 43 ++++++++++++++++++++++++++++---- test/test_cli.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 test/test_cli.py diff --git a/paperscraper/_cli.py b/paperscraper/_cli.py index 9bbcd28..d6be844 100644 --- a/paperscraper/_cli.py +++ 
b/paperscraper/_cli.py @@ -1,20 +1,53 @@ import click -from paperscraper._preprocess import (get_processed_db, get_unique_venues, get_extracted_data, get_processed_data) +from paperscraper._preprocess import (get_processed_db, get_extracted_data, get_processed_data) +from paperscraper._postprocess import get_post_processed_data from paperscraper.config import config @click.group() def cli(): + """Cli interface for paperscraper.""" pass @cli.group() def process(): + """Process and setup database.""" pass + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def process_db(force): + """Process the dblp xml file.""" + get_processed_db(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def extract_data(force): + """Extract data from processed dblp xml file.""" + get_extracted_data(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def process_data(force): + """Process extracted data.""" + get_processed_data(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def post_process_data(force): + """Run cleanup process after processing data.""" + get_post_processed_data(config=config, force=force) + + @process.command() @click.option("-f", "--force", help="Force run all steps", is_flag=True) def run_all(force): - get_processed_db(force=False) - get_unique_venues(config, force=False) - get_extracted_data(config, force=False) - get_processed_data(config, force=force) + """Run all steps in order.""" + get_processed_db(config=config, force=force) + get_extracted_data(config=config, force=force) + get_processed_data(config=config, force=force) + get_post_processed_data(config=config, force=force) diff --git a/test/test_cli.py b/test/test_cli.py new file mode 100644 index 0000000..b7b260d --- /dev/null +++ 
b/test/test_cli.py @@ -0,0 +1,58 @@ +import pytest +import importlib +from click.testing import CliRunner +import paperscraper +import pytest_mock + +import paperscraper._cli + + +@pytest.fixture(scope="function") +def runner(): + return CliRunner() + + +def called_with_config_and_force(mocked_function): + mocked_function.assert_called_with(config=paperscraper._cli.config, force=True) + + +def mock_function(mocker, mock_function): + mocker.patch(mock_function) + # Before the main methods gets imported need to mock them + importlib.reload(paperscraper._cli) + + +def test_process_db(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_db") + result = runner.invoke(paperscraper._cli.cli, ["process", "process-db", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_db) + + +def test_extract_data(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_extracted_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "extract-data", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_extracted_data) + + +def test_process_data(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "process-data", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_data) + + +def test_post_process_data(runner, mocker): + mock_function(mocker, "paperscraper._postprocess.get_post_processed_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "post-process-data", "-f"]) + called_with_config_and_force(paperscraper._postprocess.get_post_processed_data) + + +def test_run_all(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_db") + mock_function(mocker, "paperscraper._preprocess.get_extracted_data") + mock_function(mocker, "paperscraper._preprocess.get_processed_data") + mock_function(mocker, "paperscraper._postprocess.get_post_processed_data") 
+ result = runner.invoke(paperscraper._cli.cli, ["process", "run-all", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_db) + called_with_config_and_force(paperscraper._preprocess.get_extracted_data) + called_with_config_and_force(paperscraper._preprocess.get_processed_data) + called_with_config_and_force(paperscraper._postprocess.get_post_processed_data) From 4e79979fd029d1e0409bdceec348d8ed8a3370b1 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:50:22 -0800 Subject: [PATCH 7/9] Adding dev dependencies & add .xml.gz to gitignore --- .gitignore | 1 + poetry.lock | 90 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 3 ++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ad07f50..34882de 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ chromedriver .DS_Store *.exe *.xml +*.xml.gz *.log .idea/ *.pyc diff --git a/poetry.lock b/poetry.lock index dfe74ba..3c92596 100644 --- a/poetry.lock +++ b/poetry.lock @@ -103,6 +103,14 @@ sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] +[[package]] +name = "debugpy" +version = "1.6.3" +description = "An implementation of the Debug Adapter Protocol for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "exceptiongroup" version = "1.0.4" @@ -167,6 +175,17 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] +[[package]] +name = "memory-profiler" +version = "0.61.0" +description = "A module for monitoring memory usage of a python program" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +psutil = "*" + [[package]] name = "numpy" version = "1.22.3" @@ -230,6 +249,17 @@ python-versions = ">=3.6" dev = ["pre-commit", "tox"] testing = ["pytest", 
"pytest-benchmark"] +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + [[package]] name = "pycparser" version = "2.21" @@ -292,6 +322,20 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +[[package]] +name = "pytest-mock" +version = "3.10.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=5.0" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -492,7 +536,7 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "1.1" python-versions = "~=3.8" -content-hash = "3527f8a60e9adf40ccd4edbc67ca03e2dd188ed70987df71a72b926bb1dc6aff" +content-hash = "963f00872e5cf8e48cf9a053276d77ea593d40b80b6c670c1c2e7e5d37309c33" [metadata.files] async-generator = [ @@ -597,6 +641,26 @@ cryptography = [ {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, ] +debugpy = [ + {file = "debugpy-1.6.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:c4b2bd5c245eeb49824bf7e539f95fb17f9a756186e51c3e513e32999d8846f3"}, + {file = "debugpy-1.6.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b8deaeb779699350deeed835322730a3efec170b88927debc9ba07a1a38e2585"}, + {file = "debugpy-1.6.3-cp310-cp310-win32.whl", hash = 
"sha256:fc233a0160f3b117b20216f1169e7211b83235e3cd6749bcdd8dbb72177030c7"}, + {file = "debugpy-1.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:dda8652520eae3945833e061cbe2993ad94a0b545aebd62e4e6b80ee616c76b2"}, + {file = "debugpy-1.6.3-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:d5c814596a170a0a58fa6fad74947e30bfd7e192a5d2d7bd6a12156c2899e13a"}, + {file = "debugpy-1.6.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c4cd6f37e3c168080d61d698390dfe2cd9e74ebf80b448069822a15dadcda57d"}, + {file = "debugpy-1.6.3-cp37-cp37m-win32.whl", hash = "sha256:3c9f985944a30cfc9ae4306ac6a27b9c31dba72ca943214dad4a0ab3840f6161"}, + {file = "debugpy-1.6.3-cp37-cp37m-win_amd64.whl", hash = "sha256:5ad571a36cec137ae6ed951d0ff75b5e092e9af6683da084753231150cbc5b25"}, + {file = "debugpy-1.6.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:adcfea5ea06d55d505375995e150c06445e2b20cd12885bcae566148c076636b"}, + {file = "debugpy-1.6.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:daadab4403427abd090eccb38d8901afd8b393e01fd243048fab3f1d7132abb4"}, + {file = "debugpy-1.6.3-cp38-cp38-win32.whl", hash = "sha256:6efc30325b68e451118b795eff6fe8488253ca3958251d5158106d9c87581bc6"}, + {file = "debugpy-1.6.3-cp38-cp38-win_amd64.whl", hash = "sha256:86d784b72c5411c833af1cd45b83d80c252b77c3bfdb43db17c441d772f4c734"}, + {file = "debugpy-1.6.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4e255982552b0edfe3a6264438dbd62d404baa6556a81a88f9420d3ed79b06ae"}, + {file = "debugpy-1.6.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cca23cb6161ac89698d629d892520327dd1be9321c0960e610bbcb807232b45d"}, + {file = "debugpy-1.6.3-cp39-cp39-win32.whl", hash = "sha256:7c302095a81be0d5c19f6529b600bac971440db3e226dce85347cc27e6a61908"}, + {file = "debugpy-1.6.3-cp39-cp39-win_amd64.whl", hash = 
"sha256:34d2cdd3a7c87302ba5322b86e79c32c2115be396f3f09ca13306d8a04fe0f16"}, + {file = "debugpy-1.6.3-py2.py3-none-any.whl", hash = "sha256:84c39940a0cac410bf6aa4db00ba174f973eef521fbe9dd058e26bcabad89c4f"}, + {file = "debugpy-1.6.3.zip", hash = "sha256:e8922090514a890eec99cfb991bab872dd2e353ebb793164d5f01c362b9a40bf"}, +] exceptiongroup = [ {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, @@ -680,6 +744,10 @@ lxml = [ {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93"}, {file = "lxml-4.8.0.tar.gz", hash = "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23"}, ] +memory-profiler = [ + {file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"}, + {file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"}, +] numpy = [ {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"}, {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"}, @@ -737,6 +805,22 @@ pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] +psutil = [ + {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, + {file = 
"psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, + {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, + {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, + {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, + {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, + {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, +] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = 
"sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -758,6 +842,10 @@ pytest = [ {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, ] +pytest-mock = [ + {file = "pytest-mock-3.10.0.tar.gz", hash = "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"}, + {file = "pytest_mock-3.10.0-py3-none-any.whl", hash = "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, diff --git a/pyproject.toml b/pyproject.toml index 7c4842b..066b74a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ sqlitedict = "^2.0.0" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" +debugpy = "^1.6.3" +memory-profiler = "^0.61.0" +pytest-mock = "^3.10.0" [build-system] requires = ["poetry-core>=1.0.0"] From d365b44e8bf09d21b772b22974fb1113c832bc3c Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 17:39:22 -0800 Subject: [PATCH 8/9] Improved logging during get_processed_data --- paperscraper/__init__.py | 25 ++++++++++++++++ paperscraper/_preprocess.py | 47 +++++++++++++++++++++++++++---- paperscraper/config.py | 6 +++- paperscraper/scrapers/keywords.py | 43 +++++++++++++++------------- 4 files changed, 95 insertions(+), 26 deletions(-) diff --git a/paperscraper/__init__.py b/paperscraper/__init__.py index e69de29..6088c6c 100644 --- a/paperscraper/__init__.py +++ b/paperscraper/__init__.py @@ -0,0 +1,25 @@ +from 
importlib.metadata import version +import logging +from loguru import logger + +__version__ = version(__package__) + + +class __InterceptHandler(logging.Handler): + def emit(self, record): + # Get corresponding Loguru level if it exists + try: + level = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message + frame, depth = logging.currentframe(), 2 + while frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth += 1 + + logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage()) + + +logging.basicConfig(handlers=[__InterceptHandler()], level=0) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index a1557e2..ad1c952 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -1,4 +1,4 @@ -import ast +import sys import re import time from pathlib import Path @@ -13,6 +13,7 @@ from sqlitedict import SqliteDict from tqdm import tqdm from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from paperscraper.config import Config, config from paperscraper.scrapers.abstracts import get_abstract @@ -167,16 +168,51 @@ def get_extracted_data(config: Config, force: bool = False) -> tuple[SqliteDict, def _get_webdriver_instance(): chrome_options = Options() chrome_options.add_argument("--headless") + chrome_desired_capabilities = DesiredCapabilities.CHROME + chrome_desired_capabilities['goog:loggingPrefs'] = { 'browser':'ALL' } # chrome_options.binary_location = config.path_chromeoptions_binary # driver = webdriver.chrome(executable_path=config.path_chromedriver, chrome_options=chrome_options) driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options) # driver.implicitly_wait(10000) + driver._old_get_method = driver.get + driver.get = lambda *args, **kwargs: 
get_browser_log_entries(driver, *args, **kwargs) return driver -def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: +def get_browser_log_entries(driver, *args, **kwargs): + """get log entreies from selenium and add to python logger before returning""" + ret_val = driver._old_get_method(*args, **kwargs) + loglevels = { + 'NOTSET': 'TRACE' , + 'DEBUG': 'DEBUG' , + 'INFO': 'INFO' , + 'WARNING':'WARNING', + 'ERROR': 'ERROR', + 'SEVERE':'ERROR', + 'CRITICAL':'CRITICAL' + } + + #get browser logs + slurped_logs = driver.get_log('browser') + for entry in slurped_logs: + #convert broswer log to python log format + rec = logger.log(loglevels.get(entry['level']), "{}: {}".format(entry['source'], entry['message'])) + # rec.created = entry['timestamp'] /1000 # log using original timestamp.. us -> ms + # try: + # #add browser log to python log + # browserlog.handle(rec) + # except: + # print(entry) + #and return logs incase you want them + return ret_val + + +def get_processed_data(config: Config, force: bool = False) -> SqliteDict: """Scrap the Abstracts, Keywords, and Citations.""" + + logger.add(config.path_console_log_file) + if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) logger.info(f"Processing data to {config.path_output}") @@ -210,6 +246,7 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: log_obj[row["source"]]["no_of_citations_fetch_errors"] = 0 log_obj[row["source"]]["no_of_citations_errors"] = 0 + logger.debug("Processing {} ".format(row["title"])) # Increment no of papers log_obj[row["source"]]["papers"] += 1 @@ -247,7 +284,7 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: abstract_soup = BeautifulSoup(driver.page_source, 'lxml') except Exception as e: - logger.error('Abstract: ' + str(e)) + logger.error(f'{index} Abstract: ' + str(e)) if abstract_soup is not None: is_abstract = False @@ -327,8 +364,8 @@ def get_processed_data(cofig: Config, force: 
bool = False) -> SqliteDict: log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 - except Exception: - pass + except Exception as e: + logger.error(f'{index} Keywords: ' + str(e)) if not is_keyword: row['keywords'] = "Error" diff --git a/paperscraper/config.py b/paperscraper/config.py index dacc934..64c04c4 100644 --- a/paperscraper/config.py +++ b/paperscraper/config.py @@ -1,3 +1,4 @@ +from datetime import datetime from pathlib import Path from typing import Union @@ -32,7 +33,10 @@ def __init__(self, root_dir: Union[str, Path] = None, self.path_unique_venues = output_dir / "unique_venues.db" self.path_unique_keywords = output_dir / "unique_keywords.tsv" self.path_unique_authors = output_dir/ "unique_authors.tsv" - self.path_logfile = output_dir / "log.tsv" + + datetime_str = f"{datetime.now():%Y-%m-%d_%H-%M-%S%z}" + self.path_logfile = output_dir / f"log-{datetime_str}.tsv" + self.path_console_log_file = output_dir / f"console-{datetime_str}.log" # ChromeDriver # TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH diff --git a/paperscraper/scrapers/keywords.py b/paperscraper/scrapers/keywords.py index e6e9ffb..85cc14b 100644 --- a/paperscraper/scrapers/keywords.py +++ b/paperscraper/scrapers/keywords.py @@ -1,10 +1,13 @@ import re +from loguru import logger regex = re.compile(r'[\n\r\t]') def acm_digital_library(soup): try: + # TODO: Get keyoards by clicking on the citation linke (soup.select('a[data-title="Export Citation"]')) + # Then using the the ActionChains from selenium to click, parse the bib result and get keywords keywords = set() keywords_parent_ol = soup.find('ol', class_="rlist organizational-chart") keywords_divs = keywords_parent_ol.findChildren('div', recursive=True) @@ -13,12 +16,12 @@ def acm_digital_library(soup): keywords.add(regex.sub("", kw.split(",")[0])) return list(keywords) except Exception as 
e: - print(e) - return None + logger.error(e) + return [] def graphics_interface_proceedings(soup): - return None + return [] def ieee_explore(soup): @@ -36,8 +39,8 @@ def ieee_explore(soup): keywords.add(str(regex.sub("", str(keywords_l.text).split(",")[0]))) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def eurographics_digital_library(soup): @@ -65,8 +68,8 @@ def eurographics_digital_library(soup): keywords_set.update(re.split(',|:|;', keywords_str)) return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def springer_v2(soup): @@ -78,8 +81,8 @@ def springer_v2(soup): keywords.add(k.text) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def dagstuhl(soup): @@ -91,8 +94,8 @@ def dagstuhl(soup): if keywords_font is not None: return re.split(',', keywords_font.text) except Exception as e: - print(e) - return None + logger.error(e) + return [] def springer_v1(soup): @@ -105,8 +108,8 @@ def springer_v1(soup): keywords.add(str(regex.sub("", kw)).strip()) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def wiley_online_library(soup): @@ -138,12 +141,12 @@ def wiley_online_library(soup): return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def cogsci(soup): - return None + return [] def scitepress(soup): @@ -154,8 +157,8 @@ def scitepress(soup): keywords_set.add(kw) return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def scienceopen(soup): @@ -168,11 +171,11 @@ def scienceopen(soup): return list(keywords_set) except Exception as e: pass - return None + return [] def aaai(soup): - return None + return [] def get_keywords(publisher, soup): From 2d0b3164bab0ec6a2b4a259be9df95462812d8d3 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 17:40:01 -0800 
Subject: [PATCH 9/9] Fixes in get_processed_data - direct use url, commit every step --- paperscraper/_preprocess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index ad1c952..7cbde0f 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -253,13 +253,13 @@ def get_processed_data(config: Config, force: bool = False) -> SqliteDict: # Get the URLs urls = [] try: - urls = ast.literal_eval(row["ee"]) + urls = row["ee"] except Exception: # If not ee, check url. # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a # BaseURL that is unknown. Hence, it will fail 99% of the times. try: - urls = ast.literal_eval(row["url"]) + urls = row["url"] except Exception: pass @@ -375,8 +375,7 @@ def get_processed_data(config: Config, force: bool = False) -> SqliteDict: papers_db[index] = row - if index % 100 == 100: - papers_db.commit() + papers_db.commit() # Persist the paper file papers_db.commit()