import click

from paperscraper._preprocess import (
    get_processed_db,
    get_unique_venues,
    get_extracted_data,
    get_processed_data,
)


@click.group()
def cli():
    """paperscraper command-line interface root group."""


@cli.group()
def process():
    """Commands that drive the preprocessing pipeline."""


@process.command()
@click.option("-f", "--force", help="Force run all steps", is_flag=True)
def run_all(force):
    """Run every preprocessing step in order.

    BUG FIX: the ``--force`` flag is documented as "Force run all steps",
    but it was previously only passed to the final step
    (``get_processed_data``) while the first three stages were hard-coded
    to ``force=False``.  The flag is now propagated to every stage.
    """
    get_processed_db(force=force)
    get_unique_venues(force=force)
    get_extracted_data(force=force)
    get_processed_data(force=force)
def _escape_ampersands(line: str) -> str:
    """Return *line* with every ``&`` replaced by the placeholder ``%26``.

    The previous implementation ran ``re.sub(r'(.*)&(.*)', r'\\1%26\\2', line)``
    in a fixed-point loop.  Because both ``.*`` groups are greedy, each pass
    rewrites exactly one ampersand, making the loop quadratic in the number of
    ampersands per line.  ``.*`` never crosses a newline and the line is a
    single physical line, so a single ``str.replace`` yields the identical
    result in one linear pass.
    """
    return line.replace("&", "%26")


def get_processed_db(force: bool = False) -> Path:
    """Produce a cleaned copy of the raw DBLP dump.

    Replaces every ``&`` in the raw XML with the placeholder ``%26`` (DBLP
    dumps contain bare ampersands that break XML parsing); the placeholder is
    converted back to ``&`` downstream when field text is extracted.

    Args:
        force: Re-run the cleaning even if the processed file already exists.

    Returns:
        Path to the processed DBLP XML file (``config.path_input``).
    """
    if force or not config.path_input.exists():
        logger.info(f"Cleaning data from {config.path_input_raw} into {config.path_input}")
        with open(config.path_input_raw, "r") as raw_dblp, \
                open(config.path_input, "w") as processed_dblp:
            for line in tqdm(raw_dblp, desc="Raw file line"):
                processed_dblp.write(_escape_ampersands(line))

    return config.path_input
def get_unique_venues(force: bool = False) -> pd.DataFrame:
    """Collect the unique publication venues found in the processed DBLP XML.

    Scans ``article``/``inproceedings``/``incollection`` entries and counts
    occurrences of each ``journal``/``booktitle`` value.  Results are cached
    to ``config.path_unique_venues`` and loaded from there on later calls.

    TODO: Re-run this if (1) the tag lists above change OR (2) there is a
    new DBLP snapshot.

    Args:
        force: Re-extract even if the cached TSV already exists.

    Returns:
        DataFrame indexed by venue name with ``count``, ``child_tag`` and
        ``elem_tag`` columns (note: when loaded from the cache the venue
        name comes back as an unnamed column rather than the index).
    """
    if force or not config.path_unique_venues.exists():
        logger.info(f"Extracting venues to {config.path_unique_venues}")
        unique_sources = dict()
        for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"):
            if elem.tag in ["article", "inproceedings", "incollection"]:
                # ``getchildren()`` is deprecated in lxml (and removed from the
                # stdlib ElementTree); iterating the element directly is the
                # supported equivalent.
                for child in elem:
                    if child.tag in ["journal", "booktitle"]:
                        if child.text not in unique_sources:
                            unique_sources[child.text] = {
                                "count": 0,
                                "child_tag": child.tag,
                                "elem_tag": elem.tag,
                            }
                        unique_sources[child.text]["count"] += 1

        # Create a Pandas DataFrame and cache it to disk.
        df_unique_sources = pd.DataFrame.from_dict(unique_sources, orient="index")

        logger.debug("Writing to disk")
        df_unique_sources.to_csv(config.path_unique_venues, header=True, sep='\t')
    else:
        logger.info(f"Loading data from {config.path_unique_venues}")
        df_unique_sources = pd.read_csv(config.path_unique_venues, header=0, sep='\t')

    return df_unique_sources
def get_extracted_data(force: bool = False) -> pd.DataFrame:
    """Filter the processed DBLP XML down to the venues of interest.

    Builds (or loads, when the output already exists and *force* is False) a
    DataFrame with one row per publication whose venue appears in
    ``config.interesting_venues``, and initialises the ``abstract``,
    ``keywords`` and ``citation_count`` columns that the scraping stage
    fills in later.

    TODO: Re-run this if (1) the venue list changes or (2) there is a new
    DBLP snapshot.
    """
    if force or not config.path_output.exists():
        logger.info(f"Extracting data to {config.path_output}")
        result_list = list()
        src_set = set()
        for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"):
            obj = dict()
            to_add = False
            # ``getchildren()`` is deprecated in lxml; iterate directly.
            for child in elem:
                if child.tag not in obj:
                    # author/ee/url may occur multiple times -> collect lists.
                    if child.tag in ["author", "ee", "url"]:
                        obj[child.tag] = list()
                    else:
                        obj[child.tag] = None

                if child.tag in ["author", "ee", "url"]:
                    if child.text is not None:
                        # Undo the %26 placeholder written by get_processed_db.
                        obj[child.tag].append(child.text.replace("%26", "&"))
                    else:
                        obj[child.tag].append(child.text)
                else:
                    obj[child.tag] = child.text  # title, year, pgs

                # Only consider adding entries from the sources defined above.
                if child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]:
                    obj["source"] = child.text
                    to_add = True
                    if child.text not in src_set:
                        src_set.add(child.text)
                        logger.debug(f"Adding source: {child.text}")

            if to_add:
                result_list.append(obj)

        # Create a DataFrame.
        df_result_list = pd.DataFrame(result_list)

        # Initialize the fields that the scraping stage will fill in.
        # TODO: Update these if more fields are added.
        df_result_list["abstract"] = "Not Scraped"
        df_result_list["keywords"] = "Not Scraped"
        df_result_list["citation_count"] = "Not Scraped"

        logger.debug("Writing to disk")
        df_result_list.to_csv(config.path_output, sep='\t', header=True)
    else:
        logger.info(f"Loading data from {config.path_output}")
        df_result_list = pd.read_csv(config.path_output, sep='\t', header=0)

    return df_result_list


def _get_webdriver_instance():
    """Return a new headless Chrome webdriver managed by webdriver-manager."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # ``chrome_options=`` is deprecated in Selenium 4; the supported keyword
    # is ``options=``.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=chrome_options)
    return driver


def get_processed_data(force: bool = False) -> pd.DataFrame:
    """Scrape abstracts, keywords and citation counts for every paper.

    Reads the TSV produced by :func:`get_extracted_data`, visits each paper's
    first URL with a headless Chrome instance, and fills the ``abstract``,
    ``keywords`` and ``citation_count`` columns via the publisher-specific
    scrapers.  Only rows whose current scrape state matches
    ``__scraper_filter`` and whose venue is in ``__publication_src`` are
    visited, so the function can be re-run to retry earlier failures.
    A per-venue error summary is written to ``config.path_logfile``.

    BUG FIX: the two ``logger.i(...)`` calls at the end raised
    ``AttributeError`` (loguru's logger has no ``i`` method), so the log
    summary TSV was never written; they are now ``logger.info(...)``.
    """
    if force or not config.path_output.exists():
        # Get a webdriver instance (headless Chrome).
        logger.info(f"Processing data to {config.path_output}")
        driver = _get_webdriver_instance()

        # Read the base datafile.
        # NOTE(review): this assumes get_extracted_data() already created the
        # file; with force=True on a clean checkout this read fails — confirm
        # the intended call order.
        df_papers = pd.read_csv(config.path_output, sep='\t', header=0)

        # Per-venue error counters, persisted at the end for run analysis.
        log_obj = dict()

        # Start scraping.
        for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]):

            # ToDo: Keep checking this high-level filter to minimize iterations.
            if (str(row["abstract"]) in __scraper_filter["abstract"] or
                    str(row["keywords"]) in __scraper_filter["keywords"] or
                    str(row["citation_count"]) in __scraper_filter["citation_count"]) \
                    and row["source"] in __publication_src:

                if row["source"] not in log_obj:
                    log_obj[row["source"]] = {
                        "papers": 0,
                        "abstract_parse_errors": 0,
                        "abstract_fetch_errors": 0,
                        "abstract_errors": 0,
                        "keyword_parse_errors": 0,
                        "keyword_fetch_errors": 0,
                        "keyword_errors": 0,
                        "no_of_citations_parse_errors": 0,
                        "no_of_citations_fetch_errors": 0,
                        "no_of_citations_errors": 0,
                    }

                # Increment number of papers seen for this venue.
                log_obj[row["source"]]["papers"] += 1

                # Get the URLs: "ee" is preferred.  "url" entries lack an
                # HTTP/HTTPS scheme (relative paths from an unknown base URL)
                # and fail almost every time.
                urls = []
                try:
                    urls = ast.literal_eval(row["ee"])
                except Exception:
                    try:
                        urls = ast.literal_eval(row["url"])
                    except Exception:
                        pass

                # No usable URL: empty list, or a relative dblp "db/" path.
                if len(urls) == 0 or urls[0].startswith("db/"):
                    df_papers.at[index, 'abstract'] = "No Url"
                    df_papers.at[index, 'keywords'] = "No Url"
                    df_papers.at[index, 'citation_count'] = "No Url"
                    logger.error(str(index) + " [No URL]: " + str(row["title"]))
                    continue

                # ABSTRACT
                abstract_soup = None
                try:
                    driver.get(urls[0])

                    # Delay to ensure routing is complete and the page renders.
                    time.sleep(1.5)

                    # Initialize the Soup object.
                    abstract_soup = BeautifulSoup(driver.page_source, 'lxml')

                except Exception as e:
                    logger.error('Abstract: ' + str(e))

                if abstract_soup is not None:
                    is_abstract = False
                    for publisher in config.interesting_venues[row["source"]]["publishers"]:
                        abstract = get_abstract(publisher, abstract_soup)
                        if abstract is not None:
                            df_papers.at[index, 'abstract'] = abstract
                            logger.info(str(index) + " [Success][Abstract] " + str(urls[0]) + " " + str(abstract)[:50])
                            is_abstract = True
                            break

                    if not is_abstract:
                        df_papers.at[index, 'abstract'] = "Error"
                        logger.error(str(index) + " [Abstract Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                        log_obj[row["source"]]["abstract_parse_errors"] += 1
                        log_obj[row["source"]]["abstract_errors"] += 1

                else:
                    df_papers.at[index, 'abstract'] = "Error"
                    logger.error(str(index) + " [Abstract URL Fetch]: " + str(row["source"]))
                    log_obj[row["source"]]["abstract_fetch_errors"] += 1
                    log_obj[row["source"]]["abstract_errors"] += 1

                # NUMBER OF CITATIONS — reuses the abstract page's soup.
                citation_soup = abstract_soup
                if citation_soup is not None:
                    is_citation = False
                    for publisher in config.interesting_venues[row["source"]]["publishers"]:
                        citation_count = get_citation_count(publisher, citation_soup)
                        if citation_count is not None:
                            df_papers.at[index, 'citation_count'] = citation_count
                            logger.info(str(index) + " [Success][Citation Count] " + str(urls[0]) + " " + str(citation_count))
                            is_citation = True
                            break

                    if not is_citation:
                        df_papers.at[index, 'citation_count'] = "Error"
                        logger.error(str(index) + " [Citation Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                        log_obj[row["source"]]["no_of_citations_parse_errors"] += 1
                        log_obj[row["source"]]["no_of_citations_errors"] += 1

                else:
                    df_papers.at[index, 'citation_count'] = "Error"
                    logger.error(str(index) + " [Citation Count URL Fetch]: " + str(row["source"]))
                    log_obj[row["source"]]["no_of_citations_fetch_errors"] += 1
                    log_obj[row["source"]]["no_of_citations_errors"] += 1

                # KEYWORDS — some publishers need a redirect to a different
                # URL to expose keywords.
                is_keyword = False
                current_url = driver.current_url
                for publisher in config.interesting_venues[row["source"]]["publishers"]:
                    try:
                        if publisher == "ieee_explore":
                            driver.get(current_url + "/keywords#keywords")
                        elif publisher == "eurographics_digital_library":
                            driver.get(current_url + "?show=full")
                        else:
                            driver.get(current_url)

                        # Delay to ensure routing is complete and the page renders.
                        time.sleep(1.5)

                        # Initialize the Soup object.
                        keyword_soup = BeautifulSoup(driver.page_source, 'lxml')

                        # NOTE(review): BeautifulSoup() never returns None, so
                        # the else branch below is unreachable; real fetch
                        # failures land in the except handler instead.
                        if keyword_soup is not None:
                            keywords_list = get_keywords(publisher, keyword_soup)
                            if keywords_list is not None:
                                df_papers.at[index, 'keywords'] = keywords_list
                                logger.info(str(index) + " [Success][Keywords] " + str(urls[0]) + " " + str(keywords_list))
                                is_keyword = True
                                break
                        else:
                            df_papers.at[index, 'keywords'] = "Error"
                            logger.error(str(index) + " [Keywords URL Fetch]: " + str(row["source"]))
                            log_obj[row["source"]]["keyword_fetch_errors"] += 1
                            log_obj[row["source"]]["keyword_errors"] += 1

                    except Exception as e:
                        # Best-effort: try the next publisher, but leave a trace.
                        logger.debug(f"Keywords attempt failed for {publisher}: {e}")

                if not is_keyword:
                    df_papers.at[index, 'keywords'] = "Error"
                    logger.error(str(index) + " [Error][Keywords Parse]: " + str(urls[0]) + " : " + str(row["source"]))
                    log_obj[row["source"]]["keyword_parse_errors"] += 1
                    log_obj[row["source"]]["keyword_errors"] += 1

        # Persist the paper file.
        df_papers.to_csv(config.path_output, sep='\t', header=True, index=False)
        logger.info("scraped papers saved to disk.")

        # Persist the per-venue error summary.
        df_logs = pd.DataFrame.from_dict(log_obj, orient="index")
        logger.info(log_obj)
        df_logs.to_csv(config.path_logfile, sep='\t', header=True)
    else:
        logger.info(f"Loading processed data from {config.path_output}")
        df_papers = pd.read_csv(config.path_output, sep='\t', header=0)

    return df_papers
from pathlib import Path

# Repository root: this file lives at <root>/paperscraper/config.py.
_root_dir = Path(__file__).parent.parent

# TODO: [Update as required] Paths to important input/output files
# FIXME: automatically extract the latest
_data_dir = _root_dir / "assets" / "data"
_output_dir = _root_dir / "output"

path_input_raw = _data_dir / "dblp-2022-03-01.xml"
path_input = _data_dir / "dblp_processed.xml"
path_output = _output_dir / "output.tsv"
path_postprocessing_output = _output_dir / "output_processed.tsv"
path_unique_venues = _output_dir / "unique_venues.tsv"
path_unique_keywords = _output_dir / "unique_keywords.tsv"
path_unique_authors = _output_dir / "unique_authors.tsv"
path_logfile = _output_dir / "log.tsv"

# ChromeDriver
# TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH
# TODO Option 2: Install using brew: `brew cask install chromedriver`. It is generally saved to `/usr/local/bin/chromedriver`
# For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver`
# Set the chromedriver path below.
-path_chromedriver = os.path.join("..", "assets", "chromedriver") # /usr/local/bin/chromedriver +path_chromedriver = _root_dir / "assets" / "chromedriver" # /usr/local/bin/chromedriver # ChromeOptions binary # TODO: [Update this path depending on where it is located in your Operating System] -path_chromeoptions_binary = os.path.join("/", "Applications", "Google Chrome.app", "Contents", "MacOS", "Google Chrome") +path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" # List of Venues we target with their DBLP category. This information can be found in the path above. # TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..7291d15 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,706 @@ +[[package]] +name = "async-generator" +version = "1.10" +description = "Async generators and context managers for Python 3.5+" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] + +[[package]] +name = "beautifulsoup4" +version = "4.10.0" +description = "Screen-scraping library" +category = 
"main" +optional = false +python-versions = ">3.0.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "certifi" +version = "2021.10.8" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "cffi" +version = "1.15.0" +description = "Foreign Function Interface for Python calling C code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "2.0.12" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.5.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.0.4" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "cryptography" +version = "36.0.2" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +sdist = ["setuptools_rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] + +[[package]] +name = "h11" +version = "0.13.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "loguru" +version = "0.6.0" +description = "Python logging made (stupidly) simple" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} +win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} + +[package.extras] +dev = ["colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "tox (>=3.9.0)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "black (>=19.10b0)", "isort (>=5.1.1)", "Sphinx (>=4.1.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)"] + +[[package]] +name = "lxml" +version = "4.8.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] +name = "numpy" +version = "1.22.3" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "outcome" +version = "1.1.0" +description = "Capture the outcome of Python function calls." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +attrs = ">=19.2.0" + +[[package]] +name = "pandas" +version = "1.4.1" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.18.5", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, + {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyopenssl" +version = "22.0.0" +description = "Python wrapper module around the OpenSSL library" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cryptography = ">=35.0" + +[package.extras] +docs = ["sphinx", "sphinx-rtd-theme"] +test = ["flaky", "pretend", "pytest (>=3.0.1)"] + 
+[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2021.3" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "requests" +version = "2.27.1" +description = "Python HTTP for Humans." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} +idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] + +[[package]] +name = "selenium" +version = "4.1.3" +description = "" +category = "main" +optional = false +python-versions = "~=3.7" + +[package.dependencies] +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +urllib3 = {version = ">=1.26,<2.0", extras = ["secure", "socks"]} + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sniffio" +version = "1.2.0" +description = "Sniff out which async library your code is running under" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" 
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "soupsieve" +version = "2.3.1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "tqdm" +version = "4.63.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + +[[package]] +name = "trio" +version = "0.20.0" +description = "A friendly Python library for async concurrency and I/O" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +async-generator = ">=1.9" +attrs = ">=19.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +idna = "*" +outcome = "*" +sniffio = "*" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.9.2" +description = "WebSocket library for Trio" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +async-generator = ">=1.10" +trio = ">=0.11" +wsproto = ">=0.14" + +[[package]] +name = "urllib3" +version = "1.26.9" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.dependencies] +certifi = {version = "*", optional = true, markers = "extra == \"secure\""} +cryptography = {version = ">=1.3.4", optional = true, markers = "extra == \"secure\""} +idna = {version = ">=2.0.0", optional = true, markers = "extra == \"secure\""} +pyOpenSSL = {version = ">=0.14", optional = true, markers = "extra == \"secure\""} +PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "webdriver-manager" +version = "3.5.4" +description = "Library provides the way to automatically manage drivers for different browsers" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +requests = "*" + +[[package]] +name = "win32-setctime" +version = "1.1.0" +description = "A small Python utility to set file creation time on Windows" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.extras] +dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"] + +[[package]] +name = "wsproto" +version = "1.1.0" +description = "WebSockets state-machine based protocol implementation" +category = "main" +optional = false +python-versions = ">=3.7.0" + +[package.dependencies] +h11 = ">=0.9.0,<1" + +[metadata] +lock-version = "1.1" +python-versions = "~=3.8" +content-hash = "89d5de02738bcf3f4a31eca13e4759300c5312821679bf90a58809024885e1a2" + +[metadata.files] +async-generator = [ + {file = "async_generator-1.10-py3-none-any.whl", hash = "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b"}, + {file = "async_generator-1.10.tar.gz", hash = 
"sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, + {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, +] +certifi = [ + {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, + {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, +] +cffi = [ + {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"}, + {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"}, + {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"}, + {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"}, + {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"}, + {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"}, + {file = 
"cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"}, + {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"}, + {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"}, + {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"}, + {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"}, + {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"}, + {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"}, + {file = 
"cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"}, + {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"}, + {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"}, + {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"}, + {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"}, + {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, +] +click = [ + {file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"}, + {file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"}, +] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] +cryptography = [ + {file = "cryptography-36.0.2-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:4e2dddd38a5ba733be6a025a1475a9f45e4e41139d1321f412c6b360b19070b6"}, + {file = "cryptography-36.0.2-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:4881d09298cd0b669bb15b9cfe6166f16fc1277b4ed0d04a22f3d6430cb30f1d"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea634401ca02367c1567f012317502ef3437522e2fc44a3ea1844de028fa4b84"}, + {file = 
"cryptography-36.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:7be666cc4599b415f320839e36367b273db8501127b38316f3b9f22f17a0b815"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8241cac0aae90b82d6b5c443b853723bcc66963970c67e56e71a2609dc4b5eaf"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b2d54e787a884ffc6e187262823b6feb06c338084bbe80d45166a1cb1c6c5bf"}, + {file = "cryptography-36.0.2-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:c2c5250ff0d36fd58550252f54915776940e4e866f38f3a7866d92b32a654b86"}, + {file = "cryptography-36.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ec6597aa85ce03f3e507566b8bcdf9da2227ec86c4266bd5e6ab4d9e0cc8dab2"}, + {file = "cryptography-36.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ca9f686517ec2c4a4ce930207f75c00bf03d94e5063cbc00a1dc42531511b7eb"}, + {file = "cryptography-36.0.2-cp36-abi3-win32.whl", hash = "sha256:f64b232348ee82f13aac22856515ce0195837f6968aeaa94a3d0353ea2ec06a6"}, + {file = "cryptography-36.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:53e0285b49fd0ab6e604f4c5d9c5ddd98de77018542e88366923f152dbeb3c29"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:32db5cc49c73f39aac27574522cecd0a4bb7384e71198bc65a0d23f901e89bb7"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b3d199647468d410994dbeb8cec5816fb74feb9368aedf300af709ef507e3e"}, + {file = "cryptography-36.0.2-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:da73d095f8590ad437cd5e9faf6628a218aa7c387e1fdf67b888b47ba56a17f0"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:0a3bf09bb0b7a2c93ce7b98cb107e9170a90c51a0162a20af1c61c765b90e60b"}, + {file = 
"cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8897b7b7ec077c819187a123174b645eb680c13df68354ed99f9b40a50898f77"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82740818f2f240a5da8dfb8943b360e4f24022b093207160c77cadade47d7c85"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1f64a62b3b75e4005df19d3b5235abd43fa6358d5516cfc43d87aeba8d08dd51"}, + {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, + {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, +] +h11 = [ + {file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"}, + {file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +loguru = [ + {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, + {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, +] +lxml = [ + {file = "lxml-4.8.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b"}, + {file = "lxml-4.8.0-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430"}, + {file = "lxml-4.8.0-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a"}, + {file = "lxml-4.8.0-cp27-cp27m-win32.whl", hash = "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5"}, + {file = "lxml-4.8.0-cp27-cp27m-win_amd64.whl", hash = "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9"}, + {file = "lxml-4.8.0-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc"}, + {file = "lxml-4.8.0-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170"}, + {file = "lxml-4.8.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe"}, + {file = "lxml-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa"}, + {file = "lxml-4.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1"}, + {file = "lxml-4.8.0-cp310-cp310-win32.whl", hash = "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b"}, + {file = "lxml-4.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76"}, + {file = "lxml-4.8.0-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6"}, + {file = 
"lxml-4.8.0-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2"}, + {file = "lxml-4.8.0-cp35-cp35m-win32.whl", hash = "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150"}, + {file = "lxml-4.8.0-cp35-cp35m-win_amd64.whl", hash = "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654"}, + {file = "lxml-4.8.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e"}, + {file = "lxml-4.8.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613"}, + {file = "lxml-4.8.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33"}, + {file = "lxml-4.8.0-cp36-cp36m-win32.whl", hash = "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429"}, + {file = "lxml-4.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63"}, + {file = "lxml-4.8.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a"}, + {file = 
"lxml-4.8.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c"}, + {file = "lxml-4.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85"}, + {file = "lxml-4.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141"}, + {file = "lxml-4.8.0-cp37-cp37m-win32.whl", hash = "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63"}, + {file = "lxml-4.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8"}, + {file = "lxml-4.8.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f"}, + {file = 
"lxml-4.8.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870"}, + {file = "lxml-4.8.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9"}, + {file = "lxml-4.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68"}, + {file = "lxml-4.8.0-cp38-cp38-win32.whl", hash = "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696"}, + {file = "lxml-4.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939"}, + {file = "lxml-4.8.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c"}, + {file = "lxml-4.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87"}, + {file = "lxml-4.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9"}, + {file = "lxml-4.8.0-cp39-cp39-win32.whl", hash = "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea"}, + {file = 
"lxml-4.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-macosx_10_14_x86_64.whl", hash = "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9"}, + {file = "lxml-4.8.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-macosx_10_14_x86_64.whl", hash = "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79"}, + {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93"}, + {file = "lxml-4.8.0.tar.gz", hash = "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23"}, +] +numpy = [ + {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"}, + {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4"}, + {file = "numpy-1.22.3-cp310-cp310-win32.whl", hash = 
"sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430"}, + {file = "numpy-1.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1"}, + {file = "numpy-1.22.3-cp38-cp38-win32.whl", hash = "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62"}, + {file = "numpy-1.22.3-cp38-cp38-win_amd64.whl", hash = "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168"}, + {file = "numpy-1.22.3-cp39-cp39-win32.whl", hash = "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"}, + {file = "numpy-1.22.3-cp39-cp39-win_amd64.whl", hash = "sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a"}, + {file = 
"numpy-1.22.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f"}, + {file = "numpy-1.22.3.zip", hash = "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18"}, +] +outcome = [ + {file = "outcome-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958"}, + {file = "outcome-1.1.0.tar.gz", hash = "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"}, +] +pandas = [ + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01"}, + {file = "pandas-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee"}, + {file = "pandas-1.4.1-cp38-cp38-win32.whl", hash = "sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb"}, + {file = "pandas-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf"}, + {file = "pandas-1.4.1-cp39-cp39-win32.whl", hash = "sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b"}, + {file = "pandas-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf"}, + {file = "pandas-1.4.1.tar.gz", hash = "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2"}, +] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] +pyopenssl = [ + {file = "pyOpenSSL-22.0.0-py2.py3-none-any.whl", hash = 
"sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"}, + {file = "pyOpenSSL-22.0.0.tar.gz", hash = "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf"}, +] +pysocks = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"}, + {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, +] +requests = [ + {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, + {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, +] +selenium = [ + {file = "selenium-4.1.3-py3-none-any.whl", hash = "sha256:14d28a628c831c105d38305c881c9c7847199bfd728ec84240c5e86fa1c9bd5a"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sniffio = [ + {file = "sniffio-1.2.0-py3-none-any.whl", hash = "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663"}, + {file = "sniffio-1.2.0.tar.gz", hash = 
"sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +soupsieve = [ + {file = "soupsieve-2.3.1-py3-none-any.whl", hash = "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb"}, + {file = "soupsieve-2.3.1.tar.gz", hash = "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"}, +] +tqdm = [ + {file = "tqdm-4.63.0-py2.py3-none-any.whl", hash = "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29"}, + {file = "tqdm-4.63.0.tar.gz", hash = "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd"}, +] +trio = [ + {file = "trio-0.20.0-py3-none-any.whl", hash = "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"}, + {file = "trio-0.20.0.tar.gz", hash = "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070"}, +] +trio-websocket = [ + {file = "trio-websocket-0.9.2.tar.gz", hash = "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe"}, + {file = "trio_websocket-0.9.2-py3-none-any.whl", hash = "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc"}, +] +urllib3 = [ + {file = "urllib3-1.26.9-py2.py3-none-any.whl", hash = "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14"}, + {file = "urllib3-1.26.9.tar.gz", hash = "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"}, +] +webdriver-manager = [ + {file = "webdriver_manager-3.5.4-py2.py3-none-any.whl", hash = "sha256:b5b91b5df83181e002263fe27296967a5b19cb1ebe8e4a63ee83538394037df4"}, + {file = "webdriver_manager-3.5.4.tar.gz", hash = "sha256:2eb7c2fe38ec5b06e2090164923e4dfb7c3ac4e7140333a3de9c7956f5047858"}, +] 
+win32-setctime = [ + {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, + {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, +] +wsproto = [ + {file = "wsproto-1.1.0-py3-none-any.whl", hash = "sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b"}, + {file = "wsproto-1.1.0.tar.gz", hash = "sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..74038a5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[tool.poetry] +name = "paperscraper" +version = "0.1.0-alpha.1" +description = "Scrape and provide interface for data from dblp" +authors = [] + +[tool.poetry.dependencies] +python = "~=3.8" +lxml = "^4.8.0" +pandas = "^1.4.1" +beautifulsoup4 = "^4.10.0" +selenium = "^4.1.3" +numpy = "^1.22.3" +click = "^8.0.4" +loguru = "^0.6.0" +tqdm = "^4.63.0" +webdriver-manager = "^3.5.4" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +paperscraper = "paperscraper._cli:cli" \ No newline at end of file From 413570eccbb82043c0efd24390f6c95681812af8 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 14:44:36 -0800 Subject: [PATCH 2/9] Cleanup code --- paperscraper/_preprocess.py | 56 ++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index 49eec4c..da06c8d 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -33,10 +33,16 @@ } -def get_processed_db(force:bool=False) -> Path: +def get_processed_db(force: bool = False) -> Path: + """ + Clean the raw file (set in config.path_input_raw) and writing it out to config.path_input. 
+ + Function is run only if config.path_input doesn't exsit or if `force` is True. + """ if force or not config.path_input.exists(): logger.info(f"Cleaning data from {config.path_input_raw} into {config.path_input}") - # This Regular Find+Replace replaces instances of & between tags with a SPECIAL TAG `%26`. This tag will be replaced back to `&` in the code later on. + # This Regular Find+Replace replaces instances of & between tags with a + # SPECIAL TAG `%26`. This tag will be replaced back to `&` in the code later on. regex_find = r'(.*)&(.*)' regex_replace = r'\1%26\2' @@ -51,18 +57,22 @@ def get_processed_db(force:bool=False) -> Path: line = re.sub(regex_find, regex_replace, line) processed_dblp.write(line) - + return config.path_input -# Find Unique venues from the DBLP xml looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. # TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(force:bool=False) -> pd.DataFrame: +def get_unique_venues(force: bool = False) -> pd.DataFrame: + """ + Find Unique venues from the DBLP xml. + + Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. + """ if force or not config.path_unique_venues.exists(): logger.info(f"Extracting venues to {config.path_unique_venues}") - unique_sources = dict() + unique_sources: dict = {} for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): - if elem.tag in ["article","inproceedings","incollection"]: + if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: if child.text not in unique_sources: @@ -85,15 +95,17 @@ def get_unique_venues(force:bool=False) -> pd.DataFrame: return df_unique_sources -# FILTER the huge dblp_processed.xml file to keep just the data that we are interested in. 
-# TODO: Re-run this if (1) The list has changed or (2) There is a NEW DBLP snapshot . -def get_extracted_data(force:bool=False) -> pd.DataFrame: +# TODO: Re-run this if +# (1) The list has changed or +# (2) There is a NEW DBLP snapshot. +def get_extracted_data(force: bool = False) -> pd.DataFrame: + """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" if force or not config.path_output.exists(): logger.info(f"Extracting data to {config.path_output}") result_list = list() src_set = set() for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"): - obj = dict() + obj: dict = {} to_add = False for child in elem.getchildren(): if child.tag not in obj: @@ -108,10 +120,10 @@ def get_extracted_data(force:bool=False) -> pd.DataFrame: else: obj[child.tag].append(child.text) else: - obj[child.tag] = child.text # title, year, pgs + obj[child.tag] = child.text # title, year, pgs # Only consider adding entries from the source defined above - if child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]: + if (child.text in config.interesting_venues and child.tag == config.interesting_venues[child.text]["sourcetype"]): obj["source"] = child.text to_add = True if child.text not in src_set: @@ -152,8 +164,8 @@ def _get_webdriver_instance(): return driver -# Scrap the Abstracts, Keywords, and Citations -def get_processed_data(force:bool=False) -> pd.DataFrame: +def get_processed_data(force: bool = False) -> pd.DataFrame: + """Scrap the Abstracts, Keywords, and Citations.""" if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) logger.info(f"Processing data to {config.path_output}") @@ -163,7 +175,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: df_papers = pd.read_csv(config.path_output, sep='\t', header=0) # Initialize a log object to analyze the summary of a particular run. 
- log_obj = dict() + log_obj: dict = {} # Start scraping for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]): @@ -194,13 +206,13 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: urls = [] try: urls = ast.literal_eval(row["ee"]) - except Exception as e: + except Exception: # If not ee, check url. - # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a BaseURL that is unknown. - # Hence, it will fail 99% of the times. + # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a + # BaseURL that is unknown. Hence, it will fail 99% of the times. try: urls = ast.literal_eval(row["url"]) - except: + except Exception: pass # If there is No url OR If the URL begins with a db/, continue. @@ -278,7 +290,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: try: if publisher == "ieee_explore": - driver.get(current_url+ "/keywords#keywords") + driver.get(current_url + "/keywords#keywords") elif publisher == "eurographics_digital_library": driver.get(current_url + "?show=full") else: @@ -303,7 +315,7 @@ def get_processed_data(force:bool=False) -> pd.DataFrame: log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 - except Exception as e: + except Exception: pass if not is_keyword: From db8ac254eda1fda4cb14076fc34f7603869a2491 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 16:01:06 -0800 Subject: [PATCH 3/9] Use sqlitedict _preprocess and related tests --- .gitignore | 3 + paperscraper/_cli.py | 8 +- paperscraper/_preprocess.py | 113 +++--- paperscraper/config.py | 587 +++++++++++++++------------- poetry.lock | 146 ++++++- pyproject.toml | 4 + test/assets/data/dblp_processed.xml | 89 +++++ test/test_preprocess.py | 56 +++ 8 files changed, 657 insertions(+), 349 deletions(-) create mode 100644 test/assets/data/dblp_processed.xml 
create mode 100644 test/test_preprocess.py diff --git a/.gitignore b/.gitignore index 4d1b36c..ad07f50 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ chromedriver .idea/ *.pyc +# include test files +!test/assets/data/*.xml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/paperscraper/_cli.py b/paperscraper/_cli.py index 7a55635..9bbcd28 100644 --- a/paperscraper/_cli.py +++ b/paperscraper/_cli.py @@ -1,6 +1,6 @@ import click from paperscraper._preprocess import (get_processed_db, get_unique_venues, get_extracted_data, get_processed_data) - +from paperscraper.config import config @click.group() def cli(): @@ -15,6 +15,6 @@ def process(): @click.option("-f", "--force", help="Force run all steps", is_flag=True) def run_all(force): get_processed_db(force=False) - get_unique_venues(force=False) - get_extracted_data(force=False) - get_processed_data(force=force) + get_unique_venues(config, force=False) + get_extracted_data(config, force=False) + get_processed_data(config, force=force) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index da06c8d..b284f2f 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -1,21 +1,23 @@ +import ast import re +import time from pathlib import Path + import lxml.etree as ET import pandas as pd -from loguru import logger from bs4 import BeautifulSoup +from loguru import logger from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager +from sqlitedict import SqliteDict from tqdm import tqdm -import ast -import time +from webdriver_manager.chrome import ChromeDriverManager -import paperscraper.config as config +from paperscraper.config import Config, config from paperscraper.scrapers.abstracts import get_abstract -from paperscraper.scrapers.keywords import get_keywords from paperscraper.scrapers.citations import 
get_citation_count +from paperscraper.scrapers.keywords import get_keywords logger.remove() logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) @@ -62,50 +64,58 @@ def get_processed_db(force: bool = False) -> Path: # TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(force: bool = False) -> pd.DataFrame: +def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: """ Find Unique venues from the DBLP xml. Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. """ if force or not config.path_unique_venues.exists(): + unique_sources = SqliteDict(config.path_unique_venues) + unique_sources.clear() # empty the db logger.info(f"Extracting venues to {config.path_unique_venues}") - unique_sources: dict = {} for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: if child.text not in unique_sources: - unique_sources[child.text] = dict() - unique_sources[child.text]["count"] = 0 - unique_sources[child.text]["child_tag"] = child.tag - unique_sources[child.text]["elem_tag"] = elem.tag - unique_sources[child.text]["count"] += 1 + child_dict = {} + child_dict["count"] = 0 + child_dict["child_tag"] = child.tag + child_dict["elem_tag"] = elem.tag + else: + child_dict = unique_sources[child.text] - # Create a Pandas DataFrame - df_unique_sources = pd.DataFrame.from_dict(unique_sources, orient="index") + child_dict["count"] += 1 + unique_sources[child.text] = child_dict logger.debug("Writing to disk") # Save it to disk - df_unique_sources.to_csv(config.path_unique_venues, header=True, sep='\t') + unique_sources.commit() else: logger.info(f"Loading data from {config.path_unique_venues}") - df_unique_sources = pd.read_csv(config.path_unique_venues, header=0, sep='\t') + unique_sources = 
SqliteDict(config.path_unique_venues) - return df_unique_sources + return unique_sources # TODO: Re-run this if # (1) The list has changed or # (2) There is a NEW DBLP snapshot. -def get_extracted_data(force: bool = False) -> pd.DataFrame: +def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" if force or not config.path_output.exists(): logger.info(f"Extracting data to {config.path_output}") - result_list = list() + result_list = SqliteDict(config.path_output) + result_list.clear() # empty the db src_set = set() - for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', recover=True), desc="Entry"): + for _idx, (event, elem) in tqdm(enumerate(ET.iterparse(config.path_input, encoding='UTF-8', recover=True)), desc="Entry"): obj: dict = {} + # Initialize the fields that we are going to scrape. + # TODO: Update these if more fields are added. + obj["abstract"] = "Not Scraped" + obj["keywords"] = "Not Scraped" + obj["citation_count"] = "Not Scraped" to_add = False for child in elem.getchildren(): if child.tag not in obj: @@ -131,25 +141,20 @@ def get_extracted_data(force: bool = False) -> pd.DataFrame: logger.debug(f"Adding source: {child.text}") if to_add: - result_list.append(obj) + result_list[_idx] = obj - # Create a DataFrame - df_result_list = pd.DataFrame(result_list) - - # Initialize the fields that we are going to scrape. - # TODO: Update these if more fields are added. 
- df_result_list["abstract"] = "Not Scraped" - df_result_list["keywords"] = "Not Scraped" - df_result_list["citation_count"] = "Not Scraped" + # Periodically commiting stuff + if _idx % 100 == 0: + result_list.commit() logger.debug("Writing to disk") # Save to disk - df_result_list.to_csv(config.path_output, sep='\t', header=True) + result_list.commit() else: logger.info(f"Loading data from {config.path_output}") - df_result_list = pd.read_csv(config.path_output, sep='\t', header=0) + result_list = SqliteDict(config.path_output) - return df_result_list + return result_list # get a new headless Chrome driver @@ -164,7 +169,7 @@ def _get_webdriver_instance(): return driver -def get_processed_data(force: bool = False) -> pd.DataFrame: +def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: """Scrap the Abstracts, Keywords, and Citations.""" if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) @@ -172,13 +177,13 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: driver = _get_webdriver_instance() # Read the base datafile - df_papers = pd.read_csv(config.path_output, sep='\t', header=0) + papers_db = SqliteDict(config.path_output) # Initialize a log object to analyze the summary of a particular run. log_obj: dict = {} # Start scraping - for index, row in tqdm(df_papers.iterrows(), desc="Papers", total=df_papers.shape[0]): + for index, row in tqdm(papers_db.items(), desc="Papers", total=len(papers_db)): # ToDo: Keep Checking this high-level filter to minimize iterations. if (str(row["abstract"]) in __scraper_filter["abstract"] or @@ -217,9 +222,10 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: # If there is No url OR If the URL begins with a db/, continue. 
if len(urls) == 0 or urls[0].startswith("db/"): - df_papers.at[index, 'abstract'] = "No Url" - df_papers.at[index, 'keywords'] = "No Url" - df_papers.at[index, 'citation_count'] = "No Url" + row['abstract'] = "No Url" + row['abstract'] = "No Url" + row['abstract'] = "No Url" + papers_db[index] = row logger.error(str(index) + " [No URL]: " + str(row["title"])) continue @@ -242,19 +248,19 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: abstract = get_abstract(publisher, abstract_soup) if abstract is not None: - df_papers.at[index, 'abstract'] = abstract + row['abstract'] = abstract logger.info(str(index) + " [Success][Abstract] " + str(urls[0]) + " " + str(abstract)[:50]) is_abstract = True break if not is_abstract: - df_papers.at[index, 'abstract'] = "Error" + row['abstract'] = "Error" logger.error(str(index) + " [Abstract Parse]: " + str(urls[0]) + " : " + str(row["source"])) log_obj[row["source"]]["abstract_parse_errors"] += 1 log_obj[row["source"]]["abstract_errors"] += 1 else: - df_papers.at[index, 'abstract'] = "Error" + row['abstract'] = "Error" logger.error(str(index) + " [Abstract URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["abstract_fetch_errors"] += 1 log_obj[row["source"]]["abstract_errors"] += 1 @@ -266,19 +272,19 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: for publisher in config.interesting_venues[row["source"]]["publishers"]: citation_count = get_citation_count(publisher, citation_soup) if citation_count is not None: - df_papers.at[index, 'citation_count'] = citation_count + row['citation_count'] = citation_count logger.info(str(index) + " [Success][Citation Count] " + str(urls[0]) + " " + str(citation_count)) is_citation = True break if not is_citation: - df_papers.at[index, 'citation_count'] = "Error" + row['citation_count'] = "Error" logger.error(str(index) + " [Citation Parse]: " + str(urls[0]) + " : " + str(row["source"])) 
log_obj[row["source"]]["no_of_citations_parse_errors"] += 1 log_obj[row["source"]]["no_of_citations_errors"] += 1 else: - df_papers.at[index, 'citation_count'] = "Error" + row['citation_count'] = "Error" logger.error(str(index) + " [Citation Count URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["no_of_citations_fetch_errors"] += 1 log_obj[row["source"]]["no_of_citations_errors"] += 1 @@ -305,12 +311,12 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: if keyword_soup is not None: keywords_list = get_keywords(publisher, keyword_soup) if keywords_list is not None: - df_papers.at[index, 'keywords'] = keywords_list + row['keywords'] = keywords_list logger.info(str(index) + " [Success][Keywords] " + str(urls[0]) + " " + str(keywords_list)) is_keyword = True break else: - df_papers.at[index, 'keywords'] = "Error" + row['keywords'] = "Error" logger.error(str(index) + " [Keywords URL Fetch]: " + str(row["source"])) log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 @@ -319,13 +325,18 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: pass if not is_keyword: - df_papers.at[index, 'keywords'] = "Error" + row['keywords'] = "Error" logger.error(str(index) + " [Error][Keywords Parse]: " + str(urls[0]) + " : " + str(row["source"])) log_obj[row["source"]]["keyword_parse_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 + papers_db[index] = row + + if index % 100 == 100: + papers_db.commit() + # Persist the paper file - df_papers.to_csv(config.path_output, sep='\t', header=True, index=False) + papers_db.commit() logger.i("scraped papers saved to disk.") # Persist Logs @@ -334,6 +345,6 @@ def get_processed_data(force: bool = False) -> pd.DataFrame: df_logs.to_csv(config.path_logfile, sep='\t', header=True) else: logger.info(f"Loading processed data from {config.path_output}") - df_papers = pd.read_csv(config.path_output, sep='\t', header=0) + papers_db = SqliteDict(config.path_output) - 
return df_papers + return papers_db diff --git a/paperscraper/config.py b/paperscraper/config.py index 9a1d7c4..dacc934 100644 --- a/paperscraper/config.py +++ b/paperscraper/config.py @@ -1,288 +1,313 @@ from pathlib import Path +from typing import Union -_root_dir = Path(__file__).parent.parent -# TODO: [Update as required] Paths to important input/output files -# FIXME: automatically extract the latest -path_input_raw = _root_dir / "assets" / "data" / "dblp-2022-03-01.xml" -path_input = _root_dir / "assets" / "data" / "dblp_processed.xml" -path_output = _root_dir / "output" / "output.tsv" -path_postprocessing_output = _root_dir / "output" / "output_processed.tsv" -path_unique_venues = _root_dir / "output" / "unique_venues.tsv" -path_unique_keywords = _root_dir / "output" / "unique_keywords.tsv" -path_unique_authors = _root_dir / "output"/ "unique_authors.tsv" -path_logfile = _root_dir / "output" / "log.tsv" -# ChromeDriver -# TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH -# TODO Option 2: Install using brew: `brew cask install chromedriver`. It is generally saved to `/usr/local/bin/chromedriver` -# For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver` -# Set the chromedriver path below. 
-path_chromedriver = _root_dir / "assets" / "chromedriver" # /usr/local/bin/chromedriver +class Config: + """The main config object.""" + def __init__(self, root_dir: Union[str, Path] = None, + assets_dir: Union[str, Path] = None, + output_dir: Union[str, Path] = None): + """Initialize the config.""" + if root_dir is None: + _root_dir = Path(__file__).parent.parent + else: + _root_dir = Path(root_dir) -# ChromeOptions binary -# TODO: [Update this path depending on where it is located in your Operating System] -path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" + if assets_dir is None: + assets_dir = _root_dir / "assets" + elif not isinstance(assets_dir, Path): + assets_dir = Path(assets_dir) -# List of Venues we target with their DBLP category. This information can be found in the path above. -# TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. -interesting_venues = { - "ACM Trans. Comput. Hum. Interact.": { - "sourcetype": "journal", - "publishers": ["acm_digital_library"] - }, - "AVI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BCS HCI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "scienceopen", "springer_v2"] - }, - "BCS HCI (1)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BCS HCI (2)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "BELIV": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "ieee_explore"] - }, - "BioVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "CHI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Cognitive Biases in Visualizations": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "CogSci": { - "sourcetype": "booktitle", - "publishers": ["cogsci"] - }, - "Comput. Graph. 
Forum": { - "sourcetype": "journal", - "publishers": ["wiley_online_library"] - }, - "Conference on Designing Interactive Systems": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Conference on Designing Interactive Systems (Companion Volume)": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "CSCW": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "Diagrams": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "Eurographics": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "eurographics_digital_library"] - }, - "Eurographics (Areas Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Posters)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Short Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Eurographics (Short Presentations)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library" ] - }, - "Eurographics (State of the Art Reports)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library" ] - }, - "EuroVAST@EuroVis": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Graphics Interface": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "graphics_interface_proceedings"] - }, - "ICDM": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "ieee_explore"] - }, - "IEEE Computer Graphics and Applications": { - "sourcetype": "journal", - "publishers": ["ieee_explore"] - }, - "IEEE Trans. Vis. Comput. 
Graph.": { - "sourcetype": "journal", - "publishers": ["ieee_explore"] - }, - "IEEE VAST": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IEEE Visualization": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IEEE VIS (Short Papers)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "Information Visualization": { - "sourcetype": "booktitle", - "publishers": ["springer_v2", "dagstuhl"] - }, - "INTERACT": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (1)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (2)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (3)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "INTERACT (4)": { - "sourcetype": "booktitle", - "publishers": ["springer_v2"] - }, - "International Conference on Supercomputing": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "IUI": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "IV": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IV (1)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IV (2)": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "IVAPP": { - "sourcetype": "booktitle", - "publishers": ["scitepress"] - }, - "J. 
Vis.": { - "sourcetype": "journal", - "publishers": ["springer_v1"] - }, - "KDD": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "aaai"] - }, - "PacificVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SciVis": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SIBGRAPI": { - "sourcetype": "booktitle", - "publishers": ["ieee_explore"] - }, - "SIGGRAPH": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "SIGGRAPH Asia": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "SIGMOD Conference": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "UbiComp": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library", "springer_v2"] - }, - "UIST": { - "sourcetype": "booktitle", - "publishers": ["acm_digital_library"] - }, - "VAST": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library", "ieee_explore" ] - }, - "VAST (Short and Project Papers)": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library", "ieee_explore" ] - }, - "VCBM": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - }, - "Vis. Comput.": { - "sourcetype": "journal", - "publishers": ["springer_v1"] - }, - "VMV": { - "sourcetype": "booktitle", - "publishers": ["eurographics_digital_library"] - } -} + if output_dir is None: + output_dir = _root_dir / "output" + elif not isinstance(output_dir, Path): + assets_dir = Path(output_dir) -# Object to map different variations of a keyword to a consistent name. 
-keywords_to_merge = { - "cscw": "computer supported collaborative work", - "computer supported collaborative work": "computer supported collaborative work", - "data visualization": "data visualization", - "data visualisation": "data visualization", - "visualisation": "visualization", - "visualization": "visualization", - "hci": "human computer interaction", - "human computer interaction": "human computer interaction", - "human-computer-interaction": "human computer interaction", - "human-computer interaction": "human computer interaction", - "human computer interaction (hci)": "human computer interaction", - "human-computer interaction (hci)": "human computer interaction", - "human computer interactions": "human computer interaction", - "human-computer-interactions": "human computer interaction", - "human-computer interactions": "human computer interaction", -} + # TODO: [Update as required] Paths to important input/output files + # FIXME: automatically extract the latest + self.path_input_raw = assets_dir / "data" / "dblp-2022-11-02.xml" + self.path_input = assets_dir / "data" / "dblp_processed.xml" + self.path_output = output_dir / "output.db" + self.path_postprocessing_output = output_dir / "output_processed.tsv" + self.path_unique_venues = output_dir / "unique_venues.db" + self.path_unique_keywords = output_dir / "unique_keywords.tsv" + self.path_unique_authors = output_dir/ "unique_authors.tsv" + self.path_logfile = output_dir / "log.tsv" -keyword_patterns_to_remove = [ - r"\d+.\d+.\d+.", # e.g., 1.3.4. - r"\d+.\d+.\d+", # e.g., 1.3.4 - r"\w+.\d+.\d+.", # e.g., d.3.4. - r"\w+.\d+.\d+", # e.g., d.3.4 - r"according to", - r"acm ccs", - r"acmccs", - r"acma ccs", - r"\(\s*\)", - r"\/spl", - r"\/sup", - r"\/", - r"^-\s*" -] + # ChromeDriver + # TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH + # TODO Option 2: Install using brew: `brew cask install chromedriver`. 
It is generally saved to `/usr/local/bin/chromedriver` + # For Mac OSX, the executable will have to be quarantined - `xattr -d com.apple.quarantine chromedriver` + # Set the chromedriver path below. + self.path_chromedriver = assets_dir / "chromedriver" # /usr/local/bin/chromedriver + + # ChromeOptions binary + # TODO: [Update this path depending on where it is located in your Operating System] + self.path_chromeoptions_binary = Path("/") / "Applications" / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome" + + # List of Venues we target with their DBLP category. This information can be found in the path above. + # TODO: [Update as required] Don't forget to add the corresponding logic to scrape keywords/absracts/titles/citations, etc. + self.interesting_venues = { + "ACM Trans. Comput. Hum. Interact.": { + "sourcetype": "journal", + "publishers": ["acm_digital_library"] + }, + "AVI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BCS HCI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "scienceopen", "springer_v2"] + }, + "BCS HCI (1)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BCS HCI (2)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "BELIV": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "ieee_explore"] + }, + "BioVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "CHI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Cognitive Biases in Visualizations": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "CogSci": { + "sourcetype": "booktitle", + "publishers": ["cogsci"] + }, + "Comput. Graph. 
Forum": { + "sourcetype": "journal", + "publishers": ["wiley_online_library"] + }, + "Conference on Designing Interactive Systems": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Conference on Designing Interactive Systems (Companion Volume)": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "CSCW": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "Diagrams": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "Eurographics": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "eurographics_digital_library"] + }, + "Eurographics (Areas Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Posters)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Short Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Eurographics (Short Presentations)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library" ] + }, + "Eurographics (State of the Art Reports)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library" ] + }, + "EuroVAST@EuroVis": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Graphics Interface": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "graphics_interface_proceedings"] + }, + "ICDM": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "ieee_explore"] + }, + "IEEE Computer Graphics and Applications": { + "sourcetype": "journal", + "publishers": ["ieee_explore"] + }, + "IEEE Trans. Vis. Comput. 
Graph.": { + "sourcetype": "journal", + "publishers": ["ieee_explore"] + }, + "IEEE VAST": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IEEE Visualization": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IEEE VIS (Short Papers)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "Information Visualization": { + "sourcetype": "booktitle", + "publishers": ["springer_v2", "dagstuhl"] + }, + "INTERACT": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (1)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (2)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (3)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "INTERACT (4)": { + "sourcetype": "booktitle", + "publishers": ["springer_v2"] + }, + "International Conference on Supercomputing": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "IUI": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "IV": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IV (1)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IV (2)": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "IVAPP": { + "sourcetype": "booktitle", + "publishers": ["scitepress"] + }, + "J. 
Vis.": { + "sourcetype": "journal", + "publishers": ["springer_v1"] + }, + "KDD": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "aaai"] + }, + "PacificVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SciVis": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SIBGRAPI": { + "sourcetype": "booktitle", + "publishers": ["ieee_explore"] + }, + "SIGGRAPH": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "SIGGRAPH Asia": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "SIGMOD Conference": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "UbiComp": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library", "springer_v2"] + }, + "UIST": { + "sourcetype": "booktitle", + "publishers": ["acm_digital_library"] + }, + "VAST": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library", "ieee_explore" ] + }, + "VAST (Short and Project Papers)": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library", "ieee_explore" ] + }, + "VCBM": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + }, + "Vis. Comput.": { + "sourcetype": "journal", + "publishers": ["springer_v1"] + }, + "VMV": { + "sourcetype": "booktitle", + "publishers": ["eurographics_digital_library"] + } + } + + # Object to map different variations of a keyword to a consistent name. 
+ self.keywords_to_merge = { + "cscw": "computer supported collaborative work", + "computer supported collaborative work": "computer supported collaborative work", + "data visualization": "data visualization", + "data visualisation": "data visualization", + "visualisation": "visualization", + "visualization": "visualization", + "hci": "human computer interaction", + "human computer interaction": "human computer interaction", + "human-computer-interaction": "human computer interaction", + "human-computer interaction": "human computer interaction", + "human computer interaction (hci)": "human computer interaction", + "human-computer interaction (hci)": "human computer interaction", + "human computer interactions": "human computer interaction", + "human-computer-interactions": "human computer interaction", + "human-computer interactions": "human computer interaction", + } + + self.keyword_patterns_to_remove = [ + r"\d+.\d+.\d+.", # e.g., 1.3.4. + r"\d+.\d+.\d+", # e.g., 1.3.4 + r"\w+.\d+.\d+.", # e.g., d.3.4. 
+ r"\w+.\d+.\d+", # e.g., d.3.4 + r"according to", + r"acm ccs", + r"acmccs", + r"acma ccs", + r"\(\s*\)", + r"\/spl", + r"\/sup", + r"\/", + r"^-\s*" + ] + + +config = Config() diff --git a/poetry.lock b/poetry.lock index 7291d15..dfe74ba 100644 --- a/poetry.lock +++ b/poetry.lock @@ -15,10 +15,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "beautifulsoup4" @@ -96,12 +96,23 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] -docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine 
(>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] + +[[package]] +name = "exceptiongroup" +version = "1.0.4" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] [[package]] name = "h11" @@ -119,6 +130,14 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "loguru" version = "0.6.0" @@ -132,7 +151,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "tox (>=3.9.0)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "black (>=19.10b0)", "isort (>=5.1.1)", "Sphinx (>=4.1.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)"] +dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "isort (>=5.1.1)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "tox (>=3.9.0)"] [[package]] name = "lxml" @@ -145,7 +164,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" [package.extras] cssselect = ["cssselect (>=0.7)"] html5 = ["html5lib"] -htmlsoup = ["beautifulsoup4"] +htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] [[package]] @@ -167,6 +186,17 @@ 
python-versions = ">=3.6" [package.dependencies] attrs = ">=19.2.0" +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + [[package]] name = "pandas" version = "1.4.1" @@ -188,6 +218,18 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "pycparser" version = "2.21" @@ -211,6 +253,17 @@ cryptography = ">=35.0" docs = ["sphinx", "sphinx-rtd-theme"] test = ["flaky", "pretend", "pytest (>=3.0.1)"] +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pysocks" version = "1.7.1" @@ -219,6 +272,26 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "pytest" +version = "7.2.0" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + 
[[package]] name = "python-dateutil" version = "2.8.2" @@ -301,6 +374,22 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "sqlitedict" +version = "2.0.0" +description = "Persistent dict in Python, backed up by sqlite3 and pickle, multithread-safe." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "tqdm" version = "4.63.0" @@ -363,8 +452,8 @@ pyOpenSSL = {version = ">=0.14", optional = true, markers = "extra == \"secure\" PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -387,7 +476,7 @@ optional = false python-versions = ">=3.5" [package.extras] -dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"] +dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [[package]] name = "wsproto" @@ -403,7 +492,7 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "1.1" python-versions = "~=3.8" -content-hash = "89d5de02738bcf3f4a31eca13e4759300c5312821679bf90a58809024885e1a2" +content-hash = "3527f8a60e9adf40ccd4edbc67ca03e2dd188ed70987df71a72b926bb1dc6aff" [metadata.files] async-generator = [ @@ -508,6 +597,10 @@ cryptography = [ {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, ] +exceptiongroup 
= [ + {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, + {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, +] h11 = [ {file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"}, {file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"}, @@ -516,6 +609,10 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] loguru = [ {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, @@ -609,6 +706,10 @@ outcome = [ {file = "outcome-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958"}, {file = "outcome-1.1.0.tar.gz", hash = "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"}, ] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] pandas = [ {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907"}, {file = 
"pandas-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34"}, @@ -632,6 +733,10 @@ pandas = [ {file = "pandas-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf"}, {file = "pandas-1.4.1.tar.gz", hash = "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2"}, ] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -640,11 +745,19 @@ pyopenssl = [ {file = "pyOpenSSL-22.0.0-py2.py3-none-any.whl", hash = "sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"}, {file = "pyOpenSSL-22.0.0.tar.gz", hash = "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf"}, ] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] pysocks = [ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, ] +pytest = [ + {file = "pytest-7.2.0-py3-none-any.whl", hash = 
"sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, + {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -676,6 +789,13 @@ soupsieve = [ {file = "soupsieve-2.3.1-py3-none-any.whl", hash = "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb"}, {file = "soupsieve-2.3.1.tar.gz", hash = "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"}, ] +sqlitedict = [ + {file = "sqlitedict-2.0.0.tar.gz", hash = "sha256:23a370416f4e1e962daa293382f3a8dbc4127e6a0abc06a5d4e58e6902f05d17"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] tqdm = [ {file = "tqdm-4.63.0-py2.py3-none-any.whl", hash = "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29"}, {file = "tqdm-4.63.0.tar.gz", hash = "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd"}, diff --git a/pyproject.toml b/pyproject.toml index 74038a5..7c4842b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,9 +15,13 @@ click = "^8.0.4" loguru = "^0.6.0" tqdm = "^4.63.0" webdriver-manager = "^3.5.4" +sqlitedict = "^2.0.0" [tool.poetry.dev-dependencies] +[tool.poetry.group.dev.dependencies] +pytest = "^7.2.0" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/test/assets/data/dblp_processed.xml b/test/assets/data/dblp_processed.xml new file mode 100644 index 0000000..acf70d7 --- /dev/null +++ 
b/test/assets/data/dblp_processed.xml @@ -0,0 +1,89 @@ + + + + + Iván Cantador + Ignacio Fernández-Tobías + Shlomo Berkovsky + Paolo Cremonesi + Cross-Domain Recommender Systems. + 919-959 + 2015 + Recommender Systems Handbook + https://doi.org/10.1007/978-1-4899-7637-6_27 + reference/sp/2015rsh + db/reference/sp/rsh2015.html#CantadorFBC15 + + + Jeffrey V. Nickerson + Human-Based Evolutionary Computing. + 641-648 + 2013 + Handbook of Human Computation + https://doi.org/10.1007/978-1-4614-8806-4_51 + https://www.wikidata.org/entity/Q105641856 + reference/sp/2013hc + db/reference/sp/hc2013.html#Nickerson13 + + + Luciana S. Buriol + Network Optimization. + 1123-1140 + 2018 + Handbook of Heuristics + https://doi.org/10.1007/978-3-319-07124-4_46 + reference/sp/2018heuristics + db/reference/sp/heuristics2018.html#Buriol18 + + + Alexander Felfernig + Gerhard Friedrich + Dietmar Jannach + Markus Zanker + Constraint-Based Recommender Systems. + 161-190 + 2015 + Recommender Systems Handbook + https://doi.org/10.1007/978-1-4899-7637-6_5 + reference/sp/2015rsh + db/reference/sp/rsh2015.html#FelfernigFJZ15 + + + Fernando Sandoya + Anna Martínez-Gavara + Ricardo Aceves + Abraham Duarte + Rafael Martí + Diversity and Equity Models. + 979-998 + 2018 + Handbook of Heuristics + https://doi.org/10.1007/978-3-319-07124-4_61 + reference/sp/2018heuristics + db/reference/sp/heuristics2018.html#SandoyaMADM18 + + + Liane Gabora + Cultural Evolution as Distributed Computation. + 447-461 + 2013 + Handbook of Human Computation + https://doi.org/10.1007/978-1-4614-8806-4_34 + https://www.wikidata.org/entity/Q105641836 + reference/sp/2013hc + db/reference/sp/hc2013.html#Gabora13 + + + Jameson L. Toole + Yves-Alexandre de Montjoye + Marta C. González + Alex 'Sandy' Pentland + Modeling and Understanding Intrinsic Characteristics of Human Mobility. 
+ 13-34 + 2018 + reference/sp/2018mdp + Handbook of Mobile Data Privacy + https://doi.org/10.1007/978-3-319-98161-1_2 + db/reference/sp/mdp2018.html#TooleMGP18 + + diff --git a/test/test_preprocess.py b/test/test_preprocess.py new file mode 100644 index 0000000..8a78026 --- /dev/null +++ b/test/test_preprocess.py @@ -0,0 +1,56 @@ +import pytest +import shutil +from pathlib import Path +from paperscraper._preprocess import get_unique_venues, get_extracted_data +from paperscraper.config import Config + + +@pytest.fixture(scope="class") +def test_config(request, tmp_path_factory): + root_dir = Path(__file__).parent + output_dir = tmp_path_factory.mktemp("output") + _config = Config(root_dir=root_dir, output_dir=output_dir) + _config.interesting_venues = { + "Handbook of Human Computation": { + "sourcetype": "booktitle", + "publishers": [] + }, + "Recommender Systems Handbook": { + "sourcetype": "booktitle", + "publishers": [] + }, + "Handbook of Heuristics": { + "sourcetype": "booktitle", + "publishers": [] + } + } + yield _config + shutil.rmtree(str(output_dir)) + + +class Test_get_unique_venues: + def test_get_unique_venues_first(self, test_config): + result = get_unique_venues(test_config, force=True) + _len = len(result) + result.close(force=False) + assert _len == 4 + + def test_get_unique_venues_second(self, test_config): + result = get_unique_venues(test_config, force=False) + _len = len(result) + result.close(force=False) + assert _len == 4 + + +class Test_get_extracted_data: + def test_get_extracted_data_first(self, test_config): + result = get_extracted_data(test_config, force=True) + _len = len(result) + result.close(force=False) + assert _len == 6 + + def test_get_extracted_data_second(self, test_config): + result = get_extracted_data(test_config, force=False) + _len = len(result) + result.close(force=False) + assert _len == 6 From 2e79acaa58f2d00630d8d024af2212f673431143 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Fri, 25 Nov 2022 18:16:34 -0800 
Subject: [PATCH 4/9] Adding post-process module --- paperscraper/_postprocess.py | 196 +++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 paperscraper/_postprocess.py diff --git a/paperscraper/_postprocess.py b/paperscraper/_postprocess.py new file mode 100644 index 0000000..7f57abc --- /dev/null +++ b/paperscraper/_postprocess.py @@ -0,0 +1,196 @@ +# External packages +import ast +import re +import string +import unicodedata + +from sqlitedict import SqliteDict +from loguru import logger +from tqdm import tqdm + +# Internal modules +from paperscraper.config import config, Config + +regex = re.compile(r'[\n\r\t]') +set_punctuations = set(string.punctuation) +set_numbers = set("0123456789") + +logger.remove() +logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) + + +def _clean_string(_string): + _string_normalized = unicodedata.normalize("NFKD", _string) + _string_stripped = str(regex.sub("", _string_normalized)).strip() + _string_recoded = _string_stripped.encode('ascii', 'ignore').decode('UTF-8') + return _string_recoded + + +def process_title(title_string): + """Ensure that there aren't new lines and that the titles are between X and Y characters in length.""" + try: + if not (5 < len(title_string) < 250): + return None + + return " ".join(title_string.split()) + except Exception: + # print(e) + return None + + +def process_abstract(abstract_string): + """Ensure that there aren't new lines and that the abstracts are between X and Y characters in length.""" + try: + if abstract_string in ["Not Scraped", "Error", "No Url"]: + return None + + if not (50 < len(abstract_string) < 2500): + return None + + return " ".join(abstract_string.split()) + except Exception: + # print(e) + return None + + +def process_authors(author_string): + """ + Convert utf-8 characters to ascii so that they are searchable via a keyboard. 
+ + (will result in data loss but ignore errors) + """ + try: + author_list = ast.literal_eval(author_string) + if isinstance(author_list, list): + recoded_author_list = [string.capwords(_author.encode('ascii', 'ignore').decode('UTF-8')) for _author in author_list] + return str(recoded_author_list) + except Exception: + # print(e) + pass + return author_string + + +def process_citation_counts(citation_count_string): + """Ensure that this is always NONE or NUMERIC.""" + try: + if not citation_count_string.isnumeric(): + return None + else: + return citation_count_string + except Exception: + return None + + +def process_keywords(keywords_string): + """ + Convert utf-8 characters to ascii so that they are searchable via a keyboard. + + (will result in data loss but ignore errors) + """ + try: + keywords_list = ast.literal_eval(keywords_string) + if isinstance(keywords_list, list): + processed_keywords_list = list() + + for _keyword in keywords_list: + if "→" in _keyword: + kws = _keyword.split("→") + for kw in kws: + processed_keywords_list.append(kw) + elif "Key words: " in _keyword: + _keyword = re.sub("Key words: ","",_keyword) + kws = _keyword.split(" – ") + for kw in kws: + processed_keywords_list.append(kw) + else: + processed_keywords_list.append(_keyword) + + # Start with removing Nones. + processed_keywords_list = list(filter(None, processed_keywords_list)) + + # Make them all lower-case for case insensitive match to be successful. + processed_keywords_list = [str(kw).lower() for kw in processed_keywords_list] + + # Clean the Keyword String + processed_keywords_list = [_clean_string(kw) for kw in processed_keywords_list] + + # Remove weird phrases in the Keyword that sometimes happens based on how it's maintained on the Publisher's website. 
+ _interim_processed_list = [] + for kw in processed_keywords_list: + for regex in config.keyword_patterns_to_remove: + kw = re.sub(regex, "", kw) + _interim_processed_list.append(kw) + processed_keywords_list = _interim_processed_list + + # Remove keywords if it has Only keywords or Only punctuations + processed_keywords_list = [i for i in processed_keywords_list if not all(j in set_punctuations or j in set_numbers for j in i)] + + # Finally, Remove None's again. + processed_keywords_list = list(filter(None, processed_keywords_list)) + + # Merge Different Variations of the same Keyword + _interim_processed_list = [] + for kw in processed_keywords_list: + if kw in config.keywords_to_merge: + _interim_processed_list.append(config.keywords_to_merge[kw]) + else: + _interim_processed_list.append(kw) + processed_keywords_list = _interim_processed_list + + # And of course, de-duplicate if some have both HCI and Human-Computer Interaction initially. + processed_keywords_list = list(set(processed_keywords_list)) + + # Let's capitalize the keywords so that they look nice. 
+ processed_keywords_list = [string.capwords(kw) for kw in processed_keywords_list] + + return str(processed_keywords_list) + except Exception: + # print(e) + pass + return None + + +def get_post_processed_data(config: Config, force: bool = False) -> SqliteDict: + """Process fields and return them.""" + if force or not config.path_output.exists(): + # Read it + papers_db = SqliteDict(config.path_output) + + author_processed = [] + keywords_processed = [] + citation_count_processed = [] + abstract_processed = [] + title_processed = [] + + for index, row in tqdm(papers_db.items(), desc="Papers", total=len(papers_db)): + # Process authors + author_processed.append(process_authors(row["author"])) + + # Process keywords + keywords_processed.append(process_keywords(row["keywords"])) + + # Process citation counts + citation_count_processed.append(process_citation_counts(row["citation_count"])) + + # Process abstract + abstract_processed.append(process_abstract(row["abstract"])) + + # Process titles + title_processed.append(process_title(row["title"])) + + papers_db.close() + + # Commit all the data to db + scraped_input_db = SqliteDict(config.path_postprocessing_output) + scraped_input_db["author_processed"] = author_processed + scraped_input_db["keywords_processed"] = keywords_processed + scraped_input_db["citation_count_processed"] = citation_count_processed + scraped_input_db["abstract_processed"] = abstract_processed + scraped_input_db["title_processed"] = title_processed + + # Save POST-PROCESSED FILE + scraped_input_db.commit() + else: + scraped_input_db = SqliteDict(config.path_postprocessing_output) + + return scraped_input_db From 49321cfbb695d7983c2c8928492760a55dc1129f Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:48:34 -0800 Subject: [PATCH 5/9] Merge get_(unique_venues|extracted_data) & memory optimize xml iter --- paperscraper/_preprocess.py | 72 ++++++++++++++++++++----------------- test/test_preprocess.py | 36 +++++++------------ 2 
files changed, 52 insertions(+), 56 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index b284f2f..a1557e2 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -63,18 +63,30 @@ def get_processed_db(force: bool = False) -> Path: return config.path_input -# TODO: Re-run this if (1) The above list has changed OR (2) There is a NEW DBLP snapshot. -def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: +# TODO: Re-run this if +# (1) The list has changed or +# (2) There is a NEW DBLP snapshot. +def get_extracted_data(config: Config, force: bool = False) -> tuple[SqliteDict, SqliteDict]: """ - Find Unique venues from the DBLP xml. + FILTER the huge dblp_processed.xml file to keep just the data that we are interested in and Find Unique venues from the DBLP xml. - Looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. + For unqiue venues looking ONLY for ["article","inproceedings","incollection"] and ["journal", "booktitle"]. 
""" - if force or not config.path_unique_venues.exists(): + if force or not config.path_output.exists(): + logger.info(f"Extracting venues to {config.path_unique_venues}") unique_sources = SqliteDict(config.path_unique_venues) unique_sources.clear() # empty the db - logger.info(f"Extracting venues to {config.path_unique_venues}") - for event, elem in tqdm(ET.iterparse(config.path_input, recover=True), desc="Entry"): + + logger.info(f"Extracting data to {config.path_output}") + result_list = SqliteDict(config.path_output) + result_list.clear() # empty the db + src_set = set() + + _idx: dict[int, int] = {0: 0} + + for event, elem in tqdm(ET.iterparse(config.path_input, encoding='UTF-8', events=("end", ) ,recover=True), desc="Entry"): + _idx[0] += 1 + if elem.tag in ["article", "inproceedings", "incollection"]: for child in elem.getchildren(): if child.tag in ["journal", "booktitle"]: @@ -89,27 +101,6 @@ def get_unique_venues(config: Config, force: bool = False) -> SqliteDict: child_dict["count"] += 1 unique_sources[child.text] = child_dict - logger.debug("Writing to disk") - # Save it to disk - unique_sources.commit() - else: - logger.info(f"Loading data from {config.path_unique_venues}") - unique_sources = SqliteDict(config.path_unique_venues) - - return unique_sources - - -# TODO: Re-run this if -# (1) The list has changed or -# (2) There is a NEW DBLP snapshot. -def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: - """FILTER the huge dblp_processed.xml file to keep just the data that we are interested in.""" - if force or not config.path_output.exists(): - logger.info(f"Extracting data to {config.path_output}") - result_list = SqliteDict(config.path_output) - result_list.clear() # empty the db - src_set = set() - for _idx, (event, elem) in tqdm(enumerate(ET.iterparse(config.path_input, encoding='UTF-8', recover=True)), desc="Entry"): obj: dict = {} # Initialize the fields that we are going to scrape. 
# TODO: Update these if more fields are added. @@ -141,20 +132,35 @@ def get_extracted_data(config: Config, force: bool = False) -> SqliteDict: logger.debug(f"Adding source: {child.text}") if to_add: - result_list[_idx] = obj + result_list[_idx[0]] = obj # Periodically commiting stuff - if _idx % 100 == 0: + if _idx[0] % 200000 == 0: + unique_sources.commit() result_list.commit() + # from https://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files + # http://lxml.de/parsing.html#modifying-the-tree + # Based on Liza Daly's fast_iter + # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + # See also http://effbot.org/zone/element-iterparse.htm + # NOTE: deleting only the 2nd level nodes + if len(elem.getroottree().getpath(elem).split("/")) <= 3: + elem.clear() + while elem.getprevious() is not None: + del elem.getparent()[0] + logger.debug("Writing to disk") # Save to disk + unique_sources.commit() result_list.commit() else: logger.info(f"Loading data from {config.path_output}") result_list = SqliteDict(config.path_output) + logger.info(f"Loading data from {config.path_unique_venues}") + unique_sources = SqliteDict(config.path_unique_venues) - return result_list + return result_list, unique_sources # get a new headless Chrome driver @@ -337,11 +343,11 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: # Persist the paper file papers_db.commit() - logger.i("scraped papers saved to disk.") + logger.info("scraped papers saved to disk.") # Persist Logs df_logs = pd.DataFrame.from_dict(log_obj, orient="index") - logger.i(log_obj) + logger.info(log_obj) df_logs.to_csv(config.path_logfile, sep='\t', header=True) else: logger.info(f"Loading processed data from {config.path_output}") diff --git a/test/test_preprocess.py b/test/test_preprocess.py index 8a78026..c3d2530 100644 --- a/test/test_preprocess.py +++ b/test/test_preprocess.py @@ -1,7 +1,7 @@ import pytest import shutil from pathlib import Path -from 
paperscraper._preprocess import get_unique_venues, get_extracted_data +from paperscraper._preprocess import get_extracted_data from paperscraper.config import Config @@ -28,29 +28,19 @@ def test_config(request, tmp_path_factory): shutil.rmtree(str(output_dir)) -class Test_get_unique_venues: - def test_get_unique_venues_first(self, test_config): - result = get_unique_venues(test_config, force=True) - _len = len(result) - result.close(force=False) - assert _len == 4 - - def test_get_unique_venues_second(self, test_config): - result = get_unique_venues(test_config, force=False) - _len = len(result) - result.close(force=False) - assert _len == 4 - - class Test_get_extracted_data: + def _get_extracted_data_results(self, data, venues): + _len_data = len(data) + data.close(force=True) + _len_venues = len(venues) + venues.close(force=True) + assert _len_data == 6 + assert _len_venues == 4 + def test_get_extracted_data_first(self, test_config): - result = get_extracted_data(test_config, force=True) - _len = len(result) - result.close(force=False) - assert _len == 6 + data, venues = get_extracted_data(test_config, force=True) + self._get_extracted_data_results(data, venues) def test_get_extracted_data_second(self, test_config): - result = get_extracted_data(test_config, force=False) - _len = len(result) - result.close(force=False) - assert _len == 6 + data, venues = get_extracted_data(test_config, force=False) + self._get_extracted_data_results(data, venues) From 4b449ec13503f4dd657519bbb50461c7f6f81013 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:50:02 -0800 Subject: [PATCH 6/9] Update cli & adding tests --- paperscraper/_cli.py | 43 ++++++++++++++++++++++++++++---- test/test_cli.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 test/test_cli.py diff --git a/paperscraper/_cli.py b/paperscraper/_cli.py index 9bbcd28..d6be844 100644 --- a/paperscraper/_cli.py +++ 
b/paperscraper/_cli.py @@ -1,20 +1,53 @@ import click -from paperscraper._preprocess import (get_processed_db, get_unique_venues, get_extracted_data, get_processed_data) +from paperscraper._preprocess import (get_processed_db, get_extracted_data, get_processed_data) +from paperscraper._postprocess import get_post_processed_data from paperscraper.config import config @click.group() def cli(): + """Cli interface for paperscraper.""" pass @cli.group() def process(): + """Process and setup database.""" pass + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def process_db(force): + """Process the dblp xml file.""" + get_processed_db(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def extract_data(force): + """Extract data from processed dblp xml file.""" + get_extracted_data(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def process_data(force): + """Process extracted data.""" + get_processed_data(config=config, force=force) + + +@process.command() +@click.option("-f", "--force", help="Force run all steps", is_flag=True) +def post_process_data(force): + """Run cleanup process after processing data.""" + get_post_processed_data(config=config, force=force) + + @process.command() @click.option("-f", "--force", help="Force run all steps", is_flag=True) def run_all(force): - get_processed_db(force=False) - get_unique_venues(config, force=False) - get_extracted_data(config, force=False) - get_processed_data(config, force=force) + """Run all steps in order.""" + get_processed_db(config=config, force=force) + get_extracted_data(config=config, force=force) + get_processed_data(config=config, force=force) + get_post_processed_data(config=config, force=force) diff --git a/test/test_cli.py b/test/test_cli.py new file mode 100644 index 0000000..b7b260d --- /dev/null +++ 
b/test/test_cli.py @@ -0,0 +1,58 @@ +import pytest +import importlib +from click.testing import CliRunner +import paperscraper +import pytest_mock + +import paperscraper._cli + + +@pytest.fixture(scope="function") +def runner(): + return CliRunner() + + +def called_with_config_and_force(mocked_function): + mocked_function.assert_called_with(config=paperscraper._cli.config, force=True) + + +def mock_function(mocker, mock_function): + mocker.patch(mock_function) + # Before the main methods gets imported need to mock them + importlib.reload(paperscraper._cli) + + +def test_process_db(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_db") + result = runner.invoke(paperscraper._cli.cli, ["process", "process-db", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_db) + + +def test_extract_data(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_extracted_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "extract-data", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_extracted_data) + + +def test_process_data(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "process-data", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_data) + + +def test_post_process_data(runner, mocker): + mock_function(mocker, "paperscraper._postprocess.get_post_processed_data") + result = runner.invoke(paperscraper._cli.cli, ["process", "post-process-data", "-f"]) + called_with_config_and_force(paperscraper._postprocess.get_post_processed_data) + + +def test_run_all(runner, mocker): + mock_function(mocker, "paperscraper._preprocess.get_processed_db") + mock_function(mocker, "paperscraper._preprocess.get_extracted_data") + mock_function(mocker, "paperscraper._preprocess.get_processed_data") + mock_function(mocker, "paperscraper._postprocess.get_post_processed_data") 
+ result = runner.invoke(paperscraper._cli.cli, ["process", "run-all", "-f"]) + called_with_config_and_force(paperscraper._preprocess.get_processed_db) + called_with_config_and_force(paperscraper._preprocess.get_extracted_data) + called_with_config_and_force(paperscraper._preprocess.get_processed_data) + called_with_config_and_force(paperscraper._postprocess.get_post_processed_data) From 4e79979fd029d1e0409bdceec348d8ed8a3370b1 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 01:50:22 -0800 Subject: [PATCH 7/9] Adding dev dependencies & add .xml.gz to gitignore --- .gitignore | 1 + poetry.lock | 90 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 3 ++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ad07f50..34882de 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ chromedriver .DS_Store *.exe *.xml +*.xml.gz *.log .idea/ *.pyc diff --git a/poetry.lock b/poetry.lock index dfe74ba..3c92596 100644 --- a/poetry.lock +++ b/poetry.lock @@ -103,6 +103,14 @@ sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] +[[package]] +name = "debugpy" +version = "1.6.3" +description = "An implementation of the Debug Adapter Protocol for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "exceptiongroup" version = "1.0.4" @@ -167,6 +175,17 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] +[[package]] +name = "memory-profiler" +version = "0.61.0" +description = "A module for monitoring memory usage of a python program" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +psutil = "*" + [[package]] name = "numpy" version = "1.22.3" @@ -230,6 +249,17 @@ python-versions = ">=3.6" dev = ["pre-commit", "tox"] testing = ["pytest", 
"pytest-benchmark"] +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + [[package]] name = "pycparser" version = "2.21" @@ -292,6 +322,20 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +[[package]] +name = "pytest-mock" +version = "3.10.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=5.0" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -492,7 +536,7 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "1.1" python-versions = "~=3.8" -content-hash = "3527f8a60e9adf40ccd4edbc67ca03e2dd188ed70987df71a72b926bb1dc6aff" +content-hash = "963f00872e5cf8e48cf9a053276d77ea593d40b80b6c670c1c2e7e5d37309c33" [metadata.files] async-generator = [ @@ -597,6 +641,26 @@ cryptography = [ {file = "cryptography-36.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e167b6b710c7f7bc54e67ef593f8731e1f45aa35f8a8a7b72d6e42ec76afd4b3"}, {file = "cryptography-36.0.2.tar.gz", hash = "sha256:70f8f4f7bb2ac9f340655cbac89d68c527af5bb4387522a8413e841e3e6628c9"}, ] +debugpy = [ + {file = "debugpy-1.6.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:c4b2bd5c245eeb49824bf7e539f95fb17f9a756186e51c3e513e32999d8846f3"}, + {file = "debugpy-1.6.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b8deaeb779699350deeed835322730a3efec170b88927debc9ba07a1a38e2585"}, + {file = "debugpy-1.6.3-cp310-cp310-win32.whl", hash = 
"sha256:fc233a0160f3b117b20216f1169e7211b83235e3cd6749bcdd8dbb72177030c7"}, + {file = "debugpy-1.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:dda8652520eae3945833e061cbe2993ad94a0b545aebd62e4e6b80ee616c76b2"}, + {file = "debugpy-1.6.3-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:d5c814596a170a0a58fa6fad74947e30bfd7e192a5d2d7bd6a12156c2899e13a"}, + {file = "debugpy-1.6.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c4cd6f37e3c168080d61d698390dfe2cd9e74ebf80b448069822a15dadcda57d"}, + {file = "debugpy-1.6.3-cp37-cp37m-win32.whl", hash = "sha256:3c9f985944a30cfc9ae4306ac6a27b9c31dba72ca943214dad4a0ab3840f6161"}, + {file = "debugpy-1.6.3-cp37-cp37m-win_amd64.whl", hash = "sha256:5ad571a36cec137ae6ed951d0ff75b5e092e9af6683da084753231150cbc5b25"}, + {file = "debugpy-1.6.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:adcfea5ea06d55d505375995e150c06445e2b20cd12885bcae566148c076636b"}, + {file = "debugpy-1.6.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:daadab4403427abd090eccb38d8901afd8b393e01fd243048fab3f1d7132abb4"}, + {file = "debugpy-1.6.3-cp38-cp38-win32.whl", hash = "sha256:6efc30325b68e451118b795eff6fe8488253ca3958251d5158106d9c87581bc6"}, + {file = "debugpy-1.6.3-cp38-cp38-win_amd64.whl", hash = "sha256:86d784b72c5411c833af1cd45b83d80c252b77c3bfdb43db17c441d772f4c734"}, + {file = "debugpy-1.6.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4e255982552b0edfe3a6264438dbd62d404baa6556a81a88f9420d3ed79b06ae"}, + {file = "debugpy-1.6.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cca23cb6161ac89698d629d892520327dd1be9321c0960e610bbcb807232b45d"}, + {file = "debugpy-1.6.3-cp39-cp39-win32.whl", hash = "sha256:7c302095a81be0d5c19f6529b600bac971440db3e226dce85347cc27e6a61908"}, + {file = "debugpy-1.6.3-cp39-cp39-win_amd64.whl", hash = 
"sha256:34d2cdd3a7c87302ba5322b86e79c32c2115be396f3f09ca13306d8a04fe0f16"}, + {file = "debugpy-1.6.3-py2.py3-none-any.whl", hash = "sha256:84c39940a0cac410bf6aa4db00ba174f973eef521fbe9dd058e26bcabad89c4f"}, + {file = "debugpy-1.6.3.zip", hash = "sha256:e8922090514a890eec99cfb991bab872dd2e353ebb793164d5f01c362b9a40bf"}, +] exceptiongroup = [ {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, @@ -680,6 +744,10 @@ lxml = [ {file = "lxml-4.8.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93"}, {file = "lxml-4.8.0.tar.gz", hash = "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23"}, ] +memory-profiler = [ + {file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"}, + {file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"}, +] numpy = [ {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"}, {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"}, @@ -737,6 +805,22 @@ pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] +psutil = [ + {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, + {file = 
"psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, + {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, + {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, + {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, + {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, + {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, +] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = 
"sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -758,6 +842,10 @@ pytest = [ {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, ] +pytest-mock = [ + {file = "pytest-mock-3.10.0.tar.gz", hash = "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"}, + {file = "pytest_mock-3.10.0-py3-none-any.whl", hash = "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, diff --git a/pyproject.toml b/pyproject.toml index 7c4842b..066b74a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ sqlitedict = "^2.0.0" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" +debugpy = "^1.6.3" +memory-profiler = "^0.61.0" +pytest-mock = "^3.10.0" [build-system] requires = ["poetry-core>=1.0.0"] From d365b44e8bf09d21b772b22974fb1113c832bc3c Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 17:39:22 -0800 Subject: [PATCH 8/9] Improved logging during get_processed_data --- paperscraper/__init__.py | 25 ++++++++++++++++ paperscraper/_preprocess.py | 47 +++++++++++++++++++++++++++---- paperscraper/config.py | 6 +++- paperscraper/scrapers/keywords.py | 43 +++++++++++++++------------- 4 files changed, 95 insertions(+), 26 deletions(-) diff --git a/paperscraper/__init__.py b/paperscraper/__init__.py index e69de29..6088c6c 100644 --- a/paperscraper/__init__.py +++ b/paperscraper/__init__.py @@ -0,0 +1,25 @@ +from 
importlib.metadata import version +import logging +from loguru import logger + +__version__ = version(__package__) + + +class __InterceptHandler(logging.Handler): + def emit(self, record): + # Get corresponding Loguru level if it exists + try: + level = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message + frame, depth = logging.currentframe(), 2 + while frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth += 1 + + logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage()) + + +logging.basicConfig(handlers=[__InterceptHandler()], level=0) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index a1557e2..ad1c952 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -1,4 +1,4 @@ -import ast +import sys import re import time from pathlib import Path @@ -13,6 +13,7 @@ from sqlitedict import SqliteDict from tqdm import tqdm from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from paperscraper.config import Config, config from paperscraper.scrapers.abstracts import get_abstract @@ -167,16 +168,51 @@ def get_extracted_data(config: Config, force: bool = False) -> tuple[SqliteDict, def _get_webdriver_instance(): chrome_options = Options() chrome_options.add_argument("--headless") + chrome_desired_capabilities = DesiredCapabilities.CHROME + chrome_desired_capabilities['goog:loggingPrefs'] = { 'browser':'ALL' } # chrome_options.binary_location = config.path_chromeoptions_binary # driver = webdriver.chrome(executable_path=config.path_chromedriver, chrome_options=chrome_options) driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options) # driver.implicitly_wait(10000) + driver._old_get_method = driver.get + driver.get = lambda *args, **kwargs: 
get_browser_log_entries(driver, *args, **kwargs) return driver -def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: +def get_browser_log_entries(driver, *args, **kwargs): + """get log entreies from selenium and add to python logger before returning""" + ret_val = driver._old_get_method(*args, **kwargs) + loglevels = { + 'NOTSET': 'TRACE' , + 'DEBUG': 'DEBUG' , + 'INFO': 'INFO' , + 'WARNING':'WARNING', + 'ERROR': 'ERROR', + 'SEVERE':'ERROR', + 'CRITICAL':'CRITICAL' + } + + #get browser logs + slurped_logs = driver.get_log('browser') + for entry in slurped_logs: + #convert broswer log to python log format + rec = logger.log(loglevels.get(entry['level']), "{}: {}".format(entry['source'], entry['message'])) + # rec.created = entry['timestamp'] /1000 # log using original timestamp.. us -> ms + # try: + # #add browser log to python log + # browserlog.handle(rec) + # except: + # print(entry) + #and return logs incase you want them + return ret_val + + +def get_processed_data(config: Config, force: bool = False) -> SqliteDict: """Scrap the Abstracts, Keywords, and Citations.""" + + logger.add(config.path_console_log_file) + if force or not config.path_output.exists(): # Get a webdriver instance (Headless Chrome) logger.info(f"Processing data to {config.path_output}") @@ -210,6 +246,7 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: log_obj[row["source"]]["no_of_citations_fetch_errors"] = 0 log_obj[row["source"]]["no_of_citations_errors"] = 0 + logger.debug("Processing {} ".format(row["title"])) # Increment no of papers log_obj[row["source"]]["papers"] += 1 @@ -247,7 +284,7 @@ def get_processed_data(cofig: Config, force: bool = False) -> SqliteDict: abstract_soup = BeautifulSoup(driver.page_source, 'lxml') except Exception as e: - logger.error('Abstract: ' + str(e)) + logger.error(f'{index} Abstract: ' + str(e)) if abstract_soup is not None: is_abstract = False @@ -327,8 +364,8 @@ def get_processed_data(cofig: Config, force: 
bool = False) -> SqliteDict: log_obj[row["source"]]["keyword_fetch_errors"] += 1 log_obj[row["source"]]["keyword_errors"] += 1 - except Exception: - pass + except Exception as e: + logger.error(f'{index} Keywords: ' + str(e)) if not is_keyword: row['keywords'] = "Error" diff --git a/paperscraper/config.py b/paperscraper/config.py index dacc934..64c04c4 100644 --- a/paperscraper/config.py +++ b/paperscraper/config.py @@ -1,3 +1,4 @@ +from datetime import datetime from pathlib import Path from typing import Union @@ -32,7 +33,10 @@ def __init__(self, root_dir: Union[str, Path] = None, self.path_unique_venues = output_dir / "unique_venues.db" self.path_unique_keywords = output_dir / "unique_keywords.tsv" self.path_unique_authors = output_dir/ "unique_authors.tsv" - self.path_logfile = output_dir / "log.tsv" + + datetime_str = f"{datetime.now():%Y-%m-%d_%H-%M-%S%z}" + self.path_logfile = output_dir / f"log-{datetime_str}.tsv" + self.path_console_log_file = output_dir / f"console-{datetime_str}.log" # ChromeDriver # TODO Option 1: Manual Download from https://chromedriver.chromium.org/downloads (e.g., ChromeDriver 86.0.4240.22) and save to a known location in PATH diff --git a/paperscraper/scrapers/keywords.py b/paperscraper/scrapers/keywords.py index e6e9ffb..85cc14b 100644 --- a/paperscraper/scrapers/keywords.py +++ b/paperscraper/scrapers/keywords.py @@ -1,10 +1,13 @@ import re +from loguru import logger regex = re.compile(r'[\n\r\t]') def acm_digital_library(soup): try: + # TODO: Get keyoards by clicking on the citation linke (soup.select('a[data-title="Export Citation"]')) + # Then using the the ActionChains from selenium to click, parse the bib result and get keywords keywords = set() keywords_parent_ol = soup.find('ol', class_="rlist organizational-chart") keywords_divs = keywords_parent_ol.findChildren('div', recursive=True) @@ -13,12 +16,12 @@ def acm_digital_library(soup): keywords.add(regex.sub("", kw.split(",")[0])) return list(keywords) except Exception as 
e: - print(e) - return None + logger.error(e) + return [] def graphics_interface_proceedings(soup): - return None + return [] def ieee_explore(soup): @@ -36,8 +39,8 @@ def ieee_explore(soup): keywords.add(str(regex.sub("", str(keywords_l.text).split(",")[0]))) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def eurographics_digital_library(soup): @@ -65,8 +68,8 @@ def eurographics_digital_library(soup): keywords_set.update(re.split(',|:|;', keywords_str)) return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def springer_v2(soup): @@ -78,8 +81,8 @@ def springer_v2(soup): keywords.add(k.text) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def dagstuhl(soup): @@ -91,8 +94,8 @@ def dagstuhl(soup): if keywords_font is not None: return re.split(',', keywords_font.text) except Exception as e: - print(e) - return None + logger.error(e) + return [] def springer_v1(soup): @@ -105,8 +108,8 @@ def springer_v1(soup): keywords.add(str(regex.sub("", kw)).strip()) return list(keywords) except Exception as e: - print(e) - return None + logger.error(e) + return [] def wiley_online_library(soup): @@ -138,12 +141,12 @@ def wiley_online_library(soup): return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def cogsci(soup): - return None + return [] def scitepress(soup): @@ -154,8 +157,8 @@ def scitepress(soup): keywords_set.add(kw) return list(keywords_set) except Exception as e: - print(e) - return None + logger.error(e) + return [] def scienceopen(soup): @@ -168,11 +171,11 @@ def scienceopen(soup): return list(keywords_set) except Exception as e: pass - return None + return [] def aaai(soup): - return None + return [] def get_keywords(publisher, soup): From 2d0b3164bab0ec6a2b4a259be9df95462812d8d3 Mon Sep 17 00:00:00 2001 From: Ahmed Shariff Date: Sat, 26 Nov 2022 17:40:01 -0800 
Subject: [PATCH 9/9] Fixes in get_processed_data - direct use url, commit every step --- paperscraper/_preprocess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/paperscraper/_preprocess.py b/paperscraper/_preprocess.py index ad1c952..7cbde0f 100644 --- a/paperscraper/_preprocess.py +++ b/paperscraper/_preprocess.py @@ -253,13 +253,13 @@ def get_processed_data(config: Config, force: bool = False) -> SqliteDict: # Get the URLs urls = [] try: - urls = ast.literal_eval(row["ee"]) + urls = row["ee"] except Exception: # If not ee, check url. # But, this doesn't have HTTP/HTTPS it seems to be following some Relative Paths from a # BaseURL that is unknown. Hence, it will fail 99% of the times. try: - urls = ast.literal_eval(row["url"]) + urls = row["url"] except Exception: pass @@ -375,8 +375,7 @@ def get_processed_data(config: Config, force: bool = False) -> SqliteDict: papers_db[index] = row - if index % 100 == 100: - papers_db.commit() + papers_db.commit() # Persist the paper file papers_db.commit()