diff --git a/src/mdverse_scrapers/core/toolbox.py b/src/mdverse_scrapers/core/toolbox.py
index 0a7a7d3..236d978 100644
--- a/src/mdverse_scrapers/core/toolbox.py
+++ b/src/mdverse_scrapers/core/toolbox.py
@@ -175,7 +175,7 @@ def read_query_file(query_file_path: Path, logger: "loguru.Logger" = loguru.logg
     exclusion_path_patterns : list[str]
         Patterns for path exclusion.
     """
-    with open(query_file_path) as param_file:
+    with open(query_file_path, encoding="utf-8") as param_file:
         logger.info(f"Reading parameters from: {query_file_path}")
         data_loaded = yaml.safe_load(param_file)
         keywords = data_loaded["keywords"]
@@ -209,28 +209,57 @@ def remove_duplicates_in_list_of_dicts(input_list: list[dict]) -> list[dict]:
     return output_list
 
 
-def clean_text(string):
-    """Decode html and remove breaks.
+def strip_html(input_text: str) -> str:
+    """Remove html tags.
 
     Arguments
     ---------
-    string: str
-        input string
+    input_text: str
+        input text
 
     Returns
     -------
     str
-        decoded string.
+        clean text
     """
-    # Remove HTML tags
-    # text_decode = BeautifulSoup(string, features="lxml")
-    # text_decode = u''.join(text_decode.findAll(text=True))
-    text_decode = BeautifulSoup(string, features="lxml").text
-    # Remove tabulation and carriage return
-    text_decode = re.sub(r"[\n\r\t]", " ", text_decode)
-    # Remove multi spaces
-    text_decode = re.sub(r" {2,}", " ", text_decode)
-    return text_decode
+    return BeautifulSoup(input_text, features="lxml").text
+
+
+def strip_whitespace(input_text: str) -> str:
+    """Remove whitespace characters.
+
+    Arguments
+    ---------
+    input_text: str
+        input text
+
+    Returns
+    -------
+    str
+        clean text
+    """
+    # Remove tabulation and carriage return.
+    text_clean = re.sub(r"[\n\r\t]", " ", input_text)
+    # Remove multi spaces.
+    text_clean = re.sub(r" {2,}", " ", text_clean)
+    return text_clean
+
+
+def clean_text(input_text: str) -> str:
+    """Remove html tags and whitespace characters.
+
+    Arguments
+    ---------
+    input_text: str
+        input text
+
+    Returns
+    -------
+    str
+        clean text
+    """
+    clean_text = strip_html(input_text)
+    return strip_whitespace(clean_text)
 
 
 def remove_excluded_files(
@@ -303,7 +332,7 @@ def find_false_positive_datasets(
 ) -> list[str]:
     """Find false positive datasets.
 
-    False positive datasets are datasets that propably do not
+    False positive datasets are datasets that probably do not
     contain any molecular dynamics data.
 
     Parameters
diff --git a/src/mdverse_scrapers/scrapers/figshare.py b/src/mdverse_scrapers/scrapers/figshare.py
index aa52b15..4365019 100644
--- a/src/mdverse_scrapers/scrapers/figshare.py
+++ b/src/mdverse_scrapers/scrapers/figshare.py
@@ -21,6 +21,7 @@
     print_statistics,
     read_query_file,
     remove_excluded_files,
+    strip_html,
 )
 from ..models.enums import DatasetSourceName
 from ..models.scraper import ScraperContext
@@ -242,12 +243,12 @@ def extract_metadata_from_single_dataset_record(
         "dataset_url_in_repository": record_json.get("url_public_html"),
         "date_created": record_json.get("created_date"),
         "date_last_updated": record_json.get("modified_date"),
-        "title": clean_text(record_json.get("title")),
+        "title": clean_text(record_json.get("title", "")),
         "author_names": [
             clean_text(author.get("full_name"))
             for author in record_json.get("authors", [])
         ],
-        "description": clean_text(record_json.get("description")),
+        "description": strip_html(record_json.get("description", "")),
         "license": record_json.get("license", {}).get("name"),
         "doi": record_json.get("doi"),
         "download_number": dataset_stats["download_number"],
@@ -330,7 +331,7 @@ def search_all_datasets(
     found_datasets_per_keyword = []
     # Search endpoint: /articles/search
     # https://docs.figshare.com/#articles_search
-    # Iterate seach on pages.
+    # Iterate search on pages.
     while True:
         data_query = {
             "order": "published_date",
diff --git a/src/mdverse_scrapers/scrapers/zenodo.py b/src/mdverse_scrapers/scrapers/zenodo.py
index 59077f1..aa23fe9 100644
--- a/src/mdverse_scrapers/scrapers/zenodo.py
+++ b/src/mdverse_scrapers/scrapers/zenodo.py
@@ -20,6 +20,7 @@
     read_query_file,
     remove_duplicates_in_list_of_dicts,
     remove_excluded_files,
+    strip_html,
 )
 from ..models.enums import DatasetSourceName
 from ..models.file import FileMetadata
@@ -162,7 +163,7 @@ def extract_data_from_zip_file(url, logger: "loguru.Logger" = loguru.logger):
     Returns
     -------
     list
-        List of dictionnaries with data extracted from zip preview.
+        List of dictionaries with data extracted from zip preview.
     """
     file_lst = []
     response = make_http_get_request_with_retries(
@@ -330,7 +331,7 @@ def extract_metadata_from_json(
             for author in hit.get("metadata", {}).get("creators", [])
             if author.get("name", None)
         ],
-        "description": clean_text(hit.get("metadata", {}).get("description", "")),
+        "description": strip_html(hit.get("metadata", {}).get("description", "")),
         "keywords": [
             str(keyword) for keyword in hit.get("metadata", {}).get("keywords", [])
         ],