Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 45 additions & 16 deletions src/mdverse_scrapers/core/toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def read_query_file(query_file_path: Path, logger: "loguru.Logger" = loguru.logg
exclusion_path_patterns : list[str]
Patterns for path exclusion.
"""
with open(query_file_path) as param_file:
with open(query_file_path, encoding="utf-8") as param_file:
logger.info(f"Reading parameters from: {query_file_path}")
data_loaded = yaml.safe_load(param_file)
keywords = data_loaded["keywords"]
Expand Down Expand Up @@ -209,28 +209,57 @@ def remove_duplicates_in_list_of_dicts(input_list: list[dict]) -> list[dict]:
return output_list


def strip_html(input_text: str) -> str:
    """Remove html tags.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Parse with the lxml backend and keep only the text content;
    # BeautifulSoup decodes HTML entities as a side effect of parsing.
    return BeautifulSoup(input_text, features="lxml").text


def strip_whitespace(input_text: str) -> str:
    """Remove whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # First turn tabs and line breaks into spaces, then squeeze any
    # run of two or more spaces down to a single one.
    normalized = input_text
    for pattern, replacement in ((r"[\n\r\t]", " "), (r" {2,}", " ")):
        normalized = re.sub(pattern, replacement, normalized)
    return normalized


def clean_text(input_text: str) -> str:
    """Remove html tags and whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Strip markup first so that whitespace left behind by removed
    # tags is normalized in the second pass. The intermediate result
    # gets its own name instead of shadowing this function's name,
    # which the original did (`clean_text = strip_html(...)`).
    text_without_html = strip_html(input_text)
    return strip_whitespace(text_without_html)


def remove_excluded_files(
Expand Down Expand Up @@ -303,7 +332,7 @@ def find_false_positive_datasets(
) -> list[str]:
"""Find false positive datasets.

False positive datasets are datasets that propably do not
False positive datasets are datasets that probably do not
contain any molecular dynamics data.

Parameters
Expand Down
7 changes: 4 additions & 3 deletions src/mdverse_scrapers/scrapers/figshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
print_statistics,
read_query_file,
remove_excluded_files,
strip_html,
)
from ..models.enums import DatasetSourceName
from ..models.scraper import ScraperContext
Expand Down Expand Up @@ -242,12 +243,12 @@ def extract_metadata_from_single_dataset_record(
"dataset_url_in_repository": record_json.get("url_public_html"),
"date_created": record_json.get("created_date"),
"date_last_updated": record_json.get("modified_date"),
"title": clean_text(record_json.get("title")),
"title": clean_text(record_json.get("title", "")),
"author_names": [
clean_text(author.get("full_name"))
for author in record_json.get("authors", [])
],
"description": clean_text(record_json.get("description")),
"description": strip_html(record_json.get("description", "")),
"license": record_json.get("license", {}).get("name"),
"doi": record_json.get("doi"),
"download_number": dataset_stats["download_number"],
Expand Down Expand Up @@ -330,7 +331,7 @@ def search_all_datasets(
found_datasets_per_keyword = []
# Search endpoint: /articles/search
# https://docs.figshare.com/#articles_search
# Iterate seach on pages.
# Iterate search on pages.
while True:
data_query = {
"order": "published_date",
Expand Down
5 changes: 3 additions & 2 deletions src/mdverse_scrapers/scrapers/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
read_query_file,
remove_duplicates_in_list_of_dicts,
remove_excluded_files,
strip_html,
)
from ..models.enums import DatasetSourceName
from ..models.file import FileMetadata
Expand Down Expand Up @@ -162,7 +163,7 @@ def extract_data_from_zip_file(url, logger: "loguru.Logger" = loguru.logger):
Returns
-------
list
List of dictionnaries with data extracted from zip preview.
List of dictionaries with data extracted from zip preview.
"""
file_lst = []
response = make_http_get_request_with_retries(
Expand Down Expand Up @@ -330,7 +331,7 @@ def extract_metadata_from_json(
for author in hit.get("metadata", {}).get("creators", [])
if author.get("name", None)
],
"description": clean_text(hit.get("metadata", {}).get("description", "")),
"description": strip_html(hit.get("metadata", {}).get("description", "")),
"keywords": [
str(keyword) for keyword in hit.get("metadata", {}).get("keywords", [])
],
Expand Down