From 09e5d513508ec7f9d63a8f86f00dfb7c9adf7773 Mon Sep 17 00:00:00 2001 From: essmaw Date: Wed, 24 Dec 2025 16:17:34 +0100 Subject: [PATCH 1/9] Introduce new script for scraping GPCRMD datasets and files. Enhance dataset model to use float for timestep and delta; add simulation_time field. --- models/dataset_model.py | 7 +- scripts/scrap_gpcrmd.py | 559 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 scripts/scrap_gpcrmd.py diff --git a/models/dataset_model.py b/models/dataset_model.py index b89d42b..0629c19 100644 --- a/models/dataset_model.py +++ b/models/dataset_model.py @@ -182,12 +182,15 @@ class BaseDataset(BaseModel): None, description="Version of the forcefield model.", ) - timestep: int | None = Field( + timestep: float | None = Field( None, description="The time interval between new positions computation (in fs)." ) - delta: int | None = Field( + delta: float | None = Field( None, description="The time gap between frames (in ns)." ) + simulation_time: str | None = Field( + None, description="The accumulated simulation time (in Ξs)." + ) # ------------------------------------------------------------------ # Validators diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py new file mode 100644 index 0000000..4d8ad29 --- /dev/null +++ b/scripts/scrap_gpcrmd.py @@ -0,0 +1,559 @@ +""" +Scrape datasets and files from GPCRMD. + +This script fetches datasets from the GPCRMD repository (https://www.gpcrmd.org/). +It collects metadata such as dataset names, descriptions, authors, download links, +and other relevant information for all available datasets. +Additionally, it retrieves file metadata for each dataset, including file paths +in GPCRMD, file size, type/extension, etc. 
+ +The scraped data is validated against Pydantic models (`BaseDataset` and `BaseFile`) +and saved locally in Parquet format: +- "data/gpcrmd/{timestamp}/validated_entries.parquet" +- "data/gpcrmd/{timestamp}/validated_files.parquet" + +Entries that fail validation are saved as: +- "data/gpcrmd/{timestamp}/unvalidated_entries.parquet" +- "data/gpcrmd/{timestamp}/unvalidated_files.parquet" + + +Usage: +====== + uv run -m scripts.scrap_gpcrmd [--out-path] + +Arguments: +========== + --out-path : (optional) + Folder path to save the scraped GPCRMD data (dataset and file metadata). + Default is "data/gpcrmd/{timestamp}". + +Example: +======== + uv run -m scripts.scrap_gpcrmd + +This command will: + 1. Fetch all available datasets from GPCRMD in batches. + 2. Parse their metadata and validate them using the Pydantic models `BaseDataset` + and `BaseFile`. + 3. Save both the validated and unvalidated dataset entries to + "data/gpcrmd/{timestamp}/{validated or unvalidated}_entries.parquet". + 4. Save file metadata similarly for validated and unvalidated files. +""" + +# METADATAS +__authors__ = ("Pierre Poulain", "Essmay Touami") +__contact__ = "pierre.poulain@u-paris.fr" +__copyright__ = "AGPL-3.0 license" +__date__ = "2025" +__version__ = "1.0.0" + + +# LIBRARY IMPORTS +import os +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import Any + +import click +import httpx +import pandas as pd +from bs4 import BeautifulSoup +from loguru import logger +from pydantic import ValidationError +from tqdm import tqdm + +from models.dataset_model import BaseDataset +from models.file_model import BaseFile + +# CONSTANTS +BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/" + + +# FUNCTIONS +def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None: + """Configure a Loguru logger to write logs into a rotating daily log file. 
+ + Parameters + ---------- + loguru_logger : Any + A Loguru logger instance (typically `loguru.logger`). + log_dir : str or Path, optional + Directory where log files will be stored. Default is "logs". + """ + # Ensure log directory exists + log_folder = Path(log_dir) + log_folder.mkdir(parents=True, exist_ok=True) + # Reset any previous configuration + loguru_logger.remove() + # Define log format + fmt = ( + "{time:YYYY-MM-DD HH:mm:ss}" + "| {level:<8} " + "| {message}" + ) + loguru_logger.add( + log_folder / "scrap_gpcrmd_data_{time:YYYY-MM-DD}.log", + format=fmt, + level="DEBUG", + ) + loguru_logger.add( + sys.stdout, + format=fmt, + level="DEBUG", + ) + + +def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: + """ + Fetch all entries from the GPCRMD API. + + Returns + ------- + Tuple[List[Dict[str, Any]], str]: + - A list of entries (JSON objects). + Returns an empty list if the request fails. + - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12'). + """ + logger.debug( + "Fetching entries from GPCRMD API... (usually take \ + less than 1 minutes!)" + ) + # Current timestamp in ISO format + fetch_time: str = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + + try: + # Perform the HTTP GET request with a long timeout to accommodate large data + response = httpx.get(BASE_GPCRMD_URL, timeout=1000) + response.raise_for_status() + + # Parse JSON data + entries_md = response.json() + logger.success( + f"Fetched {len(entries_md)} MD-related entries from GPCRMD successfully! \n" + ) + return entries_md, fetch_time + + except httpx.HTTPError as e: + logger.error(f"HTTP error occurred: {e}") + return [], fetch_time + + +def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | None: + """ + Retrieve a specific metadata field from a webpage. + + Parameters + ---------- + url : str + The URL of the webpage to fetch. + field_name : str + The name of the metadata field to extract (case-sensitive). 
+ timeout : int, optional + Timeout in seconds for the HTTP request (default is 10). + + Returns + ------- + str | None + The value of the metadata field if found, otherwise None. + + """ + # Try to send an HTTP GET request to the given URL of the dataset. + try: + response = httpx.get(url, timeout=timeout) + response.raise_for_status() + except httpx.RequestError as e: + logger.warning(f"Failed to fetch {field_name} from {url}: {e}") + return None + # Parse the HTML content of the page using BeautifulSoup + soup = BeautifulSoup(response.text, "html.parser") + bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name) + if not bold_tag: + return None + # Get all the text from the parent element of the tag + parent_text = bold_tag.parent.get_text(strip=True) + if ":" not in parent_text: + return None + # Get only what is after the "field_name:" + return parent_text.split(":", 1)[1].strip() or None + + +def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None: + """ + Retrieve reference URLs from the References section of a GPCRMD entry page. + + Parameters + ---------- + url : str + The URL of the GPCRMD entry page. + timeout : int, optional + Timeout in seconds for the HTTP request (default is 10). + + Returns + ------- + list[str] | None + List of reference URLs (starting with http:// or https://) if found, + otherwise None. + """ + try: + response = httpx.get(url, timeout=timeout) + response.raise_for_status() + except httpx.RequestError as e: + logger.warning(f"Failed to fetch reference links from {url}: {e}") + return None + # Parse the HTML content + soup = BeautifulSoup(response.text, "html.parser") + # Find the

header with text "References" + header = soup.find("h3", string=lambda t: t and t.strip() == "References") + if not header: + return None + # Get the corresponding content div containing the links + content_div = header.find_next_sibling("div", class_="techinfo_content") + if not content_div: + return None + + links: list[str] = [] + # Collect all hrefs that start with http:// or https:// + for a in content_div.find_all("a", href=True): + href = a["href"].strip() + if href.startswith(("http://", "https://")): + links.append(href) + + return links or None + + +def count_simulation_files(url: str, timeout: int = 10) -> int | None: + """ + Count files in the dataset webpage. + + Especially in 'Simulation output files' and 'Simulation protocol \ + & starting files' sections. + + Returns + ------- + int | None + The number of files related to this dataset. + """ + try: + response = httpx.get(url, timeout=timeout) + response.raise_for_status() + except httpx.RequestError as e: + logger.warning(f"Failed to fetch file counts from {url}: {e}") + return None + + soup = BeautifulSoup(response.text, "html.parser") + + # Helper function to count unique links in a container div + def count_links(container_id: str) -> int: + container = soup.find("div", id=container_id) + if not container: + return 0 + + # Collect all hrefs and remove duplicates while preserving order + links = ([a["href"].strip() for a in container.find_all("a", href=True) + if a["href"].strip()] + ) + return len(dict.fromkeys(links)) + + output_files_count = count_links("allfiles") + protocol_files_count = count_links("paramfiles") + + return output_files_count + protocol_files_count + + +def parse_and_validate_entry_metadatas( + entries_list: list[dict], + fetch_time: str +) -> tuple[list[BaseDataset], list[dict]]: + """ + Parse and validate metadata fields for a list of GPCRMD entries. 
+ + Parameters + ---------- + entries_list : list of dict + List of dictionaries, each representing the metadata of a GPCRMD entry. + fetch_time : str + Timestamp (as a string) indicating when the data was fetched. + + Returns + ------- + Tuple[List[BaseDataset], List[Dict]] + - List of successfully validated `BaseDataset` objects. + - List of parsed entry that failed validation. + """ + logger.info("Starting parsing and validation of GPCRMD entries...") + validated_entries = [] + non_validated_entry_ids = [] + total_entries = len(entries_list) + + for data in tqdm(entries_list): + entry_id = str(data.get("dyn_id")) + + # Extract molecules and number total of atoms if available + total_atoms = data.get("atom_num") + dyncomp = data.get("dyncomp", []) + molecules = [comp.get("resname") for comp in dyncomp if comp.get("resname")] + url = data.get("url") + author_names = [retrieve_metadata(url, "Submitted by")] + description = retrieve_metadata(url, "Description") + stime = retrieve_metadata(url, "Accumulated simulation time") + refs = retrieve_reference_links(url) + nb_files = count_simulation_files(url) + + parsed_entry = { + "dataset_repository": "GPCRMD", + "dataset_project": "GPCRMD", + "dataset_id_in_repository": entry_id, + "dataset_id_in_project": entry_id, + "dataset_url_in_repository": url, + "dataset_url_in_project": url, + "links": refs, + "title": data.get("modelname"), + "date_created": data.get("creation_timestamp"), + "date_last_fetched": fetch_time, + "nb_files": nb_files, + "author_names": author_names, + "description": description, + "simulation_program_name": data.get("mysoftware"), + "simulation_program_version": data.get("software_version"), + "nb_atoms": total_atoms, + "molecule_names": molecules, + "forcefield_model_name": data.get("forcefield"), + "forcefield_model_version": data.get("forcefield_version"), + "timestep": data.get("timestep"), + "delta": data.get("delta"), + "simulation_time": stime + } + try: + # Validate and normalize data 
collected wieh pydantic model + dataset_model = BaseDataset(**parsed_entry) + validated_entries.append(dataset_model) + except ValidationError as e: + logger.error(f"Validation failed for entry {entry_id}") + for err in e.errors(): + logger.error(f" Field: {'.'.join(str(x) for x in err['loc'])}") + logger.error(f" Error: {err['msg']} (type={err['type']})") + non_validated_entry_ids.append(parsed_entry) + + logger.success( + f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\ + entries successfully! \n" + ) + return validated_entries, non_validated_entry_ids + + +def parse_and_validate_files_metadatas( + entries_list: list[dict], + fetch_time: str +) -> tuple[list[BaseFile], list[dict]]: + """ + Parse and validate metadata for GPCRMD files. + + Parameters + ---------- + entries_list : list[dict] + List of file entries, each containing metadata such as 'dyn_id' and 'url'. + fetch_time : str + Timestamp indicating when the data was fetched. + + Returns + ------- + tuple[list[BaseFile], list[dict]] + - List of validated `BaseFile` objects. + - List of file entries that failed validation. 
+ """ + logger.info("Starting parsing and validation of GPCRMD files...") + validated_entries: list[BaseFile] = [] + non_validated_entry_ids: list[dict] = [] + total_files = len(entries_list) + + # Loop over the first two entries for demonstration + for data in tqdm(entries_list): + entry_id = str(data.get("dyn_id")) + url = data.get("url") + + # Fetch the file page + try: + response = httpx.get(url, timeout=10) + response.raise_for_status() + except httpx.RequestError as e: + logger.warning(f"Failed to fetch file page for {entry_id}: {e}") + non_validated_entry_ids.append(data) + continue + + soup = BeautifulSoup(response.text, "html.parser") + sections = ["allfiles", "paramfiles"] + + # Iterate over sections containing files + for sec_id in sections: + container = soup.find("div", id=sec_id) + if not container: + continue + + # Process each file link + for a in container.find_all("a", href=True): + file_path = f"https://www.gpcrmd.org/{a['href'].strip()}" + if not file_path: + continue + + file_name = os.path.basename(file_path) + file_extension = os.path.splitext(file_name)[1].lstrip(".").lower() + + # Try to fetch file size via HEAD request + size: int = None + try: + head_resp = httpx.head(file_path, timeout=10, follow_redirects=True) + size = int(head_resp.headers.get("Content-Length", 0)) + except httpx.RequestError as e: + logger.warning(f"Failed to fetch file size for {file_name}: {e}") + + parsed_entry = { + "dataset_repository": "GPCRMD", + "dataset_id_in_repository": entry_id, + "file_name": file_name, + "file_type": file_extension, + "file_size": size, + "file_url_in_repository": file_path, + "date_last_fetched": fetch_time, + } + + # Validate and normalize entry using Pydantic model + try: + dataset_model = BaseFile(**parsed_entry) + validated_entries.append(dataset_model) + except ValidationError as e: + logger.error(f"Validation failed for file {entry_id}: {e}") + non_validated_entry_ids.append(parsed_entry) + + logger.success( + f"Parsing 
completed: {len(validated_entries)} validated / {total_files} \ + total files successfully!" + ) + return validated_entries, non_validated_entry_ids + + +def save_metadatas_to_parquet( + folder_out_path: Path, + metadatas_validated: list[BaseDataset] | list[BaseFile], + metadatas_unvalidated: list[dict], + tag: str, +) -> None: + """ + Save GPCRMD validated and unvalidated metadata to Parquet files. + + Parameters + ---------- + folder_out_path : Path + Folder path where Parquet files will be saved. + metadatas_validated : List[BaseDataset] + List of validated entries. + metadatas_unvalidated : List[Dict] + List of unvalidated entries as dictionaries. + tag: str + Tag to know if its entries or files metadata to save. + """ + logger.info("Saving GPCRMD entries metadatas to a Parquet file...") + # Ensure output folder exists + Path(folder_out_path).mkdir(parents=True, exist_ok=True) + + # Save validated entries + if tag == "entries": + validated_path = os.path.join(folder_out_path, "validated_entries.parquet") + elif tag == "files": + validated_path = os.path.join(folder_out_path, "validated_files.parquet") + try: + # Convert list of Pydantic models to list of dicts + validated_dicts = [entry.model_dump() for entry in metadatas_validated] + df_validated = pd.DataFrame(validated_dicts) + df_validated.to_parquet(validated_path, index=False) + logger.success( + f"GPCRMD validated metadatas saved to: {validated_path} successfully!" 
+ ) + except (ValueError, TypeError, OSError) as e: + logger.error(f"Failed to save validated metadata to {validated_path}: {e}") + + # Save unvalidated entries + if tag == "entries": + unvalidated_path = os.path.join( + folder_out_path, "unvalidated_entries.parquet" + ) + elif tag == "files": + unvalidated_path = os.path.join( + folder_out_path, "unvalidated_files.parquet" + ) + try: + if len(metadatas_unvalidated) != 0: + df_unvalidated = pd.DataFrame(metadatas_unvalidated) + df_unvalidated.to_parquet(unvalidated_path, index=False) + logger.success( + f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!" + ) + else: + logger.warning("There is no unvalidated entries to save!") + except (ValueError, TypeError, OSError) as e: + logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}") + + +@click.command() +@click.option( + "--out-path", + type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), + default=Path(f"data/gpcrmd/{datetime.now().strftime('%Y%m%d_%H%M%S')}"), + show_default=True, + help="Folder path to save the scraped GPCRMD data (Dataset and File metadatas)" +) +def scrap_gpcrmd_data(out_path: Path) -> None: + """Scrap datasets and files from GPCRMD. + + Parameters + ---------- + out_path : Path + The output folder path for the scraped data. 
+ """ + setup_logger(logger, out_path) + logger.info("Starting GPCRMD data scraping...") + start_time = time.time() + + # Fetch entries metadata + entries, fetch_time = fetch_entries_once() + if entries == []: + logger.warning("No data fetched from GPCRMD.") + return + # Parse and validate entry metadatas with a pydantic model (BaseDataset) + entries_validated, entries_unvalidated = ( + parse_and_validate_entry_metadatas(entries, fetch_time) + ) + # Save parsed metadata to local file + save_metadatas_to_parquet( + out_path, + entries_validated, + entries_unvalidated, + tag="entries" + ) + + # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile) + files_metadata_validated, files_metadata_unvalidated = ( + parse_and_validate_files_metadatas(entries, fetch_time) + ) + save_metadatas_to_parquet( + out_path, + files_metadata_validated, + files_metadata_unvalidated, + tag="files" + ) + + end_time = time.time() + elapsed_time = end_time - start_time + hours = int(elapsed_time // 3600) + minutes = int((elapsed_time % 3600) // 60) + seconds = int(elapsed_time % 60) + + logger.success( + f"Completed GPCRMD data scraping in {hours} h {minutes} min {seconds} sec 🎉" + ) + + +if __name__ == "__main__": + # Scrap GPCRMD data + scrap_gpcrmd_data() From 73a16346f059a52f5e5b487865ec5a6e0f6fead6 Mon Sep 17 00:00:00 2001 From: essmaw Date: Mon, 29 Dec 2025 14:02:59 +0100 Subject: [PATCH 2/9] Refactor GPCRMD scraping script: improve metadata handling, enhance validation logging, and update function signatures for clarity. --- scripts/scrap_gpcrmd.py | 401 +++++++++++++++++++++++++++++----------- 1 file changed, 290 insertions(+), 111 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index 4d8ad29..b161807 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -32,7 +32,7 @@ uv run -m scripts.scrap_gpcrmd This command will: - 1. Fetch all available datasets from GPCRMD in batches. + 1. 
Fetch all available datasets from GPCRMD. 2. Parse their metadata and validate them using the Pydantic models `BaseDataset` and `BaseFile`. 3. Save both the validated and unvalidated dataset entries to @@ -59,12 +59,12 @@ import click import httpx import pandas as pd -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from loguru import logger from pydantic import ValidationError from tqdm import tqdm -from models.dataset_model import BaseDataset +from models.dataset_model import BaseDataset, DatasetProject, DatasetRepository from models.file_model import BaseFile # CONSTANTS @@ -117,8 +117,8 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12'). """ logger.debug( - "Fetching entries from GPCRMD API... (usually take \ - less than 1 minutes!)" + "Fetching entries from GPCRMD API... " + "(usually takes less than 1 minute!)" ) # Current timestamp in ISO format fetch_time: str = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") @@ -140,7 +140,7 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: return [], fetch_time -def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | None: +def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | None: """ Retrieve a specific metadata field from a webpage. 
@@ -163,6 +163,13 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | Non try: response = httpx.get(url, timeout=timeout) response.raise_for_status() + + except httpx.HTTPStatusError as e: + logger.warning( + f"HTTP error {e.response.status_code} for {url}" + ) + return None + except httpx.RequestError as e: logger.warning(f"Failed to fetch {field_name} from {url}: {e}") return None @@ -172,14 +179,18 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | Non if not bold_tag: return None # Get all the text from the parent element of the tag - parent_text = bold_tag.parent.get_text(strip=True) + parent = bold_tag.parent + if not isinstance(parent, Tag): + return None + parent_text = parent.get_text(strip=True) + if ":" not in parent_text: return None # Get only what is after the "field_name:" return parent_text.split(":", 1)[1].strip() or None -def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None: +def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None: """ Retrieve reference URLs from the References section of a GPCRMD entry page. 
@@ -199,6 +210,13 @@ def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None: try: response = httpx.get(url, timeout=timeout) response.raise_for_status() + + except httpx.HTTPStatusError as e: + logger.warning( + f"HTTP error {e.response.status_code} for {url}" + ) + return None + except httpx.RequestError as e: logger.warning(f"Failed to fetch reference links from {url}: {e}") return None @@ -213,17 +231,22 @@ def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None: if not content_div: return None + # Iterate over all elements with an href attribute inside the content div + # Only keep elements that are of type Tag to satisfy type checkers + content_div = header.find_next_sibling("div", class_="techinfo_content") + if not isinstance(content_div, Tag): + return None links: list[str] = [] - # Collect all hrefs that start with http:// or https:// - for a in content_div.find_all("a", href=True): + for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)): href = a["href"].strip() + # Only include links that start with "http://" or "https://" if href.startswith(("http://", "https://")): links.append(href) return links or None -def count_simulation_files(url: str, timeout: int = 10) -> int | None: +def count_simulation_files(url: str, timeout: int = 50) -> int | None: """ Count files in the dataset webpage. 
@@ -238,6 +261,12 @@ def count_simulation_files(url: str, timeout: int = 10) -> int | None: try: response = httpx.get(url, timeout=timeout) response.raise_for_status() + + except httpx.HTTPStatusError as e: + logger.warning( + f"HTTP error {e.response.status_code} for {url}" + ) + return None except httpx.RequestError as e: logger.warning(f"Failed to fetch file counts from {url}: {e}") return None @@ -246,14 +275,20 @@ def count_simulation_files(url: str, timeout: int = 10) -> int | None: # Helper function to count unique links in a container div def count_links(container_id: str) -> int: + # Find the container
by ID container = soup.find("div", id=container_id) - if not container: + # Ensure the container is actually a Tag + if not isinstance(container, Tag): return 0 - # Collect all hrefs and remove duplicates while preserving order - links = ([a["href"].strip() for a in container.find_all("a", href=True) - if a["href"].strip()] - ) + # Collect all hrefs in tags, stripping whitespace + links = [ + str(a.get("href", "")).strip() + for a in container.find_all("a", href=True) + if isinstance(a, Tag) and str(a.get("href", "")).strip() + ] + + # Remove duplicates while preserving order return len(dict.fromkeys(links)) output_files_count = count_links("allfiles") @@ -263,9 +298,9 @@ def count_links(container_id: str) -> int: def parse_and_validate_entry_metadatas( - entries_list: list[dict], + entries_list: list[dict[str, Any]], fetch_time: str -) -> tuple[list[BaseDataset], list[dict]]: +) -> tuple[list[BaseDataset], list[dict[str, Any]]]: """ Parse and validate metadata fields for a list of GPCRMD entries. @@ -282,77 +317,230 @@ def parse_and_validate_entry_metadatas( - List of successfully validated `BaseDataset` objects. - List of parsed entry that failed validation. 
""" - logger.info("Starting parsing and validation of GPCRMD entries...") - validated_entries = [] - non_validated_entry_ids = [] - total_entries = len(entries_list) - - for data in tqdm(entries_list): - entry_id = str(data.get("dyn_id")) + logger.info("Starting parsing and validating GPCRMD entries...") + validated_entries: list[BaseDataset] = [] + non_validated_entries: list[dict[str, Any]] = [] + total_entries: int = len(entries_list) + + for entry in tqdm(entries_list, + desc="Validating GPCRmd entries", + colour="blue", + unit="entry" + ): + entry_id = str(entry.get("dyn_id")) # Extract molecules and number total of atoms if available - total_atoms = data.get("atom_num") - dyncomp = data.get("dyncomp", []) - molecules = [comp.get("resname") for comp in dyncomp if comp.get("resname")] - url = data.get("url") - author_names = [retrieve_metadata(url, "Submitted by")] - description = retrieve_metadata(url, "Description") - stime = retrieve_metadata(url, "Accumulated simulation time") - refs = retrieve_reference_links(url) - nb_files = count_simulation_files(url) + total_atoms: int | None = entry.get("atom_num") + dyncomp: list[dict[str, Any]] = entry.get("dyncomp", []) + molecules: list[str] = ( + [comp.get("resname") for comp in dyncomp if comp.get("resname")] + ) + url: str = entry.get("url") + author_names: list[str | None] = [retrieve_metadata(url, "Submitted by")] + description: str | None = retrieve_metadata(url, "Description") + stime: str | None = retrieve_metadata(url, "Accumulated simulation time") + refs: list[str] | None = retrieve_reference_links(url) + nb_files: int | None = count_simulation_files(url) + softname: str = entry.get("mysoftware") + softvers: str = entry.get("software_version") + ffm: str = entry.get("forcefield") + ffm_vers: str = entry.get("forcefield_version") + delta: float = entry.get("delta") + timestep: float = entry.get("timestep") + title: str = entry.get("modelname") + date: str = entry.get("creation_timestamp") parsed_entry = 
{ - "dataset_repository": "GPCRMD", - "dataset_project": "GPCRMD", + "dataset_repository": DatasetRepository.GPCRMD, + "dataset_project": DatasetProject.GPCRMD, "dataset_id_in_repository": entry_id, "dataset_id_in_project": entry_id, "dataset_url_in_repository": url, "dataset_url_in_project": url, "links": refs, - "title": data.get("modelname"), - "date_created": data.get("creation_timestamp"), + "title": title, + "date_created": date, "date_last_fetched": fetch_time, "nb_files": nb_files, "author_names": author_names, "description": description, - "simulation_program_name": data.get("mysoftware"), - "simulation_program_version": data.get("software_version"), + "simulation_program_name": softname, + "simulation_program_version": softvers, "nb_atoms": total_atoms, "molecule_names": molecules, - "forcefield_model_name": data.get("forcefield"), - "forcefield_model_version": data.get("forcefield_version"), - "timestep": data.get("timestep"), - "delta": data.get("delta"), + "forcefield_model_name": ffm, + "forcefield_model_version": ffm_vers, + "timestep": timestep, + "delta": delta, "simulation_time": stime } try: - # Validate and normalize data collected wieh pydantic model - dataset_model = BaseDataset(**parsed_entry) + # Validate and normalize data collected with pydantic model + dataset_model = BaseDataset(**parsed_entry) # ty:ignore[invalid-argument-type] # noqa: E501 validated_entries.append(dataset_model) except ValidationError as e: - logger.error(f"Validation failed for entry {entry_id}") + reasons: list[str] = [] + for err in e.errors(): - logger.error(f" Field: {'.'.join(str(x) for x in err['loc'])}") - logger.error(f" Error: {err['msg']} (type={err['type']})") - non_validated_entry_ids.append(parsed_entry) + field = ".".join(str(x) for x in err["loc"]) + reason = err["msg"] + value = err.get("input") + + logger.error( + "Validation error on '{}': value={!r} (type={}) -> {}", + field, + value, + type(value).__name__, + reason, + ) + reasons.append(f"{field}: 
{reason}") + + parsed_entry["non_validation_reason"] = "; ".join(reasons) + non_validated_entries.append(parsed_entry) logger.success( f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\ entries successfully! \n" ) - return validated_entries, non_validated_entry_ids + return validated_entries, non_validated_entries -def parse_and_validate_files_metadatas( - entries_list: list[dict], - fetch_time: str -) -> tuple[list[BaseFile], list[dict]]: +def make_base_parsed_entry( + entry_id: str, + url: str, + fetch_time: str, +) -> dict[str, Any]: + """Create a base parsed entry with empty file metadata. + + Parameters + ---------- + entry_id : str + The unique identifier of the GPCRMD entry. + url : str + The URL of the GPCRMD entry. + fetch_time : str + The timestamp indicating when the data was fetched. + + Returns + ------- + dict[str, Any] + A dictionary representing the base parsed entry with empty file metadata. """ - Parse and validate metadata for GPCRMD files. + return { + "dataset_repository": DatasetRepository.GPCRMD, + "dataset_id_in_repository": entry_id, + "file_name": None, + "file_type": None, + "file_size": None, + "file_url_in_repository": url, + "date_last_fetched": fetch_time, + } + + +def fetch_entry_page(url: str) -> str | None: + """Fetch an entry page and return its HTML content. Parameters ---------- - entries_list : list[dict] + url : str + The URL of the entry page to fetch. + + Returns + ------- + str | None + The HTML content of the page if the request is successful, otherwise None. + """ + try: + response = httpx.get(url, timeout=50) + response.raise_for_status() + except httpx.HTTPStatusError as exc: + logger.warning("HTTP error %s for %s", exc.response.status_code, url) + return None + except httpx.RequestError as exc: + logger.warning("Request error for %s: %s", url, exc) + return None + + return response.text + + +def fetch_file_size(file_path: str) -> int | None: + """Fetch file size using a HEAD request. 
+ + Parameters + ---------- + file_path : str + The URL of the file to fetch the size for. + + Returns + ------- + int | None + The size of the file in bytes if available, otherwise None. + """ + try: + response = httpx.head(file_path, timeout=50, follow_redirects=True) + return int(response.headers.get("Content-Length", 0)) + except httpx.HTTPStatusError as exc: + logger.warning( + "HTTP error %s for %s", + exc.response.status_code, + file_path, + ) + except httpx.RequestError as exc: + logger.warning("Failed to fetch file size for %s: %s", file_path, exc) + + return None + + +def validate_parsed_entry( + parsed_entry: dict[str, Any], +) -> BaseFile | None: + """Validate a parsed entry using the BaseFile model. + + Parameters + ---------- + parsed_entry : dict[str, Any] + The parsed entry to validate. + + Returns + ------- + BaseFile | None + The validated BaseFile object if validation is successful, + otherwise None. + + """ + try: + return BaseFile(**parsed_entry) + except ValidationError as exc: + reasons: list[str] = [] + + for err in exc.errors(): + field = ".".join(str(x) for x in err["loc"]) + reason = err["msg"] + value = err.get("input") + + logger.error( + "Validation error on '{}': value={!r} (type={}) -> {}", + field, + value, + type(value).__name__, + reason, + ) + + reasons.append(f"{field}: {reason}") + + parsed_entry["non_validation_reason"] = "; ".join(reasons) + return None + + +def fetch_and_validate_file_metadatas( + entries: list[dict], + fetch_time: str, +) -> tuple[list[BaseFile], list[dict]]: + """Fetch and validate metadata for GPCRMD files. + + Parameters + ---------- + entries : list[dict] List of file entries, each containing metadata such as 'dyn_id' and 'url'. fetch_time : str Timestamp indicating when the data was fetched. @@ -363,74 +551,64 @@ def parse_and_validate_files_metadatas( - List of validated `BaseFile` objects. - List of file entries that failed validation. 
""" - logger.info("Starting parsing and validation of GPCRMD files...") - validated_entries: list[BaseFile] = [] - non_validated_entry_ids: list[dict] = [] - total_files = len(entries_list) + logger.info("Starting fetching and validating GPCRMD files...") - # Loop over the first two entries for demonstration - for data in tqdm(entries_list): - entry_id = str(data.get("dyn_id")) - url = data.get("url") - - # Fetch the file page - try: - response = httpx.get(url, timeout=10) - response.raise_for_status() - except httpx.RequestError as e: - logger.warning(f"Failed to fetch file page for {entry_id}: {e}") - non_validated_entry_ids.append(data) + validated_entries: list[BaseFile] = [] + non_validated_entries: list[dict] = [] + + for entry in tqdm( + entries, + desc="Validating GPCRmd files", + colour="blue", + unit="file", + ): + entry_id = str(entry.get("dyn_id")) + url = entry.get("url") + + base_entry = make_base_parsed_entry(entry_id, url, fetch_time) + + html = fetch_entry_page(url) + if html is None: + base_entry["non_validation_reason"] = "entry_page_fetch_failed" + non_validated_entries.append(base_entry) continue - soup = BeautifulSoup(response.text, "html.parser") - sections = ["allfiles", "paramfiles"] + soup = BeautifulSoup(html, "html.parser") - # Iterate over sections containing files - for sec_id in sections: + for sec_id in ("allfiles", "paramfiles"): container = soup.find("div", id=sec_id) - if not container: + # Ensure container is a Tag + if not isinstance(container, Tag): continue - # Process each file link - for a in container.find_all("a", href=True): - file_path = f"https://www.gpcrmd.org/{a['href'].strip()}" - if not file_path: + for link in container.find_all("a", href=True): + # Ensure link is a Tag to safely access ['href'] + if not isinstance(link, Tag): continue + # Use .get() to safely retrieve the href, then convert to str + href_value = str(link.get("href", "")).strip() + if not href_value: + continue + file_path = 
f"https://www.gpcrmd.org/{href_value}" file_name = os.path.basename(file_path) - file_extension = os.path.splitext(file_name)[1].lstrip(".").lower() - - # Try to fetch file size via HEAD request - size: int = None - try: - head_resp = httpx.head(file_path, timeout=10, follow_redirects=True) - size = int(head_resp.headers.get("Content-Length", 0)) - except httpx.RequestError as e: - logger.warning(f"Failed to fetch file size for {file_name}: {e}") + file_type = os.path.splitext(file_name)[1].lstrip(".").lower() parsed_entry = { - "dataset_repository": "GPCRMD", - "dataset_id_in_repository": entry_id, + **base_entry, "file_name": file_name, - "file_type": file_extension, - "file_size": size, + "file_type": file_type, + "file_size": fetch_file_size(file_path), "file_url_in_repository": file_path, - "date_last_fetched": fetch_time, } - # Validate and normalize entry using Pydantic model - try: - dataset_model = BaseFile(**parsed_entry) - validated_entries.append(dataset_model) - except ValidationError as e: - logger.error(f"Validation failed for file {entry_id}: {e}") - non_validated_entry_ids.append(parsed_entry) + validated = validate_parsed_entry(parsed_entry) + if validated is None: + non_validated_entries.append(parsed_entry) + else: + validated_entries.append(validated) - logger.success( - f"Parsing completed: {len(validated_entries)} validated / {total_files} \ - total files successfully!" - ) - return validated_entries, non_validated_entry_ids + return validated_entries, non_validated_entries def save_metadatas_to_parquet( @@ -447,9 +625,9 @@ def save_metadatas_to_parquet( folder_out_path : Path Folder path where Parquet files will be saved. metadatas_validated : List[BaseDataset] - List of validated entries. + List of validated metadatas. metadatas_unvalidated : List[Dict] - List of unvalidated entries as dictionaries. + List of unvalidated metadatas as dictionaries. tag: str Tag to know if its entries or files metadata to save. 
""" @@ -534,7 +712,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None: # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile) files_metadata_validated, files_metadata_unvalidated = ( - parse_and_validate_files_metadatas(entries, fetch_time) + fetch_and_validate_file_metadatas(entries, fetch_time) ) save_metadatas_to_parquet( out_path, @@ -543,6 +721,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None: tag="files" ) + # Compute the elapsed time for scrapping end_time = time.time() elapsed_time = end_time - start_time hours = int(elapsed_time // 3600) From 9bcf8c9cc787b3abe854336e5bda8448caa10580 Mon Sep 17 00:00:00 2001 From: essmaw Date: Mon, 5 Jan 2026 15:18:11 +0100 Subject: [PATCH 3/9] Refactor code to reduce length, eliminate duplicate requests, and rename output files and models. --- scripts/scrap_gpcrmd.py | 503 +++++++++++++++++++--------------------- 1 file changed, 245 insertions(+), 258 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index b161807..41a5e2b 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -7,14 +7,14 @@ Additionally, it retrieves file metadata for each dataset, including file paths in GPCRMD, file size, type/extension, etc. -The scraped data is validated against Pydantic models (`BaseDataset` and `BaseFile`) +The scraped data is validated against Pydantic models (`DatasetModel` and `File Model`) and saved locally in Parquet format: -- "data/gpcrmd/{timestamp}/validated_entries.parquet" -- "data/gpcrmd/{timestamp}/validated_files.parquet" +- "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" +- "data/gpcrmd/{timestamp}/gpcrmd_files.parquet" Entries that fail validation are saved as: -- "data/gpcrmd/{timestamp}/unvalidated_entries.parquet" -- "data/gpcrmd/{timestamp}/unvalidated_files.parquet" +- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet" +- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_files.parquet" Usage: @@ -33,10 +33,11 @@ This command will: 1. 
Fetch all available datasets from GPCRMD. - 2. Parse their metadata and validate them using the Pydantic models `BaseDataset` - and `BaseFile`. + 2. Parse their metadata and validate them using the Pydantic models `DatasetModel` + and `File Model`. 3. Save both the validated and unvalidated dataset entries to - "data/gpcrmd/{timestamp}/{validated or unvalidated}_entries.parquet". + "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" and + "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet" 4. Save file metadata similarly for validated and unvalidated files. """ @@ -64,8 +65,8 @@ from pydantic import ValidationError from tqdm import tqdm -from models.dataset_model import BaseDataset, DatasetProject, DatasetRepository -from models.file_model import BaseFile +from models.dataset_model import DatasetModel, DatasetProject, DatasetRepository +from models.file_model import FileModel # CONSTANTS BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/" @@ -140,14 +141,42 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: return [], fetch_time -def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | None: +def fetch_entry_page(url: str) -> str | None: + """Fetch an entry page and return its HTML content. + + Parameters + ---------- + url : str + The URL of the entry page to fetch. + + Returns + ------- + str | None + The HTML content of the page if the request is successful, otherwise None. 
+ """ + try: + response = httpx.get(url, timeout=50) + response.raise_for_status() + # Sleep briefly to avoid overwhelming the remote server + time.sleep(0.1) + except httpx.HTTPStatusError as exc: + logger.warning(f"HTTP error {exc.response.status_code} for {url}") + return None + except httpx.RequestError as exc: + logger.warning(f"Request error for {url}: {exc}") + return None + + return response.text + + +def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | None: """ Retrieve a specific metadata field from a webpage. Parameters ---------- - url : str - The URL of the webpage to fetch. + html : str + The HTML content of the page. field_name : str The name of the metadata field to extract (case-sensitive). timeout : int, optional @@ -159,45 +188,33 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | Non The value of the metadata field if found, otherwise None. """ - # Try to send an HTTP GET request to the given URL of the dataset. - try: - response = httpx.get(url, timeout=timeout) - response.raise_for_status() - - except httpx.HTTPStatusError as e: - logger.warning( - f"HTTP error {e.response.status_code} for {url}" - ) - return None - - except httpx.RequestError as e: - logger.warning(f"Failed to fetch {field_name} from {url}: {e}") - return None # Parse the HTML content of the page using BeautifulSoup - soup = BeautifulSoup(response.text, "html.parser") - bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name) - if not bold_tag: - return None - # Get all the text from the parent element of the tag - parent = bold_tag.parent - if not isinstance(parent, Tag): - return None - parent_text = parent.get_text(strip=True) - - if ":" not in parent_text: - return None - # Get only what is after the "field_name:" - return parent_text.split(":", 1)[1].strip() or None + if html: + soup = BeautifulSoup(html, "html.parser") + bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name) + if 
not bold_tag: + return None + # Get all the text from the parent element of the tag + parent = bold_tag.parent + if not isinstance(parent, Tag): + return None + parent_text = parent.get_text(strip=True) + if ":" not in parent_text: + return None + # Get only what is after the "field_name:" + metadata = parent_text.split(":", 1)[1].strip() + return metadata + return None -def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None: +def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: """ Retrieve reference URLs from the References section of a GPCRMD entry page. Parameters ---------- - url : str - The URL of the GPCRMD entry page. + html : str + The HTML content of the page. timeout : int, optional Timeout in seconds for the HTTP request (default is 10). @@ -207,122 +224,149 @@ def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None: List of reference URLs (starting with http:// or https://) if found, otherwise None. """ - try: - response = httpx.get(url, timeout=timeout) - response.raise_for_status() - - except httpx.HTTPStatusError as e: - logger.warning( - f"HTTP error {e.response.status_code} for {url}" - ) - return None - - except httpx.RequestError as e: - logger.warning(f"Failed to fetch reference links from {url}: {e}") - return None - # Parse the HTML content - soup = BeautifulSoup(response.text, "html.parser") - # Find the

header with text "References" - header = soup.find("h3", string=lambda t: t and t.strip() == "References") - if not header: - return None - # Get the corresponding content div containing the links - content_div = header.find_next_sibling("div", class_="techinfo_content") - if not content_div: - return None - - # Iterate over all elements with an href attribute inside the content div - # Only keep elements that are of type Tag to satisfy type checkers - content_div = header.find_next_sibling("div", class_="techinfo_content") - if not isinstance(content_div, Tag): - return None - links: list[str] = [] - for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)): - href = a["href"].strip() - # Only include links that start with "http://" or "https://" - if href.startswith(("http://", "https://")): - links.append(href) + if html: + # Parse the HTML content + soup = BeautifulSoup(html, "html.parser") + # Find the

header with text "References" + header = soup.find("h3", string=lambda t: t and t.strip() == "References") + if not header: + return None + # Get the corresponding content div containing the links + content_div = header.find_next_sibling("div", class_="techinfo_content") + if not content_div: + return None + + # Iterate over all elements with an href attribute inside the content div + # Only keep elements that are of type Tag to satisfy type checkers + content_div = header.find_next_sibling("div", class_="techinfo_content") + if not isinstance(content_div, Tag): + return None + links: list[str] = [] + for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)): + href = a["href"].strip() + # Only include links that start with "http://" or "https://" + if href.startswith(("http://", "https://")): + links.append(href) + return links - return links or None + return None -def count_simulation_files(url: str, timeout: int = 50) -> int | None: +def count_simulation_files(html: str, timeout: int = 50) -> int | None: """ Count files in the dataset webpage. Especially in 'Simulation output files' and 'Simulation protocol \ & starting files' sections. + Parameters + ---------- + html : str + The HTML content of the page. + Returns ------- int | None The number of files related to this dataset. """ - try: - response = httpx.get(url, timeout=timeout) - response.raise_for_status() + if html: + # Parse the HTML content + soup = BeautifulSoup(html, "html.parser") - except httpx.HTTPStatusError as e: - logger.warning( - f"HTTP error {e.response.status_code} for {url}" - ) - return None - except httpx.RequestError as e: - logger.warning(f"Failed to fetch file counts from {url}: {e}") - return None + # Helper function to count unique links in a container div + def count_links(container_id: str) -> int: + # Find the container
by ID + container = soup.find("div", id=container_id) + # Ensure the container is actually a Tag + if not isinstance(container, Tag): + return 0 + + # Collect all hrefs in tags, stripping whitespace + links = [ + str(a.get("href", "")).strip() + for a in container.find_all("a", href=True) + if isinstance(a, Tag) and str(a.get("href", "")).strip() + ] + + # Remove duplicates while preserving order + return len(dict.fromkeys(links)) + + output_files_count = count_links("allfiles") + protocol_files_count = count_links("paramfiles") + return output_files_count + protocol_files_count + return None + + +def validate_parsed_entry( + parsed_entry: dict[str, Any], + out_model: type[FileModel | DatasetModel] +) -> tuple[FileModel | DatasetModel | None, dict[str, Any] | None]: + """Validate a parsed entry using the pydantic model. - soup = BeautifulSoup(response.text, "html.parser") + Parameters + ---------- + parsed_entry : dict[str, Any] + The parsed entry to validate. + out_model: FileModel | DatasetModel + The Pydantic model used for the validation. - # Helper function to count unique links in a container div - def count_links(container_id: str) -> int: - # Find the container
by ID - container = soup.find("div", id=container_id) - # Ensure the container is actually a Tag - if not isinstance(container, Tag): - return 0 + Returns + ------- + tuple[FileModel | DatasetModel | None, dict[str, Any] | None] + A tuple containing the validated model instance if validation succeeds, + otherwise None, and the enriched parsed entry containing validation + failure reasons if validation fails. + """ + try: + return out_model(**parsed_entry), None + except ValidationError as exc: + reasons: list[str] = [] - # Collect all hrefs in tags, stripping whitespace - links = [ - str(a.get("href", "")).strip() - for a in container.find_all("a", href=True) - if isinstance(a, Tag) and str(a.get("href", "")).strip() - ] + for err in exc.errors(): + field = ".".join(str(x) for x in err["loc"]) + reason = err["msg"] + value = err.get("input") - # Remove duplicates while preserving order - return len(dict.fromkeys(links)) + logger.error( + "Validation error on '{}': value={!r} (type={}) -> {}", + field, + value, + type(value).__name__, + reason, + ) - output_files_count = count_links("allfiles") - protocol_files_count = count_links("paramfiles") + reasons.append(f"{field}: {reason}") - return output_files_count + protocol_files_count + parsed_entry["non_validation_reason"] = "; ".join(reasons) + return None, parsed_entry def parse_and_validate_entry_metadatas( - entries_list: list[dict[str, Any]], + entries: list[dict[str, Any]], fetch_time: str -) -> tuple[list[BaseDataset], list[dict[str, Any]]]: +) -> tuple[list[DatasetModel], list[dict[str, Any]]]: """ Parse and validate metadata fields for a list of GPCRMD entries. Parameters ---------- - entries_list : list of dict + entries : list of dict List of dictionaries, each representing the metadata of a GPCRMD entry. fetch_time : str Timestamp (as a string) indicating when the data was fetched. Returns ------- - Tuple[List[BaseDataset], List[Dict]] - - List of successfully validated `BaseDataset` objects. 
+ tuple[list[DatasetModel], list[dict[str, Any]]] + - List of successfully validated `DatasetModel` objects. - List of parsed entry that failed validation. """ logger.info("Starting parsing and validating GPCRMD entries...") - validated_entries: list[BaseDataset] = [] + validated_entries: list[DatasetModel] = [] non_validated_entries: list[dict[str, Any]] = [] - total_entries: int = len(entries_list) + total_entries: int = len(entries) - for entry in tqdm(entries_list, + for entry in tqdm(entries, desc="Validating GPCRmd entries", colour="blue", unit="entry" @@ -336,19 +380,20 @@ def parse_and_validate_entry_metadatas( [comp.get("resname") for comp in dyncomp if comp.get("resname")] ) url: str = entry.get("url") - author_names: list[str | None] = [retrieve_metadata(url, "Submitted by")] - description: str | None = retrieve_metadata(url, "Description") - stime: str | None = retrieve_metadata(url, "Accumulated simulation time") - refs: list[str] | None = retrieve_reference_links(url) - nb_files: int | None = count_simulation_files(url) - softname: str = entry.get("mysoftware") - softvers: str = entry.get("software_version") - ffm: str = entry.get("forcefield") - ffm_vers: str = entry.get("forcefield_version") - delta: float = entry.get("delta") - timestep: float = entry.get("timestep") - title: str = entry.get("modelname") - date: str = entry.get("creation_timestamp") + # Fetch entry page with url + html = fetch_entry_page(url) + if html: + author_names: str | None = retrieve_metadata(html, "Submitted by") + description: str | None = retrieve_metadata(html, "Description") + stime: str | None = retrieve_metadata(html, "Accumulated simulation time") + refs: list[str] | None = retrieve_reference_links(html) + nb_files: int | None = count_simulation_files(html) + else: + author_names = None + description = None + stime = None + refs = None + nb_files = None parsed_entry = { "dataset_repository": DatasetRepository.GPCRMD, @@ -358,49 +403,40 @@ def 
parse_and_validate_entry_metadatas( "dataset_url_in_repository": url, "dataset_url_in_project": url, "links": refs, - "title": title, - "date_created": date, + "title": entry.get("modelname"), + "date_created": entry.get("creation_timestamp"), "date_last_fetched": fetch_time, "nb_files": nb_files, - "author_names": author_names, + "author_names": author_names if author_names is None else [author_names], "description": description, - "simulation_program_name": softname, - "simulation_program_version": softvers, + "simulation_program_name": entry.get("mysoftware"), + "simulation_program_version": entry.get("software_version"), "nb_atoms": total_atoms, "molecule_names": molecules, - "forcefield_model_name": ffm, - "forcefield_model_version": ffm_vers, - "timestep": timestep, - "delta": delta, + "forcefield_model_name": entry.get("forcefield"), + "forcefield_model_version": entry.get("forcefield_version"), + "timestep": entry.get("timestep"), + "delta": entry.get("delta"), "simulation_time": stime } - try: - # Validate and normalize data collected with pydantic model - dataset_model = BaseDataset(**parsed_entry) # ty:ignore[invalid-argument-type] # noqa: E501 - validated_entries.append(dataset_model) - except ValidationError as e: - reasons: list[str] = [] - - for err in e.errors(): - field = ".".join(str(x) for x in err["loc"]) - reason = err["msg"] - value = err.get("input") - - logger.error( - "Validation error on '{}': value={!r} (type={}) -> {}", - field, - value, - type(value).__name__, - reason, - ) - reasons.append(f"{field}: {reason}") - - parsed_entry["non_validation_reason"] = "; ".join(reasons) - non_validated_entries.append(parsed_entry) + # Validate and normalize data collected with pydantic model + (dataset_model_entry, + non_validated_parsed_entry, + ) = validate_parsed_entry(parsed_entry, DatasetModel) + if isinstance(dataset_model_entry, DatasetModel): + validated_entries.append(dataset_model_entry) + if non_validated_parsed_entry: + 
non_validated_entries.append(non_validated_parsed_entry) + + percentage = ( + (len(validated_entries) / total_entries) * 100 + if total_entries > 0 + else 0.0 + ) logger.success( - f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\ - entries successfully! \n" + f"Parsing completed: {percentage:.2f}% validated " + f"({len(validated_entries)}/{total_entries}) datasets successfully! \n" ) return validated_entries, non_validated_entries @@ -431,38 +467,12 @@ def make_base_parsed_entry( "dataset_id_in_repository": entry_id, "file_name": None, "file_type": None, - "file_size": None, + "file_size_in_bytes": None, "file_url_in_repository": url, "date_last_fetched": fetch_time, } -def fetch_entry_page(url: str) -> str | None: - """Fetch an entry page and return its HTML content. - - Parameters - ---------- - url : str - The URL of the entry page to fetch. - - Returns - ------- - str | None - The HTML content of the page if the request is successful, otherwise None. - """ - try: - response = httpx.get(url, timeout=50) - response.raise_for_status() - except httpx.HTTPStatusError as exc: - logger.warning("HTTP error %s for %s", exc.response.status_code, url) - return None - except httpx.RequestError as exc: - logger.warning("Request error for %s: %s", url, exc) - return None - - return response.text - - def fetch_file_size(file_path: str) -> int | None: """Fetch file size using a HEAD request. @@ -478,6 +488,8 @@ def fetch_file_size(file_path: str) -> int | None: """ try: response = httpx.head(file_path, timeout=50, follow_redirects=True) + # Sleep briefly to avoid overwhelming the remote server + time.sleep(0.1) return int(response.headers.get("Content-Length", 0)) except httpx.HTTPStatusError as exc: logger.warning( @@ -491,51 +503,10 @@ def fetch_file_size(file_path: str) -> int | None: return None -def validate_parsed_entry( - parsed_entry: dict[str, Any], -) -> BaseFile | None: - """Validate a parsed entry using the BaseFile model. 
- - Parameters - ---------- - parsed_entry : dict[str, Any] - The parsed entry to validate. - - Returns - ------- - BaseFile | None - The validated BaseFile object if validation is successful, - otherwise None. - - """ - try: - return BaseFile(**parsed_entry) - except ValidationError as exc: - reasons: list[str] = [] - - for err in exc.errors(): - field = ".".join(str(x) for x in err["loc"]) - reason = err["msg"] - value = err.get("input") - - logger.error( - "Validation error on '{}': value={!r} (type={}) -> {}", - field, - value, - type(value).__name__, - reason, - ) - - reasons.append(f"{field}: {reason}") - - parsed_entry["non_validation_reason"] = "; ".join(reasons) - return None - - def fetch_and_validate_file_metadatas( entries: list[dict], fetch_time: str, -) -> tuple[list[BaseFile], list[dict]]: +) -> tuple[list[FileModel], list[dict[str, Any]]]: """Fetch and validate metadata for GPCRMD files. Parameters @@ -547,14 +518,15 @@ def fetch_and_validate_file_metadatas( Returns ------- - tuple[list[BaseFile], list[dict]] - - List of validated `BaseFile` objects. - - List of file entries that failed validation. + tuple[list[DatasetModel], list[dict[str, Any]]] + - List of successfully validated `FileModel` objects. + - List of parsed entry that failed validation. 
""" logger.info("Starting fetching and validating GPCRMD files...") - validated_entries: list[BaseFile] = [] - non_validated_entries: list[dict] = [] + validated_files: list[FileModel] = [] + non_validated_files: list[dict] = [] + total_files = 0 for entry in tqdm( entries, @@ -570,7 +542,7 @@ def fetch_and_validate_file_metadatas( html = fetch_entry_page(url) if html is None: base_entry["non_validation_reason"] = "entry_page_fetch_failed" - non_validated_entries.append(base_entry) + non_validated_files.append(base_entry) continue soup = BeautifulSoup(html, "html.parser") @@ -581,7 +553,10 @@ def fetch_and_validate_file_metadatas( if not isinstance(container, Tag): continue - for link in container.find_all("a", href=True): + links = container.find_all("a", href=True) + total_files += len(links) + + for link in links: # Ensure link is a Tag to safely access ['href'] if not isinstance(link, Tag): continue @@ -598,22 +573,34 @@ def fetch_and_validate_file_metadatas( **base_entry, "file_name": file_name, "file_type": file_type, - "file_size": fetch_file_size(file_path), + "file_size_in_bytes": fetch_file_size(file_path), "file_url_in_repository": file_path, } - validated = validate_parsed_entry(parsed_entry) - if validated is None: - non_validated_entries.append(parsed_entry) - else: - validated_entries.append(validated) - - return validated_entries, non_validated_entries + # Validate and normalize data collected with pydantic model + (file_model_entry, + non_validated_parsed_entry, + ) = validate_parsed_entry(parsed_entry, FileModel) + if isinstance(file_model_entry, FileModel): + validated_files.append(file_model_entry) + if non_validated_parsed_entry: + non_validated_files.append(non_validated_parsed_entry) + + percentage = ( + (len(validated_files) / total_files) * 100 + if total_files > 0 + else 0.0 + ) + logger.success( + f"Parsing completed: {percentage:.2f}% validated " + f"({len(validated_files)}/{total_files}) files successfully! 
\n" + ) + return validated_files, non_validated_files def save_metadatas_to_parquet( folder_out_path: Path, - metadatas_validated: list[BaseDataset] | list[BaseFile], + metadatas_validated: list[DatasetModel] | list[FileModel], metadatas_unvalidated: list[dict], tag: str, ) -> None: @@ -624,7 +611,7 @@ def save_metadatas_to_parquet( ---------- folder_out_path : Path Folder path where Parquet files will be saved. - metadatas_validated : List[BaseDataset] + metadatas_validated : List[DatasetModel] List of validated metadatas. metadatas_unvalidated : List[Dict] List of unvalidated metadatas as dictionaries. @@ -637,9 +624,9 @@ def save_metadatas_to_parquet( # Save validated entries if tag == "entries": - validated_path = os.path.join(folder_out_path, "validated_entries.parquet") + validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet") elif tag == "files": - validated_path = os.path.join(folder_out_path, "validated_files.parquet") + validated_path = os.path.join(folder_out_path, "gpcrmd_files.parquet") try: # Convert list of Pydantic models to list of dicts validated_dicts = [entry.model_dump() for entry in metadatas_validated] @@ -654,11 +641,11 @@ def save_metadatas_to_parquet( # Save unvalidated entries if tag == "entries": unvalidated_path = os.path.join( - folder_out_path, "unvalidated_entries.parquet" + folder_out_path, "not_validated_gpcrmd_datasets.parquet" ) elif tag == "files": unvalidated_path = os.path.join( - folder_out_path, "unvalidated_files.parquet" + folder_out_path, "not_validated_gpcrmd_files.parquet" ) try: if len(metadatas_unvalidated) != 0: @@ -698,7 +685,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None: if entries == []: logger.warning("No data fetched from GPCRMD.") return - # Parse and validate entry metadatas with a pydantic model (BaseDataset) + # Parse and validate entry metadatas with a pydantic model (DatasetModel) entries_validated, entries_unvalidated = ( parse_and_validate_entry_metadatas(entries, fetch_time) ) @@ 
-710,7 +697,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None: tag="entries" ) - # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile) + # Fetch, parse and validate the file metadatas with a pydantic model (File Model) files_metadata_validated, files_metadata_unvalidated = ( fetch_and_validate_file_metadatas(entries, fetch_time) ) From a2a8e87b3a3e4d569bdabeb564fd5f5c3bde1def Mon Sep 17 00:00:00 2001 From: essmaw Date: Mon, 5 Jan 2026 17:50:47 +0100 Subject: [PATCH 4/9] Correct one log for the validation of files. --- scripts/scrap_gpcrmd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index 41a5e2b..135c7e2 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -593,7 +593,8 @@ def fetch_and_validate_file_metadatas( ) logger.success( f"Parsing completed: {percentage:.2f}% validated " - f"({len(validated_files)}/{total_files}) files successfully! \n" + f"({len(validated_files) - len(non_validated_files)}/" + f"{total_files}) files successfully! \n" ) return validated_files, non_validated_files From 7250ffeba0fc349027443d44da346286360decc8 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 8 Jan 2026 19:58:32 +0100 Subject: [PATCH 5/9] Refactor GPCRMD scraping script: update metadata handling, improve validation logging, and rename functions for clarity. --- scripts/scrap_gpcrmd.py | 337 ++++++++++++++++++++-------------------- 1 file changed, 167 insertions(+), 170 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index 135c7e2..4d68031 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -7,14 +7,14 @@ Additionally, it retrieves file metadata for each dataset, including file paths in GPCRMD, file size, type/extension, etc. 
-The scraped data is validated against Pydantic models (`DatasetModel` and `File Model`) -and saved locally in Parquet format: -- "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" -- "data/gpcrmd/{timestamp}/gpcrmd_files.parquet" +The scraped data is validated against Pydantic models (`DatasetMetadata` +and `File Model`) and saved locally in Parquet format: +- "data/gpcrmd/gpcrmd_datasets.parquet" +- "data/gpcrmd/gpcrmd_files.parquet" -Entries that fail validation are saved as: -- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet" -- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_files.parquet" +Datasets that fail validation are saved as: +- "data/gpcrmd/not_validated_gpcrmd_datasets.parquet" +- "data/gpcrmd/not_validated_gpcrmd_files.parquet" Usage: @@ -25,7 +25,7 @@ ========== --out-path : (optional) Folder path to save the scraped GPCRMD data (dataset and file metadata). - Default is "data/gpcrmd/{timestamp}". + Default is "data/gpcrmd". Example: ======== @@ -33,11 +33,11 @@ This command will: 1. Fetch all available datasets from GPCRMD. - 2. Parse their metadata and validate them using the Pydantic models `DatasetModel` - and `File Model`. - 3. Save both the validated and unvalidated dataset entries to - "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" and - "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet" + 2. Parse their metadata and validate them using the Pydantic models + `DatasetMetadata` and `File Model`. + 3. Save both the validated and unvalidated dataset datasets to + "data/gpcrmd/gpcrmd_datasets.parquet" and + "data/gpcrmd/not_validated_gpcrmd_datasets.parquet" 4. Save file metadata similarly for validated and unvalidated files. 
""" @@ -53,7 +53,7 @@ import os import sys import time -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path from typing import Any @@ -63,10 +63,9 @@ from bs4 import BeautifulSoup, Tag from loguru import logger from pydantic import ValidationError -from tqdm import tqdm -from models.dataset_model import DatasetModel, DatasetProject, DatasetRepository -from models.file_model import FileModel +from models.dataset import DatasetMetadata, DatasetProject, DatasetRepository +from models.file import FileMetadata # CONSTANTS BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/" @@ -98,6 +97,7 @@ def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None: log_folder / "scrap_gpcrmd_data_{time:YYYY-MM-DD}.log", format=fmt, level="DEBUG", + mode="w", ) loguru_logger.add( sys.stdout, @@ -106,19 +106,19 @@ def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None: ) -def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: +def fetch_datasets_once() -> tuple[list[dict[str, Any]], str]: """ - Fetch all entries from the GPCRMD API. + Fetch all datasets from the GPCRMD API. Returns ------- Tuple[List[Dict[str, Any]], str]: - - A list of entries (JSON objects). + - A list of datasets (JSON objects). Returns an empty list if the request fails. - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12'). """ logger.debug( - "Fetching entries from GPCRMD API... " + "Fetching datasets from GPCRMD API... " "(usually takes less than 1 minute!)" ) # Current timestamp in ISO format @@ -130,24 +130,24 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]: response.raise_for_status() # Parse JSON data - entries_md = response.json() + datasets = response.json() logger.success( - f"Fetched {len(entries_md)} MD-related entries from GPCRMD successfully! \n" + f"Fetched {len(datasets)} MD-related datasets from GPCRMD successfully! 
\n" ) - return entries_md, fetch_time + return datasets, fetch_time except httpx.HTTPError as e: logger.error(f"HTTP error occurred: {e}") return [], fetch_time -def fetch_entry_page(url: str) -> str | None: - """Fetch an entry page and return its HTML content. +def fetch_dataset_page(url: str) -> str | None: + """Fetch an dataset page and return its HTML content. Parameters ---------- url : str - The URL of the entry page to fetch. + The URL of the dataset page to fetch. Returns ------- @@ -209,7 +209,7 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: """ - Retrieve reference URLs from the References section of a GPCRMD entry page. + Retrieve reference URLs from the References section of a GPCRMD dataset page. Parameters ---------- @@ -242,7 +242,8 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: if not isinstance(content_div, Tag): return None links: list[str] = [] - for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)): + for a in filter(lambda x: isinstance(x, Tag), + content_div.find_all("a", href=True)): href = a["href"].strip() # Only include links that start with "http://" or "https://" if href.startswith(("http://", "https://")): @@ -297,28 +298,28 @@ def count_links(container_id: str) -> int: return None -def validate_parsed_entry( - parsed_entry: dict[str, Any], - out_model: type[FileModel | DatasetModel] -) -> tuple[FileModel | DatasetModel | None, dict[str, Any] | None]: - """Validate a parsed entry using the pydantic model. +def validate_parsed_metadatas( + parsed: dict[str, Any], + out_model: type[FileMetadata | DatasetMetadata] +) -> tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None]: + """Validate a parsed dataset using the pydantic model. Parameters ---------- - parsed_entry : dict[str, Any] - The parsed entry to validate. 
- out_model: FileModel | DatasetModel + parsed : dict[str, Any] + The parsed dataset or file to validate. + out_model: FileMetadata | DatasetMetadata The Pydantic model used for the validation. Returns ------- - tuple[FileModel | DatasetModel | None, dict[str, Any] | None] + tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None] A tuple containing the validated model instance if validation succeeds, - otherwise None, and the enriched parsed entry containing validation + otherwise None, and the enriched parsed dataset containing validation failure reasons if validation fails. """ try: - return out_model(**parsed_entry), None + return out_model(**parsed), None except ValidationError as exc: reasons: list[str] = [] @@ -327,65 +328,54 @@ def validate_parsed_entry( reason = err["msg"] value = err.get("input") - logger.error( - "Validation error on '{}': value={!r} (type={}) -> {}", - field, - value, - type(value).__name__, - reason, - ) - - reasons.append(f"{field}: {reason}") + reasons.append(f"{field}: {reason} (input={value!r})") - parsed_entry["non_validation_reason"] = "; ".join(reasons) - return None, parsed_entry + parsed["non_validation_reason"] = "; ".join(reasons) + return None, parsed -def parse_and_validate_entry_metadatas( - entries: list[dict[str, Any]], +def parse_and_validate_dataset_metadatas( + datasets: list[dict[str, Any]], fetch_time: str -) -> tuple[list[DatasetModel], list[dict[str, Any]]]: +) -> tuple[list[DatasetMetadata], list[dict[str, Any]]]: """ - Parse and validate metadata fields for a list of GPCRMD entries. + Parse and validate metadata fields for a list of GPCRMD datasets. Parameters ---------- - entries : list of dict - List of dictionaries, each representing the metadata of a GPCRMD entry. + datasets : list of dict + List of dictionaries, each representing the metadata of a GPCRMD dataset. fetch_time : str Timestamp (as a string) indicating when the data was fetched. 
Returns ------- - tuple[list[DatasetModel], list[dict[str, Any]]] - - List of successfully validated `DatasetModel` objects. - - List of parsed entry that failed validation. + tuple[list[DatasetMetadata], list[dict[str, Any]]] + - List of successfully validated `DatasetMetadata` objects. + - List of parsed dataset that failed validation. """ - logger.info("Starting parsing and validating GPCRMD entries...") - validated_entries: list[DatasetModel] = [] - non_validated_entries: list[dict[str, Any]] = [] - total_entries: int = len(entries) - - for entry in tqdm(entries, - desc="Validating GPCRmd entries", - colour="blue", - unit="entry" - ): - entry_id = str(entry.get("dyn_id")) + logger.info("Starting parsing and validating GPCRMD datasets...") + validated_datasets: list[DatasetMetadata] = [] + non_validated_datasets: list[dict[str, Any]] = [] + total_datasets: int = len(datasets) + + for i, dataset in enumerate(datasets): + dataset_id = str(dataset.get("dyn_id")) # Extract molecules and number total of atoms if available - total_atoms: int | None = entry.get("atom_num") - dyncomp: list[dict[str, Any]] = entry.get("dyncomp", []) + total_atoms: int | None = dataset.get("atom_num") + dyncomp: list[dict[str, Any]] = dataset.get("dyncomp", []) molecules: list[str] = ( [comp.get("resname") for comp in dyncomp if comp.get("resname")] ) - url: str = entry.get("url") - # Fetch entry page with url - html = fetch_entry_page(url) + url = dataset.get("url") + # Fetch dataset page with url + html = fetch_dataset_page(url) if url else None if html: author_names: str | None = retrieve_metadata(html, "Submitted by") description: str | None = retrieve_metadata(html, "Description") stime: str | None = retrieve_metadata(html, "Accumulated simulation time") + stime_list: list[str] | None = [stime] if stime is not None else None refs: list[str] | None = retrieve_reference_links(html) nb_files: int | None = count_simulation_files(html) else: @@ -395,76 +385,85 @@ def 
parse_and_validate_entry_metadatas( refs = None nb_files = None - parsed_entry = { - "dataset_repository": DatasetRepository.GPCRMD, + parsed_dataset = { + "dataset_repository_name": DatasetRepository.GPCRMD, "dataset_project": DatasetProject.GPCRMD, - "dataset_id_in_repository": entry_id, - "dataset_id_in_project": entry_id, + "dataset_id_in_repository": dataset_id, + "dataset_id_in_project": dataset_id, "dataset_url_in_repository": url, "dataset_url_in_project": url, "links": refs, - "title": entry.get("modelname"), - "date_created": entry.get("creation_timestamp"), + "title": dataset.get("modelname"), + "date_created": dataset.get("creation_timestamp"), "date_last_fetched": fetch_time, "nb_files": nb_files, "author_names": author_names if author_names is None else [author_names], "description": description, - "simulation_program_name": entry.get("mysoftware"), - "simulation_program_version": entry.get("software_version"), + "simulation_program_name": dataset.get("mysoftware"), + "simulation_program_version": dataset.get("software_version"), "nb_atoms": total_atoms, "molecule_names": molecules, - "forcefield_model_name": entry.get("forcefield"), - "forcefield_model_version": entry.get("forcefield_version"), - "timestep": entry.get("timestep"), - "delta": entry.get("delta"), - "simulation_time": stime + "forcefield_model_name": dataset.get("forcefield"), + "forcefield_model_version": dataset.get("forcefield_version"), + "timestep": dataset.get("timestep"), + "delta": dataset.get("delta"), + "simulation_time": stime_list } # Validate and normalize data collected with pydantic model - (dataset_model_entry, - non_validated_parsed_entry, - ) = validate_parsed_entry(parsed_entry, DatasetModel) - if isinstance(dataset_model_entry, DatasetModel): - validated_entries.append(dataset_model_entry) - if non_validated_parsed_entry: - non_validated_entries.append(non_validated_parsed_entry) + (parsed_dataset_model, + non_validated_parsed_dataset, + ) = 
validate_parsed_metadatas(parsed_dataset, DatasetMetadata) + # If it return a DatasetMetadata object + if isinstance(parsed_dataset_model, DatasetMetadata): + # Validation succeed + logger.debug(f"Parsed {i}/{len(datasets)} datasets") + validated_datasets.append(parsed_dataset_model) + # If not + if non_validated_parsed_dataset: + # Validation failed + logger.error(f"Validation failed for dataset `{dataset_id}` ({url})" + ". Invalid field(s) detected : " + f"{non_validated_parsed_dataset["non_validation_reason"]}" + ) + non_validated_datasets.append(non_validated_parsed_dataset) percentage = ( - (len(validated_entries) / total_entries) * 100 - if total_entries > 0 + (len(validated_datasets) / total_datasets) * 100 + if total_datasets > 0 else 0.0 ) logger.success( f"Parsing completed: {percentage:.2f}% validated " - f"({len(validated_entries)}/{total_entries}) datasets successfully! \n" + f"({len(validated_datasets)}/{total_datasets}) datasets successfully! \n" ) - return validated_entries, non_validated_entries + return validated_datasets, non_validated_datasets -def make_base_parsed_entry( - entry_id: str, +def make_base_parsed_file( + dataset_id: str, url: str, fetch_time: str, ) -> dict[str, Any]: - """Create a base parsed entry with empty file metadata. + """Create a base parsed dataset with empty file metadata. Parameters ---------- - entry_id : str - The unique identifier of the GPCRMD entry. + dataset_id : str + The unique identifier of the GPCRMD dataset. url : str - The URL of the GPCRMD entry. + The URL of the GPCRMD dataset. fetch_time : str The timestamp indicating when the data was fetched. Returns ------- dict[str, Any] - A dictionary representing the base parsed entry with empty file metadata. + A dictionary representing the base parsed dataset with empty file metadata. 
""" return { - "dataset_repository": DatasetRepository.GPCRMD, - "dataset_id_in_repository": entry_id, + "dataset_repository_name": DatasetRepository.GPCRMD, + "dataset_id_in_repository": dataset_id, "file_name": None, "file_type": None, "file_size_in_bytes": None, @@ -504,45 +503,42 @@ def fetch_file_size(file_path: str) -> int | None: def fetch_and_validate_file_metadatas( - entries: list[dict], + datasets: list[dict], fetch_time: str, -) -> tuple[list[FileModel], list[dict[str, Any]]]: +) -> tuple[list[FileMetadata], list[dict[str, Any]]]: """Fetch and validate metadata for GPCRMD files. Parameters ---------- - entries : list[dict] - List of file entries, each containing metadata such as 'dyn_id' and 'url'. + datasets : list[dict] + List of file datasets, each containing metadata such as 'dyn_id' and 'url'. fetch_time : str Timestamp indicating when the data was fetched. Returns ------- - tuple[list[DatasetModel], list[dict[str, Any]]] - - List of successfully validated `FileModel` objects. - - List of parsed entry that failed validation. + tuple[list[DatasetMetadata], list[dict[str, Any]]] + - List of successfully validated `FileMetadata` objects. + - List of parsed dataset that failed validation. 
""" logger.info("Starting fetching and validating GPCRMD files...") - validated_files: list[FileModel] = [] + validated_files: list[FileMetadata] = [] non_validated_files: list[dict] = [] total_files = 0 - for entry in tqdm( - entries, - desc="Validating GPCRmd files", - colour="blue", - unit="file", - ): - entry_id = str(entry.get("dyn_id")) - url = entry.get("url") + for i, dataset in enumerate(datasets): + dataset_id = str(dataset.get("dyn_id")) + url = dataset.get("url") + files_parsed_for_dataset = 0 - base_entry = make_base_parsed_entry(entry_id, url, fetch_time) + base_file = (make_base_parsed_file(dataset_id, url, fetch_time) + if url else {}) - html = fetch_entry_page(url) + html = fetch_dataset_page(url) if url else None if html is None: - base_entry["non_validation_reason"] = "entry_page_fetch_failed" - non_validated_files.append(base_entry) + base_file["non_validation_reason"] = "dataset_page_fetch_failed" + non_validated_files.append(base_file) continue soup = BeautifulSoup(html, "html.parser") @@ -569,8 +565,8 @@ def fetch_and_validate_file_metadatas( file_name = os.path.basename(file_path) file_type = os.path.splitext(file_name)[1].lstrip(".").lower() - parsed_entry = { - **base_entry, + parsed_file = { + **base_file, "file_name": file_name, "file_type": file_type, "file_size_in_bytes": fetch_file_size(file_path), @@ -578,13 +574,22 @@ def fetch_and_validate_file_metadatas( } # Validate and normalize data collected with pydantic model - (file_model_entry, - non_validated_parsed_entry, - ) = validate_parsed_entry(parsed_entry, FileModel) - if isinstance(file_model_entry, FileModel): - validated_files.append(file_model_entry) - if non_validated_parsed_entry: - non_validated_files.append(non_validated_parsed_entry) + (parsed_file_model, + non_validated_parsed_file, + ) = validate_parsed_metadatas(parsed_file, FileMetadata) + files_parsed_for_dataset += 1 + if isinstance(parsed_file_model, FileMetadata): + logger.debug( + f"Parsed 
{files_parsed_for_dataset} file(s) for dataset " + f"{i}/{len(datasets)}" + ) + validated_files.append(parsed_file_model) + if non_validated_parsed_file: + logger.error(f"Validation failed for file `{file_name}` " + f"({file_path}). Invalid field(s) detected : " + f"{non_validated_parsed_file["non_validation_reason"]}" + ) + non_validated_files.append(non_validated_parsed_file) percentage = ( (len(validated_files) / total_files) * 100 @@ -601,7 +606,7 @@ def fetch_and_validate_file_metadatas( def save_metadatas_to_parquet( folder_out_path: Path, - metadatas_validated: list[DatasetModel] | list[FileModel], + metadatas_validated: list[DatasetMetadata] | list[FileMetadata], metadatas_unvalidated: list[dict], tag: str, ) -> None: @@ -612,25 +617,25 @@ def save_metadatas_to_parquet( ---------- folder_out_path : Path Folder path where Parquet files will be saved. - metadatas_validated : List[DatasetModel] + metadatas_validated : List[DatasetMetadata] List of validated metadatas. metadatas_unvalidated : List[Dict] List of unvalidated metadatas as dictionaries. tag: str - Tag to know if its entries or files metadata to save. + Tag to know if its datasets or files metadata to save. 
""" - logger.info("Saving GPCRMD entries metadatas to a Parquet file...") + logger.info("Saving GPCRMD datasets metadatas to a Parquet file...") # Ensure output folder exists Path(folder_out_path).mkdir(parents=True, exist_ok=True) - # Save validated entries - if tag == "entries": + # Save validated datasets + if tag == "datasets": validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet") elif tag == "files": validated_path = os.path.join(folder_out_path, "gpcrmd_files.parquet") try: # Convert list of Pydantic models to list of dicts - validated_dicts = [entry.model_dump() for entry in metadatas_validated] + validated_dicts = [dataset.model_dump() for dataset in metadatas_validated] df_validated = pd.DataFrame(validated_dicts) df_validated.to_parquet(validated_path, index=False) logger.success( @@ -639,8 +644,8 @@ def save_metadatas_to_parquet( except (ValueError, TypeError, OSError) as e: logger.error(f"Failed to save validated metadata to {validated_path}: {e}") - # Save unvalidated entries - if tag == "entries": + # Save unvalidated datasets + if tag == "datasets": unvalidated_path = os.path.join( folder_out_path, "not_validated_gpcrmd_datasets.parquet" ) @@ -656,7 +661,7 @@ def save_metadatas_to_parquet( f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!" 
) else: - logger.warning("There is no unvalidated entries to save!") + logger.warning("There is no unvalidated datasets to save!") except (ValueError, TypeError, OSError) as e: logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}") @@ -665,7 +670,7 @@ def save_metadatas_to_parquet( @click.option( "--out-path", type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), - default=Path(f"data/gpcrmd/{datetime.now().strftime('%Y%m%d_%H%M%S')}"), + default=Path("data/gpcrmd"), show_default=True, help="Folder path to save the scraped GPCRMD data (Dataset and File metadatas)" ) @@ -679,28 +684,28 @@ def scrap_gpcrmd_data(out_path: Path) -> None: """ setup_logger(logger, out_path) logger.info("Starting GPCRMD data scraping...") - start_time = time.time() + start_time = time.perf_counter() - # Fetch entries metadata - entries, fetch_time = fetch_entries_once() - if entries == []: + # Fetch datasets metadata + datasets, fetch_time = fetch_datasets_once() + if datasets == []: logger.warning("No data fetched from GPCRMD.") return - # Parse and validate entry metadatas with a pydantic model (DatasetModel) - entries_validated, entries_unvalidated = ( - parse_and_validate_entry_metadatas(entries, fetch_time) + # Parse and validate dataset metadatas with a pydantic model (DatasetMetadata) + datasets_validated, datasets_unvalidated = ( + parse_and_validate_dataset_metadatas(datasets, fetch_time) ) # Save parsed metadata to local file save_metadatas_to_parquet( out_path, - entries_validated, - entries_unvalidated, - tag="entries" + datasets_validated, + datasets_unvalidated, + tag="datasets" ) # Fetch, parse and validate the file metadatas with a pydantic model (File Model) files_metadata_validated, files_metadata_unvalidated = ( - fetch_and_validate_file_metadatas(entries, fetch_time) + fetch_and_validate_file_metadatas(datasets, fetch_time) ) save_metadatas_to_parquet( out_path, @@ -709,16 +714,8 @@ def scrap_gpcrmd_data(out_path: Path) 
-> None: tag="files" ) - # Compute the elapsed time for scrapping - end_time = time.time() - elapsed_time = end_time - start_time - hours = int(elapsed_time // 3600) - minutes = int((elapsed_time % 3600) // 60) - seconds = int(elapsed_time % 60) - - logger.success( - f"Completed GPCRMD data scraping in {hours} h {minutes} min {seconds} sec 🎉" - ) + elapsed_time = int(time.perf_counter() - start_time) + logger.success(f"Scraping duration: {timedelta(seconds=elapsed_time)} 🎉") if __name__ == "__main__": From 846e47390bec7f7eb1aa302eb99a513ed2967b35 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 9 Jan 2026 18:39:28 +0100 Subject: [PATCH 6/9] Remove unnecessary timeout parameter. --- scripts/scrap_gpcrmd.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index 4d68031..aa6f4df 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -169,7 +169,7 @@ def fetch_dataset_page(url: str) -> str | None: return response.text -def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | None: +def retrieve_metadata(html: str, field_name: str) -> str | None: """ Retrieve a specific metadata field from a webpage. @@ -179,8 +179,6 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No The HTML content of the page. field_name : str The name of the metadata field to extract (case-sensitive). - timeout : int, optional - Timeout in seconds for the HTTP request (default is 10). Returns ------- @@ -207,7 +205,7 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No return None -def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: +def retrieve_reference_links(html: str) -> list[str] | None: """ Retrieve reference URLs from the References section of a GPCRMD dataset page. 
@@ -215,8 +213,6 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: ---------- html : str The HTML content of the page. - timeout : int, optional - Timeout in seconds for the HTTP request (default is 10). Returns ------- @@ -253,7 +249,7 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None: return None -def count_simulation_files(html: str, timeout: int = 50) -> int | None: +def count_simulation_files(html: str) -> int | None: """ Count files in the dataset webpage. From 7a3831810bf4fb0d62bc7b7df1a41619f999e7a5 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 9 Jan 2026 21:18:49 +0100 Subject: [PATCH 7/9] Remove unnecessary timeout + Improve logging messages. --- scripts/scrap_gpcrmd.py | 152 +++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 80 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index aa6f4df..f0b0fb9 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -12,11 +12,6 @@ - "data/gpcrmd/gpcrmd_datasets.parquet" - "data/gpcrmd/gpcrmd_files.parquet" -Datasets that fail validation are saved as: -- "data/gpcrmd/not_validated_gpcrmd_datasets.parquet" -- "data/gpcrmd/not_validated_gpcrmd_files.parquet" - - Usage: ====== uv run -m scripts.scrap_gpcrmd [--out-path] @@ -34,11 +29,9 @@ This command will: 1. Fetch all available datasets from GPCRMD. 2. Parse their metadata and validate them using the Pydantic models - `DatasetMetadata` and `File Model`. - 3. Save both the validated and unvalidated dataset datasets to - "data/gpcrmd/gpcrmd_datasets.parquet" and - "data/gpcrmd/not_validated_gpcrmd_datasets.parquet" - 4. Save file metadata similarly for validated and unvalidated files. + `DatasetMetadata` and `FileMetadata`. + 3. Save both the validated dataset datasets to "data/gpcrmd/gpcrmd_datasets.parquet" + 4. Save file metadata similarly for validated files. 
""" # METADATAS @@ -297,7 +290,7 @@ def count_links(container_id: str) -> int: def validate_parsed_metadatas( parsed: dict[str, Any], out_model: type[FileMetadata | DatasetMetadata] -) -> tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None]: +) -> tuple[FileMetadata | DatasetMetadata | None, str | None]: """Validate a parsed dataset using the pydantic model. Parameters @@ -309,10 +302,9 @@ def validate_parsed_metadatas( Returns ------- - tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None] + tuple[FileMetadata | DatasetMetadata | None, str | None] A tuple containing the validated model instance if validation succeeds, - otherwise None, and the enriched parsed dataset containing validation - failure reasons if validation fails. + otherwise None, and the validation failure reasons if validation fails. """ try: return out_model(**parsed), None @@ -326,14 +318,14 @@ def validate_parsed_metadatas( reasons.append(f"{field}: {reason} (input={value!r})") - parsed["non_validation_reason"] = "; ".join(reasons) - return None, parsed + non_validation_reason = "; ".join(reasons) + return None, non_validation_reason def parse_and_validate_dataset_metadatas( datasets: list[dict[str, Any]], fetch_time: str -) -> tuple[list[DatasetMetadata], list[dict[str, Any]]]: +) -> list[DatasetMetadata]: """ Parse and validate metadata fields for a list of GPCRMD datasets. @@ -346,16 +338,14 @@ def parse_and_validate_dataset_metadatas( Returns ------- - tuple[list[DatasetMetadata], list[dict[str, Any]]] - - List of successfully validated `DatasetMetadata` objects. - - List of parsed dataset that failed validation. + list[DatasetMetadata] + List of successfully validated `DatasetMetadata` objects. 
""" logger.info("Starting parsing and validating GPCRMD datasets...") validated_datasets: list[DatasetMetadata] = [] - non_validated_datasets: list[dict[str, Any]] = [] total_datasets: int = len(datasets) - for i, dataset in enumerate(datasets): + for i, dataset in enumerate(datasets, start=1): dataset_id = str(dataset.get("dyn_id")) # Extract molecules and number total of atoms if available @@ -375,6 +365,8 @@ def parse_and_validate_dataset_metadatas( refs: list[str] | None = retrieve_reference_links(html) nb_files: int | None = count_simulation_files(html) else: + logger.warning(f"Dataset `{dataset_id}` ({url}): " + "page HTML missing; web metadata extraction skipped.") author_names = None description = None stime = None @@ -408,21 +400,19 @@ def parse_and_validate_dataset_metadatas( # Validate and normalize data collected with pydantic model (parsed_dataset_model, - non_validated_parsed_dataset, + non_validation_reason, ) = validate_parsed_metadatas(parsed_dataset, DatasetMetadata) # If it return a DatasetMetadata object if isinstance(parsed_dataset_model, DatasetMetadata): # Validation succeed - logger.debug(f"Parsed {i}/{len(datasets)} datasets") + logger.debug(f"Parsed dataset id `{dataset_id}` ({i}/{len(datasets)})") validated_datasets.append(parsed_dataset_model) - # If not - if non_validated_parsed_dataset: + else: # Validation failed logger.error(f"Validation failed for dataset `{dataset_id}` ({url})" ". Invalid field(s) detected : " - f"{non_validated_parsed_dataset["non_validation_reason"]}" + f"{non_validation_reason}" ) - non_validated_datasets.append(non_validated_parsed_dataset) percentage = ( (len(validated_datasets) / total_datasets) * 100 @@ -433,7 +423,7 @@ def parse_and_validate_dataset_metadatas( f"Parsing completed: {percentage:.2f}% validated " f"({len(validated_datasets)}/{total_datasets}) datasets successfully! 
\n" ) - return validated_datasets, non_validated_datasets + return validated_datasets def make_base_parsed_file( @@ -501,7 +491,7 @@ def fetch_file_size(file_path: str) -> int | None: def fetch_and_validate_file_metadatas( datasets: list[dict], fetch_time: str, -) -> tuple[list[FileMetadata], list[dict[str, Any]]]: +) -> list[FileMetadata]: """Fetch and validate metadata for GPCRMD files. Parameters @@ -513,28 +503,33 @@ def fetch_and_validate_file_metadatas( Returns ------- - tuple[list[DatasetMetadata], list[dict[str, Any]]] - - List of successfully validated `FileMetadata` objects. - - List of parsed dataset that failed validation. + list[FileMetadata] + List of successfully validated `FileMetadata` objects. """ logger.info("Starting fetching and validating GPCRMD files...") validated_files: list[FileMetadata] = [] - non_validated_files: list[dict] = [] total_files = 0 + non_validated_files_count = 0 - for i, dataset in enumerate(datasets): + for i, dataset in enumerate(datasets, start=1): dataset_id = str(dataset.get("dyn_id")) url = dataset.get("url") - files_parsed_for_dataset = 0 + count_files_parsed_for_dataset = 0 + + if not url: + logger.error( + f"Dataset `{dataset_id}` skipped: missing dataset URL." + ) + continue - base_file = (make_base_parsed_file(dataset_id, url, fetch_time) - if url else {}) + base_file = make_base_parsed_file(dataset_id, url, fetch_time) html = fetch_dataset_page(url) if url else None if html is None: - base_file["non_validation_reason"] = "dataset_page_fetch_failed" - non_validated_files.append(base_file) + logger.error( + f"Dataset `{dataset_id}` ({url}) skipped: page retrieval failed." 
+ ) continue soup = BeautifulSoup(html, "html.parser") @@ -543,6 +538,21 @@ def fetch_and_validate_file_metadatas( container = soup.find("div", id=sec_id) # Ensure container is a Tag if not isinstance(container, Tag): + if sec_id == "allfiles": + # allfiles mandatory + logger.warning( + f"Dataset id `{dataset_id}` ({url}):" + f"mandatory section `{sec_id}` is missing or invalid. " + "Files required for simulation parsing cannot be retrieved." + ) + else: + # paramfiles optional + # logger.warning( + # f"Dataset id `{dataset_id}` ({url}): " + # f"optional section `{sec_id}` not found. " + # "Parameter files for simulations will be skipped." + # ) + pass continue links = container.find_all("a", href=True) @@ -551,11 +561,19 @@ def fetch_and_validate_file_metadatas( for link in links: # Ensure link is a Tag to safely access ['href'] if not isinstance(link, Tag): + logger.warning( + f"Dataset `{dataset_id}` ({url}): " + "encountered non-HTML link element." + ) continue # Use .get() to safely retrieve the href, then convert to str href_value = str(link.get("href", "")).strip() if not href_value: + logger.warning( + f"Dataset `{dataset_id}` ({url}): " + "file link without href attribute." 
+ ) continue file_path = f"https://www.gpcrmd.org/{href_value}" file_name = os.path.basename(file_path) @@ -571,21 +589,21 @@ def fetch_and_validate_file_metadatas( # Validate and normalize data collected with pydantic model (parsed_file_model, - non_validated_parsed_file, + non_validation_reason, ) = validate_parsed_metadatas(parsed_file, FileMetadata) - files_parsed_for_dataset += 1 + count_files_parsed_for_dataset += 1 if isinstance(parsed_file_model, FileMetadata): logger.debug( - f"Parsed {files_parsed_for_dataset} file(s) for dataset " - f"{i}/{len(datasets)}" + f"Parsed file `{file_name}` from dataset " + f"`{dataset_id}` ({i}/{len(datasets)})" ) validated_files.append(parsed_file_model) - if non_validated_parsed_file: + else: logger.error(f"Validation failed for file `{file_name}` " f"({file_path}). Invalid field(s) detected : " - f"{non_validated_parsed_file["non_validation_reason"]}" + f"{non_validation_reason}" ) - non_validated_files.append(non_validated_parsed_file) + non_validated_files_count += 1 percentage = ( (len(validated_files) / total_files) * 100 @@ -594,16 +612,15 @@ def fetch_and_validate_file_metadatas( ) logger.success( f"Parsing completed: {percentage:.2f}% validated " - f"({len(validated_files) - len(non_validated_files)}/" + f"({len(validated_files) - non_validated_files_count}/" f"{total_files}) files successfully! \n" ) - return validated_files, non_validated_files + return validated_files def save_metadatas_to_parquet( folder_out_path: Path, metadatas_validated: list[DatasetMetadata] | list[FileMetadata], - metadatas_unvalidated: list[dict], tag: str, ) -> None: """ @@ -615,16 +632,14 @@ def save_metadatas_to_parquet( Folder path where Parquet files will be saved. metadatas_validated : List[DatasetMetadata] List of validated metadatas. - metadatas_unvalidated : List[Dict] - List of unvalidated metadatas as dictionaries. tag: str Tag to know if its datasets or files metadata to save. 
""" - logger.info("Saving GPCRMD datasets metadatas to a Parquet file...") + logger.info(f"Saving GPCRMD {tag} metadatas to a Parquet file...") # Ensure output folder exists Path(folder_out_path).mkdir(parents=True, exist_ok=True) - # Save validated datasets + # Save validated datasets and files if tag == "datasets": validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet") elif tag == "files": @@ -638,28 +653,7 @@ def save_metadatas_to_parquet( f"GPCRMD validated metadatas saved to: {validated_path} successfully!" ) except (ValueError, TypeError, OSError) as e: - logger.error(f"Failed to save validated metadata to {validated_path}: {e}") - - # Save unvalidated datasets - if tag == "datasets": - unvalidated_path = os.path.join( - folder_out_path, "not_validated_gpcrmd_datasets.parquet" - ) - elif tag == "files": - unvalidated_path = os.path.join( - folder_out_path, "not_validated_gpcrmd_files.parquet" - ) - try: - if len(metadatas_unvalidated) != 0: - df_unvalidated = pd.DataFrame(metadatas_unvalidated) - df_unvalidated.to_parquet(unvalidated_path, index=False) - logger.success( - f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!" 
- ) - else: - logger.warning("There is no unvalidated datasets to save!") - except (ValueError, TypeError, OSError) as e: - logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}") + logger.error(f"Failed to save validated {tag} to {validated_path}: {e}") @click.command() @@ -688,25 +682,23 @@ def scrap_gpcrmd_data(out_path: Path) -> None: logger.warning("No data fetched from GPCRMD.") return # Parse and validate dataset metadatas with a pydantic model (DatasetMetadata) - datasets_validated, datasets_unvalidated = ( + datasets_validated = ( parse_and_validate_dataset_metadatas(datasets, fetch_time) ) # Save parsed metadata to local file save_metadatas_to_parquet( out_path, datasets_validated, - datasets_unvalidated, tag="datasets" ) # Fetch, parse and validate the file metadatas with a pydantic model (File Model) - files_metadata_validated, files_metadata_unvalidated = ( + files_metadata_validated = ( fetch_and_validate_file_metadatas(datasets, fetch_time) ) save_metadatas_to_parquet( out_path, files_metadata_validated, - files_metadata_unvalidated, tag="files" ) From bacdadd38166fa190667327ec5f8494deaab8266 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 9 Jan 2026 21:24:06 +0100 Subject: [PATCH 8/9] Fix Ruff C901 by extracting dataset section parsing logic. --- scripts/scrap_gpcrmd.py | 64 +++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py index f0b0fb9..788b1a3 100644 --- a/scripts/scrap_gpcrmd.py +++ b/scripts/scrap_gpcrmd.py @@ -488,6 +488,50 @@ def fetch_file_size(file_path: str) -> int | None: return None +def get_section_container( + soup: BeautifulSoup, + sec_id: str, + dataset_id: str, + url: str, +) -> Tag | None: + """Retrieve a valid HTML section container from a dataset page. + + The function looks for a ``<div>`` element with the given section ID. + If the section is mandatory (``allfiles``) and missing or invalid, a + warning is logged. Optional sections return ``None`` silently. + + Parameters + ---------- + soup : BeautifulSoup + Parsed HTML content of the dataset page. + sec_id : str + Identifier of the HTML section to retrieve (e.g. ``allfiles``, + ``paramfiles``). + dataset_id : str + Identifier of the dataset, used for logging purposes. + url : str + URL of the dataset page, used for logging purposes. + + Returns + ------- + Tag | None + The HTML ``Tag`` corresponding to the requested section if found + and valid, otherwise ``None``. + """ + container = soup.find("div", id=sec_id) + if isinstance(container, Tag): + return container + + if sec_id == "allfiles": + logger.warning( + f"Dataset id `{dataset_id}` ({url}): " + f"mandatory section `{sec_id}` is missing or invalid. " + "Files required for simulation parsing cannot be retrieved." + ) + + return None + + def fetch_and_validate_file_metadatas( datasets: list[dict], fetch_time: str, @@ -535,24 +579,8 @@ def fetch_and_validate_file_metadatas( soup = BeautifulSoup(html, "html.parser") for sec_id in ("allfiles", "paramfiles"): - container = soup.find("div", id=sec_id) - # Ensure container is a Tag - if not isinstance(container, Tag): - if sec_id == "allfiles": - # allfiles mandatory - logger.warning( - f"Dataset id `{dataset_id}` ({url}):" - f"mandatory section `{sec_id}` is missing or invalid. " - "Files required for simulation parsing cannot be retrieved." - ) - else: - # paramfiles optional - # logger.warning( - # f"Dataset id `{dataset_id}` ({url}): " - # f"optional section `{sec_id}` not found. " - # "Parameter files for simulations will be skipped." 
- # ) - pass + container = get_section_container(soup, sec_id, dataset_id, url) + if container is None: continue links = container.find_all("a", href=True) From 941d72060acffeb52440e6f120f88661722cbe6f Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 9 Jan 2026 21:38:59 +0100 Subject: [PATCH 9/9] Add scrap gpcrmd into readme --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index e8da25b..6b482a0 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,22 @@ This command (takes usually less than 6 minutes) will: 4. Save file metadata similarly for validated and unvalidated files. +## Scrap GPCRmd + +Scrape GPCRmd to collect molecular dynamics (MD) datasets and files related to G-protein-coupled receptors (GPCRs), a major family of membrane proteins and common drug targets. + +```bash +uv run -m scripts.scrap_gpcrmd +``` + +This command will: + 1. Fetch all available datasets from GPCRMD. + 2. Parse their metadata and validate them using the Pydantic models + `DatasetMetadata` and `FileMetadata`. + 3. Save validated dataset metadatas to `data/gpcrmd/gpcrmd_datasets.parquet`. + 4. Save validated file metadatas to `data/gpcrmd/gpcrmd_files.parquet`. + + ## Analyze Gromacs mdp and gro files ### Download files