From 09e5d513508ec7f9d63a8f86f00dfb7c9adf7773 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@etu.u-paris.fr>
Date: Wed, 24 Dec 2025 16:17:34 +0100
Subject: [PATCH 1/9] Introduce new script for scraping GPCRMD datasets and
 files. Enhance dataset model to use float for timestep and delta; add
 simulation_time field.

---
 models/dataset_model.py |   7 +-
 scripts/scrap_gpcrmd.py | 559 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 564 insertions(+), 2 deletions(-)
 create mode 100644 scripts/scrap_gpcrmd.py

diff --git a/models/dataset_model.py b/models/dataset_model.py
index b89d42b..0629c19 100644
--- a/models/dataset_model.py
+++ b/models/dataset_model.py
@@ -182,12 +182,15 @@ class BaseDataset(BaseModel):
         None,
         description="Version of the forcefield model.",
     )
-    timestep: int | None = Field(
+    timestep: float | None = Field(
         None, description="The time interval between new positions computation (in fs)."
     )
-    delta: int | None = Field(
+    delta: float | None = Field(
         None, description="The time gap between frames (in ns)."
     )
+    simulation_time: str | None = Field(
+        None, description="The accumulated simulation time (in μs)."
+    )
 
     # ------------------------------------------------------------------
     # Validators
diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
new file mode 100644
index 0000000..4d8ad29
--- /dev/null
+++ b/scripts/scrap_gpcrmd.py
@@ -0,0 +1,559 @@
+"""
+Scrape datasets and files from GPCRMD.
+
+This script fetches datasets from the GPCRMD repository (https://www.gpcrmd.org/).
+It collects metadata such as dataset names, descriptions, authors, download links,
+and other relevant information for all available datasets.
+Additionally, it retrieves file metadata for each dataset, including file paths
+in GPCRMD, file size, type/extension, etc.
+
+The scraped data is validated against Pydantic models (`BaseDataset` and `BaseFile`)
+and saved locally in Parquet format:
+- "data/gpcrmd/{timestamp}/validated_entries.parquet"
+- "data/gpcrmd/{timestamp}/validated_files.parquet"
+
+Entries that fail validation are saved as:
+- "data/gpcrmd/{timestamp}/unvalidated_entries.parquet"
+- "data/gpcrmd/{timestamp}/unvalidated_files.parquet"
+
+
+Usage:
+======
+    uv run -m scripts.scrap_gpcrmd [--out-path PATH]
+
+Arguments:
+==========
+    --out-path : (optional)
+        Folder path to save the scraped GPCRMD data (dataset and file metadata).
+        Default is "data/gpcrmd/{timestamp}".
+
+Example:
+========
+   uv run -m scripts.scrap_gpcrmd
+
+This command will:
+    1. Fetch all available datasets from GPCRMD in batches.
+    2. Parse their metadata and validate them using the Pydantic models `BaseDataset`
+       and `BaseFile`.
+    3. Save both the validated and unvalidated dataset entries to
+       "data/gpcrmd/{timestamp}/{validated or unvalidated}_entries.parquet".
+    4. Save file metadata similarly for validated and unvalidated files.
+"""
+
+# METADATAS
+__authors__ = ("Pierre Poulain", "Essmay Touami")
+__contact__ = "pierre.poulain@u-paris.fr"
+__copyright__ = "AGPL-3.0 license"
+__date__ = "2025"
+__version__ = "1.0.0"
+
+
+# LIBRARY IMPORTS
+import os
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import click
+import httpx
+import pandas as pd
+from bs4 import BeautifulSoup
+from loguru import logger
+from pydantic import ValidationError
+from tqdm import tqdm
+
+from models.dataset_model import BaseDataset
+from models.file_model import BaseFile
+
+# CONSTANTS
+BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/"
+
+
+# FUNCTIONS
+def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None:
+    """Configure a Loguru logger to write to a date-stamped log file and stdout.
+
+    Parameters
+    ----------
+    loguru_logger : Any
+        A Loguru logger instance (typically `loguru.logger`).
+    log_dir : str or Path, optional
+        Directory where log files will be stored. Default is "logs".
+    """
+    # Ensure log directory exists
+    log_folder = Path(log_dir)
+    log_folder.mkdir(parents=True, exist_ok=True)
+    # Reset any previous configuration
+    loguru_logger.remove()
+    # Define log format
+    fmt = (
+        "{time:YYYY-MM-DD HH:mm:ss}"
+        "| <level>{level:<8}</level> "
+        "| <level>{message}</level>"
+    )
+    loguru_logger.add(
+        log_folder / "scrap_gpcrmd_data_{time:YYYY-MM-DD}.log",
+        format=fmt,
+        level="DEBUG",
+    )
+    loguru_logger.add(
+        sys.stdout,
+        format=fmt,
+        level="DEBUG",
+    )
+
+
+def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
+    """
+    Fetch all entries from the GPCRMD API.
+
+    Returns
+    -------
+    Tuple[List[Dict[str, Any]], str]:
+        - A list of entries (JSON objects).
+        Returns an empty list if the request fails.
+        - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12').
+    """
+    logger.debug(
+        "Fetching entries from GPCRMD API... (usually take \
+        less than 1 minutes!)"
+    )
+    # Current timestamp in ISO format
+    fetch_time: str = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+
+    try:
+        # Perform the HTTP GET request with a long timeout to accommodate large data
+        response = httpx.get(BASE_GPCRMD_URL, timeout=1000)
+        response.raise_for_status()
+
+        # Parse JSON data
+        entries_md = response.json()
+        logger.success(
+            f"Fetched {len(entries_md)} MD-related entries from GPCRMD successfully! \n"
+        )
+        return entries_md, fetch_time
+
+    except httpx.HTTPError as e:
+        logger.error(f"HTTP error occurred: {e}")
+        return [], fetch_time
+
+
+def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | None:
+    """
+    Retrieve a specific metadata field from a webpage.
+
+    Parameters
+    ----------
+    url : str
+        The URL of the webpage to fetch.
+    field_name : str
+        The name of the metadata field to extract (case-sensitive).
+    timeout : int, optional
+        Timeout in seconds for the HTTP request (default is 10).
+
+    Returns
+    -------
+    str | None
+        The value of the metadata field if found, otherwise None.
+
+    """
+    # Try to send an HTTP GET request to the given URL of the dataset.
+    try:
+        response = httpx.get(url, timeout=timeout)
+        response.raise_for_status()
+    except httpx.RequestError as e:
+        logger.warning(f"Failed to fetch {field_name} from {url}: {e}")
+        return None
+    # Parse the HTML content of the page using BeautifulSoup
+    soup = BeautifulSoup(response.text, "html.parser")
+    bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name)
+    if not bold_tag:
+        return None
+    # Get all the text from the parent element of the <b> tag
+    parent_text = bold_tag.parent.get_text(strip=True)
+    if ":" not in parent_text:
+        return None
+    # Get only what is after the "field_name:"
+    return parent_text.split(":", 1)[1].strip() or None
+
+
+def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None:
+    """
+    Retrieve reference URLs from the References section of a GPCRMD entry page.
+
+    Parameters
+    ----------
+    url : str
+        The URL of the GPCRMD entry page.
+    timeout : int, optional
+        Timeout in seconds for the HTTP request (default is 10).
+
+    Returns
+    -------
+    list[str] | None
+        List of reference URLs (starting with http:// or https://) if found,
+        otherwise None.
+    """
+    try:
+        response = httpx.get(url, timeout=timeout)
+        response.raise_for_status()
+    except httpx.RequestError as e:
+        logger.warning(f"Failed to fetch reference links from {url}: {e}")
+        return None
+    # Parse the HTML content
+    soup = BeautifulSoup(response.text, "html.parser")
+    # Find the <h3> header with text "References"
+    header = soup.find("h3", string=lambda t: t and t.strip() == "References")
+    if not header:
+        return None
+    # Get the corresponding content div containing the links
+    content_div = header.find_next_sibling("div", class_="techinfo_content")
+    if not content_div:
+        return None
+
+    links: list[str] = []
+    # Collect all hrefs that start with http:// or https://
+    for a in content_div.find_all("a", href=True):
+        href = a["href"].strip()
+        if href.startswith(("http://", "https://")):
+            links.append(href)
+
+    return links or None
+
+
+def count_simulation_files(url: str, timeout: int = 10) -> int | None:
+    """
+    Count files in the dataset webpage.
+
+    Unique links are counted in the 'Simulation output files' and
+    'Simulation protocol & starting files' sections, then summed.
+
+    Returns
+    -------
+    int | None
+        The number of files related to this dataset, or None if the request fails.
+    """
+    try:
+        response = httpx.get(url, timeout=timeout)
+        response.raise_for_status()
+    except httpx.RequestError as e:
+        logger.warning(f"Failed to fetch file counts from {url}: {e}")
+        return None
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Helper function to count unique links in a container div
+    def count_links(container_id: str) -> int:
+        container = soup.find("div", id=container_id)
+        if not container:
+            return 0
+
+        # Collect all hrefs and remove duplicates while preserving order
+        links = ([a["href"].strip() for a in container.find_all("a", href=True)
+            if a["href"].strip()]
+        )
+        return len(dict.fromkeys(links))
+
+    output_files_count = count_links("allfiles")
+    protocol_files_count = count_links("paramfiles")
+
+    return output_files_count + protocol_files_count
+
+
+def parse_and_validate_entry_metadatas(
+    entries_list: list[dict],
+    fetch_time: str
+) -> tuple[list[BaseDataset], list[dict]]:
+    """
+    Parse and validate metadata fields for a list of GPCRMD entries.
+
+    Parameters
+    ----------
+    entries_list : list of dict
+        List of dictionaries, each representing the metadata of a GPCRMD entry.
+    fetch_time : str
+        Timestamp (as a string) indicating when the data was fetched.
+
+    Returns
+    -------
+    Tuple[List[BaseDataset], List[Dict]]
+        - List of successfully validated `BaseDataset` objects.
+        - List of parsed entry that failed validation.
+    """
+    logger.info("Starting parsing and validation of GPCRMD entries...")
+    validated_entries = []
+    non_validated_entry_ids = []
+    total_entries = len(entries_list)
+
+    for data in tqdm(entries_list):
+        entry_id = str(data.get("dyn_id"))
+
+        # Extract molecules and number total of atoms if available
+        total_atoms = data.get("atom_num")
+        dyncomp = data.get("dyncomp", [])
+        molecules = [comp.get("resname") for comp in dyncomp if comp.get("resname")]
+        url = data.get("url")
+        author_names = [retrieve_metadata(url, "Submitted by")]
+        description = retrieve_metadata(url, "Description")
+        stime = retrieve_metadata(url, "Accumulated simulation time")
+        refs = retrieve_reference_links(url)
+        nb_files = count_simulation_files(url)
+
+        parsed_entry = {
+            "dataset_repository": "GPCRMD",
+            "dataset_project": "GPCRMD",
+            "dataset_id_in_repository": entry_id,
+            "dataset_id_in_project": entry_id,
+            "dataset_url_in_repository": url,
+            "dataset_url_in_project": url,
+            "links": refs,
+            "title": data.get("modelname"),
+            "date_created": data.get("creation_timestamp"),
+            "date_last_fetched": fetch_time,
+            "nb_files": nb_files,
+            "author_names": author_names,
+            "description": description,
+            "simulation_program_name": data.get("mysoftware"),
+            "simulation_program_version": data.get("software_version"),
+            "nb_atoms": total_atoms,
+            "molecule_names": molecules,
+            "forcefield_model_name": data.get("forcefield"),
+            "forcefield_model_version": data.get("forcefield_version"),
+            "timestep": data.get("timestep"),
+            "delta": data.get("delta"),
+            "simulation_time": stime
+            }
+        try:
+            # Validate and normalize data collected wieh pydantic model
+            dataset_model = BaseDataset(**parsed_entry)
+            validated_entries.append(dataset_model)
+        except ValidationError as e:
+            logger.error(f"Validation failed for entry {entry_id}")
+            for err in e.errors():
+                logger.error(f"  Field: {'.'.join(str(x) for x in err['loc'])}")
+                logger.error(f"  Error: {err['msg']} (type={err['type']})")
+            non_validated_entry_ids.append(parsed_entry)
+
+    logger.success(
+        f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\
+            entries successfully! \n"
+    )
+    return validated_entries, non_validated_entry_ids
+
+
+def parse_and_validate_files_metadatas(
+    entries_list: list[dict],
+    fetch_time: str
+) -> tuple[list[BaseFile], list[dict]]:
+    """
+    Parse and validate metadata for GPCRMD files.
+
+    Parameters
+    ----------
+    entries_list : list[dict]
+        List of file entries, each containing metadata such as 'dyn_id' and 'url'.
+    fetch_time : str
+        Timestamp indicating when the data was fetched.
+
+    Returns
+    -------
+    tuple[list[BaseFile], list[dict]]
+        - List of validated `BaseFile` objects.
+        - List of file entries that failed validation.
+    """
+    logger.info("Starting parsing and validation of GPCRMD files...")
+    validated_entries: list[BaseFile] = []
+    non_validated_entry_ids: list[dict] = []
+    total_files = len(entries_list)
+
+    # Loop over every entry and scrape the file links from its page
+    for data in tqdm(entries_list):
+        entry_id = str(data.get("dyn_id"))
+        url = data.get("url")
+
+        # Fetch the file page
+        try:
+            response = httpx.get(url, timeout=10)
+            response.raise_for_status()
+        except httpx.RequestError as e:
+            logger.warning(f"Failed to fetch file page for {entry_id}: {e}")
+            non_validated_entry_ids.append(data)
+            continue
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        sections = ["allfiles", "paramfiles"]
+
+        # Iterate over sections containing files
+        for sec_id in sections:
+            container = soup.find("div", id=sec_id)
+            if not container:
+                continue
+
+            # Process each file link
+            for a in container.find_all("a", href=True):
+                file_path = f"https://www.gpcrmd.org/{a['href'].strip()}"
+                if not file_path:
+                    continue
+
+                file_name = os.path.basename(file_path)
+                file_extension = os.path.splitext(file_name)[1].lstrip(".").lower()
+
+                # Try to fetch file size via HEAD request
+                size: int = None
+                try:
+                    head_resp = httpx.head(file_path, timeout=10, follow_redirects=True)
+                    size = int(head_resp.headers.get("Content-Length", 0))
+                except httpx.RequestError as e:
+                    logger.warning(f"Failed to fetch file size for {file_name}: {e}")
+
+                parsed_entry = {
+                    "dataset_repository": "GPCRMD",
+                    "dataset_id_in_repository": entry_id,
+                    "file_name": file_name,
+                    "file_type": file_extension,
+                    "file_size": size,
+                    "file_url_in_repository": file_path,
+                    "date_last_fetched": fetch_time,
+                }
+
+                # Validate and normalize entry using Pydantic model
+                try:
+                    dataset_model = BaseFile(**parsed_entry)
+                    validated_entries.append(dataset_model)
+                except ValidationError as e:
+                    logger.error(f"Validation failed for file {entry_id}: {e}")
+                    non_validated_entry_ids.append(parsed_entry)
+
+    logger.success(
+        f"Parsing completed: {len(validated_entries)} validated / {total_files} \
+            total files successfully!"
+    )
+    return validated_entries, non_validated_entry_ids
+
+
+def save_metadatas_to_parquet(
+    folder_out_path: Path,
+    metadatas_validated: list[BaseDataset] | list[BaseFile],
+    metadatas_unvalidated: list[dict],
+    tag: str,
+) -> None:
+    """
+    Save GPCRMD validated and unvalidated metadata to Parquet files.
+
+    Parameters
+    ----------
+    folder_out_path : Path
+        Folder path where Parquet files will be saved.
+    metadatas_validated : list[BaseDataset] | list[BaseFile]
+        List of validated entries (datasets or files).
+    metadatas_unvalidated : list[dict]
+        List of unvalidated entries as dictionaries.
+    tag : str
+        Either "entries" or "files"; selects the output Parquet file names.
+    """
+    logger.info("Saving GPCRMD entries metadatas to a Parquet file...")
+    # Ensure output folder exists
+    Path(folder_out_path).mkdir(parents=True, exist_ok=True)
+
+    # Save validated entries
+    if tag == "entries":
+        validated_path = os.path.join(folder_out_path, "validated_entries.parquet")
+    elif tag == "files":
+        validated_path = os.path.join(folder_out_path, "validated_files.parquet")
+    try:
+        # Convert list of Pydantic models to list of dicts
+        validated_dicts = [entry.model_dump() for entry in metadatas_validated]
+        df_validated = pd.DataFrame(validated_dicts)
+        df_validated.to_parquet(validated_path, index=False)
+        logger.success(
+            f"GPCRMD validated metadatas saved to: {validated_path} successfully!"
+        )
+    except (ValueError, TypeError, OSError) as e:
+        logger.error(f"Failed to save validated metadata to {validated_path}: {e}")
+
+    # Save unvalidated entries
+    if tag == "entries":
+        unvalidated_path = os.path.join(
+            folder_out_path, "unvalidated_entries.parquet"
+        )
+    elif tag == "files":
+        unvalidated_path = os.path.join(
+            folder_out_path, "unvalidated_files.parquet"
+        )
+    try:
+        if len(metadatas_unvalidated) != 0:
+            df_unvalidated = pd.DataFrame(metadatas_unvalidated)
+            df_unvalidated.to_parquet(unvalidated_path, index=False)
+            logger.success(
+            f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!"
+            )
+        else:
+            logger.warning("There is no unvalidated entries to save!")
+    except (ValueError, TypeError, OSError) as e:
+        logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}")
+
+
+@click.command()
+@click.option(
+    "--out-path",
+    type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path),
+    default=Path(f"data/gpcrmd/{datetime.now().strftime('%Y%m%d_%H%M%S')}"),
+    show_default=True,
+    help="Folder path to save the scraped GPCRMD data (Dataset and File metadatas)"
+)
+def scrap_gpcrmd_data(out_path: Path) -> None:
+    """Scrap datasets and files from GPCRMD.
+
+    Parameters
+    ----------
+    out_path : Path
+        The output folder path for the scraped data.
+    """
+    setup_logger(logger, out_path)
+    logger.info("Starting GPCRMD data scraping...")
+    start_time = time.time()
+
+    # Fetch entries metadata
+    entries, fetch_time = fetch_entries_once()
+    if entries == []:
+        logger.warning("No data fetched from GPCRMD.")
+        return
+    # Parse and validate  entry metadatas with a pydantic model (BaseDataset)
+    entries_validated, entries_unvalidated = (
+        parse_and_validate_entry_metadatas(entries, fetch_time)
+    )
+    # Save parsed metadata to local file
+    save_metadatas_to_parquet(
+        out_path,
+        entries_validated,
+        entries_unvalidated,
+        tag="entries"
+    )
+
+    # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile)
+    files_metadata_validated, files_metadata_unvalidated = (
+        parse_and_validate_files_metadatas(entries, fetch_time)
+    )
+    save_metadatas_to_parquet(
+        out_path,
+        files_metadata_validated,
+        files_metadata_unvalidated,
+        tag="files"
+    )
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    hours = int(elapsed_time // 3600)
+    minutes = int((elapsed_time % 3600) // 60)
+    seconds = int(elapsed_time % 60)
+
+    logger.success(
+        f"Completed GPCRMD data scraping in {hours} h {minutes} min {seconds} sec 🎉"
+    )
+
+
+if __name__ == "__main__":
+    # Scrap GPCRMD data
+    scrap_gpcrmd_data()

From 73a16346f059a52f5e5b487865ec5a6e0f6fead6 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@etu.u-paris.fr>
Date: Mon, 29 Dec 2025 14:02:59 +0100
Subject: [PATCH 2/9] Refactor GPCRMD scraping script: improve metadata
 handling, enhance validation logging, and update function signatures for
 clarity.

---
 scripts/scrap_gpcrmd.py | 401 +++++++++++++++++++++++++++++-----------
 1 file changed, 290 insertions(+), 111 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index 4d8ad29..b161807 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -32,7 +32,7 @@
    uv run -m scripts.scrap_gpcrmd
 
 This command will:
-    1. Fetch all available datasets from GPCRMD in batches.
+    1. Fetch all available datasets from GPCRMD.
     2. Parse their metadata and validate them using the Pydantic models `BaseDataset`
        and `BaseFile`.
     3. Save both the validated and unvalidated dataset entries to
@@ -59,12 +59,12 @@
 import click
 import httpx
 import pandas as pd
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from loguru import logger
 from pydantic import ValidationError
 from tqdm import tqdm
 
-from models.dataset_model import BaseDataset
+from models.dataset_model import BaseDataset, DatasetProject, DatasetRepository
 from models.file_model import BaseFile
 
 # CONSTANTS
@@ -117,8 +117,8 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
         - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12').
     """
     logger.debug(
-        "Fetching entries from GPCRMD API... (usually take \
-        less than 1 minutes!)"
+        "Fetching entries from GPCRMD API... "
+        "(usually takes less than 1 minute!)"
     )
     # Current timestamp in ISO format
     fetch_time: str = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@@ -140,7 +140,7 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
         return [], fetch_time
 
 
-def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | None:
+def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | None:
     """
     Retrieve a specific metadata field from a webpage.
 
@@ -163,6 +163,13 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | Non
     try:
         response = httpx.get(url, timeout=timeout)
         response.raise_for_status()
+
+    except httpx.HTTPStatusError as e:
+        logger.warning(
+            f"HTTP error {e.response.status_code} for {url}"
+        )
+        return None
+
     except httpx.RequestError as e:
         logger.warning(f"Failed to fetch {field_name} from {url}: {e}")
         return None
@@ -172,14 +179,18 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 10) -> str | Non
     if not bold_tag:
         return None
     # Get all the text from the parent element of the <b> tag
-    parent_text = bold_tag.parent.get_text(strip=True)
+    parent = bold_tag.parent
+    if not isinstance(parent, Tag):
+        return None
+    parent_text = parent.get_text(strip=True)
+
     if ":" not in parent_text:
         return None
     # Get only what is after the "field_name:"
     return parent_text.split(":", 1)[1].strip() or None
 
 
-def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None:
+def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None:
     """
     Retrieve reference URLs from the References section of a GPCRMD entry page.
 
@@ -199,6 +210,13 @@ def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None:
     try:
         response = httpx.get(url, timeout=timeout)
         response.raise_for_status()
+
+    except httpx.HTTPStatusError as e:
+        logger.warning(
+            f"HTTP error {e.response.status_code} for {url}"
+        )
+        return None
+
     except httpx.RequestError as e:
         logger.warning(f"Failed to fetch reference links from {url}: {e}")
         return None
@@ -213,17 +231,22 @@ def retrieve_reference_links(url: str, timeout: int = 10) -> list[str] | None:
     if not content_div:
         return None
 
+    # NOTE(review): this repeats the lookup done just above; it only narrows
+    # content_div to Tag for type checkers before the <a href> links are read
+    content_div = header.find_next_sibling("div", class_="techinfo_content")
+    if not isinstance(content_div, Tag):
+        return None
     links: list[str] = []
-    # Collect all hrefs that start with http:// or https://
-    for a in content_div.find_all("a", href=True):
+    for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)):
         href = a["href"].strip()
+        # Only include links that start with "http://" or "https://"
         if href.startswith(("http://", "https://")):
             links.append(href)
 
     return links or None
 
 
-def count_simulation_files(url: str, timeout: int = 10) -> int | None:
+def count_simulation_files(url: str, timeout: int = 50) -> int | None:
     """
     Count files in the dataset webpage.
 
@@ -238,6 +261,12 @@ def count_simulation_files(url: str, timeout: int = 10) -> int | None:
     try:
         response = httpx.get(url, timeout=timeout)
         response.raise_for_status()
+
+    except httpx.HTTPStatusError as e:
+        logger.warning(
+            f"HTTP error {e.response.status_code} for {url}"
+        )
+        return None
     except httpx.RequestError as e:
         logger.warning(f"Failed to fetch file counts from {url}: {e}")
         return None
@@ -246,14 +275,20 @@ def count_simulation_files(url: str, timeout: int = 10) -> int | None:
 
     # Helper function to count unique links in a container div
     def count_links(container_id: str) -> int:
+        # Find the container <div> by ID
         container = soup.find("div", id=container_id)
-        if not container:
+        # Ensure the container is actually a Tag
+        if not isinstance(container, Tag):
             return 0
 
-        # Collect all hrefs and remove duplicates while preserving order
-        links = ([a["href"].strip() for a in container.find_all("a", href=True)
-            if a["href"].strip()]
-        )
+        # Collect all hrefs in <a> tags, stripping whitespace
+        links = [
+            str(a.get("href", "")).strip()
+            for a in container.find_all("a", href=True)
+            if isinstance(a, Tag) and str(a.get("href", "")).strip()
+        ]
+
+        # Remove duplicates while preserving order
         return len(dict.fromkeys(links))
 
     output_files_count = count_links("allfiles")
@@ -263,9 +298,9 @@ def count_links(container_id: str) -> int:
 
 
 def parse_and_validate_entry_metadatas(
-    entries_list: list[dict],
+    entries_list: list[dict[str, Any]],
     fetch_time: str
-) -> tuple[list[BaseDataset], list[dict]]:
+) -> tuple[list[BaseDataset], list[dict[str, Any]]]:
     """
     Parse and validate metadata fields for a list of GPCRMD entries.
 
@@ -282,77 +317,230 @@ def parse_and_validate_entry_metadatas(
         - List of successfully validated `BaseDataset` objects.
         - List of parsed entry that failed validation.
     """
-    logger.info("Starting parsing and validation of GPCRMD entries...")
-    validated_entries = []
-    non_validated_entry_ids = []
-    total_entries = len(entries_list)
-
-    for data in tqdm(entries_list):
-        entry_id = str(data.get("dyn_id"))
+    logger.info("Starting parsing and validating GPCRMD entries...")
+    validated_entries: list[BaseDataset] = []
+    non_validated_entries: list[dict[str, Any]] = []
+    total_entries: int = len(entries_list)
+
+    for entry in tqdm(entries_list,
+            desc="Validating GPCRmd entries",
+            colour="blue",
+            unit="entry"
+        ):
+        entry_id = str(entry.get("dyn_id"))
 
         # Extract molecules and number total of atoms if available
-        total_atoms = data.get("atom_num")
-        dyncomp = data.get("dyncomp", [])
-        molecules = [comp.get("resname") for comp in dyncomp if comp.get("resname")]
-        url = data.get("url")
-        author_names = [retrieve_metadata(url, "Submitted by")]
-        description = retrieve_metadata(url, "Description")
-        stime = retrieve_metadata(url, "Accumulated simulation time")
-        refs = retrieve_reference_links(url)
-        nb_files = count_simulation_files(url)
+        total_atoms: int | None = entry.get("atom_num")
+        dyncomp: list[dict[str, Any]] = entry.get("dyncomp", [])
+        molecules: list[str] = (
+            [comp.get("resname") for comp in dyncomp if comp.get("resname")]
+        )
+        url: str = entry.get("url")
+        author_names: list[str | None] = [retrieve_metadata(url, "Submitted by")]
+        description: str | None = retrieve_metadata(url, "Description")
+        stime: str | None = retrieve_metadata(url, "Accumulated simulation time")
+        refs: list[str] | None = retrieve_reference_links(url)
+        nb_files: int | None = count_simulation_files(url)
+        softname: str = entry.get("mysoftware")
+        softvers: str = entry.get("software_version")
+        ffm: str = entry.get("forcefield")
+        ffm_vers: str = entry.get("forcefield_version")
+        delta: float = entry.get("delta")
+        timestep: float = entry.get("timestep")
+        title: str = entry.get("modelname")
+        date: str = entry.get("creation_timestamp")
 
         parsed_entry = {
-            "dataset_repository": "GPCRMD",
-            "dataset_project": "GPCRMD",
+            "dataset_repository": DatasetRepository.GPCRMD,
+            "dataset_project": DatasetProject.GPCRMD,
             "dataset_id_in_repository": entry_id,
             "dataset_id_in_project": entry_id,
             "dataset_url_in_repository": url,
             "dataset_url_in_project": url,
             "links": refs,
-            "title": data.get("modelname"),
-            "date_created": data.get("creation_timestamp"),
+            "title": title,
+            "date_created": date,
             "date_last_fetched": fetch_time,
             "nb_files": nb_files,
             "author_names": author_names,
             "description": description,
-            "simulation_program_name": data.get("mysoftware"),
-            "simulation_program_version": data.get("software_version"),
+            "simulation_program_name": softname,
+            "simulation_program_version": softvers,
             "nb_atoms": total_atoms,
             "molecule_names": molecules,
-            "forcefield_model_name": data.get("forcefield"),
-            "forcefield_model_version": data.get("forcefield_version"),
-            "timestep": data.get("timestep"),
-            "delta": data.get("delta"),
+            "forcefield_model_name": ffm,
+            "forcefield_model_version": ffm_vers,
+            "timestep": timestep,
+            "delta": delta,
             "simulation_time": stime
             }
         try:
-            # Validate and normalize data collected wieh pydantic model
-            dataset_model = BaseDataset(**parsed_entry)
+            # Validate and normalize data collected with pydantic model
+            dataset_model = BaseDataset(**parsed_entry)  # ty:ignore[invalid-argument-type]  # noqa: E501
             validated_entries.append(dataset_model)
         except ValidationError as e:
-            logger.error(f"Validation failed for entry {entry_id}")
+            reasons: list[str] = []
+
             for err in e.errors():
-                logger.error(f"  Field: {'.'.join(str(x) for x in err['loc'])}")
-                logger.error(f"  Error: {err['msg']} (type={err['type']})")
-            non_validated_entry_ids.append(parsed_entry)
+                field = ".".join(str(x) for x in err["loc"])
+                reason = err["msg"]
+                value = err.get("input")
+
+                logger.error(
+                    "Validation error on '{}': value={!r} (type={}) -> {}",
+                    field,
+                    value,
+                    type(value).__name__,
+                    reason,
+                )
+                reasons.append(f"{field}: {reason}")
+
+            parsed_entry["non_validation_reason"] = "; ".join(reasons)
+            non_validated_entries.append(parsed_entry)
 
     logger.success(
         f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\
             entries successfully! \n"
     )
-    return validated_entries, non_validated_entry_ids
+    return validated_entries, non_validated_entries
 
 
-def parse_and_validate_files_metadatas(
-    entries_list: list[dict],
-    fetch_time: str
-) -> tuple[list[BaseFile], list[dict]]:
+def make_base_parsed_entry(
+    entry_id: str,
+    url: str,
+    fetch_time: str,
+) -> dict[str, Any]:
+    """Create a base parsed entry with empty file metadata.
+
+    Parameters
+    ----------
+    entry_id : str
+        The unique identifier of the GPCRMD entry.
+    url : str
+        The URL of the GPCRMD entry.
+    fetch_time : str
+        The timestamp indicating when the data was fetched.
+
+    Returns
+    -------
+    dict[str, Any]
+        A dictionary representing the base parsed entry with empty file metadata.
     """
-    Parse and validate metadata for GPCRMD files.
+    return {
+        "dataset_repository": DatasetRepository.GPCRMD,
+        "dataset_id_in_repository": entry_id,
+        "file_name": None,
+        "file_type": None,
+        "file_size": None,
+        "file_url_in_repository": url,
+        "date_last_fetched": fetch_time,
+    }
+
+
+def fetch_entry_page(url: str) -> str | None:
+    """Fetch an entry page and return its HTML content.
 
     Parameters
     ----------
-    entries_list : list[dict]
+    url : str
+        The URL of the entry page to fetch.
+
+    Returns
+    -------
+    str | None
+        The HTML content of the page if the request is successful, otherwise None.
+    """
+    try:
+        response = httpx.get(url, timeout=50)
+        response.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        logger.warning("HTTP error %s for %s", exc.response.status_code, url)
+        return None
+    except httpx.RequestError as exc:
+        logger.warning("Request error for %s: %s", url, exc)
+        return None
+
+    return response.text
+
+
+def fetch_file_size(file_path: str) -> int | None:
+    """Fetch file size using a HEAD request.
+
+    Parameters
+    ----------
+    file_path : str
+        The URL of the file to fetch the size for.
+
+    Returns
+    -------
+        int | None
+        The size of the file in bytes if available, otherwise None.
+    """
+    try:
+        response = httpx.head(file_path, timeout=50, follow_redirects=True)
+        return int(response.headers.get("Content-Length", 0))
+    except httpx.HTTPStatusError as exc:
+        logger.warning(
+            "HTTP error %s for %s",
+            exc.response.status_code,
+            file_path,
+        )
+    except httpx.RequestError as exc:
+        logger.warning("Failed to fetch file size for %s: %s", file_path, exc)
+
+    return None
+
+
+def validate_parsed_entry(
+    parsed_entry: dict[str, Any],
+) -> BaseFile | None:
+    """Validate a parsed entry using the BaseFile model.
+
+    Parameters
+    ----------
+    parsed_entry : dict[str, Any]
+        The parsed entry to validate.
+
+    Returns
+    -------
+    BaseFile | None
+        The validated BaseFile object if validation is successful,
+        otherwise None.
+
+    """
+    try:
+        return BaseFile(**parsed_entry)
+    except ValidationError as exc:
+        reasons: list[str] = []
+
+        for err in exc.errors():
+            field = ".".join(str(x) for x in err["loc"])
+            reason = err["msg"]
+            value = err.get("input")
+
+            logger.error(
+                "Validation error on '{}': value={!r} (type={}) -> {}",
+                field,
+                value,
+                type(value).__name__,
+                reason,
+            )
+
+            reasons.append(f"{field}: {reason}")
+
+        parsed_entry["non_validation_reason"] = "; ".join(reasons)
+        return None
+
+
+def fetch_and_validate_file_metadatas(
+    entries: list[dict],
+    fetch_time: str,
+) -> tuple[list[BaseFile], list[dict]]:
+    """Fetch and validate metadata for GPCRMD files.
+
+    Parameters
+    ----------
+    entries : list[dict]
         List of file entries, each containing metadata such as 'dyn_id' and 'url'.
     fetch_time : str
         Timestamp indicating when the data was fetched.
@@ -363,74 +551,64 @@ def parse_and_validate_files_metadatas(
         - List of validated `BaseFile` objects.
         - List of file entries that failed validation.
     """
-    logger.info("Starting parsing and validation of GPCRMD files...")
-    validated_entries: list[BaseFile] = []
-    non_validated_entry_ids: list[dict] = []
-    total_files = len(entries_list)
+    logger.info("Starting fetching and validating GPCRMD files...")
 
-    # Loop over the first two entries for demonstration
-    for data in tqdm(entries_list):
-        entry_id = str(data.get("dyn_id"))
-        url = data.get("url")
-
-        # Fetch the file page
-        try:
-            response = httpx.get(url, timeout=10)
-            response.raise_for_status()
-        except httpx.RequestError as e:
-            logger.warning(f"Failed to fetch file page for {entry_id}: {e}")
-            non_validated_entry_ids.append(data)
+    validated_entries: list[BaseFile] = []
+    non_validated_entries: list[dict] = []
+
+    for entry in tqdm(
+        entries,
+        desc="Validating GPCRmd files",
+        colour="blue",
+        unit="file",
+    ):
+        entry_id = str(entry.get("dyn_id"))
+        url = entry.get("url")
+
+        base_entry = make_base_parsed_entry(entry_id, url, fetch_time)
+
+        html = fetch_entry_page(url)
+        if html is None:
+            base_entry["non_validation_reason"] = "entry_page_fetch_failed"
+            non_validated_entries.append(base_entry)
             continue
 
-        soup = BeautifulSoup(response.text, "html.parser")
-        sections = ["allfiles", "paramfiles"]
+        soup = BeautifulSoup(html, "html.parser")
 
-        # Iterate over sections containing files
-        for sec_id in sections:
+        for sec_id in ("allfiles", "paramfiles"):
             container = soup.find("div", id=sec_id)
-            if not container:
+            # Ensure container is a Tag
+            if not isinstance(container, Tag):
                 continue
 
-            # Process each file link
-            for a in container.find_all("a", href=True):
-                file_path = f"https://www.gpcrmd.org/{a['href'].strip()}"
-                if not file_path:
+            for link in container.find_all("a", href=True):
+                # Ensure link is a Tag to safely access ['href']
+                if not isinstance(link, Tag):
                     continue
 
+                # Use .get() to safely retrieve the href, then convert to str
+                href_value = str(link.get("href", "")).strip()
+                if not href_value:
+                    continue
+                file_path = f"https://www.gpcrmd.org/{href_value}"
                 file_name = os.path.basename(file_path)
-                file_extension = os.path.splitext(file_name)[1].lstrip(".").lower()
-
-                # Try to fetch file size via HEAD request
-                size: int = None
-                try:
-                    head_resp = httpx.head(file_path, timeout=10, follow_redirects=True)
-                    size = int(head_resp.headers.get("Content-Length", 0))
-                except httpx.RequestError as e:
-                    logger.warning(f"Failed to fetch file size for {file_name}: {e}")
+                file_type = os.path.splitext(file_name)[1].lstrip(".").lower()
 
                 parsed_entry = {
-                    "dataset_repository": "GPCRMD",
-                    "dataset_id_in_repository": entry_id,
+                    **base_entry,
                     "file_name": file_name,
-                    "file_type": file_extension,
-                    "file_size": size,
+                    "file_type": file_type,
+                    "file_size": fetch_file_size(file_path),
                     "file_url_in_repository": file_path,
-                    "date_last_fetched": fetch_time,
                 }
 
-                # Validate and normalize entry using Pydantic model
-                try:
-                    dataset_model = BaseFile(**parsed_entry)
-                    validated_entries.append(dataset_model)
-                except ValidationError as e:
-                    logger.error(f"Validation failed for file {entry_id}: {e}")
-                    non_validated_entry_ids.append(parsed_entry)
+                validated = validate_parsed_entry(parsed_entry)
+                if validated is None:
+                    non_validated_entries.append(parsed_entry)
+                else:
+                    validated_entries.append(validated)
 
-    logger.success(
-        f"Parsing completed: {len(validated_entries)} validated / {total_files} \
-            total files successfully!"
-    )
-    return validated_entries, non_validated_entry_ids
+    return validated_entries, non_validated_entries
 
 
 def save_metadatas_to_parquet(
@@ -447,9 +625,9 @@ def save_metadatas_to_parquet(
     folder_out_path : Path
         Folder path where Parquet files will be saved.
     metadatas_validated : List[BaseDataset]
-        List of validated entries.
+        List of validated metadatas.
     metadatas_unvalidated : List[Dict]
-        List of unvalidated entries as dictionaries.
+        List of unvalidated metadatas as dictionaries.
     tag: str
         Tag to know if its entries or files metadata to save.
     """
@@ -534,7 +712,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
 
     # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile)
     files_metadata_validated, files_metadata_unvalidated = (
-        parse_and_validate_files_metadatas(entries, fetch_time)
+        fetch_and_validate_file_metadatas(entries, fetch_time)
     )
     save_metadatas_to_parquet(
         out_path,
@@ -543,6 +721,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
         tag="files"
     )
 
+    # Compute the elapsed time for scrapping
     end_time = time.time()
     elapsed_time = end_time - start_time
     hours = int(elapsed_time // 3600)

From 9bcf8c9cc787b3abe854336e5bda8448caa10580 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Mon, 5 Jan 2026 15:18:11 +0100
Subject: [PATCH 3/9] Refactor code to reduce length, eliminate duplicate
 requests, and rename output files and models.

---
 scripts/scrap_gpcrmd.py | 503 +++++++++++++++++++---------------------
 1 file changed, 245 insertions(+), 258 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index b161807..41a5e2b 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -7,14 +7,14 @@
 Additionally, it retrieves file metadata for each dataset, including file paths
 in GPCRMD, file size, type/extension, etc.
 
-The scraped data is validated against Pydantic models (`BaseDataset` and `BaseFile`)
+The scraped data is validated against Pydantic models (`DatasetModel` and `FileModel`)
 and saved locally in Parquet format:
-- "data/gpcrmd/{timestamp}/validated_entries.parquet"
-- "data/gpcrmd/{timestamp}/validated_files.parquet"
+- "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet"
+- "data/gpcrmd/{timestamp}/gpcrmd_files.parquet"
 
 Entries that fail validation are saved as:
-- "data/gpcrmd/{timestamp}/unvalidated_entries.parquet"
-- "data/gpcrmd/{timestamp}/unvalidated_files.parquet"
+- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet"
+- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_files.parquet"
 
 
 Usage:
@@ -33,10 +33,11 @@
 
 This command will:
     1. Fetch all available datasets from GPCRMD.
-    2. Parse their metadata and validate them using the Pydantic models `BaseDataset`
-       and `BaseFile`.
+    2. Parse their metadata and validate them using the Pydantic models `DatasetModel`
+       and `FileModel`.
     3. Save both the validated and unvalidated dataset entries to
-       "data/gpcrmd/{timestamp}/{validated or unvalidated}_entries.parquet".
+       "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" and
+       "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet"
     4. Save file metadata similarly for validated and unvalidated files.
 """
 
@@ -64,8 +65,8 @@
 from pydantic import ValidationError
 from tqdm import tqdm
 
-from models.dataset_model import BaseDataset, DatasetProject, DatasetRepository
-from models.file_model import BaseFile
+from models.dataset_model import DatasetModel, DatasetProject, DatasetRepository
+from models.file_model import FileModel
 
 # CONSTANTS
 BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/"
@@ -140,14 +141,42 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
         return [], fetch_time
 
 
-def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | None:
+def fetch_entry_page(url: str) -> str | None:
+    """Fetch an entry page and return its HTML content.
+
+    Parameters
+    ----------
+    url : str
+        The URL of the entry page to fetch.
+
+    Returns
+    -------
+    str | None
+        The HTML content of the page if the request is successful, otherwise None.
+    """
+    try:
+        response = httpx.get(url, timeout=50)
+        response.raise_for_status()
+        # Sleep briefly to avoid overwhelming the remote server
+        time.sleep(0.1)
+    except httpx.HTTPStatusError as exc:
+        logger.warning(f"HTTP error {exc.response.status_code} for {url}")
+        return None
+    except httpx.RequestError as exc:
+        logger.warning(f"Request error for {url}: {exc}")
+        return None
+
+    return response.text
+
+
+def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | None:
     """
     Retrieve a specific metadata field from a webpage.
 
     Parameters
     ----------
-    url : str
-        The URL of the webpage to fetch.
+    html : str
+        The HTML content of the page.
     field_name : str
         The name of the metadata field to extract (case-sensitive).
     timeout : int, optional
@@ -159,45 +188,33 @@ def retrieve_metadata(url: str, field_name: str, timeout: int = 50) -> str | Non
         The value of the metadata field if found, otherwise None.
 
     """
-    # Try to send an HTTP GET request to the given URL of the dataset.
-    try:
-        response = httpx.get(url, timeout=timeout)
-        response.raise_for_status()
-
-    except httpx.HTTPStatusError as e:
-        logger.warning(
-            f"HTTP error {e.response.status_code} for {url}"
-        )
-        return None
-
-    except httpx.RequestError as e:
-        logger.warning(f"Failed to fetch {field_name} from {url}: {e}")
-        return None
     # Parse the HTML content of the page using BeautifulSoup
-    soup = BeautifulSoup(response.text, "html.parser")
-    bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name)
-    if not bold_tag:
-        return None
-    # Get all the text from the parent element of the <b> tag
-    parent = bold_tag.parent
-    if not isinstance(parent, Tag):
-        return None
-    parent_text = parent.get_text(strip=True)
-
-    if ":" not in parent_text:
-        return None
-    # Get only what is after the "field_name:"
-    return parent_text.split(":", 1)[1].strip() or None
+    if html:
+        soup = BeautifulSoup(html, "html.parser")
+        bold_tag = soup.find("b", string=lambda t: t and t.strip() == field_name)
+        if not bold_tag:
+            return None
+        # Get all the text from the parent element of the <b> tag
+        parent = bold_tag.parent
+        if not isinstance(parent, Tag):
+            return None
+        parent_text = parent.get_text(strip=True)
+        if ":" not in parent_text:
+            return None
+        # Get only what is after the "field_name:"
+        metadata = parent_text.split(":", 1)[1].strip()
+        return metadata
+    return None
 
 
-def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None:
+def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
     """
     Retrieve reference URLs from the References section of a GPCRMD entry page.
 
     Parameters
     ----------
-    url : str
-        The URL of the GPCRMD entry page.
+    html : str
+        The HTML content of the page.
     timeout : int, optional
         Timeout in seconds for the HTTP request (default is 10).
 
@@ -207,122 +224,149 @@ def retrieve_reference_links(url: str, timeout: int = 50) -> list[str] | None:
         List of reference URLs (starting with http:// or https://) if found,
         otherwise None.
     """
-    try:
-        response = httpx.get(url, timeout=timeout)
-        response.raise_for_status()
-
-    except httpx.HTTPStatusError as e:
-        logger.warning(
-            f"HTTP error {e.response.status_code} for {url}"
-        )
-        return None
-
-    except httpx.RequestError as e:
-        logger.warning(f"Failed to fetch reference links from {url}: {e}")
-        return None
-    # Parse the HTML content
-    soup = BeautifulSoup(response.text, "html.parser")
-    # Find the <h3> header with text "References"
-    header = soup.find("h3", string=lambda t: t and t.strip() == "References")
-    if not header:
-        return None
-    # Get the corresponding content div containing the links
-    content_div = header.find_next_sibling("div", class_="techinfo_content")
-    if not content_div:
-        return None
-
-    # Iterate over all <a> elements with an href attribute inside the content div
-    # Only keep elements that are of type Tag to satisfy type checkers
-    content_div = header.find_next_sibling("div", class_="techinfo_content")
-    if not isinstance(content_div, Tag):
-        return None
-    links: list[str] = []
-    for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)):
-        href = a["href"].strip()
-        # Only include links that start with "http://" or "https://"
-        if href.startswith(("http://", "https://")):
-            links.append(href)
+    if html:
+        # Parse the HTML content
+        soup = BeautifulSoup(html, "html.parser")
+        # Find the <h3> header with text "References"
+        header = soup.find("h3", string=lambda t: t and t.strip() == "References")
+        if not header:
+            return None
+        # Get the corresponding content div containing the links
+        content_div = header.find_next_sibling("div", class_="techinfo_content")
+        if not content_div:
+            return None
+
+        # Iterate over all <a> elements with an href attribute inside the content div
+        # Only keep elements that are of type Tag to satisfy type checkers
+        content_div = header.find_next_sibling("div", class_="techinfo_content")
+        if not isinstance(content_div, Tag):
+            return None
+        links: list[str] = []
+        for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)):
+            href = a["href"].strip()
+            # Only include links that start with "http://" or "https://"
+            if href.startswith(("http://", "https://")):
+                links.append(href)
+        return links
 
-    return links or None
+    return None
 
 
-def count_simulation_files(url: str, timeout: int = 50) -> int | None:
+def count_simulation_files(html: str, timeout: int = 50) -> int | None:
     """
     Count files in the dataset webpage.
 
     Especially in 'Simulation output files' and 'Simulation protocol \
     & starting files' sections.
 
+    Parameters
+    ----------
+    html : str
+        The HTML content of the page.
+
     Returns
     -------
     int | None
         The number of files related to this dataset.
     """
-    try:
-        response = httpx.get(url, timeout=timeout)
-        response.raise_for_status()
+    if html:
+        # Parse the HTML content
+        soup = BeautifulSoup(html, "html.parser")
 
-    except httpx.HTTPStatusError as e:
-        logger.warning(
-            f"HTTP error {e.response.status_code} for {url}"
-        )
-        return None
-    except httpx.RequestError as e:
-        logger.warning(f"Failed to fetch file counts from {url}: {e}")
-        return None
+        # Helper function to count unique links in a container div
+        def count_links(container_id: str) -> int:
+            # Find the container <div> by ID
+            container = soup.find("div", id=container_id)
+            # Ensure the container is actually a Tag
+            if not isinstance(container, Tag):
+                return 0
+
+            # Collect all hrefs in <a> tags, stripping whitespace
+            links = [
+                str(a.get("href", "")).strip()
+                for a in container.find_all("a", href=True)
+                if isinstance(a, Tag) and str(a.get("href", "")).strip()
+            ]
+
+            # Remove duplicates while preserving order
+            return len(dict.fromkeys(links))
+
+        output_files_count = count_links("allfiles")
+        protocol_files_count = count_links("paramfiles")
+        return output_files_count + protocol_files_count
+    return None
+
+
+def validate_parsed_entry(
+    parsed_entry: dict[str, Any],
+    out_model: type[FileModel | DatasetModel]
+) -> tuple[FileModel | DatasetModel | None, dict[str, Any] | None]:
+    """Validate a parsed entry using the pydantic model.
 
-    soup = BeautifulSoup(response.text, "html.parser")
+    Parameters
+    ----------
+    parsed_entry : dict[str, Any]
+        The parsed entry to validate.
+    out_model : type[FileModel | DatasetModel]
+        The Pydantic model class used for the validation.
 
-    # Helper function to count unique links in a container div
-    def count_links(container_id: str) -> int:
-        # Find the container <div> by ID
-        container = soup.find("div", id=container_id)
-        # Ensure the container is actually a Tag
-        if not isinstance(container, Tag):
-            return 0
+    Returns
+    -------
+    tuple[FileModel | DatasetModel | None,  dict[str, Any] | None]
+        A tuple containing the validated model instance if validation succeeds,
+        otherwise None, and the enriched parsed entry containing validation
+        failure reasons if validation fails.
+    """
+    try:
+        return out_model(**parsed_entry), None
+    except ValidationError as exc:
+        reasons: list[str] = []
 
-        # Collect all hrefs in <a> tags, stripping whitespace
-        links = [
-            str(a.get("href", "")).strip()
-            for a in container.find_all("a", href=True)
-            if isinstance(a, Tag) and str(a.get("href", "")).strip()
-        ]
+        for err in exc.errors():
+            field = ".".join(str(x) for x in err["loc"])
+            reason = err["msg"]
+            value = err.get("input")
 
-        # Remove duplicates while preserving order
-        return len(dict.fromkeys(links))
+            logger.error(
+                "Validation error on '{}': value={!r} (type={}) -> {}",
+                field,
+                value,
+                type(value).__name__,
+                reason,
+            )
 
-    output_files_count = count_links("allfiles")
-    protocol_files_count = count_links("paramfiles")
+            reasons.append(f"{field}: {reason}")
 
-    return output_files_count + protocol_files_count
+        parsed_entry["non_validation_reason"] = "; ".join(reasons)
+        return None, parsed_entry
 
 
 def parse_and_validate_entry_metadatas(
-    entries_list: list[dict[str, Any]],
+    entries: list[dict[str, Any]],
     fetch_time: str
-) -> tuple[list[BaseDataset], list[dict[str, Any]]]:
+) -> tuple[list[DatasetModel], list[dict[str, Any]]]:
     """
     Parse and validate metadata fields for a list of GPCRMD entries.
 
     Parameters
     ----------
-    entries_list : list of dict
+    entries : list of dict
         List of dictionaries, each representing the metadata of a GPCRMD entry.
     fetch_time : str
         Timestamp (as a string) indicating when the data was fetched.
 
     Returns
     -------
-    Tuple[List[BaseDataset], List[Dict]]
-        - List of successfully validated `BaseDataset` objects.
+    tuple[list[DatasetModel], list[dict[str, Any]]]
+        - List of successfully validated `DatasetModel` objects.
         - List of parsed entry that failed validation.
     """
     logger.info("Starting parsing and validating GPCRMD entries...")
-    validated_entries: list[BaseDataset] = []
+    validated_entries: list[DatasetModel] = []
     non_validated_entries: list[dict[str, Any]] = []
-    total_entries: int = len(entries_list)
+    total_entries: int = len(entries)
 
-    for entry in tqdm(entries_list,
+    for entry in tqdm(entries,
             desc="Validating GPCRmd entries",
             colour="blue",
             unit="entry"
@@ -336,19 +380,20 @@ def parse_and_validate_entry_metadatas(
             [comp.get("resname") for comp in dyncomp if comp.get("resname")]
         )
         url: str = entry.get("url")
-        author_names: list[str | None] = [retrieve_metadata(url, "Submitted by")]
-        description: str | None = retrieve_metadata(url, "Description")
-        stime: str | None = retrieve_metadata(url, "Accumulated simulation time")
-        refs: list[str] | None = retrieve_reference_links(url)
-        nb_files: int | None = count_simulation_files(url)
-        softname: str = entry.get("mysoftware")
-        softvers: str = entry.get("software_version")
-        ffm: str = entry.get("forcefield")
-        ffm_vers: str = entry.get("forcefield_version")
-        delta: float = entry.get("delta")
-        timestep: float = entry.get("timestep")
-        title: str = entry.get("modelname")
-        date: str = entry.get("creation_timestamp")
+        # Fetch entry page with url
+        html = fetch_entry_page(url)
+        if html:
+            author_names: str | None = retrieve_metadata(html, "Submitted by")
+            description: str | None = retrieve_metadata(html, "Description")
+            stime: str | None = retrieve_metadata(html, "Accumulated simulation time")
+            refs: list[str] | None = retrieve_reference_links(html)
+            nb_files: int | None = count_simulation_files(html)
+        else:
+            author_names = None
+            description = None
+            stime = None
+            refs = None
+            nb_files = None
 
         parsed_entry = {
             "dataset_repository": DatasetRepository.GPCRMD,
@@ -358,49 +403,40 @@ def parse_and_validate_entry_metadatas(
             "dataset_url_in_repository": url,
             "dataset_url_in_project": url,
             "links": refs,
-            "title": title,
-            "date_created": date,
+            "title": entry.get("modelname"),
+            "date_created": entry.get("creation_timestamp"),
             "date_last_fetched": fetch_time,
             "nb_files": nb_files,
-            "author_names": author_names,
+            "author_names": author_names if author_names is None else [author_names],
             "description": description,
-            "simulation_program_name": softname,
-            "simulation_program_version": softvers,
+            "simulation_program_name": entry.get("mysoftware"),
+            "simulation_program_version": entry.get("software_version"),
             "nb_atoms": total_atoms,
             "molecule_names": molecules,
-            "forcefield_model_name": ffm,
-            "forcefield_model_version": ffm_vers,
-            "timestep": timestep,
-            "delta": delta,
+            "forcefield_model_name": entry.get("forcefield"),
+            "forcefield_model_version": entry.get("forcefield_version"),
+            "timestep": entry.get("timestep"),
+            "delta": entry.get("delta"),
             "simulation_time": stime
             }
-        try:
-            # Validate and normalize data collected with pydantic model
-            dataset_model = BaseDataset(**parsed_entry)  # ty:ignore[invalid-argument-type]  # noqa: E501
-            validated_entries.append(dataset_model)
-        except ValidationError as e:
-            reasons: list[str] = []
-
-            for err in e.errors():
-                field = ".".join(str(x) for x in err["loc"])
-                reason = err["msg"]
-                value = err.get("input")
-
-                logger.error(
-                    "Validation error on '{}': value={!r} (type={}) -> {}",
-                    field,
-                    value,
-                    type(value).__name__,
-                    reason,
-                )
-                reasons.append(f"{field}: {reason}")
-
-            parsed_entry["non_validation_reason"] = "; ".join(reasons)
-            non_validated_entries.append(parsed_entry)
 
+        # Validate and normalize data collected with pydantic model
+        (dataset_model_entry,
+            non_validated_parsed_entry,
+        ) = validate_parsed_entry(parsed_entry, DatasetModel)
+        if isinstance(dataset_model_entry, DatasetModel):
+            validated_entries.append(dataset_model_entry)
+        if non_validated_parsed_entry:
+            non_validated_entries.append(non_validated_parsed_entry)
+
+    percentage = (
+        (len(validated_entries) / total_entries) * 100
+        if total_entries > 0
+        else 0.0
+    )
     logger.success(
-        f"Parsing completed: {len(validated_entries)} validated / {total_entries} total\
-            entries successfully! \n"
+        f"Parsing completed: {percentage:.2f}% validated "
+        f"({len(validated_entries)}/{total_entries}) datasets successfully! \n"
     )
     return validated_entries, non_validated_entries
 
@@ -431,38 +467,12 @@ def make_base_parsed_entry(
         "dataset_id_in_repository": entry_id,
         "file_name": None,
         "file_type": None,
-        "file_size": None,
+        "file_size_in_bytes": None,
         "file_url_in_repository": url,
         "date_last_fetched": fetch_time,
     }
 
 
-def fetch_entry_page(url: str) -> str | None:
-    """Fetch an entry page and return its HTML content.
-
-    Parameters
-    ----------
-    url : str
-        The URL of the entry page to fetch.
-
-    Returns
-    -------
-    str | None
-        The HTML content of the page if the request is successful, otherwise None.
-    """
-    try:
-        response = httpx.get(url, timeout=50)
-        response.raise_for_status()
-    except httpx.HTTPStatusError as exc:
-        logger.warning("HTTP error %s for %s", exc.response.status_code, url)
-        return None
-    except httpx.RequestError as exc:
-        logger.warning("Request error for %s: %s", url, exc)
-        return None
-
-    return response.text
-
-
 def fetch_file_size(file_path: str) -> int | None:
     """Fetch file size using a HEAD request.
 
@@ -478,6 +488,8 @@ def fetch_file_size(file_path: str) -> int | None:
     """
     try:
         response = httpx.head(file_path, timeout=50, follow_redirects=True)
+        # Sleep briefly to avoid overwhelming the remote server
+        time.sleep(0.1)
         return int(response.headers.get("Content-Length", 0))
     except httpx.HTTPStatusError as exc:
         logger.warning(
@@ -491,51 +503,10 @@ def fetch_file_size(file_path: str) -> int | None:
     return None
 
 
-def validate_parsed_entry(
-    parsed_entry: dict[str, Any],
-) -> BaseFile | None:
-    """Validate a parsed entry using the BaseFile model.
-
-    Parameters
-    ----------
-    parsed_entry : dict[str, Any]
-        The parsed entry to validate.
-
-    Returns
-    -------
-    BaseFile | None
-        The validated BaseFile object if validation is successful,
-        otherwise None.
-
-    """
-    try:
-        return BaseFile(**parsed_entry)
-    except ValidationError as exc:
-        reasons: list[str] = []
-
-        for err in exc.errors():
-            field = ".".join(str(x) for x in err["loc"])
-            reason = err["msg"]
-            value = err.get("input")
-
-            logger.error(
-                "Validation error on '{}': value={!r} (type={}) -> {}",
-                field,
-                value,
-                type(value).__name__,
-                reason,
-            )
-
-            reasons.append(f"{field}: {reason}")
-
-        parsed_entry["non_validation_reason"] = "; ".join(reasons)
-        return None
-
-
 def fetch_and_validate_file_metadatas(
     entries: list[dict],
     fetch_time: str,
-) -> tuple[list[BaseFile], list[dict]]:
+) -> tuple[list[FileModel], list[dict[str, Any]]]:
     """Fetch and validate metadata for GPCRMD files.
 
     Parameters
@@ -547,14 +518,15 @@ def fetch_and_validate_file_metadatas(
 
     Returns
     -------
-    tuple[list[BaseFile], list[dict]]
-        - List of validated `BaseFile` objects.
-        - List of file entries that failed validation.
+    tuple[list[DatasetModel], list[dict[str, Any]]]
+        - List of successfully validated `FileModel` objects.
+        - List of parsed entry that failed validation.
     """
     logger.info("Starting fetching and validating GPCRMD files...")
 
-    validated_entries: list[BaseFile] = []
-    non_validated_entries: list[dict] = []
+    validated_files: list[FileModel] = []
+    non_validated_files: list[dict] = []
+    total_files = 0
 
     for entry in tqdm(
         entries,
@@ -570,7 +542,7 @@ def fetch_and_validate_file_metadatas(
         html = fetch_entry_page(url)
         if html is None:
             base_entry["non_validation_reason"] = "entry_page_fetch_failed"
-            non_validated_entries.append(base_entry)
+            non_validated_files.append(base_entry)
             continue
 
         soup = BeautifulSoup(html, "html.parser")
@@ -581,7 +553,10 @@ def fetch_and_validate_file_metadatas(
             if not isinstance(container, Tag):
                 continue
 
-            for link in container.find_all("a", href=True):
+            links = container.find_all("a", href=True)
+            total_files += len(links)
+
+            for link in links:
                 # Ensure link is a Tag to safely access ['href']
                 if not isinstance(link, Tag):
                     continue
@@ -598,22 +573,34 @@ def fetch_and_validate_file_metadatas(
                     **base_entry,
                     "file_name": file_name,
                     "file_type": file_type,
-                    "file_size": fetch_file_size(file_path),
+                    "file_size_in_bytes": fetch_file_size(file_path),
                     "file_url_in_repository": file_path,
                 }
 
-                validated = validate_parsed_entry(parsed_entry)
-                if validated is None:
-                    non_validated_entries.append(parsed_entry)
-                else:
-                    validated_entries.append(validated)
-
-    return validated_entries, non_validated_entries
+                # Validate and normalize data collected with pydantic model
+                (file_model_entry,
+                    non_validated_parsed_entry,
+                ) = validate_parsed_entry(parsed_entry, FileModel)
+                if isinstance(file_model_entry, FileModel):
+                    validated_files.append(file_model_entry)
+                if non_validated_parsed_entry:
+                    non_validated_files.append(non_validated_parsed_entry)
+
+    percentage = (
+        (len(validated_files) / total_files) * 100
+        if total_files > 0
+        else 0.0
+    )
+    logger.success(
+        f"Parsing completed: {percentage:.2f}% validated "
+        f"({len(validated_files)}/{total_files}) files successfully! \n"
+    )
+    return validated_files, non_validated_files
 
 
 def save_metadatas_to_parquet(
     folder_out_path: Path,
-    metadatas_validated: list[BaseDataset] | list[BaseFile],
+    metadatas_validated: list[DatasetModel] | list[FileModel],
     metadatas_unvalidated: list[dict],
     tag: str,
 ) -> None:
@@ -624,7 +611,7 @@ def save_metadatas_to_parquet(
     ----------
     folder_out_path : Path
         Folder path where Parquet files will be saved.
-    metadatas_validated : List[BaseDataset]
+    metadatas_validated : List[DatasetModel]
         List of validated metadatas.
     metadatas_unvalidated : List[Dict]
         List of unvalidated metadatas as dictionaries.
@@ -637,9 +624,9 @@ def save_metadatas_to_parquet(
 
     # Save validated entries
     if tag == "entries":
-        validated_path = os.path.join(folder_out_path, "validated_entries.parquet")
+        validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet")
     elif tag == "files":
-        validated_path = os.path.join(folder_out_path, "validated_files.parquet")
+        validated_path = os.path.join(folder_out_path, "gpcrmd_files.parquet")
     try:
         # Convert list of Pydantic models to list of dicts
         validated_dicts = [entry.model_dump() for entry in metadatas_validated]
@@ -654,11 +641,11 @@ def save_metadatas_to_parquet(
     # Save unvalidated entries
     if tag == "entries":
         unvalidated_path = os.path.join(
-            folder_out_path, "unvalidated_entries.parquet"
+            folder_out_path, "not_validated_gpcrmd_datasets.parquet"
         )
     elif tag == "files":
         unvalidated_path = os.path.join(
-            folder_out_path, "unvalidated_files.parquet"
+            folder_out_path, "not_validated_gpcrmd_files.parquet"
         )
     try:
         if len(metadatas_unvalidated) != 0:
@@ -698,7 +685,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
     if entries == []:
         logger.warning("No data fetched from GPCRMD.")
         return
-    # Parse and validate  entry metadatas with a pydantic model (BaseDataset)
+    # Parse and validate  entry metadatas with a pydantic model (DatasetModel)
     entries_validated, entries_unvalidated = (
         parse_and_validate_entry_metadatas(entries, fetch_time)
     )
@@ -710,7 +697,7 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
         tag="entries"
     )
 
-    # Fetch, parse and validate the file metadatas with a pydantic model (BaseFile)
+    # Fetch, parse and validate the file metadatas with a pydantic model (File Model)
     files_metadata_validated, files_metadata_unvalidated = (
         fetch_and_validate_file_metadatas(entries, fetch_time)
     )

From a2a8e87b3a3e4d569bdabeb564fd5f5c3bde1def Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Mon, 5 Jan 2026 17:50:47 +0100
Subject: [PATCH 4/9] Correct one log for the validation of files.

---
 scripts/scrap_gpcrmd.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index 41a5e2b..135c7e2 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -593,7 +593,8 @@ def fetch_and_validate_file_metadatas(
     )
     logger.success(
         f"Parsing completed: {percentage:.2f}% validated "
-        f"({len(validated_files)}/{total_files}) files successfully! \n"
+        f"({len(validated_files) - len(non_validated_files)}/"
+        f"{total_files}) files successfully! \n"
     )
     return validated_files, non_validated_files
 

From 7250ffeba0fc349027443d44da346286360decc8 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@etu.u-paris.fr>
Date: Thu, 8 Jan 2026 19:58:32 +0100
Subject: [PATCH 5/9] Refactor GPCRMD scraping script: update metadata
 handling, improve validation logging, and rename functions for clarity.

---
 scripts/scrap_gpcrmd.py | 337 ++++++++++++++++++++--------------------
 1 file changed, 167 insertions(+), 170 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index 135c7e2..4d68031 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -7,14 +7,14 @@
 Additionally, it retrieves file metadata for each dataset, including file paths
 in GPCRMD, file size, type/extension, etc.
 
-The scraped data is validated against Pydantic models (`DatasetModel` and `File Model`)
-and saved locally in Parquet format:
-- "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet"
-- "data/gpcrmd/{timestamp}/gpcrmd_files.parquet"
+The scraped data is validated against Pydantic models (`DatasetMetadata`
+and `FileMetadata`) and saved locally in Parquet format:
+- "data/gpcrmd/gpcrmd_datasets.parquet"
+- "data/gpcrmd/gpcrmd_files.parquet"
 
-Entries that fail validation are saved as:
-- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet"
-- "data/gpcrmd/{timestamp}/not_validated_gpcrmd_files.parquet"
+Datasets that fail validation are saved as:
+- "data/gpcrmd/not_validated_gpcrmd_datasets.parquet"
+- "data/gpcrmd/not_validated_gpcrmd_files.parquet"
 
 
 Usage:
@@ -25,7 +25,7 @@
 ==========
     --out-path : (optional)
         Folder path to save the scraped GPCRMD data (dataset and file metadata).
-        Default is "data/gpcrmd/{timestamp}".
+        Default is "data/gpcrmd".
 
 Example:
 ========
@@ -33,11 +33,11 @@
 
 This command will:
     1. Fetch all available datasets from GPCRMD.
-    2. Parse their metadata and validate them using the Pydantic models `DatasetModel`
-       and `File Model`.
-    3. Save both the validated and unvalidated dataset entries to
-       "data/gpcrmd/{timestamp}/gpcrmd_datasets.parquet" and
-       "data/gpcrmd/{timestamp}/not_validated_gpcrmd_datasets.parquet"
+    2. Parse their metadata and validate them using the Pydantic models
+    `DatasetMetadata` and `FileMetadata`.
+    3. Save both the validated and unvalidated datasets to
+       "data/gpcrmd/gpcrmd_datasets.parquet" and
+       "data/gpcrmd/not_validated_gpcrmd_datasets.parquet"
     4. Save file metadata similarly for validated and unvalidated files.
 """
 
@@ -53,7 +53,7 @@
 import os
 import sys
 import time
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Any
 
@@ -63,10 +63,9 @@
 from bs4 import BeautifulSoup, Tag
 from loguru import logger
 from pydantic import ValidationError
-from tqdm import tqdm
 
-from models.dataset_model import DatasetModel, DatasetProject, DatasetRepository
-from models.file_model import FileModel
+from models.dataset import DatasetMetadata, DatasetProject, DatasetRepository
+from models.file import FileMetadata
 
 # CONSTANTS
 BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/"
@@ -98,6 +97,7 @@ def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None:
         log_folder / "scrap_gpcrmd_data_{time:YYYY-MM-DD}.log",
         format=fmt,
         level="DEBUG",
+        mode="w",
     )
     loguru_logger.add(
         sys.stdout,
@@ -106,19 +106,19 @@ def setup_logger(loguru_logger: Any, log_dir: str | Path = "logs") -> None:
     )
 
 
-def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
+def fetch_datasets_once() -> tuple[list[dict[str, Any]], str]:
     """
-    Fetch all entries from the GPCRMD API.
+    Fetch all datasets from the GPCRMD API.
 
     Returns
     -------
     Tuple[List[Dict[str, Any]], str]:
-        - A list of entries (JSON objects).
+        - A list of datasets (JSON objects).
         Returns an empty list if the request fails.
         - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12').
     """
     logger.debug(
-        "Fetching entries from GPCRMD API... "
+        "Fetching datasets from GPCRMD API... "
         "(usually takes less than 1 minute!)"
     )
     # Current timestamp in ISO format
@@ -130,24 +130,24 @@ def fetch_entries_once() -> tuple[list[dict[str, Any]], str]:
         response.raise_for_status()
 
         # Parse JSON data
-        entries_md = response.json()
+        datasets = response.json()
         logger.success(
-            f"Fetched {len(entries_md)} MD-related entries from GPCRMD successfully! \n"
+            f"Fetched {len(datasets)} MD-related datasets from GPCRMD successfully! \n"
         )
-        return entries_md, fetch_time
+        return datasets, fetch_time
 
     except httpx.HTTPError as e:
         logger.error(f"HTTP error occurred: {e}")
         return [], fetch_time
 
 
-def fetch_entry_page(url: str) -> str | None:
-    """Fetch an entry page and return its HTML content.
+def fetch_dataset_page(url: str) -> str | None:
+    """Fetch a dataset page and return its HTML content.
 
     Parameters
     ----------
     url : str
-        The URL of the entry page to fetch.
+        The URL of the dataset page to fetch.
 
     Returns
     -------
@@ -209,7 +209,7 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No
 
 def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
     """
-    Retrieve reference URLs from the References section of a GPCRMD entry page.
+    Retrieve reference URLs from the References section of a GPCRMD dataset page.
 
     Parameters
     ----------
@@ -242,7 +242,8 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
         if not isinstance(content_div, Tag):
             return None
         links: list[str] = []
-        for a in filter(lambda x: isinstance(x, Tag), content_div.find_all("a", href=True)):
+        for a in filter(lambda x: isinstance(x, Tag),
+            content_div.find_all("a", href=True)):
             href = a["href"].strip()
             # Only include links that start with "http://" or "https://"
             if href.startswith(("http://", "https://")):
@@ -297,28 +298,28 @@ def count_links(container_id: str) -> int:
     return None
 
 
-def validate_parsed_entry(
-    parsed_entry: dict[str, Any],
-    out_model: type[FileModel | DatasetModel]
-) -> tuple[FileModel | DatasetModel | None, dict[str, Any] | None]:
-    """Validate a parsed entry using the pydantic model.
+def validate_parsed_metadatas(
+    parsed: dict[str, Any],
+    out_model: type[FileMetadata | DatasetMetadata]
+) -> tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None]:
+    """Validate a parsed dataset using the pydantic model.
 
     Parameters
     ----------
-    parsed_entry : dict[str, Any]
-        The parsed entry to validate.
-    out_model: FileModel | DatasetModel
+    parsed : dict[str, Any]
+        The parsed dataset or file to validate.
+    out_model: FileMetadata | DatasetMetadata
         The Pydantic model used for the validation.
 
     Returns
     -------
-    tuple[FileModel | DatasetModel | None,  dict[str, Any] | None]
+    tuple[FileMetadata | DatasetMetadata | None,  dict[str, Any] | None]
         A tuple containing the validated model instance if validation succeeds,
-        otherwise None, and the enriched parsed entry containing validation
+        otherwise None, and the enriched parsed dataset containing validation
         failure reasons if validation fails.
     """
     try:
-        return out_model(**parsed_entry), None
+        return out_model(**parsed), None
     except ValidationError as exc:
         reasons: list[str] = []
 
@@ -327,65 +328,54 @@ def validate_parsed_entry(
             reason = err["msg"]
             value = err.get("input")
 
-            logger.error(
-                "Validation error on '{}': value={!r} (type={}) -> {}",
-                field,
-                value,
-                type(value).__name__,
-                reason,
-            )
-
-            reasons.append(f"{field}: {reason}")
+            reasons.append(f"{field}: {reason} (input={value!r})")
 
-        parsed_entry["non_validation_reason"] = "; ".join(reasons)
-        return None, parsed_entry
+        parsed["non_validation_reason"] = "; ".join(reasons)
+        return None, parsed
 
 
-def parse_and_validate_entry_metadatas(
-    entries: list[dict[str, Any]],
+def parse_and_validate_dataset_metadatas(
+    datasets: list[dict[str, Any]],
     fetch_time: str
-) -> tuple[list[DatasetModel], list[dict[str, Any]]]:
+) -> tuple[list[DatasetMetadata], list[dict[str, Any]]]:
     """
-    Parse and validate metadata fields for a list of GPCRMD entries.
+    Parse and validate metadata fields for a list of GPCRMD datasets.
 
     Parameters
     ----------
-    entries : list of dict
-        List of dictionaries, each representing the metadata of a GPCRMD entry.
+    datasets : list of dict
+        List of dictionaries, each representing the metadata of a GPCRMD dataset.
     fetch_time : str
         Timestamp (as a string) indicating when the data was fetched.
 
     Returns
     -------
-    tuple[list[DatasetModel], list[dict[str, Any]]]
-        - List of successfully validated `DatasetModel` objects.
-        - List of parsed entry that failed validation.
+    tuple[list[DatasetMetadata], list[dict[str, Any]]]
+        - List of successfully validated `DatasetMetadata` objects.
+        - List of parsed datasets that failed validation.
     """
-    logger.info("Starting parsing and validating GPCRMD entries...")
-    validated_entries: list[DatasetModel] = []
-    non_validated_entries: list[dict[str, Any]] = []
-    total_entries: int = len(entries)
-
-    for entry in tqdm(entries,
-            desc="Validating GPCRmd entries",
-            colour="blue",
-            unit="entry"
-        ):
-        entry_id = str(entry.get("dyn_id"))
+    logger.info("Starting parsing and validating GPCRMD datasets...")
+    validated_datasets: list[DatasetMetadata] = []
+    non_validated_datasets: list[dict[str, Any]] = []
+    total_datasets: int = len(datasets)
+
+    for i, dataset in enumerate(datasets):
+        dataset_id = str(dataset.get("dyn_id"))
 
         # Extract molecules and number total of atoms if available
-        total_atoms: int | None = entry.get("atom_num")
-        dyncomp: list[dict[str, Any]] = entry.get("dyncomp", [])
+        total_atoms: int | None = dataset.get("atom_num")
+        dyncomp: list[dict[str, Any]] = dataset.get("dyncomp", [])
         molecules: list[str] = (
             [comp.get("resname") for comp in dyncomp if comp.get("resname")]
         )
-        url: str = entry.get("url")
-        # Fetch entry page with url
-        html = fetch_entry_page(url)
+        url = dataset.get("url")
+        # Fetch dataset page with url
+        html = fetch_dataset_page(url) if url else None
         if html:
             author_names: str | None = retrieve_metadata(html, "Submitted by")
             description: str | None = retrieve_metadata(html, "Description")
             stime: str | None = retrieve_metadata(html, "Accumulated simulation time")
+            stime_list: list[str] | None = [stime] if stime is not None else None
             refs: list[str] | None = retrieve_reference_links(html)
             nb_files: int | None = count_simulation_files(html)
         else:
@@ -395,76 +385,85 @@ def parse_and_validate_entry_metadatas(
             refs = None
             nb_files = None
 
-        parsed_entry = {
-            "dataset_repository": DatasetRepository.GPCRMD,
+        parsed_dataset = {
+            "dataset_repository_name": DatasetRepository.GPCRMD,
             "dataset_project": DatasetProject.GPCRMD,
-            "dataset_id_in_repository": entry_id,
-            "dataset_id_in_project": entry_id,
+            "dataset_id_in_repository": dataset_id,
+            "dataset_id_in_project": dataset_id,
             "dataset_url_in_repository": url,
             "dataset_url_in_project": url,
             "links": refs,
-            "title": entry.get("modelname"),
-            "date_created": entry.get("creation_timestamp"),
+            "title": dataset.get("modelname"),
+            "date_created": dataset.get("creation_timestamp"),
             "date_last_fetched": fetch_time,
             "nb_files": nb_files,
             "author_names": author_names if author_names is None else [author_names],
             "description": description,
-            "simulation_program_name": entry.get("mysoftware"),
-            "simulation_program_version": entry.get("software_version"),
+            "simulation_program_name": dataset.get("mysoftware"),
+            "simulation_program_version": dataset.get("software_version"),
             "nb_atoms": total_atoms,
             "molecule_names": molecules,
-            "forcefield_model_name": entry.get("forcefield"),
-            "forcefield_model_version": entry.get("forcefield_version"),
-            "timestep": entry.get("timestep"),
-            "delta": entry.get("delta"),
-            "simulation_time": stime
+            "forcefield_model_name": dataset.get("forcefield"),
+            "forcefield_model_version": dataset.get("forcefield_version"),
+            "timestep": dataset.get("timestep"),
+            "delta": dataset.get("delta"),
+            "simulation_time": stime_list
             }
 
         # Validate and normalize data collected with pydantic model
-        (dataset_model_entry,
-            non_validated_parsed_entry,
-        ) = validate_parsed_entry(parsed_entry, DatasetModel)
-        if isinstance(dataset_model_entry, DatasetModel):
-            validated_entries.append(dataset_model_entry)
-        if non_validated_parsed_entry:
-            non_validated_entries.append(non_validated_parsed_entry)
+        (parsed_dataset_model,
+            non_validated_parsed_dataset,
+        ) = validate_parsed_metadatas(parsed_dataset, DatasetMetadata)
+        # If it returns a DatasetMetadata object
+        if isinstance(parsed_dataset_model, DatasetMetadata):
+            # Validation succeeded
+            logger.debug(f"Parsed {i}/{len(datasets)} datasets")
+            validated_datasets.append(parsed_dataset_model)
+        # If not
+        if non_validated_parsed_dataset:
+            # Validation failed
+            logger.error(f"Validation failed for dataset `{dataset_id}` ({url})"
+                                ". Invalid field(s) detected : "
+                                f"{non_validated_parsed_dataset["non_validation_reason"]}"
+                    )
+            non_validated_datasets.append(non_validated_parsed_dataset)
 
     percentage = (
-        (len(validated_entries) / total_entries) * 100
-        if total_entries > 0
+        (len(validated_datasets) / total_datasets) * 100
+        if total_datasets > 0
         else 0.0
     )
     logger.success(
         f"Parsing completed: {percentage:.2f}% validated "
-        f"({len(validated_entries)}/{total_entries}) datasets successfully! \n"
+        f"({len(validated_datasets)}/{total_datasets}) datasets successfully! \n"
     )
-    return validated_entries, non_validated_entries
+    return validated_datasets, non_validated_datasets
 
 
-def make_base_parsed_entry(
-    entry_id: str,
+def make_base_parsed_file(
+    dataset_id: str,
     url: str,
     fetch_time: str,
 ) -> dict[str, Any]:
-    """Create a base parsed entry with empty file metadata.
+    """Create a base parsed file with empty file metadata.
 
     Parameters
     ----------
-    entry_id : str
-        The unique identifier of the GPCRMD entry.
+    dataset_id : str
+        The unique identifier of the GPCRMD dataset.
     url : str
-        The URL of the GPCRMD entry.
+        The URL of the GPCRMD dataset.
     fetch_time : str
         The timestamp indicating when the data was fetched.
 
     Returns
     -------
     dict[str, Any]
+        A dictionary representing the base parsed file with empty file metadata.
+        A dictionary representing the base parsed dataset with empty file metadata.
     """
     return {
-        "dataset_repository": DatasetRepository.GPCRMD,
-        "dataset_id_in_repository": entry_id,
+        "dataset_repository_name": DatasetRepository.GPCRMD,
+        "dataset_id_in_repository": dataset_id,
         "file_name": None,
         "file_type": None,
         "file_size_in_bytes": None,
@@ -504,45 +503,42 @@ def fetch_file_size(file_path: str) -> int | None:
 
 
 def fetch_and_validate_file_metadatas(
-    entries: list[dict],
+    datasets: list[dict],
     fetch_time: str,
-) -> tuple[list[FileModel], list[dict[str, Any]]]:
+) -> tuple[list[FileMetadata], list[dict[str, Any]]]:
     """Fetch and validate metadata for GPCRMD files.
 
     Parameters
     ----------
-    entries : list[dict]
-        List of file entries, each containing metadata such as 'dyn_id' and 'url'.
+    datasets : list[dict]
+        List of datasets, each containing metadata such as 'dyn_id' and 'url'.
     fetch_time : str
         Timestamp indicating when the data was fetched.
 
     Returns
     -------
-    tuple[list[DatasetModel], list[dict[str, Any]]]
-        - List of successfully validated `FileModel` objects.
-        - List of parsed entry that failed validation.
+    tuple[list[FileMetadata], list[dict[str, Any]]]
+        - List of successfully validated `FileMetadata` objects.
+        - List of parsed files that failed validation.
     """
     logger.info("Starting fetching and validating GPCRMD files...")
 
-    validated_files: list[FileModel] = []
+    validated_files: list[FileMetadata] = []
     non_validated_files: list[dict] = []
     total_files = 0
 
-    for entry in tqdm(
-        entries,
-        desc="Validating GPCRmd files",
-        colour="blue",
-        unit="file",
-    ):
-        entry_id = str(entry.get("dyn_id"))
-        url = entry.get("url")
+    for i, dataset in enumerate(datasets):
+        dataset_id = str(dataset.get("dyn_id"))
+        url = dataset.get("url")
+        files_parsed_for_dataset = 0
 
-        base_entry = make_base_parsed_entry(entry_id, url, fetch_time)
+        base_file = (make_base_parsed_file(dataset_id, url, fetch_time)
+                                                if url else {})
 
-        html = fetch_entry_page(url)
+        html = fetch_dataset_page(url) if url else None
         if html is None:
-            base_entry["non_validation_reason"] = "entry_page_fetch_failed"
-            non_validated_files.append(base_entry)
+            base_file["non_validation_reason"] = "dataset_page_fetch_failed"
+            non_validated_files.append(base_file)
             continue
 
         soup = BeautifulSoup(html, "html.parser")
@@ -569,8 +565,8 @@ def fetch_and_validate_file_metadatas(
                 file_name = os.path.basename(file_path)
                 file_type = os.path.splitext(file_name)[1].lstrip(".").lower()
 
-                parsed_entry = {
-                    **base_entry,
+                parsed_file = {
+                    **base_file,
                     "file_name": file_name,
                     "file_type": file_type,
                     "file_size_in_bytes": fetch_file_size(file_path),
@@ -578,13 +574,22 @@ def fetch_and_validate_file_metadatas(
                 }
 
                 # Validate and normalize data collected with pydantic model
-                (file_model_entry,
-                    non_validated_parsed_entry,
-                ) = validate_parsed_entry(parsed_entry, FileModel)
-                if isinstance(file_model_entry, FileModel):
-                    validated_files.append(file_model_entry)
-                if non_validated_parsed_entry:
-                    non_validated_files.append(non_validated_parsed_entry)
+                (parsed_file_model,
+                    non_validated_parsed_file,
+                ) = validate_parsed_metadatas(parsed_file, FileMetadata)
+                files_parsed_for_dataset += 1
+                if isinstance(parsed_file_model, FileMetadata):
+                    logger.debug(
+                        f"Parsed {files_parsed_for_dataset} file(s) for dataset "
+                        f"{i}/{len(datasets)}"
+                    )
+                    validated_files.append(parsed_file_model)
+                if non_validated_parsed_file:
+                    logger.error(f"Validation failed for file `{file_name}` "
+                                 f"({file_path}). Invalid field(s) detected : "
+                                f"{non_validated_parsed_file["non_validation_reason"]}"
+                    )
+                    non_validated_files.append(non_validated_parsed_file)
 
     percentage = (
         (len(validated_files) / total_files) * 100
@@ -601,7 +606,7 @@ def fetch_and_validate_file_metadatas(
 
 def save_metadatas_to_parquet(
     folder_out_path: Path,
-    metadatas_validated: list[DatasetModel] | list[FileModel],
+    metadatas_validated: list[DatasetMetadata] | list[FileMetadata],
     metadatas_unvalidated: list[dict],
     tag: str,
 ) -> None:
@@ -612,25 +617,25 @@ def save_metadatas_to_parquet(
     ----------
     folder_out_path : Path
         Folder path where Parquet files will be saved.
-    metadatas_validated : List[DatasetModel]
+    metadatas_validated : List[DatasetMetadata]
         List of validated metadatas.
     metadatas_unvalidated : List[Dict]
         List of unvalidated metadatas as dictionaries.
     tag: str
-        Tag to know if its entries or files metadata to save.
+        Tag to know if its datasets or files metadata to save.
     """
-    logger.info("Saving GPCRMD entries metadatas to a Parquet file...")
+    logger.info("Saving GPCRMD datasets metadatas to a Parquet file...")
     # Ensure output folder exists
     Path(folder_out_path).mkdir(parents=True, exist_ok=True)
 
-    # Save validated entries
-    if tag == "entries":
+    # Save validated datasets
+    if tag == "datasets":
         validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet")
     elif tag == "files":
         validated_path = os.path.join(folder_out_path, "gpcrmd_files.parquet")
     try:
         # Convert list of Pydantic models to list of dicts
-        validated_dicts = [entry.model_dump() for entry in metadatas_validated]
+        validated_dicts = [dataset.model_dump() for dataset in metadatas_validated]
         df_validated = pd.DataFrame(validated_dicts)
         df_validated.to_parquet(validated_path, index=False)
         logger.success(
@@ -639,8 +644,8 @@ def save_metadatas_to_parquet(
     except (ValueError, TypeError, OSError) as e:
         logger.error(f"Failed to save validated metadata to {validated_path}: {e}")
 
-    # Save unvalidated entries
-    if tag == "entries":
+    # Save unvalidated datasets
+    if tag == "datasets":
         unvalidated_path = os.path.join(
             folder_out_path, "not_validated_gpcrmd_datasets.parquet"
         )
@@ -656,7 +661,7 @@ def save_metadatas_to_parquet(
             f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!"
             )
         else:
-            logger.warning("There is no unvalidated entries to save!")
+            logger.warning("There is no unvalidated datasets to save!")
     except (ValueError, TypeError, OSError) as e:
         logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}")
 
@@ -665,7 +670,7 @@ def save_metadatas_to_parquet(
 @click.option(
     "--out-path",
     type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path),
-    default=Path(f"data/gpcrmd/{datetime.now().strftime('%Y%m%d_%H%M%S')}"),
+    default=Path("data/gpcrmd"),
     show_default=True,
     help="Folder path to save the scraped GPCRMD data (Dataset and File metadatas)"
 )
@@ -679,28 +684,28 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
     """
     setup_logger(logger, out_path)
     logger.info("Starting GPCRMD data scraping...")
-    start_time = time.time()
+    start_time = time.perf_counter()
 
-    # Fetch entries metadata
-    entries, fetch_time = fetch_entries_once()
-    if entries == []:
+    # Fetch datasets metadata
+    datasets, fetch_time = fetch_datasets_once()
+    if datasets == []:
         logger.warning("No data fetched from GPCRMD.")
         return
-    # Parse and validate  entry metadatas with a pydantic model (DatasetModel)
-    entries_validated, entries_unvalidated = (
-        parse_and_validate_entry_metadatas(entries, fetch_time)
+    # Parse and validate dataset metadatas with a pydantic model (DatasetMetadata)
+    datasets_validated, datasets_unvalidated = (
+        parse_and_validate_dataset_metadatas(datasets, fetch_time)
     )
     # Save parsed metadata to local file
     save_metadatas_to_parquet(
         out_path,
-        entries_validated,
-        entries_unvalidated,
-        tag="entries"
+        datasets_validated,
+        datasets_unvalidated,
+        tag="datasets"
     )
 
     # Fetch, parse and validate the file metadatas with a pydantic model (File Model)
     files_metadata_validated, files_metadata_unvalidated = (
-        fetch_and_validate_file_metadatas(entries, fetch_time)
+        fetch_and_validate_file_metadatas(datasets, fetch_time)
     )
     save_metadatas_to_parquet(
         out_path,
@@ -709,16 +714,8 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
         tag="files"
     )
 
-    # Compute the elapsed time for scrapping
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    hours = int(elapsed_time // 3600)
-    minutes = int((elapsed_time % 3600) // 60)
-    seconds = int(elapsed_time % 60)
-
-    logger.success(
-        f"Completed GPCRMD data scraping in {hours} h {minutes} min {seconds} sec 🎉"
-    )
+    elapsed_time = int(time.perf_counter() - start_time)
+    logger.success(f"Scraping duration: {timedelta(seconds=elapsed_time)} 🎉")
 
 
 if __name__ == "__main__":

From 846e47390bec7f7eb1aa302eb99a513ed2967b35 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Fri, 9 Jan 2026 18:39:28 +0100
Subject: [PATCH 6/9] Remove unnecessary timeout parameter.

---
 scripts/scrap_gpcrmd.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index 4d68031..aa6f4df 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -169,7 +169,7 @@ def fetch_dataset_page(url: str) -> str | None:
     return response.text
 
 
-def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | None:
+def retrieve_metadata(html: str, field_name: str) -> str | None:
     """
     Retrieve a specific metadata field from a webpage.
 
@@ -179,8 +179,6 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No
         The HTML content of the page.
     field_name : str
         The name of the metadata field to extract (case-sensitive).
-    timeout : int, optional
-        Timeout in seconds for the HTTP request (default is 10).
 
     Returns
     -------
@@ -207,7 +205,7 @@ def retrieve_metadata(html: str, field_name: str, timeout: int = 50) -> str | No
     return None
 
 
-def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
+def retrieve_reference_links(html: str) -> list[str] | None:
     """
     Retrieve reference URLs from the References section of a GPCRMD dataset page.
 
@@ -215,8 +213,6 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
     ----------
     html : str
         The HTML content of the page.
-    timeout : int, optional
-        Timeout in seconds for the HTTP request (default is 10).
 
     Returns
     -------
@@ -253,7 +249,7 @@ def retrieve_reference_links(html: str, timeout: int = 50) -> list[str] | None:
     return None
 
 
-def count_simulation_files(html: str, timeout: int = 50) -> int | None:
+def count_simulation_files(html: str) -> int | None:
     """
     Count files in the dataset webpage.
 

From 7a3831810bf4fb0d62bc7b7df1a41619f999e7a5 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Fri, 9 Jan 2026 21:18:49 +0100
Subject: [PATCH 7/9] Stop saving unvalidated metadata + Improve logging messages.

---
 scripts/scrap_gpcrmd.py | 152 +++++++++++++++++++---------------------
 1 file changed, 72 insertions(+), 80 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index aa6f4df..f0b0fb9 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -12,11 +12,6 @@
 - "data/gpcrmd/gpcrmd_datasets.parquet"
 - "data/gpcrmd/gpcrmd_files.parquet"
 
-Datasets that fail validation are saved as:
-- "data/gpcrmd/not_validated_gpcrmd_datasets.parquet"
-- "data/gpcrmd/not_validated_gpcrmd_files.parquet"
-
-
 Usage:
 ======
     uv run -m scripts.scrap_gpcrmd [--out-path]
@@ -34,11 +29,9 @@
 This command will:
     1. Fetch all available datasets from GPCRMD.
     2. Parse their metadata and validate them using the Pydantic models
-    `DatasetMetadata` and `File Model`.
-    3. Save both the validated and unvalidated dataset datasets to
-       "data/gpcrmd/gpcrmd_datasets.parquet" and
-       "data/gpcrmd/not_validated_gpcrmd_datasets.parquet"
-    4. Save file metadata similarly for validated and unvalidated files.
+    `DatasetMetadata` and `FileMetadata`.
+    3. Save the validated dataset metadata to "data/gpcrmd/gpcrmd_datasets.parquet".
+    4. Save the validated file metadata to "data/gpcrmd/gpcrmd_files.parquet".
 """
 
 # METADATAS
@@ -297,7 +290,7 @@ def count_links(container_id: str) -> int:
 def validate_parsed_metadatas(
     parsed: dict[str, Any],
     out_model: type[FileMetadata | DatasetMetadata]
-) -> tuple[FileMetadata | DatasetMetadata | None, dict[str, Any] | None]:
+) -> tuple[FileMetadata | DatasetMetadata | None, str | None]:
     """Validate a parsed dataset using the pydantic model.
 
     Parameters
@@ -309,10 +302,9 @@ def validate_parsed_metadatas(
 
     Returns
     -------
-    tuple[FileMetadata | DatasetMetadata | None,  dict[str, Any] | None]
+    tuple[FileMetadata | DatasetMetadata | None, str | None]
         A tuple containing the validated model instance if validation succeeds,
-        otherwise None, and the enriched parsed dataset containing validation
-        failure reasons if validation fails.
+        otherwise None, and the validation failure reasons if validation fails.
     """
     try:
         return out_model(**parsed), None
@@ -326,14 +318,14 @@ def validate_parsed_metadatas(
 
             reasons.append(f"{field}: {reason} (input={value!r})")
 
-        parsed["non_validation_reason"] = "; ".join(reasons)
-        return None, parsed
+        non_validation_reason = "; ".join(reasons)
+        return None, non_validation_reason
 
 
 def parse_and_validate_dataset_metadatas(
     datasets: list[dict[str, Any]],
     fetch_time: str
-) -> tuple[list[DatasetMetadata], list[dict[str, Any]]]:
+) -> list[DatasetMetadata]:
     """
     Parse and validate metadata fields for a list of GPCRMD datasets.
 
@@ -346,16 +338,14 @@ def parse_and_validate_dataset_metadatas(
 
     Returns
     -------
-    tuple[list[DatasetMetadata], list[dict[str, Any]]]
-        - List of successfully validated `DatasetMetadata` objects.
-        - List of parsed dataset that failed validation.
+    list[DatasetMetadata]
+        List of successfully validated `DatasetMetadata` objects.
     """
     logger.info("Starting parsing and validating GPCRMD datasets...")
     validated_datasets: list[DatasetMetadata] = []
-    non_validated_datasets: list[dict[str, Any]] = []
     total_datasets: int = len(datasets)
 
-    for i, dataset in enumerate(datasets):
+    for i, dataset in enumerate(datasets, start=1):
         dataset_id = str(dataset.get("dyn_id"))
 
         # Extract molecules and number total of atoms if available
@@ -375,6 +365,8 @@ def parse_and_validate_dataset_metadatas(
             refs: list[str] | None = retrieve_reference_links(html)
             nb_files: int | None = count_simulation_files(html)
         else:
+            logger.warning(f"Dataset `{dataset_id}` ({url}): "
+                           "page HTML missing; web metadata extraction skipped.")
             author_names = None
             description = None
             stime = None
@@ -408,21 +400,19 @@ def parse_and_validate_dataset_metadatas(
 
         # Validate and normalize data collected with pydantic model
         (parsed_dataset_model,
-            non_validated_parsed_dataset,
+            non_validation_reason,
         ) = validate_parsed_metadatas(parsed_dataset, DatasetMetadata)
         # If it return a DatasetMetadata object
         if isinstance(parsed_dataset_model, DatasetMetadata):
             # Validation succeed
-            logger.debug(f"Parsed {i}/{len(datasets)} datasets")
+            logger.debug(f"Parsed dataset id `{dataset_id}` ({i}/{len(datasets)})")
             validated_datasets.append(parsed_dataset_model)
-        # If not
-        if non_validated_parsed_dataset:
+        else:
             # Validation failed
             logger.error(f"Validation failed for dataset `{dataset_id}` ({url})"
                                 ". Invalid field(s) detected : "
-                                f"{non_validated_parsed_dataset["non_validation_reason"]}"
+                                f"{non_validation_reason}"
                     )
-            non_validated_datasets.append(non_validated_parsed_dataset)
 
     percentage = (
         (len(validated_datasets) / total_datasets) * 100
@@ -433,7 +423,7 @@ def parse_and_validate_dataset_metadatas(
         f"Parsing completed: {percentage:.2f}% validated "
         f"({len(validated_datasets)}/{total_datasets}) datasets successfully! \n"
     )
-    return validated_datasets, non_validated_datasets
+    return validated_datasets
 
 
 def make_base_parsed_file(
@@ -501,7 +491,7 @@ def fetch_file_size(file_path: str) -> int | None:
 def fetch_and_validate_file_metadatas(
     datasets: list[dict],
     fetch_time: str,
-) -> tuple[list[FileMetadata], list[dict[str, Any]]]:
+) -> list[FileMetadata]:
     """Fetch and validate metadata for GPCRMD files.
 
     Parameters
@@ -513,28 +503,33 @@ def fetch_and_validate_file_metadatas(
 
     Returns
     -------
-    tuple[list[DatasetMetadata], list[dict[str, Any]]]
-        - List of successfully validated `FileMetadata` objects.
-        - List of parsed dataset that failed validation.
+    list[FileMetadata]
+        List of successfully validated `FileMetadata` objects.
     """
     logger.info("Starting fetching and validating GPCRMD files...")
 
     validated_files: list[FileMetadata] = []
-    non_validated_files: list[dict] = []
     total_files = 0
+    non_validated_files_count = 0
 
-    for i, dataset in enumerate(datasets):
+    for i, dataset in enumerate(datasets, start=1):
         dataset_id = str(dataset.get("dyn_id"))
         url = dataset.get("url")
-        files_parsed_for_dataset = 0
+        count_files_parsed_for_dataset = 0
+
+        if not url:
+            logger.error(
+                    f"Dataset `{dataset_id}` skipped: missing dataset URL."
+                )
+            continue
 
-        base_file = (make_base_parsed_file(dataset_id, url, fetch_time)
-                                                if url else {})
+        base_file = make_base_parsed_file(dataset_id, url, fetch_time)
 
         html = fetch_dataset_page(url) if url else None
         if html is None:
-            base_file["non_validation_reason"] = "dataset_page_fetch_failed"
-            non_validated_files.append(base_file)
+            logger.error(
+                f"Dataset `{dataset_id}` ({url}) skipped: page retrieval failed."
+            )
             continue
 
         soup = BeautifulSoup(html, "html.parser")
@@ -543,6 +538,21 @@ def fetch_and_validate_file_metadatas(
             container = soup.find("div", id=sec_id)
             # Ensure container is a Tag
             if not isinstance(container, Tag):
+                if sec_id == "allfiles":
+                    # allfiles mandatory
+                    logger.warning(
+                        f"Dataset id `{dataset_id}` ({url}):"
+                        f"mandatory section `{sec_id}` is missing or invalid. "
+                        "Files required for simulation parsing cannot be retrieved."
+                    )
+                else:
+                    # paramfiles optional
+                    # logger.warning(
+                    #     f"Dataset id `{dataset_id}` ({url}): "
+                    #     f"optional section `{sec_id}` not found. "
+                    #     "Parameter files for simulations will be skipped."
+                    # )
+                    pass
                 continue
 
             links = container.find_all("a", href=True)
@@ -551,11 +561,19 @@ def fetch_and_validate_file_metadatas(
             for link in links:
                 # Ensure link is a Tag to safely access ['href']
                 if not isinstance(link, Tag):
+                    logger.warning(
+                        f"Dataset `{dataset_id}` ({url}): "
+                        "encountered non-HTML link element."
+                    )
                     continue
 
                 # Use .get() to safely retrieve the href, then convert to str
                 href_value = str(link.get("href", "")).strip()
                 if not href_value:
+                    logger.warning(
+                        f"Dataset `{dataset_id}` ({url}): "
+                        "file link without href attribute."
+                    )
                     continue
                 file_path = f"https://www.gpcrmd.org/{href_value}"
                 file_name = os.path.basename(file_path)
@@ -571,21 +589,21 @@ def fetch_and_validate_file_metadatas(
 
                 # Validate and normalize data collected with pydantic model
                 (parsed_file_model,
-                    non_validated_parsed_file,
+                    non_validation_reason,
                 ) = validate_parsed_metadatas(parsed_file, FileMetadata)
-                files_parsed_for_dataset += 1
+                count_files_parsed_for_dataset += 1
                 if isinstance(parsed_file_model, FileMetadata):
                     logger.debug(
-                        f"Parsed {files_parsed_for_dataset} file(s) for dataset "
-                        f"{i}/{len(datasets)}"
+                        f"Parsed file `{file_name}` from dataset "
+                        f"`{dataset_id}` ({i}/{len(datasets)})"
                     )
                     validated_files.append(parsed_file_model)
-                if non_validated_parsed_file:
+                else:
                     logger.error(f"Validation failed for file `{file_name}` "
                                  f"({file_path}). Invalid field(s) detected : "
-                                f"{non_validated_parsed_file["non_validation_reason"]}"
+                                f"{non_validation_reason}"
                     )
-                    non_validated_files.append(non_validated_parsed_file)
+                    non_validated_files_count += 1
 
     percentage = (
         (len(validated_files) / total_files) * 100
@@ -594,16 +612,15 @@ def fetch_and_validate_file_metadatas(
     )
     logger.success(
         f"Parsing completed: {percentage:.2f}% validated "
-        f"({len(validated_files) - len(non_validated_files)}/"
+        f"({len(validated_files) - non_validated_files_count}/"
         f"{total_files}) files successfully! \n"
     )
-    return validated_files, non_validated_files
+    return validated_files
 
 
 def save_metadatas_to_parquet(
     folder_out_path: Path,
     metadatas_validated: list[DatasetMetadata] | list[FileMetadata],
-    metadatas_unvalidated: list[dict],
     tag: str,
 ) -> None:
     """
@@ -615,16 +632,14 @@ def save_metadatas_to_parquet(
         Folder path where Parquet files will be saved.
     metadatas_validated : List[DatasetMetadata]
         List of validated metadatas.
-    metadatas_unvalidated : List[Dict]
-        List of unvalidated metadatas as dictionaries.
     tag: str
         Tag to know if its datasets or files metadata to save.
     """
-    logger.info("Saving GPCRMD datasets metadatas to a Parquet file...")
+    logger.info(f"Saving GPCRMD {tag} metadatas to a Parquet file...")
     # Ensure output folder exists
     Path(folder_out_path).mkdir(parents=True, exist_ok=True)
 
-    # Save validated datasets
+    # Save validated datasets and files
     if tag == "datasets":
         validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet")
     elif tag == "files":
@@ -638,28 +653,7 @@ def save_metadatas_to_parquet(
             f"GPCRMD validated metadatas saved to: {validated_path} successfully!"
         )
     except (ValueError, TypeError, OSError) as e:
-        logger.error(f"Failed to save validated metadata to {validated_path}: {e}")
-
-    # Save unvalidated datasets
-    if tag == "datasets":
-        unvalidated_path = os.path.join(
-            folder_out_path, "not_validated_gpcrmd_datasets.parquet"
-        )
-    elif tag == "files":
-        unvalidated_path = os.path.join(
-            folder_out_path, "not_validated_gpcrmd_files.parquet"
-        )
-    try:
-        if len(metadatas_unvalidated) != 0:
-            df_unvalidated = pd.DataFrame(metadatas_unvalidated)
-            df_unvalidated.to_parquet(unvalidated_path, index=False)
-            logger.success(
-            f"GPCRMD unvalidated metadatas saved to: {unvalidated_path} successfully!"
-            )
-        else:
-            logger.warning("There is no unvalidated datasets to save!")
-    except (ValueError, TypeError, OSError) as e:
-        logger.error(f"Failed to save unvalidated metadata to {unvalidated_path}: {e}")
+        logger.error(f"Failed to save validated {tag} to {validated_path}: {e}")
 
 
 @click.command()
@@ -688,25 +682,23 @@ def scrap_gpcrmd_data(out_path: Path) -> None:
         logger.warning("No data fetched from GPCRMD.")
         return
     # Parse and validate dataset metadatas with a pydantic model (DatasetMetadata)
-    datasets_validated, datasets_unvalidated = (
+    datasets_validated = (
         parse_and_validate_dataset_metadatas(datasets, fetch_time)
     )
     # Save parsed metadata to local file
     save_metadatas_to_parquet(
         out_path,
         datasets_validated,
-        datasets_unvalidated,
         tag="datasets"
     )
 
     # Fetch, parse and validate the file metadatas with a pydantic model (File Model)
-    files_metadata_validated, files_metadata_unvalidated = (
+    files_metadata_validated = (
         fetch_and_validate_file_metadatas(datasets, fetch_time)
     )
     save_metadatas_to_parquet(
         out_path,
         files_metadata_validated,
-        files_metadata_unvalidated,
         tag="files"
     )
 

From bacdadd38166fa190667327ec5f8494deaab8266 Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Fri, 9 Jan 2026 21:24:06 +0100
Subject: [PATCH 8/9] Fix Ruff C901 by extracting dataset section parsing
 logic.

---
 scripts/scrap_gpcrmd.py | 64 +++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/scripts/scrap_gpcrmd.py b/scripts/scrap_gpcrmd.py
index f0b0fb9..788b1a3 100644
--- a/scripts/scrap_gpcrmd.py
+++ b/scripts/scrap_gpcrmd.py
@@ -488,6 +488,50 @@ def fetch_file_size(file_path: str) -> int | None:
     return None
 
 
+def get_section_container(
+    soup: BeautifulSoup,
+    sec_id: str,
+    dataset_id: str,
+    url: str,
+) -> Tag | None:
+    """Retrieve a valid HTML section container from a dataset page.
+
+    The function looks for a ``<div>`` element with the given section ID.
+    If the section is mandatory (``allfiles``) and missing or invalid, a
+    warning is logged. Optional sections return ``None`` silently.
+
+    Parameters
+    ----------
+    soup : BeautifulSoup
+        Parsed HTML content of the dataset page.
+    sec_id : str
+        Identifier of the HTML section to retrieve (e.g. ``allfiles``,
+        ``paramfiles``).
+    dataset_id : str
+        Identifier of the dataset, used for logging purposes.
+    url : str
+        URL of the dataset page, used for logging purposes.
+
+    Returns
+    -------
+    Tag | None
+        The HTML ``Tag`` corresponding to the requested section if found
+        and valid, otherwise ``None``.
+    """
+    container = soup.find("div", id=sec_id)
+    if isinstance(container, Tag):
+        return container
+
+    if sec_id == "allfiles":
+        logger.warning(
+            f"Dataset id `{dataset_id}` ({url}): "
+            f"mandatory section `{sec_id}` is missing or invalid. "
+            "Files required for simulation parsing cannot be retrieved."
+        )
+
+    return None
+
+
 def fetch_and_validate_file_metadatas(
     datasets: list[dict],
     fetch_time: str,
@@ -535,24 +579,8 @@ def fetch_and_validate_file_metadatas(
         soup = BeautifulSoup(html, "html.parser")
 
         for sec_id in ("allfiles", "paramfiles"):
-            container = soup.find("div", id=sec_id)
-            # Ensure container is a Tag
-            if not isinstance(container, Tag):
-                if sec_id == "allfiles":
-                    # allfiles mandatory
-                    logger.warning(
-                        f"Dataset id `{dataset_id}` ({url}):"
-                        f"mandatory section `{sec_id}` is missing or invalid. "
-                        "Files required for simulation parsing cannot be retrieved."
-                    )
-                else:
-                    # paramfiles optional
-                    # logger.warning(
-                    #     f"Dataset id `{dataset_id}` ({url}): "
-                    #     f"optional section `{sec_id}` not found. "
-                    #     "Parameter files for simulations will be skipped."
-                    # )
-                    pass
+            container = get_section_container(soup, sec_id, dataset_id, url)
+            if container is None:
                 continue
 
             links = container.find_all("a", href=True)

From 941d72060acffeb52440e6f120f88661722cbe6f Mon Sep 17 00:00:00 2001
From: essmaw <essmay.touami@ens.uvsq.fr>
Date: Fri, 9 Jan 2026 21:38:59 +0100
Subject: [PATCH 9/9] Add scrap gpcrmd into readme

---
 README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.md b/README.md
index e8da25b..6b482a0 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,22 @@ This command (takes usually less than 6 minutes) will:
     4. Save file metadata similarly for validated and unvalidated files.
 
 
+## Scrap GPCRmd
+
+Scrape GPCRmd to collect molecular dynamics (MD) datasets and files related to G-protein-coupled receptors (GPCRs), a major family of membrane proteins and common drug targets.
+
+```bash
+uv run -m scripts.scrap_gpcrmd
+```
+
+This command will:
+   1. Fetch all available datasets from GPCRmd.
+   2. Parse their metadata and validate them using the Pydantic models
+      `DatasetMetadata` and `FileMetadata`.
+   3. Save validated dataset metadata to `data/gpcrmd/gpcrmd_datasets.parquet`.
+   4. Save validated file metadata to `data/gpcrmd/gpcrmd_files.parquet`.
+
+
 ## Analyze Gromacs mdp and gro files
 
 ### Download files