+ # Find the section <div> container by ID
+ container = soup.find("div", id=container_id)
+ # Ensure the container is actually a Tag
+ if not isinstance(container, Tag):
+ return 0
+
+ # Collect all hrefs in <a> tags, stripping whitespace
+ links = [
+ str(a.get("href", "")).strip()
+ for a in container.find_all("a", href=True)
+ if isinstance(a, Tag) and str(a.get("href", "")).strip()
+ ]
+
+ # Remove duplicates while preserving order
+ return len(dict.fromkeys(links))
+
+ output_files_count = count_links("allfiles")
+ protocol_files_count = count_links("paramfiles")
+ return output_files_count + protocol_files_count
+ return None
+
+
+def validate_parsed_metadatas(
+ parsed: dict[str, Any],
+ out_model: type[FileMetadata | DatasetMetadata]
+) -> tuple[FileMetadata | DatasetMetadata | None, str | None]:
+ """Validate a parsed dataset or file using a Pydantic model.
+
+ Parameters
+ ----------
+ parsed : dict[str, Any]
+ The parsed dataset or file to validate.
+ out_model : type[FileMetadata | DatasetMetadata]
+ The Pydantic model class used for the validation.
+
+ Returns
+ -------
+ tuple[FileMetadata | DatasetMetadata | None, str | None]
+ A tuple containing the validated model instance if validation succeeds,
+ otherwise None, and the validation failure reasons if validation fails.
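+
+ Examples
+ --------
+ A minimal sketch with made-up field values; the fields actually required
+ depend on the ``FileMetadata`` and ``DatasetMetadata`` model definitions:
+
+ >>> model, reason = validate_parsed_metadatas({"file_name": "traj.xtc"}, FileMetadata)
+
+ If validation fails, ``model`` is ``None`` and ``reason`` lists the
+ offending fields.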
+ """
+ try:
+ return out_model(**parsed), None
+ except ValidationError as exc:
+ reasons: list[str] = []
+
+ for err in exc.errors():
+ field = ".".join(str(x) for x in err["loc"])
+ reason = err["msg"]
+ value = err.get("input")
+
+ reasons.append(f"{field}: {reason} (input={value!r})")
+
+ non_validation_reason = "; ".join(reasons)
+ return None, non_validation_reason
+
+
+def parse_and_validate_dataset_metadatas(
+ datasets: list[dict[str, Any]],
+ fetch_time: str
+) -> list[DatasetMetadata]:
+ """
+ Parse and validate metadata fields for a list of GPCRMD datasets.
+
+ Parameters
+ ----------
+ datasets : list of dict
+ List of dictionaries, each representing the metadata of a GPCRMD dataset.
+ fetch_time : str
+ Timestamp (as a string) indicating when the data was fetched.
+
+ Returns
+ -------
+ list[DatasetMetadata]
+ List of successfully validated `DatasetMetadata` objects.
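+
+ Examples
+ --------
+ A minimal sketch with an illustrative payload (the keys mirror the GPCRMD
+ API fields used below; the values are made up and a network request is
+ performed for each dataset URL):
+
+ >>> raw = [{"dyn_id": 42, "url": "https://www.gpcrmd.org/"}]
+ >>> validated = parse_and_validate_dataset_metadatas(raw, "2024-01-01T00:00:00")  # doctest: +SKIP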
+ """
+ logger.info("Starting parsing and validating GPCRMD datasets...")
+ validated_datasets: list[DatasetMetadata] = []
+ total_datasets: int = len(datasets)
+
+ for i, dataset in enumerate(datasets, start=1):
+ dataset_id = str(dataset.get("dyn_id"))
+
+ # Extract molecule names and total number of atoms if available
+ total_atoms: int | None = dataset.get("atom_num")
+ dyncomp: list[dict[str, Any]] = dataset.get("dyncomp", [])
+ molecules: list[str] = (
+ [comp.get("resname") for comp in dyncomp if comp.get("resname")]
+ )
+ url = dataset.get("url")
+ # Fetch the dataset page from its URL
+ html = fetch_dataset_page(url) if url else None
+ if html:
+ author_names: str | None = retrieve_metadata(html, "Submitted by")
+ description: str | None = retrieve_metadata(html, "Description")
+ stime: str | None = retrieve_metadata(html, "Accumulated simulation time")
+ stime_list: list[str] | None = [stime] if stime is not None else None
+ refs: list[str] | None = retrieve_reference_links(html)
+ nb_files: int | None = count_simulation_files(html)
+ else:
+ logger.warning(f"Dataset `{dataset_id}` ({url}): "
+ "page HTML missing; web metadata extraction skipped.")
+ author_names = None
+ description = None
+ stime = None
+ refs = None
+ nb_files = None
+
+ parsed_dataset = {
+ "dataset_repository_name": DatasetRepository.GPCRMD,
+ "dataset_project": DatasetProject.GPCRMD,
+ "dataset_id_in_repository": dataset_id,
+ "dataset_id_in_project": dataset_id,
+ "dataset_url_in_repository": url,
+ "dataset_url_in_project": url,
+ "links": refs,
+ "title": dataset.get("modelname"),
+ "date_created": dataset.get("creation_timestamp"),
+ "date_last_fetched": fetch_time,
+ "nb_files": nb_files,
+ "author_names": [author_names] if author_names is not None else None,
+ "description": description,
+ "simulation_program_name": dataset.get("mysoftware"),
+ "simulation_program_version": dataset.get("software_version"),
+ "nb_atoms": total_atoms,
+ "molecule_names": molecules,
+ "forcefield_model_name": dataset.get("forcefield"),
+ "forcefield_model_version": dataset.get("forcefield_version"),
+ "timestep": dataset.get("timestep"),
+ "delta": dataset.get("delta"),
+ "simulation_time": stime_list
+ }
+
+ # Validate and normalize the collected data with the Pydantic model
+ (parsed_dataset_model,
+ non_validation_reason,
+ ) = validate_parsed_metadatas(parsed_dataset, DatasetMetadata)
+ # If validation returned a DatasetMetadata object
+ if isinstance(parsed_dataset_model, DatasetMetadata):
+ # Validation succeeded
+ logger.debug(f"Parsed dataset id `{dataset_id}` ({i}/{len(datasets)})")
+ validated_datasets.append(parsed_dataset_model)
+ else:
+ # Validation failed
+ logger.error(f"Validation failed for dataset `{dataset_id}` ({url}). "
+ f"Invalid field(s) detected: {non_validation_reason}"
+ )
+
+ percentage = (
+ (len(validated_datasets) / total_datasets) * 100
+ if total_datasets > 0
+ else 0.0
+ )
+ logger.success(
+ f"Parsing completed: {len(validated_datasets)}/{total_datasets} datasets "
+ f"validated successfully ({percentage:.2f}%)!\n"
+ )
+ return validated_datasets
+
+
+def make_base_parsed_file(
+ dataset_id: str,
+ url: str,
+ fetch_time: str,
+) -> dict[str, Any]:
+ """Create a base parsed dataset with empty file metadata.
+
+ Parameters
+ ----------
+ dataset_id : str
+ The unique identifier of the GPCRMD dataset.
+ url : str
+ The URL of the GPCRMD dataset.
+ fetch_time : str
+ The timestamp indicating when the data was fetched.
+
+ Returns
+ -------
+ dict[str, Any]
+ A dictionary representing the base parsed dataset with empty file metadata.
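+
+ Examples
+ --------
+ Illustrative values only:
+
+ >>> base = make_base_parsed_file("42", "https://www.gpcrmd.org/", "2024-01-01T00:00:00")
+ >>> base["file_name"] is None
+ True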
+ """
+ return {
+ "dataset_repository_name": DatasetRepository.GPCRMD,
+ "dataset_id_in_repository": dataset_id,
+ "file_name": None,
+ "file_type": None,
+ "file_size_in_bytes": None,
+ "file_url_in_repository": url,
+ "date_last_fetched": fetch_time,
+ }
+
+
+def fetch_file_size(file_path: str) -> int | None:
+ """Fetch file size using a HEAD request.
+
+ Parameters
+ ----------
+ file_path : str
+ The URL of the file to fetch the size for.
+
+ Returns
+ -------
+ int | None
+ The size of the file in bytes if available, otherwise None.
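+
+ Examples
+ --------
+ Illustrative only; this sends a real HEAD request to the given URL:
+
+ >>> size = fetch_file_size("https://www.gpcrmd.org/")  # doctest: +SKIP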
+ """
+ try:
+ response = httpx.head(file_path, timeout=50, follow_redirects=True)
+ response.raise_for_status()
+ # Sleep briefly to avoid overwhelming the remote server
+ time.sleep(0.1)
+ content_length = response.headers.get("Content-Length")
+ return int(content_length) if content_length else None
+ except httpx.HTTPStatusError as exc:
+ logger.warning(
+ f"HTTP error {exc.response.status_code} for {file_path}"
+ )
+ except httpx.RequestError as exc:
+ logger.warning(f"Failed to fetch file size for {file_path}: {exc}")
+
+ return None
+
+
+def get_section_container(
+ soup: BeautifulSoup,
+ sec_id: str,
+ dataset_id: str,
+ url: str,
+) -> Tag | None:
+ """Retrieve a valid HTML section container from a dataset page.
+
+ The function looks for a ``<div>`` element with the given section ID.
+ If the section is mandatory (``allfiles``) and missing or invalid, a
+ warning is logged. Optional sections return ``None`` silently.
+
+ Parameters
+ ----------
+ soup : BeautifulSoup
+ Parsed HTML content of the dataset page.
+ sec_id : str
+ Identifier of the HTML section to retrieve (e.g. ``allfiles``,
+ ``paramfiles``).
+ dataset_id : str
+ Identifier of the dataset, used for logging purposes.
+ url : str
+ URL of the dataset page, used for logging purposes.
+
+ Returns
+ -------
+ Tag | None
+ The HTML ``Tag`` corresponding to the requested section if found
+ and valid, otherwise ``None``.
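+
+ Examples
+ --------
+ A minimal sketch on hand-written HTML (the ``allfiles`` ID mirrors the
+ GPCRMD page layout assumed by this module):
+
+ >>> soup = BeautifulSoup('<div id="allfiles"><a href="f.pdb">f</a></div>', "html.parser")
+ >>> get_section_container(soup, "allfiles", "42", "https://www.gpcrmd.org/") is not None
+ True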
+ """
+ container = soup.find("div", id=sec_id)
+ if isinstance(container, Tag):
+ return container
+
+ if sec_id == "allfiles":
+ logger.warning(
+ f"Dataset id `{dataset_id}` ({url}): "
+ f"mandatory section `{sec_id}` is missing or invalid. "
+ "Files required for simulation parsing cannot be retrieved."
+ )
+
+ return None
+
+
+def fetch_and_validate_file_metadatas(
+ datasets: list[dict],
+ fetch_time: str,
+) -> list[FileMetadata]:
+ """Fetch and validate metadata for GPCRMD files.
+
+ Parameters
+ ----------
+ datasets : list[dict]
+ List of dataset dictionaries, each containing metadata such as 'dyn_id' and 'url'.
+ fetch_time : str
+ Timestamp indicating when the data was fetched.
+
+ Returns
+ -------
+ list[FileMetadata]
+ List of successfully validated `FileMetadata` objects.
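+
+ Examples
+ --------
+ Illustrative call with a made-up dataset entry; each dataset page and every
+ file link found on it is fetched over the network:
+
+ >>> files = fetch_and_validate_file_metadatas(
+ ... [{"dyn_id": 42, "url": "https://www.gpcrmd.org/"}], "2024-01-01T00:00:00"
+ ... )  # doctest: +SKIP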
+ """
+ logger.info("Starting fetching and validating GPCRMD files...")
+
+ validated_files: list[FileMetadata] = []
+ total_files = 0
+ non_validated_files_count = 0
+
+ for i, dataset in enumerate(datasets, start=1):
+ dataset_id = str(dataset.get("dyn_id"))
+ url = dataset.get("url")
+ count_files_parsed_for_dataset = 0
+
+ if not url:
+ logger.error(
+ f"Dataset `{dataset_id}` skipped: missing dataset URL."
+ )
+ continue
+
+ base_file = make_base_parsed_file(dataset_id, url, fetch_time)
+
+ html = fetch_dataset_page(url)
+ if html is None:
+ logger.error(
+ f"Dataset `{dataset_id}` ({url}) skipped: page retrieval failed."
+ )
+ continue
+
+ soup = BeautifulSoup(html, "html.parser")
+
+ for sec_id in ("allfiles", "paramfiles"):
+ container = get_section_container(soup, sec_id, dataset_id, url)
+ if container is None:
+ continue
+
+ links = container.find_all("a", href=True)
+ total_files += len(links)
+
+ for link in links:
+ # Ensure link is a Tag to safely access ['href']
+ if not isinstance(link, Tag):
+ logger.warning(
+ f"Dataset `{dataset_id}` ({url}): "
+ "encountered non-HTML link element."
+ )
+ continue
+
+ # Use .get() to safely retrieve the href, then convert to str
+ href_value = str(link.get("href", "")).strip()
+ if not href_value:
+ logger.warning(
+ f"Dataset `{dataset_id}` ({url}): "
+ "file link without href attribute."
+ )
+ continue
+ file_path = f"https://www.gpcrmd.org/{href_value}"
+ file_name = os.path.basename(file_path)
+ file_type = os.path.splitext(file_name)[1].lstrip(".").lower()
+
+ parsed_file = {
+ **base_file,
+ "file_name": file_name,
+ "file_type": file_type,
+ "file_size_in_bytes": fetch_file_size(file_path),
+ "file_url_in_repository": file_path,
+ }
+
+ # Validate and normalize the collected data with the Pydantic model
+ (parsed_file_model,
+ non_validation_reason,
+ ) = validate_parsed_metadatas(parsed_file, FileMetadata)
+ count_files_parsed_for_dataset += 1
+ if isinstance(parsed_file_model, FileMetadata):
+ logger.debug(
+ f"Parsed file `{file_name}` from dataset "
+ f"`{dataset_id}` ({i}/{len(datasets)})"
+ )
+ validated_files.append(parsed_file_model)
+ else:
+ logger.error(f"Validation failed for file `{file_name}` "
+ f"({file_path}). Invalid field(s) detected: "
+ f"{non_validation_reason}"
+ )
+ non_validated_files_count += 1
+
+ percentage = (
+ (len(validated_files) / total_files) * 100
+ if total_files > 0
+ else 0.0
+ )
+ logger.success(
+ f"Parsing completed: {len(validated_files)}/{total_files} files "
+ f"validated successfully ({percentage:.2f}%), "
+ f"{non_validated_files_count} failed validation.\n"
+ )
+ return validated_files
+
+
+def save_metadatas_to_parquet(
+ folder_out_path: Path,
+ metadatas_validated: list[DatasetMetadata] | list[FileMetadata],
+ tag: str,
+) -> None:
+ """
+ Save GPCRMD validated metadata to a Parquet file.
+
+ Parameters
+ ----------
+ folder_out_path : Path
+ Folder path where Parquet files will be saved.
+ metadatas_validated : list[DatasetMetadata] | list[FileMetadata]
+ List of validated metadata objects.
+ tag : str
+ Either "datasets" or "files", indicating which kind of metadata to save.
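+
+ Examples
+ --------
+ Illustrative call, assuming ``validated_datasets`` holds ``DatasetMetadata``
+ objects; writes ``gpcrmd_datasets.parquet`` under ``data/gpcrmd``:
+
+ >>> save_metadatas_to_parquet(Path("data/gpcrmd"), validated_datasets, tag="datasets")  # doctest: +SKIP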
+ """
+ logger.info(f"Saving GPCRMD {tag} metadata to a Parquet file...")
+ # Ensure output folder exists
+ Path(folder_out_path).mkdir(parents=True, exist_ok=True)
+
+ # Save validated datasets and files
+ if tag == "datasets":
+ validated_path = os.path.join(folder_out_path, "gpcrmd_datasets.parquet")
+ elif tag == "files":
+ validated_path = os.path.join(folder_out_path, "gpcrmd_files.parquet")
+ else:
+ logger.error(f"Unknown tag `{tag}`: expected 'datasets' or 'files'. Nothing saved.")
+ return
+ try:
+ # Convert list of Pydantic models to list of dicts
+ validated_dicts = [dataset.model_dump() for dataset in metadatas_validated]
+ df_validated = pd.DataFrame(validated_dicts)
+ df_validated.to_parquet(validated_path, index=False)
+ logger.success(
+ f"GPCRMD validated {tag} metadata saved to {validated_path} successfully!"
+ )
+ except (ValueError, TypeError, OSError) as e:
+ logger.error(f"Failed to save validated {tag} to {validated_path}: {e}")
+
+
+@click.command()
+@click.option(
+ "--out-path",
+ type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path),
+ default=Path("data/gpcrmd"),
+ show_default=True,
+ help="Folder path to save the scraped GPCRMD data (dataset and file metadata)"
+)
+def scrap_gpcrmd_data(out_path: Path) -> None:
+ """Scrape dataset and file metadata from GPCRMD.
+
+ Parameters
+ ----------
+ out_path : Path
+ The output folder path for the scraped data.
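+
+ Examples
+ --------
+ Invoked from the command line (the script file name is illustrative)::
+
+ python scrap_gpcrmd.py --out-path data/gpcrmd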
+ """
+ setup_logger(logger, out_path)
+ logger.info("Starting GPCRMD data scraping...")
+ start_time = time.perf_counter()
+
+ # Fetch datasets metadata
+ datasets, fetch_time = fetch_datasets_once()
+ if not datasets:
+ logger.warning("No data fetched from GPCRMD.")
+ return
+ # Parse and validate dataset metadata with the Pydantic model (DatasetMetadata)
+ datasets_validated = (
+ parse_and_validate_dataset_metadatas(datasets, fetch_time)
+ )
+ # Save parsed metadata to local file
+ save_metadatas_to_parquet(
+ out_path,
+ datasets_validated,
+ tag="datasets"
+ )
+
+ # Fetch, parse and validate file metadata with the Pydantic model (FileMetadata)
+ files_metadata_validated = (
+ fetch_and_validate_file_metadatas(datasets, fetch_time)
+ )
+ save_metadatas_to_parquet(
+ out_path,
+ files_metadata_validated,
+ tag="files"
+ )
+
+ elapsed_time = int(time.perf_counter() - start_time)
+ logger.success(f"Scraping duration: {timedelta(seconds=elapsed_time)} 🎉")
+
+
+if __name__ == "__main__":
+ # Scrape GPCRMD data
+ scrap_gpcrmd_data()