From 45f274da64eef5a59dd33767bf3e2f2d190d373f Mon Sep 17 00:00:00 2001 From: essmaw Date: Wed, 7 Jan 2026 16:59:14 +0100 Subject: [PATCH 1/3] Apply suggested modifications from @pierrepo. --- models/{dataset_model.py => dataset.py} | 51 +++++++------------------ models/{file_model.py => file.py} | 38 +++++------------- 2 files changed, 23 insertions(+), 66 deletions(-) rename models/{dataset_model.py => dataset.py} (86%) rename models/{file_model.py => file.py} (83%) diff --git a/models/dataset_model.py b/models/dataset.py similarity index 86% rename from models/dataset_model.py rename to models/dataset.py index feaff05..eec8552 100644 --- a/models/dataset_model.py +++ b/models/dataset.py @@ -21,12 +21,7 @@ from pydantic import BaseModel, Field, StringConstraints, field_validator -from scripts.toolbox import ( - DatasetProject, - DatasetRepository, - format_date, - validate_http_url, -) +from scripts.toolbox import DatasetProject, DatasetRepository, format_date DOI = Annotated[ str, @@ -39,7 +34,7 @@ # ===================================================================== # Base dataset class # ===================================================================== -class DatasetModel(BaseModel): +class DatasetMetadata(BaseModel): """ Base Pydantic model for scraped molecular dynamics datasets. @@ -142,7 +137,7 @@ class DatasetModel(BaseModel): "Must start with '10.' and follow the standard DOI format." 
), ) - links: list[str] | None = Field( + external_links: list[str] | None = Field( None, description="External links to papers or other databases.", ) @@ -150,19 +145,19 @@ class DatasetModel(BaseModel): # ------------------------------------------------------------------ # File-level metadata # ------------------------------------------------------------------ - nb_files: int = Field( - ..., + nb_files: int | None = Field( + None, description="Total number of files in the dataset.", ) # ------------------------------------------------------------------ # Simulation metadata # ------------------------------------------------------------------ - simulation_program_name: str | None = Field( + software_name: str | None = Field( None, description="Molecular dynamics engine used (e.g. GROMACS, NAMD).", ) - simulation_program_version: str | None = Field( + software_version: str | None = Field( None, description="Version of the simulation engine.", ) @@ -182,15 +177,15 @@ class DatasetModel(BaseModel): None, description="Version of the forcefield model.", ) - timestep: float | None = Field( + simulation_timestep: float | None = Field( None, description="The time interval between new positions computation (in fs)." ) - delta: float | None = Field( - None, description="The time gap between frames (in ns)." - ) - simulation_time: str | None = Field( + simulation_time: list[str] | None = Field( None, description="The accumulated simulation time (in μs)." ) + simulation_temperature: list[str] | None = Field( + None, description="The temperature chosen for the simulations (in K ou °C)." 
+ ) # ------------------------------------------------------------------ # Validators @@ -215,28 +210,8 @@ def format_dates(cls, v: datetime | str) -> str: # noqa: N805 """ return format_date(v) - # To uncomment if u won't take time to valid all the dataset urls - # @field_validator("url", mode="before") - def validate_url(cls, v: str) -> str: # noqa: N805 - """ - Validate that the URL field is a properly formatted HTTP/HTTPS URL. - - Parameters - ---------- - cls : type[BaseDataset] - The Pydantic model class being validated. - v : str - The input value of the 'url' field to validate. - - Returns - ------- - str - The validated URL string. - """ - return validate_http_url(v) - @field_validator( - "description", "keywords", "links", "license", "author_names", + "description", "keywords", "external_links", "license", "author_names", "molecule_names", mode="before") def empty_to_none(cls, v: list | str) -> list | str | None: # noqa: N805 """ diff --git a/models/file_model.py b/models/file.py similarity index 83% rename from models/file_model.py rename to models/file.py index 7103d10..5844f10 100644 --- a/models/file_model.py +++ b/models/file.py @@ -20,17 +20,13 @@ from pydantic import BaseModel, Field, computed_field, field_validator -from scripts.toolbox import ( - DatasetRepository, - format_date, - validate_http_url, -) +from scripts.toolbox import DatasetRepository, format_date # ===================================================================== # Base file class # ===================================================================== -class FileModel(BaseModel): +class FileMetadata(BaseModel): """ Base Pydantic model for scraped molecular dynamics files. 
@@ -55,6 +51,10 @@ class FileModel(BaseModel): ..., description="Unique identifier of the dataset in the source repository.", ) + dataset_url_in_repository: str = Field( + ..., + description="Canonical URL to access the dataset in the repository.", + ) file_url_in_repository: str = Field( ..., description="Direct URL to access the file.", @@ -78,7 +78,7 @@ class FileModel(BaseModel): date_last_fetched: str = Field( ..., description="Date when the file was last fetched." ) - containing_archive_fie_name: str | None = Field( + containing_archive_file_name: str | None = Field( None, description="Archive file name this file was extracted from, if applicable." ) @@ -104,29 +104,9 @@ def format_dates(cls, v: datetime | str) -> str: # noqa: N805 """ return format_date(v) - # To uncomment if u won't take time to valid all the file urls - # @field_validator("file_url_in_repository", mode="before") - def valid_url(cls, v: str) -> str: # noqa: N805 - """ - Validate that the URL field is a properly formatted HTTP/HTTPS URL. - - Parameters - ---------- - cls : type[NomadFiles] - The Pydantic model class being validated. - v : str - The input value of the 'url' field to validate. - - Returns - ------- - str - The validated URL string. - """ - return validate_http_url(v) - @computed_field @property - def file_size_with_readable_unit(self) -> str | None: + def file_size_with_human_readable_unit(self) -> str | None: """ Convert the file size in bytes into a human-readable format. @@ -143,3 +123,5 @@ def file_size_with_readable_unit(self) -> str | None: size /= 1024 idx += 1 return f"{size:.2f} {units[idx]}" + else: + return None From 66b2b6b6e19e2a2c3ec30690ce9b809f207715ed Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 8 Jan 2026 17:37:56 +0100 Subject: [PATCH 2/3] Rename dataset_repository fields to dataset_repository_name for clarity in DatasetMetadata model + Set dataset project name and id optional. 
--- models/dataset.py | 10 +++++----- models/file.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/models/dataset.py b/models/dataset.py index eec8552..d7dd891 100644 --- a/models/dataset.py +++ b/models/dataset.py @@ -48,15 +48,15 @@ class DatasetMetadata(BaseModel): # ------------------------------------------------------------------ # Core provenance # ------------------------------------------------------------------ - dataset_repository: DatasetRepository = Field( + dataset_repository_name: DatasetRepository = Field( ..., description=( "Name of the source repository. " "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD." ), ) - dataset_project: DatasetProject = Field( - ..., + dataset_project_name: DatasetProject | None = Field( + None, description=( "Name of the project." "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD." @@ -66,8 +66,8 @@ class DatasetMetadata(BaseModel): ..., description="Unique identifier of the dataset in the source repository.", ) - dataset_id_in_project: str = Field( - ..., + dataset_id_in_project: str | None = Field( + None, description="Unique identifier of the dataset in the project.", ) dataset_url_in_repository: str = Field( diff --git a/models/file.py b/models/file.py index 5844f10..2f22781 100644 --- a/models/file.py +++ b/models/file.py @@ -40,7 +40,7 @@ class FileMetadata(BaseModel): # ------------------------------------------------------------------ # Core provenance # ------------------------------------------------------------------ - dataset_repository: DatasetRepository = Field( + dataset_repository_name: DatasetRepository = Field( ..., description=( "Name of the source repository. " "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD." ), ) From 09c9843d5768d05c5159f502a0db1242bf89aa98 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 8 Jan 2026 19:00:37 +0100 Subject: [PATCH 3/3] Remove duplicate dataset_url_in_repository field from FileMetadata model.
--- models/file.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/models/file.py b/models/file.py index 2f22781..9eac32c 100644 --- a/models/file.py +++ b/models/file.py @@ -51,10 +51,6 @@ class FileMetadata(BaseModel): ..., description="Unique identifier of the dataset in the source repository.", ) - dataset_url_in_repository: str = Field( - ..., - description="Canonical URL to access the dataset in the repository.", - ) file_url_in_repository: str = Field( ..., description="Direct URL to access the file.",