Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 18 additions & 43 deletions models/dataset_model.py → models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,7 @@

from pydantic import BaseModel, Field, StringConstraints, field_validator

from scripts.toolbox import (
DatasetProject,
DatasetRepository,
format_date,
validate_http_url,
)
from scripts.toolbox import DatasetProject, DatasetRepository, format_date

DOI = Annotated[
str,
Expand All @@ -39,7 +34,7 @@
# =====================================================================
# Base dataset class
# =====================================================================
class DatasetModel(BaseModel):
class DatasetMetadata(BaseModel):
"""
Base Pydantic model for scraped molecular dynamics datasets.

Expand All @@ -53,15 +48,15 @@ class DatasetModel(BaseModel):
# ------------------------------------------------------------------
# Core provenance
# ------------------------------------------------------------------
dataset_repository: DatasetRepository = Field(
dataset_repository_name: DatasetRepository = Field(
...,
description=(
"Name of the source repository. "
"Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD."
),
)
dataset_project: DatasetProject = Field(
...,
dataset_project_name: DatasetProject | None = Field(
None,
description=(
"Name of the project."
"Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD."
Expand All @@ -71,8 +66,8 @@ class DatasetModel(BaseModel):
...,
description="Unique identifier of the dataset in the source repository.",
)
dataset_id_in_project: str = Field(
...,
dataset_id_in_project: str | None = Field(
None,
description="Unique identifier of the dataset in the project.",
)
dataset_url_in_repository: str = Field(
Expand Down Expand Up @@ -142,27 +137,27 @@ class DatasetModel(BaseModel):
"Must start with '10.' and follow the standard DOI format."
),
)
links: list[str] | None = Field(
external_links: list[str] | None = Field(
None,
description="External links to papers or other databases.",
)

# ------------------------------------------------------------------
# File-level metadata
# ------------------------------------------------------------------
nb_files: int = Field(
...,
nb_files: int | None = Field(
None,
description="Total number of files in the dataset.",
)

# ------------------------------------------------------------------
# Simulation metadata
# ------------------------------------------------------------------
simulation_program_name: str | None = Field(
software_name: str | None = Field(
None,
description="Molecular dynamics engine used (e.g. GROMACS, NAMD).",
)
simulation_program_version: str | None = Field(
software_version: str | None = Field(
None,
description="Version of the simulation engine.",
)
Expand All @@ -182,15 +177,15 @@ class DatasetModel(BaseModel):
None,
description="Version of the forcefield model.",
)
timestep: float | None = Field(
simulation_timestep: float | None = Field(
None, description="The time interval between new positions computation (in fs)."
)
delta: float | None = Field(
None, description="The time gap between frames (in ns)."
)
simulation_time: str | None = Field(
simulation_time: list[str] | None = Field(
None, description="The accumulated simulation time (in μs)."
)
simulation_temperature: list[str] | None = Field(
None, description="The temperature chosen for the simulations (in K ou °C)."
)

# ------------------------------------------------------------------
# Validators
Expand All @@ -215,28 +210,8 @@ def format_dates(cls, v: datetime | str) -> str: # noqa: N805
"""
return format_date(v)

# Uncomment the decorator below if you want to spend the time validating every dataset URL.
# @field_validator("url", mode="before")
def validate_url(cls, v: str) -> str:  # noqa: N805
    """
    Check a URL value by delegating to ``validate_http_url``.

    The ``field_validator`` decorator for this method is commented out,
    so as written the method is not registered as a Pydantic validator
    and only runs when called explicitly.

    Parameters
    ----------
    cls : type
        First positional argument; not used by the body.
    v : str
        The URL value to check.

    Returns
    -------
    str
        Whatever ``validate_http_url`` returns for ``v``. Any exception
        raised by that helper propagates unchanged.
    """
    # All checking lives in the shared toolbox helper.
    checked_url = validate_http_url(v)
    return checked_url

@field_validator(
"description", "keywords", "links", "license", "author_names",
"description", "keywords", "external_links", "license", "author_names",
"molecule_names", mode="before")
def empty_to_none(cls, v: list | str) -> list | str | None: # noqa: N805
"""
Expand Down
36 changes: 7 additions & 29 deletions models/file_model.py → models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,13 @@

from pydantic import BaseModel, Field, computed_field, field_validator

from scripts.toolbox import (
DatasetRepository,
format_date,
validate_http_url,
)
from scripts.toolbox import DatasetRepository, format_date


# =====================================================================
# Base file class
# =====================================================================
class FileModel(BaseModel):
class FileMetadata(BaseModel):
"""
Base Pydantic model for scraped molecular dynamics files.

Expand All @@ -44,7 +40,7 @@ class FileModel(BaseModel):
# ------------------------------------------------------------------
# Core provenance
# ------------------------------------------------------------------
dataset_repository: DatasetRepository = Field(
dataset_repository_name: DatasetRepository = Field(
...,
description=(
"Name of the source repository. "
Expand Down Expand Up @@ -78,7 +74,7 @@ class FileModel(BaseModel):
date_last_fetched: str = Field(
..., description="Date when the file was last fetched."
)
containing_archive_fie_name: str | None = Field(
containing_archive_file_name: str | None = Field(
None,
description="Archive file name this file was extracted from, if applicable."
)
Expand All @@ -104,29 +100,9 @@ def format_dates(cls, v: datetime | str) -> str: # noqa: N805
"""
return format_date(v)

# Uncomment the decorator below if you want to spend the time validating every file URL.
# @field_validator("file_url_in_repository", mode="before")
def valid_url(cls, v: str) -> str:  # noqa: N805
    """
    Check a file URL value by delegating to ``validate_http_url``.

    The ``field_validator`` decorator for this method is commented out,
    so as written the method is not registered as a Pydantic validator
    and only runs when called explicitly.

    Parameters
    ----------
    cls : type
        First positional argument; not used by the body.
    v : str
        The URL value to check.

    Returns
    -------
    str
        Whatever ``validate_http_url`` returns for ``v``. Any exception
        raised by that helper propagates unchanged.
    """
    # All checking lives in the shared toolbox helper.
    checked_url = validate_http_url(v)
    return checked_url

@computed_field
@property
def file_size_with_readable_unit(self) -> str | None:
def file_size_with_human_readable_unit(self) -> str | None:
"""
Convert the file size in bytes into a human-readable format.

Expand All @@ -143,3 +119,5 @@ def file_size_with_readable_unit(self) -> str | None:
size /= 1024
idx += 1
return f"{size:.2f} {units[idx]}"
else:
return None