Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified data/example_hydrofabric_2/hydrofabric.gpkg
Binary file not shown.
2 changes: 1 addition & 1 deletion python/lib/core/dmod/core/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.13.1'
__version__ = '0.14.0'
14 changes: 14 additions & 0 deletions python/lib/core/dmod/core/common/reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
from typing_extensions import Protocol, runtime_checkable
from os import SEEK_SET


@runtime_checkable
class Reader(Protocol):
    """
    Structural type for any object that exposes a ``read`` method returning bytes.
    """

    def read(self, size: int = -1, /) -> bytes:
        """
        Read and return bytes from the underlying source.

        Parameters
        ----------
        size: int
            Positional-only size argument, ``-1`` by default.

        Returns
        -------
        bytes
            The bytes read; an empty ``b''`` result signals EOF.
        """
        ...


@runtime_checkable
class Seeker(Protocol):
    """
    Structural type for any object that exposes a ``seek`` method.
    """

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        """
        Move the current position to the given offset.

        Parameters
        ----------
        offset: int
            The offset to move to.
        whence: int
            Reference point for ``offset``; defaults to ``os.SEEK_SET``.

        Returns
        -------
        int
            The resulting absolute position.
        """
        ...


@runtime_checkable
class ReadSeeker(Reader, Seeker, Protocol):
    """
    Combined protocol: a :class:`Reader` that can also reposition where it reads from, as a :class:`Seeker` does.
    """
    ...
30 changes: 14 additions & 16 deletions python/lib/core/dmod/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,19 @@ def cond_eq(a, b):
and cond_eq(self.uuid, other.uuid)
)

def __hash__(self):
    """
    Compute a hash from a comma-joined description of selected attributes.

    The description is assembled, in order, from the class name, ``name``, the category's name, the hash of
    ``data_domain``, ``access_location``, ``is_read_only``, and the hash of ``created_on``.

    Returns
    -------
    int
        The hash of the joined description string.
    """
    return hash(",".join((
        self.__class__.__name__,
        self.name,
        self.category.name,
        str(hash(self.data_domain)),
        self.access_location,
        str(self.is_read_only),
        str(hash(self.created_on)),
    )))

@property
def manager(self) -> Optional[DatasetManager]:
"""
Expand Down Expand Up @@ -242,19 +255,6 @@ def set_manager(self, value: Union[DatasetManager, None]):
self._manager = value
self.manager_uuid = value.uuid

def __hash__(self):
members = [
self.__class__.__name__,
self.name,
self.category.name,
str(hash(self.data_domain)),
self.access_location,
str(self.is_read_only),
str(hash(self.created_on)),
]
description = ",".join(members)
return hash(description)

def _set_expires(self, new_expires: datetime):
"""
"Private" function to set the ::attribute:`expires` property.
Expand Down Expand Up @@ -529,8 +529,6 @@ def __init__(self, uuid: Optional[UUID] = None, datasets: Optional[Dict[str, Dat
self._errors = []
""" A property attribute to hold errors encountered during operations. """

# TODO: implement functions and routines for scrubbing temporary datasets as needed

@abstractmethod
def add_data(self, dataset_name: str, dest: str, data: Optional[Union[bytes, Reader]] = None, source: Optional[str] = None,
is_temp: bool = False, **kwargs) -> bool:
Expand Down Expand Up @@ -848,7 +846,7 @@ def is_managed_dataset(self, dataset: Dataset) -> bool:
if dataset.manager is None and self.uuid == dataset.manager_uuid:
dataset.set_manager(self)

return
return dataset.manager_uuid == self.uuid

def link_user(self, user: DatasetUser, dataset: Dataset) -> bool:
"""
Expand Down
81 changes: 71 additions & 10 deletions python/lib/core/dmod/core/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class StandardDatasetIndex(str, PydanticEnum):
""" Index for the name of a data file within a dataset. """
COMPOSITE_SOURCE_ID = (9, str, "COMPOSITE_SOURCE_ID")
""" Index for DATA_ID values of source dataset(s) when dataset is composite format and derives from others. """
HYDROFABRIC_VERSION = (10, str, "HYDROFABRIC_VERSION")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts on making this a generic VERSION?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep it separate, at least for now (does any other data format use any versioning at the moment?).

A lot of - maybe all - of the other data DMOD works with is only fully valid if we assume it's applied within some hydrofabric. I suspect before too long we will want or need a hydrofabric version index in constraints defining the domain of regridded forcings or BMI init config datasets, to be able to tell if the cat-1156 involved is actually the cat-1156 we are interested in. And that would be a more flexible way to constrain things than aligning the specific hydrofabric id.

""" Version string for version of the hydrofabric to use (e.g., 2.0.1). """
HYDROFABRIC_REGION = (11, str, "HYDROFABRIC_REGION")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of scope, but related, we will need something for the hydrofabric model attributes as well. We should probably chat with the HF team about how they intend to version those.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opened #569 for tracking this in the future.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for opening something to track this!

""" Region string (e.g., conus, vpu01) for the applicable region of the hydrofabric. """

def __new__(cls, index: int, ty: type, name: str):
o = str.__new__(cls, name)
Expand Down Expand Up @@ -90,8 +94,8 @@ class DataFormat(PydanticEnum):
index that can be used to distinguish the collections, so that the right data can be identified.
"""
AORC_CSV = (0,
{StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: ""},
{"": datetime, "APCP_surface": float, "DLWRF_surface": float, "DSWRF_surface": float,
{StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: "Time"},
{"Time": datetime, "APCP_surface": float, "DLWRF_surface": float, "DSWRF_surface": float,
"PRES_surface": float, "SPFH_2maboveground": float, "TMP_2maboveground": float,
"UGRD_10maboveground": float, "VGRD_10maboveground": float, "precip_rate": float},
True
Expand Down Expand Up @@ -184,6 +188,38 @@ class DataFormat(PydanticEnum):
T_ROUTE_CONFIG = (13, {StandardDatasetIndex.DATA_ID: None, StandardDatasetIndex.HYDROFABRIC_ID: None}, None, False)
""" Format for t-route application configuration. """

NGEN_GEOPACKAGE_HYDROFABRIC_V2 = (14,
{StandardDatasetIndex.CATCHMENT_ID: "divide_id",
StandardDatasetIndex.HYDROFABRIC_ID: None,
StandardDatasetIndex.HYDROFABRIC_REGION: None,
StandardDatasetIndex.HYDROFABRIC_VERSION: None},
{"fid": int, "divide_id": str, "geom": Any, "toid": str, "type": str,
"ds_id": float, "areasqkm": float, "id": str, "lengthkm": float,
"tot_drainage_areasqkm": float, "has_flowline": bool},
)
""" GeoPackage hydrofabric format v2 used by NextGen (divide_id is catchment id). """

EMPTY = (15, {}, None, False)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do these interact with the Dataset's DataCategory? At least, EMPTY and GENERIC seem related to that concept.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm ... I don't think there's any coupling, strictly speaking, between categories and either formats or domains. For this in particular, I think that's fine: it's reasonable to have a dataset in any category for which the data coverage is defined to be either empty or unknown.

More generally, I can already see things with categories that need tweaking (the notion of an OUTPUT category is probably not valid, if we start to think of evaluation tasks needing modeling "output" as input, or forcing regridding jobs producing as "output" something belonging to the FORCING category). And we may not be using category sufficiently and/or consistently throughout our data modeling.

Did you have something specific in mind when you noted this, or just a general concern?

Copy link
Copy Markdown
Member

@aaraney aaraney Apr 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've always thought there was an implied unidirectional relationship from a DataCategory to a set of DataFormat variants. So, for a given DataCategory variant there is an implied set of valid DataFormats. If that is the case, let's say a dataset is created with a DataCategory of HYDROFABRIC and data is uploaded to the dataset without specifying its DataFormat, thus its initial DataFormat is GENERIC. The uploaded data is detected to be AORC_CSV. Assuming AORC_CSV is not in the set of HYDROFABRIC's valid DataFormats, how should DMOD handle this case in accounting for dataset metadata?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There definitely is conceptually, just not one that's explicitly defined somewhere.

I haven't tested that scenario, but I expect it would allow the HYDROFABRIC category dataset to have a defined domain with the AORC_CSV format. I'm pretty sure the same thing would happen if the user tried to provide the domain manually when creating a dataset, though. So this is definitely a problem, but outside the scope of the automatic detection work. I've opened #567 specifically for it.

"""
"Format" for an empty dataset that, having no data (yet), doesn't have (or need) an applicable defined structure.

The intent of this is for simplicity when creating a dataset. This format represents a type of dataset that doesn't,
and importantly, **cannot** yet truly have a more specific format that matches its contents. A key implication is
the expectation that the domain of the dataset (including the format) **must** be changed as soon as any data is
added to the dataset.
"""

GENERIC = (16, {}, None, False)
"""
Format without any indications or restrictions on the defined structure of contained data.

This value is very much like ``EMPTY`` except that it is applicable to non-empty datasets. It represents absolutely
nothing about the structure of any contents, and thus that absolutely anything can be contained or added. In
practice, the main intended difference from ``EMPTY`` is that datasets in this format will not be required to update
their data domain at the time new data is added (while not applicable to ``EMPTY``, the same is true when any data
is removed).
"""

@classmethod
def can_format_fulfill(cls, needed: 'DataFormat', alternate: 'DataFormat') -> bool:
"""
Expand Down Expand Up @@ -325,7 +361,9 @@ class ContinuousRestriction(Serializable):

variable: StandardDatasetIndex
begin: datetime
""" An inclusive beginning value. """
end: datetime
""" An exclusive end value. """
datetime_pattern: Optional[str]
subclass: str = None
"""
Expand Down Expand Up @@ -439,9 +477,6 @@ def convert_truncated_serial_form(cls, truncated_json_obj: dict, datetime_format

return json_copy

def __hash__(self) -> int:
return hash((self.variable.name, self.begin, self.end))

def contains(self, other: 'ContinuousRestriction') -> bool:
"""
Whether this object contains all the values of the given object and the two are of the same index.
Expand Down Expand Up @@ -492,6 +527,11 @@ def __init__(
if allow_reorder:
self.values.sort()

def __eq__(self, other):
    """
    Test equality with another object.

    Two restrictions are equal when the other object is also a ``DiscreteRestriction``, both have the same
    ``variable``, and both hold the same values once each collection is sorted (i.e., value order is ignored).
    """
    return (
        isinstance(other, DiscreteRestriction)
        and self.variable == other.variable
        and sorted(self.values) == sorted(other.values)
    )

def __hash__(self) -> int:
    """
    Compute a hash from the variable name and the values in sorted order.

    Equality for this type compares ``sorted(values)``, so two instances holding the same values in a different
    stored order compare equal. Hashing the sorted values guarantees such equal instances also hash equally, as
    required for correct behavior in sets and as dictionary keys.

    Returns
    -------
    int
        The hash of a tuple of the variable's name followed by the sorted values.
    """
    return hash((self.variable.name, *sorted(self.values)))

Expand Down Expand Up @@ -551,7 +591,22 @@ def is_all_possible_values(self) -> bool:

class DataDomain(Serializable):
"""
A domain for a dataset, with domain-defining values contained by one or more discrete and/or continuous components.
A domain for some collection of data, with defining values contained by discrete and/or continuous components.

A definition for the domain of some kind of collection of data. The collection may be something more concrete, like
a ::class:`Dataset` instance, or more abstract, like forcing data sufficient to run a requested model execution.

The definition consists of details on the structure and content of the data within the collection. Structure is
represented by a ::class:`DataFormat` attribute, and contents are represented by collections of
::class:`ContinuousRestriction` and ::class:`DiscreteRestriction` objects.

While a domain may have any number of continuous or discrete restrictions individually, combined it must have at
least one, or validation will fail.

There is a notion of whether a domain "contains" certain described data. This described data can be a simple
description of some data index and its values - fundamentally, the definition of ::class:`ContinuousRestriction` and
::class:`DiscreteRestriction` objects. The described data can also be more complex, like another fully defined
domain. A function is provided by the type for performing such tests.
"""
data_format: DataFormat = Field(
description="The format for the data in this domain, which contains details like the indices and other data fields."
Expand Down Expand Up @@ -622,12 +677,17 @@ def handle_type_map(t):

@root_validator()
def validate_sufficient_restrictions(cls, values):
    """
    Validate that the domain defines at least one restriction, unless its format is exempt.

    Parameters
    ----------
    values
        The mapping of field names to values being validated.

    Returns
    -------
    The unmodified ``values`` when validation passes.

    Raises
    ------
    RuntimeError
        If the data format is neither ``EMPTY`` nor ``GENERIC`` and there are no continuous or discrete
        restrictions.
    """
    data_format = values.get("data_format")
    # EMPTY and GENERIC formats are exempt from the "at least one restriction" requirement
    if data_format == DataFormat.EMPTY or data_format == DataFormat.GENERIC:
        return values
    continuous_restrictions = values.get("continuous_restrictions", {})
    discrete_restrictions = values.get("discrete_restrictions", {})
    if len(continuous_restrictions) + len(discrete_restrictions) > 0:
        return values
    # Single failure path; the message also reports the format that was actually provided
    raise RuntimeError(f"Cannot create {cls.__name__} without at least one finite continuous or discrete "
                       f"restriction, except when data format is {DataFormat.GENERIC.name} or "
                       f"{DataFormat.EMPTY.name} (provided value was: "
                       f"{'None' if data_format is None else data_format.name})")

@classmethod
def factory_init_from_restriction_collections(cls, data_format: DataFormat, **kwargs) -> 'DataDomain':
Expand Down Expand Up @@ -836,6 +896,7 @@ def dict(
return serial



class DataCategory(PydanticEnum):
"""
The general category values for different data.
Expand Down
11 changes: 11 additions & 0 deletions python/lib/core/dmod/core/serializable.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,17 @@ class ResultIndicator(Serializable, ABC):
reason: str = Field(description="A very short, high-level summary of the result.")
message: str = Field("", description="An optional, more detailed explanation of the result, which by default is an empty string.")

def __bool__(self) -> bool:
    """
    Truth-value hook: an instance is truthy exactly when its ``success`` attribute is.

    Returns
    -------
    bool
        The instance's current ::attribute:`success` value.
    """
    return self.success


class BasicResultIndicator(ResultIndicator):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/lib/modeldata/dmod/modeldata/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.9.5'
__version__ = '0.10.0'
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import fiona
import pyogrio
import geopandas as gpd
import hashlib
from pandas.util import hash_pandas_object
Expand Down Expand Up @@ -316,22 +316,25 @@ class GeoPackageHydrofabric(Hydrofabric):
#_FLOWPATHS_TO_NEX_COL = 'toid'

_DIVIDES_LAYER_NAME = 'divides'
_DIVIDES_CAT_ID_COL = 'id'
_DIVIDES_CAT_ID_COL = 'divide_id'
_DIVIDES_TO_NEX_COL = 'toid'

_NEXUS_LAYER_NAME = 'nexus'
_NEXUS_NEX_ID_COL = 'id'
_NEXUS_TO_CAT_COL = 'toid'

@classmethod
def from_file(cls, geopackage_file: Union[str, Path], vpu: Optional[int] = None, is_conus: bool = False) -> 'GeoPackageHydrofabric':
def from_file(cls, geopackage_file: Union[str, Path, bytes], vpu: Optional[int] = None, is_conus: bool = False) -> 'GeoPackageHydrofabric':
"""
Initialize a new instance from a GeoPackage file.
Initialize a new instance from a GeoPackage file or contents of such a file (as ``bytes``).

Note that while a warning may appear because of implementation details in ``pyogrio``, this should work
perfectly well if passed raw bytes from a file.

Parameters
----------
geopackage_file: Union[str, Path]
The source file for data from which to instantiate.
geopackage_file: Union[str, Path, bytes]
The source file for data, or raw data from such a file, from which to instantiate.
vpu: Optional[int]
The VPU of the hydrofabric to create, if it is known (defaults to ``None``).
is_conus: bool
Expand All @@ -342,7 +345,9 @@ def from_file(cls, geopackage_file: Union[str, Path], vpu: Optional[int] = None,
GeoPackageHydrofabric
A new instance of this type.
"""
layer_names = fiona.listlayers(geopackage_file)
# pyogrio's function returns an ndarray of ndarrays, with each inner layer info array containing layer name and type
# We only need a list of layer names, though
layer_names = [layer_info[0] for layer_info in pyogrio.list_layers(geopackage_file)]
return cls(layer_names=layer_names,
layer_dataframes={ln: gpd.read_file(geopackage_file, layer=ln, engine="pyogrio") for ln in layer_names},
vpu=vpu,
Expand Down Expand Up @@ -442,8 +447,8 @@ def get_subset_hydrofabric(self, subset: SubsetDefinition) -> 'GeoPackageHydrofa
# Value[1]: callable no arg function, returning collection of ids for records/rows to include in subset
subset_query_setups: Dict[str, Tuple[str, Callable[[], Iterable[str]]]] = {
'flowpaths': ('realized_catchment', lambda: subset.catchment_ids),
'divides': ('id', lambda: subset.catchment_ids),
'nexus': ('id', lambda: subset.nexus_ids),
'divides': (self._DIVIDES_CAT_ID_COL, lambda: subset.catchment_ids),
'nexus': (self._NEXUS_NEX_ID_COL, lambda: subset.nexus_ids),
'flowpath_attributes': ('id', lambda: new_dfs['flowpaths']['id']),
'flowpath_edge_list': ('id', lambda: new_dfs['flowpaths']['id']),
'crosswalk': ('id', lambda: new_dfs['flowpaths']['id']),
Expand Down
1 change: 0 additions & 1 deletion python/lib/modeldata/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
install_requires=[
"numpy>=1.20.1",
"pandas",
"fiona",
"geopandas",
"dmod-communication>=0.4.2",
"dmod-core>=0.13.1",
Expand Down
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Deprecated
cryptography
flask
pandas
fiona
geopandas
gitpython
python-dotenv
Expand Down Expand Up @@ -42,7 +41,6 @@ attrs
Pillow
Jinja2
click
Fiona
cligj
munch
six
Expand Down