Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
fb8f666
Add abstract domain detectors.
robertbartel Apr 2, 2024
b50a2e5
Add UniversalItemDomainDetector.
robertbartel Apr 2, 2024
fbf0c9d
Add first format specific detector classes.
robertbartel Apr 2, 2024
6c556de
Add tests for AORC CSV file domain detector.
robertbartel Apr 2, 2024
f84e849
Add tests for geopackage file domain detector.
robertbartel Apr 2, 2024
7c04d5d
Add tests for universal item domain detector.
robertbartel Apr 2, 2024
a4d4d8e
Add common function needed for several tests.
robertbartel Apr 2, 2024
da8a377
Adding test CSV forcing files.
robertbartel Apr 2, 2024
d4a17c6
Switch to ReadSeeker in dataset.py.
robertbartel Apr 3, 2024
4994384
Switch to ReadSeeker in item_domain_detector.py.
robertbartel Apr 3, 2024
7eb6fad
Remove lazy property from AbstractDomainDetector.
robertbartel Apr 4, 2024
576a687
Optimizing re use in aorc csv domain detector.
robertbartel Apr 5, 2024
5b1510d
Redesign to use ItemDataDomainDetectorRegistry.
robertbartel Apr 9, 2024
5162628
Update detector implementations for new registry.
robertbartel Apr 9, 2024
968b045
Fix AorcCsvFileDomainDetector tests for redesign.
robertbartel Apr 9, 2024
5e99138
Fix geopackage domain detector tests for redesign.
robertbartel Apr 9, 2024
827086e
Fix universal domain detector tests for redesign.
robertbartel Apr 9, 2024
e5545b1
Bump core package version to 0.15.1.
robertbartel Apr 10, 2024
bca8586
Update modeldata dependency on core to ver 0.15.1.
robertbartel Apr 10, 2024
e4a1de5
Bump modeldata package version to 0.11.0.
robertbartel Apr 10, 2024
6c7b408
Update client dependency on core to ver 0.15.1.
robertbartel Apr 10, 2024
105fd3e
Bump client package version to 0.6.0.
robertbartel Apr 10, 2024
389a127
Scrap private _is_item_file for domain detector.
robertbartel May 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
745 changes: 745 additions & 0 deletions data/example_forcing_aorc_csv/cat-12.csv

Large diffs are not rendered by default.

745 changes: 745 additions & 0 deletions data/example_forcing_aorc_csv/cat-13.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/lib/client/dmod/client/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.0'
__version__ = '0.6.0'
2 changes: 1 addition & 1 deletion python/lib/client/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
license='',
include_package_data=True,
#install_requires=['websockets', 'jsonschema'],vi
install_requires=['dmod-core>=0.11.0', 'websockets>=8.1', 'pydantic>=1.10.8,~=1.10', 'dmod-communication>=0.17.0',
install_requires=['dmod-core>=0.15.0', 'websockets>=8.1', 'pydantic>=1.10.8,~=1.10', 'dmod-communication>=0.17.0',
'dmod-externalrequests>=0.6.0'],
packages=find_namespace_packages(include=['dmod.*'], exclude=['dmod.test'])
)
2 changes: 1 addition & 1 deletion python/lib/core/dmod/core/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.15.0'
__version__ = '0.15.1'
333 changes: 331 additions & 2 deletions python/lib/core/dmod/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from .serializable import Serializable, ResultIndicator
from .enum import PydanticEnum
from typing import Any, Callable, ClassVar, Dict, FrozenSet, List, Optional, Set, Tuple, Type, Union
from pathlib import Path
from typing import Any, Callable, ClassVar, Dict, FrozenSet, List, Optional, Set, Tuple, Type, TypeVar, Union
from pydantic import Field, validator, PrivateAttr
from pydantic.fields import ModelField
from uuid import UUID, uuid4

from .common.reader import Reader
from .common.reader import Reader, ReadSeeker


class DatasetType(PydanticEnum):
Expand Down Expand Up @@ -486,6 +487,334 @@ def unlink_to_dataset(self, dataset: Dataset) -> bool:
return dataset.manager is not None and dataset.manager.unlink_user(user=self, dataset=dataset)


# Type variable for a single data item, bounded to raw ``bytes``, a ``ReadSeeker``, or a filesystem ``Path``.
DataItem = TypeVar('DataItem', bound=Union[bytes, ReadSeeker, Path])


class AbstractDomainDetector(ABC):
    """ Base abstraction for anything able to automatically work out a :class:`DataDomain` for some data. """

    @abstractmethod
    def detect(self, **kwargs) -> DataDomain:
        """
        Work out the data domain and return it.

        Parameters
        ----------
        kwargs
            Optional, subtype-specific keyword arguments.  These may enhance or extend how the domain is detected
            and generated, but a valid domain should be obtainable without supplying any of them.

        Returns
        -------
        DataDomain
            The domain that was detected.

        Raises
        ------
        DmodRuntimeError
            If the domain could not be properly detected.
        """


class ItemDataDomainDetectorRegistry:
    """ A singleton registry in which to track the subtypes of :class:`ItemDataDomainDetector`. """

    _instance = None

    @classmethod
    def get_instance(cls) -> ItemDataDomainDetectorRegistry:
        """ Get the singleton registry instance, creating it on first use. """
        if cls._instance is None:
            cls._instance = ItemDataDomainDetectorRegistry()
        return cls._instance

    def __init__(self):
        """
        Initialize the registry with no registered detectors.

        Raises
        ------
        RuntimeError
            If the singleton instance has already been created.
        """
        if self._instance is not None:
            raise RuntimeError(f"Attempting to create second {self.__class__.__name__} instance!")
        self._detectors: Dict[str, Type[ItemDataDomainDetector]] = dict()
        """ All registered subclasses, keyed by name. """
        self._formats_to_detectors: Dict[DataFormat, Set[str]] = {f: set() for f in DataFormat}
        """
        Collection of :class:`DataFormat` enum values mapped to sets containing the names of the registered subclasses
        that are associated with that particular format.
        """

    def is_registered(self, entry: Union[str, Type[ItemDataDomainDetector]]) -> bool:
        """
        Whether this is a registered subclass or the registered name of one.

        Parameters
        ----------
        entry: Union[str, Type[ItemDataDomainDetector]]
            The potential registration name or subclass type.

        Returns
        -------
        bool
            Whether this is a registered subclass or the registered name of one.
        """
        return (entry if isinstance(entry, str) else entry.get_registration_name()) in self._detectors

    def get_for_format(self, data_format: DataFormat) -> List[Type[ItemDataDomainDetector]]:
        """
        Get a list (sorted by registration name) of the detector subclasses associated with the given format.

        Parameters
        ----------
        data_format: DataFormat
            The data format of interest.

        Returns
        -------
        List[Type[ItemDataDomainDetector]]
            The sorted detector subclasses associated with the given format.
        """
        return [self._detectors[name] for name in sorted(self._formats_to_detectors[data_format])]

    def get_for_name(self, name: str) -> Type[ItemDataDomainDetector]:
        """
        Get the registered subclass for the given registration name.

        Parameters
        ----------
        name: str
            The subclass registration name.

        Returns
        -------
        Type[ItemDataDomainDetector]
            The registered subclass for the given registration name.

        Raises
        ------
        KeyError
            Raised if this is not a valid, recognized registration name.
        """
        return self._detectors[name]

    def register(self, subclass: Type[ItemDataDomainDetector]):
        """
        Register the given subclass of :class:`ItemDataDomainDetector`.

        Parameters
        ----------
        subclass: Type[ItemDataDomainDetector]
            A subclass of :class:`ItemDataDomainDetector`.

        Notes
        -----
        If an already-registered subclass is passed in another call to this method, nothing will happen.  The instance's
        state will not change, nor will an error be thrown, and the method will quietly return.

        A subclass whose data format is ``None`` is tracked by name only; it is not added to any format group.

        Raises
        ------
        DmodRuntimeError
            If the registration name is already in use for a different subclass.
        """
        name = subclass.get_registration_name()
        if name not in self._detectors:
            self._detectors[name] = subclass
            if subclass.get_data_format() is not None:
                self._formats_to_detectors[subclass.get_data_format()].add(name)
        elif self._detectors[name] != subclass:
            raise DmodRuntimeError(f"{self.__class__.__name__} failed to register subclass '{subclass.__name__}' with "
                                   f"registration name '{name}' that is already in use")

    def unregister(self, subclass: Type[ItemDataDomainDetector]):
        """
        Unregister the given subclass of :class:`ItemDataDomainDetector`.

        Parameters
        ----------
        subclass: Type[ItemDataDomainDetector]
            The subclass whose registration name should be removed from the registry.

        Raises
        ------
        DmodRuntimeError
            If the registration name of the given subclass is not currently registered.
        """
        name = subclass.get_registration_name()
        if name not in self._detectors:
            raise DmodRuntimeError(f"{self.__class__.__name__} can't unregister unknown name '{name}'")
        registered = self._detectors.pop(name)
        data_format = registered.get_data_format()
        # Mirror ``register``: a detector without a data format was never added to a format group, and ``None`` is
        # not a key of the format mapping, so there is nothing to remove for it.
        if data_format is not None:
            self._formats_to_detectors[data_format].discard(name)


class ItemDataDomainDetector(AbstractDomainDetector, ABC):
    """
    Detector that examines one data item and works out that item's individual :class:`DataDomain`.

    A "data item" here is exactly one of: a :class:`bytes` object holding raw data, a :class:`ReadSeeker` object from
    which data can be read multiple times, or a :class:`Path` object that points to a file (never a directory).

    Two class methods exist chiefly for the benefit of the :class:`ItemDataDomainDetectorRegistry` singleton:
    :meth:`get_data_format` and :meth:`get_registration_name`.  Each is backed by a class variable that a subclass
    sets when it wants something other than the default.

    :meth:`get_data_format` tells the registry (and anything else) which :class:`DataFormat` a subclass can determine
    domains for.  Declaring one is optional; the default is ``None``.

    :meth:`get_registration_name` supplies the unique name under which a subclass is registered.  When the backing
    class attribute is not set, the subclass's own name is used.

    Note that subtypes are not registered automatically; each must be registered explicitly with the
    :class:`ItemDataDomainDetectorRegistry` singleton.
    """

    _data_format: Optional[DataFormat] = None
    """ The :class:`DataFormat` this subclass is associated with, if any. """
    _registration_name: Optional[str] = None
    """ Registration name for this subclass; when ``None``, the class name is used instead. """

    @classmethod
    def get_data_format(cls) -> Optional[DataFormat]:
        """
        Get the data format associated with this subclass of :class:`ItemDataDomainDetector`.

        Returns
        -------
        Optional[DataFormat]
            The :class:`DataFormat` associated with this subclass, or ``None`` if it does not have one.
        """
        return cls._data_format

    @classmethod
    def get_registration_name(cls) -> str:
        """
        Get the name under which this subclass is registered.

        Falls back to the subclass's ``__name__`` whenever the backing private class attribute is unset.

        Returns
        -------
        str
            The registration name for this subclass.
        """
        return cls._registration_name or cls.__name__

    def __init__(self, item: DataItem, item_name: Optional[str] = None, decode_format: str = 'utf-8', *args, **kwargs):
        """
        Initialize the detector for a single data item.

        Parameters
        ----------
        item: DataItem
            The data item for which to detect a domain.
        item_name: Optional[str]
            Name for the item; ignored when ``item`` is a :class:`Path`, in which case the path's ``name`` is used.
        decode_format: str
            A decoder format sometimes used when reading the data item in order to get metadata.
        args
            Extra positional arguments passed through to the superclass initializer.
        kwargs
            Extra keyword arguments passed through to the superclass initializer.

        Raises
        ------
        ValueError
            If ``item`` is a :class:`Path` that points to a directory.
        """
        super().__init__(*args, **kwargs)
        self._item: DataItem = item
        """ The data item for which to detect a domain. """
        self._decode_format = decode_format
        """ A decoder format sometimes used when reading data item in order to get metadata. """
        if isinstance(item, Path):
            # A path carries its own name, which takes precedence over any explicitly supplied item name
            self._item_name = item.name
            if item.is_dir():
                raise ValueError(f"{self.__class__.__name__} can't initialize with a directory path as its data item")
        else:
            self._item_name = item_name
        # Name for the item; in some situations, contains important constraint metadata (e.g. catchment name).


class UniversalItemDomainDetector(ItemDataDomainDetector):
    """
    General type of detector that works with all supported formats by trying all registered, format-specific subtypes.
    """

    def detect(
        self,
        *,
        excluded_formats: Optional[Union[DataFormat, Set[DataFormat]]] = None,
        suggested_formats: Optional[Union[DataFormat, List[DataFormat]]] = None,
        sort_key: Optional[Callable[[Type[ItemDataDomainDetector]], Any]] = None,
        **kwargs
    ) -> DataDomain:
        """
        Detect and return the data domain.

        Detect a domain by calling the analogous method of an instance of some or all registered subclasses of
        :class:`ItemDataDomainDetector`.  Selection of the right subclass to use for this is based on brute-force
        trials - i.e., a subclass is selected, an instance is created, the ``detect`` method is called, and we assess
        what happens - along with an early exit mechanism for explicit format suggestions.

        Subclasses are tried in groups according to their associated :class:`DataFormat`.  The order of groups may be
        controlled by providing one or more format "suggestions", which will be tried first in the order provided; the
        remaining formats are then tried in the definition order of the :class:`DataFormat` enum.  Also, one or more
        excluded formats can be optionally provided.  Iteration order of subclasses within a group is based on
        registration name by default - i.e., the value from ``ItemDataDomainDetector.get_registration_name`` for
        each subclass - but a sorting key function can be provided to control this also.

        If/when a subclass instance's ``detect`` call returns a domain, no other subclasses for that format group are
        tried, but this function only returns that domain value immediately if the associated format was a provided
        suggestion.  Otherwise, iteration continues to the next group.  This is important, because if more than one
        non-suggested format yields a domain, there is an implicit ambiguity in the domain, and a
        :class:`DmodRuntimeError` is raised.

        Parameters
        ----------
        excluded_formats: Optional[Union[DataFormat, Set[DataFormat]]]
            Optional individual or collection of :class:`DataFormat` to be excluded from testing; ``None`` (the
            default) excludes nothing.
        suggested_formats: Optional[Union[DataFormat, List[DataFormat]]]
            An optional :class:`DataFormat` or ordered list of :class:`DataFormat` values to try first, with any
            success being immediately returned; ``None`` (the default) suggests nothing.
        sort_key: Optional[Callable[[Type[ItemDataDomainDetector]], Any]]
            Optional function of one argument (the subclass type) used to extract a comparison key from each registered
            subclass when determining the order in which to try the subclasses associated with a particular data
            format; if ``None``, the order is based on each subclass's registration name.
        kwargs
            Accepted for compatibility with the abstract ``detect`` signature; not currently used or forwarded to
            the format-specific detectors.

        Returns
        -------
        DataDomain
            The detected domain.

        Raises
        ------
        ValueError
            Raised if a :class:`DataFormat` appears in both ``excluded_formats`` and ``suggested_formats``, or more
            than once within ``suggested_formats``.
        DmodRuntimeError
            If no domain could be detected, or if more than one non-suggested format produced a domain.
        """
        registry = ItemDataDomainDetectorRegistry.get_instance()

        def try_detection(d_format: DataFormat) -> Optional[DataDomain]:
            """ Return the first domain detected by a subclass registered for ``d_format``, or ``None``. """
            subclasses = registry.get_for_format(d_format)
            if sort_key is not None:
                subclasses = sorted(subclasses, key=sort_key)
            for subclass_type in subclasses:
                # NOTE(review): ``decode_format`` is not forwarded to the delegate detector - confirm this is intended
                try:
                    return subclass_type(item=self._item, item_name=self._item_name).detect()
                except Exception:
                    # A failure just means this detector doesn't fit the item; move on to the next candidate.  Only
                    # ``Exception`` is caught, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
                    continue
            return None

        # Normalize exclusions to a set, accepting a single format, any iterable of formats, or nothing at all
        if excluded_formats is None:
            excluded = set()
        elif isinstance(excluded_formats, DataFormat):
            excluded = {excluded_formats}
        else:
            excluded = set(excluded_formats)

        # Normalize suggestions to an ordered list, since their order determines trial order
        if suggested_formats is None:
            suggested = []
        elif isinstance(suggested_formats, DataFormat):
            suggested = [suggested_formats]
        else:
            suggested = list(suggested_formats)

        if not excluded.isdisjoint(suggested):
            raise ValueError("Can't include data format in both exclusions and suggestions for domain detection.")
        if len(suggested) != len(set(suggested)):
            raise ValueError("Can't include data format multiple times in ordered suggestions for domain detection.")

        # Try suggestions first, returning immediately if any are successful
        for data_format in suggested:
            result = try_detection(d_format=data_format)
            if result is not None:
                return result

        # Now we get to others, in enum definition order so that trials (and any error message) are deterministic
        already_handled = excluded.union(suggested)
        main_trials = (try_detection(d_format=df) for df in DataFormat if df not in already_handled)
        main_results = [t for t in main_trials if t is not None]
        if len(main_results) == 0:
            raise DmodRuntimeError("No domain could be detected for item.")
        elif len(main_results) == 1:
            return main_results[0]
        # Multiple results mean there's a problem (also, they can't be equal because they will have different formats)
        else:
            raise DmodRuntimeError(f"Multiple conflicting domain detected for item in the following formats: "
                                   f"{','.join([d.data_format.name for d in main_results])}")


# Register the universal detector type with the singleton registry (done once, at module import time)
ItemDataDomainDetectorRegistry.get_instance().register(UniversalItemDomainDetector)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My above point about setting `._instance` in `__init__` is moot if this stays present. I'm on the fence. I'll leave it to your discretion if you think the change is worth it.



class DatasetManager(ABC):
"""
Abstract representation of manager of ::class:`Dataset` instances.
Expand Down
2 changes: 1 addition & 1 deletion python/lib/modeldata/dmod/modeldata/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.10.0'
__version__ = '0.11.0'
Loading