-
Notifications
You must be signed in to change notification settings - Fork 34
implement coordinate hash #651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,11 @@ | |
| get_quantity_str, | ||
| percent, | ||
| ) | ||
| from dascore.utils.array import _coerce_text_array, _is_text_coercible_array | ||
| from dascore.utils.array import ( | ||
| _coerce_text_array, | ||
| _is_text_coercible_array, | ||
| hash_array, | ||
| ) | ||
| from dascore.utils.display import get_nice_text | ||
| from dascore.utils.docs import compose_docstring | ||
| from dascore.utils.misc import ( | ||
|
|
@@ -450,6 +454,57 @@ def __array__(self, dtype=None, copy=False): | |
| """Numpy method for getting array data with `np.array(coord)`.""" | ||
| return self.data | ||
|
|
||
| def _get_hashable_coord(self) -> Self: | ||
| """Return a coordinate normalized for stable hashing.""" | ||
| if self.units is None or dtype_time_like(self.dtype): | ||
| return self | ||
| return self.simplify_units() | ||
|
|
||
| @staticmethod | ||
| def _hash_scalar(value) -> tuple[str, str | None]: | ||
| """Return a dtype-aware scalar hash token.""" | ||
| if value is None: | ||
| return ("none", None) | ||
| return ("scalar", hash_array(np.asarray([value]))) | ||
|
|
||
| @staticmethod | ||
| def _hash_array_token(values: np.ndarray) -> tuple[str, str | tuple[int, ...]]: | ||
| """Return a hash token consistent with coord array equality.""" | ||
| dtype = np.dtype(values.dtype) | ||
| if np.issubdtype(dtype, np.inexact): | ||
| # Coord equality for inexact arrays uses np.isclose semantics, so | ||
| # hashing raw bytes would violate the hash contract. Fall back to | ||
| # metadata-only hashing for these dtypes. | ||
| return ("approx-array", str(dtype), values.shape) | ||
| return ("array", hash_array(values)) | ||
|
|
||
| def __hash__(self): | ||
| coord = self._get_hashable_coord() | ||
| unit_str = coord.unit_str | ||
| if isinstance(coord, CoordRange): | ||
| payload = ( | ||
| coord.__class__, | ||
| unit_str, | ||
| coord.shape, | ||
| self._hash_scalar(coord.start), | ||
| self._hash_scalar(coord.stop), | ||
| self._hash_scalar(coord.step), | ||
| ) | ||
| return hash(payload) | ||
| if isinstance(coord, CoordPartial): | ||
| payload = ( | ||
| coord.__class__, | ||
| unit_str, | ||
| coord.shape, | ||
| str(np.dtype(coord.dtype)), | ||
| self._hash_scalar(coord.start), | ||
| self._hash_scalar(coord.stop), | ||
| self._hash_scalar(coord.step), | ||
| ) | ||
| return hash(payload) | ||
| payload = (coord.__class__, unit_str, self._hash_array_token(coord.values)) | ||
| return hash(payload) | ||
|
Comment on lines
+481
to
+506
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "Documented cross-family equality example:"
sed -n '1888,1895p' dascore/core/coords.py
echo
echo "Current __hash__ implementation:"
sed -n '481,506p' dascore/core/coords.py
echo
echo "Shared model equality implementation:"
fd 'models.py' dascore | xargs -r rg -n -C3 'class DascoreBaseModel|def __eq__'Repository: DASDAE/dascore Length of output: 1624 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Finding _get_hashable_coord implementation ==="
rg -n -A10 '_get_hashable_coord' dascore/core/coords.py | head -30
echo
echo "=== Finding __eq__ implementation in BaseCoord ==="
rg -n -B2 -A15 'def __eq__' dascore/core/coords.py | head -50
echo
echo "=== Checking _hash_scalar and _hash_array_token implementations ==="
rg -n -A8 'def _hash_scalar|def _hash_array_token' dascore/core/coords.py
echo
echo "=== Checking if CoordRange is converted to CoordArray anywhere ==="
rg -n 'to_array|convert.*array' dascore/core/coords.py | head -20Repository: DASDAE/dascore Length of output: 1057 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Class definitions and inheritance ==="
rg -n 'class BaseCoord|class CoordRange|class CoordArray' dascore/core/coords.py | head -10
echo
echo "=== Search for __eq__ in entire coords.py file ==="
rg -n '__eq__' dascore/core/coords.py
echo
echo "=== Check if BaseModel or parent provides __eq__ ==="
rg -n 'class DascoreBaseModel' dascore/models.py -A15 | head -30
echo
echo "=== Check simplify_units implementation ==="
rg -n -A20 'def simplify_units' dascore/core/coords.py
echo
echo "=== Look at entire BaseCoord class definition start ==="
sed -n '80,130p' dascore/core/coords.pyRepository: DASDAE/dascore Length of output: 267 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== DascoreBaseModel full implementation ==="
rg -n -A50 'class DascoreBaseModel' dascore/models.py | head -70
echo
echo "=== Check if Pydantic BaseModel is used (implicit __eq__) ==="
rg -n 'from pydantic import BaseModel' dascore/models.py
echo
echo "=== Check simplify_units to understand what it does ==="
rg -n -A30 'def simplify_units' dascore/core/coords.py | head -50
echo
echo "=== Look for any coord value/data comparison methods ==="
rg -n 'values.*==|data.*==|array.*==' dascore/core/coords.py | head -20Repository: DASDAE/dascore Length of output: 177 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Find where DascoreBaseModel is defined ==="
rg -n 'class DascoreBaseModel' dascore/
echo
echo "=== Check imports in coords.py ==="
head -50 dascore/core/coords.py | rg -n 'DascoreBaseModel|import'
echo
echo "=== Try to find models.py in different locations ==="
fd -t f 'models.py' dascore/
echo
echo "=== Check entire dascore structure ==="
ls -la dascore/ | head -20Repository: DASDAE/dascore Length of output: 2247 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Full DascoreBaseModel implementation ==="
rg -n -A60 'class DascoreBaseModel' dascore/utils/models.py
echo
echo "=== Check if __eq__ is explicitly defined in DascoreBaseModel ==="
rg -n '__eq__' dascore/utils/models.py
echo
echo "=== Check full import list in coords.py ==="
head -60 dascore/core/coords.py | grep -n 'from\|import'Repository: DASDAE/dascore Length of output: 2255 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Find sensible_model_equals implementation ==="
rg -n -B2 -A20 'def sensible_model_equals' dascore/utils/models.py
echo
echo "=== Check what is imported from dascore.utils.models in coords.py ==="
rg -n -A10 'from dascore.utils.models import' dascore/core/coords.py
echo
echo "=== Look at CoordRange and CoordArray definitions to understand their structure ==="
rg -n -A5 'class CoordRange|class CoordArray' dascore/core/coords.py | head -30Repository: DASDAE/dascore Length of output: 1782 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Full CoordRange class definition ==="
sed -n '1164,1250p' dascore/core/coords.py
echo
echo "=== Full CoordArray class definition ==="
sed -n '1422,1500p' dascore/core/coords.py
echo
echo "=== Check model_fields for CoordRange ==="
rg -n 'model_fields|__init__' dascore/core/coords.py | grep -A5 -B5 'CoordRange'Repository: DASDAE/dascore Length of output: 6601 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Search for any override of __eq__ in CoordRange or CoordArray ==="
rg -n 'class CoordRange|class CoordArray' dascore/core/coords.py -A100 | grep -E 'def __eq__|class CoordRange|class CoordArray'
echo
echo "=== Check if there's any conversion from CoordRange to CoordArray before comparison ==="
rg -n 'to_array|values' dascore/core/coords.py | grep -i 'coordrange\|coordarray' | head -20
echo
echo "=== Look at the exact test case from the docstring to understand the claim ==="
sed -n '1888,1905p' dascore/core/coords.py
echo
echo "=== Check if there's a conversion method on CoordRange ==="
rg -n 'def values|def to_array' dascore/core/coords.py | grep -B5 -A5 'CoordRange\|1[2-3][0-9][0-9]'Repository: DASDAE/dascore Length of output: 1392 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Check BaseCoord.values property (line 1097) ==="
sed -n '1097,1110p' dascore/core/coords.py
echo
echo "=== Check CoordArray.values property (line 1370) ==="
sed -n '1370,1380p' dascore/core/coords.py
echo
echo "=== Check all fields defined in BaseCoord and subclasses ==="
sed -n '211,280p' dascore/core/coords.py
echo
echo "=== Now verify the actual test - run a minimal Python test to see if they compare equal ==="
python3 << 'EOF'
import sys
sys.path.insert(0, '/root/code')
import numpy as np
from dascore.core.coords import get_coord
# Create test coordinates
range_coord = get_coord(start=1, stop=12, step=1)
array_coord = get_coord(data=np.arange(1, 12, 1))
print(f"Range coord type: {type(range_coord)}")
print(f"Array coord type: {type(array_coord)}")
print(f"Range coord fields: {range_coord.model_dump().keys()}")
print(f"Array coord fields: {array_coord.model_dump().keys()}")
print(f"Are they equal? {range_coord == array_coord}")
print(f"Range coord dump: {range_coord.model_dump()}")
print(f"Array coord dump: {array_coord.model_dump()}")
EOFRepository: DASDAE/dascore Length of output: 5129 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== Find get_coord function to see if it normalizes types ==="
rg -n -A50 'def get_coord' dascore/core/coords.py | head -80
echo
echo "=== Check what line 1370 actually is - full CoordRange.values property ==="
sed -n '1370,1400p' dascore/core/coords.py
echo
echo "=== Look for any __eq__ override in CoordRange or subclasses ==="
sed -n '1164,1420p' dascore/core/coords.py | grep -n '__eq__\|def values'Repository: DASDAE/dascore Length of output: 3370 Fix hash contract violation and document cross-family equality mismatch. The documentation at line 1894 claims |
||
|
|
||
| @cached_method | ||
| def min(self): | ||
| """Return min value.""" | ||
|
|
@@ -1016,7 +1071,20 @@ def update(self, **kwargs): | |
| # Other operations that normally modify data do not in this case. | ||
| update_limits = update | ||
| set_units = update | ||
| convert_units = update | ||
|
|
||
| def convert_units(self, units) -> Self: | ||
| """Convert scalar metadata units, or set units if none exist.""" | ||
| if self.units is None or dtype_time_like(self.dtype): | ||
| return self.set_units(units=units) | ||
| out = {"units": units} | ||
| for name in ("start", "stop", "step"): | ||
| value = getattr(self, name) | ||
| out[name] = ( | ||
| value | ||
| if pd.isnull(value) | ||
| else convert_units(value, to_units=units, from_units=self.units) | ||
| ) | ||
| return self.new(**out) | ||
|
|
||
| def sort(self, reverse=False): | ||
| """Sort dummy array. Does nothing.""" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| from functools import lru_cache | ||
| from hashlib import sha256 | ||
| from pathlib import Path | ||
| from urllib.request import Request, urlopen | ||
|
|
||
| from dascore.compat import UPath | ||
| from dascore.config import get_config | ||
|
|
@@ -125,7 +126,6 @@ def _download_remote_file(path, local_path: Path): | |
| """Download a remote path into its cache location.""" | ||
| resource = coerce_to_upath(path) | ||
| protocol = getattr(resource, "protocol", None) | ||
| open_kwargs = {"block_size": 0} if protocol in _HTTP_PROTOCOLS else {} | ||
| local_path.parent.mkdir(parents=True, exist_ok=True) | ||
| fd, temp_name = tempfile.mkstemp( | ||
| dir=local_path.parent, | ||
|
|
@@ -135,12 +135,20 @@ def _download_remote_file(path, local_path: Path): | |
| os.close(fd) | ||
| tmp_path = Path(temp_name) | ||
| try: | ||
| with ( | ||
| resource.open("rb", **open_kwargs) as remote_fi, | ||
| tmp_path.open("wb") as local_fi, | ||
| ): | ||
| while chunk := remote_fi.read(get_config().remote_download_block_size): | ||
| local_fi.write(chunk) | ||
| if protocol in _HTTP_PROTOCOLS: | ||
| headers = dict(getattr(resource, "storage_options", {}) or {}) | ||
| request = Request(str(resource), headers=headers) | ||
|
Comment on lines
+139
to
+140
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The HTTP download path now converts Useful? React with 👍 / 👎. |
||
| timeout = get_config().remote_download_timeout | ||
| with ( | ||
| urlopen(request, timeout=timeout) as remote_fi, | ||
|
Comment on lines
+138
to
+143
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: #!/bin/bash
# Verify how HTTP headers are represented in UPath usage across this repo.
rg -nP 'UPath\([^)]*headers\s*=' --type=py -C2
rg -nP 'storage_options.*headers|["'\'']headers["'\'']' --type=py -C2
rg -nP '\bRequest\(|\burlopen\(' dascore/utils/remote_io.py -C2Repository: DASDAE/dascore Length of output: 1334 🏁 Script executed: # 1. Examine the test case to understand how storage_options are passed
sed -n '670,700p' tests/test_utils/test_io_utils.py
# 2. Find _HTTP_PROTOCOLS definition
rg -n '_HTTP_PROTOCOLS' dascore/utils/remote_io.py -B2 -A2
# 3. Look at the full context of _download_remote_file function
sed -n '120,160p' dascore/utils/remote_io.py
# 4. Check for any other storage_options usage in HTTP context
rg -n 'storage_options' dascore/utils/remote_io.py -B2 -A2Repository: DASDAE/dascore Length of output: 3962 🏁 Script executed: # Examine the _HTTPResource test class definition
sed -n '660,680p' tests/test_utils/test_io_utils.py
# Check for any other HTTP storage_options examples in tests
rg -n 'storage_options.*=' tests/test_utils/test_io_utils.py -B1 -A1 | grep -A5 -B5 'http'
# Check if there's any fsspec/upath dependency that defines headers structure
rg -n 'fsspec|upath' dascore/utils/remote_io.py
# Look for any imports or references to understand the HTTP backend
head -30 dascore/utils/remote_io.pyRepository: DASDAE/dascore Length of output: 2178 🏁 Script executed: # Check test setup for HTTP resource to see the actual storage_options structure
sed -n '645,650p' tests/test_utils/test_io_utils.py
# Look for all storage_options structures in tests
rg -n 'storage_options\s*=' tests/test_utils/test_io_utils.py -A1
# Check if there's any documentation about storage_options format in comments/docstrings
rg -n 'storage_options' dascore/utils/remote_io.py -B3 -A3 | head -40
# Check what UPath (coerce_to_upath) is and if it has documentation about storage_options
rg -n 'class UPath|def coerce_to_upath' dascore/compat.py -A5Repository: DASDAE/dascore Length of output: 1266 🏁 Script executed: # Check fsspec and upath imports to understand storage_options structure
rg -n 'from dascore.compat import' dascore/utils/remote_io.py
# Look at the compat module to see what UPath is
cat dascore/compat.py | head -50
# Search for any storage_options nesting patterns in actual usage (not just tests)
rg -n 'storage_options' dascore/ --type=py -B2 -A2 | grep -i "headers" | head -20
# Check if there are any real-world (non-test) HTTP examples with storage_options
rg -rn '"headers".*storage_options|storage_options.*"headers"' dascore/ --type=pyRepository: DASDAE/dascore Length of output: 1488 🏁 Script executed: # Search for any usage of UPath with HTTP in the actual codebase (not just tests)
rg -n 'UPath.*http|http.*UPath' dascore/ --type=py
# Check if there are any real-world examples of storage_options being passed
rg -n '_download_remote_file' dascore/ --type=py -B2 -A2
# Look for any integration tests or examples that use HTTP downloading
find dascore/ -name '*.py' -type f -exec grep -l 'http.*download\|download.*http' {} \;
# Check if upath/fsspec HTTP backend documentation is referenced anywhere
rg -n 'fsspec|fsspec.implementations.http' dascore/ --type=pyRepository: DASDAE/dascore Length of output: 695 🏁 Script executed: # Check fsspec documentation or any comments about storage_options structure
web_search query about fsspec HTTP storage_options structure
# Also check if there's any version pinning or requirements file
cat dascore/pyproject.toml 2>/dev/null | grep -i 'upath\|fsspec\|urllib' || cat setup.py 2>/dev/null | grep -i 'upath\|fsspec'
# Check the actual test to see what storage_options are expected to work
sed -n '640,695p' tests/test_utils/test_io_utils.pyRepository: DASDAE/dascore Length of output: 2115 🌐 Web query:
💡 Result: The fsspec HTTP backend (HTTPFileSystem) accepts storage_options as keyword arguments in its constructor, which are stored in self.kwargs and passed to aiohttp session requests (e.g., session.get(url, self.kwargs)). Key storage_options include: - headers: dict of HTTP headers applied to all requests (e.g., {"Authorization": "Bearer token"}). Headers can also be overridden per-operation via kwargs in methods like open. - client_kwargs: dict passed to aiohttp.ClientSession constructor (e.g., for auth: {"auth": aiohttp.BasicAuth("user", "pass")}). - Other common options: block_size (read block size, default 5MB), simple_links (for ls), cache_type, cache_options. Example usage: fs = fsspec.filesystem("http", headers={"User-Agent": "my-app"}, client_kwargs={"timeout": aiohttp.ClientTimeout(total=30)}) with fs.open("https://example.com/file.txt") as f: data = f.read For high-level functions like fsspec.open("https://...", storage_options={"headers": {...}}). Source code confirms storage_options become self.kwargs for requests, and explicit params like client_kwargs are separate. Headers in storage_options go to per-request kwargs. Citations:
🏁 Script executed: # Verify if the protocol field from resource always matches the actual URL scheme
rg -n 'protocol.*=' tests/test_utils/test_io_utils.py | grep -i http
# Check if there are any cases where protocol and URL scheme could mismatch
rg -n 'coerce_to_upath' tests/test_utils/test_io_utils.py -B2 -A2 | head -30
# Check if there are any edge cases in URL construction
sed -n '125,150p' dascore/utils/remote_io.py
# Verify RemoteCacheError exists
rg -n 'class RemoteCacheError' dascore/Repository: DASDAE/dascore Length of output: 2146 🏁 Script executed: # Final verification: check if there are any real-world calls to _download_remote_file with nested headers
rg -n '_download_remote_file\|materialize_remote_file' dascore/ --type=py -B3 -A3 | head -50Repository: DASDAE/dascore Length of output: 40 Add explicit URL scheme validation and support nested header extraction for fsspec compatibility. Line 139 treats Proposed fix+from urllib.parse import urlsplit
from urllib.request import Request, urlopen
@@
if protocol in _HTTP_PROTOCOLS:
- headers = dict(getattr(resource, "storage_options", {}) or {})
- request = Request(str(resource), headers=headers)
+ storage_options = dict(getattr(resource, "storage_options", {}) or {})
+ raw_headers = storage_options.get("headers", storage_options)
+ headers = {str(k): str(v) for k, v in (raw_headers or {}).items()}
+ url = str(resource)
+ scheme = urlsplit(url).scheme.lower()
+ if scheme not in _HTTP_PROTOCOLS:
+ msg = f"Refusing non-HTTP URL for remote download: {url!r}"
+ raise RemoteCacheError(msg)
+ request = Request(url, headers=headers)
timeout = get_config().remote_download_timeout
with (
- urlopen(request, timeout=timeout) as remote_fi,
+ urlopen(request, timeout=timeout) as remote_fi, # noqa: S310
tmp_path.open("wb") as local_fi,
):🧰 Tools🪛 Ruff (0.15.9)[error] 140-140: Audit URL open for permitted schemes. Allowing use of (S310) [error] 143-143: Audit URL open for permitted schemes. Allowing use of (S310) 🤖 Prompt for AI Agents |
||
| tmp_path.open("wb") as local_fi, | ||
| ): | ||
| while chunk := remote_fi.read(get_config().remote_download_block_size): | ||
| local_fi.write(chunk) | ||
| else: | ||
| with resource.open("rb") as remote_fi, tmp_path.open("wb") as local_fi: | ||
| while chunk := remote_fi.read(get_config().remote_download_block_size): | ||
| local_fi.write(chunk) | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| tmp_path.replace(local_path) | ||
| finally: | ||
| tmp_path.unlink(missing_ok=True) | ||
|
|
@@ -201,6 +209,19 @@ def ensure_local_file(resource) -> Path: | |
| raise TypeError(msg) | ||
|
|
||
|
|
||
| def get_cached_local_file(resource) -> Path | None: | ||
| """Return the cached local path for one remote resource if it exists.""" | ||
| if not is_pathlike(resource) or is_local_path(resource): | ||
| return None | ||
| remote = coerce_to_upath(resource) | ||
| cache_root = _normalize_cache_root(get_remote_cache_path()) | ||
| remote_id = normalize_remote_id(remote) | ||
| local_path = ( | ||
| cache_root / sha256(remote_id.encode()).hexdigest() / _safe_remote_name(remote) | ||
| ) | ||
| return local_path if local_path.exists() else None | ||
|
|
||
|
|
||
| def get_local_handle(resource, opener): | ||
| """Materialize a resource locally, then pass it to an opener.""" | ||
| return opener(ensure_local_file(resource)) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`CoordPartial` unit normalization is a no-op for the hashed scalars. This path assumes
`simplify_units()` also normalizes `start`/`stop`/`step`, but `CoordPartial.convert_units` is just `update`, so those magnitudes stay in their original units. A partial coord in centimeters and the equivalent one in meters will still hash differently.🤖 Prompt for AI Agents