Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dascore/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ class DascoreConfig(BaseModel):
default=1_048_576,
description="Block size in bytes for general remote file downloads.",
)
remote_download_timeout: float = Field(
default=60.0,
description="Timeout in seconds for blocking remote file downloads.",
)
remote_hdf5_block_size: int = Field(
default=5_242_880,
description="Block size in bytes for remote HDF5 access on tuned protocols.",
Expand Down
72 changes: 70 additions & 2 deletions dascore/core/coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@
get_quantity_str,
percent,
)
from dascore.utils.array import _coerce_text_array, _is_text_coercible_array
from dascore.utils.array import (
_coerce_text_array,
_is_text_coercible_array,
hash_array,
)
from dascore.utils.display import get_nice_text
from dascore.utils.docs import compose_docstring
from dascore.utils.misc import (
Expand Down Expand Up @@ -450,6 +454,57 @@ def __array__(self, dtype=None, copy=False):
"""Numpy method for getting array data with `np.array(coord)`."""
return self.data

def _get_hashable_coord(self) -> Self:
"""Return a coordinate normalized for stable hashing."""
if self.units is None or dtype_time_like(self.dtype):
return self
return self.simplify_units()
Comment on lines +457 to +461
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

CoordPartial unit normalization is a no-op for the hashed scalars.

This path assumes simplify_units() also normalizes start/stop/step, but CoordPartial.convert_units is just update, so those magnitudes stay in their original units. A partial coord in centimeters and the equivalent one in meters will still hash differently.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dascore/core/coords.py` around lines 457 - 461, The hashing path currently
leaves scalar magnitudes in their original units because
CoordPartial.convert_units just updates unit metadata; modify the normalization
so start/stop/step magnitudes are converted to the target units before hashing.
Concretely, update CoordPartial.convert_units (or inside _get_hashable_coord) to
actually convert numeric magnitudes (start, stop, step) using the units
conversion (e.g., via the units library's to/convert method) rather than only
updating the units attribute, or ensure _get_hashable_coord calls simplify_units
on a copy whose start/stop/step have been converted to the simplified units;
reference the methods _get_hashable_coord, simplify_units, and
CoordPartial.convert_units and convert those scalar fields to the target units
so equivalent values in cm vs m hash identically.


@staticmethod
def _hash_scalar(value) -> tuple[str, str | None]:
"""Return a dtype-aware scalar hash token."""
if value is None:
return ("none", None)
return ("scalar", hash_array(np.asarray([value])))

@staticmethod
def _hash_array_token(values: np.ndarray) -> tuple[str, str | tuple[int, ...]]:
"""Return a hash token consistent with coord array equality."""
dtype = np.dtype(values.dtype)
if np.issubdtype(dtype, np.inexact):
# Coord equality for inexact arrays uses np.isclose semantics, so
# hashing raw bytes would violate the hash contract. Fall back to
# metadata-only hashing for these dtypes.
return ("approx-array", str(dtype), values.shape)
return ("array", hash_array(values))

def __hash__(self):
    """Return a hash built from class, units, and data-derived tokens.

    NOTE(review): ``coord.__class__`` is folded into every payload, so
    coords of different families can never hash equal.  If cross-family
    equality (e.g. a CoordRange equaling a CoordArray with the same
    data) is ever intended, this would violate the hash/eq contract —
    confirm against the __eq__ semantics.
    """
    # Normalize units first so equivalent coords hash identically.
    coord = self._get_hashable_coord()
    unit_str = coord.unit_str
    if isinstance(coord, CoordRange):
        # Evenly sampled coords hash from scalar metadata rather than
        # materialized array values.
        payload = (
            coord.__class__,
            unit_str,
            coord.shape,
            self._hash_scalar(coord.start),
            self._hash_scalar(coord.stop),
            self._hash_scalar(coord.step),
        )
        return hash(payload)
    if isinstance(coord, CoordPartial):
        # Partial coords carry no array data; include the dtype string
        # along with the scalar metadata.
        payload = (
            coord.__class__,
            unit_str,
            coord.shape,
            str(np.dtype(coord.dtype)),
            self._hash_scalar(coord.start),
            self._hash_scalar(coord.stop),
            self._hash_scalar(coord.step),
        )
        return hash(payload)
    # All other coords hash from an equality-consistent array token.
    payload = (coord.__class__, unit_str, self._hash_array_token(coord.values))
    return hash(payload)
Comment on lines +481 to +506
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Documented cross-family equality example:"
sed -n '1888,1895p' dascore/core/coords.py

echo
echo "Current __hash__ implementation:"
sed -n '481,506p' dascore/core/coords.py

echo
echo "Shared model equality implementation:"
fd 'models.py' dascore | xargs -r rg -n -C3 'class DascoreBaseModel|def __eq__'

Repository: DASDAE/dascore

Length of output: 1624


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Finding _get_hashable_coord implementation ==="
rg -n -A10 '_get_hashable_coord' dascore/core/coords.py | head -30

echo
echo "=== Finding __eq__ implementation in BaseCoord ==="
rg -n -B2 -A15 'def __eq__' dascore/core/coords.py | head -50

echo
echo "=== Checking _hash_scalar and _hash_array_token implementations ==="
rg -n -A8 'def _hash_scalar|def _hash_array_token' dascore/core/coords.py

echo
echo "=== Checking if CoordRange is converted to CoordArray anywhere ==="
rg -n 'to_array|convert.*array' dascore/core/coords.py | head -20

Repository: DASDAE/dascore

Length of output: 1057


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Class definitions and inheritance ==="
rg -n 'class BaseCoord|class CoordRange|class CoordArray' dascore/core/coords.py | head -10

echo
echo "=== Search for __eq__ in entire coords.py file ==="
rg -n '__eq__' dascore/core/coords.py

echo
echo "=== Check if BaseModel or parent provides __eq__ ==="
rg -n 'class DascoreBaseModel' dascore/models.py -A15 | head -30

echo
echo "=== Check simplify_units implementation ==="
rg -n -A20 'def simplify_units' dascore/core/coords.py

echo
echo "=== Look at entire BaseCoord class definition start ==="
sed -n '80,130p' dascore/core/coords.py

Repository: DASDAE/dascore

Length of output: 267


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== DascoreBaseModel full implementation ==="
rg -n -A50 'class DascoreBaseModel' dascore/models.py | head -70

echo
echo "=== Check if Pydantic BaseModel is used (implicit __eq__) ==="
rg -n 'from pydantic import BaseModel' dascore/models.py

echo
echo "=== Check simplify_units to understand what it does ==="
rg -n -A30 'def simplify_units' dascore/core/coords.py | head -50

echo
echo "=== Look for any coord value/data comparison methods ==="
rg -n 'values.*==|data.*==|array.*==' dascore/core/coords.py | head -20

Repository: DASDAE/dascore

Length of output: 177


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Find where DascoreBaseModel is defined ==="
rg -n 'class DascoreBaseModel' dascore/

echo
echo "=== Check imports in coords.py ==="
head -50 dascore/core/coords.py | rg -n 'DascoreBaseModel|import'

echo
echo "=== Try to find models.py in different locations ==="
fd -t f 'models.py' dascore/

echo
echo "=== Check entire dascore structure ==="
ls -la dascore/ | head -20

Repository: DASDAE/dascore

Length of output: 2247


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Full DascoreBaseModel implementation ==="
rg -n -A60 'class DascoreBaseModel' dascore/utils/models.py

echo
echo "=== Check if __eq__ is explicitly defined in DascoreBaseModel ==="
rg -n '__eq__' dascore/utils/models.py

echo
echo "=== Check full import list in coords.py ==="
head -60 dascore/core/coords.py | grep -n 'from\|import'

Repository: DASDAE/dascore

Length of output: 2255


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Find sensible_model_equals implementation ==="
rg -n -B2 -A20 'def sensible_model_equals' dascore/utils/models.py

echo
echo "=== Check what is imported from dascore.utils.models in coords.py ==="
rg -n -A10 'from dascore.utils.models import' dascore/core/coords.py

echo
echo "=== Look at CoordRange and CoordArray definitions to understand their structure ==="
rg -n -A5 'class CoordRange|class CoordArray' dascore/core/coords.py | head -30

Repository: DASDAE/dascore

Length of output: 1782


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Full CoordRange class definition ==="
sed -n '1164,1250p' dascore/core/coords.py

echo
echo "=== Full CoordArray class definition ==="
sed -n '1422,1500p' dascore/core/coords.py

echo
echo "=== Check model_fields for CoordRange ==="
rg -n 'model_fields|__init__' dascore/core/coords.py | grep -A5 -B5 'CoordRange'

Repository: DASDAE/dascore

Length of output: 6601


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Search for any override of __eq__ in CoordRange or CoordArray ==="
rg -n 'class CoordRange|class CoordArray' dascore/core/coords.py -A100 | grep -E 'def __eq__|class CoordRange|class CoordArray'

echo
echo "=== Check if there's any conversion from CoordRange to CoordArray before comparison ==="
rg -n 'to_array|values' dascore/core/coords.py | grep -i 'coordrange\|coordarray' | head -20

echo
echo "=== Look at the exact test case from the docstring to understand the claim ==="
sed -n '1888,1905p' dascore/core/coords.py

echo
echo "=== Check if there's a conversion method on CoordRange ==="
rg -n 'def values|def to_array' dascore/core/coords.py | grep -B5 -A5 'CoordRange\|1[2-3][0-9][0-9]'

Repository: DASDAE/dascore

Length of output: 1392


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Check BaseCoord.values property (line 1097) ==="
sed -n '1097,1110p' dascore/core/coords.py

echo
echo "=== Check CoordArray.values property (line 1370) ==="
sed -n '1370,1380p' dascore/core/coords.py

echo
echo "=== Check all fields defined in BaseCoord and subclasses ==="
sed -n '211,280p' dascore/core/coords.py

echo
echo "=== Now verify the actual test - run a minimal Python test to see if they compare equal ==="
python3 << 'EOF'
import sys
sys.path.insert(0, '/root/code')

import numpy as np
from dascore.core.coords import get_coord

# Create test coordinates
range_coord = get_coord(start=1, stop=12, step=1)
array_coord = get_coord(data=np.arange(1, 12, 1))

print(f"Range coord type: {type(range_coord)}")
print(f"Array coord type: {type(array_coord)}")
print(f"Range coord fields: {range_coord.model_dump().keys()}")
print(f"Array coord fields: {array_coord.model_dump().keys()}")
print(f"Are they equal? {range_coord == array_coord}")
print(f"Range coord dump: {range_coord.model_dump()}")
print(f"Array coord dump: {array_coord.model_dump()}")
EOF

Repository: DASDAE/dascore

Length of output: 5129


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Find get_coord function to see if it normalizes types ==="
rg -n -A50 'def get_coord' dascore/core/coords.py | head -80

echo
echo "=== Check what line 1370 actually is - full CoordRange.values property ==="
sed -n '1370,1400p' dascore/core/coords.py

echo
echo "=== Look for any __eq__ override in CoordRange or subclasses ==="
sed -n '1164,1420p' dascore/core/coords.py | grep -n '__eq__\|def values'

Repository: DASDAE/dascore

Length of output: 3370


Fix hash contract violation and document cross-family equality mismatch.

The documentation at line 1894 claims CoordRange and CoordArray instances should compare equal when they represent the same data, but sensible_model_equals will return False because they have different field sets—CoordRange has (start, stop, step) while CoordArray has (values). Additionally, the __hash__ implementation includes coord.__class__ in each payload and uses separate field combinations for CoordRange vs CoordArray, which would violate Python's hash contract if the equality claim were true (equal objects must have equal hash values). Either implement the cross-family equality properly by normalizing coords to a canonical representation before comparison and hashing, or update the documentation to reflect that different coord families are not equal.


@cached_method
def min(self):
"""Return min value."""
Expand Down Expand Up @@ -1016,7 +1071,20 @@ def update(self, **kwargs):
# Other operations that normally modify data do not in this case.
update_limits = update
set_units = update
convert_units = update

def convert_units(self, units) -> Self:
"""Convert scalar metadata units, or set units if none exist."""
if self.units is None or dtype_time_like(self.dtype):
return self.set_units(units=units)
out = {"units": units}
for name in ("start", "stop", "step"):
value = getattr(self, name)
out[name] = (
value
if pd.isnull(value)
else convert_units(value, to_units=units, from_units=self.units)
)
return self.new(**out)

def sort(self, reverse=False):
"""Sort dummy array. Does nothing."""
Expand Down
22 changes: 14 additions & 8 deletions dascore/utils/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ def patch_array_function(self, func, types, args, kwargs):
return apply_array_func(func, *args, **kwargs)


def hash_numpy_array(arr: np.ndarray) -> str:
def hash_array(arr: np.ndarray) -> str:
"""
Return a stable hash for a NumPy array.

Expand All @@ -644,18 +644,18 @@ def hash_numpy_array(arr: np.ndarray) -> str:
Examples
--------
>>> import numpy as np
>>> from dascore.utils.array import hash_numpy_array
>>> from dascore.utils.array import hash_array
>>> a = np.array([1.0, 2.0, 3.0])
>>> h = hash_numpy_array(a)
>>> h = hash_array(a)
>>> assert isinstance(h, str) and len(h) == 32
>>> # Same data always produces the same hash
>>> assert hash_numpy_array(a) == hash_numpy_array(a.copy())
>>> assert hash_array(a) == hash_array(a.copy())
>>> # Different dtype produces a different hash
>>> assert hash_numpy_array(a) != hash_numpy_array(a.astype(np.float32))
>>> assert hash_array(a) != hash_array(a.astype(np.float32))
"""
arr = np.asarray(arr)
if arr.dtype == object:
msg = "hash_numpy_array does not support object arrays."
msg = "hash_array does not support object arrays."
raise ParameterError(msg)

h = hashlib.blake2b(digest_size=16)
Expand All @@ -665,11 +665,17 @@ def hash_numpy_array(arr: np.ndarray) -> str:
h.update(arr.dtype.str.encode("ascii"))
h.update(np.asarray(arr.shape, dtype=np.int64).tobytes())

if arr.flags.c_contiguous:
if arr.flags.c_contiguous and arr.dtype.kind not in {"M", "m"}:
# Zero-copy fast path
h.update(memoryview(arr).cast("B"))
else:
# Canonicalize layout; this copies once
# Canonicalize layout; this also handles datetime/timedelta dtypes,
# which do not expose a Python buffer directly.
h.update(np.ascontiguousarray(arr).view(np.uint8))

return h.hexdigest()


def hash_numpy_array(arr: np.ndarray) -> str:
    """Backward-compatible alias for `hash_array` (the renamed function)."""
    digest = hash_array(arr)
    return digest
59 changes: 56 additions & 3 deletions dascore/utils/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from dascore.utils.remote_io import (
FallbackFileObj,
ensure_local_file,
get_cached_local_file,
get_local_handle,
is_no_range_http_error,
)
Expand All @@ -63,6 +64,50 @@
ns_to_timedelta = partial(pd.to_timedelta, unit="ns")


class _ManagedH5pyFile:
    """Proxy an h5py file while owning the underlying file object lifecycle.

    Closing the proxy closes the h5py handle first, then (best-effort)
    the raw file object it was opened from.
    """

    def __init__(self, handle: H5pyFile, owned_fileobj):
        self._h5_file = handle
        self._fileobj = owned_fileobj
        self._is_closed = False

    def close(self):
        """Close both the h5py file and the owned file object."""
        if self._is_closed:
            return
        try:
            self._h5_file.close()
        finally:
            # Best-effort: a failure closing the raw file object must
            # not mask an exception raised by the h5py close above.
            try:
                self._fileobj.close()
            except Exception:
                pass
            self._is_closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def __getitem__(self, item):
        return self._h5_file[item]

    def __contains__(self, item):
        return item in self._h5_file

    def __iter__(self):
        return iter(self._h5_file)

    @property
    def closed(self):
        """Return True when close has been called on the proxy."""
        return self._is_closed

    def __getattr__(self, item):
        # Delegate every other attribute lookup to the wrapped handle.
        return getattr(self._h5_file, item)


class _HDF5Store(pd.HDFStore):
"""
A work-around for pandas HDF5 store not accepting
Expand Down Expand Up @@ -473,6 +518,12 @@ class H5Reader(PyTablesReader):
mode = "r"
constructor = H5pyFile

@classmethod
def _open_fileobj_handle(cls, fileobj):
    """Open an h5py file and retain ownership of the wrapped file object."""
    h5_handle = cls.constructor(fileobj, mode=cls.mode, driver="fileobj")
    return _ManagedH5pyFile(h5_handle, fileobj)

@staticmethod
def _get_open_kwargs(resource: UPath) -> dict[str, object]:
"""Return backend-specific kwargs for remote HDF5 file objects."""
Expand All @@ -495,11 +546,13 @@ def get_handle(cls, resource):
Unlike PyTablesReader, h5py can consume a binary file object via the
``fileobj`` driver, so remote UPath inputs stay streaming-based here.
"""
if isinstance(resource, cls | H5pyFile):
if isinstance(resource, cls | H5pyFile | _ManagedH5pyFile):
return resource
if isinstance(resource, io.IOBase):
return cls.constructor(resource, mode=cls.mode, driver="fileobj")
return cls._open_fileobj_handle(resource)
if isinstance(resource, UPath):
if cached_path := get_cached_local_file(resource):
return super().get_handle(cached_path)
mode = "rb" if cls.mode == "r" else "r+b"
open_kwargs = cls._get_open_kwargs(resource)
handle = FallbackFileObj(
Expand All @@ -508,7 +561,7 @@ def get_handle(cls, resource):
error_predicate=is_no_range_http_error,
)
try:
return cls.constructor(handle, mode=cls.mode, driver="fileobj")
return cls._open_fileobj_handle(handle)
except Exception:
handle.close()
raise
Expand Down
35 changes: 28 additions & 7 deletions dascore/utils/remote_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from functools import lru_cache
from hashlib import sha256
from pathlib import Path
from urllib.request import Request, urlopen

from dascore.compat import UPath
from dascore.config import get_config
Expand Down Expand Up @@ -125,7 +126,6 @@ def _download_remote_file(path, local_path: Path):
"""Download a remote path into its cache location."""
resource = coerce_to_upath(path)
protocol = getattr(resource, "protocol", None)
open_kwargs = {"block_size": 0} if protocol in _HTTP_PROTOCOLS else {}
local_path.parent.mkdir(parents=True, exist_ok=True)
fd, temp_name = tempfile.mkstemp(
dir=local_path.parent,
Expand All @@ -135,12 +135,20 @@ def _download_remote_file(path, local_path: Path):
os.close(fd)
tmp_path = Path(temp_name)
try:
with (
resource.open("rb", **open_kwargs) as remote_fi,
tmp_path.open("wb") as local_fi,
):
while chunk := remote_fi.read(get_config().remote_download_block_size):
local_fi.write(chunk)
if protocol in _HTTP_PROTOCOLS:
headers = dict(getattr(resource, "storage_options", {}) or {})
request = Request(str(resource), headers=headers)
Comment on lines +139 to +140
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve HTTP options instead of mapping all to headers

The HTTP download path now converts resource.storage_options directly into request headers, which misinterprets common UPath/fsspec options (for example headers={...} becomes a literal headers: header, and non-header options are also injected). In authenticated or customized HTTP setups this drops the intended auth/config and can make fallback materialization fail (e.g., 401/403) when ensure_local_file is triggered.

Useful? React with 👍 / 👎.

timeout = get_config().remote_download_timeout
with (
urlopen(request, timeout=timeout) as remote_fi,
Comment on lines +138 to +143
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify how HTTP headers are represented in UPath usage across this repo.
rg -nP 'UPath\([^)]*headers\s*=' --type=py -C2
rg -nP 'storage_options.*headers|["'\'']headers["'\'']' --type=py -C2
rg -nP '\bRequest\(|\burlopen\(' dascore/utils/remote_io.py -C2

Repository: DASDAE/dascore

Length of output: 1334


🏁 Script executed:

# 1. Examine the test case to understand how storage_options are passed
sed -n '670,700p' tests/test_utils/test_io_utils.py

# 2. Find _HTTP_PROTOCOLS definition
rg -n '_HTTP_PROTOCOLS' dascore/utils/remote_io.py -B2 -A2

# 3. Look at the full context of _download_remote_file function
sed -n '120,160p' dascore/utils/remote_io.py

# 4. Check for any other storage_options usage in HTTP context
rg -n 'storage_options' dascore/utils/remote_io.py -B2 -A2

Repository: DASDAE/dascore

Length of output: 3962


🏁 Script executed:

# Examine the _HTTPResource test class definition
sed -n '660,680p' tests/test_utils/test_io_utils.py

# Check for any other HTTP storage_options examples in tests
rg -n 'storage_options.*=' tests/test_utils/test_io_utils.py -B1 -A1 | grep -A5 -B5 'http'

# Check if there's any fsspec/upath dependency that defines headers structure
rg -n 'fsspec|upath' dascore/utils/remote_io.py

# Look for any imports or references to understand the HTTP backend
head -30 dascore/utils/remote_io.py

Repository: DASDAE/dascore

Length of output: 2178


🏁 Script executed:

# Check test setup for HTTP resource to see the actual storage_options structure
sed -n '645,650p' tests/test_utils/test_io_utils.py

# Look for all storage_options structures in tests
rg -n 'storage_options\s*=' tests/test_utils/test_io_utils.py -A1

# Check if there's any documentation about storage_options format in comments/docstrings
rg -n 'storage_options' dascore/utils/remote_io.py -B3 -A3 | head -40

# Check what UPath (coerce_to_upath) is and if it has documentation about storage_options
rg -n 'class UPath|def coerce_to_upath' dascore/compat.py -A5

Repository: DASDAE/dascore

Length of output: 1266


🏁 Script executed:

# Check fsspec and upath imports to understand storage_options structure
rg -n 'from dascore.compat import' dascore/utils/remote_io.py

# Look at the compat module to see what UPath is
cat dascore/compat.py | head -50

# Search for any storage_options nesting patterns in actual usage (not just tests)
rg -n 'storage_options' dascore/ --type=py -B2 -A2 | grep -i "headers" | head -20

# Check if there are any real-world (non-test) HTTP examples with storage_options
rg -rn '"headers".*storage_options|storage_options.*"headers"' dascore/ --type=py

Repository: DASDAE/dascore

Length of output: 1488


🏁 Script executed:

# Search for any usage of UPath with HTTP in the actual codebase (not just tests)
rg -n 'UPath.*http|http.*UPath' dascore/ --type=py

# Check if there are any real-world examples of storage_options being passed
rg -n '_download_remote_file' dascore/ --type=py -B2 -A2

# Look for any integration tests or examples that use HTTP downloading
find dascore/ -name '*.py' -type f -exec grep -l 'http.*download\|download.*http' {} \;

# Check if upath/fsspec HTTP backend documentation is referenced anywhere
rg -n 'fsspec|fsspec.implementations.http' dascore/ --type=py

Repository: DASDAE/dascore

Length of output: 695


🏁 Script executed:

# Check fsspec documentation or any comments about storage_options structure
web_search query about fsspec HTTP storage_options structure

# Also check if there's any version pinning or requirements file
cat dascore/pyproject.toml 2>/dev/null | grep -i 'upath\|fsspec\|urllib' || cat setup.py 2>/dev/null | grep -i 'upath\|fsspec'

# Check the actual test to see what storage_options are expected to work
sed -n '640,695p' tests/test_utils/test_io_utils.py

Repository: DASDAE/dascore

Length of output: 2115


🌐 Web query:

fsspec HTTP backend storage_options structure headers

💡 Result:

The fsspec HTTP backend (HTTPFileSystem) accepts storage_options as keyword arguments in its constructor, which are stored in self.kwargs and passed to aiohttp session requests (e.g., session.get(url, self.kwargs)). Key storage_options include: - headers: dict of HTTP headers applied to all requests (e.g., {"Authorization": "Bearer token"}). Headers can also be overridden per-operation via kwargs in methods like open. - client_kwargs: dict passed to aiohttp.ClientSession constructor (e.g., for auth: {"auth": aiohttp.BasicAuth("user", "pass")}). - Other common options: block_size (read block size, default 5MB), simple_links (for ls), cache_type, cache_options. Example usage: fs = fsspec.filesystem("http", headers={"User-Agent": "my-app"}, client_kwargs={"timeout": aiohttp.ClientTimeout(total=30)}) with fs.open("https://example.com/file.txt") as f: data = f.read For high-level functions like fsspec.open("https://...", storage_options={"headers": {...}}). Source code confirms storage_options become self.kwargs for requests, and explicit params like client_kwargs are separate. Headers in storage_options go to per-request kwargs.

Citations:


🏁 Script executed:

# Verify if the protocol field from resource always matches the actual URL scheme
rg -n 'protocol.*=' tests/test_utils/test_io_utils.py | grep -i http

# Check if there are any cases where protocol and URL scheme could mismatch
rg -n 'coerce_to_upath' tests/test_utils/test_io_utils.py -B2 -A2 | head -30

# Check if there are any edge cases in URL construction
sed -n '125,150p' dascore/utils/remote_io.py

# Verify RemoteCacheError exists
rg -n 'class RemoteCacheError' dascore/

Repository: DASDAE/dascore

Length of output: 2146


🏁 Script executed:

# Final verification: check if there are any real-world calls to _download_remote_file with nested headers
rg -n '_download_remote_file\|materialize_remote_file' dascore/ --type=py -B3 -A3 | head -50

Repository: DASDAE/dascore

Length of output: 40


Add explicit URL scheme validation and support nested header extraction for fsspec compatibility.

Line 139 treats storage_options directly as HTTP headers. While the current test passes with flat headers (e.g., {"User-Agent": "..."}), fsspec's HTTP backend also supports headers nested under a "headers" key for consistency with other storage backends. Additionally, line 143 should validate the actual URL scheme before passing it to urlopen (S310 defense-in-depth), rather than relying solely on the resource's declared protocol.

Proposed fix
+from urllib.parse import urlsplit
 from urllib.request import Request, urlopen
@@
         if protocol in _HTTP_PROTOCOLS:
-            headers = dict(getattr(resource, "storage_options", {}) or {})
-            request = Request(str(resource), headers=headers)
+            storage_options = dict(getattr(resource, "storage_options", {}) or {})
+            raw_headers = storage_options.get("headers", storage_options)
+            headers = {str(k): str(v) for k, v in (raw_headers or {}).items()}
+            url = str(resource)
+            scheme = urlsplit(url).scheme.lower()
+            if scheme not in _HTTP_PROTOCOLS:
+                msg = f"Refusing non-HTTP URL for remote download: {url!r}"
+                raise RemoteCacheError(msg)
+            request = Request(url, headers=headers)
             timeout = get_config().remote_download_timeout
             with (
-                urlopen(request, timeout=timeout) as remote_fi,
+                urlopen(request, timeout=timeout) as remote_fi,  # noqa: S310
                 tmp_path.open("wb") as local_fi,
             ):
🧰 Tools
🪛 Ruff (0.15.9)

[error] 140-140: Audit URL open for permitted schemes. Allowing use of file: or custom schemes is often unexpected.

(S310)


[error] 143-143: Audit URL open for permitted schemes. Allowing use of file: or custom schemes is often unexpected.

(S310)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dascore/utils/remote_io.py` around lines 138 - 143, The code treats
storage_options as flat HTTP headers and trusts the resource protocol; update
the HTTP branch handling around _HTTP_PROTOCOLS, Request, and urlopen to (1)
extract headers compatibly from getattr(resource, "storage_options", {}) by
checking for a nested "headers" key first and falling back to the top-level
mapping (ensuring a dict of header strings), and (2) explicitly parse and
validate the URL scheme of str(resource) (e.g., via urllib.parse.urlparse) to
allow only "http" or "https" before calling urlopen (use
get_config().remote_download_timeout as the timeout). Ensure Request receives
the validated URL and the constructed headers.

tmp_path.open("wb") as local_fi,
):
while chunk := remote_fi.read(get_config().remote_download_block_size):
local_fi.write(chunk)
else:
with resource.open("rb") as remote_fi, tmp_path.open("wb") as local_fi:
while chunk := remote_fi.read(get_config().remote_download_block_size):
local_fi.write(chunk)
tmp_path.replace(local_path)
finally:
tmp_path.unlink(missing_ok=True)
Expand Down Expand Up @@ -201,6 +209,19 @@ def ensure_local_file(resource) -> Path:
raise TypeError(msg)


def get_cached_local_file(resource) -> Path | None:
    """Return the cached local path for one remote resource if it exists.

    Returns ``None`` for non-path inputs, for already-local paths, and
    when no cached copy is present on disk.
    """
    if not is_pathlike(resource) or is_local_path(resource):
        return None
    remote = coerce_to_upath(resource)
    cache_root = _normalize_cache_root(get_remote_cache_path())
    digest = sha256(normalize_remote_id(remote).encode()).hexdigest()
    candidate = cache_root / digest / _safe_remote_name(remote)
    return candidate if candidate.exists() else None


def get_local_handle(resource, opener):
    """Materialize a resource locally, then pass it to an opener."""
    local_path = ensure_local_file(resource)
    return opener(local_path)
Expand Down
Loading
Loading