17 changes: 15 additions & 2 deletions apis/python/src/tiledb/vector_search/ingestion.py
@@ -54,6 +54,7 @@ def ingest(
config: Optional[Mapping[str, Any]] = None,
namespace: Optional[str] = None,
size: int = -1,
dimensions: int = -1,
partitions: int = -1,
num_subspaces: int = -1,
l_build: int = -1,
@@ -122,6 +123,10 @@ def ingest(
size: int
Number of input vectors; if not provided, the full size of the input dataset is used.
If provided, only the first size vectors from the input source are ingested.
dimensions: int
Number of vector dimensions; if not provided, the dimensions detected from the input dataset are used.
If provided, this overrides the dimensions detected by read_source_metadata. This is only used when
input_vectors is not provided; otherwise it is ignored.
partitions: int
For IVF_FLAT and IVF_PQ indexes, the number of partitions to generate from the data during k-means clustering.
If not provided, it is auto-configured based on the dataset size.
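
For reference, a minimal usage sketch of the new parameter (both URIs below are hypothetical placeholders); the call mirrors the override exercised in the tests further down:

from tiledb.vector_search.ingestion import ingest

# The source array's schema domain is wider than the true vector width,
# so we override what read_source_metadata would otherwise detect.
# Both URIs are hypothetical placeholders.
index = ingest(
    index_type="FLAT",
    index_uri="/tmp/my_index",
    source_uri="/tmp/source_array",
    source_type="TILEDB_ARRAY",
    dimensions=64,
)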
@@ -3052,15 +3057,23 @@ def consolidate_and_vacuum(

if input_vectors is not None:
in_size = input_vectors.shape[0]
dimensions = input_vectors.shape[1]
# When input_vectors is provided, use its shape directly (the dimensions parameter is ignored)
dimensions = int(input_vectors.shape[1])
vector_type = input_vectors.dtype
source_type = "TILEDB_ARRAY"
logger.debug("Using dimensions from input_vectors: %d", dimensions)
else:
if source_type is None:
source_type = autodetect_source_type(source_uri=source_uri)
in_size, dimensions, vector_type = read_source_metadata(
in_size, detected_dimensions, vector_type = read_source_metadata(
source_uri=source_uri, source_type=source_type
)
# Use provided dimensions if specified, otherwise use detected dimensions
if dimensions == -1:
dimensions = int(detected_dimensions)
logger.debug(
"Using dimensions: %d (detected: %d)", dimensions, detected_dimensions
)
logger.debug("Ingesting Vectors into %r", index_group_uri)
arrays_created = False
if is_type_erased_index(index_type):
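
Taken together, the precedence is: the shape of input_vectors always wins when vectors are passed in memory; otherwise an explicit dimensions argument overrides detection; otherwise the value detected by read_source_metadata is used. A standalone sketch of that resolution logic (resolve_dimensions is a hypothetical helper, not part of the module):

from typing import Optional

import numpy as np

def resolve_dimensions(
    input_vectors: Optional[np.ndarray],
    dimensions: int,
    detected_dimensions: int,
) -> int:
    # Hypothetical helper mirroring the diff above.
    if input_vectors is not None:
        # The shape of in-memory vectors always wins.
        return int(input_vectors.shape[1])
    if dimensions != -1:
        # Explicit user override of the detected value.
        return int(dimensions)
    return int(detected_dimensions)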
173 changes: 173 additions & 0 deletions apis/python/test/test_ingestion.py
@@ -2010,3 +2010,176 @@ def test_ivf_flat_taskgraph_query(tmp_path):
queries, k=k, nprobe=nprobe, nthreads=8, mode=Mode.LOCAL, num_partitions=10
)
assert accuracy(result, gt_i) > MINIMUM_ACCURACY


def test_dimensions_parameter_override(tmp_path):
"""
Test the dimensions parameter functionality with TileDB array input.

This test verifies that the dimensions parameter can override
the dimensions detected from the source array, which is useful
for handling cases where the source array has an artificially
large domain (e.g., due to TileDBSOMA; https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564).
"""
# Create test data
actual_dimensions = 64
nb = 1000
nq = 10
k = 5

# Create random test vectors with actual dimensions
test_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
queries = np.random.rand(nq, actual_dimensions).astype(np.float32)

# Create a TileDB array with an artificially large domain (simulating the problem)
source_uri = os.path.join(tmp_path, "source_array")
large_domain_value = 100000

# Create schema with large dimension domain
schema = tiledb.ArraySchema(
domain=tiledb.Domain(
tiledb.Dim(
name="__dim_0",
domain=(0, large_domain_value),
tile=1000,
dtype="int32",
),
tiledb.Dim(
name="__dim_1",
domain=(0, large_domain_value),
tile=actual_dimensions,
dtype="int32",
),
),
attrs=[
tiledb.Attr(name="values", dtype="float32", var=False, nullable=False),
],
cell_order="col-major",
tile_order="col-major",
capacity=10000,
sparse=False,
)

# Create the array and write test data
tiledb.Array.create(source_uri, schema)
with tiledb.open(source_uri, "w") as A:
A[0:nb, 0:actual_dimensions] = test_vectors

# Test ingestion with dimensions parameter override
# Without the override, the large domain would be detected as 100001 dimensions
# With the override, we explicitly set it to the actual dimensions (64)
index_uri = os.path.join(tmp_path, "test_index")

index = ingest(
index_type="FLAT",
index_uri=index_uri,
source_uri=source_uri,
source_type="TILEDB_ARRAY",
dimensions=actual_dimensions, # Override the detected large dimensions
size=nb,
)

# Verify the index was created successfully
assert index is not None
index.vacuum()

# Verify the index works correctly with queries
distances, indices = index.query(queries, k=k)

# Basic sanity checks
assert distances.shape == (nq, k)
assert indices.shape == (nq, k)
assert np.all(indices >= 0)
assert np.all(indices < nb)

# Verify that dimensions=-1 (or omitting the parameter) uses the large detected dimensions and creates an unusable index
# This demonstrates the problem that the dimensions parameter is meant to solve
index_uri_2 = os.path.join(tmp_path, "test_index_2")

# Create with explicit dimensions=-1; this uses the large detected dimensions
# The index creation will succeed, but queries will fail due to dimension mismatch
index_2 = ingest(
index_type="FLAT",
index_uri=index_uri_2,
source_uri=source_uri,
source_type="TILEDB_ARRAY",
dimensions=-1, # Uses detected large dimensions (100001)
size=nb,
)

assert index_2 is not None
index_2.vacuum()

# Verify that the index was created with the large detected dimensions
assert index_2.dimensions == large_domain_value + 1 # 100001 dimensions

# Verify that queries fail due to dimension mismatch
# This demonstrates why the dimensions parameter override is needed
with pytest.raises(Exception) as exc_info:
index_2.query(queries, k=k)
assert (
"A query in queries has 64 dimensions, but the indexed data had 100001 dimensions"
in str(exc_info.value)
) # Should contain dimension mismatch error
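
To see where the 100001 figure comes from, one can inspect the schema of the source array created above directly; this sketch assumes detection derives from the full extent of the second schema dimension:

import tiledb

schema = tiledb.ArraySchema.load(source_uri)
lo, hi = schema.domain.dim(1).domain  # (0, 100000) for the array above
print(hi - lo + 1)  # 100001, the value detection would report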


def test_dimensions_parameter_with_numpy_input(tmp_path):
"""
Test the dimensions parameter with numpy input vectors.

This ensures that when input_vectors is provided as a numpy array,
the dimensions parameter is ignored and the shape of the input is used instead.
"""
# Create test data
nb = 100
actual_dimensions = 32
nq = 5
k = 3

# Create random test vectors
input_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
queries = np.random.rand(nq, actual_dimensions).astype(np.float32)

# Ingest with numpy input and dimensions parameter (should be ignored since input_vectors is provided)
index_uri = os.path.join(tmp_path, "test_numpy_index")

# When input_vectors is provided, the dimensions parameter should not affect the detected dimensions
# but the function should still accept it without error
index = ingest(
index_type="FLAT",
index_uri=index_uri,
input_vectors=input_vectors,
dimensions=999, # This should be ignored since input_vectors is provided
)

# Verify the index was created successfully
assert index is not None
index.vacuum()

# Test that queries work correctly
distances, indices = index.query(queries, k=k)

# Basic sanity checks
assert distances.shape == (nq, k)
assert indices.shape == (nq, k)
assert np.all(indices >= 0)
assert np.all(indices < nb)

# Verify that dimensions parameter doesn't cause issues with default behavior
index_uri_2 = os.path.join(tmp_path, "test_numpy_index_2")

# Test without dimensions parameter (default behavior)
index_2 = ingest(
index_type="FLAT",
index_uri=index_uri_2,
input_vectors=input_vectors,
# No dimensions parameter - should work as before
)

assert index_2 is not None
index_2.vacuum()

# Verify the second index also produces well-formed results
distances_2, indices_2 = index_2.query(queries, k=k)
assert distances_2.shape == (nq, k)
assert indices_2.shape == (nq, k)