From 1bc5188c5368688d91e91cdea43eab1f0d1696ef Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Tue, 9 Sep 2025 18:10:27 +0300
Subject: [PATCH 1/2] Add dimensions parameter to `ingest` method

---
 .../src/tiledb/vector_search/ingestion.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
index b5652be62..67a03f321 100644
--- a/apis/python/src/tiledb/vector_search/ingestion.py
+++ b/apis/python/src/tiledb/vector_search/ingestion.py
@@ -54,6 +54,7 @@ def ingest(
     config: Optional[Mapping[str, Any]] = None,
     namespace: Optional[str] = None,
     size: int = -1,
+    dimensions: int = -1,
     partitions: int = -1,
     num_subspaces: int = -1,
     l_build: int = -1,
@@ -122,6 +123,10 @@
     size: int
         Number of input vectors, if not provided use the full size of the input dataset.
         If provided, we filter the first vectors from the input source.
+    dimensions: int
+        Number of vector dimensions, if not provided use the dimensions detected from the input dataset.
+        If provided, this overrides the dimensions detected by read_source_metadata. This is only used
+        when input_vectors is not provided; otherwise it is ignored.
     partitions: int
         For IVF_FLAT and IVF_PQ indexes, the number of partitions to generate from the data during k-means
         clustering. If not provided, is auto-configured based on the dataset size.
@@ -3052,15 +3057,23 @@ def consolidate_and_vacuum(
     if input_vectors is not None:
         in_size = input_vectors.shape[0]
-        dimensions = input_vectors.shape[1]
+        # When input_vectors is provided, use detected dimensions (ignore dimensions parameter)
+        dimensions = int(input_vectors.shape[1])
         vector_type = input_vectors.dtype
         source_type = "TILEDB_ARRAY"
+        logger.debug("Using dimensions from input_vectors: %d", dimensions)
     else:
         if source_type is None:
             source_type = autodetect_source_type(source_uri=source_uri)
-        in_size, dimensions, vector_type = read_source_metadata(
+        in_size, detected_dimensions, vector_type = read_source_metadata(
             source_uri=source_uri, source_type=source_type
         )
+        # Use provided dimensions if specified, otherwise use detected dimensions
+        if dimensions == -1:
+            dimensions = int(detected_dimensions)
+        logger.debug(
+            "Using dimensions: %d (detected: %d)", dimensions, detected_dimensions
+        )
     logger.debug("Ingesting Vectors into %r", index_group_uri)
     arrays_created = False
     if is_type_erased_index(index_type):

From 22a0f6dbf8b9b0b037f6630412c2354528e08cfc Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Tue, 9 Sep 2025 18:10:38 +0300
Subject: [PATCH 2/2] Add tests

---
 apis/python/test/test_ingestion.py | 173 +++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)

diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
index 4b4450473..9c44ae500 100644
--- a/apis/python/test/test_ingestion.py
+++ b/apis/python/test/test_ingestion.py
@@ -2010,3 +2010,176 @@ def test_ivf_flat_taskgraph_query(tmp_path):
         queries, k=k, nprobe=nprobe, nthreads=8, mode=Mode.LOCAL, num_partitions=10
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+
+def test_dimensions_parameter_override(tmp_path):
+    """
+    Test the dimensions parameter functionality with TileDB array input.
+
+    This test verifies that the dimensions parameter can override
+    the dimensions detected from the source array, which is useful
+    for handling cases where the source array has an artificially
+    large domain (e.g., due to TileDBSOMA; https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564).
+    """
+    # Create test data
+    actual_dimensions = 64
+    nb = 1000
+    nq = 10
+    k = 5
+
+    # Create random test vectors with actual dimensions
+    test_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
+    queries = np.random.rand(nq, actual_dimensions).astype(np.float32)
+
+    # Create a TileDB array with an artificially large domain (simulating the problem)
+    source_uri = os.path.join(tmp_path, "source_array")
+    large_domain_value = 100000
+
+    # Create schema with large dimension domain
+    schema = tiledb.ArraySchema(
+        domain=tiledb.Domain(
+            tiledb.Dim(
+                name="__dim_0",
+                domain=(0, large_domain_value),
+                tile=1000,
+                dtype="int32",
+            ),
+            tiledb.Dim(
+                name="__dim_1",
+                domain=(0, large_domain_value),
+                tile=actual_dimensions,
+                dtype="int32",
+            ),
+        ),
+        attrs=[
+            tiledb.Attr(name="values", dtype="float32", var=False, nullable=False),
+        ],
+        cell_order="col-major",
+        tile_order="col-major",
+        capacity=10000,
+        sparse=False,
+    )
+
+    # Create the array and write test data
+    tiledb.Array.create(source_uri, schema)
+    with tiledb.open(source_uri, "w") as A:
+        A[0:nb, 0:actual_dimensions] = test_vectors
+
+    # Test ingestion with the dimensions parameter override
+    # Without the override, the large domain would be detected as 100001 dimensions
+    # With the override, we explicitly set it to the actual dimensions (64)
+    index_uri = os.path.join(tmp_path, "test_index")
+
+    index = ingest(
+        index_type="FLAT",
+        index_uri=index_uri,
+        source_uri=source_uri,
+        source_type="TILEDB_ARRAY",
+        dimensions=actual_dimensions,  # Override the detected large dimensions
+        size=nb,
+    )
+
+    # Verify the index was created successfully
+    assert index is not None
+    index.vacuum()
+
+    # Verify the index works correctly with queries
+    distances, indices = index.query(queries, k=k)
+
+    # Basic sanity checks
+    assert distances.shape == (nq, k)
+    assert indices.shape == (nq, k)
+    assert np.all(indices >= 0)
+    assert np.all(indices < nb)
+
+    # Verify that dimensions=-1 (or not passing it at all) detects the large dimensions but creates an unusable index
+    # This demonstrates the problem that the dimensions parameter is meant to solve
+    index_uri_2 = os.path.join(tmp_path, "test_index_2")
+
+    # Create with explicit dimensions=-1: this will use the large detected dimensions
+    # The index creation will succeed, but queries will fail due to dimension mismatch
+    index_2 = ingest(
+        index_type="FLAT",
+        index_uri=index_uri_2,
+        source_uri=source_uri,
+        source_type="TILEDB_ARRAY",
+        dimensions=-1,  # Uses detected large dimensions (100001)
+        size=nb,
+    )
+
+    assert index_2 is not None
+    index_2.vacuum()
+
+    # Verify that the index was created with the large detected dimensions
+    assert index_2.dimensions == large_domain_value + 1  # 100001 dimensions
+
+    # Verify that queries fail due to dimension mismatch
+    # This demonstrates why the dimensions parameter override is needed
+    with pytest.raises(Exception) as exc_info:
+        index_2.query(queries, k=k)
+    assert (
+        "A query in queries has 64 dimensions, but the indexed data had 100001 dimensions"
+        in str(exc_info.value)
+    )  # Should contain dimension mismatch error
+
+
+def test_dimensions_parameter_with_numpy_input(tmp_path):
+    """
+    Test the dimensions parameter with numpy input vectors.
+
+    This ensures that when input_vectors is provided as a numpy array,
+    the dimensions parameter is ignored and the array's shape is used instead.
+    """
+    # Create test data
+    nb = 100
+    actual_dimensions = 32
+    nq = 5
+    k = 3
+
+    # Create random test vectors
+    input_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
+    queries = np.random.rand(nq, actual_dimensions).astype(np.float32)
+
+    # Ingest with numpy input and the dimensions parameter (should be ignored since input_vectors is provided)
+    index_uri = os.path.join(tmp_path, "test_numpy_index")
+
+    # When input_vectors is provided, the dimensions parameter should not affect the detected dimensions,
+    # but the function should still accept it without error
+    index = ingest(
+        index_type="FLAT",
+        index_uri=index_uri,
+        input_vectors=input_vectors,
+        dimensions=999,  # This should be ignored since input_vectors is provided
+    )
+
+    # Verify the index was created successfully
+    assert index is not None
+    index.vacuum()
+
+    # Test that queries work correctly
+    distances, indices = index.query(queries, k=k)
+
+    # Basic sanity checks
+    assert distances.shape == (nq, k)
+    assert indices.shape == (nq, k)
+    assert np.all(indices >= 0)
+    assert np.all(indices < nb)
+
+    # Verify that omitting the dimensions parameter preserves the default behavior
+    index_uri_2 = os.path.join(tmp_path, "test_numpy_index_2")
+
+    # Test without the dimensions parameter (default behavior)
+    index_2 = ingest(
+        index_type="FLAT",
+        index_uri=index_uri_2,
+        input_vectors=input_vectors,
+        # No dimensions parameter: should work as before
+    )
+
+    assert index_2 is not None
+    index_2.vacuum()
+
+    # Verify the second index also returns well-formed results
+    distances_2, indices_2 = index_2.query(queries, k=k)
+    assert distances_2.shape == (nq, k)
+    assert indices_2.shape == (nq, k)
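---

Usage sketch: a minimal, hypothetical example of the new override, mirroring the test setup above. The URIs, the 64-dimension figure, and the query shapes are illustrative, not part of the patch.

    import numpy as np

    from tiledb.vector_search.ingestion import ingest

    # Source array whose second dimension has an artificially large domain
    # (e.g., written through TileDB-SOMA). Without the override, ingest would
    # detect domain-extent + 1 vector dimensions instead of the true 64.
    index = ingest(
        index_type="FLAT",
        index_uri="/tmp/my_index",  # hypothetical URI
        source_uri="/tmp/source_array",  # hypothetical URI
        source_type="TILEDB_ARRAY",
        dimensions=64,  # override the dimensions detected by read_source_metadata
    )
    distances, ids = index.query(np.random.rand(10, 64).astype(np.float32), k=5)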