17 changes: 15 additions & 2 deletions apis/python/src/tiledb/vector_search/ingestion.py
@@ -54,6 +54,7 @@ def ingest(
config: Optional[Mapping[str, Any]] = None,
namespace: Optional[str] = None,
size: int = -1,
dimensions: int = -1,
partitions: int = -1,
num_subspaces: int = -1,
l_build: int = -1,
@@ -122,6 +123,10 @@ def ingest(
size: int
Number of input vectors; if not provided, the full size of the input dataset is used.
If provided, only the first size vectors from the input source are ingested.
dimensions: int
Number of vector dimensions; if not provided, the dimensions detected from the input dataset are used.
If provided, this overrides the dimensions detected by read_source_metadata. This is only used when
input_vectors is not provided; otherwise it is ignored.
partitions: int
For IVF_FLAT and IVF_PQ indexes, the number of partitions to generate from the data during k-means clustering.
If not provided, it is auto-configured based on the dataset size.
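
For reference, a minimal usage sketch of the new parameter (both URIs below are hypothetical placeholders); the call mirrors the override exercised in the tests further down:

from tiledb.vector_search.ingestion import ingest

# The source array's schema domain is wider than the true vector width,
# so we override what read_source_metadata would otherwise detect.
# Both URIs are hypothetical placeholders.
index = ingest(
    index_type="FLAT",
    index_uri="/tmp/my_index",
    source_uri="/tmp/source_array",
    source_type="TILEDB_ARRAY",
    dimensions=64,
)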
@@ -3052,15 +3057,23 @@ def consolidate_and_vacuum(

if input_vectors is not None:
in_size = input_vectors.shape[0]
dimensions = input_vectors.shape[1]
# When input_vectors is provided, use its shape directly (the dimensions parameter is ignored)
dimensions = int(input_vectors.shape[1])
vector_type = input_vectors.dtype
source_type = "TILEDB_ARRAY"
logger.debug("Using dimensions from input_vectors: %d", dimensions)
else:
if source_type is None:
source_type = autodetect_source_type(source_uri=source_uri)
in_size, dimensions, vector_type = read_source_metadata(
in_size, detected_dimensions, vector_type = read_source_metadata(
source_uri=source_uri, source_type=source_type
)
# Use provided dimensions if specified, otherwise use detected dimensions
if dimensions == -1:
dimensions = int(detected_dimensions)
logger.debug(
"Using dimensions: %d (detected: %d)", dimensions, detected_dimensions
)
logger.debug("Ingesting Vectors into %r", index_group_uri)
arrays_created = False
if is_type_erased_index(index_type):
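
Taken together, the precedence is: the shape of input_vectors always wins when vectors are passed in memory; otherwise an explicit dimensions argument overrides detection; otherwise the value detected by read_source_metadata is used. A standalone sketch of that resolution logic (resolve_dimensions is a hypothetical helper, not part of the module):

from typing import Optional

import numpy as np

def resolve_dimensions(
    input_vectors: Optional[np.ndarray],
    dimensions: int,
    detected_dimensions: int,
) -> int:
    # Hypothetical helper mirroring the diff above.
    if input_vectors is not None:
        # The shape of in-memory vectors always wins.
        return int(input_vectors.shape[1])
    if dimensions != -1:
        # Explicit user override of the detected value.
        return int(dimensions)
    return int(detected_dimensions)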
173 changes: 173 additions & 0 deletions apis/python/test/test_ingestion.py
@@ -2010,3 +2010,176 @@ def test_ivf_flat_taskgraph_query(tmp_path):
queries, k=k, nprobe=nprobe, nthreads=8, mode=Mode.LOCAL, num_partitions=10
)
assert accuracy(result, gt_i) > MINIMUM_ACCURACY


def test_dimensions_parameter_override(tmp_path):
"""
Test the dimensions parameter functionality with TileDB array input.

This test verifies that the dimensions parameter can override
the dimensions detected from the source array, which is useful
for handling cases where the source array has an artificially
large domain (e.g., due to TileDBSOMA; https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564).
"""
# Create test data
actual_dimensions = 64
nb = 1000
nq = 10
k = 5

# Create random test vectors with actual dimensions
test_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
queries = np.random.rand(nq, actual_dimensions).astype(np.float32)

# Create a TileDB array with an artificially large domain (simulating the problem)
source_uri = os.path.join(tmp_path, "source_array")
large_domain_value = 100000

# Create schema with large dimension domain
schema = tiledb.ArraySchema(
domain=tiledb.Domain(
tiledb.Dim(
name="__dim_0",
domain=(0, large_domain_value),
tile=1000,
dtype="int32",
),
tiledb.Dim(
name="__dim_1",
domain=(0, large_domain_value),
tile=actual_dimensions,
dtype="int32",
),
),
attrs=[
tiledb.Attr(name="values", dtype="float32", var=False, nullable=False),
],
cell_order="col-major",
tile_order="col-major",
capacity=10000,
sparse=False,
)

# Create the array and write test data
tiledb.Array.create(source_uri, schema)
with tiledb.open(source_uri, "w") as A:
A[0:nb, 0:actual_dimensions] = test_vectors

# Test ingestion with dimensions parameter override
# Without the override, the large domain would be detected as 100001 dimensions
# With the override, we explicitly set it to the actual dimensions (64)
index_uri = os.path.join(tmp_path, "test_index")

index = ingest(
index_type="FLAT",
index_uri=index_uri,
source_uri=source_uri,
source_type="TILEDB_ARRAY",
dimensions=actual_dimensions, # Override the detected large dimensions
size=nb,
)

# Verify the index was created successfully
assert index is not None
index.vacuum()

# Verify the index works correctly with queries
distances, indices = index.query(queries, k=k)

# Basic sanity checks
assert distances.shape == (nq, k)
assert indices.shape == (nq, k)
assert np.all(indices >= 0)
assert np.all(indices < nb)

# Verify that dimensions=-1 (or omitting the parameter) uses the large detected dimensions and creates an unusable index
# This demonstrates the problem that the dimensions parameter is meant to solve
index_uri_2 = os.path.join(tmp_path, "test_index_2")

# Create with explicit dimensions=-1; this uses the large detected dimensions
# The index creation will succeed, but queries will fail due to dimension mismatch
index_2 = ingest(
index_type="FLAT",
index_uri=index_uri_2,
source_uri=source_uri,
source_type="TILEDB_ARRAY",
dimensions=-1, # Uses detected large dimensions (100001)
size=nb,
)

assert index_2 is not None
index_2.vacuum()

# Verify that the index was created with the large detected dimensions
assert index_2.dimensions == large_domain_value + 1 # 100001 dimensions

# Verify that queries fail due to dimension mismatch
# This demonstrates why the dimensions parameter override is needed
with pytest.raises(Exception) as exc_info:
index_2.query(queries, k=k)
assert (
"A query in queries has 64 dimensions, but the indexed data had 100001 dimensions"
in str(exc_info.value)
) # Should contain dimension mismatch error
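
To see where the 100001 figure comes from, one can inspect the schema of the source array created above directly; this sketch assumes detection derives from the full extent of the second schema dimension:

import tiledb

schema = tiledb.ArraySchema.load(source_uri)
lo, hi = schema.domain.dim(1).domain  # (0, 100000) for the array above
print(hi - lo + 1)  # 100001, the value detection would report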


def test_dimensions_parameter_with_numpy_input(tmp_path):
"""
Test the dimensions parameter with numpy input vectors.

This ensures that when input_vectors is provided as a numpy array,
the dimensions parameter is ignored and the shape of the input is used instead.
"""
# Create test data
nb = 100
actual_dimensions = 32
nq = 5
k = 3

# Create random test vectors
input_vectors = np.random.rand(nb, actual_dimensions).astype(np.float32)
queries = np.random.rand(nq, actual_dimensions).astype(np.float32)

# Ingest with numpy input and dimensions parameter (should be ignored since input_vectors is provided)
index_uri = os.path.join(tmp_path, "test_numpy_index")

# When input_vectors is provided, the dimensions parameter should not affect the detected dimensions
# but the function should still accept it without error
index = ingest(
index_type="FLAT",
index_uri=index_uri,
input_vectors=input_vectors,
dimensions=999, # This should be ignored since input_vectors is provided
)

# Verify the index was created successfully
assert index is not None
index.vacuum()

# Test that queries work correctly
distances, indices = index.query(queries, k=k)

# Basic sanity checks
assert distances.shape == (nq, k)
assert indices.shape == (nq, k)
assert np.all(indices >= 0)
assert np.all(indices < nb)

# Verify that dimensions parameter doesn't cause issues with default behavior
index_uri_2 = os.path.join(tmp_path, "test_numpy_index_2")

# Test without dimensions parameter (default behavior)
index_2 = ingest(
index_type="FLAT",
index_uri=index_uri_2,
input_vectors=input_vectors,
# No dimensions parameter - should work as before
)

assert index_2 is not None
index_2.vacuum()

# Verify the second index also produces well-formed results
distances_2, indices_2 = index_2.query(queries, k=k)
assert distances_2.shape == (nq, k)
assert indices_2.shape == (nq, k)