From 24376e5cbcb25b77a80e69d2e1b9798466d44e41 Mon Sep 17 00:00:00 2001 From: gciro-ifom Date: Wed, 6 Aug 2025 16:33:47 +0200 Subject: [PATCH] fix(ingestion): Pad sparse matrices to correct dimensions This commit fixes a bug in `ingestion.py` where sparse matrices whose trailing columns are empty were being read with incorrect dimensions. Previously, the `read_input_vectors` function would convert a sparse matrix to a dense matrix without accounting for trailing null columns. This could lead to a mismatch in dimensions later in the code, as a `10x100` sparse matrix might be read as a `10x90` dense matrix if its last 10 columns were empty. The fix adds a check that pads the dense matrix with zero columns up to the expected `dimensions` width, preventing the dimension mismatch. --- apis/python/src/tiledb/vector_search/ingestion.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py index b5652be62..48793bee7 100644 --- a/apis/python/src/tiledb/vector_search/ingestion.py +++ b/apis/python/src/tiledb/vector_search/ingestion.py @@ -941,7 +941,8 @@ def read_input_vectors( ) as src_array: src_array_schema = src_array.schema data = src_array[start_pos:end_pos, 0:dimensions] - return coo_matrix( + + matrix = coo_matrix( ( data[src_array_schema.attr(0).name], ( @@ -950,6 +951,15 @@ def read_input_vectors( ), ) ).toarray() + + if matrix.shape[1] < dimensions: + matrix = np.concatenate([ + matrix, + np.zeros(shape=(matrix.shape[0], dimensions - matrix.shape[1])) + ], axis=1) + + return matrix + elif source_type == "TILEDB_PARTITIONED_ARRAY": with tiledb.open( source_uri, "r", timestamp=index_timestamp, config=config