Skip to content

Commit 5df83cb

Browse files
JM-Lab and sobychacko committed
Add approximate k-NN search support to OpenSearchVectorStore
This commit adds HNSW-based approximate k-NN search as an alternative to the existing exact k-NN search, providing better performance and scalability for large vector datasets.

Key changes:
- Add useApproximateKnn, dimensions, and similarity configuration options
- Implement buildApproximateQuery() using OpenSearch native knn query
- Maintain buildExactQuery() for backwards compatibility (default)
- Add comprehensive test coverage for both exact and approximate modes
- Test multiple similarity functions: cosinesimil, l1, l2, linf, innerproduct
- Fix test isolation by using dedicated beans and indexes per similarity function
- Update documentation with new properties and usage examples

The implementation maintains full backward compatibility with approximate k-NN disabled by default.

Signed-off-by: Jemin Huh <hjm1980@gmail.com>
Co-authored-by: Soby Chacko <soby.chacko@broadcom.com>
1 parent 8327c0f commit 5df83cb

File tree

5 files changed

+498
-40
lines changed

5 files changed

+498
-40
lines changed

auto-configurations/vector-stores/spring-ai-autoconfigure-vector-store-opensearch/src/main/java/org/springframework/ai/vectorstore/opensearch/autoconfigure/OpenSearchVectorStoreAutoConfiguration.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,19 @@ OpenSearchVectorStore vectorStore(OpenSearchVectorStoreProperties properties, Op
8888
var mappingJson = Optional.ofNullable(properties.getMappingJson())
8989
.orElse(OpenSearchVectorStore.DEFAULT_MAPPING_EMBEDDING_TYPE_KNN_VECTOR_DIMENSION);
9090

91-
return OpenSearchVectorStore.builder(openSearchClient, embeddingModel)
91+
var builder = OpenSearchVectorStore.builder(openSearchClient, embeddingModel)
9292
.index(indexName)
9393
.mappingJson(mappingJson)
9494
.initializeSchema(properties.isInitializeSchema())
9595
.observationRegistry(observationRegistry.getIfUnique(() -> ObservationRegistry.NOOP))
9696
.customObservationConvention(customObservationConvention.getIfAvailable(() -> null))
97-
.batchingStrategy(batchingStrategy)
98-
.build();
97+
.batchingStrategy(batchingStrategy);
98+
99+
Optional.ofNullable(properties.getUseApproximateKnn()).ifPresent(builder::useApproximateKnn);
100+
Optional.ofNullable(properties.getDimensions()).ifPresent(builder::dimensions);
101+
Optional.ofNullable(properties.getSimilarity()).ifPresent(builder::similarityFunction);
102+
103+
return builder.build();
99104
}
100105

101106
@Configuration(proxyBeanMethods = false)

auto-configurations/vector-stores/spring-ai-autoconfigure-vector-store-opensearch/src/main/java/org/springframework/ai/vectorstore/opensearch/autoconfigure/OpenSearchVectorStoreProperties.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ public class OpenSearchVectorStoreProperties extends CommonVectorStoreProperties
3838

3939
private String password;
4040

41+
private Boolean useApproximateKnn;
42+
43+
private Integer dimensions;
44+
45+
private String similarity;
46+
4147
private String mappingJson;
4248

4349
/**
@@ -100,6 +106,30 @@ public String getMappingJson() {
100106
return this.mappingJson;
101107
}
102108

109+
public Boolean getUseApproximateKnn() {
110+
return this.useApproximateKnn;
111+
}
112+
113+
public void setUseApproximateKnn(Boolean useApproximateKnn) {
114+
this.useApproximateKnn = useApproximateKnn;
115+
}
116+
117+
public Integer getDimensions() {
118+
return this.dimensions;
119+
}
120+
121+
public void setDimensions(Integer dimensions) {
122+
this.dimensions = dimensions;
123+
}
124+
125+
public String getSimilarity() {
126+
return this.similarity;
127+
}
128+
129+
public void setSimilarity(String similarity) {
130+
this.similarity = similarity;
131+
}
132+
103133
public void setMappingJson(String mappingJson) {
104134
this.mappingJson = mappingJson;
105135
}

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/vectordbs/opensearch.adoc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,10 @@ Properties starting with `spring.ai.vectorstore.opensearch.*` are used to config
108108
|`spring.ai.vectorstore.opensearch.password`| Password for the specified username | -
109109
|`spring.ai.vectorstore.opensearch.index-name`| Name of the index to store vectors | `spring-ai-document-index`
110110
|`spring.ai.vectorstore.opensearch.initialize-schema`| Whether to initialize the required schema | `false`
111-
|`spring.ai.vectorstore.opensearch.similarity-function`| The similarity function to use | `cosinesimil`
111+
|`spring.ai.vectorstore.opensearch.similarity`| The similarity function (k-NN space type) to use. One of `cosinesimil`, `l1`, `l2`, `linf`, `innerproduct`. | `cosinesimil`
112+
|`spring.ai.vectorstore.opensearch.use-approximate-knn`| Whether to use approximate k-NN for faster searches. If true, uses HNSW-based approximate search. If false, uses exact brute-force k-NN. See link:https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn/[Approximate k-NN] and link:https://opensearch.org/docs/latest/search-plugins/knn/knn-score-script/[Exact k-NN] | `false`
113+
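|`spring.ai.vectorstore.opensearch.dimensions`| Number of dimensions for vector embeddings. Used only when the store generates the index mapping for approximate k-NN (that is, `use-approximate-knn` is `true`, `initialize-schema` is `true`, and no custom `mapping-json` is supplied). Set it to match the output dimensions of your embedding model. | `1536`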
114+
|`spring.ai.vectorstore.opensearch.mapping-json`| Custom JSON mapping for the index. Overrides default mapping generation. | -
112115
|`spring.ai.vectorstore.opensearch.read-timeout`| Time to wait for response from the opposite endpoint. 0 - infinity. | -
113116
|`spring.ai.vectorstore.opensearch.connect-timeout`| Time to wait until connection established. 0 - infinity. | -
114117
|`spring.ai.vectorstore.opensearch.path-prefix`| Path prefix for OpenSearch API endpoints. Useful when OpenSearch is behind a reverse proxy with a non-root path. | -
@@ -191,6 +194,8 @@ public VectorStore vectorStore(OpenSearchClient openSearchClient, EmbeddingModel
191194
return OpenSearchVectorStore.builder(openSearchClient, embeddingModel)
192195
.index("custom-index") // Optional: defaults to "spring-ai-document-index"
193196
.similarityFunction("l2") // Optional: defaults to "cosinesimil"
197+
.useApproximateKnn(true) // Optional: defaults to false (exact k-NN)
198+
.dimensions(1536) // Optional: defaults to 1536; set to match your embedding model's dimensions
194199
.initializeSchema(true) // Optional: defaults to false
195200
.batchingStrategy(new TokenCountBatchingStrategy()) // Optional: defaults to TokenCountBatchingStrategy
196201
.build();

vector-stores/spring-ai-opensearch-store/src/main/java/org/springframework/ai/vectorstore/opensearch/OpenSearchVectorStore.java

Lines changed: 102 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,10 @@ public class OpenSearchVectorStore extends AbstractObservationVectorStore implem
170170

171171
private String similarityFunction;
172172

173+
private final boolean useApproximateKnn;
174+
175+
private final int dimensions;
176+
173177
/**
174178
* Creates a new OpenSearchVectorStore using the builder pattern.
175179
* @param builder The configured builder instance
@@ -187,6 +191,8 @@ protected OpenSearchVectorStore(Builder builder) {
187191
// https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn/#spaces
188192
this.similarityFunction = builder.similarityFunction;
189193
this.initializeSchema = builder.initializeSchema;
194+
this.useApproximateKnn = builder.useApproximateKnn;
195+
this.dimensions = builder.dimensions;
190196
}
191197

192198
/**
@@ -270,17 +276,38 @@ public List<Document> doSimilaritySearch(SearchRequest searchRequest) {
270276

271277
public List<Document> similaritySearch(float[] embedding, int topK, double similarityThreshold,
272278
Filter.Expression filterExpression) {
273-
return similaritySearch(new org.opensearch.client.opensearch.core.SearchRequest.Builder()
274-
.query(getOpenSearchSimilarityQuery(embedding, filterExpression))
279+
return similaritySearch(
280+
this.useApproximateKnn ? buildApproximateQuery(embedding, topK, similarityThreshold, filterExpression)
281+
: buildExactQuery(embedding, topK, similarityThreshold, filterExpression));
282+
}
283+
284+
private org.opensearch.client.opensearch.core.SearchRequest buildApproximateQuery(float[] embedding, int topK,
285+
double similarityThreshold, Filter.Expression filterExpression) {
286+
return new org.opensearch.client.opensearch.core.SearchRequest.Builder().index(this.index)
287+
.query(Query.of(builder -> builder.knn(knnQueryBuilder -> knnQueryBuilder
288+
.filter(Query
289+
.of(queryBuilder -> queryBuilder.queryString(queryStringQuerybuilder -> queryStringQuerybuilder
290+
.query(getOpenSearchQueryString(filterExpression)))))
291+
.field("embedding")
292+
.k(topK)
293+
.vector(embedding))))
294+
.minScore(similarityThreshold)
295+
.build();
296+
}
297+
298+
private org.opensearch.client.opensearch.core.SearchRequest buildExactQuery(float[] embedding, int topK,
299+
double similarityThreshold, Filter.Expression filterExpression) {
300+
return new org.opensearch.client.opensearch.core.SearchRequest.Builder()
301+
.query(buildExactQuery(embedding, filterExpression))
275302
.index(this.index)
276303
.sort(sortOptionsBuilder -> sortOptionsBuilder
277304
.score(scoreSortBuilder -> scoreSortBuilder.order(SortOrder.Desc)))
278305
.size(topK)
279306
.minScore(similarityThreshold)
280-
.build());
307+
.build();
281308
}
282309

283-
private Query getOpenSearchSimilarityQuery(float[] embedding, Filter.Expression filterExpression) {
310+
private Query buildExactQuery(float[] embedding, Filter.Expression filterExpression) {
284311
return Query.of(queryBuilder -> queryBuilder.scriptScore(scriptScoreQueryBuilder -> {
285312
scriptScoreQueryBuilder
286313
.query(queryBuilder2 -> queryBuilder2.queryString(queryStringQuerybuilder -> queryStringQuerybuilder
@@ -358,8 +385,41 @@ private CreateIndexResponse createIndexMapping(String index, String mappingJson)
358385

359386
@Override
360387
public void afterPropertiesSet() {
388+
/**
389+
* Generates a JSON string for the k-NN vector mapping configuration. The
390+
* knn_vector field allows k-NN vectors ingestion into OpenSearch and supports
391+
* various k-NN searches.
392+
* @see <a href=
393+
* "https://opensearch.org/docs/latest/search-plugins/knn/knn-index#method-definitions">OpenSearch
394+
* k-NN Method Definitions</a>
395+
*/
361396
if (this.initializeSchema && !exists(this.index)) {
362-
createIndexMapping(this.index, String.format(this.mappingJson, this.embeddingModel.dimensions()));
397+
String finalMappingJson;
398+
if (this.useApproximateKnn
399+
&& this.mappingJson.equals(DEFAULT_MAPPING_EMBEDDING_TYPE_KNN_VECTOR_DIMENSION)) {
400+
// Generate approximate k-NN mapping with HNSW method
401+
finalMappingJson = """
402+
{
403+
"properties": {
404+
"embedding": {
405+
"type": "knn_vector",
406+
"dimension": %d,
407+
"method": {
408+
"name": "hnsw",
409+
"engine": "lucene",
410+
"space_type": "%s"
411+
}
412+
}
413+
}
414+
}
415+
""".formatted(this.dimensions > 0 ? this.dimensions : this.embeddingModel.dimensions(),
416+
this.similarityFunction);
417+
}
418+
else {
419+
// Use provided mapping or default exact k-NN mapping
420+
finalMappingJson = String.format(this.mappingJson, this.embeddingModel.dimensions());
421+
}
422+
createIndexMapping(this.index, finalMappingJson);
363423
}
364424
}
365425

@@ -417,6 +477,10 @@ public static class Builder extends AbstractVectorStoreBuilder<Builder> {
417477

418478
private String similarityFunction = COSINE_SIMILARITY_FUNCTION;
419479

480+
private boolean useApproximateKnn = false;
481+
482+
private int dimensions = 1536;
483+
420484
/**
421485
* Sets the OpenSearch client.
422486
* @param openSearchClient The OpenSearch client to use
@@ -488,6 +552,39 @@ public Builder similarityFunction(String similarityFunction) {
488552
return this;
489553
}
490554

555+
/**
556+
* Sets whether to use approximate k-NN search. If true, the approximate k-NN
557+
* method is used for faster searches and maintains good performance even at large
558+
* scales. If false, the exact brute-force k-NN method is used for precise and
559+
* highly accurate searches.
560+
* @param useApproximateKnn true to use approximate k-NN, false for exact k-NN
561+
* @return The builder instance
562+
* @see <a href=
563+
* "https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn/">Approximate
564+
* k-NN</a>
565+
* @see <a href=
566+
* "https://opensearch.org/docs/latest/search-plugins/knn/knn-score-script/">Exact
567+
* k-NN with scoring script</a>
568+
*/
569+
public Builder useApproximateKnn(boolean useApproximateKnn) {
570+
this.useApproximateKnn = useApproximateKnn;
571+
return this;
572+
}
573+
574+
/**
575+
* Sets the number of dimensions for the vector embeddings. This is used when
576+
* creating the index mapping for approximate k-NN. If not set, defaults to 1536;
577+
* set it to match the dimensions produced by the embedding model.
578+
* @param dimensions The number of dimensions
579+
* @return The builder instance
580+
* @throws IllegalArgumentException if dimensions is less than or equal to 0
581+
*/
582+
public Builder dimensions(int dimensions) {
583+
Assert.isTrue(dimensions > 0, "dimensions must be greater than 0");
584+
this.dimensions = dimensions;
585+
return this;
586+
}
587+
491588
/**
492589
* Builds a new OpenSearchVectorStore instance with the configured properties.
493590
* @return A new OpenSearchVectorStore instance

0 commit comments

Comments
 (0)