diff --git a/docs/changelog/130254.yaml b/docs/changelog/130254.yaml new file mode 100644 index 0000000000000..a17e934adbdc9 --- /dev/null +++ b/docs/changelog/130254.yaml @@ -0,0 +1,5 @@ +pr: 130254 +summary: Patch for Lucene bug 14857 +area: Vector Search +type: bug +issues: [] diff --git a/docs/reference/search/profile.asciidoc b/docs/reference/search/profile.asciidoc index f69092847c232..8bcfc9837dc68 100644 --- a/docs/reference/search/profile.asciidoc +++ b/docs/reference/search/profile.asciidoc @@ -1268,6 +1268,7 @@ POST my-knn-index/_bulk?refresh=true { "index": { "_id": "3" } } { "my-vector": [15, 11, 23] } -------------------------------------------------- +// TEST[skip:response format changes in 130254] With an index setup, we can now profile a kNN search query. @@ -1284,7 +1285,7 @@ POST my-knn-index/_search } } -------------------------------------------------- -// TEST[continued] +// TEST[skip:response format changes in 130254] <1> The `profile` parameter is set to `true`. @@ -1302,8 +1303,8 @@ One of the `dfs.knn` sections for a shard looks like the following: "vector_operations_count" : 4, "query" : [ { - "type" : "DocAndScoreQuery", - "description" : "DocAndScore[100]", + "type" : "KnnScoreDocQuery", + "description" : "KnnScoreDocQuery[100]", "time_in_nanos" : 444414, "breakdown" : { "set_min_competitive_score_count" : 0, @@ -1340,10 +1341,7 @@ One of the `dfs.knn` sections for a shard looks like the following: } ] } -------------------------------------------------- -// TESTRESPONSE[s/^/{\n"took": $body.took,\n"timed_out": $body.timed_out,\n"_shards": $body._shards,\n"hits": $body.hits,\n"profile": {\n"shards": [ {\n"id": "$body.$_path",\n"node_id": "$body.$_path",\n"shard_id": "$body.$_path",\n"index": "$body.$_path",\n"cluster": "$body.$_path",\n/] -// TESTRESPONSE[s/}$/}, "aggregations": [], "searches": $body.$_path, "fetch": $body.$_path}]}}/] -// TESTRESPONSE[s/ (\-)?[0-9]+/ $body.$_path/] -// TESTRESPONSE[s/"dfs" : \{/"dfs" : {"statistics": $body.$_path,/] +// TESTRESPONSE[skip:response format changes in 130254] In the `dfs.knn` portion of the response we can see the output the of timings for <>, <>, diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/370_profile.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/370_profile.yml index dc79961ae78cd..c1c8ddeb45d3d 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/370_profile.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/370_profile.yml @@ -211,8 +211,6 @@ dfs knn vector profiling: num_candidates: 100 - match: { hits.total.value: 1 } - - match: { profile.shards.0.dfs.knn.0.query.0.type: "DocAndScoreQuery" } - - match: { profile.shards.0.dfs.knn.0.query.0.description: "DocAndScore[100]" } - gt: { profile.shards.0.dfs.knn.0.query.0.time_in_nanos: 0 } - match: { profile.shards.0.dfs.knn.0.query.0.breakdown.set_min_competitive_score_count: 0 } - match: { profile.shards.0.dfs.knn.0.query.0.breakdown.set_min_competitive_score: 0 } @@ -275,8 +273,6 @@ dfs knn vector profiling with vector_operations_count: num_candidates: 100 - match: { hits.total.value: 1 } - - match: { profile.shards.0.dfs.knn.0.query.0.type: "DocAndScoreQuery" } - - match: { profile.shards.0.dfs.knn.0.query.0.description: "DocAndScore[100]" } - match: { profile.shards.0.dfs.knn.0.vector_operations_count: 1 } - gt: { profile.shards.0.dfs.knn.0.query.0.time_in_nanos: 0 } - match: { profile.shards.0.dfs.knn.0.query.0.breakdown.set_min_competitive_score_count: 0 } diff --git a/server/src/main/java/org/elasticsearch/search/SearchFeatures.java b/server/src/main/java/org/elasticsearch/search/SearchFeatures.java index 86032abb00b22..44f64d3317678 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchFeatures.java +++ b/server/src/main/java/org/elasticsearch/search/SearchFeatures.java @@ -27,6 +27,7 @@ public Set getFeatures() { ); public static final NodeFeature INT_SORT_FOR_INT_SHORT_BYTE_FIELDS = new NodeFeature("search.sort.int_sort_for_int_short_byte_fields"); static final NodeFeature MULTI_MATCH_CHECKS_POSITIONS = new NodeFeature("search.multi.match.checks.positions"); + private static final NodeFeature KNN_QUERY_BUGFIX_130254 = new NodeFeature("search.knn.query.bugfix.130254"); @Override public Set getTestFeatures() { @@ -34,7 +35,8 @@ public Set getTestFeatures() { RETRIEVER_RESCORER_ENABLED, COMPLETION_FIELD_SUPPORTS_DUPLICATE_SUGGESTIONS, INT_SORT_FOR_INT_SHORT_BYTE_FIELDS, - MULTI_MATCH_CHECKS_POSITIONS + MULTI_MATCH_CHECKS_POSITIONS, + KNN_QUERY_BUGFIX_130254 ); } } diff --git a/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenByteKnnVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenByteKnnVectorQuery.java index e77b86c9a363e..02fc0a374719e 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenByteKnnVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenByteKnnVectorQuery.java @@ -9,15 +9,20 @@ package org.elasticsearch.search.vectors; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.join.BitSetProducer; import org.apache.lucene.search.join.DiversifyingChildrenByteKnnVectorQuery; import org.elasticsearch.search.profile.query.QueryProfiler; +import java.io.IOException; + public class ESDiversifyingChildrenByteKnnVectorQuery extends DiversifyingChildrenByteKnnVectorQuery implements QueryProfilerProvider { private final Integer kParam; private long vectorOpsCount; + private final int k; public ESDiversifyingChildrenByteKnnVectorQuery( String field, @@ -29,6 +34,18 @@ public ESDiversifyingChildrenByteKnnVectorQuery( ) { super(field, query, childFilter, numCands, parentsFilter); this.kParam = k; + this.k = numCands; + } + + @Override + public Query rewrite(IndexSearcher searcher) throws IOException { + Query rewrittenQuery = super.rewrite(searcher); + if (rewrittenQuery instanceof MatchNoDocsQuery) { + // If the rewritten query is a MatchNoDocsQuery, we can return it directly. + return rewrittenQuery; + } + // We don't rewrite this query, so we return it as is. + return KnnScoreDocQuery.fromQuery(rewrittenQuery, kParam == null ? k : kParam, searcher); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenFloatKnnVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenFloatKnnVectorQuery.java index 9b8b4d3e3b008..34ebd0a259769 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenFloatKnnVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/ESDiversifyingChildrenFloatKnnVectorQuery.java @@ -9,15 +9,20 @@ package org.elasticsearch.search.vectors; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.join.BitSetProducer; import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; import org.elasticsearch.search.profile.query.QueryProfiler; +import java.io.IOException; + public class ESDiversifyingChildrenFloatKnnVectorQuery extends DiversifyingChildrenFloatKnnVectorQuery implements QueryProfilerProvider { private final Integer kParam; private long vectorOpsCount; + private final int k; public ESDiversifyingChildrenFloatKnnVectorQuery( String field, @@ -29,6 +34,18 @@ public ESDiversifyingChildrenFloatKnnVectorQuery( ) { super(field, query, childFilter, numCands, parentsFilter); this.kParam = k; + this.k = numCands; + } + + @Override + public Query rewrite(IndexSearcher searcher) throws IOException { + Query rewrittenQuery = super.rewrite(searcher); + if (rewrittenQuery instanceof MatchNoDocsQuery) { + // If the rewritten query is a MatchNoDocsQuery, we can return it directly. + return rewrittenQuery; + } + // We don't rewrite this query, so we return it as is. + return KnnScoreDocQuery.fromQuery(rewrittenQuery, kParam == null ? k : kParam, searcher); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/vectors/ESKnnByteVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/ESKnnByteVectorQuery.java index d47585c055094..6381ff8fc21c4 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/ESKnnByteVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/ESKnnByteVectorQuery.java @@ -9,11 +9,15 @@ package org.elasticsearch.search.vectors; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnByteVectorQuery; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.elasticsearch.search.profile.query.QueryProfiler; +import java.io.IOException; + public class ESKnnByteVectorQuery extends KnnByteVectorQuery implements QueryProfilerProvider { private final Integer kParam; private long vectorOpsCount; @@ -23,6 +27,17 @@ public ESKnnByteVectorQuery(String field, byte[] target, Integer k, int numCands this.kParam = k; } + @Override + public Query rewrite(IndexSearcher searcher) throws IOException { + Query rewrittenQuery = super.rewrite(searcher); + if (rewrittenQuery instanceof MatchNoDocsQuery) { + // If the rewritten query is a MatchNoDocsQuery, we can return it directly. + return rewrittenQuery; + } + // We don't rewrite this query, so we return it as is. + return KnnScoreDocQuery.fromQuery(rewrittenQuery, kParam == null ? k : kParam, searcher); + } + @Override protected TopDocs mergeLeafResults(TopDocs[] perLeafResults) { // if k param is set, we get only top k results from each shard diff --git a/server/src/main/java/org/elasticsearch/search/vectors/ESKnnFloatVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/ESKnnFloatVectorQuery.java index 97ce1bc1d8347..56fa657de3209 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/ESKnnFloatVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/ESKnnFloatVectorQuery.java @@ -9,11 +9,15 @@ package org.elasticsearch.search.vectors; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.elasticsearch.search.profile.query.QueryProfiler; +import java.io.IOException; + public class ESKnnFloatVectorQuery extends KnnFloatVectorQuery implements QueryProfilerProvider { private final Integer kParam; private long vectorOpsCount; @@ -23,6 +27,17 @@ public ESKnnFloatVectorQuery(String field, float[] target, Integer k, int numCan this.kParam = k; } + @Override + public Query rewrite(IndexSearcher searcher) throws IOException { + Query rewrittenQuery = super.rewrite(searcher); + if (rewrittenQuery instanceof MatchNoDocsQuery) { + // If the rewritten query is a MatchNoDocsQuery, we can return it directly. + return rewrittenQuery; + } + // We don't rewrite this query, so we return it as is. + return KnnScoreDocQuery.fromQuery(rewrittenQuery, kParam == null ? k : kParam, searcher); + } + @Override protected TopDocs mergeLeafResults(TopDocs[] perLeafResults) { // if k param is set, we get only top k results from each shard diff --git a/server/src/main/java/org/elasticsearch/search/vectors/KnnScoreDocQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/KnnScoreDocQuery.java index 184bc3024aa65..08aefdfbd30ed 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/KnnScoreDocQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/KnnScoreDocQuery.java @@ -20,6 +20,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Weight; import java.io.IOException; @@ -36,6 +37,7 @@ * {@link org.apache.lucene.search.KnnFloatVectorQuery}, which is package-private. */ public class KnnScoreDocQuery extends Query { + private final float maxScore; private final int[] docs; private final float[] scores; @@ -47,6 +49,18 @@ public class KnnScoreDocQuery extends Query { // an object identifying the reader context that was used to build this query private final Object contextIdentity; + static Query fromQuery(Query query, int k, IndexSearcher searcher) throws IOException { + if (query instanceof MatchNoDocsQuery) { + // If the rewritten query is a MatchNoDocsQuery, we can return it directly. + return query; + } + if (query instanceof KnnScoreDocQuery knnQuery) { + return knnQuery; + } + TopDocs topDocs = searcher.search(query, k); + return new KnnScoreDocQuery(topDocs.scoreDocs, searcher.getIndexReader()); + } + /** * Creates a query. * @@ -58,10 +72,13 @@ public class KnnScoreDocQuery extends Query { Arrays.sort(scoreDocs, Comparator.comparingInt(scoreDoc -> scoreDoc.doc)); this.docs = new int[scoreDocs.length]; this.scores = new float[scoreDocs.length]; + float maxScore = Float.NEGATIVE_INFINITY; for (int i = 0; i < scoreDocs.length; i++) { docs[i] = scoreDocs[i].doc; scores[i] = scoreDocs[i].score; + maxScore = Math.max(maxScore, scores[i]); } + this.maxScore = maxScore; this.segmentStarts = findSegmentStarts(reader, docs); this.contextIdentity = reader.getContext().id(); } @@ -157,16 +174,7 @@ public long cost() { @Override public float getMaxScore(int docId) { - // NO_MORE_DOCS indicates the maximum score for all docs in this segment - // Anything less than must be accounted for via the docBase. - if (docId != NO_MORE_DOCS) { - docId += context.docBase; - } - float maxScore = 0; - for (int idx = Math.max(lower, upTo); idx < upper && docs[idx] <= docId; idx++) { - maxScore = Math.max(maxScore, scores[idx] * boost); - } - return maxScore; + return maxScore * boost; } @Override @@ -174,19 +182,6 @@ public float score() { return scores[upTo] * boost; } - @Override - public int advanceShallow(int docId) { - int start = Math.max(upTo, lower); - int docIdIndex = Arrays.binarySearch(docs, start, upper, docId + context.docBase); - if (docIdIndex < 0) { - docIdIndex = -1 - docIdIndex; - } - if (docIdIndex >= upper) { - return NO_MORE_DOCS; - } - return docs[docIdIndex]; - } - @Override public int docID() { return currentDocId(); @@ -224,7 +219,7 @@ float[] scores() { @Override public String toString(String field) { - return "ScoreAndDocQuery"; + return "DocAndScoreQuery[" + docs[0] + ",...][" + scores[0] + ",...]," + maxScore; } @Override diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/600_rrf_retriever_profile.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/600_rrf_retriever_profile.yml index a9ddb4f902929..75a6b479a6a92 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/600_rrf_retriever_profile.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/600_rrf_retriever_profile.yml @@ -71,9 +71,16 @@ setup: - do: indices.refresh: {} + - do: + indices.forcemerge: + index: test + max_num_segments: 1 --- "profile standard and knn query": + - requires: + cluster_features: ["search.knn.query.bugfix.130254"] + reason: "Bug fix introduced via PR #130254" - do: search: @@ -114,13 +121,13 @@ setup: - match: { hits.hits.2._id: "4" } - not_exists: profile.shards.0.dfs - - match: { profile.shards.0.searches.0.query.0.type: RankDocsQuery } - - length: { profile.shards.0.searches.0.query.0.children: 2 } - - match: { profile.shards.0.searches.0.query.0.children.0.type: TopQuery } - - match: { profile.shards.0.searches.0.query.0.children.1.type: BooleanQuery } - - length: { profile.shards.0.searches.0.query.0.children.1.children: 2 } - - match: { profile.shards.0.searches.0.query.0.children.1.children.0.type: TermQuery } - - match: { profile.shards.0.searches.0.query.0.children.1.children.1.type: DocAndScoreQuery } + - match: { profile.shards.0.searches.0.query.2.type: RankDocsQuery } + - length: { profile.shards.0.searches.0.query.2.children: 2 } + - match: { profile.shards.0.searches.0.query.2.children.0.type: TopQuery } + - match: { profile.shards.0.searches.0.query.2.children.1.type: BooleanQuery } + - length: { profile.shards.0.searches.0.query.2.children.1.children: 2 } + - match: { profile.shards.0.searches.0.query.2.children.1.children.0.type: TermQuery } + - match: { profile.shards.0.searches.0.query.2.children.1.children.1.type: KnnScoreDocQuery } --- "profile standard and knn dfs retrievers": @@ -173,8 +180,8 @@ setup: "using query and dfs knn search": - requires: - cluster_features: ["gte_v8.16.0"] - reason: "deprecation added in 8.16" + cluster_features: ["search.knn.query.bugfix.130254"] + reason: "Bug fix introduced via PR #130254" test_runner_features: warnings - do: @@ -212,8 +219,8 @@ setup: - exists: profile.shards.0.dfs - length: { profile.shards.0.dfs.knn: 1 } - - length: { profile.shards.0.dfs.knn.0.query: 1 } - - match: { profile.shards.0.dfs.knn.0.query.0.type: DocAndScoreQuery } + - length: { profile.shards.0.dfs.knn.0.query: 2 } + - match: { profile.shards.0.dfs.knn.0.query.1.type: KnnScoreDocQuery } - match: { profile.shards.0.searches.0.query.0.type: ConstantScoreQuery } - length: { profile.shards.0.searches.0.query.0.children: 1 }