From 3cfd7aaf6109de528db1fdca1f827921a446c92d Mon Sep 17 00:00:00 2001 From: Alexander Mueller Date: Mon, 11 Aug 2025 14:05:57 +0200 Subject: [PATCH 1/4] Add pruning support for FirstPassGroupingCollector (#15136) --- .../grouping/FirstPassGroupingCollector.java | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index f7a20643096a..5e1ff059e774 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.TreeSet; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldComparator; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Pruning; @@ -91,9 +92,16 @@ public FirstPassGroupingCollector( for (int i = 0; i < sortFields.length; i++) { final SortField sortField = sortFields[i]; + final Pruning pruning; + if (i == 0) { + pruning = compIDXEnd >= 0 ? Pruning.GREATER_THAN : Pruning.GREATER_THAN_OR_EQUAL_TO; + } else { + pruning = Pruning.NONE; + } + // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by // this.spareSlot): - comparators[i] = sortField.getComparator(topNGroups + 1, Pruning.NONE); + comparators[i] = sortField.getComparator(topNGroups + 1, pruning); reversed[i] = sortField.getReverse() ? -1 : 1; } @@ -238,6 +246,9 @@ private void collectNewGroup(final int doc) throws IOException { // number of groups; from here on we will drop // bottom group when we insert new one: buildSortedSet(); + + // Allow pruning for compatible leaf comparators. + leafComparators[0].setHitsThresholdReached(); } } else { @@ -327,6 +338,11 @@ private void collectExistingGroup(final int doc, final CollectedSearchGroup g } } + @Override + public DocIdSetIterator competitiveIterator() throws IOException { + return leafComparators[0].competitiveIterator(); + } + private void buildSortedSet() throws IOException { final Comparator> comparator = new Comparator<>() { From e618ab30e60bf4584fbfccde25b21479100eaf21 Mon Sep 17 00:00:00 2001 From: Alexander Mueller Date: Mon, 1 Sep 2025 16:56:53 +0200 Subject: [PATCH 2/4] Add support for skipping over non-competitive documents in FirstPassGroupingCollector (#15136) Also adjust CachingCollector behavior to disable minimum competitive scores to allow caching the exhaustive list of hits. --- .../lucene/search/CachingCollector.java | 16 ++++++ .../grouping/FirstPassGroupingCollector.java | 53 ++++++++++++++----- .../lucene/search/grouping/TestGrouping.java | 23 ++++---- 3 files changed, 71 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java index 4cedde982dda..e553d0ccb30c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java @@ -155,6 +155,17 @@ protected void collect(LeafCollector collector, int i) throws IOException { } } + private static class NoSkippingScorable extends FilterScorable { + public NoSkippingScorable(Scorable in) { + super(in); + } + + @Override + public void setMinCompetitiveScore(float minScore) { + // ignore to enforce exhaustive hits + } + } + private class NoScoreCachingLeafCollector extends FilterLeafCollector { final int maxDocsToCache; @@ -185,6 +196,11 @@ protected void buffer(int doc) throws IOException { docs[docCount] = doc; } + @Override + public void setScorer(Scorable scorer) throws IOException { + super.setScorer(new NoSkippingScorable(scorer)); + } + @Override public void collect(int doc) throws IOException { if (docs != null) { diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index 5e1ff059e774..c733447f4718 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -50,9 +50,10 @@ public class FirstPassGroupingCollector extends SimpleCollector { private final LeafFieldComparator[] leafComparators; private final int[] reversed; private final int topNGroups; - private final boolean needsScores; private final HashMap> groupMap; private final int compIDXEnd; + private final ScoreMode scoreMode; + private final boolean canSetMinScore; // Set once we reach topNGroups unique groups: /** @@ -62,6 +63,9 @@ public class FirstPassGroupingCollector extends SimpleCollector { private int docBase; private int spareSlot; + private Scorable scorer; + private int bottomSlot; + private float minCompetitiveScore; /** * Create the first pass collector. @@ -83,7 +87,6 @@ public FirstPassGroupingCollector( // and specialize it? this.topNGroups = topNGroups; - this.needsScores = groupSort.needsScores(); final SortField[] sortFields = groupSort.getSort(); comparators = new FieldComparator[sortFields.length]; leafComparators = new LeafFieldComparator[sortFields.length]; @@ -105,13 +108,21 @@ public FirstPassGroupingCollector( reversed[i] = sortField.getReverse() ? -1 : 1; } + if (SortField.FIELD_SCORE.equals(sortFields[0]) == true) { + scoreMode = ScoreMode.TOP_SCORES; + canSetMinScore = true; + } else { + scoreMode = groupSort.needsScores() ? ScoreMode.TOP_DOCS_WITH_SCORES : ScoreMode.TOP_DOCS; + canSetMinScore = false; + } + spareSlot = topNGroups; groupMap = CollectionUtil.newHashMap(topNGroups); } @Override public ScoreMode scoreMode() { - return needsScores ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES; + return scoreMode; } /** @@ -162,10 +173,12 @@ public Collection> getTopGroups(int groupOffset) throws IOExcepti @Override public void setScorer(Scorable scorer) throws IOException { + this.scorer = scorer; groupSelector.setScorer(scorer); for (LeafFieldComparator comparator : leafComparators) { comparator.setScorer(scorer); } + setMinCompetitiveScore(scorer); } private boolean isCompetitive(int doc) throws IOException { @@ -273,9 +286,7 @@ private void collectNewGroup(final int doc) throws IOException { assert orderedGroups.size() == topNGroups; final int lastComparatorSlot = orderedGroups.last().comparatorSlot; - for (LeafFieldComparator fc : leafComparators) { - fc.setBottom(lastComparatorSlot); - } + setBottomSlot(lastComparatorSlot); } } @@ -331,9 +342,7 @@ private void collectExistingGroup(final int doc, final CollectedSearchGroup g // If we changed the value of the last group, or changed which group was last, then update // bottom: if (group == newLast || prevLast != newLast) { - for (LeafFieldComparator fc : leafComparators) { - fc.setBottom(newLast.comparatorSlot); - } + setBottomSlot(newLast.comparatorSlot); } } } @@ -364,13 +373,12 @@ public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) { orderedGroups.addAll(groupMap.values()); assert orderedGroups.size() > 0; - for (LeafFieldComparator fc : leafComparators) { - fc.setBottom(orderedGroups.last().comparatorSlot); - } + setBottomSlot(orderedGroups.last().comparatorSlot); } @Override protected void doSetNextReader(LeafReaderContext readerContext) throws IOException { + minCompetitiveScore = 0f; docBase = readerContext.docBase; for (int i = 0; i < comparators.length; i++) { leafComparators[i] = comparators[i].getLeafComparator(readerContext); @@ -388,4 +396,25 @@ public GroupSelector getGroupSelector() { private boolean isGroupMapFull() { return groupMap.size() >= topNGroups; } + + private void setBottomSlot(final int bottomSlot) throws IOException { + for (LeafFieldComparator fc : leafComparators) { + fc.setBottom(bottomSlot); + } + + this.bottomSlot = bottomSlot; + setMinCompetitiveScore(scorer); + } + + private void setMinCompetitiveScore(final Scorable scorer) throws IOException { + if (canSetMinScore == false || isGroupMapFull() == false) { + return; + } + + final float minScore = (float) comparators[0].value(bottomSlot); + if (minScore > minCompetitiveScore) { + scorer.setMinCompetitiveScore(minScore); + minCompetitiveScore = minScore; + } + } } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index a351c6e02137..a6e9ee0192b8 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.search.grouping; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -1520,14 +1522,14 @@ private void assertEquals( "expected.groups.length != actual.groups.length", expected.groups.length, actual.groups.length); - assertEquals( - "expected.totalHitCount != actual.totalHitCount", - expected.totalHitCount, - actual.totalHitCount); - assertEquals( - "expected.totalGroupedHitCount != actual.totalGroupedHitCount", - expected.totalGroupedHitCount, - actual.totalGroupedHitCount); + assertThat( + "expected.totalHitCount >= actual.totalHitCount", + actual.totalHitCount, + lessThanOrEqualTo(expected.totalHitCount)); + assertThat( + "expected.totalGroupedHitCount >= actual.totalGroupedHitCount", + actual.totalGroupedHitCount, + lessThanOrEqualTo(expected.totalGroupedHitCount)); if (expected.totalGroupCount != null && verifyTotalGroupCount) { assertEquals( "expected.totalGroupCount != actual.totalGroupCount", @@ -1556,7 +1558,10 @@ private void assertEquals( // TODO // assertEquals(expectedGroup.maxScore, actualGroup.maxScore); - assertEquals(expectedGroup.totalHits().value(), actualGroup.totalHits().value()); + assertThat( + "expectedGroup.totalHits().value() >= actualGroup.totalHits().value()", + actualGroup.totalHits().value(), + lessThanOrEqualTo(expectedGroup.totalHits().value())); final ScoreDoc[] expectedFDs = expectedGroup.scoreDocs(); final ScoreDoc[] actualFDs = actualGroup.scoreDocs(); From 6c29e6fde7a909482b514754e354dbb82ccae445 Mon Sep 17 00:00:00 2001 From: Alexander Mueller Date: Tue, 23 Sep 2025 18:35:48 +0200 Subject: [PATCH 3/4] Fix TestGrouping unit tests to use ScoreMode that is aligned with the collectors for the Weight (#15136) --- .../lucene/search/grouping/TestGrouping.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index a6e9ee0192b8..a5b64cb8ef57 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -1375,13 +1375,6 @@ private TopGroups searchShards( + canUseIDV); } // Run 1st pass collector to get top groups per shard - final Weight w = - topSearcher.createWeight( - topSearcher.rewrite(query), - groupSort.needsScores() || docSort.needsScores() || getMaxScores - ? ScoreMode.COMPLETE - : ScoreMode.COMPLETE_NO_SCORES, - 1); final List>> shardGroups = new ArrayList<>(); List> firstPassGroupingCollectors = new ArrayList<>(); FirstPassGroupingCollector firstPassCollector = null; @@ -1405,6 +1398,10 @@ private TopGroups searchShards( System.out.println(" 1st pass collector=" + firstPassCollector); } firstPassGroupingCollectors.add(firstPassCollector); + + final Weight w = + topSearcher.createWeight(topSearcher.rewrite(query), firstPassCollector.scoreMode(), 1); + subSearchers[shardIDX].search(w, firstPassCollector); final Collection> topGroups = getSearchGroups(firstPassCollector, 0); if (topGroups != null) { @@ -1462,6 +1459,11 @@ private TopGroups searchShards( docSort, docOffset + topNDocs, getMaxScores); + + final Weight w = + topSearcher.createWeight( + topSearcher.rewrite(query), secondPassCollector.scoreMode(), 1); + subSearchers[shardIDX].search(w, secondPassCollector); shardTopGroups[shardIDX] = getTopGroups(secondPassCollector, 0); if (VERBOSE) { From 74417a24e61e640338f9387490655f742de31bc0 Mon Sep 17 00:00:00 2001 From: Alexander Mueller Date: Tue, 23 Sep 2025 18:38:27 +0200 Subject: [PATCH 4/4] Update CHANGES.txt (#15136) --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b2a255aa2420..33077fe30c6b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -75,6 +75,8 @@ Optimizations * GITHUB#15085, GITHUB#15092: Hunspell suggestions: Ensure candidate roots are not worse before updating. (Ilia Permiashkin) +* GITHUB#15210: Enable pruning and skipping support for FirstPassGroupingCollector (Alexander Mueller) + Bug Fixes --------------------- * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero