Skip to content
Merged
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ Optimizations
* GITHUB#15151: Use `SimScorer#score` bulk API to compute impact scores per
block of postings. (Adrien Grand)

* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field. (Binlong Gao)

* GITHUB#15160: Increased the size used for blocks of postings from 128 to 256.
This gives a noticeable speedup to many queries. (Adrien Grand)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public class DoubleRangeGroupSelector extends GroupSelector<DoubleRange> {
private final DoubleRangeFactory rangeFactory;

private Set<DoubleRange> inSecondPass;
private boolean includeEmpty = true;
private boolean includeEmpty;
private boolean positioned;
private DoubleRange current;

Expand Down Expand Up @@ -88,7 +88,6 @@ public DoubleRange copyValue() throws IOException {
@Override
public void setGroups(Collection<SearchGroup<DoubleRange>> searchGroups) {
inSecondPass = new HashSet<>();
includeEmpty = false;
for (SearchGroup<DoubleRange> group : searchGroups) {
if (group.groupValue == null) {
includeEmpty = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
public class FirstPassGroupingCollector<T> extends SimpleCollector {

private final GroupSelector<T> groupSelector;
private final boolean ignoreDocsWithoutGroupField;

private final FieldComparator<?>[] comparators;
private final LeafFieldComparator[] leafComparators;
Expand Down Expand Up @@ -73,7 +74,28 @@ public class FirstPassGroupingCollector<T> extends SimpleCollector {
*/
public FirstPassGroupingCollector(
GroupSelector<T> groupSelector, Sort groupSort, int topNGroups) {
// Delegate to the four-argument constructor with ignoreDocsWithoutGroupField = false, so
// documents lacking the group field are not skipped by this collector.
this(groupSelector, groupSort, topNGroups, false);
}

/**
* Create the first pass collector, optionally ignoring documents that don't have the group field.
*
* @param groupSelector a GroupSelector used to define groups
* @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each
* group according to groupSort, determines how that group sorts against other groups. This
* must be non-null, ie, if you want to groupSort by relevance use Sort.RELEVANCE.
* @param topNGroups How many top groups to keep.
* @param ignoreDocsWithoutGroupField if true, ignore documents that don't have the group field
* instead of putting them in a null group
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public FirstPassGroupingCollector(
GroupSelector<T> groupSelector,
Sort groupSort,
int topNGroups,
boolean ignoreDocsWithoutGroupField) {
this.groupSelector = groupSelector;
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
if (topNGroups < 1) {
throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
}
Expand Down Expand Up @@ -197,12 +219,14 @@ public void collect(int doc) throws IOException {
return;
}

// TODO: should we add option to mean "ignore docs that
// don't have the group field" (instead of stuffing them
// under null group)?
groupSelector.advanceTo(doc);
GroupSelector.State state = groupSelector.advanceTo(doc);
T groupValue = groupSelector.currentValue();

// Skip documents without group field if option is enabled
if (ignoreDocsWithoutGroupField && state == GroupSelector.State.SKIP) {
return;
}

final CollectedSearchGroup<T> group = groupMap.get(groupValue);

if (group == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class GroupingSearch {
private boolean cacheScores;
private boolean allGroups;
private boolean allGroupHeads;
private boolean ignoreDocsWithoutGroupField;

private Collection<?> matchingGroups;
private Bits matchingGroupHeads;
Expand Down Expand Up @@ -138,7 +139,7 @@ protected TopGroups groupByFieldOrFunction(
int topN = groupOffset + groupLimit;

final FirstPassGroupingCollector firstPassCollector =
new FirstPassGroupingCollector(grouper, groupSort, topN);
new FirstPassGroupingCollector(grouper, groupSort, topN, ignoreDocsWithoutGroupField);
final AllGroupsCollector allGroupsCollector =
allGroups ? new AllGroupsCollector(grouper) : null;
final AllGroupHeadsCollector allGroupHeadsCollector =
Expand Down Expand Up @@ -358,4 +359,16 @@ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
public Bits getAllGroupHeads() {
// Hands back the matchingGroupHeads field as-is; no copy is made here.
return matchingGroupHeads;
}

/**
* Sets whether documents that don't have the group field are ignored instead of being put in a
* null group.
*
* <p>The flag is <code>false</code> unless this method is called. Its value is passed to the
* {@link FirstPassGroupingCollector} that this class creates when grouping by field or function.
*
* @param ignoreDocsWithoutGroupField <code>true</code> to ignore documents without the group
*     field, <code>false</code> to collect them under a null group
* @return <code>this</code>
*/
public GroupingSearch setIgnoreDocsWithoutGroupField(boolean ignoreDocsWithoutGroupField) {
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public class LongRangeGroupSelector extends GroupSelector<LongRange> {
private final LongRangeFactory rangeFactory;

private Set<LongRange> inSecondPass;
private boolean includeEmpty = true;
private boolean includeEmpty;
private boolean positioned;
private LongRange current;

Expand Down Expand Up @@ -89,7 +89,6 @@ public LongRange copyValue() throws IOException {
@Override
public void setGroups(Collection<SearchGroup<LongRange>> searchGroups) {
inSecondPass = new HashSet<>();
includeEmpty = false;
for (SearchGroup<LongRange> group : searchGroups) {
if (group.groupValue == null) {
includeEmpty = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class ValueSourceGroupSelector extends GroupSelector<MutableValue> {
private final Map<Object, Object> context;

private Set<MutableValue> secondPassGroups;
private boolean includeEmpty;

/**
* Create a new ValueSourceGroupSelector
Expand All @@ -61,8 +62,12 @@ public void setScorer(Scorable scorer) throws IOException {}
@Override
public State advanceTo(int doc) throws IOException {
this.filler.fillValue(doc);
MutableValue value = filler.getValue();
if (value.exists() == false) {
return includeEmpty ? State.ACCEPT : State.SKIP;
}
if (secondPassGroups != null) {
if (secondPassGroups.contains(filler.getValue()) == false) {
if (secondPassGroups.contains(value) == false) {
return State.SKIP;
}
}
Expand All @@ -83,7 +88,11 @@ public MutableValue copyValue() {
public void setGroups(Collection<SearchGroup<MutableValue>> searchGroups) {
  // Restricts this selector to the given groups. Values that exist are stored in
  // secondPassGroups; a group whose value does not exist only raises includeEmpty, which
  // advanceTo consults for documents that have no value.
  secondPassGroups = new HashSet<>();
  // Reset first so that a repeated call cannot inherit the flag from an earlier group set.
  includeEmpty = false;
  for (SearchGroup<MutableValue> group : searchGroups) {
    if (group.groupValue.exists()) {
      secondPassGroups.add(group.groupValue);
    } else {
      // The empty group is tracked by the flag alone; it must not also land in the set.
      includeEmpty = true;
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,49 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException {
}
}

/**
 * Indexes two documents with a group value and one without, then checks that enabling {@code
 * setIgnoreDocsWithoutGroupField(true)} never yields more groups than the default behaviour.
 *
 * <p>The shard is closed in a {@code finally} block so that a failing search or assertion does
 * not leave it open.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  Shard shard = new Shard();
  try {
    // Two documents that carry the group field (group ids 1 and 2).
    for (int groupId = 1; groupId <= 2; groupId++) {
      Document grouped = new Document();
      grouped.add(new TextField("text", "foo", Field.Store.NO));
      addGroupField(grouped, groupId);
      shard.writer.addDocument(grouped);
    }

    // One document that matches the query but has no group field.
    Document ungrouped = new Document();
    ungrouped.add(new TextField("text", "foo", Field.Store.NO));
    shard.writer.addDocument(ungrouped);

    IndexSearcher searcher = shard.getIndexSearcher();
    Query query = new TermQuery(new Term("text", "foo"));

    // Default behaviour: documents without the group field are kept.
    GroupingSearch grouping1 = new GroupingSearch(getGroupSelector());
    TopGroups<T> groups1 = grouping1.search(searcher, query, 0, 10);
    int defaultGroupCount = groups1.groups.length;

    // Same search, but with documents lacking the group field ignored.
    GroupingSearch grouping2 = new GroupingSearch(getGroupSelector());
    grouping2.setIgnoreDocsWithoutGroupField(true);
    TopGroups<T> groups2 = grouping2.search(searcher, query, 0, 10);
    int ignoreGroupCount = groups2.groups.length;

    assertTrue(
        "Expected ignoreGroupCount <= defaultGroupCount, got "
            + ignoreGroupCount
            + " vs "
            + defaultGroupCount,
        ignoreGroupCount <= defaultGroupCount);
  } finally {
    shard.close();
  }
}

private void assertSortsBefore(GroupDocs<T> first, GroupDocs<T> second) {
Object[] groupSortValues = second.groupSortValues();
Object[] prevSortValues = first.groupSortValues();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
Expand Down Expand Up @@ -193,6 +194,84 @@ public void testBasic() throws Exception {
dir.close();
}

/**
 * Checks that {@link FirstPassGroupingCollector} reports a null group for a document lacking
 * the group field by default (three groups), and omits it when constructed with {@code
 * ignoreDocsWithoutGroupField = true} (two groups).
 *
 * <p>Writer, reader and directory are closed in {@code finally} blocks so that a failed
 * assertion does not leave them open.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  final String groupField = "group";
  Directory dir = newDirectory();
  try {
    DirectoryReader reader;
    RandomIndexWriter w =
        new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));
    try {
      // Two documents, each in its own group.
      for (String groupValue : new String[] {"group1", "group2"}) {
        Document grouped = new Document();
        addGroupField(grouped, groupField, groupValue);
        grouped.add(new TextField("content", "test", Field.Store.YES));
        w.addDocument(grouped);
      }

      // One document without the group field.
      Document ungrouped = new Document();
      ungrouped.add(new TextField("content", "test", Field.Store.YES));
      w.addDocument(ungrouped);

      reader = w.getReader();
    } finally {
      w.close();
    }

    try {
      IndexSearcher searcher = newSearcher(reader);

      // Default behaviour: the document without the field forms a null group.
      FirstPassGroupingCollector<BytesRef> collector1 =
          new FirstPassGroupingCollector<>(new TermGroupSelector(groupField), Sort.RELEVANCE, 10);
      searcher.search(new MatchAllDocsQuery(), collector1);
      Collection<SearchGroup<BytesRef>> groups1 = collector1.getTopGroups(0);

      assertEquals(3, groups1.size()); // Should include null group

      // With the option enabled the document without the field is ignored.
      FirstPassGroupingCollector<BytesRef> collector2 =
          new FirstPassGroupingCollector<>(
              new TermGroupSelector(groupField), Sort.RELEVANCE, 10, true);
      searcher.search(new MatchAllDocsQuery(), collector2);
      Collection<SearchGroup<BytesRef>> groups2 = collector2.getTopGroups(0);

      assertEquals(2, groups2.size()); // Should exclude null group
    } finally {
      reader.close();
    }
  } finally {
    dir.close();
  }
}

/**
 * Checks that when every document lacks the group field and {@code
 * ignoreDocsWithoutGroupField} is true, {@link FirstPassGroupingCollector#getTopGroups} returns
 * {@code null}.
 *
 * <p>Writer, reader and directory are closed in {@code finally} blocks so that a failed
 * assertion does not leave them open.
 */
public void testAllDocsWithoutGroupField() throws IOException {
  Directory dir = newDirectory();
  try {
    DirectoryReader reader;
    RandomIndexWriter w =
        new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));
    try {
      // Five documents, none of which has the group field.
      for (int i = 0; i < 5; i++) {
        Document doc = new Document();
        doc.add(new TextField("content", "test", Field.Store.YES));
        w.addDocument(doc);
      }
      reader = w.getReader();
    } finally {
      w.close();
    }

    try {
      IndexSearcher searcher = newSearcher(reader);

      // Every document is ignored, so the collector never sees a group.
      FirstPassGroupingCollector<BytesRef> collector =
          new FirstPassGroupingCollector<>(
              new TermGroupSelector("group"), Sort.RELEVANCE, 10, true);
      searcher.search(new MatchAllDocsQuery(), collector);
      Collection<SearchGroup<BytesRef>> groups = collector.getTopGroups(0);

      assertNull(groups); // Should return null when no groups found
    } finally {
      reader.close();
    }
  } finally {
    dir.close();
  }
}

/**
 * Adds a {@link SortedDocValuesField} named {@code groupField} to {@code doc}, holding {@code
 * value} wrapped in a {@link BytesRef}.
 */
private void addGroupField(Document doc, String groupField, String value) {
  final BytesRef groupBytes = new BytesRef(value);
  final SortedDocValuesField docValuesField = new SortedDocValuesField(groupField, groupBytes);
  doc.add(docValuesField);
}
Expand Down
Loading