Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
title: "Support 'countDist' (count distinct) metric in rollup for streaming expressions"
type: added
authors:
- name: khushjain
links:
- name: SOLR-18220
url: https://issues.apache.org/jira/browse/SOLR-18220
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,7 @@ For faster aggregation over low to moderate cardinality fields, the `facet` func
* `StreamExpression` (Mandatory)
* `over`: (Mandatory) A list of fields to group by.
* `metrics`: (Mandatory) The list of metrics to compute.
Currently supported metrics are `sum(col)`, `avg(col)`, `min(col)`, `max(col)`, `count(*)`, `missing(col)`.
Currently supported metrics are `sum(col)`, `avg(col)`, `min(col)`, `max(col)`, `count(*)`, `missing(col)`, `countDist(col)`.

=== rollup Syntax

Expand All @@ -1466,7 +1466,8 @@ rollup(
avg(a_i),
avg(a_f),
count(*),
missing(a_i)
missing(a_i),
countDist(a_i)
)
----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.solr.client.solrj.io.stream.metrics;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import org.apache.solr.client.solrj.io.Tuple;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
Expand All @@ -29,6 +30,7 @@ public class CountDistinctMetric extends Metric {
public static final String APPROX_COUNT_DISTINCT = "hll";

private String columnName;
private HashSet<Object> distinctValues = new HashSet<>();

public CountDistinctMetric(String columnName) {
this(columnName, false);
Expand All @@ -53,6 +55,10 @@ public CountDistinctMetric(StreamExpression expression, StreamFactory factory)
expression,
functionName));
}
if (1 != expression.getParameters().size()) {
throw new IOException(
String.format(Locale.ROOT, "Invalid expression %s - unknown operands found", expression));
Comment on lines +58 to +60
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old constructor silently ignored the second parameter anyway, so no one could have depended on it. This PR fixes the serialization to match what the constructor actually accepts.

}

init(functionName, columnName);
}
Expand All @@ -66,7 +72,10 @@ private void init(String functionName, String columnName) {

@Override
public void update(Tuple tuple) {
// Nop for now
Object value = tuple.get(columnName);
if (value != null) {
distinctValues.add(value);
}
Comment on lines 74 to +78
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The APPROX_COUNT_DISTINCT / hll constant existed, but the code was a complete no-op. I'd argue that approximate counting is a separate feature and needs its own registered function, such as hll().

}

@Override
Expand All @@ -81,14 +90,11 @@ public String[] getColumns() {

@Override
public Number getValue() {
// No op for now
return null;
return distinctValues.size();
}

@Override
public StreamExpressionParameter toExpression(StreamFactory factory) throws IOException {
return new StreamExpression(getFunctionName())
.withParameter(columnName)
.withParameter(Boolean.toString(outputLong));
return new StreamExpression(getFunctionName()).withParameter(columnName);
Comment on lines 97 to +98
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the SQL module's map-reduce path, not the streaming expression path. This PR targets countDist in streaming expression rollup().

Comment thread
KhushJain marked this conversation as resolved.
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionParser;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
import org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric;
import org.apache.solr.client.solrj.io.stream.metrics.CountMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MaxMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MeanMetric;
Expand Down Expand Up @@ -1343,7 +1344,7 @@ public void testRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand All @@ -1363,7 +1364,8 @@ public void testRollupStream() throws Exception {
.withFunctionName("min", MinMetric.class)
.withFunctionName("max", MaxMetric.class)
.withFunctionName("avg", MeanMetric.class)
.withFunctionName("count", CountMetric.class);
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class);

StreamExpression expression;
TupleStream stream;
Expand All @@ -1388,6 +1390,8 @@ public void testRollupStream() throws Exception {
+ "avg(a_i),"
+ "avg(a_f),"
+ "count(*),"
+ "countDist(a_i),"
+ "countDist(a_s)"
+ ")");
stream = factory.constructStream(expression);
stream.setStreamContext(streamContext);
Expand All @@ -1408,17 +1412,21 @@ public void testRollupStream() throws Exception {
Double avgi = tuple.getDouble("avg(a_i)");
Double avgf = tuple.getDouble("avg(a_f)");
Double count = tuple.getDouble("count(*)");
Double countDistI = tuple.getDouble("countDist(a_i)");
Double countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(3, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(1);
bucket = tuple.getString("a_s");
Expand All @@ -1431,6 +1439,8 @@ public void testRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello3", bucket);
assertEquals(38.0D, sumi, 0.0);
Expand All @@ -1442,6 +1452,8 @@ public void testRollupStream() throws Exception {
assertEquals(9.5D, avgi, 0.0);
assertEquals(6.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(4, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(2);
bucket = tuple.getString("a_s");
Expand All @@ -1454,6 +1466,8 @@ public void testRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello4", bucket);
assertEquals(15, sumi.longValue());
Expand All @@ -1465,6 +1479,8 @@ public void testRollupStream() throws Exception {
assertEquals(7.5D, avgi, 0.0);
assertEquals(5.5D, avgf, 0.0);
assertEquals(2, count, 0.0);
assertEquals(2, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

} finally {
solrClientCache.close();
Expand All @@ -1476,7 +1492,7 @@ public void testHashRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand All @@ -1497,6 +1513,7 @@ public void testHashRollupStream() throws Exception {
.withFunctionName("max", MaxMetric.class)
.withFunctionName("avg", MeanMetric.class)
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class)
.withFunctionName("sort", SortStream.class);

StreamExpression expression;
Expand All @@ -1522,6 +1539,8 @@ public void testHashRollupStream() throws Exception {
+ "avg(a_i),"
+ "avg(a_f),"
+ "count(*),"
+ "countDist(a_i),"
+ "countDist(a_s)"
+ "), by=\"avg(a_f) asc\")");
stream = factory.constructStream(expression);
stream.setStreamContext(streamContext);
Expand All @@ -1542,17 +1561,21 @@ public void testHashRollupStream() throws Exception {
Double avgi = tuple.getDouble("avg(a_i)");
Double avgf = tuple.getDouble("avg(a_f)");
Double count = tuple.getDouble("count(*)");
Double countDistI = tuple.getDouble("countDist(a_i)");
Double countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(3, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(1);
bucket = tuple.getString("a_s");
Expand All @@ -1565,6 +1588,8 @@ public void testHashRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

System.out.println("################:bucket" + bucket);

Expand All @@ -1578,6 +1603,8 @@ public void testHashRollupStream() throws Exception {
assertEquals(7.5D, avgi, 0.0);
assertEquals(5.5D, avgf, 0.0);
assertEquals(2, count, 0.0);
assertEquals(2, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(2);
bucket = tuple.getString("a_s");
Expand All @@ -1590,6 +1617,8 @@ public void testHashRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello3", bucket);
assertEquals(38.0D, sumi, 0.0);
Expand All @@ -1601,6 +1630,8 @@ public void testHashRollupStream() throws Exception {
assertEquals(9.5D, avgi, 0.0);
assertEquals(6.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(4, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

} finally {
solrClientCache.close();
Expand Down Expand Up @@ -1993,7 +2024,7 @@ public void testParallelRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand Down Expand Up @@ -2070,13 +2101,13 @@ public void testParallelRollupStream() throws Exception {
Double count = tuple.getDouble("count(*)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);

Expand Down Expand Up @@ -2135,7 +2166,7 @@ public void testParallelHashRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand Down Expand Up @@ -2213,13 +2244,13 @@ public void testParallelHashRollupStream() throws Exception {
Double count = tuple.getDouble("count(*)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionParser;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
import org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric;
import org.apache.solr.client.solrj.io.stream.metrics.CountMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MaxMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MeanMetric;
Expand Down Expand Up @@ -55,6 +56,7 @@ public StreamExpressionToExpressionTest() {
.withFunctionName("intersect", IntersectStream.class)
.withFunctionName("complement", ComplementStream.class)
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class)
.withFunctionName("sum", SumMetric.class)
.withFunctionName("min", MinMetric.class)
.withFunctionName("max", MaxMetric.class)
Expand Down Expand Up @@ -626,6 +628,19 @@ public void testCountMetric() throws Exception {
assertEquals("count(*)", expressionString);
}

/** Checks that a parsed {@code countDist} expression serializes back to the same text. */
@Test
public void testCountDistinctMetric() throws Exception {
  // Round trip: parse the expression, build the metric from it, then serialize it again.
  final StreamExpression parsed = StreamExpressionParser.parse("countDist(foo)");
  final Metric countDistinct = new CountDistinctMetric(parsed, factory);

  final String roundTripped = countDistinct.toExpression(factory).toString();

  assertEquals("countDist(foo)", roundTripped);
}

@Test
public void testMaxMetric() throws Exception {

Expand Down
Loading