From d52f482bf3c6240ceb6311567b71631947a13bf9 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Dec 2025 17:30:35 -0800 Subject: [PATCH] Random corrections --- docs/Community/Research.md | 3 ++- docs/Frequency/FrequentDistinctTuplesSketch.md | 4 ++-- pom.xml | 5 +++-- src/main/java/org/apache/datasketches/ByteArrayBuilder.java | 3 +++ src/main/java/org/apache/datasketches/Files.java | 2 ++ 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/Community/Research.md b/docs/Community/Research.md index 93213f707..e83f5b596 100644 --- a/docs/Community/Research.md +++ b/docs/Community/Research.md @@ -97,7 +97,8 @@ This solution suffices in some applications, but for other applications the chun ## References -**[ABL+17]** Daniel Anderson, Pryce Bevan, Kevin J. Lang, Edo Liberty, Lee Rhodes, and Justin Thaler. A high-performance algorithm for identifying frequent items in data streams. In *ACM IMC 2017 (To Appear)*, 2017. [Preliminary paper](https://arxiv.org/abs/1705.07001). +**[ABL+17]** Daniel Anderson, Pryce Bevan, Kevin J. Lang, Edo Liberty, Lee Rhodes, and Justin Thaler. A high-performance algorithm for identifying frequent items in data streams. In *ACM IMC 2017*, 2017. +(dl.acm.org),(arxiv.org/abs/1705.07001). **[AC+13]** Pankaj K. Agarwal, Graham Cormode, Zengfeng Huang, Jeff M. Phillips, Zhewei Wei, Ke Yi. Mergeable summaries. In *ACM Trans. Database Syst.* 38(4): 26:1-26:28, 2013 diff --git a/docs/Frequency/FrequentDistinctTuplesSketch.md b/docs/Frequency/FrequentDistinctTuplesSketch.md index 684953b4d..aeee24b10 100644 --- a/docs/Frequency/FrequentDistinctTuplesSketch.md +++ b/docs/Frequency/FrequentDistinctTuplesSketch.md @@ -165,9 +165,9 @@ When the Group is printed as a string, it will output seven columns as follows: ### Error Behavior Note: the code for the following study can be found in the characterization repository -[here](https://github.com/DataSketches/characterization/tree/master/src/main/java/org/apache/datasketches/characterization/fdt) and the configuration file can be found [here](https://github.com/DataSketches/characterization/tree/master/src/main/resources/fdt). +[here](https://github.com/apache/datasketches-characterization/tree/master/java-base/src/main/java/org/apache/datasketches/characterization/fdt) and the configuration file can be found [here](https://github.com/apache/datasketches-characterization/blob/master/java-base/src/main/resources/fdt/FdtAccuracyJob.conf). A login to GitHub will be required. -In order to study the error behavior of this sketch a power-law distribution with a slope of -1 was created. The head of the distribution was a single item with a cardinality of 16384, and the tail of the distribution was 16384 items each with a cardinality of one. All the points inbetween were items that have multiplicities and cardinalities that would fall on a straight line plotted on a Log-X, Log-Y graph. This generated an input stream of about 850K (Key, value) pairs, which was input into the sketch and is considered one trial. The sketch was constructed with a target +In order to study the error behavior of this sketch a power-law distribution with a slope of -1 was created. The head of the distribution was a single item with a cardinality of 16384, and the tail of the distribution was 16384 items each with a cardinality of one. All the points in between were items that have multiplicities and cardinalities that would fall on a straight line plotted on a Log-X, Log-Y graph. This generated an input stream of about 850K (Key, value) pairs, which was input into the sketch and is considered one trial. The sketch was constructed with a target threshold of 1% and a target RSE of 5%. Twenty such trials were run and the error distribution quantiles of the results were computed and is shown in the following graph. diff --git a/pom.xml b/pom.xml index 7a8873326..8bc0c8103 100644 --- a/pom.xml +++ b/pom.xml @@ -85,7 +85,7 @@ under the License. - 20231013 + 20230227 @@ -403,7 +403,7 @@ under the License. This profile is only active when the property "m2e.version" is set, which is the case when building in Eclipse with m2e. The ignore below tells m2eclipse to skip the execution. - --> + m2e @@ -442,6 +442,7 @@ under the License. + --> strict diff --git a/src/main/java/org/apache/datasketches/ByteArrayBuilder.java b/src/main/java/org/apache/datasketches/ByteArrayBuilder.java index 1fc9051c2..ae06e4a3a 100644 --- a/src/main/java/org/apache/datasketches/ByteArrayBuilder.java +++ b/src/main/java/org/apache/datasketches/ByteArrayBuilder.java @@ -36,6 +36,9 @@ public class ByteArrayBuilder { private int count_ = 0; private int capacity_; + /** + * Constructor, no arguments + */ public ByteArrayBuilder() { this(1024); } diff --git a/src/main/java/org/apache/datasketches/Files.java b/src/main/java/org/apache/datasketches/Files.java index 376ab36d4..8c9434288 100644 --- a/src/main/java/org/apache/datasketches/Files.java +++ b/src/main/java/org/apache/datasketches/Files.java @@ -51,6 +51,8 @@ public final class Files { private static final String LS = System.getProperty("line.separator"); private static final byte CR = 0xD; private static final byte LF = 0xA; + + /** DEFAULT_BUFSIZE */ public static final int DEFAULT_BUFSIZE = 8192; // Common IO & NIO file methods