From d947c5244d12f5d08c2bece8831ac2c8a3366217 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sun, 1 Feb 2026 21:32:39 +0900 Subject: [PATCH 1/4] feat: add t-digest double go accuracy profile --- go/go.mod | 2 +- go/go.sum | 2 + go/main.go | 44 +++++++++++++ go/main_test.go | 16 +++++ go/tdigest_double_accuracy_profile.go | 93 +++++++++++++++++++++++++++ go/tdigest_utils.go | 60 +++++++++++++++++ 6 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 go/tdigest_double_accuracy_profile.go create mode 100644 go/tdigest_utils.go diff --git a/go/go.mod b/go/go.mod index 2e40ffc..dbd7701 100644 --- a/go/go.mod +++ b/go/go.mod @@ -18,7 +18,7 @@ module github.com/apache/datasketches-characterization/datasketches-characteriza go 1.24.11 -require github.com/apache/datasketches-go v0.0.0-20260112141520-e1cb959c71df +require github.com/apache/datasketches-go v0.0.0-20260117014825-fabb7290e16c require ( github.com/twmb/murmur3 v1.1.8 // indirect diff --git a/go/go.sum b/go/go.sum index d52bbd0..f8e516e 100644 --- a/go/go.sum +++ b/go/go.sum @@ -2,6 +2,8 @@ github.com/apache/datasketches-go v0.0.0-20251119134622-22517a622447 h1:9B5BDC0H github.com/apache/datasketches-go v0.0.0-20251119134622-22517a622447/go.mod h1:4FkC6sbeiSlLSW/OwrtiTfwj01JYf9AK7DlENi9IIzg= github.com/apache/datasketches-go v0.0.0-20260112141520-e1cb959c71df h1:aNlsKI1eBiWAiUzJ/496C0dergqspcnBPBy68oO4K9s= github.com/apache/datasketches-go v0.0.0-20260112141520-e1cb959c71df/go.mod h1:s+dd951Fa5Xk8BV/jy2+hm38Ab4bJ5vN1DNB1eV7kPU= +github.com/apache/datasketches-go v0.0.0-20260117014825-fabb7290e16c h1:vTZp0e8BAIpG+81agb9khH7+FdSybSrxIkYEXC9gy9U= +github.com/apache/datasketches-go v0.0.0-20260117014825-fabb7290e16c/go.mod h1:s+dd951Fa5Xk8BV/jy2+hm38Ab4bJ5vN1DNB1eV7kPU= github.com/apache/datasketches-go v0.1.0-RC1 h1:4M/7NdXhh4TgefHPzEmikwTnsmXJ0NCsvKvZLgybf0Q= github.com/apache/datasketches-go v0.1.0-RC1/go.mod h1:s+dd951Fa5Xk8BV/jy2+hm38Ab4bJ5vN1DNB1eV7kPU= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= diff --git a/go/main.go b/go/main.go index 7d28502..73301f2 100644 --- a/go/main.go +++ b/go/main.go @@ -222,6 +222,50 @@ var ( numSketches: 32, }, ), + "tdigest_double_accuracy_profile": MustNewTDigestDoubleAccuracyProfile( + tdigestJobConfig{ + lgMin: 0, + lgMax: 23, + ppo: 8, + numTrials: 1000, + errorPCT: 99, + k: 200, + ranks: []float64{0.01, 0.05, 0.5, 0.95, 0.99}, + }, + ), + "tdigest_double_merge_accuracy_profile": MustNewTDigestDoubleMergeAccuracyProfile( + tdigestJobConfig{ + lgMin: 0, + lgMax: 23, + ppo: 8, + numTrials: 1000, + errorPCT: 99, + k: 200, + ranks: []float64{0.01, 0.05, 0.5, 0.95, 0.99}, + numSketches: 32, + }, + ), + "tdigest_double_update_speed_profile": MustNewTDigestDoubleUpdateSpeedProfile( + tdigestJobConfig{ + lgMinStreamLength: 0, + lgMaxStreamLength: 23, + ppo: 16, + lgMinTrials: 6, + lgMaxTrials: 16, + k: 200, + }, + ), + "tdigest_double_merge_speed_profile": MustNewTDigestDoubleMergeSpeedProfile( + tdigestJobConfig{ + lgMinStreamLength: 0, + lgMaxStreamLength: 23, + ppo: 16, + lgMinTrials: 6, + lgMaxTrials: 16, + k: 200, + numSketches: 32, + }, + ), } ) diff --git a/go/main_test.go b/go/main_test.go index c9e2486..588b608 100644 --- a/go/main_test.go +++ b/go/main_test.go @@ -68,3 +68,19 @@ func TestArrayOfNumbersUpdateSpeedRunner(t *testing.T) { func TestArrayOfNumbersUnionUpdateSpeedRunner(t *testing.T) { jobs["array_of_numbers_union_update_speed_profile"].run() } + +func TestTDigestDoubleAccuracyRuner(t *testing.T) { + jobs["tdigest_double_accuracy_profile"].run() +} + +func TestTDigestDoubleMergeAccuracyRunner(t *testing.T) { + jobs["tdigest_double_merge_accuracy_profile"].run() +} + +func TestTDigestDoubleUpdateSpeedRunner(t *testing.T) { + jobs["tdigest_double_update_speed_profile"].run() +} + +func TestTDigestDoubleMergeSpeedRunner(t *testing.T) { + jobs["tdigest_double_merge_speed_profile"].run() +} diff --git a/go/tdigest_double_accuracy_profile.go b/go/tdigest_double_accuracy_profile.go new file mode 100644 index 0000000..f0211d7 --- /dev/null +++ b/go/tdigest_double_accuracy_profile.go @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "math" + "math/rand" + "slices" + "sort" + + "github.com/apache/datasketches-go/tdigest" +) + +type TDigestDoubleAccuracyProfile struct { + config tdigestJobConfig +} + +func MustNewTDigestDoubleAccuracyProfile(cfg tdigestJobConfig) *TDigestDoubleAccuracyProfile { + return &TDigestDoubleAccuracyProfile{ + config: cfg, + } +} + +func (p *TDigestDoubleAccuracyProfile) run() { + fmt.Print("N") + for _, rank := range p.config.ranks { + fmt.Printf("\terr at %.2f", rank) + } + fmt.Println() + + numSteps := countPoints(p.config.lgMin, p.config.lgMax, p.config.ppo) + + rankErrors := make([][]float64, len(p.config.ranks)) + for i := range rankErrors { + rankErrors[i] = make([]float64, p.config.numTrials) + } + + errorPctIndex := p.config.numTrials * p.config.errorPCT / 100 + + streamLength := uint64(1) + for step := 0; step < numSteps; step++ { + for t := 0; t < p.config.numTrials; t++ { + p.runTrial(streamLength, rankErrors, t) + } + + fmt.Print(streamLength) + for i := range p.config.ranks { + sort.Float64s(rankErrors[i]) + rankError := rankErrors[i][errorPctIndex] + fmt.Printf("\t%.6f", rankError*100) + } + fmt.Println() + + streamLength = pwr2SeriesNext(p.config.ppo, streamLength) + } +} + +func (p *TDigestDoubleAccuracyProfile) runTrial(streamLength uint64, rankErrors [][]float64, trialIndex int) { + values := make([]float64, streamLength) + for i := uint64(0); i < streamLength; i++ { + values[i] = rand.ExpFloat64() / 1.5 + } + + sketch, _ := tdigest.NewDouble(uint16(p.config.k)) + for _, v := range values { + sketch.Update(v) + } + + slices.Sort(values) + + for j, rank := range p.config.ranks { + quantile := getQuantile(values, rank) + trueRank := getRankMidpoint(values, quantile) + sketchRank, _ := sketch.Rank(quantile) + rankErrors[j][trialIndex] = math.Abs(sketchRank - trueRank) + } +} diff --git a/go/tdigest_utils.go b/go/tdigest_utils.go new file mode 100644 index 0000000..8dff3cf --- /dev/null +++ b/go/tdigest_utils.go @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import "sort" + +type tdigestJobConfig struct { + lgMin int + lgMax int + ppo int + + numTrials int + errorPCT int + lgMinTrials int + lgMaxTrials int + + lgMinStreamLength int + lgMaxStreamLength int + + k int + ranks []float64 + + numSketches int +} + +func getQuantile(values []float64, rank float64) float64 { + if len(values) == 0 { + return 0 + } + index := int(float64(len(values)-1) * rank) + return values[index] +} + +func getRankMidpoint(values []float64, value float64) float64 { + n := len(values) + if n == 0 { + return 0 + } + lower := sort.SearchFloat64s(values, value) + upper := lower + for upper < n && values[upper] == value { + upper++ + } + return float64(lower+upper) / 2.0 / float64(n) +} From bdf2931577bfb32aa9e8bf5261d20c517bcf6303 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sun, 1 Feb 2026 21:33:10 +0900 Subject: [PATCH 2/4] feat: t-digest double merge accuracy profile for go --- go/tdigest_double_merge_accuracy_profile.go | 104 ++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 go/tdigest_double_merge_accuracy_profile.go diff --git a/go/tdigest_double_merge_accuracy_profile.go b/go/tdigest_double_merge_accuracy_profile.go new file mode 100644 index 0000000..549552a --- /dev/null +++ b/go/tdigest_double_merge_accuracy_profile.go @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "math" + "math/rand" + "slices" + "sort" + + "github.com/apache/datasketches-go/tdigest" +) + +type TDigestDoubleMergeAccuracyProfile struct { + config tdigestJobConfig +} + +func MustNewTDigestDoubleMergeAccuracyProfile(cfg tdigestJobConfig) *TDigestDoubleMergeAccuracyProfile { + return &TDigestDoubleMergeAccuracyProfile{ + config: cfg, + } +} + +func (p *TDigestDoubleMergeAccuracyProfile) run() { + fmt.Print("N") + for _, rank := range p.config.ranks { + fmt.Printf("\terr at %.2f", rank) + } + fmt.Println() + + numSteps := countPoints(p.config.lgMin, p.config.lgMax, p.config.ppo) + + rankErrors := make([][]float64, len(p.config.ranks)) + for i := range rankErrors { + rankErrors[i] = make([]float64, p.config.numTrials) + } + + errorPctIndex := p.config.numTrials * p.config.errorPCT / 100 + + streamLength := uint64(1) + for step := 0; step < numSteps; step++ { + for t := 0; t < p.config.numTrials; t++ { + p.runTrial(streamLength, rankErrors, t) + } + + fmt.Print(streamLength) + for i := range p.config.ranks { + sort.Float64s(rankErrors[i]) + rankError := rankErrors[i][errorPctIndex] + fmt.Printf("\t%.6f", rankError*100) + } + fmt.Println() + + streamLength = pwr2SeriesNext(p.config.ppo, streamLength) + } +} + +func (p *TDigestDoubleMergeAccuracyProfile) runTrial(streamLength uint64, rankErrors [][]float64, trialIndex int) { + numSketches := p.config.numSketches + values := make([]float64, streamLength) + for i := uint64(0); i < streamLength; i++ { + values[i] = rand.ExpFloat64() / 1.5 + } + + sketches := make([]*tdigest.Double, numSketches) + for i := 0; i < numSketches; i++ { + sketch, _ := tdigest.NewDouble(uint16(p.config.k)) + sketches[i] = sketch + } + + for i, v := range values { + sketches[i%numSketches].Update(v) + } + + merged, _ := tdigest.NewDouble(uint16(p.config.k)) + for i := 0; i < numSketches; i++ { + merged.Merge(sketches[i]) + } + + slices.Sort(values) + + for j, rank := range p.config.ranks { + quantile := getQuantile(values, rank) + trueRank := getRankMidpoint(values, quantile) + sketchRank, _ := merged.Rank(quantile) + rankErrors[j][trialIndex] = math.Abs(sketchRank - trueRank) + } +} From df3505ef3303b21dfa1553f71b31b7c7ac46a09c Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sun, 1 Feb 2026 21:33:41 +0900 Subject: [PATCH 3/4] feat: t-digest double merge speed profile for go --- go/tdigest_double_merge_speed_profile.go | 122 +++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 go/tdigest_double_merge_speed_profile.go diff --git a/go/tdigest_double_merge_speed_profile.go b/go/tdigest_double_merge_speed_profile.go new file mode 100644 index 0000000..2d722b6 --- /dev/null +++ b/go/tdigest_double_merge_speed_profile.go @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "math" + "math/rand" + "runtime" + "runtime/debug" + "time" + + "github.com/apache/datasketches-go/tdigest" +) + +type TDigestDoubleMergeSpeedProfile struct { + config tdigestJobConfig +} + +func MustNewTDigestDoubleMergeSpeedProfile(cfg tdigestJobConfig) *TDigestDoubleMergeSpeedProfile { + return &TDigestDoubleMergeSpeedProfile{ + config: cfg, + } +} + +func (p *TDigestDoubleMergeSpeedProfile) run() { + debug.SetMemoryLimit(math.MaxInt64) + + fmt.Println("Stream\tTrials\tBuild\tUpdate\tMerge\tSize") + + maxStreamLength := uint64(1) << p.config.lgMaxStreamLength + values := make([]float64, maxStreamLength) + sketches := make([]*tdigest.Double, p.config.numSketches) + + streamLength := uint64(1) << p.config.lgMinStreamLength + for streamLength < maxStreamLength { + numTrials := getNumTrials( + int(streamLength), + p.config.lgMin, + p.config.lgMax, + p.config.lgMinTrials, + p.config.lgMaxTrials, + ) + + var ( + totalBuildTimeNS int64 + totalUpdateTimeNS int64 + totalMergeTimeNS int64 + totalSizeBytes int64 + ) + + for t := 0; t < numTrials; t++ { + for i := uint64(0); i < streamLength; i++ { + values[i] = rand.Float64() + } + + runtime.GC() + + buildTimeNS, updateTimeNS, mergeTimeNS, sizeBytes := p.runTrial(values[:streamLength], sketches) + totalBuildTimeNS += buildTimeNS + totalUpdateTimeNS += updateTimeNS + totalMergeTimeNS += mergeTimeNS + totalSizeBytes += sizeBytes + } + + avgBuildTimeNS := float64(totalBuildTimeNS) / float64(numTrials) / float64(p.config.numSketches) + avgUpdateTimePerItemNS := float64(totalUpdateTimeNS) / float64(numTrials) / float64(streamLength) / float64(p.config.numSketches) + avgMergeTimeNS := float64(totalMergeTimeNS) / float64(numTrials) / float64(p.config.numSketches) + avgSizeBytes := float64(totalSizeBytes) / float64(numTrials) / float64(p.config.numSketches) + + fmt.Printf("%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", + streamLength, numTrials, avgBuildTimeNS, avgUpdateTimePerItemNS, avgMergeTimeNS, avgSizeBytes) + + streamLength = pwr2SeriesNext(p.config.ppo, streamLength) + } +} + +func (p *TDigestDoubleMergeSpeedProfile) runTrial( + values []float64, sketches []*tdigest.Double, +) (buildTimeNS, updateTimeNS, mergeTimeNS, sizeBytes int64) { + startBuild := time.Now() + for i := 0; i < p.config.numSketches; i++ { + sketch, _ := tdigest.NewDouble(uint16(p.config.k)) + sketches[i] = sketch + } + buildTimeNS = time.Since(startBuild).Nanoseconds() + + startUpdate := time.Now() + for _, sketch := range sketches { + for _, v := range values { + sketch.Update(v) + } + } + updateTimeNS = time.Since(startUpdate).Nanoseconds() + + merged, _ := tdigest.NewDouble(uint16(p.config.k)) + startMerge := time.Now() + for i := 0; i < p.config.numSketches; i++ { + merged.Merge(sketches[i]) + } + mergeTimeNS = time.Since(startMerge).Nanoseconds() + + serialized, _ := tdigest.EncodeDouble(merged, false) + sizeBytes = int64(len(serialized)) + + return buildTimeNS, updateTimeNS, mergeTimeNS, sizeBytes +} From 73323c487b50cc8c2b8adf780a5982d9299801bd Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sun, 1 Feb 2026 21:33:56 +0900 Subject: [PATCH 4/4] feat: t-digest double update speed profile for go --- go/tdigest_double_update_speed_profile.go | 104 ++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 go/tdigest_double_update_speed_profile.go diff --git a/go/tdigest_double_update_speed_profile.go b/go/tdigest_double_update_speed_profile.go new file mode 100644 index 0000000..9fdbcd6 --- /dev/null +++ b/go/tdigest_double_update_speed_profile.go @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "math" + "math/rand" + "runtime" + "runtime/debug" + "time" + + "github.com/apache/datasketches-go/tdigest" +) + +type TDigestDoubleUpdateSpeedProfile struct { + config tdigestJobConfig +} + +func MustNewTDigestDoubleUpdateSpeedProfile(cfg tdigestJobConfig) *TDigestDoubleUpdateSpeedProfile { + return &TDigestDoubleUpdateSpeedProfile{ + config: cfg, + } +} + +func (p *TDigestDoubleUpdateSpeedProfile) run() { + debug.SetMemoryLimit(math.MaxInt64) + + fmt.Println("Stream\tTrials\tBuild\tUpdate\tSize") + + maxStreamLength := uint64(1) << p.config.lgMaxStreamLength + values := make([]float64, maxStreamLength) + + streamLength := uint64(1) << p.config.lgMinStreamLength + for streamLength < maxStreamLength { + numTrials := getNumTrials( + int(streamLength), + p.config.lgMin, + p.config.lgMax, + p.config.lgMinTrials, + p.config.lgMaxTrials, + ) + + var ( + totalBuildTimeNS int64 + totalUpdateTimeNS int64 + totalSizeBytes int64 + ) + + for t := 0; t < numTrials; t++ { + for i := uint64(0); i < streamLength; i++ { + values[i] = rand.Float64() + } + + runtime.GC() + + buildTimeNS, updateTimeNS, sizeBytes := p.runTrial(values[:streamLength]) + totalBuildTimeNS += buildTimeNS + totalUpdateTimeNS += updateTimeNS + totalSizeBytes += sizeBytes + } + + avgBuildTimeNS := float64(totalBuildTimeNS) / float64(numTrials) + avgUpdateTimePerItemNS := float64(totalUpdateTimeNS) / float64(numTrials) / float64(streamLength) + avgSizeBytes := float64(totalSizeBytes) / float64(numTrials) + + fmt.Printf("%d\t%d\t%.2f\t%.2f\t%.2f\n", + streamLength, numTrials, avgBuildTimeNS, avgUpdateTimePerItemNS, avgSizeBytes) + + streamLength = pwr2SeriesNext(p.config.ppo, streamLength) + } +} + +func (p *TDigestDoubleUpdateSpeedProfile) runTrial(values []float64) (buildTimeNS, updateTimeNS, sizeBytes int64) { + startBuild := time.Now() + sketch, _ := tdigest.NewDouble(uint16(p.config.k)) + buildTimeNS = time.Since(startBuild).Nanoseconds() + + startUpdate := time.Now() + for _, v := range values { + sketch.Update(v) + } + updateTimeNS = time.Since(startUpdate).Nanoseconds() + + serialized, _ := tdigest.EncodeDouble(sketch, false) + sizeBytes = int64(len(serialized)) + + return buildTimeNS, updateTimeNS, sizeBytes +}