From 848131a1cdd6de945b86a48782cc2afb08e00734 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 20 Jan 2026 21:22:22 +0800 Subject: [PATCH 01/21] feat: impl CpcSketch Signed-off-by: tison --- datasketches/src/cpc/mod.rs | 3 +++ datasketches/src/cpc/pair_table.rs | 8 ++++++++ datasketches/src/lib.rs | 1 + 3 files changed, 12 insertions(+) create mode 100644 datasketches/src/cpc/mod.rs create mode 100644 datasketches/src/cpc/pair_table.rs diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs new file mode 100644 index 0000000..a2b5957 --- /dev/null +++ b/datasketches/src/cpc/mod.rs @@ -0,0 +1,3 @@ +//! Compressed Probabilistic Counting sketch family. + +mod pair_table; diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs new file mode 100644 index 0000000..0848634 --- /dev/null +++ b/datasketches/src/cpc/pair_table.rs @@ -0,0 +1,8 @@ +/// A highly specialized hash table used for sparse data. +/// +/// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is +/// optimized for scenarios where the cardinality of entries is low. +pub(crate) struct PairTable { + pub keys: Vec, + pub values: Vec, +} diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 009fd9e..17701ab 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -33,6 +33,7 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; pub mod common; pub mod countmin; +pub mod cpc; pub mod error; pub mod frequencies; pub mod hll; From d42cb8ffe429f52d97146a9e31a7a3535d22c4a6 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 20 Jan 2026 23:52:14 +0800 Subject: [PATCH 02/21] pair table Signed-off-by: tison --- Cargo.lock | 98 ++++++++ Cargo.toml | 1 + datasketches/Cargo.toml | 3 +- datasketches/src/cpc/mod.rs | 17 ++ datasketches/src/cpc/pair_table.rs | 345 ++++++++++++++++++++++++++++- 5 files changed, 460 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 34faf60..f9efb3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -73,6 +73,12 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "clap" version = "4.5.53" @@ -124,6 +130,7 @@ name = "datasketches" version = "0.2.0" dependencies = [ "googletest", + "rand", ] [[package]] @@ -149,6 +156,18 @@ dependencies = [ "datasketches", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "googletest" version = "0.14.2" @@ -217,6 +236,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.104" @@ -235,6 +263,41 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + [[package]] name = "regex" version = "1.12.2" @@ -312,6 +375,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "which" version = "8.0.0" @@ -344,6 +416,12 @@ version = "0.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "x" version = "0.0.0" @@ -351,3 +429,23 @@ dependencies = [ "clap", "which", ] + +[[package]] +name = "zerocopy" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 30c7a35..c7e4a18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ datasketches = { path = "datasketches" } # Crates.io dependencies clap = { version = "4.5.20", features = ["derive"] } googletest = { version = "0.14.2" } +rand = { version = "0.9.2" } which = { version = "8.0.0" } [workspace.lints.rust] diff --git a/datasketches/Cargo.toml b/datasketches/Cargo.toml index 9676454..5e1f7ab 100644 --- a/datasketches/Cargo.toml +++ b/datasketches/Cargo.toml @@ -34,10 +34,9 @@ keywords = ["sketch", "hyperloglog", "probabilistic"] all-features = true rustdoc-args = ["--cfg", "docsrs"] -[dependencies] - [dev-dependencies] googletest = { workspace = true } +rand = { workspace = true } [lints] workspace = true diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index a2b5957..2be42c1 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! Compressed Probabilistic Counting sketch family. mod pair_table; diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index 0848634..9ab6888 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -1,8 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const UPSIZE_NUMERATOR: u32 = 3; +const UPSIZE_DENOMINATOR: u32 = 4; +const DOWNSIZE_NUMERATOR: u32 = 1; +const DOWNSIZE_DENOMINATOR: u32 = 4; + /// A highly specialized hash table used for sparse data. /// /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is /// optimized for scenarios where the cardinality of entries is low. pub(crate) struct PairTable { - pub keys: Vec, - pub values: Vec, + /// log2 of number of slots + lg_size: u8, + num_valid_bits: u8, + num_items: u32, + slots: Vec, +} + +impl PairTable { + pub fn new(lg_size: u8, num_valid_bits: u8) -> Self { + assert!( + (2..=26).contains(&lg_size), + "lg_size must be in [2, 26], got {lg_size}" + ); + assert!( + ((lg_size + 1)..=32).contains(&num_valid_bits), + "num_valid_bits must be in [lg_size + 1, 32], got {num_valid_bits} where lg_size = {lg_size}" + ); + Self { + lg_size, + num_valid_bits, + num_items: 0, + slots: vec![u32::MAX; 1 << lg_size], + } + } + + /// A constructor specifically tailored to be a part of FM85 decompression scheme. + pub fn from_slots(lg_size: u8, num_items: u32, slots: Vec) -> Self { + let mut lg_num_slots = 2; + while UPSIZE_DENOMINATOR * num_items > (UPSIZE_NUMERATOR * (1 << lg_num_slots)) { + lg_num_slots += 1; + } + + let mut table = Self::new(lg_num_slots, 6 + lg_size); + + // Note: there is a possible "snowplow effect" here because the caller is passing in a + // sorted pairs array. However, we are starting out with the correct final table size, so + // the problem might not occur. + + for slot in slots { + table.must_insert(slot); + } + table.num_items = num_items; + table + } + + pub fn lg_size(&self) -> u8 { + self.lg_size + } + + pub fn num_items(&self) -> u32 { + self.num_items + } + + pub fn slots(&self) -> &[u32] { + &self.slots + } + + pub fn clear(&mut self) { + self.slots.fill(u32::MAX); + self.num_items = 0; + } + + pub fn maybe_delete(&mut self, item: u32) -> bool { + let index = self.lookup(item) as usize; + if self.slots[index] == u32::MAX { + return false; + } + assert_eq!( + self.slots[index], item, + "item {item} not found at index {index}" + ); + assert!(self.num_items > 0, "no items to delete"); + + // delete the item + self.slots[index] = u32::MAX; + self.num_items -= 1; + + // re-insert all items between the freed slot and the next empty slot + let mask = (1 << self.lg_size) - 1; + let mut probe = (index + 1) & mask; + let mut fetched = self.slots[probe]; + while fetched != u32::MAX { + self.slots[probe] = u32::MAX; + self.must_insert(fetched); + probe = (probe + 1) & mask; + fetched = self.slots[probe]; + } + + // shrink if necessary + while ((DOWNSIZE_DENOMINATOR * self.num_items) < (DOWNSIZE_NUMERATOR * (1 << self.lg_size))) + && (self.lg_size > 2) + { + self.rebuild(self.lg_size - 1); + } + + true + } + + pub fn maybe_insert(&mut self, item: u32) -> bool { + let index = self.lookup(item) as usize; + if self.slots[index] == item { + return false; + } + assert_eq!( + self.slots[index], + u32::MAX, + "no empty slot found for item {item} at index {index}" + ); + self.slots[index] = item; + self.num_items += 1; + while (UPSIZE_DENOMINATOR * self.num_items) > (UPSIZE_NUMERATOR * (1 << self.lg_size)) { + self.rebuild(self.lg_size + 1); + } + true + } + + /// While extracting the items from a linear probing hashtable, this will usually undo the + /// wrap-around provided that the table isn't too full. + /// + /// Experiments suggest that for sufficiently large tables the load factor would have to be + /// over 90 percent before this would fail frequently, and even then the subsequent sort would + /// fix things up. + /// + /// The result is nearly sorted, so make sure to use an efficient sort for that case. + pub fn unwrapping_get_items(&self) -> Vec { + if self.slots.is_empty() { + return vec![]; + } + + let table_size = 1usize << self.lg_size; + let mut result = vec![0; self.num_items as usize]; + let mut i = 0usize; + let mut l = 0usize; + let mut r = self.num_items as usize - 1; + + // special rules for the region before the first empty slot + let hi_bit = 1 << (self.num_valid_bits - 1); + while i < table_size && self.slots[i] != u32::MAX { + let item = self.slots[i]; + i += 1; + if (item & hi_bit) != 0 { + // this item was probably wrapped, so move to end + result[r] = item; + r -= 1; + } else { + result[l] = item; + l += 1; + } + } + + // the rest of the table is processed normally + while i < table_size { + let item = self.slots[i]; + i += 1; + if item != u32::MAX { + result[l] = item; + l += 1; + } + } + + assert_eq!(l, r + 1, "number of items mismatch during extraction"); + result + } + + fn must_insert(&mut self, item: u32) { + let index = self.lookup(item) as usize; + assert_ne!( + self.slots[index], item, + "item {item} already present in table" + ); + assert_eq!( + self.slots[index], + u32::MAX, + "no empty slot found for item {item} at index {index}" + ); + self.slots[index] = item; + // counts and resizing must be handled by the caller. + } + + fn lookup(&self, item: u32) -> u32 { + let size = 1 << self.lg_size; + let mask = size - 1; + + let shift = self.num_valid_bits - self.lg_size; + + // extract high table size bits + let mut probe = item >> shift; + assert!(probe <= mask, "probe = {probe}, mask = {mask}"); + + loop { + let slot = self.slots[probe as usize]; + if slot != item && slot != u32::MAX { + probe = (probe + 1) & mask; + } else { + break; + } + } + + probe + } + + /// Rebuilds to a larger size. `num_items` and `num_valid_bits` remain unchanged. + fn rebuild(&mut self, lg_size: u8) { + assert!( + (2..=26).contains(&lg_size), + "lg_size must be in [2, 26], got {lg_size}" + ); + assert!( + ((lg_size + 1)..=32).contains(&self.num_valid_bits), + "num_valid_bits must be in [lg_size + 1, 32], got {} where lg_size = {lg_size}", + self.num_valid_bits + ); + + let new_size = 1u32 << lg_size; + assert!( + new_size > self.num_items, + "new table size ({new_size}) must be larger than number of items {}", + self.num_items + ); + + let slots = std::mem::replace(&mut self.slots, vec![u32::MAX; new_size as usize]); + self.lg_size = lg_size; + for slot in slots { + if slot != u32::MAX { + self.must_insert(slot); + } + } + } +} + +/// This should be used pair with [`PairTable::unwrapping_get_items`]. +/// +/// In applications where the input array is already nearly sorted, insertion sort runs in linear +/// time with a very small constant. +/// +/// This introspective version of insertion sort protects against the quadratic cost of sorting bad +/// input arrays. It keeps track of how much work has been done, and if that exceeds a constant +/// times the array length, it switches to a different sorting algorithm. +pub fn introspective_insertion_sort(a: &mut [u32]) { + let cost_limit = 8 * a.len(); + + let mut cost = 0; + for i in 1..a.len() { + let mut j = i; + let v = a[i]; + while j >= 1 && v < a[j - 1] { + a[j] = a[j - 1]; + j -= 1; + } + a[j] = v; + cost += i - j; // distance moved is a measure of work + if cost > cost_limit { + knuth_shell_sort3(a); + return; + } + } +} + +fn knuth_shell_sort3(a: &mut [u32]) { + let len = a.len(); + + let mut h = 0; + while h < len / 9 { + h = 3 * h + 1; + } + + while h > 0 { + for i in h..len { + let v = a[i]; + let mut j = i; + while j >= h && v < a[j - h] { + a[j] = a[j - h]; + j -= h; + } + a[j] = v; + } + h /= 3; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sort_1() { + let data = (0..10) + .map(|_| rand::random_range(0..10000)) + .collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } + + #[test] + fn test_sort_2() { + let data = (0..10) + .map(|_| rand::random_range(0..10000) + 3_000_000_000) + .collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } + + #[test] + fn test_sort_3() { + let len = 20; + let data = (0..len).map(|i| len - i + 1).collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } } From 8b9d82f927f3aa78cf2b29217d25c62e69525202 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 13:29:27 +0800 Subject: [PATCH 03/21] icon_estimator Signed-off-by: tison --- datasketches/src/cpc/icon_estimator.rs | 234 +++++++++++++++++++++++++ datasketches/src/cpc/mod.rs | 1 + datasketches/src/cpc/pair_table.rs | 2 +- 3 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 datasketches/src/cpc/icon_estimator.rs diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/icon_estimator.rs new file mode 100644 index 0000000..5c91ff3 --- /dev/null +++ b/datasketches/src/cpc/icon_estimator.rs @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The ICON estimator for FM85 sketches is defined by the arXiv paper. +//! +//! The current file provides exact and approximate implementations of this estimator. The exact +//! version works for any value of K, but is quite slow. +//! +//! The much faster approximate version works for K values that are powers of two ranging from 2^4 +//! to 2^32. +//! +//! At a high-level, this approximation can be described as using an exponential approximation when +//! C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial +//! approximation of a pre-conditioned version of the true ICON mapping from C to N_hat. +//! +//! This file also provides a validation procedure that compares its approximate and exact +//! implementations of the FM85 ICON estimator. + +const ICON_MIN_LOG_K: usize = 4; +const ICON_MAX_LOG_K: usize = 26; +const ICON_POLYNOMIAL_DEGREE: usize = 19; +const ICON_POLYNOMIAL_NUM_COEFFICIENTS: usize = 1 + ICON_POLYNOMIAL_DEGREE; +const ICON_TABLE_SIZE: usize = + ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K)); + +#[rustfmt::skip] +#[allow(clippy::excessive_precision)] +const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [ + // log K = 4 + 0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817, + 1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904, + -30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998, + 0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05, + + // log K = 5 + 0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854, + 1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069, + -28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962, + 0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05, + + // log K = 6 + 0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116, + 1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077, + -28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851, + 0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05, + + // log K = 7 + 0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863, + 1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936, + -28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037, + 0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05, + + // log K = 8 + 0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681, + 0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002, + -15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996, + 0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05, + + // log K = 9 + 0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214, + 0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059, + -15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002, + 0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05, + + // log K = 10 + 0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428, + 0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034, + -15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093, + 0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05, + + // log K = 11 + 0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211, + 0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958, + -14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051, + 0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05, + + // log K = 12 + 0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343, + 0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078, + -17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076, + 0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05, + + // log K = 13 + 0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548, + 0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962, + -17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975, + 0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05, + + // log K = 14 + 0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777, + 0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022, + -17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993, + 0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05, + + // log K = 15 + 0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128, + 0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048, + -18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953, + 0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05, + + // log K = 16 + 0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592, + 0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041, + -18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102, + 0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05, + + // log K = 17 + 0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043, + 0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058, + -18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063, + 0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05, + + // log K = 18 + 0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003, + 0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132, + -18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104, + 0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05, + + // log K = 19 + 0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954, + 0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001, + -16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915, + 0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05, + + // log K = 20 + 0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951, + 0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011, + -15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906, + 0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05, + + // log K = 21 + 0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961, + 0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888, + -19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987, + 0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05, + + // log K = 22 + 0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969, + 0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099, + -18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093, + 0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05, + + // log K = 23 + 0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059, + 0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843, + -18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987, + 0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05, + + // log K = 24 + 0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152, + 0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951, + -17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056, + 0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05, + + // log K = 25 + 0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451, + 0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923, + -17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892, + 0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05, + + // log K = 26 + 0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949, + 0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154, + -18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955, + 0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05, +]; + +pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 { + let end = start + num - 1; + let mut total = coefficients[end]; + for i in (start..end).rev() { + total *= x; + total += coefficients[i]; + } + total +} + +pub(super) fn icon_exponential_approximation(k: f64, c: f64) -> f64 { + 0.7940236163830469 * k * 2f64.powf(c / k) +} + +pub(super) fn compute_icon_estimate(lg_k: u8, c: u32) -> f64 { + let lg_k = lg_k as usize; + assert!( + (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k), + "lg_k out of range; got {lg_k}", + ); + + match c { + 0 => return 0.0, + 1 => return 1.0, + _ => {} + } + + let k = (1 << lg_k) as f64; + let c = c as f64; + + // Differing thresholds ensure that the approximated estimator is monotonically increasing. + let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 }; + if c > threshold_factor * k { + return icon_exponential_approximation(k, c); + } + + let factor = evaluate_polynomial( + &ICON_POLYNOMIAL_COEFFICIENTS, + ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K), + ICON_POLYNOMIAL_NUM_COEFFICIENTS, + // The constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS. + // This factor, although somewhat arbitrary, is based on extensive characterization studies + // and is considered a safe conservative factor. + c / (2.0 * k), + ); + let ratio = c / k; + // The constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS. + // This factor, although somewhat arbitrary, is based on extensive characterization studies + // and is considered a safe conservative factor. + let term = 1.0 + (ratio * ratio * ratio / 66.774757); + let result = c * factor * term; + if result >= c { result } else { c } +} diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 2be42c1..4f03e5c 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -17,4 +17,5 @@ //! Compressed Probabilistic Counting sketch family. +mod icon_estimator; mod pair_table; diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index 9ab6888..8701ac1 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -24,7 +24,7 @@ const DOWNSIZE_DENOMINATOR: u32 = 4; /// /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is /// optimized for scenarios where the cardinality of entries is low. -pub(crate) struct PairTable { +pub(super) struct PairTable { /// log2 of number of slots lg_size: u8, num_valid_bits: u8, From 6219eed9786ff7496b768aad27855c0df1f16fb0 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 13:44:19 +0800 Subject: [PATCH 04/21] lookup tables Signed-off-by: tison --- datasketches/src/common/inv_pow2_table.rs | 103 ++++++++++++++++++++++ datasketches/src/common/mod.rs | 1 + datasketches/src/cpc/kxp_byte_lookup.rs | 76 ++++++++++++++++ datasketches/src/cpc/mod.rs | 1 + 4 files changed, 181 insertions(+) create mode 100644 datasketches/src/common/inv_pow2_table.rs create mode 100644 datasketches/src/cpc/kxp_byte_lookup.rs diff --git a/datasketches/src/common/inv_pow2_table.rs b/datasketches/src/common/inv_pow2_table.rs new file mode 100644 index 0000000..580a2fc --- /dev/null +++ b/datasketches/src/common/inv_pow2_table.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[rustfmt::skip] +#[allow(clippy::excessive_precision)] +pub(crate) const INVERSE_POWERS_OF_2: [f64; 256] = [ + 1.0, 0.5, 0.25, 0.125, + 0.0625, 0.03125, 0.015625, 0.0078125, + 0.00390625, 0.001953125, 0.0009765625, 0.00048828125, + 0.000244140625, 0.0001220703125, 6.103515625e-05, 3.0517578125e-05, + 1.52587890625e-05, 7.62939453125e-06, 3.814697265625e-06, 1.9073486328125e-06, + 9.5367431640625e-07, 4.76837158203125e-07, 2.384185791015625e-07, 1.1920928955078125e-07, + 5.9604644775390625e-08, 2.9802322387695312e-08, 1.4901161193847656e-08, 7.4505805969238281e-09, + 3.7252902984619141e-09, 1.862645149230957e-09, 9.3132257461547852e-10, 4.6566128730773926e-10, + 2.3283064365386963e-10, 1.1641532182693481e-10, 5.8207660913467407e-11, 2.9103830456733704e-11, + 1.4551915228366852e-11, 7.2759576141834259e-12, 3.637978807091713e-12, 1.8189894035458565e-12, + 9.0949470177292824e-13, 4.5474735088646412e-13, 2.2737367544323206e-13, 1.1368683772161603e-13, + 5.6843418860808015e-14, 2.8421709430404007e-14, 1.4210854715202004e-14, 7.1054273576010019e-15, + 3.5527136788005009e-15, 1.7763568394002505e-15, 8.8817841970012523e-16, 4.4408920985006262e-16, + 2.2204460492503131e-16, 1.1102230246251565e-16, 5.5511151231257827e-17, 2.7755575615628914e-17, + 1.3877787807814457e-17, 6.9388939039072284e-18, 3.4694469519536142e-18, 1.7347234759768071e-18, + 8.6736173798840355e-19, 4.3368086899420177e-19, 2.1684043449710089e-19, 1.0842021724855044e-19, + 5.4210108624275222e-20, 2.7105054312137611e-20, 1.3552527156068805e-20, 6.7762635780344027e-21, + 3.3881317890172014e-21, 1.6940658945086007e-21, 8.4703294725430034e-22, 4.2351647362715017e-22, + 2.1175823681357508e-22, 1.0587911840678754e-22, 5.2939559203393771e-23, 2.6469779601696886e-23, + 1.3234889800848443e-23, 6.6174449004242214e-24, 3.3087224502121107e-24, 1.6543612251060553e-24, + 8.2718061255302767e-25, 4.1359030627651384e-25, 2.0679515313825692e-25, 1.0339757656912846e-25, + 5.169878828456423e-26, 2.5849394142282115e-26, 1.2924697071141057e-26, 6.4623485355705287e-27, + 3.2311742677852644e-27, 1.6155871338926322e-27, 8.0779356694631609e-28, 4.0389678347315804e-28, + 2.0194839173657902e-28, 1.0097419586828951e-28, 5.0487097934144756e-29, 2.5243548967072378e-29, + 1.2621774483536189e-29, 6.3108872417680944e-30, 3.1554436208840472e-30, 1.5777218104420236e-30, + 7.8886090522101181e-31, 3.944304526105059e-31, 1.9721522630525295e-31, 9.8607613152626476e-32, + 4.9303806576313238e-32, 2.4651903288156619e-32, 1.2325951644078309e-32, 6.1629758220391547e-33, + 3.0814879110195774e-33, 1.5407439555097887e-33, 7.7037197775489434e-34, 3.8518598887744717e-34, + 1.9259299443872359e-34, 9.6296497219361793e-35, 4.8148248609680896e-35, 2.4074124304840448e-35, + 1.2037062152420224e-35, 6.018531076210112e-36, 3.009265538105056e-36, 1.504632769052528e-36, + 7.5231638452626401e-37, 3.76158192263132e-37, 1.88079096131566e-37, 9.4039548065783001e-38, + 4.70197740328915e-38, 2.350988701644575e-38, 1.1754943508222875e-38, 5.8774717541114375e-39, + 2.9387358770557188e-39, 1.4693679385278594e-39, 7.3468396926392969e-40, 3.6734198463196485e-40, + 1.8367099231598242e-40, 9.1835496157991212e-41, 4.5917748078995606e-41, 2.2958874039497803e-41, + 1.1479437019748901e-41, 5.7397185098744507e-42, 2.8698592549372254e-42, 1.4349296274686127e-42, + 7.1746481373430634e-43, 3.5873240686715317e-43, 1.7936620343357659e-43, 8.9683101716788293e-44, + 4.4841550858394146e-44, 2.2420775429197073e-44, 1.1210387714598537e-44, 5.6051938572992683e-45, + 2.8025969286496341e-45, 1.4012984643248171e-45, 7.0064923216240854e-46, 3.5032461608120427e-46, + 1.7516230804060213e-46, 8.7581154020301067e-47, 4.3790577010150533e-47, 2.1895288505075267e-47, + 1.0947644252537633e-47, 5.4738221262688167e-48, 2.7369110631344083e-48, 1.3684555315672042e-48, + 6.8422776578360209e-49, 3.4211388289180104e-49, 1.7105694144590052e-49, 8.5528470722950261e-50, + 4.276423536147513e-50, 2.1382117680737565e-50, 1.0691058840368783e-50, 5.3455294201843913e-51, + 2.6727647100921956e-51, 1.3363823550460978e-51, 6.6819117752304891e-52, 3.3409558876152446e-52, + 1.6704779438076223e-52, 8.3523897190381114e-53, 4.1761948595190557e-53, 2.0880974297595278e-53, + 1.0440487148797639e-53, 5.2202435743988196e-54, 2.6101217871994098e-54, 1.3050608935997049e-54, + 6.5253044679985245e-55, 3.2626522339992623e-55, 1.6313261169996311e-55, 8.1566305849981557e-56, + 4.0783152924990778e-56, 2.0391576462495389e-56, 1.0195788231247695e-56, 5.0978941156238473e-57, + 2.5489470578119236e-57, 1.2744735289059618e-57, 6.3723676445298091e-58, 3.1861838222649046e-58, + 1.5930919111324523e-58, 7.9654595556622614e-59, 3.9827297778311307e-59, 1.9913648889155653e-59, + 9.9568244445778267e-60, 4.9784122222889134e-60, 2.4892061111444567e-60, 1.2446030555722283e-60, + 6.2230152778611417e-61, 3.1115076389305709e-61, 1.5557538194652854e-61, 7.7787690973264271e-62, + 3.8893845486632136e-62, 1.9446922743316068e-62, 9.7234613716580339e-63, 4.861730685829017e-63, + 2.4308653429145085e-63, 1.2154326714572542e-63, 6.0771633572862712e-64, 3.0385816786431356e-64, + 1.5192908393215678e-64, 7.596454196607839e-65, 3.7982270983039195e-65, 1.8991135491519597e-65, + 9.4955677457597987e-66, 4.7477838728798994e-66, 2.3738919364399497e-66, 1.1869459682199748e-66, + 5.9347298410998742e-67, 2.9673649205499371e-67, 1.4836824602749686e-67, 7.4184123013748428e-68, + 3.7092061506874214e-68, 1.8546030753437107e-68, 9.2730153767185535e-69, 4.6365076883592767e-69, + 2.3182538441796384e-69, 1.1591269220898192e-69, 5.7956346104490959e-70, 2.897817305224548e-70, + 1.448908652612274e-70, 7.2445432630613699e-71, 3.6222716315306849e-71, 1.8111358157653425e-71, + 9.0556790788267124e-72, 4.5278395394133562e-72, 2.2639197697066781e-72, 1.131959884853339e-72, + 5.6597994242666952e-73, 2.8298997121333476e-73, 1.4149498560666738e-73, 7.074749280333369e-74, + 3.5373746401666845e-74, 1.7686873200833423e-74, 8.8434366004167113e-75, 4.4217183002083556e-75, + 2.2108591501041778e-75, 1.1054295750520889e-75, 5.5271478752604446e-76, 2.7635739376302223e-76, + 1.3817869688151111e-76, 6.9089348440755557e-77, 3.4544674220377779e-77, 1.7272337110188889e-77 +]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn assert_inv_pow2_table() { + for i in 0u8..=255 { + // expected value + let expected = INVERSE_POWERS_OF_2[i as usize]; + + // computed value + let computed = 2f64.powf(-(i as f64)); + + assert_eq!(expected, computed); + } + } +} diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs index 017ddc9..36b734f 100644 --- a/datasketches/src/common/mod.rs +++ b/datasketches/src/common/mod.rs @@ -25,3 +25,4 @@ pub use self::resize::ResizeFactor; // private to datasketches crate pub(crate) mod binomial_bounds; +pub(crate) mod inv_pow2_table; diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs b/datasketches/src/cpc/kxp_byte_lookup.rs new file mode 100644 index 0000000..96649ea --- /dev/null +++ b/datasketches/src/cpc/kxp_byte_lookup.rs @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const KXP_BYTE_TABLE: [f64; 256] = [ + 0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375, + 0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375, + 0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375, + 0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375, + 0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875, + 0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875, + 0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875, + 0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875, + 0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125, + 0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125, + 0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125, + 0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125, + 0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625, + 0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625, + 0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625, + 0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625, + 0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875, + 0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875, + 0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375, + 0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375, + 0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625, + 0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625, + 0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125, + 0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125, + 0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375, 0.921875, + 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875, 0.953125, 0.453125, + 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125, 0.890625, 0.390625, 0.640625, + 0.140625, 0.765625, 0.265625, 0.515625, 0.015625, 0.96875, 0.46875, 0.71875, 0.21875, 0.84375, + 0.34375, 0.59375, 0.09375, 0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125, + 0.03125, 0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625, 0.875, 0.375, 0.625, + 0.125, 0.75, 0.25, 0.5, 0.0, +]; + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; + + #[test] + fn assert_kxp_byte_table() { + for byte in 0u8..=255 { + // expected value + let expected = KXP_BYTE_TABLE[byte as usize]; + + // computed value + let mut computed = 0.0; + for col in 0..8 { + let bit = (byte >> col) & 1; + if bit == 0 { + // note the inverted logic + computed += INVERSE_POWERS_OF_2[col + 1]; //note the "+1" + } + } + + assert_eq!(expected, computed); + } + } +} diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 4f03e5c..0e6094f 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -18,4 +18,5 @@ //! Compressed Probabilistic Counting sketch family. mod icon_estimator; +mod kxp_byte_lookup; mod pair_table; From 2ae4095c545c89da075c9fecf961d043781c64b2 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 15:22:12 +0800 Subject: [PATCH 05/21] cpc_confidence Signed-off-by: tison --- datasketches/src/common/num_std_dev.rs | 5 + datasketches/src/cpc/cpc_confidence.rs | 188 +++++++++++++++++++++++++ datasketches/src/cpc/icon_estimator.rs | 6 +- datasketches/src/cpc/mod.rs | 1 + 4 files changed, 197 insertions(+), 3 deletions(-) create mode 100644 datasketches/src/cpc/cpc_confidence.rs diff --git a/datasketches/src/common/num_std_dev.rs b/datasketches/src/common/num_std_dev.rs index 5cae235..92cbddd 100644 --- a/datasketches/src/common/num_std_dev.rs +++ b/datasketches/src/common/num_std_dev.rs @@ -50,4 +50,9 @@ impl NumStdDev { pub const fn tail_probability(&self) -> f64 { DELTA_OF_NUM_STD_DEVS[*self as usize] } + + /// Returns the number of standard deviations as an `u8`. + pub const fn as_u8(&self) -> u8 { + *self as u8 + } } diff --git a/datasketches/src/cpc/cpc_confidence.rs b/datasketches/src/cpc/cpc_confidence.rs new file mode 100644 index 0000000..21863ef --- /dev/null +++ b/datasketches/src/cpc/cpc_confidence.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::f64::consts::LN_2; + +use crate::common::NumStdDev; +use crate::cpc::icon_estimator::icon_estimate; + +const ICON_ERROR_CONSTANT: f64 = LN_2; + +const ICON_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 6037, 5720, 5328, // 4 1000000 + 6411, 6262, 5682, // 5 1000000 + 6724, 6403, 6127, // 6 1000000 + 6665, 6411, 6208, // 7 1000000 + 6959, 6525, 6427, // 8 1000000 + 6892, 6665, 6619, // 9 1000000 + 6792, 6752, 6690, // 10 1000000 + 6899, 6818, 6708, // 11 1000000 + 6871, 6845, 6812, // 12 1046369 + 6909, 6861, 6828, // 13 1043411 + 6919, 6897, 6842, // 14 1000297 +]; + +const ICON_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 8031, 8559, 9309, // 4 1000000 + 7084, 7959, 8660, // 5 1000000 + 7141, 7514, 7876, // 6 1000000 + 7458, 7430, 7572, // 7 1000000 + 6892, 7141, 7497, // 8 1000000 + 6889, 7132, 7290, // 9 1000000 + 7075, 7118, 7185, // 10 1000000 + 7040, 7047, 7085, // 11 1000000 + 6993, 7019, 7053, // 12 1046369 + 6953, 7001, 6983, // 13 1043411 + 6944, 6966, 7004, // 14 1000297 +]; + +#[allow(clippy::excessive_precision)] +const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt() + +const HIP_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5871, 5247, 4826, // 4 1000000 + 5877, 5403, 5070, // 5 1000000 + 5873, 5533, 5304, // 6 1000000 + 5878, 5632, 5464, // 7 1000000 + 5874, 5690, 5564, // 8 1000000 + 5880, 5745, 5619, // 9 1000000 + 5875, 5784, 5701, // 10 1000000 + 5866, 5789, 5742, // 11 1000000 + 5869, 5827, 5784, // 12 1046369 + 5876, 5860, 5827, // 13 1043411 + 5881, 5853, 5842, // 14 1000297 +]; + +const HIP_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5855, 6688, 7391, // 4 1000000 + 5886, 6444, 6923, // 5 1000000 + 5885, 6254, 6594, // 6 1000000 + 5889, 6134, 6326, // 7 1000000 + 5900, 6072, 6203, // 8 1000000 + 5875, 6005, 6089, // 9 1000000 + 5871, 5980, 6040, // 10 1000000 + 5889, 5941, 6015, // 11 1000000 + 5871, 5926, 5973, // 12 1046369 + 5866, 5901, 5915, // 13 1043411 + 5880, 5914, 5953, // 14 1000297 +]; + +pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 - eps); + result.ceil() // slight widening of interval to be conservative +} + +// merge_flag must already be checked as false +pub(super) fn hip_confidence_lb( + lg_k: u8, + num_coupons: u32, + hip_estimate: f64, + kappa: NumStdDev, +) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_estimate / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +// merge_flag must already be checked as false +pub(super) fn get_hip_confidence_ub( + lg_k: u8, + num_coupons: u32, + hip_estimate: f64, + kappa: NumStdDev, +) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_estimate / (1.0 - eps); + result.ceil() // widening for coverage +} diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/icon_estimator.rs index 5c91ff3..c817aa4 100644 --- a/datasketches/src/cpc/icon_estimator.rs +++ b/datasketches/src/cpc/icon_estimator.rs @@ -179,7 +179,7 @@ const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [ 0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05, ]; -pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 { +fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 { let end = start + num - 1; let mut total = coefficients[end]; for i in (start..end).rev() { @@ -189,11 +189,11 @@ pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize total } -pub(super) fn icon_exponential_approximation(k: f64, c: f64) -> f64 { +fn icon_exponential_approximation(k: f64, c: f64) -> f64 { 0.7940236163830469 * k * 2f64.powf(c / k) } -pub(super) fn compute_icon_estimate(lg_k: u8, c: u32) -> f64 { +pub(super) fn icon_estimate(lg_k: u8, c: u32) -> f64 { let lg_k = lg_k as usize; assert!( (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k), diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 0e6094f..ad0c69f 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -17,6 +17,7 @@ //! Compressed Probabilistic Counting sketch family. +mod cpc_confidence; mod icon_estimator; mod kxp_byte_lookup; mod pair_table; From d201bfbefeda3f742b71eb3828f31a6f382dd864 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 16:52:58 +0800 Subject: [PATCH 06/21] cpc sketch impl Signed-off-by: tison --- datasketches/src/cpc/cpc_confidence.rs | 188 --------------- .../cpc/{icon_estimator.rs => estimator.rs} | 228 ++++++++++++++++-- datasketches/src/cpc/mod.rs | 6 +- datasketches/src/cpc/pair_table.rs | 1 + datasketches/src/cpc/sketch.rs | 115 +++++++++ 5 files changed, 333 insertions(+), 205 deletions(-) delete mode 100644 datasketches/src/cpc/cpc_confidence.rs rename datasketches/src/cpc/{icon_estimator.rs => estimator.rs} (68%) create mode 100644 datasketches/src/cpc/sketch.rs diff --git a/datasketches/src/cpc/cpc_confidence.rs b/datasketches/src/cpc/cpc_confidence.rs deleted file mode 100644 index 21863ef..0000000 --- a/datasketches/src/cpc/cpc_confidence.rs +++ /dev/null @@ -1,188 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::f64::consts::LN_2; - -use crate::common::NumStdDev; -use crate::cpc::icon_estimator::icon_estimate; - -const ICON_ERROR_CONSTANT: f64 = LN_2; - -const ICON_LOW_SIDE_DATA: [u16; 33] = [ - //1, 2, 3, kappa - // lgK num trials - 6037, 5720, 5328, // 4 1000000 - 6411, 6262, 5682, // 5 1000000 - 6724, 6403, 6127, // 6 1000000 - 6665, 6411, 6208, // 7 1000000 - 6959, 6525, 6427, // 8 1000000 - 6892, 6665, 6619, // 9 1000000 - 6792, 6752, 6690, // 10 1000000 - 6899, 6818, 6708, // 11 1000000 - 6871, 6845, 6812, // 12 1046369 - 6909, 6861, 6828, // 13 1043411 - 6919, 6897, 6842, // 14 1000297 -]; - -const ICON_HIGH_SIDE_DATA: [u16; 33] = [ - //1, 2, 3, kappa - // lgK num trials - 8031, 8559, 9309, // 4 1000000 - 7084, 7959, 8660, // 5 1000000 - 7141, 7514, 7876, // 6 1000000 - 7458, 7430, 7572, // 7 1000000 - 6892, 7141, 7497, // 8 1000000 - 6889, 7132, 7290, // 9 1000000 - 7075, 7118, 7185, // 10 1000000 - 7040, 7047, 7085, // 11 1000000 - 6993, 7019, 7053, // 12 1046369 - 6953, 7001, 6983, // 13 1043411 - 6944, 6966, 7004, // 14 1000297 -]; - -#[allow(clippy::excessive_precision)] -const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt() - -const HIP_LOW_SIDE_DATA: [u16; 33] = [ - //1, 2, 3, kappa - // lgK num trials - 5871, 5247, 4826, // 4 1000000 - 5877, 5403, 5070, // 5 1000000 - 5873, 5533, 5304, // 6 1000000 - 5878, 5632, 5464, // 7 1000000 - 5874, 5690, 5564, // 8 1000000 - 5880, 5745, 5619, // 9 1000000 - 5875, 5784, 5701, // 10 1000000 - 5866, 5789, 5742, // 11 1000000 - 5869, 5827, 5784, // 12 1046369 - 5876, 5860, 5827, // 13 1043411 - 5881, 5853, 5842, // 14 1000297 -]; - -const HIP_HIGH_SIDE_DATA: [u16; 33] = [ - //1, 2, 3, kappa - // lgK num trials - 5855, 6688, 7391, // 4 1000000 - 5886, 6444, 6923, // 5 1000000 - 5885, 6254, 6594, // 6 1000000 - 5889, 6134, 6326, // 7 1000000 - 5900, 6072, 6203, // 8 1000000 - 5875, 6005, 6089, // 9 1000000 - 5871, 5980, 6040, // 10 1000000 - 5889, 5941, 6015, // 11 1000000 - 5871, 5926, 5973, // 12 1046369 - 5866, 5901, 5915, // 13 1043411 - 5880, 5914, 5953, // 14 1000297 -]; - -pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { - if num_coupons == 0 { - return 0.0; - } - - let k = (1 << lg_k) as f64; - let kappa = kappa.as_u8(); - - let mut x = ICON_ERROR_CONSTANT; - if lg_k <= 14 { - let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; - x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0; - } - let rel = x / k.sqrt(); - let eps = (kappa as f64) * rel; - let est = icon_estimate(lg_k, num_coupons); - let result = est / (1.0 + eps); - if result < (num_coupons as f64) { - num_coupons as f64 - } else { - result - } -} - -pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { - if num_coupons == 0 { - return 0.0; - } - - let k = (1 << lg_k) as f64; - let kappa = kappa.as_u8(); - - let mut x = ICON_ERROR_CONSTANT; - if lg_k <= 14 { - let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; - x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0; - } - let rel = x / k.sqrt(); - let eps = (kappa as f64) * rel; - let est = icon_estimate(lg_k, num_coupons); - let result = est / (1.0 - eps); - result.ceil() // slight widening of interval to be conservative -} - -// merge_flag must already be checked as false -pub(super) fn hip_confidence_lb( - lg_k: u8, - num_coupons: u32, - hip_estimate: f64, - kappa: NumStdDev, -) -> f64 { - if num_coupons == 0 { - return 0.0; - } - - let k = (1 << lg_k) as f64; - let kappa = kappa.as_u8(); - - let mut x = HIP_ERROR_CONSTANT; - if lg_k <= 14 { - let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; - x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0; - } - let rel = x / k.sqrt(); - let eps = (kappa as f64) * rel; - let result = hip_estimate / (1.0 + eps); - if result < (num_coupons as f64) { - num_coupons as f64 - } else { - result - } -} - -// merge_flag must already be checked as false -pub(super) fn get_hip_confidence_ub( - lg_k: u8, - num_coupons: u32, - hip_estimate: f64, - kappa: NumStdDev, -) -> f64 { - if num_coupons == 0 { - return 0.0; - } - - let k = (1 << lg_k) as f64; - let kappa = kappa.as_u8(); - - let mut x = HIP_ERROR_CONSTANT; - if lg_k <= 14 { - let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; - x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0; - } - let rel = x / k.sqrt(); - let eps = (kappa as f64) * rel; - let result = hip_estimate / (1.0 - eps); - result.ceil() // widening for coverage -} diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/estimator.rs similarity index 68% rename from datasketches/src/cpc/icon_estimator.rs rename to datasketches/src/cpc/estimator.rs index c817aa4..9b8fb9f 100644 --- a/datasketches/src/cpc/icon_estimator.rs +++ b/datasketches/src/cpc/estimator.rs @@ -15,20 +15,218 @@ // specific language governing permissions and limitations // under the License. -//! The ICON estimator for FM85 sketches is defined by the arXiv paper. -//! -//! The current file provides exact and approximate implementations of this estimator. The exact -//! version works for any value of K, but is quite slow. -//! -//! The much faster approximate version works for K values that are powers of two ranging from 2^4 -//! to 2^32. -//! -//! At a high-level, this approximation can be described as using an exponential approximation when -//! C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial -//! approximation of a pre-conditioned version of the true ICON mapping from C to N_hat. -//! -//! This file also provides a validation procedure that compares its approximate and exact -//! implementations of the FM85 ICON estimator. +use std::f64::consts::LN_2; + +use crate::common::NumStdDev; + +#[derive(Debug, Clone)] +pub(super) enum Estimator { + /// Default to HIP estimator. + /// + /// HIP estimator is fast and provides better accuracy. + Hip { kxp: f64, hip_estimate: f64 }, + /// ICON estimator. + /// + /// Used after merges. + Icon, +} + +impl Estimator { + pub fn estimate(&self, lg_k: u8, num_coupons: u32) -> f64 { + match self { + Estimator::Hip { hip_estimate, .. } => *hip_estimate, + Estimator::Icon => icon_estimate(lg_k, num_coupons), + } + } + + pub fn lower_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + match self { + Estimator::Hip { hip_estimate, .. } => { + hip_confidence_lb(lg_k, num_coupons, *hip_estimate, kappa) + } + Estimator::Icon => icon_confidence_lb(lg_k, num_coupons, kappa), + } + } + + pub fn upper_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + match self { + Estimator::Hip { hip_estimate, .. } => { + hip_confidence_ub(lg_k, num_coupons, *hip_estimate, kappa) + } + Estimator::Icon => icon_confidence_ub(lg_k, num_coupons, kappa), + } + } +} + +const ICON_ERROR_CONSTANT: f64 = LN_2; + +const ICON_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 6037, 5720, 5328, // 4 1000000 + 6411, 6262, 5682, // 5 1000000 + 6724, 6403, 6127, // 6 1000000 + 6665, 6411, 6208, // 7 1000000 + 6959, 6525, 6427, // 8 1000000 + 6892, 6665, 6619, // 9 1000000 + 6792, 6752, 6690, // 10 1000000 + 6899, 6818, 6708, // 11 1000000 + 6871, 6845, 6812, // 12 1046369 + 6909, 6861, 6828, // 13 1043411 + 6919, 6897, 6842, // 14 1000297 +]; + +const ICON_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 8031, 8559, 9309, // 4 1000000 + 7084, 7959, 8660, // 5 1000000 + 7141, 7514, 7876, // 6 1000000 + 7458, 7430, 7572, // 7 1000000 + 6892, 7141, 7497, // 8 1000000 + 6889, 7132, 7290, // 9 1000000 + 7075, 7118, 7185, // 10 1000000 + 7040, 7047, 7085, // 11 1000000 + 6993, 7019, 7053, // 12 1046369 + 6953, 7001, 6983, // 13 1043411 + 6944, 6966, 7004, // 14 1000297 +]; + +#[allow(clippy::excessive_precision)] +const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt() + +const HIP_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5871, 5247, 4826, // 4 1000000 + 5877, 5403, 5070, // 5 1000000 + 5873, 5533, 5304, // 6 1000000 + 5878, 5632, 5464, // 7 1000000 + 5874, 5690, 5564, // 8 1000000 + 5880, 5745, 5619, // 9 1000000 + 5875, 5784, 5701, // 10 1000000 + 5866, 5789, 5742, // 11 1000000 + 5869, 5827, 5784, // 12 1046369 + 5876, 5860, 5827, // 13 1043411 + 5881, 5853, 5842, // 14 1000297 +]; + +const HIP_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5855, 6688, 7391, // 4 1000000 + 5886, 6444, 6923, // 5 1000000 + 5885, 6254, 6594, // 6 1000000 + 5889, 6134, 6326, // 7 1000000 + 5900, 6072, 6203, // 8 1000000 + 5875, 6005, 6089, // 9 1000000 + 5871, 5980, 6040, // 10 1000000 + 5889, 5941, 6015, // 11 1000000 + 5871, 5926, 5973, // 12 1046369 + 5866, 5901, 5915, // 13 1043411 + 5880, 5914, 5953, // 14 1000297 +]; + +fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 - eps); + result.ceil() // slight widening of interval to be conservative +} + +fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_estimate / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_estimate / (1.0 - eps); + result.ceil() // widening for coverage +} + +// The ICON estimator for FM85 sketches is defined by the arXiv paper. +// +// The current file provides exact and approximate implementations of this estimator. The exact +// version works for any value of K, but is quite slow. +// +// The much faster approximate version works for K values that are powers of two ranging from 2^4 +// to 2^32. +// +// At a high-level, this approximation can be described as using an exponential approximation when +// C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial +// approximation of a pre-conditioned version of the true ICON mapping from C to N_hat. +// +// This file also provides a validation procedure that compares its approximate and exact +// implementations of the FM85 ICON estimator. const ICON_MIN_LOG_K: usize = 4; const ICON_MAX_LOG_K: usize = 26; @@ -193,7 +391,7 @@ fn icon_exponential_approximation(k: f64, c: f64) -> f64 { 0.7940236163830469 * k * 2f64.powf(c / k) } -pub(super) fn icon_estimate(lg_k: u8, c: u32) -> f64 { +fn icon_estimate(lg_k: u8, c: u32) -> f64 { let lg_k = lg_k as usize; assert!( (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k), diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index ad0c69f..7d0c626 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -17,7 +17,9 @@ //! Compressed Probabilistic Counting sketch family. -mod cpc_confidence; -mod icon_estimator; +mod estimator; mod kxp_byte_lookup; mod pair_table; +mod sketch; + +pub use self::sketch::CpcSketch; diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index 8701ac1..d4c135b 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -24,6 +24,7 @@ const DOWNSIZE_DENOMINATOR: u32 = 4; /// /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is /// optimized for scenarios where the cardinality of entries is low. +#[derive(Debug, Clone)] pub(super) struct PairTable { /// log2 of number of slots lg_size: u8, diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs new file mode 100644 index 0000000..e6a1dfe --- /dev/null +++ b/datasketches/src/cpc/sketch.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::common::NumStdDev; +use crate::cpc::estimator::Estimator; +use crate::cpc::pair_table::PairTable; +use crate::hash::DEFAULT_UPDATE_SEED; + +/// Default log2 of K. +const DEFAULT_LG_K: u8 = 11; +/// Min log2 of K. +const MIN_LG_K: usize = 4; +/// Max log2 of K. +const MAX_LG_K: usize = 26; + +/// A Compressed Probabilistic Counting sketch. +#[derive(Debug, Clone)] +pub struct CpcSketch { + // immutable config variables + lg_k: u8, + seed: u64, + + // sketch state + /// The number of coupons collected so far. + num_coupons: u32, + /// This is part of a speed optimization. + first_interesting_column: u8, + /// Physical storage for the sketch data. + storage: PhysicalStorage, + /// The current estimator type and associated data. + estimator: Estimator, +} + +impl Default for CpcSketch { + fn default() -> Self { + Self::new(DEFAULT_LG_K) + } +} + +impl CpcSketch { + /// Creates a new `CpcSketch` with the given `lg_k` and default seed. + pub fn new(lg_k: u8) -> Self { + Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) + } + + /// Creates a new `CpcSketch` with the given `lg_k` and `seed`. + pub fn with_seed(lg_k: u8, seed: u64) -> Self { + assert!( + (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)), + "lg_k out of range; got {lg_k}", + ); + + Self { + lg_k, + seed, + num_coupons: 0, + first_interesting_column: 0, + storage: PhysicalStorage::Empty, + estimator: Estimator::Hip { + kxp: (1 << lg_k) as f64, + hip_estimate: 0.0, + }, + } + } + + /// Return the parameter lg_k. + pub fn lg_k(&self) -> u8 { + self.lg_k + } + + /// Returns the best estimate of the cardinality of the sketch. + pub fn estimate(&self) -> f64 { + let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); + self.estimator.estimate(lg_k, num_coupons) + } + + /// Returns the best estimate of the lower bound of the confidence interval given `kappa`. + pub fn lower_bound(&self, kappa: NumStdDev) -> f64 { + let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); + self.estimator.lower_bound(lg_k, num_coupons, kappa) + } + + /// Returns the best estimate of the upper bound of the confidence interval given `kappa`. + pub fn upper_bound(&self, kappa: NumStdDev) -> f64 { + let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); + self.estimator.upper_bound(lg_k, num_coupons, kappa) + } +} + +#[derive(Debug, Clone)] +enum PhysicalStorage { + /// Empty storage state for EMPTY state. + Empty, + /// Sparse storage state for SPARSE state. + Sparse { surprising_value_table: PairTable }, + /// Dense storage state for HYBRID/PINNED/SLIDING state. + Dense { + window_offset: u8, + sliding_window: Vec, + }, +} From 315f4e89d85716da5c9d19a9135446305fa6d17d Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 17:07:39 +0800 Subject: [PATCH 07/21] max_serialized_bytes Signed-off-by: tison --- datasketches/src/cpc/sketch.rs | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index e6a1dfe..31c8d40 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -113,3 +113,51 @@ enum PhysicalStorage { sliding_window: Vec, }, } + +impl CpcSketch { + /// Returns the estimated maximum compressed serialized size of a sketch. + /// + /// The actual size of a compressed CPC sketch has a small random variance, but the following + /// empirically measured size should be large enough for at least 99.9 percent of sketches. + /// + /// For small values of `n` the size can be much smaller. + pub fn max_serialized_bytes(lg_k: u8) -> usize { + let lg_k = lg_k as usize; + assert!( + (MIN_LG_K..=MAX_LG_K).contains(&lg_k), + "lg_k out of range; got {lg_k}", + ); + + // These empirical values for the 99.9th percentile of size in bytes were measured using 100,000 + // trials. The value for each trial is the maximum of 5*16=80 measurements that were equally + // spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the + // worst-case space for the preamble, which is added by the function. + const CPC_EMPIRICAL_SIZE_MAX_LGK: usize = 19; + const CPC_EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [ + 24, // lg_k = 4 + 36, // lg_k = 5 + 56, // lg_k = 6 + 100, // lg_k = 7 + 180, // lg_k = 8 + 344, // lg_k = 9 + 660, // lg_k = 10 + 1292, // lg_k = 11 + 2540, // lg_k = 12 + 5020, // lg_k = 13 + 9968, // lg_k = 14 + 19836, // lg_k = 15 + 39532, // lg_k = 16 + 78880, // lg_k = 17 + 157516, // lg_k = 18 + 314656, // lg_k = 19 + ]; + const CPC_EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0 + const CPC_MAX_PREAMBLE_SIZE_BYTES: usize = 40; + + if lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK { + return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES; + } + let k = 1usize << lg_k; + ((CPC_EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + CPC_MAX_PREAMBLE_SIZE_BYTES + } +} From 0234127a857e875d747d4d67fb68dea8cfde283f Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 17:26:10 +0800 Subject: [PATCH 08/21] update structure Signed-off-by: tison --- datasketches/src/common/mod.rs | 13 +++++ datasketches/src/cpc/sketch.rs | 85 ++++++++++++++++++++++++++------ datasketches/src/theta/sketch.rs | 22 +++------ 3 files changed, 90 insertions(+), 30 deletions(-) diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs index 36b734f..4a1db92 100644 --- a/datasketches/src/common/mod.rs +++ b/datasketches/src/common/mod.rs @@ -26,3 +26,16 @@ pub use self::resize::ResizeFactor; // private to datasketches crate pub(crate) mod binomial_bounds; pub(crate) mod inv_pow2_table; + +/// Canonicalize double value for compatibility with Java +pub(crate) fn canonical_double(value: f64) -> i64 { + if value.is_nan() { + // Java's Double.doubleToLongBits() NaN value + 0x7ff8000000000000i64 + } else { + // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode, + // which Rust guarantees. Thus, by adding a positive zero we + // canonicalize signed zero without any branches in one instruction. + (value + 0.0).to_bits() as i64 + } +} diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 31c8d40..b6d9ee7 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -15,10 +15,14 @@ // specific language governing permissions and limitations // under the License. +use std::hash::Hash; + use crate::common::NumStdDev; +use crate::common::canonical_double; use crate::cpc::estimator::Estimator; use crate::cpc::pair_table::PairTable; use crate::hash::DEFAULT_UPDATE_SEED; +use crate::hash::MurmurHash3X64128; /// Default log2 of K. const DEFAULT_LG_K: u8 = 11; @@ -40,7 +44,7 @@ pub struct CpcSketch { /// This is part of a speed optimization. first_interesting_column: u8, /// Physical storage for the sketch data. - storage: PhysicalStorage, + state: State, /// The current estimator type and associated data. estimator: Estimator, } @@ -69,7 +73,7 @@ impl CpcSketch { seed, num_coupons: 0, first_interesting_column: 0, - storage: PhysicalStorage::Empty, + state: State::Empty, estimator: Estimator::Hip { kxp: (1 << lg_k) as f64, hip_estimate: 0.0, @@ -99,10 +103,62 @@ impl CpcSketch { let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); self.estimator.upper_bound(lg_k, num_coupons, kappa) } + + /// Returns true if the sketch is empty. + pub fn is_empty(&self) -> bool { + self.num_coupons == 0 + } + + /// Update the sketch with a hashable value. + /// + /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead. + pub fn update(&mut self, value: T) { + let mut hasher = MurmurHash3X64128::with_seed(self.seed); + value.hash(&mut hasher); + let (h1, h2) = hasher.finish128(); + + let k = 1 << self.lg_k; + let col = h2.leading_zeros(); // 0 <= col <= 64 + let col = if col > 63 { 63 } else { col as u8 }; // clip so that 0 <= col <= 63 + let row = (h1 & (k - 1)) as u32; + let mut row_col = (row << 6) | (col as u32); + // To avoid the hash table's "empty" value, we change the row of the following pair. + // This case is extremely unlikely, but we might as well handle it. + if row_col == u32::MAX { + row_col ^= 1 << 6; + } + self.row_col_update(row_col); + } + + /// Update the sketch with a f64 value. + pub fn update_f64(&mut self, value: f64) { + // Canonicalize double for compatibility with Java + let canonical = canonical_double(value); + self.update(canonical); + } + + /// Update the sketch with a f32 value. + pub fn update_f32(&mut self, value: f32) { + self.update_f64(value as f64); + } + + fn row_col_update(&mut self, row_col: u32) { + let col = (row_col & 63) as u8; + if col < self.first_interesting_column { + // important speed optimization + return; + } + // // window size is 0 until sketch is promoted from sparse to windowed + // if (sliding_window.size() == 0) { + // update_sparse(row_col); + // } else { + // update_windowed(row_col); + // } + } } #[derive(Debug, Clone)] -enum PhysicalStorage { +enum State { /// Empty storage state for EMPTY state. Empty, /// Sparse storage state for SPARSE state. @@ -128,12 +184,13 @@ impl CpcSketch { "lg_k out of range; got {lg_k}", ); - // These empirical values for the 99.9th percentile of size in bytes were measured using 100,000 - // trials. The value for each trial is the maximum of 5*16=80 measurements that were equally - // spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the - // worst-case space for the preamble, which is added by the function. - const CPC_EMPIRICAL_SIZE_MAX_LGK: usize = 19; - const CPC_EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [ + // These empirical values for the 99.9th percentile of size in bytes were measured using + // 100,000 trials. The value for each trial is the maximum of 5*16=80 measurements + // that were equally spaced over values of the quantity C/K between 3.0 and 8.0. + // This table does not include the worst-case space for the preamble, which is added + // by the function. + const EMPIRICAL_SIZE_MAX_LGK: usize = 19; + const EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [ 24, // lg_k = 4 36, // lg_k = 5 56, // lg_k = 6 @@ -151,13 +208,13 @@ impl CpcSketch { 157516, // lg_k = 18 314656, // lg_k = 19 ]; - const CPC_EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0 - const CPC_MAX_PREAMBLE_SIZE_BYTES: usize = 40; + const EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0 + const MAX_PREAMBLE_SIZE_BYTES: usize = 40; - if lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK { - return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES; + if lg_k <= EMPIRICAL_SIZE_MAX_LGK { + return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + MAX_PREAMBLE_SIZE_BYTES; } let k = 1usize << lg_k; - ((CPC_EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + CPC_MAX_PREAMBLE_SIZE_BYTES + ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES } } diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs index 506b2b6..07110d4 100644 --- a/datasketches/src/theta/sketch.rs +++ b/datasketches/src/theta/sketch.rs @@ -25,6 +25,7 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::ResizeFactor; use crate::common::binomial_bounds; +use crate::common::canonical_double; use crate::hash::DEFAULT_UPDATE_SEED; use crate::theta::hash_table::DEFAULT_LG_K; use crate::theta::hash_table::MAX_LG_K; @@ -52,7 +53,9 @@ impl ThetaSketch { ThetaSketchBuilder::default() } - /// Update the sketch with a hashable value + /// Update the sketch with a hashable value. + /// + /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead. /// /// # Examples /// @@ -69,7 +72,7 @@ impl ThetaSketch { } } - /// Update the sketch with a f64 value + /// Update the sketch with a f64 value. /// /// # Examples /// @@ -85,7 +88,7 @@ impl ThetaSketch { self.update(canonical); } - /// Update the sketch with a f32 value + /// Update the sketch with a f32 value. /// /// # Examples /// @@ -356,16 +359,3 @@ impl ThetaSketchBuilder { ThetaSketch { table } } } - -/// Canonicalize double value for compatibility with Java -fn canonical_double(value: f64) -> i64 { - if value.is_nan() { - // Java's Double.doubleToLongBits() NaN value - 0x7ff8000000000000i64 - } else { - // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode, - // which Rust guarantees. Thus, by adding a positive zero we - // canonicalize signed zero without any branches in one instruction. - (value + 0.0).to_bits() as i64 - } -} From 3e09c9c2c5c82ddf024adc9a9f211c07af1ba61d Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 18:40:50 +0800 Subject: [PATCH 09/21] impl update Signed-off-by: tison --- datasketches/src/cpc/estimator.rs | 70 ++++-------- datasketches/src/cpc/sketch.rs | 172 +++++++++++++++++++++++------- 2 files changed, 155 insertions(+), 87 deletions(-) diff --git a/datasketches/src/cpc/estimator.rs b/datasketches/src/cpc/estimator.rs index 9b8fb9f..479e2a3 100644 --- a/datasketches/src/cpc/estimator.rs +++ b/datasketches/src/cpc/estimator.rs @@ -15,48 +15,8 @@ // specific language governing permissions and limitations // under the License. -use std::f64::consts::LN_2; - use crate::common::NumStdDev; - -#[derive(Debug, Clone)] -pub(super) enum Estimator { - /// Default to HIP estimator. - /// - /// HIP estimator is fast and provides better accuracy. - Hip { kxp: f64, hip_estimate: f64 }, - /// ICON estimator. - /// - /// Used after merges. - Icon, -} - -impl Estimator { - pub fn estimate(&self, lg_k: u8, num_coupons: u32) -> f64 { - match self { - Estimator::Hip { hip_estimate, .. } => *hip_estimate, - Estimator::Icon => icon_estimate(lg_k, num_coupons), - } - } - - pub fn lower_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { - match self { - Estimator::Hip { hip_estimate, .. } => { - hip_confidence_lb(lg_k, num_coupons, *hip_estimate, kappa) - } - Estimator::Icon => icon_confidence_lb(lg_k, num_coupons, kappa), - } - } - - pub fn upper_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { - match self { - Estimator::Hip { hip_estimate, .. } => { - hip_confidence_ub(lg_k, num_coupons, *hip_estimate, kappa) - } - Estimator::Icon => icon_confidence_ub(lg_k, num_coupons, kappa), - } - } -} +use std::f64::consts::LN_2; const ICON_ERROR_CONSTANT: f64 = LN_2; @@ -127,7 +87,7 @@ const HIP_HIGH_SIDE_DATA: [u16; 33] = [ 5880, 5914, 5953, // 14 1000297 ]; -fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { +pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { if num_coupons == 0 { return 0.0; } @@ -151,7 +111,7 @@ fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { } } -fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { +pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { if num_coupons == 0 { return 0.0; } @@ -171,7 +131,12 @@ fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { result.ceil() // slight widening of interval to be conservative } -fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 { +pub(super) fn hip_confidence_lb( + lg_k: u8, + num_coupons: u32, + hip_est_accum: f64, + kappa: NumStdDev, +) -> f64 { if num_coupons == 0 { return 0.0; } @@ -186,7 +151,7 @@ fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt } let rel = x / k.sqrt(); let eps = (kappa as f64) * rel; - let result = hip_estimate / (1.0 + eps); + let result = hip_est_accum / (1.0 + eps); if result < (num_coupons as f64) { num_coupons as f64 } else { @@ -194,7 +159,12 @@ fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt } } -fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 { +pub(super) fn hip_confidence_ub( + lg_k: u8, + num_coupons: u32, + hip_est_accum: f64, + kappa: NumStdDev, +) -> f64 { if num_coupons == 0 { return 0.0; } @@ -209,7 +179,7 @@ fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt } let rel = x / k.sqrt(); let eps = (kappa as f64) * rel; - let result = hip_estimate / (1.0 - eps); + let result = hip_est_accum / (1.0 - eps); result.ceil() // widening for coverage } @@ -391,21 +361,21 @@ fn icon_exponential_approximation(k: f64, c: f64) -> f64 { 0.7940236163830469 * k * 2f64.powf(c / k) } -fn icon_estimate(lg_k: u8, c: u32) -> f64 { +pub(super) fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 { let lg_k = lg_k as usize; assert!( (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k), "lg_k out of range; got {lg_k}", ); - match c { + match num_coupons { 0 => return 0.0, 1 => return 1.0, _ => {} } let k = (1 << lg_k) as f64; - let c = c as f64; + let c = num_coupons as f64; // Differing thresholds ensure that the approximated estimator is monotonically increasing. let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 }; diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index b6d9ee7..7dbfe27 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -19,7 +19,10 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::canonical_double; -use crate::cpc::estimator::Estimator; +use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; +use crate::cpc::estimator::{ + hip_confidence_lb, hip_confidence_ub, icon_confidence_lb, icon_confidence_ub, icon_estimate, +}; use crate::cpc::pair_table::PairTable; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; @@ -39,14 +42,28 @@ pub struct CpcSketch { seed: u64, // sketch state + /// Part of a speed optimization. + first_interesting_column: u8, /// The number of coupons collected so far. num_coupons: u32, - /// This is part of a speed optimization. - first_interesting_column: u8, - /// Physical storage for the sketch data. - state: State, - /// The current estimator type and associated data. - estimator: Estimator, + /// Surprising values table in sparse mode. + surprising_value_table: Option, + /// Derivable from num_coupons, but made explicit for speed. + window_offset: u8, + /// Size K bytes in dense mode. + sliding_window: Vec, + + // estimator state + /// Whether the sketch is a result of merging. + /// + /// If `false`, the HIP (Historical Inverse Probability) estimator is used. + /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in use. + merge_flag: bool, + // the following variables are only valid in HIP estimator + /// A pre-calculated probability factor (`k * p`) used to compute the increment delta. + kxp: f64, + /// The accumulated cardinality estimate. + hip_est_accum: f64, } impl Default for CpcSketch { @@ -71,13 +88,14 @@ impl CpcSketch { Self { lg_k, seed, - num_coupons: 0, first_interesting_column: 0, - state: State::Empty, - estimator: Estimator::Hip { - kxp: (1 << lg_k) as f64, - hip_estimate: 0.0, - }, + num_coupons: 0, + surprising_value_table: None, + window_offset: 0, + sliding_window: vec![], + merge_flag: false, + kxp: (1 << lg_k) as f64, + hip_est_accum: 0.0, } } @@ -88,20 +106,29 @@ impl CpcSketch { /// Returns the best estimate of the cardinality of the sketch. pub fn estimate(&self) -> f64 { - let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); - self.estimator.estimate(lg_k, num_coupons) + if !self.merge_flag { + self.hip_est_accum + } else { + icon_estimate(self.lg_k, self.num_coupons) + } } /// Returns the best estimate of the lower bound of the confidence interval given `kappa`. pub fn lower_bound(&self, kappa: NumStdDev) -> f64 { - let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); - self.estimator.lower_bound(lg_k, num_coupons, kappa) + if !self.merge_flag { + hip_confidence_lb(self.lg_k, self.num_coupons, self.hip_est_accum, kappa) + } else { + icon_confidence_lb(self.lg_k, self.num_coupons, kappa) + } } /// Returns the best estimate of the upper bound of the confidence interval given `kappa`. pub fn upper_bound(&self, kappa: NumStdDev) -> f64 { - let (lg_k, num_coupons) = (self.lg_k, self.num_coupons); - self.estimator.upper_bound(lg_k, num_coupons, kappa) + if !self.merge_flag { + hip_confidence_ub(self.lg_k, self.num_coupons, self.hip_est_accum, kappa) + } else { + icon_confidence_ub(self.lg_k, self.num_coupons, kappa) + } } /// Returns true if the sketch is empty. @@ -148,26 +175,96 @@ impl CpcSketch { // important speed optimization return; } - // // window size is 0 until sketch is promoted from sparse to windowed - // if (sliding_window.size() == 0) { - // update_sparse(row_col); - // } else { - // update_windowed(row_col); - // } + + if self.num_coupons == 0 { + // promote EMPTY to SPARSE + self.surprising_value_table = Some(PairTable::new(2, 6 + self.lg_k)); + } + + if self.sliding_window.is_empty() { + self.update_sparse(row_col); + } else { + self.update_windowed(row_col); + } } -} -#[derive(Debug, Clone)] -enum State { - /// Empty storage state for EMPTY state. - Empty, - /// Sparse storage state for SPARSE state. - Sparse { surprising_value_table: PairTable }, - /// Dense storage state for HYBRID/PINNED/SLIDING state. - Dense { - window_offset: u8, - sliding_window: Vec, - }, + fn mut_surprising_value_table(&mut self) -> &mut PairTable { + self.surprising_value_table + .as_mut() + .expect("surprising value table is not initialized") + } + + fn update_hip(&mut self, row_col: u32) { + let k = 1 << self.lg_k; + let col = (row_col & 63) as usize; + let one_over_p = (k as f64) / self.kxp; + self.hip_est_accum += one_over_p; + self.kxp -= INVERSE_POWERS_OF_2[col + 1] // notice the "+1" + } + + fn update_sparse(&mut self, row_col: u32) { + let k = 1 << self.lg_k; + let c32pre = (self.num_coupons as u64) << 5; + assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + if is_novel { + self.num_coupons += 1; + self.update_hip(row_col); + let c32post = (self.num_coupons as u64) << 5; + if c32post >= 3 * k { + self.promote_sparse_to_windowed(); + } + } + } + + fn promote_sparse_to_windowed(&mut self) { + todo!() + } + + fn update_windowed(&mut self, row_col: u32) { + assert!(self.window_offset <= 56); + let k = 1 << self.lg_k; + let c32pre = (self.num_coupons as u64) << 5; + assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID + let c8pre = (self.num_coupons as u64) << 3; + let w8pre = (self.window_offset as u64) << 3; + assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset) + + let mut is_novel = false; // novel if new coupon; + let col = (row_col & 63) as u8; + if col < self.window_offset { + // track the surprising 0's "before" the window + is_novel = self.mut_surprising_value_table().maybe_delete(row_col); // inverted logic + } else if col < self.window_offset + 8 { + // track the 8 bits inside the window + let row = (row_col >> 6) as usize; + let old_bits = self.sliding_window[row]; + let new_bits = old_bits | (1 << (col - self.window_offset)); + if old_bits != new_bits { + self.sliding_window[row] = new_bits; + is_novel = true; + } + } else { + // track the surprising 1's "after" the window + is_novel = self.mut_surprising_value_table().maybe_insert(row_col); // normal logic + } + + if is_novel { + self.num_coupons += 1; + self.update_hip(row_col); + let c8post = (self.num_coupons as u64) << 3; + if c8post >= (27 + w8pre) * k { + self.move_window(); + assert!((1..=56).contains(&self.window_offset)); + let w8post = (self.window_offset as u64) << 3; + assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset) + } + } + } + + fn move_window(&mut self) { + todo!() + } } impl CpcSketch { @@ -218,3 +315,4 @@ impl CpcSketch { ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES } } + From f914fb8acdd16a5bf65ff9d9ecdeb5e6108862c8 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 19:24:38 +0800 Subject: [PATCH 10/21] promote_sparse_to_windowed Signed-off-by: tison --- datasketches/src/cpc/estimator.rs | 3 ++- datasketches/src/cpc/sketch.rs | 38 ++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/datasketches/src/cpc/estimator.rs b/datasketches/src/cpc/estimator.rs index 479e2a3..6d37e6e 100644 --- a/datasketches/src/cpc/estimator.rs +++ b/datasketches/src/cpc/estimator.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::common::NumStdDev; use std::f64::consts::LN_2; +use crate::common::NumStdDev; + const ICON_ERROR_CONSTANT: f64 = LN_2; const ICON_LOW_SIDE_DATA: [u16; 33] = [ diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 7dbfe27..6cbe0b9 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -20,9 +20,11 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::canonical_double; use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; -use crate::cpc::estimator::{ - hip_confidence_lb, hip_confidence_ub, icon_confidence_lb, icon_confidence_ub, icon_estimate, -}; +use crate::cpc::estimator::hip_confidence_lb; +use crate::cpc::estimator::hip_confidence_ub; +use crate::cpc::estimator::icon_confidence_lb; +use crate::cpc::estimator::icon_confidence_ub; +use crate::cpc::estimator::icon_estimate; use crate::cpc::pair_table::PairTable; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; @@ -191,7 +193,7 @@ impl CpcSketch { fn mut_surprising_value_table(&mut self) -> &mut PairTable { self.surprising_value_table .as_mut() - .expect("surprising value table is not initialized") + .expect("surprising value table must be initialized") } fn update_hip(&mut self, row_col: u32) { @@ -218,7 +220,32 @@ impl CpcSketch { } fn promote_sparse_to_windowed(&mut self) { - todo!() + assert_eq!(self.window_offset, 0); + + let k = 1 << self.lg_k; + let c32 = (self.num_coupons as u64) << 5; + assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k)))); + + self.sliding_window.resize(k as usize, 0); + + let old_table = self + .surprising_value_table + .replace(PairTable::new(2, 6 + self.lg_k)) + .expect("surprising value table must be initialized"); + let old_slots = old_table.slots(); + for &row_col in old_slots { + if row_col != u32::MAX { + let col = (row_col & 63) as u8; + if col < 8 { + let row = (row_col >> 6) as usize; + self.sliding_window[row] |= 1 << col; + } else { + // cannot use must_insert(), because it doesn't provide for growth + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + assert!(is_novel); + } + } + } } fn update_windowed(&mut self, row_col: u32) { @@ -315,4 +342,3 @@ impl CpcSketch { ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES } } - From 822cc6b108e9876417d3e7763be8a6c13514f322 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 19:50:53 +0800 Subject: [PATCH 11/21] move_window Signed-off-by: tison --- datasketches/src/cpc/pair_table.rs | 4 -- datasketches/src/cpc/sketch.rs | 94 +++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index d4c135b..2af1859 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -71,10 +71,6 @@ impl PairTable { table } - pub fn lg_size(&self) -> u8 { - self.lg_size - } - pub fn num_items(&self) -> u32 { self.num_items } diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 6cbe0b9..f7fc6eb 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -190,6 +190,12 @@ impl CpcSketch { } } + fn surprising_value_table(&self) -> &PairTable { + self.surprising_value_table + .as_ref() + .expect("surprising value table must be initialized") + } + fn mut_surprising_value_table(&mut self) -> &mut PairTable { self.surprising_value_table .as_mut() @@ -290,7 +296,83 @@ impl CpcSketch { } fn move_window(&mut self) { - todo!() + let new_offset = self.window_offset + 1; + assert!((0..=56).contains(&new_offset)); + assert_eq!( + new_offset, + determine_correct_offset(self.lg_k, self.num_coupons) + ); + + // Construct the full-sized bit matrix that corresponds to the sketch + let bit_matrix = self.build_bit_matrix(); + + // refresh the KXP register on every 8th window shift. + // if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data()); + + self.mut_surprising_value_table().clear(); // the new number of surprises will be about the same + + let mask_for_clearing_window = (0xFF << new_offset) ^ u64::MAX; + let mask_for_flipping_early_zone = (1u64 << new_offset) - 1; + + let mut all_surprises_ored = 0u64; + for (i, &bit) in bit_matrix.iter().enumerate() { + let mut pattern = bit; + self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8; + pattern &= mask_for_clearing_window; + // The following line converts surprising 0's to 1's in the "early zone", + // (and vice versa, which is essential for this procedure's O(k) time cost). + pattern ^= mask_for_flipping_early_zone; + all_surprises_ored |= pattern; // a cheap way to recalculate first_interesting_column + while pattern != 0 { + let col = pattern.trailing_zeros(); + pattern ^= 1 << col; // erase the 1 + let row_col = ((i as u32) << 6) | col; + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + assert!(is_novel); + } + } + + self.window_offset = new_offset; + self.first_interesting_column = all_surprises_ored.trailing_zeros() as u8; + if self.first_interesting_column > new_offset { + self.first_interesting_column = new_offset; // corner case + } + } + + fn build_bit_matrix(&self) -> Vec { + let k = 1usize << self.lg_k; + let offset = self.window_offset; + assert!((0..=56).contains(&offset)); + + // Fill the matrix with default rows in which the "early zone" is filled with ones. + // This is essential for the routine's O(k) time cost (as opposed to O(C)). + let default_row = (1u64 << offset) - 1; + + let mut matrix = vec![default_row; k]; + if self.num_coupons == 0 { + return matrix; + } + + if !self.sliding_window.is_empty() { + // In other words, we are in window mode, not sparse mode + for (i, bit) in matrix.iter_mut().enumerate() { + // set the window bits, trusting the sketch's current offset + *bit |= (self.sliding_window[i] as u64) << offset; + } + } + + for &row_col in self.surprising_value_table().slots() { + if row_col != u32::MAX { + let col = (row_col & 63) as u8; + let row = (row_col >> 6) as usize; + // Flip the specified matrix bit from its default value. + // In the "early" zone the bit changes from 1 to 0. + // In the "late" zone the bit changes from 0 to 1. + matrix[row] ^= 1 << col; + } + } + + matrix } } @@ -342,3 +424,13 @@ impl CpcSketch { ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES } } + +fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { + let k = 1 << lg_k; + let tmp = ((num_coupons as i64) << 3) - (19 * k as i64); + if tmp < 0 { + 0 + } else { + (tmp >> (lg_k as i64 + 3)) as u8 // tmp / 8K + } +} From 73b514cf24a6d2b695119eb70ecb75b16ccb1571 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 29 Jan 2026 23:56:20 +0800 Subject: [PATCH 12/21] add tests Signed-off-by: tison --- datasketches/src/cpc/mod.rs | 2 + datasketches/tests/cpc_update_test.rs | 62 +++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 datasketches/tests/cpc_update_test.rs diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 7d0c626..33f3c10 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#![allow(dead_code)] + //! Compressed Probabilistic Counting sketch family. mod estimator; diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs new file mode 100644 index 0000000..2d6f28b --- /dev/null +++ b/datasketches/tests/cpc_update_test.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::common::NumStdDev; +use datasketches::cpc::CpcSketch; +use googletest::assert_that; +use googletest::prelude::ge; +use googletest::prelude::le; +use googletest::prelude::near; + +const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02; + +#[test] +fn test_empty() { + let sketch = CpcSketch::new(11); + assert!(sketch.is_empty()); + assert_eq!(sketch.estimate(), 0.0); + assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0); + assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0); +} + +#[test] +fn test_one_value() { + let mut sketch = CpcSketch::new(11); + sketch.update(1); + assert!(!sketch.is_empty()); + assert_eq!(sketch.estimate(), 1.0); + assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); + assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); +} + +#[test] +fn test_many_values() { + const N: usize = 10000; + const N_F64: f64 = N as f64; + + let mut sketch = CpcSketch::new(11); + for i in 0..N { + sketch.update(i); + } + assert!(!sketch.is_empty()); + assert_that!( + sketch.estimate(), + near(N_F64, RELATIVE_ERROR_FOR_LG_K_11 * N_F64) + ); + assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); + assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); +} From 08b7a613590935a53d41c4eec432ac9a3519b2a3 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 00:41:54 +0800 Subject: [PATCH 13/21] refresh_kxp Signed-off-by: tison --- datasketches/src/cpc/kxp_byte_lookup.rs | 2 +- datasketches/src/cpc/sketch.rs | 36 ++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs b/datasketches/src/cpc/kxp_byte_lookup.rs index 96649ea..25fddf6 100644 --- a/datasketches/src/cpc/kxp_byte_lookup.rs +++ b/datasketches/src/cpc/kxp_byte_lookup.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -const KXP_BYTE_TABLE: [f64; 256] = [ +pub(super) const KXP_BYTE_TABLE: [f64; 256] = [ 0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375, 0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375, 0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375, diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index f7fc6eb..caa0503 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -25,6 +25,7 @@ use crate::cpc::estimator::hip_confidence_ub; use crate::cpc::estimator::icon_confidence_lb; use crate::cpc::estimator::icon_confidence_ub; use crate::cpc::estimator::icon_estimate; +use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE; use crate::cpc::pair_table::PairTable; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; @@ -297,7 +298,7 @@ impl CpcSketch { fn move_window(&mut self) { let new_offset = self.window_offset + 1; - assert!((0..=56).contains(&new_offset)); + assert!(new_offset <= 56); assert_eq!( new_offset, determine_correct_offset(self.lg_k, self.num_coupons) @@ -307,7 +308,9 @@ impl CpcSketch { let bit_matrix = self.build_bit_matrix(); // refresh the KXP register on every 8th window shift. - // if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data()); + if (new_offset & 0x7) == 0 { + self.refresh_kxp(&bit_matrix); + } self.mut_surprising_value_table().clear(); // the new number of surprises will be about the same @@ -339,10 +342,37 @@ impl CpcSketch { } } + /// The KXP register is a double with roughly 50 bits of precision, but it might need roughly + /// 90 bits to track the value with perfect accuracy. + /// + /// Therefore, we recalculate KXP occasionally from the sketch's full bit_matrix so that it + /// will reflect changes that were previously outside the mantissa. + fn refresh_kxp(&mut self, bit_matrix: &[u64]) { + // for improved numerical accuracy, we separately sum the bytes of the u64's + let mut byte_sums = [0.0; 8]; + for &bit in bit_matrix { + let mut word = bit; + for sum in byte_sums.iter_mut() { + let byte = (word & 0xFF) as usize; + *sum += KXP_BYTE_TABLE[byte]; + word >>= 8; + } + } + + let mut total = 0.0; + for i in (0..8).rev() { + // the reverse order is important + let factor = INVERSE_POWERS_OF_2[i * 8]; // pow (256.0, (-1.0 * ((double) j))) + total += factor * byte_sums[i]; + } + + self.kxp = total; + } + fn build_bit_matrix(&self) -> Vec { let k = 1usize << self.lg_k; let offset = self.window_offset; - assert!((0..=56).contains(&offset)); + assert!(offset <= 56); // Fill the matrix with default rows in which the "early zone" is filled with ones. // This is essential for the routine's O(k) time cost (as opposed to O(C)). From 5b34a747f89b60b789eb91f095be480156479a63 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 08:22:43 +0800 Subject: [PATCH 14/21] update comments Signed-off-by: tison --- datasketches/src/cpc/sketch.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index caa0503..1b82492 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -49,11 +49,11 @@ pub struct CpcSketch { first_interesting_column: u8, /// The number of coupons collected so far. num_coupons: u32, - /// Surprising values table in sparse mode. + /// Sparse and surprising values. surprising_value_table: Option, /// Derivable from num_coupons, but made explicit for speed. window_offset: u8, - /// Size K bytes in dense mode. + /// Size K bytes in dense mode (flavor >= HYBRID). sliding_window: Vec, // estimator state From a3ccddca1f2e860905c91c3a90ef16d0e66b160f Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 11:52:46 +0800 Subject: [PATCH 15/21] add validate method Signed-off-by: tison --- datasketches/src/cpc/mod.rs | 8 ++++++++ datasketches/src/cpc/sketch.rs | 13 +++++++++++++ datasketches/tests/cpc_update_test.rs | 3 +++ 3 files changed, 24 insertions(+) diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 33f3c10..14e4168 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -25,3 +25,11 @@ mod pair_table; mod sketch; pub use self::sketch::CpcSketch; + +fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 { + let mut count = 0; + for word in matrix { + count += word.count_ones(); + } + count +} diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 1b82492..c5562fd 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -20,6 +20,7 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::canonical_double; use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; +use crate::cpc::count_bits_set_in_matrix; use crate::cpc::estimator::hip_confidence_lb; use crate::cpc::estimator::hip_confidence_ub; use crate::cpc::estimator::icon_confidence_lb; @@ -455,6 +456,18 @@ impl CpcSketch { } } +// testing methods +impl CpcSketch { + /// Validate this sketch is valid. + /// + /// This method is typically used for debugging and testing purposes. + pub fn validate(&self) -> bool { + let bit_matrix = self.build_bit_matrix(); + let num_bits_set = count_bits_set_in_matrix(&bit_matrix); + num_bits_set == self.num_coupons + } +} + fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { let k = 1 << lg_k; let tmp = ((num_coupons as i64) << 3) - (19 * k as i64); diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs index 2d6f28b..fb17573 100644 --- a/datasketches/tests/cpc_update_test.rs +++ b/datasketches/tests/cpc_update_test.rs @@ -31,6 +31,7 @@ fn test_empty() { assert_eq!(sketch.estimate(), 0.0); assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0); assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0); + assert!(sketch.validate()); } #[test] @@ -41,6 +42,7 @@ fn test_one_value() { assert_eq!(sketch.estimate(), 1.0); assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); + assert!(sketch.validate()); } #[test] @@ -59,4 +61,5 @@ fn test_many_values() { ); assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); + assert!(sketch.validate()); } From cfcd0b7797ee576a7a92cbb6cdb6f496e59d0a27 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 17:30:32 +0800 Subject: [PATCH 16/21] impl Union Signed-off-by: tison --- datasketches/src/cpc/mod.rs | 45 +++++ datasketches/src/cpc/sketch.rs | 48 ++--- datasketches/src/cpc/union.rs | 348 +++++++++++++++++++++++++++++++++ 3 files changed, 414 insertions(+), 27 deletions(-) create mode 100644 datasketches/src/cpc/union.rs diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 14e4168..21879c0 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -23,9 +23,26 @@ mod estimator; mod kxp_byte_lookup; mod pair_table; mod sketch; +mod union; pub use self::sketch::CpcSketch; +/// Default log2 of K. +const DEFAULT_LG_K: u8 = 11; +/// Min log2 of K. +const MIN_LG_K: usize = 4; +/// Max log2 of K. +const MAX_LG_K: usize = 26; + +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +enum Flavor { + EMPTY, // 0 == C < 1 + SPARSE, // 1 <= C < 3K/32 + HYBRID, // 3K/32 <= C < K/2 + PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8] + SLIDING, // 27K/8 <= C +} + fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 { let mut count = 0; for word in matrix { @@ -33,3 +50,31 @@ fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 { } count } + +fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor { + let k = 1 << lg_k; + let c2 = num_coupons << 1; + let c8 = num_coupons << 3; + let c32 = num_coupons << 5; + if num_coupons == 0 { + Flavor::EMPTY + } else if c32 < (3 * k) { + Flavor::SPARSE + } else if c2 < k { + Flavor::HYBRID + } else if c8 < (27 * k) { + Flavor::PINNED + } else { + Flavor::SLIDING + } +} + +fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { + let k = 1i64 << lg_k; + let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K + if tmp < 0 { + 0 + } else { + (tmp >> (lg_k + 3)) as u8 // tmp / 8K + } +} diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index c5562fd..832db46 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -20,6 +20,7 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::canonical_double; use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; +use crate::cpc::MIN_LG_K; use crate::cpc::count_bits_set_in_matrix; use crate::cpc::estimator::hip_confidence_lb; use crate::cpc::estimator::hip_confidence_ub; @@ -28,16 +29,11 @@ use crate::cpc::estimator::icon_confidence_ub; use crate::cpc::estimator::icon_estimate; use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE; use crate::cpc::pair_table::PairTable; +use crate::cpc::{DEFAULT_LG_K, determine_correct_offset}; +use crate::cpc::{Flavor, MAX_LG_K, determine_flavor}; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; -/// Default log2 of K. -const DEFAULT_LG_K: u8 = 11; -/// Min log2 of K. -const MIN_LG_K: usize = 4; -/// Max log2 of K. -const MAX_LG_K: usize = 26; - /// A Compressed Probabilistic Counting sketch. #[derive(Debug, Clone)] pub struct CpcSketch { @@ -47,22 +43,22 @@ pub struct CpcSketch { // sketch state /// Part of a speed optimization. - first_interesting_column: u8, + pub(super) first_interesting_column: u8, /// The number of coupons collected so far. - num_coupons: u32, + pub(super) num_coupons: u32, /// Sparse and surprising values. - surprising_value_table: Option, + pub(super) surprising_value_table: Option, /// Derivable from num_coupons, but made explicit for speed. - window_offset: u8, + pub(super) window_offset: u8, /// Size K bytes in dense mode (flavor >= HYBRID). - sliding_window: Vec, + pub(super) sliding_window: Vec, // estimator state /// Whether the sketch is a result of merging. /// /// If `false`, the HIP (Historical Inverse Probability) estimator is used. /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in use. - merge_flag: bool, + pub(super) merge_flag: bool, // the following variables are only valid in HIP estimator /// A pre-calculated probability factor (`k * p`) used to compute the increment delta. kxp: f64, @@ -173,7 +169,11 @@ impl CpcSketch { self.update_f64(value as f64); } - fn row_col_update(&mut self, row_col: u32) { + pub(super) fn flavor(&self) -> Flavor { + determine_flavor(self.lg_k, self.num_coupons) + } + + pub(super) fn row_col_update(&mut self, row_col: u32) { let col = (row_col & 63) as u8; if col < self.first_interesting_column { // important speed optimization @@ -192,13 +192,17 @@ impl CpcSketch { } } - fn surprising_value_table(&self) -> &PairTable { + pub(super) fn seed(&self) -> u64 { + self.seed + } + + pub(super) fn surprising_value_table(&self) -> &PairTable { self.surprising_value_table .as_ref() .expect("surprising value table must be initialized") } - fn mut_surprising_value_table(&mut self) -> &mut PairTable { + pub(super) fn mut_surprising_value_table(&mut self) -> &mut PairTable { self.surprising_value_table .as_mut() .expect("surprising value table must be initialized") @@ -370,7 +374,7 @@ impl CpcSketch { self.kxp = total; } - fn build_bit_matrix(&self) -> Vec { + pub(super) fn build_bit_matrix(&self) -> Vec { let k = 1usize << self.lg_k; let offset = self.window_offset; assert!(offset <= 56); @@ -467,13 +471,3 @@ impl CpcSketch { num_bits_set == self.num_coupons } } - -fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { - let k = 1 << lg_k; - let tmp = ((num_coupons as i64) << 3) - (19 * k as i64); - if tmp < 0 { - 0 - } else { - (tmp >> (lg_k as i64 + 3)) as u8 // tmp / 8K - } -} diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs new file mode 100644 index 0000000..47e2198 --- /dev/null +++ b/datasketches/src/cpc/union.rs @@ -0,0 +1,348 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::cpc::count_bits_set_in_matrix; +use crate::cpc::pair_table::PairTable; +use crate::cpc::{CpcSketch, determine_correct_offset}; +use crate::cpc::{DEFAULT_LG_K, Flavor}; +use crate::hash::DEFAULT_UPDATE_SEED; + +/// The union (merge) operation for the CPC sketches. +#[derive(Debug, Clone)] +pub struct CpcUnion { + // immutable config variables + lg_k: u8, + seed: u64, + + // union state + state: UnionState, +} + +impl Default for CpcUnion { + fn default() -> Self { + Self::new(DEFAULT_LG_K) + } +} + +impl CpcUnion { + /// Creates a new `CpcUnion` with the given `lg_k` and default seed. + pub fn new(lg_k: u8) -> Self { + Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) + } + + /// Creates a new `CpcUnion` with the given `lg_k` and `seed`. + pub fn with_seed(lg_k: u8, seed: u64) -> Self { + // We begin with the accumulator holding an EMPTY_MERGED sketch object. + let sketch = CpcSketch::with_seed(lg_k, seed); + let state = UnionState::Accumulator(sketch); + Self { lg_k, seed, state } + } + + /// Return the parameter lg_k. + /// + /// Note that due to merging with source sketches that may have a lower value of LgK, this value + /// can be less than what the union object was configured with. + pub fn lg_k(&self) -> u8 { + self.lg_k + } + + /// Returns the result of union operations as a CPC sketch. + pub fn get_result(&self) -> CpcSketch { + match &self.state { + UnionState::Accumulator(sketch) => { + if sketch.is_empty() { + CpcSketch::with_seed(self.lg_k, self.seed) + } else { + let mut sketch = sketch.clone(); + assert_eq!(sketch.flavor(), Flavor::SPARSE); + sketch.merge_flag = true; + sketch + } + } + UnionState::BitMatrix(matrix) => { + let lg_k = self.lg_k; + + let mut sketch = CpcSketch::with_seed(lg_k, self.seed); + let num_coupons = count_bits_set_in_matrix(matrix); + sketch.num_coupons = num_coupons; + let offset = determine_correct_offset(lg_k, num_coupons); + sketch.window_offset = offset; + + // Build the window and pair table + let k = 1 << lg_k; + let mut sliding_window = vec![0u8; k]; + + // LgSize = K/16; in some cases this will end up being oversized + let new_table_lg_size = (lg_k - 4).max(2); + let mut table = PairTable::new(new_table_lg_size, 6 + lg_k); + + // the following should work even when the offset is zero + let mask_for_clearing_window = (0xFFu64 << offset) ^ u64::MAX; + let mask_for_flipping_early_zone = (1u64 << offset) - 1; + let mut all_surprises_ored = 0; + + // The snowplow effect was caused by processing the rows in order, + // but we have fixed it by using a sufficiently large hash table. + for i in 0..k { + let mut pattern = matrix[i]; + sliding_window[i] = ((pattern >> offset) & 0xFF) as u8; + pattern &= mask_for_clearing_window; + pattern ^= mask_for_flipping_early_zone; // this flipping converts surprising 0's to 1's + all_surprises_ored |= pattern; + while pattern != 0 { + let col = pattern.trailing_zeros(); + pattern = pattern ^ (1u64 << col); // erase the 1 + let row_col = ((i as u32) << 6) | col; + let is_novel = table.maybe_insert(row_col); + assert!(is_novel); + } + } + + // at this point we could shrink an oversized hash table, but the relative waste isn't very big + sketch.first_interesting_column = all_surprises_ored.trailing_zeros() as u8; + if sketch.first_interesting_column > offset { + sketch.first_interesting_column = offset; // corner case + } + + // HIP-related fields will contain zeros, and that is okay since merge_flag is true + sketch.sliding_window = sliding_window; + sketch.surprising_value_table = Some(table); + sketch.merge_flag = true; + + sketch + } + } + } + + /// Update this union with a CpcSketch. + pub fn update(&mut self, sketch: &CpcSketch) { + assert_eq!(self.seed, sketch.seed()); + + let flavor = sketch.flavor(); + if flavor == Flavor::EMPTY { + return; + } + + if sketch.lg_k() < self.lg_k { + self.reduce_k(sketch.lg_k()); + } + + // if source is past SPARSE mode, make sure that union is a bitMatrix. + if flavor > Flavor::SPARSE { + if let UnionState::Accumulator(old_sketch) = &self.state { + // convert the accumulator to a bit matrix + let bit_matrix = old_sketch.build_bit_matrix(); + self.state = UnionState::BitMatrix(bit_matrix); + } + } + + match &mut self.state { + UnionState::Accumulator(old_sketch) => { + // [Case A] Sparse, bitMatrix == null, accumulator valid + if flavor == Flavor::SPARSE { + let old_flavor = old_sketch.flavor(); + if old_flavor != Flavor::SPARSE && old_flavor != Flavor::EMPTY { + unreachable!( + "unexpected old flavor in union accumulator: {:?}", + old_flavor + ); + } + + // The following partially fixes the snowplow problem provided that the K's + // are equal. + if old_flavor == Flavor::EMPTY && self.lg_k == sketch.lg_k() { + *old_sketch = sketch.clone(); + return; + } + + walk_table_updating_sketch(old_sketch, sketch.surprising_value_table()); + let final_flavor = old_sketch.flavor(); + + // if the accumulator has graduated beyond sparse, switch to a bit matrix + // representation + if final_flavor > Flavor::SPARSE { + let bit_matrix = old_sketch.build_bit_matrix(); + self.state = UnionState::BitMatrix(bit_matrix); + } + + return; + } + + // If flavor is past SPARSE mode, the state must have been converted to bitMatrix. + // Otherwise, the EMPTY case was handled at the start of this method, and the + // SPARSE case was handled above. + unreachable!("unexpected flavor in union accumulator: {:?}", flavor); + } + UnionState::BitMatrix(old_matrix) => { + if flavor == Flavor::SPARSE { + // [Case B] Sparse, bitMatrix valid, accumulator == null + or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table()); + return; + } + + if matches!(flavor, Flavor::HYBRID | Flavor::PINNED) { + // [Case C] Hybrid, bitMatrix valid, accumulator == null + // [Case C] Pinned, bitMatrix valid, accumulator == null + or_window_into_matrix( + old_matrix, + self.lg_k, + &sketch.sliding_window, + sketch.window_offset, + sketch.lg_k(), + ); + or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table()); + return; + } + + // [Case D] Sliding, bitMatrix valid, accumulator == null + // SLIDING mode involves inverted logic, so we cannot just walk the source sketch. + // Instead, we convert it to a bitMatrix that can be ORed into the destination. + assert_eq!(flavor, Flavor::SLIDING); + let src_matrix = sketch.build_bit_matrix(); + or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix, sketch.lg_k()); + return; + } + } + } + + fn reduce_k(&mut self, new_lg_k: u8) { + match &mut self.state { + UnionState::Accumulator(sketch) => { + if sketch.is_empty() { + self.lg_k = new_lg_k; + self.state = UnionState::Accumulator(CpcSketch::with_seed(new_lg_k, self.seed)); + return; + } + + let mut new_sketch = CpcSketch::with_seed(new_lg_k, self.seed); + walk_table_updating_sketch(&mut new_sketch, sketch.surprising_value_table()); + + let final_new_flavor = new_sketch.flavor(); + // SV table had to have something in it + assert_ne!(final_new_flavor, Flavor::EMPTY); + if final_new_flavor == Flavor::SPARSE { + self.lg_k = new_lg_k; + self.state = UnionState::Accumulator(new_sketch); + return; + } + + // the new sketch has graduated beyond sparse, so convert to bitMatrix + self.lg_k = new_lg_k; + self.state = UnionState::BitMatrix(new_sketch.build_bit_matrix()); + } + UnionState::BitMatrix(matrix) => { + let new_k = 1 << new_lg_k; + let mut new_matrix = vec![0; new_k]; + or_matrix_into_matrix(&mut new_matrix, new_lg_k, &matrix, self.lg_k); + self.lg_k = new_lg_k; + self.state = UnionState::BitMatrix(new_matrix); + } + } + } +} + +// testing methods +impl CpcUnion { + /// Returns the number of coupons in the union. + /// + /// This is primarily for testing and validation purposes. + pub fn num_coupons(&self) -> u32 { + match &self.state { + UnionState::Accumulator(sketch) => sketch.num_coupons, + UnionState::BitMatrix(matrix) => count_bits_set_in_matrix(matrix), + } + } +} + +fn or_window_into_matrix( + dst_matrix: &mut [u64], + dst_lg_k: u8, + src_window: &[u8], + src_offset: u8, + src_lg_k: u8, +) { + assert!(dst_lg_k <= src_lg_k); + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let src_k = 1 << src_lg_k; + for src_row in 0..src_k { + dst_matrix[src_row & dst_mask] |= (src_window[src_row] as u64) << src_offset; + } +} + +fn or_table_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_table: &PairTable) { + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let slots = src_table.slots(); + for &row_col in slots.iter() { + if row_col != u32::MAX { + let src_row = (row_col >> 6) as usize; + let src_col = (row_col & 63) as usize; + let dst_row = src_row & dst_mask; + dst_matrix[dst_row] |= 1u64 << src_col; + } + } +} + +fn or_matrix_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_matrix: &[u64], src_lg_k: u8) { + assert!(dst_lg_k <= src_lg_k); + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let src_k = 1 << src_lg_k; + for src_row in 0..src_k { + let dst_row = src_row & dst_mask; + dst_matrix[dst_row] |= src_matrix[src_row]; + } +} + +fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) { + assert!(sketch.lg_k() <= 26); + + let slots = table.slots(); + let num_slots = slots.len() as u32; + + // downsamples when dst lgK < src LgK + let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63; + // Using a golden ratio stride fixes the snowplow effect. + let mut stride = (0.6180339887498949025 * (num_slots as f64)) as u32; + assert!(stride >= 2); + if stride == ((stride >> 1) << 1) { + // force the stride to be odd + stride += 1; + } + assert!((stride >= 3) && (stride < num_slots)); + + let mut k = 0; + for _ in 0..num_slots { + k &= num_slots - 1; + let row_col = slots[k as usize]; + if row_col != u32::MAX { + sketch.row_col_update(row_col & (dst_mask as u32)); + } + k += stride; + } +} + +/// The internal State of the Union operation. +/// +/// At most one of BitMatrix and accumulator will be non-null at any given moment. Accumulator is a +/// sketch object that is employed until it graduates out of Sparse mode. +/// +/// At that point, it is converted into a full-sized bitMatrix, which is mathematically a sketch, +/// but doesn't maintain any of the "extra" fields of our sketch objects. +#[derive(Debug, Clone)] +enum UnionState { + Accumulator(CpcSketch), + BitMatrix(Vec), +} From acbcc1cbb0baa5d9b3a61f2f34e6705793807080 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 18:07:35 +0800 Subject: [PATCH 17/21] add tests Signed-off-by: tison --- Cargo.toml | 2 + datasketches/src/cpc/mod.rs | 4 +- datasketches/src/cpc/sketch.rs | 28 +++++-- datasketches/src/cpc/union.rs | 20 +++-- datasketches/tests/cpc_union_test.rs | 105 +++++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 16 deletions(-) create mode 100644 datasketches/tests/cpc_union_test.rs diff --git a/Cargo.toml b/Cargo.toml index 9e802b6..cc776fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,4 +45,6 @@ unused_must_use = "deny" [workspace.lints.clippy] dbg_macro = "deny" + too_many_arguments = "allow" +needless_range_loop = "allow" diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 21879c0..635c8c2 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -26,6 +26,7 @@ mod sketch; mod union; pub use self::sketch::CpcSketch; +pub use self::union::CpcUnion; /// Default log2 of K. const DEFAULT_LG_K: u8 = 11; @@ -35,6 +36,7 @@ const MIN_LG_K: usize = 4; const MAX_LG_K: usize = 26; #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[expect(clippy::upper_case_acronyms)] enum Flavor { EMPTY, // 0 == C < 1 SPARSE, // 1 <= C < 3K/32 @@ -70,7 +72,7 @@ fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor { } fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { - let k = 1i64 << lg_k; + let k = 1 << lg_k; let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K if tmp < 0 { 0 diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 832db46..43cfdf5 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -20,8 +20,13 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::canonical_double; use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; +use crate::cpc::DEFAULT_LG_K; +use crate::cpc::Flavor; +use crate::cpc::MAX_LG_K; use crate::cpc::MIN_LG_K; use crate::cpc::count_bits_set_in_matrix; +use crate::cpc::determine_correct_offset; +use crate::cpc::determine_flavor; use crate::cpc::estimator::hip_confidence_lb; use crate::cpc::estimator::hip_confidence_ub; use crate::cpc::estimator::icon_confidence_lb; @@ -29,8 +34,6 @@ use crate::cpc::estimator::icon_confidence_ub; use crate::cpc::estimator::icon_estimate; use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE; use crate::cpc::pair_table::PairTable; -use crate::cpc::{DEFAULT_LG_K, determine_correct_offset}; -use crate::cpc::{Flavor, MAX_LG_K, determine_flavor}; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; @@ -309,6 +312,8 @@ impl CpcSketch { determine_correct_offset(self.lg_k, self.num_coupons) ); + let k = 1 << self.lg_k; + // Construct the full-sized bit matrix that corresponds to the sketch let bit_matrix = self.build_bit_matrix(); @@ -323,8 +328,8 @@ impl CpcSketch { let mask_for_flipping_early_zone = (1u64 << new_offset) - 1; let mut all_surprises_ored = 0u64; - for (i, &bit) in bit_matrix.iter().enumerate() { - let mut pattern = bit; + for i in 0..k { + let mut pattern = bit_matrix[i]; self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8; pattern &= mask_for_clearing_window; // The following line converts surprising 0's to 1's in the "early zone", @@ -375,7 +380,7 @@ impl CpcSketch { } pub(super) fn build_bit_matrix(&self) -> Vec { - let k = 1usize << self.lg_k; + let k = 1 << self.lg_k; let offset = self.window_offset; assert!(offset <= 56); @@ -390,9 +395,9 @@ impl CpcSketch { if !self.sliding_window.is_empty() { // In other words, we are in window mode, not sparse mode - for (i, bit) in matrix.iter_mut().enumerate() { + for i in 0..k { // set the window bits, trusting the sketch's current offset - *bit |= (self.sliding_window[i] as u64) << offset; + matrix[i] |= (self.sliding_window[i] as u64) << offset; } } @@ -455,7 +460,7 @@ impl CpcSketch { if lg_k <= EMPIRICAL_SIZE_MAX_LGK { return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + MAX_PREAMBLE_SIZE_BYTES; } - let k = 1usize << lg_k; + let k = 1 << lg_k; ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES } } @@ -470,4 +475,11 @@ impl CpcSketch { let num_bits_set = count_bits_set_in_matrix(&bit_matrix); num_bits_set == self.num_coupons } + + /// Returns the number of coupons in the sketch. + /// + /// This is primarily for testing and validation purposes. + pub fn num_coupons(&self) -> u32 { + self.num_coupons + } } diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs index 47e2198..95ce433 100644 --- a/datasketches/src/cpc/union.rs +++ b/datasketches/src/cpc/union.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::cpc::CpcSketch; +use crate::cpc::DEFAULT_LG_K; +use crate::cpc::Flavor; use crate::cpc::count_bits_set_in_matrix; +use crate::cpc::determine_correct_offset; use crate::cpc::pair_table::PairTable; -use crate::cpc::{CpcSketch, determine_correct_offset}; -use crate::cpc::{DEFAULT_LG_K, Flavor}; use crate::hash::DEFAULT_UPDATE_SEED; /// The union (merge) operation for the CPC sketches. @@ -105,14 +107,15 @@ impl CpcUnion { all_surprises_ored |= pattern; while pattern != 0 { let col = pattern.trailing_zeros(); - pattern = pattern ^ (1u64 << col); // erase the 1 + pattern ^= 1u64 << col; // erase the 1 let row_col = ((i as u32) << 6) | col; let is_novel = table.maybe_insert(row_col); assert!(is_novel); } } - // at this point we could shrink an oversized hash table, but the relative waste isn't very big + // at this point we could shrink an oversized hash table, but the relative waste + // isn't very big sketch.first_interesting_column = all_surprises_ored.trailing_zeros() as u8; if sketch.first_interesting_column > offset { sketch.first_interesting_column = offset; // corner case @@ -129,6 +132,10 @@ impl CpcUnion { } /// Update this union with a CpcSketch. + /// + /// # Panics + /// + /// Panics if the seed of the provided sketch does not match the seed of this union. pub fn update(&mut self, sketch: &CpcSketch) { assert_eq!(self.seed, sketch.seed()); @@ -214,7 +221,6 @@ impl CpcUnion { assert_eq!(flavor, Flavor::SLIDING); let src_matrix = sketch.build_bit_matrix(); or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix, sketch.lg_k()); - return; } } } @@ -247,7 +253,7 @@ impl CpcUnion { UnionState::BitMatrix(matrix) => { let new_k = 1 << new_lg_k; let mut new_matrix = vec![0; new_k]; - or_matrix_into_matrix(&mut new_matrix, new_lg_k, &matrix, self.lg_k); + or_matrix_into_matrix(&mut new_matrix, new_lg_k, matrix, self.lg_k); self.lg_k = new_lg_k; self.state = UnionState::BitMatrix(new_matrix); } @@ -315,7 +321,7 @@ fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) { // downsamples when dst lgK < src LgK let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63; // Using a golden ratio stride fixes the snowplow effect. - let mut stride = (0.6180339887498949025 * (num_slots as f64)) as u32; + let mut stride = (0.6180339887498949 * (num_slots as f64)) as u32; assert!(stride >= 2); if stride == ((stride >> 1) << 1) { // force the stride to be odd diff --git a/datasketches/tests/cpc_union_test.rs b/datasketches/tests/cpc_union_test.rs new file mode 100644 index 0000000..0a2d1dd --- /dev/null +++ b/datasketches/tests/cpc_union_test.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::cpc::CpcSketch; +use datasketches::cpc::CpcUnion; +use googletest::assert_that; +use googletest::prelude::near; + +const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02; + +#[test] +fn test_empty() { + let union = CpcUnion::new(11); + let sketch = union.get_result(); + assert!(sketch.is_empty()); + assert_eq!(sketch.estimate(), 0.0); +} + +#[test] +fn test_two_values() { + let mut sketch = CpcSketch::new(11); + sketch.update(1); + let mut union = CpcUnion::new(11); + union.update(&sketch); + + let result = union.get_result(); + assert!(!result.is_empty()); + assert_eq!(result.estimate(), 1.0); + + sketch.update(2); + union.update(&sketch); + let result = union.get_result(); + assert!(!result.is_empty()); + assert_that!( + sketch.estimate(), + near(2.0, RELATIVE_ERROR_FOR_LG_K_11 * 2.0) + ); +} + +#[test] +fn test_custom_seed() { + let mut sketch = CpcSketch::with_seed(11, 123); + sketch.update(1); + sketch.update(2); + sketch.update(3); + + let mut union = CpcUnion::with_seed(11, 123); + union.update(&sketch); + let result = union.get_result(); + assert!(!result.is_empty()); + assert_that!( + result.estimate(), + near(3.0, RELATIVE_ERROR_FOR_LG_K_11 * 3.0) + ); +} + +#[test] +#[should_panic] +fn test_custom_seed_mismatch() { + let mut sketch = CpcSketch::with_seed(11, 123); + sketch.update(1); + sketch.update(2); + sketch.update(3); + + let mut union = CpcUnion::with_seed(11, 234); + union.update(&sketch); +} + +#[test] +fn test_large_values() { + let mut key = 0; + let mut sketch = CpcSketch::new(11); + let mut union = CpcUnion::new(11); + for _ in 0..1000 { + let mut tmp = CpcSketch::new(11); + for _ in 0..10000 { + sketch.update(key); + tmp.update(key); + key += 1; + } + union.update(&tmp); + } + let result = union.get_result(); + assert!(!result.is_empty()); + assert_eq!(result.num_coupons(), union.num_coupons()); + let estimate = sketch.estimate(); + assert_that!( + result.estimate(), + near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate) + ); +} From 770ed74d4a7142a4c774f5f8654149c2026b65af Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 19:10:16 +0800 Subject: [PATCH 18/21] less cast Signed-off-by: tison --- datasketches/src/common/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs index 4a1db92..a43373a 100644 --- a/datasketches/src/common/mod.rs +++ b/datasketches/src/common/mod.rs @@ -28,14 +28,14 @@ pub(crate) mod binomial_bounds; pub(crate) mod inv_pow2_table; /// Canonicalize double value for compatibility with Java -pub(crate) fn canonical_double(value: f64) -> i64 { +pub(crate) fn canonical_double(value: f64) -> u64 { if value.is_nan() { // Java's Double.doubleToLongBits() NaN value - 0x7ff8000000000000i64 + 0x7ff8000000000000u64 } else { // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode, // which Rust guarantees. Thus, by adding a positive zero we // canonicalize signed zero without any branches in one instruction. - (value + 0.0).to_bits() as i64 + (value + 0.0).to_bits() } } From 21ea3f5724884dd6b02ebbc4e183a6e0041e22d8 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 30 Jan 2026 22:38:21 +0800 Subject: [PATCH 19/21] more tests Signed-off-by: tison --- datasketches/tests/cpc_union_test.rs | 64 +++++++++++++++++++++++++++ datasketches/tests/cpc_update_test.rs | 7 +-- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/datasketches/tests/cpc_union_test.rs b/datasketches/tests/cpc_union_test.rs index 0a2d1dd..6e8aa32 100644 --- a/datasketches/tests/cpc_union_test.rs +++ b/datasketches/tests/cpc_union_test.rs @@ -103,3 +103,67 @@ fn test_large_values() { near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate) ); } + +#[test] +fn test_reduce_k_empty() { + let mut sketch = CpcSketch::new(11); + for i in 0..10000 { + sketch.update(i); + } + let mut union = CpcUnion::new(12); + union.update(&sketch); + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} + +#[test] +fn test_reduce_k_sparse() { + let mut union = CpcUnion::new(12); + + let mut sketch12 = CpcSketch::new(12); + for i in 0..100 { + sketch12.update(i); + } + union.update(&sketch12); + + let mut sketch11 = CpcSketch::new(11); + for i in 0..1000 { + sketch11.update(i); + } + union.update(&sketch11); + + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} + +#[test] +fn test_reduce_k_window() { + let mut union = CpcUnion::new(12); + + let mut sketch12 = CpcSketch::new(12); + for i in 0..500 { + sketch12.update(i); + } + union.update(&sketch12); + + let mut sketch11 = CpcSketch::new(11); + for i in 0..1000 { + sketch11.update(i); + } + union.update(&sketch11); + + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs index fb17573..7b814f7 100644 --- a/datasketches/tests/cpc_update_test.rs +++ b/datasketches/tests/cpc_update_test.rs @@ -47,17 +47,14 @@ fn test_one_value() { #[test] fn test_many_values() { - const N: usize = 10000; - const N_F64: f64 = N as f64; - let mut sketch = CpcSketch::new(11); - for i in 0..N { + for i in 0..10000 { sketch.update(i); } assert!(!sketch.is_empty()); assert_that!( sketch.estimate(), - near(N_F64, RELATIVE_ERROR_FOR_LG_K_11 * N_F64) + near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) ); assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); From aa0fe4ad6c54db77823dc2dc2c29f336004ee7b1 Mon Sep 17 00:00:00 2001 From: tison Date: Sun, 1 Feb 2026 11:01:35 +0800 Subject: [PATCH 20/21] add more docs Signed-off-by: tison --- datasketches/src/cpc/mod.rs | 20 +++++++++- datasketches/src/cpc/sketch.rs | 42 +++++++++++++------- datasketches/src/cpc/union.rs | 61 ++++++++++++++++++++++++++++-- datasketches/src/hll/sketch.rs | 2 +- datasketches/src/hll/union.rs | 2 +- datasketches/src/tdigest/sketch.rs | 4 +- 6 files changed, 109 insertions(+), 22 deletions(-) diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs index 635c8c2..4c9fea2 100644 --- a/datasketches/src/cpc/mod.rs +++ b/datasketches/src/cpc/mod.rs @@ -17,7 +17,25 @@ #![allow(dead_code)] -//! Compressed Probabilistic Counting sketch family. +//! Compressed Probabilistic Counting sketch. +//! +//! This is a unique-counting sketch that implements the Compressed Probabilistic Counting (CPC, +//! a.k.a. FM85) algorithms developed by Kevin Lang in his paper [Back to the Future: an Even More +//! Nearly Optimal Cardinality Estimation Algorithm](https://arxiv.org/abs/1708.06839). +//! +//! This sketch is extremely space-efficient when serialized. In an apples-to-apples empirical +//! comparison against compressed HyperLogLog sketches, this new algorithm simultaneously wins on +//! the two dimensions of the space/accuracy tradeoff and produces sketches that are smaller than +//! the entropy of HLL, so no possible implementation of compressed HLL can match its space +//! efficiency for a given accuracy. As described in the paper this sketch implements a newly +//! developed ICON estimator algorithm that survives union operations, another well-known estimator, +//! the [Historical Inverse Probability (HIP)](https://arxiv.org/abs/1306.3284) estimator does not. +//! +//! The update speed performance of this sketch is quite fast and is comparable to the speed of HLL. +//! The union (merging) capability of this sketch also allows for merging of sketches with different +//! configurations of K. +//! +//! For additional security this sketch can be configured with a user-specified hash seed. mod estimator; mod kxp_byte_lookup; diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 43cfdf5..6af8bfc 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -38,6 +38,8 @@ use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; /// A Compressed Probabilistic Counting sketch. +/// +/// See the [module level documentation](super) for more. #[derive(Debug, Clone)] pub struct CpcSketch { // immutable config variables @@ -77,11 +79,19 @@ impl Default for CpcSketch { impl CpcSketch { /// Creates a new `CpcSketch` with the given `lg_k` and default seed. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. pub fn new(lg_k: u8) -> Self { Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) } /// Creates a new `CpcSketch` with the given `lg_k` and `seed`. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. pub fn with_seed(lg_k: u8, seed: u64) -> Self { assert!( (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)), @@ -222,7 +232,7 @@ impl CpcSketch { fn update_sparse(&mut self, row_col: u32) { let k = 1 << self.lg_k; let c32pre = (self.num_coupons as u64) << 5; - assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE + debug_assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); if is_novel { self.num_coupons += 1; @@ -235,11 +245,11 @@ impl CpcSketch { } fn promote_sparse_to_windowed(&mut self) { - assert_eq!(self.window_offset, 0); + debug_assert_eq!(self.window_offset, 0); let k = 1 << self.lg_k; let c32 = (self.num_coupons as u64) << 5; - assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k)))); + debug_assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k)))); self.sliding_window.resize(k as usize, 0); @@ -257,20 +267,20 @@ impl CpcSketch { } else { // cannot use must_insert(), because it doesn't provide for growth let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); - assert!(is_novel); + debug_assert!(is_novel); } } } } fn update_windowed(&mut self, row_col: u32) { - assert!(self.window_offset <= 56); + debug_assert!(self.window_offset <= 56); let k = 1 << self.lg_k; let c32pre = (self.num_coupons as u64) << 5; - assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID + debug_assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID let c8pre = (self.num_coupons as u64) << 3; let w8pre = (self.window_offset as u64) << 3; - assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset) + debug_assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset) let mut is_novel = false; // novel if new coupon; let col = (row_col & 63) as u8; @@ -297,17 +307,17 @@ impl CpcSketch { let c8post = (self.num_coupons as u64) << 3; if c8post >= (27 + w8pre) * k { self.move_window(); - assert!((1..=56).contains(&self.window_offset)); + debug_assert!((1..=56).contains(&self.window_offset)); let w8post = (self.window_offset as u64) << 3; - assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset) + debug_assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset) } } } fn move_window(&mut self) { let new_offset = self.window_offset + 1; - assert!(new_offset <= 56); - assert_eq!( + debug_assert!(new_offset <= 56); + debug_assert_eq!( new_offset, determine_correct_offset(self.lg_k, self.num_coupons) ); @@ -341,7 +351,7 @@ impl CpcSketch { pattern ^= 1 << col; // erase the 1 let row_col = ((i as u32) << 6) | col; let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); - assert!(is_novel); + debug_assert!(is_novel); } } @@ -382,7 +392,7 @@ impl CpcSketch { pub(super) fn build_bit_matrix(&self) -> Vec { let k = 1 << self.lg_k; let offset = self.window_offset; - assert!(offset <= 56); + debug_assert!(offset <= 56); // Fill the matrix with default rows in which the "early zone" is filled with ones. // This is essential for the routine's O(k) time cost (as opposed to O(C)). @@ -423,6 +433,10 @@ impl CpcSketch { /// empirically measured size should be large enough for at least 99.9 percent of sketches. /// /// For small values of `n` the size can be much smaller. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 26]`. pub fn max_serialized_bytes(lg_k: u8) -> usize { let lg_k = lg_k as usize; assert!( @@ -469,7 +483,7 @@ impl CpcSketch { impl CpcSketch { /// Validate this sketch is valid. /// - /// This method is typically used for debugging and testing purposes. + /// This is primarily for testing and validation purposes. pub fn validate(&self) -> bool { let bit_matrix = self.build_bit_matrix(); let num_bits_set = count_bits_set_in_matrix(&bit_matrix); diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs index 95ce433..1a5255f 100644 --- a/datasketches/src/cpc/union.rs +++ b/datasketches/src/cpc/union.rs @@ -15,6 +15,52 @@ // specific language governing permissions and limitations // under the License. +//! The merging logic is somewhat involved, so it will be summarized here. +//! +//! First, we compare the K values of the union and the source sketch. +//! +//! If `source.K < union.K`, we reduce the union's K to match, which requires downsampling the +//! union's internal sketch. +//! +//! Here is how to perform the downsampling: +//! +//! If the union contains a bitMatrix, downsample it by row-wise ORing. +//! +//! If the union contains a sparse sketch, then create a new empty sketch, and walk the old target +//! sketch updating the new one (with modulo). At the end, check whether the new target sketch is +//! still in sparse mode (it might not be, because downsampling densifies the set of collected +//! coupons). If it is NOT in sparse mode, immediately convert it to a bitMatrix. +//! +//! At this point, we have `source.K >= union.K`. (We won't keep mentioning this, but in all the +//! following the source's row indices are used mod union.K while updating the union's sketch. +//! That takes care of the situation where `source.K < union.K`.) +//! +//! Case A: union is Sparse and source is Sparse. We walk the source sketch updating the union's +//! sketch. At the end, if the union's sketch is no longer in sparse mode, we convert it to a +//! bitMatrix. +//! +//! Case B: union is bitMatrix and source is Sparse. We walk the source sketch, setting bits in the +//! bitMatrix. +//! +//! In the remaining cases, we have flavor(source) > Sparse, so we immediately convert the union's +//! sketch to a bitMatrix (even if the union contains very few coupons). Then: +//! +//! Case C: union is bitMatrix and source is Hybrid or Pinned. Then we OR the source's sliding +//! window into the bitMatrix, and walk the source's table, setting bits in the bitMatrix. +//! +//! Case D: union is bitMatrix, and source is Sliding. Then we convert the source into a bitMatrix, +//! and OR it into the union's bitMatrix. (Important note; merely walking the source wouldn't work +//! because of the partially inverted Logic in the Sliding flavor, where the presence of coupons +//! is sometimes indicated by the ABSENCE of row_col pairs in the surprises table.) +//! +//! How does [`CpcUnion::get_result`] work? +//! +//! If the union has an Accumulator state, make a copy of that sketch. +//! +//! If the union has a BitMatrix state, then we have to convert the bitMatrix back into a sketch, +//! which requires doing some extra work to figure out the values of num_coupons, offset, +//! first_interesting_column, and kxp. + use crate::cpc::CpcSketch; use crate::cpc::DEFAULT_LG_K; use crate::cpc::Flavor; @@ -42,11 +88,19 @@ impl Default for CpcUnion { impl CpcUnion { /// Creates a new `CpcUnion` with the given `lg_k` and default seed. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. pub fn new(lg_k: u8) -> Self { Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) } /// Creates a new `CpcUnion` with the given `lg_k` and `seed`. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. pub fn with_seed(lg_k: u8, seed: u64) -> Self { // We begin with the accumulator holding an EMPTY_MERGED sketch object. let sketch = CpcSketch::with_seed(lg_k, seed); @@ -56,8 +110,8 @@ impl CpcUnion { /// Return the parameter lg_k. /// - /// Note that due to merging with source sketches that may have a lower value of LgK, this value - /// can be less than what the union object was configured with. + /// Note that due to merging with source sketches that may have a lower value of lg_k, this + /// value can be less than what the union object was configured with. pub fn lg_k(&self) -> u8 { self.lg_k } @@ -121,7 +175,8 @@ impl CpcUnion { sketch.first_interesting_column = offset; // corner case } - // HIP-related fields will contain zeros, and that is okay since merge_flag is true + // HIP-related fields will contain zeros, and that is okay since merge_flag is true, + // and thus the HIP estimator will not be used. sketch.sliding_window = sliding_window; sketch.surprising_value_table = Some(table); sketch.merge_flag = true; diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index 8c79625..bc1ce42 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -40,7 +40,7 @@ use crate::hll::serialization::*; /// A HyperLogLog sketch. /// -/// See the [hll module level documentation](crate::hll) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone, PartialEq)] pub struct HllSketch { lg_config_k: u8, diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs index 5dba0bb..870c8d8 100644 --- a/datasketches/src/hll/union.rs +++ b/datasketches/src/hll/union.rs @@ -45,7 +45,7 @@ use crate::hll::pack_coupon; /// the union of all input sketches. It automatically handles sketches with /// different configurations and modes. /// -/// See the [hll module level documentation](crate::hll) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone)] pub struct HllUnion { /// Maximum lg_k that this union can handle diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index 91c90c7..651d44e 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -34,7 +34,7 @@ const DEFAULT_WEIGHT: NonZeroU64 = NonZeroU64::new(1).unwrap(); /// T-Digest sketch for estimating quantiles and ranks. /// -/// See the [tdigest module level documentation](crate::tdigest) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone)] pub struct TDigestMut { k: u16, @@ -788,7 +788,7 @@ impl TDigestMut { /// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks. /// -/// See the [module documentation](super) for more details. +/// See the [module level documentation](super) for more. pub struct TDigest { k: u16, From 99e71ffd419bb8c2d34c3909e95dddeb3ea832ce Mon Sep 17 00:00:00 2001 From: tison Date: Sun, 1 Feb 2026 11:11:25 +0800 Subject: [PATCH 21/21] tidy Signed-off-by: tison --- datasketches/src/error.rs | 2 +- datasketches/src/tdigest/sketch.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/datasketches/src/error.rs b/datasketches/src/error.rs index dcb0592..7e8b663 100644 --- a/datasketches/src/error.rs +++ b/datasketches/src/error.rs @@ -89,7 +89,7 @@ impl Error { } } -// Convenience constructors for deserialization errors +// Convenient constructors used within datasketches crate. impl Error { pub(crate) fn invalid_argument(msg: impl Into) -> Self { Self::new(ErrorKind::InvalidArgument, msg) diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index 651d44e..a790892 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -22,7 +22,6 @@ use std::num::NonZeroU64; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; -use crate::error::ErrorKind; use crate::tdigest::serialization::*; /// The default value of K if one is not specified. @@ -100,10 +99,9 @@ impl TDigestMut { /// ``` pub fn try_new(k: u16) -> Result { if k < 10 { - return Err(Error::new( - ErrorKind::InvalidArgument, - format!("k must be at least 10, got {k}"), - )); + return Err(Error::invalid_argument(format!( + "k must be at least 10, got {k}" + ))); } Ok(Self::make(