From 848131a1cdd6de945b86a48782cc2afb08e00734 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Tue, 20 Jan 2026 21:22:22 +0800
Subject: [PATCH 01/21] feat: impl CpcSketch

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/mod.rs        | 3 +++
 datasketches/src/cpc/pair_table.rs | 8 ++++++++
 datasketches/src/lib.rs            | 1 +
 3 files changed, 12 insertions(+)
 create mode 100644 datasketches/src/cpc/mod.rs
 create mode 100644 datasketches/src/cpc/pair_table.rs
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
new file mode 100644
index 0000000..a2b5957
--- /dev/null
+++ b/datasketches/src/cpc/mod.rs
@@ -0,0 +1,3 @@
+//! Compressed Probabilistic Counting sketch family.
+
+mod pair_table;
diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs
new file mode 100644
index 0000000..0848634
--- /dev/null
+++ b/datasketches/src/cpc/pair_table.rs
@@ -0,0 +1,8 @@
+/// A highly specialized hash table used for sparse data.
+///
+/// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is
+/// optimized for scenarios where the cardinality of entries is low.
+pub(crate) struct PairTable {
+    pub keys: Vec<u64>,
+    pub values: Vec<u8>,
+}
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index 009fd9e..17701ab 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -33,6 +33,7 @@ compile_error!("datasketches does not support big-endian targets");
 pub mod bloom;
 pub mod common;
 pub mod countmin;
+pub mod cpc;
 pub mod error;
 pub mod frequencies;
 pub mod hll;

From d42cb8ffe429f52d97146a9e31a7a3535d22c4a6 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Tue, 20 Jan 2026 23:52:14 +0800
Subject: [PATCH 02/21] pair table

Signed-off-by: tison <wander4096@gmail.com>
---
 Cargo.lock                         |  98 ++++++++
 Cargo.toml                         |   1 +
 datasketches/Cargo.toml            |   3 +-
 datasketches/src/cpc/mod.rs        |  17 ++
 datasketches/src/cpc/pair_table.rs | 345 ++++++++++++++++++++++++++++-
 5 files changed, 460 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 34faf60..f9efb3b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -73,6 +73,12 @@ version = "2.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
 
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
 [[package]]
 name = "clap"
 version = "4.5.53"
@@ -124,6 +130,7 @@ name = "datasketches"
 version = "0.2.0"
 dependencies = [
  "googletest",
+ "rand",
 ]
 
 [[package]]
@@ -149,6 +156,18 @@ dependencies = [
  "datasketches",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasip2",
+]
+
 [[package]]
 name = "googletest"
 version = "0.14.2"
@@ -217,6 +236,15 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.104"
@@ -235,6 +263,41 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom",
+]
+
 [[package]]
 name = "regex"
 version = "1.12.2"
@@ -312,6 +375,15 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
+[[package]]
+name = "wasip2"
+version = "1.0.1+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
+dependencies = [
+ "wit-bindgen",
+]
+
 [[package]]
 name = "which"
 version = "8.0.0"
@@ -344,6 +416,12 @@ version = "0.0.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904"
 
+[[package]]
+name = "wit-bindgen"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
+
 [[package]]
 name = "x"
 version = "0.0.0"
@@ -351,3 +429,23 @@ dependencies = [
  "clap",
  "which",
 ]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 30c7a35..c7e4a18 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ datasketches = { path = "datasketches" }
 # Crates.io dependencies
 clap = { version = "4.5.20", features = ["derive"] }
 googletest = { version = "0.14.2" }
+rand = { version = "0.9.2" }
 which = { version = "8.0.0" }
 
 [workspace.lints.rust]
diff --git a/datasketches/Cargo.toml b/datasketches/Cargo.toml
index 9676454..5e1f7ab 100644
--- a/datasketches/Cargo.toml
+++ b/datasketches/Cargo.toml
@@ -34,10 +34,9 @@ keywords = ["sketch", "hyperloglog", "probabilistic"]
 all-features = true
 rustdoc-args = ["--cfg", "docsrs"]
 
-[dependencies]
-
 [dev-dependencies]
 googletest = { workspace = true }
+rand = { workspace = true }
 
 [lints]
 workspace = true
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index a2b5957..2be42c1 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 //! Compressed Probabilistic Counting sketch family.
 
 mod pair_table;
diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs
index 0848634..9ab6888 100644
--- a/datasketches/src/cpc/pair_table.rs
+++ b/datasketches/src/cpc/pair_table.rs
@@ -1,8 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const UPSIZE_NUMERATOR: u32 = 3;
+const UPSIZE_DENOMINATOR: u32 = 4;
+const DOWNSIZE_NUMERATOR: u32 = 1;
+const DOWNSIZE_DENOMINATOR: u32 = 4;
+
 /// A highly specialized hash table used for sparse data.
 ///
 /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is
 /// optimized for scenarios where the cardinality of entries is low.
 pub(crate) struct PairTable {
-    pub keys: Vec<u64>,
-    pub values: Vec<u8>,
+    /// log2 of number of slots
+    lg_size: u8,
+    num_valid_bits: u8,
+    num_items: u32,
+    slots: Vec<u32>,
+}
+
+impl PairTable {
+    pub fn new(lg_size: u8, num_valid_bits: u8) -> Self {
+        assert!(
+            (2..=26).contains(&lg_size),
+            "lg_size must be in [2, 26], got {lg_size}"
+        );
+        assert!(
+            ((lg_size + 1)..=32).contains(&num_valid_bits),
+            "num_valid_bits must be in [lg_size + 1, 32], got {num_valid_bits} where lg_size = {lg_size}"
+        );
+        Self {
+            lg_size,
+            num_valid_bits,
+            num_items: 0,
+            slots: vec![u32::MAX; 1 << lg_size],
+        }
+    }
+
+    /// A constructor specifically tailored to be a part of FM85 decompression scheme.
+    pub fn from_slots(lg_size: u8, num_items: u32, slots: Vec<u32>) -> Self {
+        let mut lg_num_slots = 2;
+        while UPSIZE_DENOMINATOR * num_items > (UPSIZE_NUMERATOR * (1 << lg_num_slots)) {
+            lg_num_slots += 1;
+        }
+
+        let mut table = Self::new(lg_num_slots, 6 + lg_size);
+
+        // Note: there is a possible "snowplow effect" here because the caller is passing in a
+        // sorted pairs array. However, we are starting out with the correct final table size, so
+        // the problem might not occur.
+
+        for slot in slots {
+            table.must_insert(slot);
+        }
+        table.num_items = num_items;
+        table
+    }
+
+    pub fn lg_size(&self) -> u8 {
+        self.lg_size
+    }
+
+    pub fn num_items(&self) -> u32 {
+        self.num_items
+    }
+
+    pub fn slots(&self) -> &[u32] {
+        &self.slots
+    }
+
+    pub fn clear(&mut self) {
+        self.slots.fill(u32::MAX);
+        self.num_items = 0;
+    }
+
+    pub fn maybe_delete(&mut self, item: u32) -> bool {
+        let index = self.lookup(item) as usize;
+        if self.slots[index] == u32::MAX {
+            return false;
+        }
+        assert_eq!(
+            self.slots[index], item,
+            "item {item} not found at index {index}"
+        );
+        assert!(self.num_items > 0, "no items to delete");
+
+        // delete the item
+        self.slots[index] = u32::MAX;
+        self.num_items -= 1;
+
+        // re-insert all items between the freed slot and the next empty slot
+        let mask = (1 << self.lg_size) - 1;
+        let mut probe = (index + 1) & mask;
+        let mut fetched = self.slots[probe];
+        while fetched != u32::MAX {
+            self.slots[probe] = u32::MAX;
+            self.must_insert(fetched);
+            probe = (probe + 1) & mask;
+            fetched = self.slots[probe];
+        }
+
+        // shrink if necessary
+        while ((DOWNSIZE_DENOMINATOR * self.num_items) < (DOWNSIZE_NUMERATOR * (1 << self.lg_size)))
+            && (self.lg_size > 2)
+        {
+            self.rebuild(self.lg_size - 1);
+        }
+
+        true
+    }
+
+    pub fn maybe_insert(&mut self, item: u32) -> bool {
+        let index = self.lookup(item) as usize;
+        if self.slots[index] == item {
+            return false;
+        }
+        assert_eq!(
+            self.slots[index],
+            u32::MAX,
+            "no empty slot found for item {item} at index {index}"
+        );
+        self.slots[index] = item;
+        self.num_items += 1;
+        while (UPSIZE_DENOMINATOR * self.num_items) > (UPSIZE_NUMERATOR * (1 << self.lg_size)) {
+            self.rebuild(self.lg_size + 1);
+        }
+        true
+    }
+
+    /// While extracting the items from a linear probing hashtable, this will usually undo the
+    /// wrap-around provided that the table isn't too full.
+    ///
+    /// Experiments suggest that for sufficiently large tables the load factor would have to be
+    /// over 90 percent before this would fail frequently, and even then the subsequent sort would
+    /// fix things up.
+    ///
+    /// The result is nearly sorted, so make sure to use an efficient sort for that case.
+    pub fn unwrapping_get_items(&self) -> Vec<u32> {
+        if self.slots.is_empty() {
+            return vec![];
+        }
+
+        let table_size = 1usize << self.lg_size;
+        let mut result = vec![0; self.num_items as usize];
+        let mut i = 0usize;
+        let mut l = 0usize;
+        let mut r = self.num_items as usize - 1;
+
+        // special rules for the region before the first empty slot
+        let hi_bit = 1 << (self.num_valid_bits - 1);
+        while i < table_size && self.slots[i] != u32::MAX {
+            let item = self.slots[i];
+            i += 1;
+            if (item & hi_bit) != 0 {
+                // this item was probably wrapped, so move to end
+                result[r] = item;
+                r -= 1;
+            } else {
+                result[l] = item;
+                l += 1;
+            }
+        }
+
+        // the rest of the table is processed normally
+        while i < table_size {
+            let item = self.slots[i];
+            i += 1;
+            if item != u32::MAX {
+                result[l] = item;
+                l += 1;
+            }
+        }
+
+        assert_eq!(l, r + 1, "number of items mismatch during extraction");
+        result
+    }
+
+    fn must_insert(&mut self, item: u32) {
+        let index = self.lookup(item) as usize;
+        assert_ne!(
+            self.slots[index], item,
+            "item {item} already present in table"
+        );
+        assert_eq!(
+            self.slots[index],
+            u32::MAX,
+            "no empty slot found for item {item} at index {index}"
+        );
+        self.slots[index] = item;
+        // counts and resizing must be handled by the caller.
+    }
+
+    fn lookup(&self, item: u32) -> u32 {
+        let size = 1 << self.lg_size;
+        let mask = size - 1;
+
+        let shift = self.num_valid_bits - self.lg_size;
+
+        // extract high table size bits
+        let mut probe = item >> shift;
+        assert!(probe <= mask, "probe = {probe}, mask = {mask}");
+
+        loop {
+            let slot = self.slots[probe as usize];
+            if slot != item && slot != u32::MAX {
+                probe = (probe + 1) & mask;
+            } else {
+                break;
+            }
+        }
+
+        probe
+    }
+
+    /// Rebuilds to a larger size. `num_items` and `num_valid_bits` remain unchanged.
+    fn rebuild(&mut self, lg_size: u8) {
+        assert!(
+            (2..=26).contains(&lg_size),
+            "lg_size must be in [2, 26], got {lg_size}"
+        );
+        assert!(
+            ((lg_size + 1)..=32).contains(&self.num_valid_bits),
+            "num_valid_bits must be in [lg_size + 1, 32], got {} where lg_size = {lg_size}",
+            self.num_valid_bits
+        );
+
+        let new_size = 1u32 << lg_size;
+        assert!(
+            new_size > self.num_items,
+            "new table size ({new_size}) must be larger than number of items {}",
+            self.num_items
+        );
+
+        let slots = std::mem::replace(&mut self.slots, vec![u32::MAX; new_size as usize]);
+        self.lg_size = lg_size;
+        for slot in slots {
+            if slot != u32::MAX {
+                self.must_insert(slot);
+            }
+        }
+    }
+}
+
+/// This should be used pair with [`PairTable::unwrapping_get_items`].
+///
+/// In applications where the input array is already nearly sorted, insertion sort runs in linear
+/// time with a very small constant.
+///
+/// This introspective version of insertion sort protects against the quadratic cost of sorting bad
+/// input arrays. It keeps track of how much work has been done, and if that exceeds a constant
+/// times the array length, it switches to a different sorting algorithm.
+pub fn introspective_insertion_sort(a: &mut [u32]) {
+    let cost_limit = 8 * a.len();
+
+    let mut cost = 0;
+    for i in 1..a.len() {
+        let mut j = i;
+        let v = a[i];
+        while j >= 1 && v < a[j - 1] {
+            a[j] = a[j - 1];
+            j -= 1;
+        }
+        a[j] = v;
+        cost += i - j; // distance moved is a measure of work
+        if cost > cost_limit {
+            knuth_shell_sort3(a);
+            return;
+        }
+    }
+}
+
+fn knuth_shell_sort3(a: &mut [u32]) {
+    let len = a.len();
+
+    let mut h = 0;
+    while h < len / 9 {
+        h = 3 * h + 1;
+    }
+
+    while h > 0 {
+        for i in h..len {
+            let v = a[i];
+            let mut j = i;
+            while j >= h && v < a[j - h] {
+                a[j] = a[j - h];
+                j -= h;
+            }
+            a[j] = v;
+        }
+        h /= 3;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_sort_1() {
+        let data = (0..10)
+            .map(|_| rand::random_range(0..10000))
+            .collect::<Vec<_>>();
+        let mut sorted = data.clone();
+        introspective_insertion_sort(&mut sorted);
+        assert!(
+            sorted.is_sorted(),
+            "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+        );
+    }
+
+    #[test]
+    fn test_sort_2() {
+        let data = (0..10)
+            .map(|_| rand::random_range(0..10000) + 3_000_000_000)
+            .collect::<Vec<_>>();
+        let mut sorted = data.clone();
+        introspective_insertion_sort(&mut sorted);
+        assert!(
+            sorted.is_sorted(),
+            "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+        );
+    }
+
+    #[test]
+    fn test_sort_3() {
+        let len = 20;
+        let data = (0..len).map(|i| len - i + 1).collect::<Vec<_>>();
+        let mut sorted = data.clone();
+        introspective_insertion_sort(&mut sorted);
+        assert!(
+            sorted.is_sorted(),
+            "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+        );
+    }
 }

From 8b9d82f927f3aa78cf2b29217d25c62e69525202 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 13:29:27 +0800
Subject: [PATCH 03/21] icon_estimator

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/icon_estimator.rs | 234 +++++++++++++++++++++++++
 datasketches/src/cpc/mod.rs            |   1 +
 datasketches/src/cpc/pair_table.rs     |   2 +-
 3 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 datasketches/src/cpc/icon_estimator.rs

diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/icon_estimator.rs
new file mode 100644
index 0000000..5c91ff3
--- /dev/null
+++ b/datasketches/src/cpc/icon_estimator.rs
@@ -0,0 +1,234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! The ICON estimator for FM85 sketches is defined by the arXiv paper.
+//!
+//! The current file provides exact and approximate implementations of this estimator. The exact
+//! version works for any value of K, but is quite slow.
+//!
+//! The much faster approximate version works for K values that are powers of two ranging from 2^4
+//! to 2^32.
+//!
+//! At a high-level, this approximation can be described as using an exponential approximation when
+//! C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial
+//! approximation of a pre-conditioned version of the true ICON mapping from C to N_hat.
+//!
+//! This file also provides a validation procedure that compares its approximate and exact
+//! implementations of the FM85 ICON estimator.
+
+const ICON_MIN_LOG_K: usize = 4;
+const ICON_MAX_LOG_K: usize = 26;
+const ICON_POLYNOMIAL_DEGREE: usize = 19;
+const ICON_POLYNOMIAL_NUM_COEFFICIENTS: usize = 1 + ICON_POLYNOMIAL_DEGREE;
+const ICON_TABLE_SIZE: usize =
+    ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K));
+
+#[rustfmt::skip]
+#[allow(clippy::excessive_precision)]
+const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [
+    // log K = 4
+    0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817,
+    1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904,
+    -30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998,
+    0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05,
+
+    // log K = 5
+    0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854,
+    1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069,
+    -28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962,
+    0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05,
+
+    // log K = 6
+    0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116,
+    1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077,
+    -28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851,
+    0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05,
+
+    // log K = 7
+    0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863,
+    1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936,
+    -28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037,
+    0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05,
+
+    // log K = 8
+    0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681,
+    0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002,
+    -15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996,
+    0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05,
+
+    // log K = 9
+    0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214,
+    0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059,
+    -15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002,
+    0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05,
+
+    // log K = 10
+    0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428,
+    0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034,
+    -15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093,
+    0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05,
+
+    // log K = 11
+    0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211,
+    0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958,
+    -14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051,
+    0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05,
+
+    // log K = 12
+    0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343,
+    0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078,
+    -17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076,
+    0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05,
+
+    // log K = 13
+    0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548,
+    0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962,
+    -17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975,
+    0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05,
+
+    // log K = 14
+    0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777,
+    0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022,
+    -17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993,
+    0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05,
+
+    // log K = 15
+    0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128,
+    0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048,
+    -18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953,
+    0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05,
+
+    // log K = 16
+    0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592,
+    0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041,
+    -18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102,
+    0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05,
+
+    // log K = 17
+    0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043,
+    0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058,
+    -18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063,
+    0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05,
+
+    // log K = 18
+    0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003,
+    0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132,
+    -18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104,
+    0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05,
+
+    // log K = 19
+    0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954,
+    0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001,
+    -16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915,
+    0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05,
+
+    // log K = 20
+    0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951,
+    0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011,
+    -15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906,
+    0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05,
+
+    // log K = 21
+    0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961,
+    0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888,
+    -19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987,
+    0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05,
+
+    // log K = 22
+    0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969,
+    0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099,
+    -18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093,
+    0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05,
+
+    // log K = 23
+    0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059,
+    0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843,
+    -18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987,
+    0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05,
+
+    // log K = 24
+    0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152,
+    0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951,
+    -17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056,
+    0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05,
+
+    // log K = 25
+    0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451,
+    0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923,
+    -17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892,
+    0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05,
+
+    // log K = 26
+    0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949,
+    0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154,
+    -18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955,
+    0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05,
+];
+
+pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 {
+    let end = start + num - 1;
+    let mut total = coefficients[end];
+    for i in (start..end).rev() {
+        total *= x;
+        total += coefficients[i];
+    }
+    total
+}
+
+pub(super) fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
+    0.7940236163830469 * k * 2f64.powf(c / k)
+}
+
+pub(super) fn compute_icon_estimate(lg_k: u8, c: u32) -> f64 {
+    let lg_k = lg_k as usize;
+    assert!(
+        (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),
+        "lg_k out of range; got {lg_k}",
+    );
+
+    match c {
+        0 => return 0.0,
+        1 => return 1.0,
+        _ => {}
+    }
+
+    let k = (1 << lg_k) as f64;
+    let c = c as f64;
+
+    // Differing thresholds ensure that the approximated estimator is monotonically increasing.
+    let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 };
+    if c > threshold_factor * k {
+        return icon_exponential_approximation(k, c);
+    }
+
+    let factor = evaluate_polynomial(
+        &ICON_POLYNOMIAL_COEFFICIENTS,
+        ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
+        ICON_POLYNOMIAL_NUM_COEFFICIENTS,
+        // The constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS.
+        // This factor, although somewhat arbitrary, is based on extensive characterization studies
+        // and is considered a safe conservative factor.
+        c / (2.0 * k),
+    );
+    let ratio = c / k;
+    // The constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS.
+    // This factor, although somewhat arbitrary, is based on extensive characterization studies
+    // and is considered a safe conservative factor.
+    let term = 1.0 + (ratio * ratio * ratio / 66.774757);
+    let result = c * factor * term;
+    if result >= c { result } else { c }
+}
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 2be42c1..4f03e5c 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -17,4 +17,5 @@
 
 //! Compressed Probabilistic Counting sketch family.
 
+mod icon_estimator;
 mod pair_table;
diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs
index 9ab6888..8701ac1 100644
--- a/datasketches/src/cpc/pair_table.rs
+++ b/datasketches/src/cpc/pair_table.rs
@@ -24,7 +24,7 @@ const DOWNSIZE_DENOMINATOR: u32 = 4;
 ///
 /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is
 /// optimized for scenarios where the cardinality of entries is low.
-pub(crate) struct PairTable {
+pub(super) struct PairTable {
     /// log2 of number of slots
     lg_size: u8,
     num_valid_bits: u8,

From 6219eed9786ff7496b768aad27855c0df1f16fb0 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 13:44:19 +0800
Subject: [PATCH 04/21] lookup tables

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/common/inv_pow2_table.rs | 103 ++++++++++++++++++++++
 datasketches/src/common/mod.rs            |   1 +
 datasketches/src/cpc/kxp_byte_lookup.rs   |  76 ++++++++++++++++
 datasketches/src/cpc/mod.rs               |   1 +
 4 files changed, 181 insertions(+)
 create mode 100644 datasketches/src/common/inv_pow2_table.rs
 create mode 100644 datasketches/src/cpc/kxp_byte_lookup.rs

diff --git a/datasketches/src/common/inv_pow2_table.rs b/datasketches/src/common/inv_pow2_table.rs
new file mode 100644
index 0000000..580a2fc
--- /dev/null
+++ b/datasketches/src/common/inv_pow2_table.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[rustfmt::skip]
+#[allow(clippy::excessive_precision)]
+pub(crate) const INVERSE_POWERS_OF_2: [f64; 256] = [
+    1.0, 0.5, 0.25, 0.125,
+    0.0625, 0.03125, 0.015625, 0.0078125,
+    0.00390625, 0.001953125, 0.0009765625, 0.00048828125,
+    0.000244140625, 0.0001220703125, 6.103515625e-05, 3.0517578125e-05,
+    1.52587890625e-05, 7.62939453125e-06, 3.814697265625e-06, 1.9073486328125e-06,
+    9.5367431640625e-07, 4.76837158203125e-07, 2.384185791015625e-07, 1.1920928955078125e-07,
+    5.9604644775390625e-08, 2.9802322387695312e-08, 1.4901161193847656e-08, 7.4505805969238281e-09,
+    3.7252902984619141e-09, 1.862645149230957e-09, 9.3132257461547852e-10, 4.6566128730773926e-10,
+    2.3283064365386963e-10, 1.1641532182693481e-10, 5.8207660913467407e-11, 2.9103830456733704e-11,
+    1.4551915228366852e-11, 7.2759576141834259e-12, 3.637978807091713e-12, 1.8189894035458565e-12,
+    9.0949470177292824e-13, 4.5474735088646412e-13, 2.2737367544323206e-13, 1.1368683772161603e-13,
+    5.6843418860808015e-14, 2.8421709430404007e-14, 1.4210854715202004e-14, 7.1054273576010019e-15,
+    3.5527136788005009e-15, 1.7763568394002505e-15, 8.8817841970012523e-16, 4.4408920985006262e-16,
+    2.2204460492503131e-16, 1.1102230246251565e-16, 5.5511151231257827e-17, 2.7755575615628914e-17,
+    1.3877787807814457e-17, 6.9388939039072284e-18, 3.4694469519536142e-18, 1.7347234759768071e-18,
+    8.6736173798840355e-19, 4.3368086899420177e-19, 2.1684043449710089e-19, 1.0842021724855044e-19,
+    5.4210108624275222e-20, 2.7105054312137611e-20, 1.3552527156068805e-20, 6.7762635780344027e-21,
+    3.3881317890172014e-21, 1.6940658945086007e-21, 8.4703294725430034e-22, 4.2351647362715017e-22,
+    2.1175823681357508e-22, 1.0587911840678754e-22, 5.2939559203393771e-23, 2.6469779601696886e-23,
+    1.3234889800848443e-23, 6.6174449004242214e-24, 3.3087224502121107e-24, 1.6543612251060553e-24,
+    8.2718061255302767e-25, 4.1359030627651384e-25, 2.0679515313825692e-25, 1.0339757656912846e-25,
+    5.169878828456423e-26, 2.5849394142282115e-26, 1.2924697071141057e-26, 6.4623485355705287e-27,
+    3.2311742677852644e-27, 1.6155871338926322e-27, 8.0779356694631609e-28, 4.0389678347315804e-28,
+    2.0194839173657902e-28, 1.0097419586828951e-28, 5.0487097934144756e-29, 2.5243548967072378e-29,
+    1.2621774483536189e-29, 6.3108872417680944e-30, 3.1554436208840472e-30, 1.5777218104420236e-30,
+    7.8886090522101181e-31, 3.944304526105059e-31, 1.9721522630525295e-31, 9.8607613152626476e-32,
+    4.9303806576313238e-32, 2.4651903288156619e-32, 1.2325951644078309e-32, 6.1629758220391547e-33,
+    3.0814879110195774e-33, 1.5407439555097887e-33, 7.7037197775489434e-34, 3.8518598887744717e-34,
+    1.9259299443872359e-34, 9.6296497219361793e-35, 4.8148248609680896e-35, 2.4074124304840448e-35,
+    1.2037062152420224e-35, 6.018531076210112e-36, 3.009265538105056e-36, 1.504632769052528e-36,
+    7.5231638452626401e-37, 3.76158192263132e-37, 1.88079096131566e-37, 9.4039548065783001e-38,
+    4.70197740328915e-38, 2.350988701644575e-38, 1.1754943508222875e-38, 5.8774717541114375e-39,
+    2.9387358770557188e-39, 1.4693679385278594e-39, 7.3468396926392969e-40, 3.6734198463196485e-40,
+    1.8367099231598242e-40, 9.1835496157991212e-41, 4.5917748078995606e-41, 2.2958874039497803e-41,
+    1.1479437019748901e-41, 5.7397185098744507e-42, 2.8698592549372254e-42, 1.4349296274686127e-42,
+    7.1746481373430634e-43, 3.5873240686715317e-43, 1.7936620343357659e-43, 8.9683101716788293e-44,
+    4.4841550858394146e-44, 2.2420775429197073e-44, 1.1210387714598537e-44, 5.6051938572992683e-45,
+    2.8025969286496341e-45, 1.4012984643248171e-45, 7.0064923216240854e-46, 3.5032461608120427e-46,
+    1.7516230804060213e-46, 8.7581154020301067e-47, 4.3790577010150533e-47, 2.1895288505075267e-47,
+    1.0947644252537633e-47, 5.4738221262688167e-48, 2.7369110631344083e-48, 1.3684555315672042e-48,
+    6.8422776578360209e-49, 3.4211388289180104e-49, 1.7105694144590052e-49, 8.5528470722950261e-50,
+    4.276423536147513e-50, 2.1382117680737565e-50, 1.0691058840368783e-50, 5.3455294201843913e-51,
+    2.6727647100921956e-51, 1.3363823550460978e-51, 6.6819117752304891e-52, 3.3409558876152446e-52,
+    1.6704779438076223e-52, 8.3523897190381114e-53, 4.1761948595190557e-53, 2.0880974297595278e-53,
+    1.0440487148797639e-53, 5.2202435743988196e-54, 2.6101217871994098e-54, 1.3050608935997049e-54,
+    6.5253044679985245e-55, 3.2626522339992623e-55, 1.6313261169996311e-55, 8.1566305849981557e-56,
+    4.0783152924990778e-56, 2.0391576462495389e-56, 1.0195788231247695e-56, 5.0978941156238473e-57,
+    2.5489470578119236e-57, 1.2744735289059618e-57, 6.3723676445298091e-58, 3.1861838222649046e-58,
+    1.5930919111324523e-58, 7.9654595556622614e-59, 3.9827297778311307e-59, 1.9913648889155653e-59,
+    9.9568244445778267e-60, 4.9784122222889134e-60, 2.4892061111444567e-60, 1.2446030555722283e-60,
+    6.2230152778611417e-61, 3.1115076389305709e-61, 1.5557538194652854e-61, 7.7787690973264271e-62,
+    3.8893845486632136e-62, 1.9446922743316068e-62, 9.7234613716580339e-63, 4.861730685829017e-63,
+    2.4308653429145085e-63, 1.2154326714572542e-63, 6.0771633572862712e-64, 3.0385816786431356e-64,
+    1.5192908393215678e-64, 7.596454196607839e-65, 3.7982270983039195e-65, 1.8991135491519597e-65,
+    9.4955677457597987e-66, 4.7477838728798994e-66, 2.3738919364399497e-66, 1.1869459682199748e-66,
+    5.9347298410998742e-67, 2.9673649205499371e-67, 1.4836824602749686e-67, 7.4184123013748428e-68,
+    3.7092061506874214e-68, 1.8546030753437107e-68, 9.2730153767185535e-69, 4.6365076883592767e-69,
+    2.3182538441796384e-69, 1.1591269220898192e-69, 5.7956346104490959e-70, 2.897817305224548e-70,
+    1.448908652612274e-70, 7.2445432630613699e-71, 3.6222716315306849e-71, 1.8111358157653425e-71,
+    9.0556790788267124e-72, 4.5278395394133562e-72, 2.2639197697066781e-72, 1.131959884853339e-72,
+    5.6597994242666952e-73, 2.8298997121333476e-73, 1.4149498560666738e-73, 7.074749280333369e-74,
+    3.5373746401666845e-74, 1.7686873200833423e-74, 8.8434366004167113e-75, 4.4217183002083556e-75,
+    2.2108591501041778e-75, 1.1054295750520889e-75, 5.5271478752604446e-76, 2.7635739376302223e-76,
+    1.3817869688151111e-76, 6.9089348440755557e-77, 3.4544674220377779e-77, 1.7272337110188889e-77
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn assert_inv_pow2_table() {
+        for i in 0u8..=255 {
+            // expected value
+            let expected = INVERSE_POWERS_OF_2[i as usize];
+
+            // computed value
+            let computed = 2f64.powf(-(i as f64));
+
+            assert_eq!(expected, computed);
+        }
+    }
+}
diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs
index 017ddc9..36b734f 100644
--- a/datasketches/src/common/mod.rs
+++ b/datasketches/src/common/mod.rs
@@ -25,3 +25,4 @@ pub use self::resize::ResizeFactor;
 
 // private to datasketches crate
 pub(crate) mod binomial_bounds;
+pub(crate) mod inv_pow2_table;
diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs b/datasketches/src/cpc/kxp_byte_lookup.rs
new file mode 100644
index 0000000..96649ea
--- /dev/null
+++ b/datasketches/src/cpc/kxp_byte_lookup.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const KXP_BYTE_TABLE: [f64; 256] = [
+    0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375,
+    0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375,
+    0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375,
+    0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375,
+    0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875,
+    0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875,
+    0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875,
+    0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875,
+    0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125,
+    0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125,
+    0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125,
+    0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125,
+    0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625,
+    0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625,
+    0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625,
+    0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625,
+    0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875,
+    0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875,
+    0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375,
+    0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375,
+    0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625,
+    0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625,
+    0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125,
+    0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125,
+    0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375, 0.921875,
+    0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875, 0.953125, 0.453125,
+    0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125, 0.890625, 0.390625, 0.640625,
+    0.140625, 0.765625, 0.265625, 0.515625, 0.015625, 0.96875, 0.46875, 0.71875, 0.21875, 0.84375,
+    0.34375, 0.59375, 0.09375, 0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125,
+    0.03125, 0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625, 0.875, 0.375, 0.625,
+    0.125, 0.75, 0.25, 0.5, 0.0,
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+
+    #[test]
+    fn assert_kxp_byte_table() {
+        for byte in 0u8..=255 {
+            // expected value
+            let expected = KXP_BYTE_TABLE[byte as usize];
+
+            // computed value
+            let mut computed = 0.0;
+            for col in 0..8 {
+                let bit = (byte >> col) & 1;
+                if bit == 0 {
+                    // note the inverted logic
+                    computed += INVERSE_POWERS_OF_2[col + 1]; //note the "+1"
+                }
+            }
+
+            assert_eq!(expected, computed);
+        }
+    }
+}
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 4f03e5c..0e6094f 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -18,4 +18,5 @@
 //! Compressed Probabilistic Counting sketch family.
 
 mod icon_estimator;
+mod kxp_byte_lookup;
 mod pair_table;

From 2ae4095c545c89da075c9fecf961d043781c64b2 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 15:22:12 +0800
Subject: [PATCH 05/21] cpc_confidence

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/common/num_std_dev.rs |   5 +
 datasketches/src/cpc/cpc_confidence.rs | 188 +++++++++++++++++++++++++
 datasketches/src/cpc/icon_estimator.rs |   6 +-
 datasketches/src/cpc/mod.rs            |   1 +
 4 files changed, 197 insertions(+), 3 deletions(-)
 create mode 100644 datasketches/src/cpc/cpc_confidence.rs

diff --git a/datasketches/src/common/num_std_dev.rs b/datasketches/src/common/num_std_dev.rs
index 5cae235..92cbddd 100644
--- a/datasketches/src/common/num_std_dev.rs
+++ b/datasketches/src/common/num_std_dev.rs
@@ -50,4 +50,9 @@ impl NumStdDev {
     pub const fn tail_probability(&self) -> f64 {
         DELTA_OF_NUM_STD_DEVS[*self as usize]
     }
+
+    /// Returns the number of standard deviations as an `u8`.
+    pub const fn as_u8(&self) -> u8 {
+        *self as u8
+    }
 }
diff --git a/datasketches/src/cpc/cpc_confidence.rs b/datasketches/src/cpc/cpc_confidence.rs
new file mode 100644
index 0000000..21863ef
--- /dev/null
+++ b/datasketches/src/cpc/cpc_confidence.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::f64::consts::LN_2;
+
+use crate::common::NumStdDev;
+use crate::cpc::icon_estimator::icon_estimate;
+
+const ICON_ERROR_CONSTANT: f64 = LN_2;
+
+const ICON_LOW_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    6037, 5720, 5328, // 4 1000000
+    6411, 6262, 5682, // 5 1000000
+    6724, 6403, 6127, // 6 1000000
+    6665, 6411, 6208, // 7 1000000
+    6959, 6525, 6427, // 8 1000000
+    6892, 6665, 6619, // 9 1000000
+    6792, 6752, 6690, // 10 1000000
+    6899, 6818, 6708, // 11 1000000
+    6871, 6845, 6812, // 12 1046369
+    6909, 6861, 6828, // 13 1043411
+    6919, 6897, 6842, // 14 1000297
+];
+
+const ICON_HIGH_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    8031, 8559, 9309, // 4 1000000
+    7084, 7959, 8660, // 5 1000000
+    7141, 7514, 7876, // 6 1000000
+    7458, 7430, 7572, // 7 1000000
+    6892, 7141, 7497, // 8 1000000
+    6889, 7132, 7290, // 9 1000000
+    7075, 7118, 7185, // 10 1000000
+    7040, 7047, 7085, // 11 1000000
+    6993, 7019, 7053, // 12 1046369
+    6953, 7001, 6983, // 13 1043411
+    6944, 6966, 7004, // 14 1000297
+];
+
+#[allow(clippy::excessive_precision)]
+const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt()
+
+const HIP_LOW_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    5871, 5247, 4826, // 4 1000000
+    5877, 5403, 5070, // 5 1000000
+    5873, 5533, 5304, // 6 1000000
+    5878, 5632, 5464, // 7 1000000
+    5874, 5690, 5564, // 8 1000000
+    5880, 5745, 5619, // 9 1000000
+    5875, 5784, 5701, // 10 1000000
+    5866, 5789, 5742, // 11 1000000
+    5869, 5827, 5784, // 12 1046369
+    5876, 5860, 5827, // 13 1043411
+    5881, 5853, 5842, // 14 1000297
+];
+
+const HIP_HIGH_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    5855, 6688, 7391, // 4 1000000
+    5886, 6444, 6923, // 5 1000000
+    5885, 6254, 6594, // 6 1000000
+    5889, 6134, 6326, // 7 1000000
+    5900, 6072, 6203, // 8 1000000
+    5875, 6005, 6089, // 9 1000000
+    5871, 5980, 6040, // 10 1000000
+    5889, 5941, 6015, // 11 1000000
+    5871, 5926, 5973, // 12 1046369
+    5866, 5901, 5915, // 13 1043411
+    5880, 5914, 5953, // 14 1000297
+];
+
+pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = ICON_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let est = icon_estimate(lg_k, num_coupons);
+    let result = est / (1.0 + eps);
+    if result < (num_coupons as f64) {
+        num_coupons as f64
+    } else {
+        result
+    }
+}
+
+pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = ICON_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let est = icon_estimate(lg_k, num_coupons);
+    let result = est / (1.0 - eps);
+    result.ceil() // slight widening of interval to be conservative
+}
+
+// merge_flag must already be checked as false
+pub(super) fn hip_confidence_lb(
+    lg_k: u8,
+    num_coupons: u32,
+    hip_estimate: f64,
+    kappa: NumStdDev,
+) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = HIP_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let result = hip_estimate / (1.0 + eps);
+    if result < (num_coupons as f64) {
+        num_coupons as f64
+    } else {
+        result
+    }
+}
+
+// merge_flag must already be checked as false
+pub(super) fn get_hip_confidence_ub(
+    lg_k: u8,
+    num_coupons: u32,
+    hip_estimate: f64,
+    kappa: NumStdDev,
+) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = HIP_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let result = hip_estimate / (1.0 - eps);
+    result.ceil() // widening for coverage
+}
diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/icon_estimator.rs
index 5c91ff3..c817aa4 100644
--- a/datasketches/src/cpc/icon_estimator.rs
+++ b/datasketches/src/cpc/icon_estimator.rs
@@ -179,7 +179,7 @@ const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [
     0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05,
 ];
 
-pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 {
+fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 {
     let end = start + num - 1;
     let mut total = coefficients[end];
     for i in (start..end).rev() {
@@ -189,11 +189,11 @@ pub(super) fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize
     total
 }
 
-pub(super) fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
+fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
     0.7940236163830469 * k * 2f64.powf(c / k)
 }
 
-pub(super) fn compute_icon_estimate(lg_k: u8, c: u32) -> f64 {
+pub(super) fn icon_estimate(lg_k: u8, c: u32) -> f64 {
     let lg_k = lg_k as usize;
     assert!(
         (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 0e6094f..ad0c69f 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -17,6 +17,7 @@
 
 //! Compressed Probabilistic Counting sketch family.
 
+mod cpc_confidence;
 mod icon_estimator;
 mod kxp_byte_lookup;
 mod pair_table;

From d201bfbefeda3f742b71eb3828f31a6f382dd864 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 16:52:58 +0800
Subject: [PATCH 06/21] cpc sketch impl

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/cpc_confidence.rs        | 188 ---------------
 .../cpc/{icon_estimator.rs => estimator.rs}   | 228 ++++++++++++++++--
 datasketches/src/cpc/mod.rs                   |   6 +-
 datasketches/src/cpc/pair_table.rs            |   1 +
 datasketches/src/cpc/sketch.rs                | 115 +++++++++
 5 files changed, 333 insertions(+), 205 deletions(-)
 delete mode 100644 datasketches/src/cpc/cpc_confidence.rs
 rename datasketches/src/cpc/{icon_estimator.rs => estimator.rs} (68%)
 create mode 100644 datasketches/src/cpc/sketch.rs

diff --git a/datasketches/src/cpc/cpc_confidence.rs b/datasketches/src/cpc/cpc_confidence.rs
deleted file mode 100644
index 21863ef..0000000
--- a/datasketches/src/cpc/cpc_confidence.rs
+++ /dev/null
@@ -1,188 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::f64::consts::LN_2;
-
-use crate::common::NumStdDev;
-use crate::cpc::icon_estimator::icon_estimate;
-
-const ICON_ERROR_CONSTANT: f64 = LN_2;
-
-const ICON_LOW_SIDE_DATA: [u16; 33] = [
-    //1,    2,    3,   kappa
-    //                 lgK num trials
-    6037, 5720, 5328, // 4 1000000
-    6411, 6262, 5682, // 5 1000000
-    6724, 6403, 6127, // 6 1000000
-    6665, 6411, 6208, // 7 1000000
-    6959, 6525, 6427, // 8 1000000
-    6892, 6665, 6619, // 9 1000000
-    6792, 6752, 6690, // 10 1000000
-    6899, 6818, 6708, // 11 1000000
-    6871, 6845, 6812, // 12 1046369
-    6909, 6861, 6828, // 13 1043411
-    6919, 6897, 6842, // 14 1000297
-];
-
-const ICON_HIGH_SIDE_DATA: [u16; 33] = [
-    //1,    2,    3,   kappa
-    //                 lgK num trials
-    8031, 8559, 9309, // 4 1000000
-    7084, 7959, 8660, // 5 1000000
-    7141, 7514, 7876, // 6 1000000
-    7458, 7430, 7572, // 7 1000000
-    6892, 7141, 7497, // 8 1000000
-    6889, 7132, 7290, // 9 1000000
-    7075, 7118, 7185, // 10 1000000
-    7040, 7047, 7085, // 11 1000000
-    6993, 7019, 7053, // 12 1046369
-    6953, 7001, 6983, // 13 1043411
-    6944, 6966, 7004, // 14 1000297
-];
-
-#[allow(clippy::excessive_precision)]
-const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt()
-
-const HIP_LOW_SIDE_DATA: [u16; 33] = [
-    //1,    2,    3,   kappa
-    //                 lgK num trials
-    5871, 5247, 4826, // 4 1000000
-    5877, 5403, 5070, // 5 1000000
-    5873, 5533, 5304, // 6 1000000
-    5878, 5632, 5464, // 7 1000000
-    5874, 5690, 5564, // 8 1000000
-    5880, 5745, 5619, // 9 1000000
-    5875, 5784, 5701, // 10 1000000
-    5866, 5789, 5742, // 11 1000000
-    5869, 5827, 5784, // 12 1046369
-    5876, 5860, 5827, // 13 1043411
-    5881, 5853, 5842, // 14 1000297
-];
-
-const HIP_HIGH_SIDE_DATA: [u16; 33] = [
-    //1,    2,    3,   kappa
-    //                 lgK num trials
-    5855, 6688, 7391, // 4 1000000
-    5886, 6444, 6923, // 5 1000000
-    5885, 6254, 6594, // 6 1000000
-    5889, 6134, 6326, // 7 1000000
-    5900, 6072, 6203, // 8 1000000
-    5875, 6005, 6089, // 9 1000000
-    5871, 5980, 6040, // 10 1000000
-    5889, 5941, 6015, // 11 1000000
-    5871, 5926, 5973, // 12 1046369
-    5866, 5901, 5915, // 13 1043411
-    5880, 5914, 5953, // 14 1000297
-];
-
-pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
-    if num_coupons == 0 {
-        return 0.0;
-    }
-
-    let k = (1 << lg_k) as f64;
-    let kappa = kappa.as_u8();
-
-    let mut x = ICON_ERROR_CONSTANT;
-    if lg_k <= 14 {
-        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
-        x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
-    }
-    let rel = x / k.sqrt();
-    let eps = (kappa as f64) * rel;
-    let est = icon_estimate(lg_k, num_coupons);
-    let result = est / (1.0 + eps);
-    if result < (num_coupons as f64) {
-        num_coupons as f64
-    } else {
-        result
-    }
-}
-
-pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
-    if num_coupons == 0 {
-        return 0.0;
-    }
-
-    let k = (1 << lg_k) as f64;
-    let kappa = kappa.as_u8();
-
-    let mut x = ICON_ERROR_CONSTANT;
-    if lg_k <= 14 {
-        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
-        x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0;
-    }
-    let rel = x / k.sqrt();
-    let eps = (kappa as f64) * rel;
-    let est = icon_estimate(lg_k, num_coupons);
-    let result = est / (1.0 - eps);
-    result.ceil() // slight widening of interval to be conservative
-}
-
-// merge_flag must already be checked as false
-pub(super) fn hip_confidence_lb(
-    lg_k: u8,
-    num_coupons: u32,
-    hip_estimate: f64,
-    kappa: NumStdDev,
-) -> f64 {
-    if num_coupons == 0 {
-        return 0.0;
-    }
-
-    let k = (1 << lg_k) as f64;
-    let kappa = kappa.as_u8();
-
-    let mut x = HIP_ERROR_CONSTANT;
-    if lg_k <= 14 {
-        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
-        x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
-    }
-    let rel = x / k.sqrt();
-    let eps = (kappa as f64) * rel;
-    let result = hip_estimate / (1.0 + eps);
-    if result < (num_coupons as f64) {
-        num_coupons as f64
-    } else {
-        result
-    }
-}
-
-// merge_flag must already be checked as false
-pub(super) fn get_hip_confidence_ub(
-    lg_k: u8,
-    num_coupons: u32,
-    hip_estimate: f64,
-    kappa: NumStdDev,
-) -> f64 {
-    if num_coupons == 0 {
-        return 0.0;
-    }
-
-    let k = (1 << lg_k) as f64;
-    let kappa = kappa.as_u8();
-
-    let mut x = HIP_ERROR_CONSTANT;
-    if lg_k <= 14 {
-        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
-        x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0;
-    }
-    let rel = x / k.sqrt();
-    let eps = (kappa as f64) * rel;
-    let result = hip_estimate / (1.0 - eps);
-    result.ceil() // widening for coverage
-}
diff --git a/datasketches/src/cpc/icon_estimator.rs b/datasketches/src/cpc/estimator.rs
similarity index 68%
rename from datasketches/src/cpc/icon_estimator.rs
rename to datasketches/src/cpc/estimator.rs
index c817aa4..9b8fb9f 100644
--- a/datasketches/src/cpc/icon_estimator.rs
+++ b/datasketches/src/cpc/estimator.rs
@@ -15,20 +15,218 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! The ICON estimator for FM85 sketches is defined by the arXiv paper.
-//!
-//! The current file provides exact and approximate implementations of this estimator. The exact
-//! version works for any value of K, but is quite slow.
-//!
-//! The much faster approximate version works for K values that are powers of two ranging from 2^4
-//! to 2^32.
-//!
-//! At a high-level, this approximation can be described as using an exponential approximation when
-//! C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial
-//! approximation of a pre-conditioned version of the true ICON mapping from C to N_hat.
-//!
-//! This file also provides a validation procedure that compares its approximate and exact
-//! implementations of the FM85 ICON estimator.
+use std::f64::consts::LN_2;
+
+use crate::common::NumStdDev;
+
+#[derive(Debug, Clone)]
+pub(super) enum Estimator {
+    /// Default to HIP estimator.
+    ///
+    /// HIP estimator is fast and provides better accuracy.
+    Hip { kxp: f64, hip_estimate: f64 },
+    /// ICON estimator.
+    ///
+    /// Used after merges.
+    Icon,
+}
+
+impl Estimator {
+    pub fn estimate(&self, lg_k: u8, num_coupons: u32) -> f64 {
+        match self {
+            Estimator::Hip { hip_estimate, .. } => *hip_estimate,
+            Estimator::Icon => icon_estimate(lg_k, num_coupons),
+        }
+    }
+
+    pub fn lower_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+        match self {
+            Estimator::Hip { hip_estimate, .. } => {
+                hip_confidence_lb(lg_k, num_coupons, *hip_estimate, kappa)
+            }
+            Estimator::Icon => icon_confidence_lb(lg_k, num_coupons, kappa),
+        }
+    }
+
+    pub fn upper_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+        match self {
+            Estimator::Hip { hip_estimate, .. } => {
+                hip_confidence_ub(lg_k, num_coupons, *hip_estimate, kappa)
+            }
+            Estimator::Icon => icon_confidence_ub(lg_k, num_coupons, kappa),
+        }
+    }
+}
+
+const ICON_ERROR_CONSTANT: f64 = LN_2;
+
+const ICON_LOW_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    6037, 5720, 5328, // 4 1000000
+    6411, 6262, 5682, // 5 1000000
+    6724, 6403, 6127, // 6 1000000
+    6665, 6411, 6208, // 7 1000000
+    6959, 6525, 6427, // 8 1000000
+    6892, 6665, 6619, // 9 1000000
+    6792, 6752, 6690, // 10 1000000
+    6899, 6818, 6708, // 11 1000000
+    6871, 6845, 6812, // 12 1046369
+    6909, 6861, 6828, // 13 1043411
+    6919, 6897, 6842, // 14 1000297
+];
+
+const ICON_HIGH_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    8031, 8559, 9309, // 4 1000000
+    7084, 7959, 8660, // 5 1000000
+    7141, 7514, 7876, // 6 1000000
+    7458, 7430, 7572, // 7 1000000
+    6892, 7141, 7497, // 8 1000000
+    6889, 7132, 7290, // 9 1000000
+    7075, 7118, 7185, // 10 1000000
+    7040, 7047, 7085, // 11 1000000
+    6993, 7019, 7053, // 12 1046369
+    6953, 7001, 6983, // 13 1043411
+    6944, 6966, 7004, // 14 1000297
+];
+
+#[allow(clippy::excessive_precision)]
+const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt()
+
+const HIP_LOW_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    5871, 5247, 4826, // 4 1000000
+    5877, 5403, 5070, // 5 1000000
+    5873, 5533, 5304, // 6 1000000
+    5878, 5632, 5464, // 7 1000000
+    5874, 5690, 5564, // 8 1000000
+    5880, 5745, 5619, // 9 1000000
+    5875, 5784, 5701, // 10 1000000
+    5866, 5789, 5742, // 11 1000000
+    5869, 5827, 5784, // 12 1046369
+    5876, 5860, 5827, // 13 1043411
+    5881, 5853, 5842, // 14 1000297
+];
+
+const HIP_HIGH_SIDE_DATA: [u16; 33] = [
+    //1,    2,    3,   kappa
+    //                 lgK num trials
+    5855, 6688, 7391, // 4 1000000
+    5886, 6444, 6923, // 5 1000000
+    5885, 6254, 6594, // 6 1000000
+    5889, 6134, 6326, // 7 1000000
+    5900, 6072, 6203, // 8 1000000
+    5875, 6005, 6089, // 9 1000000
+    5871, 5980, 6040, // 10 1000000
+    5889, 5941, 6015, // 11 1000000
+    5871, 5926, 5973, // 12 1046369
+    5866, 5901, 5915, // 13 1043411
+    5880, 5914, 5953, // 14 1000297
+];
+
+fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = ICON_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let est = icon_estimate(lg_k, num_coupons);
+    let result = est / (1.0 + eps);
+    if result < (num_coupons as f64) {
+        num_coupons as f64
+    } else {
+        result
+    }
+}
+
+fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = ICON_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let est = icon_estimate(lg_k, num_coupons);
+    let result = est / (1.0 - eps);
+    result.ceil() // slight widening of interval to be conservative
+}
+
+fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = HIP_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let result = hip_estimate / (1.0 + eps);
+    if result < (num_coupons as f64) {
+        num_coupons as f64
+    } else {
+        result
+    }
+}
+
+fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 {
+    if num_coupons == 0 {
+        return 0.0;
+    }
+
+    let k = (1 << lg_k) as f64;
+    let kappa = kappa.as_u8();
+
+    let mut x = HIP_ERROR_CONSTANT;
+    if lg_k <= 14 {
+        let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+        x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+    }
+    let rel = x / k.sqrt();
+    let eps = (kappa as f64) * rel;
+    let result = hip_estimate / (1.0 - eps);
+    result.ceil() // widening for coverage
+}
+
+// The ICON estimator for FM85 sketches is defined by the arXiv paper.
+//
+// The current file provides exact and approximate implementations of this estimator. The exact
+// version works for any value of K, but is quite slow.
+//
+// The much faster approximate version works for K values that are powers of two ranging from 2^4
+// to 2^32.
+//
+// At a high-level, this approximation can be described as using an exponential approximation when
+// C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial
+// approximation of a pre-conditioned version of the true ICON mapping from C to N_hat.
+//
+// This file also provides a validation procedure that compares its approximate and exact
+// implementations of the FM85 ICON estimator.
 
 const ICON_MIN_LOG_K: usize = 4;
 const ICON_MAX_LOG_K: usize = 26;
@@ -193,7 +391,7 @@ fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
     0.7940236163830469 * k * 2f64.powf(c / k)
 }
 
-pub(super) fn icon_estimate(lg_k: u8, c: u32) -> f64 {
+fn icon_estimate(lg_k: u8, c: u32) -> f64 {
     let lg_k = lg_k as usize;
     assert!(
         (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index ad0c69f..7d0c626 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -17,7 +17,9 @@
 
 //! Compressed Probabilistic Counting sketch family.
 
-mod cpc_confidence;
-mod icon_estimator;
+mod estimator;
 mod kxp_byte_lookup;
 mod pair_table;
+mod sketch;
+
+pub use self::sketch::CpcSketch;
diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs
index 8701ac1..d4c135b 100644
--- a/datasketches/src/cpc/pair_table.rs
+++ b/datasketches/src/cpc/pair_table.rs
@@ -24,6 +24,7 @@ const DOWNSIZE_DENOMINATOR: u32 = 4;
 ///
 /// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is
 /// optimized for scenarios where the cardinality of entries is low.
+#[derive(Debug, Clone)]
 pub(super) struct PairTable {
     /// log2 of number of slots
     lg_size: u8,
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
new file mode 100644
index 0000000..e6a1dfe
--- /dev/null
+++ b/datasketches/src/cpc/sketch.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::common::NumStdDev;
+use crate::cpc::estimator::Estimator;
+use crate::cpc::pair_table::PairTable;
+use crate::hash::DEFAULT_UPDATE_SEED;
+
+/// Default log2 of K.
+const DEFAULT_LG_K: u8 = 11;
+/// Min log2 of K.
+const MIN_LG_K: usize = 4;
+/// Max log2 of K.
+const MAX_LG_K: usize = 26;
+
+/// A Compressed Probabilistic Counting sketch.
+#[derive(Debug, Clone)]
+pub struct CpcSketch {
+    // immutable config variables
+    lg_k: u8,
+    seed: u64,
+
+    // sketch state
+    /// The number of coupons collected so far.
+    num_coupons: u32,
+    /// This is part of a speed optimization.
+    first_interesting_column: u8,
+    /// Physical storage for the sketch data.
+    storage: PhysicalStorage,
+    /// The current estimator type and associated data.
+    estimator: Estimator,
+}
+
+impl Default for CpcSketch {
+    fn default() -> Self {
+        Self::new(DEFAULT_LG_K)
+    }
+}
+
+impl CpcSketch {
+    /// Creates a new `CpcSketch` with the given `lg_k` and default seed.
+    pub fn new(lg_k: u8) -> Self {
+        Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Creates a new `CpcSketch` with the given `lg_k` and `seed`.
+    pub fn with_seed(lg_k: u8, seed: u64) -> Self {
+        assert!(
+            (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)),
+            "lg_k out of range; got {lg_k}",
+        );
+
+        Self {
+            lg_k,
+            seed,
+            num_coupons: 0,
+            first_interesting_column: 0,
+            storage: PhysicalStorage::Empty,
+            estimator: Estimator::Hip {
+                kxp: (1 << lg_k) as f64,
+                hip_estimate: 0.0,
+            },
+        }
+    }
+
+    /// Return the parameter lg_k.
+    pub fn lg_k(&self) -> u8 {
+        self.lg_k
+    }
+
+    /// Returns the best estimate of the cardinality of the sketch.
+    pub fn estimate(&self) -> f64 {
+        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
+        self.estimator.estimate(lg_k, num_coupons)
+    }
+
+    /// Returns the best estimate of the lower bound of the confidence interval given `kappa`.
+    pub fn lower_bound(&self, kappa: NumStdDev) -> f64 {
+        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
+        self.estimator.lower_bound(lg_k, num_coupons, kappa)
+    }
+
+    /// Returns the best estimate of the upper bound of the confidence interval given `kappa`.
+    pub fn upper_bound(&self, kappa: NumStdDev) -> f64 {
+        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
+        self.estimator.upper_bound(lg_k, num_coupons, kappa)
+    }
+}
+
+#[derive(Debug, Clone)]
+enum PhysicalStorage {
+    /// Empty storage state for EMPTY state.
+    Empty,
+    /// Sparse storage state for SPARSE state.
+    Sparse { surprising_value_table: PairTable },
+    /// Dense storage state for HYBRID/PINNED/SLIDING state.
+    Dense {
+        window_offset: u8,
+        sliding_window: Vec<u8>,
+    },
+}

From 315f4e89d85716da5c9d19a9135446305fa6d17d Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 17:07:39 +0800
Subject: [PATCH 07/21] max_serialized_bytes

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/sketch.rs | 48 ++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index e6a1dfe..31c8d40 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -113,3 +113,51 @@ enum PhysicalStorage {
         sliding_window: Vec<u8>,
     },
 }
+
+impl CpcSketch {
+    /// Returns the estimated maximum compressed serialized size of a sketch.
+    ///
+    /// The actual size of a compressed CPC sketch has a small random variance, but the following
+    /// empirically measured size should be large enough for at least 99.9 percent of sketches.
+    ///
+    /// For small values of `n` the size can be much smaller.
+    pub fn max_serialized_bytes(lg_k: u8) -> usize {
+        let lg_k = lg_k as usize;
+        assert!(
+            (MIN_LG_K..=MAX_LG_K).contains(&lg_k),
+            "lg_k out of range; got {lg_k}",
+        );
+
+        // These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
+        // trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
+        // spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
+        // worst-case space for the preamble, which is added by the function.
+        const CPC_EMPIRICAL_SIZE_MAX_LGK: usize = 19;
+        const CPC_EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [
+            24,     // lg_k = 4
+            36,     // lg_k = 5
+            56,     // lg_k = 6
+            100,    // lg_k = 7
+            180,    // lg_k = 8
+            344,    // lg_k = 9
+            660,    // lg_k = 10
+            1292,   // lg_k = 11
+            2540,   // lg_k = 12
+            5020,   // lg_k = 13
+            9968,   // lg_k = 14
+            19836,  // lg_k = 15
+            39532,  // lg_k = 16
+            78880,  // lg_k = 17
+            157516, // lg_k = 18
+            314656, // lg_k = 19
+        ];
+        const CPC_EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0
+        const CPC_MAX_PREAMBLE_SIZE_BYTES: usize = 40;
+
+        if lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK {
+            return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
+        }
+        let k = 1usize << lg_k;
+        ((CPC_EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + CPC_MAX_PREAMBLE_SIZE_BYTES
+    }
+}

From 0234127a857e875d747d4d67fb68dea8cfde283f Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 17:26:10 +0800
Subject: [PATCH 08/21] update structure

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/common/mod.rs   | 13 +++++
 datasketches/src/cpc/sketch.rs   | 85 ++++++++++++++++++++++++++------
 datasketches/src/theta/sketch.rs | 22 +++------
 3 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs
index 36b734f..4a1db92 100644
--- a/datasketches/src/common/mod.rs
+++ b/datasketches/src/common/mod.rs
@@ -26,3 +26,16 @@ pub use self::resize::ResizeFactor;
 // private to datasketches crate
 pub(crate) mod binomial_bounds;
 pub(crate) mod inv_pow2_table;
+
+/// Canonicalize double value for compatibility with Java
+pub(crate) fn canonical_double(value: f64) -> i64 {
+    if value.is_nan() {
+        // Java's Double.doubleToLongBits() NaN value
+        0x7ff8000000000000i64
+    } else {
+        // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
+        // which Rust guarantees. Thus, by adding a positive zero we
+        // canonicalize signed zero without any branches in one instruction.
+        (value + 0.0).to_bits() as i64
+    }
+}
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 31c8d40..b6d9ee7 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -15,10 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::hash::Hash;
+
 use crate::common::NumStdDev;
+use crate::common::canonical_double;
 use crate::cpc::estimator::Estimator;
 use crate::cpc::pair_table::PairTable;
 use crate::hash::DEFAULT_UPDATE_SEED;
+use crate::hash::MurmurHash3X64128;
 
 /// Default log2 of K.
 const DEFAULT_LG_K: u8 = 11;
@@ -40,7 +44,7 @@ pub struct CpcSketch {
     /// This is part of a speed optimization.
     first_interesting_column: u8,
     /// Physical storage for the sketch data.
-    storage: PhysicalStorage,
+    state: State,
     /// The current estimator type and associated data.
     estimator: Estimator,
 }
@@ -69,7 +73,7 @@ impl CpcSketch {
             seed,
             num_coupons: 0,
             first_interesting_column: 0,
-            storage: PhysicalStorage::Empty,
+            state: State::Empty,
             estimator: Estimator::Hip {
                 kxp: (1 << lg_k) as f64,
                 hip_estimate: 0.0,
@@ -99,10 +103,62 @@ impl CpcSketch {
         let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
         self.estimator.upper_bound(lg_k, num_coupons, kappa)
     }
+
+    /// Returns true if the sketch is empty.
+    pub fn is_empty(&self) -> bool {
+        self.num_coupons == 0
+    }
+
+    /// Update the sketch with a hashable value.
+    ///
+    /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead.
+    pub fn update<T: Hash>(&mut self, value: T) {
+        let mut hasher = MurmurHash3X64128::with_seed(self.seed);
+        value.hash(&mut hasher);
+        let (h1, h2) = hasher.finish128();
+
+        let k = 1 << self.lg_k;
+        let col = h2.leading_zeros(); // 0 <= col <= 64
+        let col = if col > 63 { 63 } else { col as u8 }; // clip so that 0 <= col <= 63
+        let row = (h1 & (k - 1)) as u32;
+        let mut row_col = (row << 6) | (col as u32);
+        // To avoid the hash table's "empty" value, we change the row of the following pair.
+        // This case is extremely unlikely, but we might as well handle it.
+        if row_col == u32::MAX {
+            row_col ^= 1 << 6;
+        }
+        self.row_col_update(row_col);
+    }
+
+    /// Update the sketch with a f64 value.
+    pub fn update_f64(&mut self, value: f64) {
+        // Canonicalize double for compatibility with Java
+        let canonical = canonical_double(value);
+        self.update(canonical);
+    }
+
+    /// Update the sketch with a f32 value.
+    pub fn update_f32(&mut self, value: f32) {
+        self.update_f64(value as f64);
+    }
+
+    fn row_col_update(&mut self, row_col: u32) {
+        let col = (row_col & 63) as u8;
+        if col < self.first_interesting_column {
+            // important speed optimization
+            return;
+        }
+        // // window size is 0 until sketch is promoted from sparse to windowed
+        // if (sliding_window.size() == 0) {
+        //     update_sparse(row_col);
+        // } else {
+        //     update_windowed(row_col);
+        // }
+    }
 }
 
 #[derive(Debug, Clone)]
-enum PhysicalStorage {
+enum State {
     /// Empty storage state for EMPTY state.
     Empty,
     /// Sparse storage state for SPARSE state.
@@ -128,12 +184,13 @@ impl CpcSketch {
             "lg_k out of range; got {lg_k}",
         );
 
-        // These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
-        // trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
-        // spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
-        // worst-case space for the preamble, which is added by the function.
-        const CPC_EMPIRICAL_SIZE_MAX_LGK: usize = 19;
-        const CPC_EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [
+        // These empirical values for the 99.9th percentile of size in bytes were measured using
+        // 100,000 trials. The value for each trial is the maximum of 5*16=80 measurements
+        // that were equally spaced over values of the quantity C/K between 3.0 and 8.0.
+        // This table does not include the worst-case space for the preamble, which is added
+        // by the function.
+        const EMPIRICAL_SIZE_MAX_LGK: usize = 19;
+        const EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [
             24,     // lg_k = 4
             36,     // lg_k = 5
             56,     // lg_k = 6
@@ -151,13 +208,13 @@ impl CpcSketch {
             157516, // lg_k = 18
             314656, // lg_k = 19
         ];
-        const CPC_EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0
-        const CPC_MAX_PREAMBLE_SIZE_BYTES: usize = 40;
+        const EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0
+        const MAX_PREAMBLE_SIZE_BYTES: usize = 40;
 
-        if lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK {
-            return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
+        if lg_k <= EMPIRICAL_SIZE_MAX_LGK {
+            return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + MAX_PREAMBLE_SIZE_BYTES;
         }
         let k = 1usize << lg_k;
-        ((CPC_EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + CPC_MAX_PREAMBLE_SIZE_BYTES
+        ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES
     }
 }
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index 506b2b6..07110d4 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -25,6 +25,7 @@ use std::hash::Hash;
 use crate::common::NumStdDev;
 use crate::common::ResizeFactor;
 use crate::common::binomial_bounds;
+use crate::common::canonical_double;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::theta::hash_table::DEFAULT_LG_K;
 use crate::theta::hash_table::MAX_LG_K;
@@ -52,7 +53,9 @@ impl ThetaSketch {
         ThetaSketchBuilder::default()
     }
 
-    /// Update the sketch with a hashable value
+    /// Update the sketch with a hashable value.
+    ///
+    /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead.
     ///
     /// # Examples
     ///
@@ -69,7 +72,7 @@ impl ThetaSketch {
         }
     }
 
-    /// Update the sketch with a f64 value
+    /// Update the sketch with a f64 value.
     ///
     /// # Examples
     ///
@@ -85,7 +88,7 @@ impl ThetaSketch {
         self.update(canonical);
     }
 
-    /// Update the sketch with a f32 value
+    /// Update the sketch with a f32 value.
     ///
     /// # Examples
     ///
@@ -356,16 +359,3 @@ impl ThetaSketchBuilder {
         ThetaSketch { table }
     }
 }
-
-/// Canonicalize double value for compatibility with Java
-fn canonical_double(value: f64) -> i64 {
-    if value.is_nan() {
-        // Java's Double.doubleToLongBits() NaN value
-        0x7ff8000000000000i64
-    } else {
-        // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
-        // which Rust guarantees. Thus, by adding a positive zero we
-        // canonicalize signed zero without any branches in one instruction.
-        (value + 0.0).to_bits() as i64
-    }
-}

From 3e09c9c2c5c82ddf024adc9a9f211c07af1ba61d Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 18:40:50 +0800
Subject: [PATCH 09/21] impl update

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/estimator.rs |  70 ++++--------
 datasketches/src/cpc/sketch.rs    | 172 +++++++++++++++++++++++-------
 2 files changed, 155 insertions(+), 87 deletions(-)

diff --git a/datasketches/src/cpc/estimator.rs b/datasketches/src/cpc/estimator.rs
index 9b8fb9f..479e2a3 100644
--- a/datasketches/src/cpc/estimator.rs
+++ b/datasketches/src/cpc/estimator.rs
@@ -15,48 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::f64::consts::LN_2;
-
 use crate::common::NumStdDev;
-
-#[derive(Debug, Clone)]
-pub(super) enum Estimator {
-    /// Default to HIP estimator.
-    ///
-    /// HIP estimator is fast and provides better accuracy.
-    Hip { kxp: f64, hip_estimate: f64 },
-    /// ICON estimator.
-    ///
-    /// Used after merges.
-    Icon,
-}
-
-impl Estimator {
-    pub fn estimate(&self, lg_k: u8, num_coupons: u32) -> f64 {
-        match self {
-            Estimator::Hip { hip_estimate, .. } => *hip_estimate,
-            Estimator::Icon => icon_estimate(lg_k, num_coupons),
-        }
-    }
-
-    pub fn lower_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
-        match self {
-            Estimator::Hip { hip_estimate, .. } => {
-                hip_confidence_lb(lg_k, num_coupons, *hip_estimate, kappa)
-            }
-            Estimator::Icon => icon_confidence_lb(lg_k, num_coupons, kappa),
-        }
-    }
-
-    pub fn upper_bound(&self, lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
-        match self {
-            Estimator::Hip { hip_estimate, .. } => {
-                hip_confidence_ub(lg_k, num_coupons, *hip_estimate, kappa)
-            }
-            Estimator::Icon => icon_confidence_ub(lg_k, num_coupons, kappa),
-        }
-    }
-}
+use std::f64::consts::LN_2;
 
 const ICON_ERROR_CONSTANT: f64 = LN_2;
 
@@ -127,7 +87,7 @@ const HIP_HIGH_SIDE_DATA: [u16; 33] = [
     5880, 5914, 5953, // 14 1000297
 ];
 
-fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
     if num_coupons == 0 {
         return 0.0;
     }
@@ -151,7 +111,7 @@ fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
     }
 }
 
-fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
+pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
     if num_coupons == 0 {
         return 0.0;
     }
@@ -171,7 +131,12 @@ fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
     result.ceil() // slight widening of interval to be conservative
 }
 
-fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 {
+pub(super) fn hip_confidence_lb(
+    lg_k: u8,
+    num_coupons: u32,
+    hip_est_accum: f64,
+    kappa: NumStdDev,
+) -> f64 {
     if num_coupons == 0 {
         return 0.0;
     }
@@ -186,7 +151,7 @@ fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt
     }
     let rel = x / k.sqrt();
     let eps = (kappa as f64) * rel;
-    let result = hip_estimate / (1.0 + eps);
+    let result = hip_est_accum / (1.0 + eps);
     if result < (num_coupons as f64) {
         num_coupons as f64
     } else {
@@ -194,7 +159,12 @@ fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt
     }
 }
 
-fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumStdDev) -> f64 {
+pub(super) fn hip_confidence_ub(
+    lg_k: u8,
+    num_coupons: u32,
+    hip_est_accum: f64,
+    kappa: NumStdDev,
+) -> f64 {
     if num_coupons == 0 {
         return 0.0;
     }
@@ -209,7 +179,7 @@ fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_estimate: f64, kappa: NumSt
     }
     let rel = x / k.sqrt();
     let eps = (kappa as f64) * rel;
-    let result = hip_estimate / (1.0 - eps);
+    let result = hip_est_accum / (1.0 - eps);
     result.ceil() // widening for coverage
 }
 
@@ -391,21 +361,21 @@ fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
     0.7940236163830469 * k * 2f64.powf(c / k)
 }
 
-fn icon_estimate(lg_k: u8, c: u32) -> f64 {
+pub(super) fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 {
     let lg_k = lg_k as usize;
     assert!(
         (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),
         "lg_k out of range; got {lg_k}",
     );
 
-    match c {
+    match num_coupons {
         0 => return 0.0,
         1 => return 1.0,
         _ => {}
     }
 
     let k = (1 << lg_k) as f64;
-    let c = c as f64;
+    let c = num_coupons as f64;
 
     // Differing thresholds ensure that the approximated estimator is monotonically increasing.
     let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 };
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index b6d9ee7..7dbfe27 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -19,7 +19,10 @@ use std::hash::Hash;
 
 use crate::common::NumStdDev;
 use crate::common::canonical_double;
-use crate::cpc::estimator::Estimator;
+use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+use crate::cpc::estimator::{
+    hip_confidence_lb, hip_confidence_ub, icon_confidence_lb, icon_confidence_ub, icon_estimate,
+};
 use crate::cpc::pair_table::PairTable;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
@@ -39,14 +42,28 @@ pub struct CpcSketch {
     seed: u64,
 
     // sketch state
+    /// Part of a speed optimization.
+    first_interesting_column: u8,
     /// The number of coupons collected so far.
     num_coupons: u32,
-    /// This is part of a speed optimization.
-    first_interesting_column: u8,
-    /// Physical storage for the sketch data.
-    state: State,
-    /// The current estimator type and associated data.
-    estimator: Estimator,
+    /// Surprising values table in sparse mode.
+    surprising_value_table: Option<PairTable>,
+    /// Derivable from num_coupons, but made explicit for speed.
+    window_offset: u8,
+    /// Size K bytes in dense mode.
+    sliding_window: Vec<u8>,
+
+    // estimator state
+    /// Whether the sketch is a result of merging.
+    ///
+    /// If `false`, the HIP (Historical Inverse Probability) estimator is used.
+    /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in use.
+    merge_flag: bool,
+    // the following variables are only valid in HIP estimator
+    /// A pre-calculated probability factor (`k * p`) used to compute the increment delta.
+    kxp: f64,
+    /// The accumulated cardinality estimate.
+    hip_est_accum: f64,
 }
 
 impl Default for CpcSketch {
@@ -71,13 +88,14 @@ impl CpcSketch {
         Self {
             lg_k,
             seed,
-            num_coupons: 0,
             first_interesting_column: 0,
-            state: State::Empty,
-            estimator: Estimator::Hip {
-                kxp: (1 << lg_k) as f64,
-                hip_estimate: 0.0,
-            },
+            num_coupons: 0,
+            surprising_value_table: None,
+            window_offset: 0,
+            sliding_window: vec![],
+            merge_flag: false,
+            kxp: (1 << lg_k) as f64,
+            hip_est_accum: 0.0,
         }
     }
 
@@ -88,20 +106,29 @@ impl CpcSketch {
 
     /// Returns the best estimate of the cardinality of the sketch.
     pub fn estimate(&self) -> f64 {
-        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
-        self.estimator.estimate(lg_k, num_coupons)
+        if !self.merge_flag {
+            self.hip_est_accum
+        } else {
+            icon_estimate(self.lg_k, self.num_coupons)
+        }
     }
 
     /// Returns the best estimate of the lower bound of the confidence interval given `kappa`.
     pub fn lower_bound(&self, kappa: NumStdDev) -> f64 {
-        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
-        self.estimator.lower_bound(lg_k, num_coupons, kappa)
+        if !self.merge_flag {
+            hip_confidence_lb(self.lg_k, self.num_coupons, self.hip_est_accum, kappa)
+        } else {
+            icon_confidence_lb(self.lg_k, self.num_coupons, kappa)
+        }
     }
 
     /// Returns the best estimate of the upper bound of the confidence interval given `kappa`.
     pub fn upper_bound(&self, kappa: NumStdDev) -> f64 {
-        let (lg_k, num_coupons) = (self.lg_k, self.num_coupons);
-        self.estimator.upper_bound(lg_k, num_coupons, kappa)
+        if !self.merge_flag {
+            hip_confidence_ub(self.lg_k, self.num_coupons, self.hip_est_accum, kappa)
+        } else {
+            icon_confidence_ub(self.lg_k, self.num_coupons, kappa)
+        }
     }
 
     /// Returns true if the sketch is empty.
@@ -148,26 +175,96 @@ impl CpcSketch {
             // important speed optimization
             return;
         }
-        // // window size is 0 until sketch is promoted from sparse to windowed
-        // if (sliding_window.size() == 0) {
-        //     update_sparse(row_col);
-        // } else {
-        //     update_windowed(row_col);
-        // }
+
+        if self.num_coupons == 0 {
+            // promote EMPTY to SPARSE
+            self.surprising_value_table = Some(PairTable::new(2, 6 + self.lg_k));
+        }
+
+        if self.sliding_window.is_empty() {
+            self.update_sparse(row_col);
+        } else {
+            self.update_windowed(row_col);
+        }
     }
-}
 
-#[derive(Debug, Clone)]
-enum State {
-    /// Empty storage state for EMPTY state.
-    Empty,
-    /// Sparse storage state for SPARSE state.
-    Sparse { surprising_value_table: PairTable },
-    /// Dense storage state for HYBRID/PINNED/SLIDING state.
-    Dense {
-        window_offset: u8,
-        sliding_window: Vec<u8>,
-    },
+    fn mut_surprising_value_table(&mut self) -> &mut PairTable {
+        self.surprising_value_table
+            .as_mut()
+            .expect("surprising value table is not initialized")
+    }
+
+    fn update_hip(&mut self, row_col: u32) {
+        let k = 1 << self.lg_k;
+        let col = (row_col & 63) as usize;
+        let one_over_p = (k as f64) / self.kxp;
+        self.hip_est_accum += one_over_p;
+        self.kxp -= INVERSE_POWERS_OF_2[col + 1] // notice the "+1"
+    }
+
+    fn update_sparse(&mut self, row_col: u32) {
+        let k = 1 << self.lg_k;
+        let c32pre = (self.num_coupons as u64) << 5;
+        assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE
+        let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
+        if is_novel {
+            self.num_coupons += 1;
+            self.update_hip(row_col);
+            let c32post = (self.num_coupons as u64) << 5;
+            if c32post >= 3 * k {
+                self.promote_sparse_to_windowed();
+            }
+        }
+    }
+
+    fn promote_sparse_to_windowed(&mut self) {
+        todo!()
+    }
+
+    fn update_windowed(&mut self, row_col: u32) {
+        assert!(self.window_offset <= 56);
+        let k = 1 << self.lg_k;
+        let c32pre = (self.num_coupons as u64) << 5;
+        assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID
+        let c8pre = (self.num_coupons as u64) << 3;
+        let w8pre = (self.window_offset as u64) << 3;
+        assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset)
+
+        let mut is_novel = false; // novel if new coupon;
+        let col = (row_col & 63) as u8;
+        if col < self.window_offset {
+            // track the surprising 0's "before" the window
+            is_novel = self.mut_surprising_value_table().maybe_delete(row_col); // inverted logic
+        } else if col < self.window_offset + 8 {
+            // track the 8 bits inside the window
+            let row = (row_col >> 6) as usize;
+            let old_bits = self.sliding_window[row];
+            let new_bits = old_bits | (1 << (col - self.window_offset));
+            if old_bits != new_bits {
+                self.sliding_window[row] = new_bits;
+                is_novel = true;
+            }
+        } else {
+            // track the surprising 1's "after" the window
+            is_novel = self.mut_surprising_value_table().maybe_insert(row_col); // normal logic
+        }
+
+        if is_novel {
+            self.num_coupons += 1;
+            self.update_hip(row_col);
+            let c8post = (self.num_coupons as u64) << 3;
+            if c8post >= (27 + w8pre) * k {
+                self.move_window();
+                assert!((1..=56).contains(&self.window_offset));
+                let w8post = (self.window_offset as u64) << 3;
+                assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset)
+            }
+        }
+    }
+
+    fn move_window(&mut self) {
+        todo!()
+    }
 }
 
 impl CpcSketch {
@@ -218,3 +315,4 @@ impl CpcSketch {
         ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES
     }
 }
+

From f914fb8acdd16a5bf65ff9d9ecdeb5e6108862c8 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 19:24:38 +0800
Subject: [PATCH 10/21] promote_sparse_to_windowed

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/estimator.rs |  3 ++-
 datasketches/src/cpc/sketch.rs    | 38 ++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/datasketches/src/cpc/estimator.rs b/datasketches/src/cpc/estimator.rs
index 479e2a3..6d37e6e 100644
--- a/datasketches/src/cpc/estimator.rs
+++ b/datasketches/src/cpc/estimator.rs
@@ -15,9 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::common::NumStdDev;
 use std::f64::consts::LN_2;
 
+use crate::common::NumStdDev;
+
 const ICON_ERROR_CONSTANT: f64 = LN_2;
 
 const ICON_LOW_SIDE_DATA: [u16; 33] = [
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 7dbfe27..6cbe0b9 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -20,9 +20,11 @@ use std::hash::Hash;
 use crate::common::NumStdDev;
 use crate::common::canonical_double;
 use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
-use crate::cpc::estimator::{
-    hip_confidence_lb, hip_confidence_ub, icon_confidence_lb, icon_confidence_ub, icon_estimate,
-};
+use crate::cpc::estimator::hip_confidence_lb;
+use crate::cpc::estimator::hip_confidence_ub;
+use crate::cpc::estimator::icon_confidence_lb;
+use crate::cpc::estimator::icon_confidence_ub;
+use crate::cpc::estimator::icon_estimate;
 use crate::cpc::pair_table::PairTable;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
@@ -191,7 +193,7 @@ impl CpcSketch {
     fn mut_surprising_value_table(&mut self) -> &mut PairTable {
         self.surprising_value_table
             .as_mut()
-            .expect("surprising value table is not initialized")
+            .expect("surprising value table must be initialized")
     }
 
     fn update_hip(&mut self, row_col: u32) {
@@ -218,7 +220,32 @@ impl CpcSketch {
     }
 
     fn promote_sparse_to_windowed(&mut self) {
-        todo!()
+        assert_eq!(self.window_offset, 0);
+
+        let k = 1 << self.lg_k;
+        let c32 = (self.num_coupons as u64) << 5;
+        assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k))));
+
+        self.sliding_window.resize(k as usize, 0);
+
+        let old_table = self
+            .surprising_value_table
+            .replace(PairTable::new(2, 6 + self.lg_k))
+            .expect("surprising value table must be initialized");
+        let old_slots = old_table.slots();
+        for &row_col in old_slots {
+            if row_col != u32::MAX {
+                let col = (row_col & 63) as u8;
+                if col < 8 {
+                    let row = (row_col >> 6) as usize;
+                    self.sliding_window[row] |= 1 << col;
+                } else {
+                    // cannot use must_insert(), because it doesn't provide for growth
+                    let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
+                    assert!(is_novel);
+                }
+            }
+        }
     }
 
     fn update_windowed(&mut self, row_col: u32) {
@@ -315,4 +342,3 @@ impl CpcSketch {
         ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES
     }
 }
-

From 822cc6b108e9876417d3e7763be8a6c13514f322 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 19:50:53 +0800
Subject: [PATCH 11/21] move_window

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/pair_table.rs |  4 --
 datasketches/src/cpc/sketch.rs     | 94 +++++++++++++++++++++++++++++-
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs
index d4c135b..2af1859 100644
--- a/datasketches/src/cpc/pair_table.rs
+++ b/datasketches/src/cpc/pair_table.rs
@@ -71,10 +71,6 @@ impl PairTable {
         table
     }
 
-    pub fn lg_size(&self) -> u8 {
-        self.lg_size
-    }
-
     pub fn num_items(&self) -> u32 {
         self.num_items
     }
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 6cbe0b9..f7fc6eb 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -190,6 +190,12 @@ impl CpcSketch {
         }
     }
 
+    fn surprising_value_table(&self) -> &PairTable {
+        self.surprising_value_table
+            .as_ref()
+            .expect("surprising value table must be initialized")
+    }
+
     fn mut_surprising_value_table(&mut self) -> &mut PairTable {
         self.surprising_value_table
             .as_mut()
@@ -290,7 +296,83 @@ impl CpcSketch {
     }
 
     fn move_window(&mut self) {
-        todo!()
+        let new_offset = self.window_offset + 1;
+        assert!((0..=56).contains(&new_offset));
+        assert_eq!(
+            new_offset,
+            determine_correct_offset(self.lg_k, self.num_coupons)
+        );
+
+        // Construct the full-sized bit matrix that corresponds to the sketch
+        let bit_matrix = self.build_bit_matrix();
+
+        // refresh the KXP register on every 8th window shift.
+        // if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
+
+        self.mut_surprising_value_table().clear(); // the new number of surprises will be about the same
+
+        let mask_for_clearing_window = (0xFF << new_offset) ^ u64::MAX;
+        let mask_for_flipping_early_zone = (1u64 << new_offset) - 1;
+
+        let mut all_surprises_ored = 0u64;
+        for (i, &bit) in bit_matrix.iter().enumerate() {
+            let mut pattern = bit;
+            self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8;
+            pattern &= mask_for_clearing_window;
+            // The following line converts surprising 0's to 1's in the "early zone",
+            // (and vice versa, which is essential for this procedure's O(k) time cost).
+            pattern ^= mask_for_flipping_early_zone;
+            all_surprises_ored |= pattern; // a cheap way to recalculate first_interesting_column
+            while pattern != 0 {
+                let col = pattern.trailing_zeros();
+                pattern ^= 1 << col; // erase the 1
+                let row_col = ((i as u32) << 6) | col;
+                let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
+                assert!(is_novel);
+            }
+        }
+
+        self.window_offset = new_offset;
+        self.first_interesting_column = all_surprises_ored.trailing_zeros() as u8;
+        if self.first_interesting_column > new_offset {
+            self.first_interesting_column = new_offset; // corner case
+        }
+    }
+
+    fn build_bit_matrix(&self) -> Vec<u64> {
+        let k = 1usize << self.lg_k;
+        let offset = self.window_offset;
+        assert!((0..=56).contains(&offset));
+
+        // Fill the matrix with default rows in which the "early zone" is filled with ones.
+        // This is essential for the routine's O(k) time cost (as opposed to O(C)).
+        let default_row = (1u64 << offset) - 1;
+
+        let mut matrix = vec![default_row; k];
+        if self.num_coupons == 0 {
+            return matrix;
+        }
+
+        if !self.sliding_window.is_empty() {
+            // In other words, we are in window mode, not sparse mode
+            for (i, bit) in matrix.iter_mut().enumerate() {
+                // set the window bits, trusting the sketch's current offset
+                *bit |= (self.sliding_window[i] as u64) << offset;
+            }
+        }
+
+        for &row_col in self.surprising_value_table().slots() {
+            if row_col != u32::MAX {
+                let col = (row_col & 63) as u8;
+                let row = (row_col >> 6) as usize;
+                // Flip the specified matrix bit from its default value.
+                // In the "early" zone the bit changes from 1 to 0.
+                // In the "late" zone the bit changes from 0 to 1.
+                matrix[row] ^= 1 << col;
+            }
+        }
+
+        matrix
     }
 }
 
@@ -342,3 +424,13 @@ impl CpcSketch {
         ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES
     }
 }
+
+fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
+    let k = 1 << lg_k;
+    let tmp = ((num_coupons as i64) << 3) - (19 * k as i64);
+    if tmp < 0 {
+        0
+    } else {
+        (tmp >> (lg_k as i64 + 3)) as u8 // tmp / 8K
+    }
+}

From 73b514cf24a6d2b695119eb70ecb75b16ccb1571 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 29 Jan 2026 23:56:20 +0800
Subject: [PATCH 12/21] add tests

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/mod.rs           |  2 +
 datasketches/tests/cpc_update_test.rs | 62 +++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 datasketches/tests/cpc_update_test.rs

diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 7d0c626..33f3c10 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![allow(dead_code)]
+
 //! Compressed Probabilistic Counting sketch family.
 
 mod estimator;
diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs
new file mode 100644
index 0000000..2d6f28b
--- /dev/null
+++ b/datasketches/tests/cpc_update_test.rs
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datasketches::common::NumStdDev;
+use datasketches::cpc::CpcSketch;
+use googletest::assert_that;
+use googletest::prelude::ge;
+use googletest::prelude::le;
+use googletest::prelude::near;
+
+const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02;
+
+#[test]
+fn test_empty() {
+    let sketch = CpcSketch::new(11);
+    assert!(sketch.is_empty());
+    assert_eq!(sketch.estimate(), 0.0);
+    assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0);
+    assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0);
+}
+
+#[test]
+fn test_one_value() {
+    let mut sketch = CpcSketch::new(11);
+    sketch.update(1);
+    assert!(!sketch.is_empty());
+    assert_eq!(sketch.estimate(), 1.0);
+    assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
+    assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+}
+
+#[test]
+fn test_many_values() {
+    const N: usize = 10000;
+    const N_F64: f64 = N as f64;
+
+    let mut sketch = CpcSketch::new(11);
+    for i in 0..N {
+        sketch.update(i);
+    }
+    assert!(!sketch.is_empty());
+    assert_that!(
+        sketch.estimate(),
+        near(N_F64, RELATIVE_ERROR_FOR_LG_K_11 * N_F64)
+    );
+    assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
+    assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+}

From 08b7a613590935a53d41c4eec432ac9a3519b2a3 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 00:41:54 +0800
Subject: [PATCH 13/21] refresh_kxp

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/kxp_byte_lookup.rs |  2 +-
 datasketches/src/cpc/sketch.rs          | 36 ++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs b/datasketches/src/cpc/kxp_byte_lookup.rs
index 96649ea..25fddf6 100644
--- a/datasketches/src/cpc/kxp_byte_lookup.rs
+++ b/datasketches/src/cpc/kxp_byte_lookup.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-const KXP_BYTE_TABLE: [f64; 256] = [
+pub(super) const KXP_BYTE_TABLE: [f64; 256] = [
     0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375,
     0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375,
     0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375,
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index f7fc6eb..caa0503 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -25,6 +25,7 @@ use crate::cpc::estimator::hip_confidence_ub;
 use crate::cpc::estimator::icon_confidence_lb;
 use crate::cpc::estimator::icon_confidence_ub;
 use crate::cpc::estimator::icon_estimate;
+use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE;
 use crate::cpc::pair_table::PairTable;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
@@ -297,7 +298,7 @@ impl CpcSketch {
 
     fn move_window(&mut self) {
         let new_offset = self.window_offset + 1;
-        assert!((0..=56).contains(&new_offset));
+        assert!(new_offset <= 56);
         assert_eq!(
             new_offset,
             determine_correct_offset(self.lg_k, self.num_coupons)
@@ -307,7 +308,9 @@ impl CpcSketch {
         let bit_matrix = self.build_bit_matrix();
 
         // refresh the KXP register on every 8th window shift.
-        // if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
+        if (new_offset & 0x7) == 0 {
+            self.refresh_kxp(&bit_matrix);
+        }
 
         self.mut_surprising_value_table().clear(); // the new number of surprises will be about the same
 
@@ -339,10 +342,37 @@ impl CpcSketch {
         }
     }
 
+    /// The KXP register is a double with roughly 50 bits of precision, but it might need roughly
+    /// 90 bits to track the value with perfect accuracy.
+    ///
+    /// Therefore, we recalculate KXP occasionally from the sketch's full bit_matrix so that it
+    /// will reflect changes that were previously outside the mantissa.
+    fn refresh_kxp(&mut self, bit_matrix: &[u64]) {
+        // for improved numerical accuracy, we separately sum the bytes of the u64's
+        let mut byte_sums = [0.0; 8];
+        for &bit in bit_matrix {
+            let mut word = bit;
+            for sum in byte_sums.iter_mut() {
+                let byte = (word & 0xFF) as usize;
+                *sum += KXP_BYTE_TABLE[byte];
+                word >>= 8;
+            }
+        }
+
+        let mut total = 0.0;
+        for i in (0..8).rev() {
+            // the reverse order is important
+            let factor = INVERSE_POWERS_OF_2[i * 8]; // pow (256.0, (-1.0 * ((double) j)))
+            total += factor * byte_sums[i];
+        }
+
+        self.kxp = total;
+    }
+
     fn build_bit_matrix(&self) -> Vec<u64> {
         let k = 1usize << self.lg_k;
         let offset = self.window_offset;
-        assert!((0..=56).contains(&offset));
+        assert!(offset <= 56);
 
         // Fill the matrix with default rows in which the "early zone" is filled with ones.
         // This is essential for the routine's O(k) time cost (as opposed to O(C)).

From 5b34a747f89b60b789eb91f095be480156479a63 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 08:22:43 +0800
Subject: [PATCH 14/21] update comments

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/sketch.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index caa0503..1b82492 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -49,11 +49,11 @@ pub struct CpcSketch {
     first_interesting_column: u8,
     /// The number of coupons collected so far.
     num_coupons: u32,
-    /// Surprising values table in sparse mode.
+    /// Sparse and surprising values.
     surprising_value_table: Option<PairTable>,
     /// Derivable from num_coupons, but made explicit for speed.
     window_offset: u8,
-    /// Size K bytes in dense mode.
+    /// Size K bytes in dense mode (flavor >= HYBRID).
     sliding_window: Vec<u8>,
 
     // estimator state

From a3ccddca1f2e860905c91c3a90ef16d0e66b160f Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 11:52:46 +0800
Subject: [PATCH 15/21] add validate method

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/mod.rs           |  8 ++++++++
 datasketches/src/cpc/sketch.rs        | 13 +++++++++++++
 datasketches/tests/cpc_update_test.rs |  3 +++
 3 files changed, 24 insertions(+)

diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 33f3c10..14e4168 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -25,3 +25,11 @@ mod pair_table;
 mod sketch;
 
 pub use self::sketch::CpcSketch;
+
+fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 {
+    let mut count = 0;
+    for word in matrix {
+        count += word.count_ones();
+    }
+    count
+}
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 1b82492..c5562fd 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -20,6 +20,7 @@ use std::hash::Hash;
 use crate::common::NumStdDev;
 use crate::common::canonical_double;
 use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+use crate::cpc::count_bits_set_in_matrix;
 use crate::cpc::estimator::hip_confidence_lb;
 use crate::cpc::estimator::hip_confidence_ub;
 use crate::cpc::estimator::icon_confidence_lb;
@@ -455,6 +456,18 @@ impl CpcSketch {
     }
 }
 
+// testing methods
+impl CpcSketch {
+    /// Validate this sketch is valid.
+    ///
+    /// This method is typically used for debugging and testing purposes.
+    pub fn validate(&self) -> bool {
+        let bit_matrix = self.build_bit_matrix();
+        let num_bits_set = count_bits_set_in_matrix(&bit_matrix);
+        num_bits_set == self.num_coupons
+    }
+}
+
 fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
     let k = 1 << lg_k;
     let tmp = ((num_coupons as i64) << 3) - (19 * k as i64);
diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs
index 2d6f28b..fb17573 100644
--- a/datasketches/tests/cpc_update_test.rs
+++ b/datasketches/tests/cpc_update_test.rs
@@ -31,6 +31,7 @@ fn test_empty() {
     assert_eq!(sketch.estimate(), 0.0);
     assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0);
     assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0);
+    assert!(sketch.validate());
 }
 
 #[test]
@@ -41,6 +42,7 @@ fn test_one_value() {
     assert_eq!(sketch.estimate(), 1.0);
     assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
     assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+    assert!(sketch.validate());
 }
 
 #[test]
@@ -59,4 +61,5 @@ fn test_many_values() {
     );
     assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
     assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+    assert!(sketch.validate());
 }

From cfcd0b7797ee576a7a92cbb6cdb6f496e59d0a27 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 17:30:32 +0800
Subject: [PATCH 16/21] impl Union

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/mod.rs    |  45 +++++
 datasketches/src/cpc/sketch.rs |  48 ++---
 datasketches/src/cpc/union.rs  | 348 +++++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+), 27 deletions(-)
 create mode 100644 datasketches/src/cpc/union.rs

diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 14e4168..21879c0 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -23,9 +23,26 @@ mod estimator;
 mod kxp_byte_lookup;
 mod pair_table;
 mod sketch;
+mod union;
 
 pub use self::sketch::CpcSketch;
 
+/// Default log2 of K.
+const DEFAULT_LG_K: u8 = 11;
+/// Min log2 of K.
+const MIN_LG_K: usize = 4;
+/// Max log2 of K.
+const MAX_LG_K: usize = 26;
+
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+enum Flavor {
+    EMPTY,   //    0  == C <    1
+    SPARSE,  //    1  <= C <   3K/32
+    HYBRID,  // 3K/32 <= C <   K/2
+    PINNED,  //   K/2 <= C < 27K/8  [NB: 27/8 = 3 + 3/8]
+    SLIDING, // 27K/8 <= C
+}
+
 fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 {
     let mut count = 0;
     for word in matrix {
@@ -33,3 +50,31 @@ fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 {
     }
     count
 }
+
+fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor {
+    let k = 1 << lg_k;
+    let c2 = num_coupons << 1;
+    let c8 = num_coupons << 3;
+    let c32 = num_coupons << 5;
+    if num_coupons == 0 {
+        Flavor::EMPTY
+    } else if c32 < (3 * k) {
+        Flavor::SPARSE
+    } else if c2 < k {
+        Flavor::HYBRID
+    } else if c8 < (27 * k) {
+        Flavor::PINNED
+    } else {
+        Flavor::SLIDING
+    }
+}
+
+fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
+    let k = 1i64 << lg_k;
+    let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K
+    if tmp < 0 {
+        0
+    } else {
+        (tmp >> (lg_k + 3)) as u8 // tmp / 8K
+    }
+}
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index c5562fd..832db46 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -20,6 +20,7 @@ use std::hash::Hash;
 use crate::common::NumStdDev;
 use crate::common::canonical_double;
 use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+use crate::cpc::MIN_LG_K;
 use crate::cpc::count_bits_set_in_matrix;
 use crate::cpc::estimator::hip_confidence_lb;
 use crate::cpc::estimator::hip_confidence_ub;
@@ -28,16 +29,11 @@ use crate::cpc::estimator::icon_confidence_ub;
 use crate::cpc::estimator::icon_estimate;
 use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE;
 use crate::cpc::pair_table::PairTable;
+use crate::cpc::{DEFAULT_LG_K, determine_correct_offset};
+use crate::cpc::{Flavor, MAX_LG_K, determine_flavor};
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
 
-/// Default log2 of K.
-const DEFAULT_LG_K: u8 = 11;
-/// Min log2 of K.
-const MIN_LG_K: usize = 4;
-/// Max log2 of K.
-const MAX_LG_K: usize = 26;
-
 /// A Compressed Probabilistic Counting sketch.
 #[derive(Debug, Clone)]
 pub struct CpcSketch {
@@ -47,22 +43,22 @@ pub struct CpcSketch {
 
     // sketch state
     /// Part of a speed optimization.
-    first_interesting_column: u8,
+    pub(super) first_interesting_column: u8,
     /// The number of coupons collected so far.
-    num_coupons: u32,
+    pub(super) num_coupons: u32,
     /// Sparse and surprising values.
-    surprising_value_table: Option<PairTable>,
+    pub(super) surprising_value_table: Option<PairTable>,
     /// Derivable from num_coupons, but made explicit for speed.
-    window_offset: u8,
+    pub(super) window_offset: u8,
     /// Size K bytes in dense mode (flavor >= HYBRID).
-    sliding_window: Vec<u8>,
+    pub(super) sliding_window: Vec<u8>,
 
     // estimator state
     /// Whether the sketch is a result of merging.
     ///
     /// If `false`, the HIP (Historical Inverse Probability) estimator is used.
     /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in use.
-    merge_flag: bool,
+    pub(super) merge_flag: bool,
     // the following variables are only valid in HIP estimator
     /// A pre-calculated probability factor (`k * p`) used to compute the increment delta.
     kxp: f64,
@@ -173,7 +169,11 @@ impl CpcSketch {
         self.update_f64(value as f64);
     }
 
-    fn row_col_update(&mut self, row_col: u32) {
+    pub(super) fn flavor(&self) -> Flavor {
+        determine_flavor(self.lg_k, self.num_coupons)
+    }
+
+    pub(super) fn row_col_update(&mut self, row_col: u32) {
         let col = (row_col & 63) as u8;
         if col < self.first_interesting_column {
             // important speed optimization
@@ -192,13 +192,17 @@ impl CpcSketch {
         }
     }
 
-    fn surprising_value_table(&self) -> &PairTable {
+    pub(super) fn seed(&self) -> u64 {
+        self.seed
+    }
+
+    pub(super) fn surprising_value_table(&self) -> &PairTable {
         self.surprising_value_table
             .as_ref()
             .expect("surprising value table must be initialized")
     }
 
-    fn mut_surprising_value_table(&mut self) -> &mut PairTable {
+    pub(super) fn mut_surprising_value_table(&mut self) -> &mut PairTable {
         self.surprising_value_table
             .as_mut()
             .expect("surprising value table must be initialized")
@@ -370,7 +374,7 @@ impl CpcSketch {
         self.kxp = total;
     }
 
-    fn build_bit_matrix(&self) -> Vec<u64> {
+    pub(super) fn build_bit_matrix(&self) -> Vec<u64> {
         let k = 1usize << self.lg_k;
         let offset = self.window_offset;
         assert!(offset <= 56);
@@ -467,13 +471,3 @@ impl CpcSketch {
         num_bits_set == self.num_coupons
     }
 }
-
-fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
-    let k = 1 << lg_k;
-    let tmp = ((num_coupons as i64) << 3) - (19 * k as i64);
-    if tmp < 0 {
-        0
-    } else {
-        (tmp >> (lg_k as i64 + 3)) as u8 // tmp / 8K
-    }
-}
diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs
new file mode 100644
index 0000000..47e2198
--- /dev/null
+++ b/datasketches/src/cpc/union.rs
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cpc::count_bits_set_in_matrix;
+use crate::cpc::pair_table::PairTable;
+use crate::cpc::{CpcSketch, determine_correct_offset};
+use crate::cpc::{DEFAULT_LG_K, Flavor};
+use crate::hash::DEFAULT_UPDATE_SEED;
+
+/// The union (merge) operation for the CPC sketches.
+#[derive(Debug, Clone)]
+pub struct CpcUnion {
+    // immutable config variables
+    lg_k: u8,
+    seed: u64,
+
+    // union state
+    state: UnionState,
+}
+
+impl Default for CpcUnion {
+    fn default() -> Self {
+        Self::new(DEFAULT_LG_K)
+    }
+}
+
+impl CpcUnion {
+    /// Creates a new `CpcUnion` with the given `lg_k` and default seed.
+    pub fn new(lg_k: u8) -> Self {
+        Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Creates a new `CpcUnion` with the given `lg_k` and `seed`.
+    pub fn with_seed(lg_k: u8, seed: u64) -> Self {
+        // We begin with the accumulator holding an EMPTY_MERGED sketch object.
+        let sketch = CpcSketch::with_seed(lg_k, seed);
+        let state = UnionState::Accumulator(sketch);
+        Self { lg_k, seed, state }
+    }
+
+    /// Return the parameter lg_k.
+    ///
+    /// Note that due to merging with source sketches that may have a lower value of LgK, this value
+    /// can be less than what the union object was configured with.
+    pub fn lg_k(&self) -> u8 {
+        self.lg_k
+    }
+
+    /// Returns the result of union operations as a CPC sketch.
+    pub fn get_result(&self) -> CpcSketch {
+        match &self.state {
+            UnionState::Accumulator(sketch) => {
+                if sketch.is_empty() {
+                    CpcSketch::with_seed(self.lg_k, self.seed)
+                } else {
+                    let mut sketch = sketch.clone();
+                    assert_eq!(sketch.flavor(), Flavor::SPARSE);
+                    sketch.merge_flag = true;
+                    sketch
+                }
+            }
+            UnionState::BitMatrix(matrix) => {
+                let lg_k = self.lg_k;
+
+                let mut sketch = CpcSketch::with_seed(lg_k, self.seed);
+                let num_coupons = count_bits_set_in_matrix(matrix);
+                sketch.num_coupons = num_coupons;
+                let offset = determine_correct_offset(lg_k, num_coupons);
+                sketch.window_offset = offset;
+
+                // Build the window and pair table
+                let k = 1 << lg_k;
+                let mut sliding_window = vec![0u8; k];
+
+                // LgSize = K/16; in some cases this will end up being oversized
+                let new_table_lg_size = (lg_k - 4).max(2);
+                let mut table = PairTable::new(new_table_lg_size, 6 + lg_k);
+
+                // the following should work even when the offset is zero
+                let mask_for_clearing_window = (0xFFu64 << offset) ^ u64::MAX;
+                let mask_for_flipping_early_zone = (1u64 << offset) - 1;
+                let mut all_surprises_ored = 0;
+
+                // The snowplow effect was caused by processing the rows in order,
+                // but we have fixed it by using a sufficiently large hash table.
+                for i in 0..k {
+                    let mut pattern = matrix[i];
+                    sliding_window[i] = ((pattern >> offset) & 0xFF) as u8;
+                    pattern &= mask_for_clearing_window;
+                    pattern ^= mask_for_flipping_early_zone; // this flipping converts surprising 0's to 1's
+                    all_surprises_ored |= pattern;
+                    while pattern != 0 {
+                        let col = pattern.trailing_zeros();
+                        pattern = pattern ^ (1u64 << col); // erase the 1
+                        let row_col = ((i as u32) << 6) | col;
+                        let is_novel = table.maybe_insert(row_col);
+                        assert!(is_novel);
+                    }
+                }
+
+                // at this point we could shrink an oversized hash table, but the relative waste isn't very big
+                sketch.first_interesting_column = all_surprises_ored.trailing_zeros() as u8;
+                if sketch.first_interesting_column > offset {
+                    sketch.first_interesting_column = offset; // corner case
+                }
+
+                // HIP-related fields will contain zeros, and that is okay since merge_flag is true
+                sketch.sliding_window = sliding_window;
+                sketch.surprising_value_table = Some(table);
+                sketch.merge_flag = true;
+
+                sketch
+            }
+        }
+    }
+
+    /// Update this union with a CpcSketch.
+    pub fn update(&mut self, sketch: &CpcSketch) {
+        assert_eq!(self.seed, sketch.seed());
+
+        let flavor = sketch.flavor();
+        if flavor == Flavor::EMPTY {
+            return;
+        }
+
+        if sketch.lg_k() < self.lg_k {
+            self.reduce_k(sketch.lg_k());
+        }
+
+        // if source is past SPARSE mode, make sure that union is a bitMatrix.
+        if flavor > Flavor::SPARSE {
+            if let UnionState::Accumulator(old_sketch) = &self.state {
+                // convert the accumulator to a bit matrix
+                let bit_matrix = old_sketch.build_bit_matrix();
+                self.state = UnionState::BitMatrix(bit_matrix);
+            }
+        }
+
+        match &mut self.state {
+            UnionState::Accumulator(old_sketch) => {
+                // [Case A] Sparse, bitMatrix == null, accumulator valid
+                if flavor == Flavor::SPARSE {
+                    let old_flavor = old_sketch.flavor();
+                    if old_flavor != Flavor::SPARSE && old_flavor != Flavor::EMPTY {
+                        unreachable!(
+                            "unexpected old flavor in union accumulator: {:?}",
+                            old_flavor
+                        );
+                    }
+
+                    // The following partially fixes the snowplow problem provided that the K's
+                    // are equal.
+                    if old_flavor == Flavor::EMPTY && self.lg_k == sketch.lg_k() {
+                        *old_sketch = sketch.clone();
+                        return;
+                    }
+
+                    walk_table_updating_sketch(old_sketch, sketch.surprising_value_table());
+                    let final_flavor = old_sketch.flavor();
+
+                    // if the accumulator has graduated beyond sparse, switch to a bit matrix
+                    // representation
+                    if final_flavor > Flavor::SPARSE {
+                        let bit_matrix = old_sketch.build_bit_matrix();
+                        self.state = UnionState::BitMatrix(bit_matrix);
+                    }
+
+                    return;
+                }
+
+                // If flavor is past SPARSE mode, the state must have been converted to bitMatrix.
+                // Otherwise, the EMPTY case was handled at the start of this method, and the
+                // SPARSE case was handled above.
+                unreachable!("unexpected flavor in union accumulator: {:?}", flavor);
+            }
+            UnionState::BitMatrix(old_matrix) => {
+                if flavor == Flavor::SPARSE {
+                    // [Case B] Sparse, bitMatrix valid, accumulator == null
+                    or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table());
+                    return;
+                }
+
+                if matches!(flavor, Flavor::HYBRID | Flavor::PINNED) {
+                    // [Case C] Hybrid, bitMatrix valid, accumulator == null
+                    // [Case C] Pinned, bitMatrix valid, accumulator == null
+                    or_window_into_matrix(
+                        old_matrix,
+                        self.lg_k,
+                        &sketch.sliding_window,
+                        sketch.window_offset,
+                        sketch.lg_k(),
+                    );
+                    or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table());
+                    return;
+                }
+
+                // [Case D] Sliding, bitMatrix valid, accumulator == null
+                // SLIDING mode involves inverted logic, so we cannot just walk the source sketch.
+                // Instead, we convert it to a bitMatrix that can be ORed into the destination.
+                assert_eq!(flavor, Flavor::SLIDING);
+                let src_matrix = sketch.build_bit_matrix();
+                or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix, sketch.lg_k());
+                return;
+            }
+        }
+    }
+
+    fn reduce_k(&mut self, new_lg_k: u8) {
+        match &mut self.state {
+            UnionState::Accumulator(sketch) => {
+                if sketch.is_empty() {
+                    self.lg_k = new_lg_k;
+                    self.state = UnionState::Accumulator(CpcSketch::with_seed(new_lg_k, self.seed));
+                    return;
+                }
+
+                let mut new_sketch = CpcSketch::with_seed(new_lg_k, self.seed);
+                walk_table_updating_sketch(&mut new_sketch, sketch.surprising_value_table());
+
+                let final_new_flavor = new_sketch.flavor();
+                // SV table had to have something in it
+                assert_ne!(final_new_flavor, Flavor::EMPTY);
+                if final_new_flavor == Flavor::SPARSE {
+                    self.lg_k = new_lg_k;
+                    self.state = UnionState::Accumulator(new_sketch);
+                    return;
+                }
+
+                // the new sketch has graduated beyond sparse, so convert to bitMatrix
+                self.lg_k = new_lg_k;
+                self.state = UnionState::BitMatrix(new_sketch.build_bit_matrix());
+            }
+            UnionState::BitMatrix(matrix) => {
+                let new_k = 1 << new_lg_k;
+                let mut new_matrix = vec![0; new_k];
+                or_matrix_into_matrix(&mut new_matrix, new_lg_k, &matrix, self.lg_k);
+                self.lg_k = new_lg_k;
+                self.state = UnionState::BitMatrix(new_matrix);
+            }
+        }
+    }
+}
+
+// testing methods
+impl CpcUnion {
+    /// Returns the number of coupons in the union.
+    ///
+    /// This is primarily for testing and validation purposes.
+    pub fn num_coupons(&self) -> u32 {
+        match &self.state {
+            UnionState::Accumulator(sketch) => sketch.num_coupons,
+            UnionState::BitMatrix(matrix) => count_bits_set_in_matrix(matrix),
+        }
+    }
+}
+
+fn or_window_into_matrix(
+    dst_matrix: &mut [u64],
+    dst_lg_k: u8,
+    src_window: &[u8],
+    src_offset: u8,
+    src_lg_k: u8,
+) {
+    assert!(dst_lg_k <= src_lg_k);
+    let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+    let src_k = 1 << src_lg_k;
+    for src_row in 0..src_k {
+        dst_matrix[src_row & dst_mask] |= (src_window[src_row] as u64) << src_offset;
+    }
+}
+
+fn or_table_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_table: &PairTable) {
+    let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+    let slots = src_table.slots();
+    for &row_col in slots.iter() {
+        if row_col != u32::MAX {
+            let src_row = (row_col >> 6) as usize;
+            let src_col = (row_col & 63) as usize;
+            let dst_row = src_row & dst_mask;
+            dst_matrix[dst_row] |= 1u64 << src_col;
+        }
+    }
+}
+
+fn or_matrix_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_matrix: &[u64], src_lg_k: u8) {
+    assert!(dst_lg_k <= src_lg_k);
+    let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+    let src_k = 1 << src_lg_k;
+    for src_row in 0..src_k {
+        let dst_row = src_row & dst_mask;
+        dst_matrix[dst_row] |= src_matrix[src_row];
+    }
+}
+
+fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) {
+    assert!(sketch.lg_k() <= 26);
+
+    let slots = table.slots();
+    let num_slots = slots.len() as u32;
+
+    // downsamples when dst lgK < src LgK
+    let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63;
+    // Using a golden ratio stride fixes the snowplow effect.
+    let mut stride = (0.6180339887498949025 * (num_slots as f64)) as u32;
+    assert!(stride >= 2);
+    if stride == ((stride >> 1) << 1) {
+        // force the stride to be odd
+        stride += 1;
+    }
+    assert!((stride >= 3) && (stride < num_slots));
+
+    let mut k = 0;
+    for _ in 0..num_slots {
+        k &= num_slots - 1;
+        let row_col = slots[k as usize];
+        if row_col != u32::MAX {
+            sketch.row_col_update(row_col & (dst_mask as u32));
+        }
+        k += stride;
+    }
+}
+
+/// The internal State of the Union operation.
+///
+/// At most one of BitMatrix and accumulator will be non-null at any given moment. Accumulator is a
+/// sketch object that is employed until it graduates out of Sparse mode.
+///
+/// At that point, it is converted into a full-sized bitMatrix, which is mathematically a sketch,
+/// but doesn't maintain any of the "extra" fields of our sketch objects.
+#[derive(Debug, Clone)]
+enum UnionState {
+    Accumulator(CpcSketch),
+    BitMatrix(Vec<u64>),
+}

From acbcc1cbb0baa5d9b3a61f2f34e6705793807080 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 18:07:35 +0800
Subject: [PATCH 17/21] add tests

Signed-off-by: tison <wander4096@gmail.com>
---
 Cargo.toml                           |   2 +
 datasketches/src/cpc/mod.rs          |   4 +-
 datasketches/src/cpc/sketch.rs       |  28 +++++--
 datasketches/src/cpc/union.rs        |  20 +++--
 datasketches/tests/cpc_union_test.rs | 105 +++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 16 deletions(-)
 create mode 100644 datasketches/tests/cpc_union_test.rs

diff --git a/Cargo.toml b/Cargo.toml
index 9e802b6..cc776fe 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,4 +45,6 @@ unused_must_use = "deny"
 
 [workspace.lints.clippy]
 dbg_macro = "deny"
+
 too_many_arguments = "allow"
+needless_range_loop = "allow"
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 21879c0..635c8c2 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -26,6 +26,7 @@ mod sketch;
 mod union;
 
 pub use self::sketch::CpcSketch;
+pub use self::union::CpcUnion;
 
 /// Default log2 of K.
 const DEFAULT_LG_K: u8 = 11;
@@ -35,6 +36,7 @@ const MIN_LG_K: usize = 4;
 const MAX_LG_K: usize = 26;
 
 #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+#[expect(clippy::upper_case_acronyms)]
 enum Flavor {
     EMPTY,   //    0  == C <    1
     SPARSE,  //    1  <= C <   3K/32
@@ -70,7 +72,7 @@ fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor {
 }
 
 fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
-    let k = 1i64 << lg_k;
+    let k = 1 << lg_k;
     let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K
     if tmp < 0 {
         0
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 832db46..43cfdf5 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -20,8 +20,13 @@ use std::hash::Hash;
 use crate::common::NumStdDev;
 use crate::common::canonical_double;
 use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+use crate::cpc::DEFAULT_LG_K;
+use crate::cpc::Flavor;
+use crate::cpc::MAX_LG_K;
 use crate::cpc::MIN_LG_K;
 use crate::cpc::count_bits_set_in_matrix;
+use crate::cpc::determine_correct_offset;
+use crate::cpc::determine_flavor;
 use crate::cpc::estimator::hip_confidence_lb;
 use crate::cpc::estimator::hip_confidence_ub;
 use crate::cpc::estimator::icon_confidence_lb;
@@ -29,8 +34,6 @@ use crate::cpc::estimator::icon_confidence_ub;
 use crate::cpc::estimator::icon_estimate;
 use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE;
 use crate::cpc::pair_table::PairTable;
-use crate::cpc::{DEFAULT_LG_K, determine_correct_offset};
-use crate::cpc::{Flavor, MAX_LG_K, determine_flavor};
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
 
@@ -309,6 +312,8 @@ impl CpcSketch {
             determine_correct_offset(self.lg_k, self.num_coupons)
         );
 
+        let k = 1 << self.lg_k;
+
         // Construct the full-sized bit matrix that corresponds to the sketch
         let bit_matrix = self.build_bit_matrix();
 
@@ -323,8 +328,8 @@ impl CpcSketch {
         let mask_for_flipping_early_zone = (1u64 << new_offset) - 1;
 
         let mut all_surprises_ored = 0u64;
-        for (i, &bit) in bit_matrix.iter().enumerate() {
-            let mut pattern = bit;
+        for i in 0..k {
+            let mut pattern = bit_matrix[i];
             self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8;
             pattern &= mask_for_clearing_window;
             // The following line converts surprising 0's to 1's in the "early zone",
@@ -375,7 +380,7 @@ impl CpcSketch {
     }
 
     pub(super) fn build_bit_matrix(&self) -> Vec<u64> {
-        let k = 1usize << self.lg_k;
+        let k = 1 << self.lg_k;
         let offset = self.window_offset;
         assert!(offset <= 56);
 
@@ -390,9 +395,9 @@ impl CpcSketch {
 
         if !self.sliding_window.is_empty() {
             // In other words, we are in window mode, not sparse mode
-            for (i, bit) in matrix.iter_mut().enumerate() {
+            for i in 0..k {
                 // set the window bits, trusting the sketch's current offset
-                *bit |= (self.sliding_window[i] as u64) << offset;
+                matrix[i] |= (self.sliding_window[i] as u64) << offset;
             }
         }
 
@@ -455,7 +460,7 @@ impl CpcSketch {
         if lg_k <= EMPIRICAL_SIZE_MAX_LGK {
             return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + MAX_PREAMBLE_SIZE_BYTES;
         }
-        let k = 1usize << lg_k;
+        let k = 1 << lg_k;
         ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES
     }
 }
@@ -470,4 +475,11 @@ impl CpcSketch {
         let num_bits_set = count_bits_set_in_matrix(&bit_matrix);
         num_bits_set == self.num_coupons
     }
+
+    /// Returns the number of coupons in the sketch.
+    ///
+    /// This is primarily for testing and validation purposes.
+    pub fn num_coupons(&self) -> u32 {
+        self.num_coupons
+    }
 }
diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs
index 47e2198..95ce433 100644
--- a/datasketches/src/cpc/union.rs
+++ b/datasketches/src/cpc/union.rs
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::cpc::CpcSketch;
+use crate::cpc::DEFAULT_LG_K;
+use crate::cpc::Flavor;
 use crate::cpc::count_bits_set_in_matrix;
+use crate::cpc::determine_correct_offset;
 use crate::cpc::pair_table::PairTable;
-use crate::cpc::{CpcSketch, determine_correct_offset};
-use crate::cpc::{DEFAULT_LG_K, Flavor};
 use crate::hash::DEFAULT_UPDATE_SEED;
 
 /// The union (merge) operation for the CPC sketches.
@@ -105,14 +107,15 @@ impl CpcUnion {
                     all_surprises_ored |= pattern;
                     while pattern != 0 {
                         let col = pattern.trailing_zeros();
-                        pattern = pattern ^ (1u64 << col); // erase the 1
+                        pattern ^= 1u64 << col; // erase the 1
                         let row_col = ((i as u32) << 6) | col;
                         let is_novel = table.maybe_insert(row_col);
                         assert!(is_novel);
                     }
                 }
 
-                // at this point we could shrink an oversized hash table, but the relative waste isn't very big
+                // at this point we could shrink an oversized hash table, but the relative waste
+                // isn't very big
                 sketch.first_interesting_column = all_surprises_ored.trailing_zeros() as u8;
                 if sketch.first_interesting_column > offset {
                     sketch.first_interesting_column = offset; // corner case
@@ -129,6 +132,10 @@ impl CpcUnion {
     }
 
     /// Update this union with a CpcSketch.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the seed of the provided sketch does not match the seed of this union.
     pub fn update(&mut self, sketch: &CpcSketch) {
         assert_eq!(self.seed, sketch.seed());
 
@@ -214,7 +221,6 @@ impl CpcUnion {
                 assert_eq!(flavor, Flavor::SLIDING);
                 let src_matrix = sketch.build_bit_matrix();
                 or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix, sketch.lg_k());
-                return;
             }
         }
     }
@@ -247,7 +253,7 @@ impl CpcUnion {
             UnionState::BitMatrix(matrix) => {
                 let new_k = 1 << new_lg_k;
                 let mut new_matrix = vec![0; new_k];
-                or_matrix_into_matrix(&mut new_matrix, new_lg_k, &matrix, self.lg_k);
+                or_matrix_into_matrix(&mut new_matrix, new_lg_k, matrix, self.lg_k);
                 self.lg_k = new_lg_k;
                 self.state = UnionState::BitMatrix(new_matrix);
             }
@@ -315,7 +321,7 @@ fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) {
     // downsamples when dst lgK < src LgK
     let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63;
     // Using a golden ratio stride fixes the snowplow effect.
-    let mut stride = (0.6180339887498949025 * (num_slots as f64)) as u32;
+    let mut stride = (0.6180339887498949 * (num_slots as f64)) as u32;
     assert!(stride >= 2);
     if stride == ((stride >> 1) << 1) {
         // force the stride to be odd
diff --git a/datasketches/tests/cpc_union_test.rs b/datasketches/tests/cpc_union_test.rs
new file mode 100644
index 0000000..0a2d1dd
--- /dev/null
+++ b/datasketches/tests/cpc_union_test.rs
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datasketches::cpc::CpcSketch;
+use datasketches::cpc::CpcUnion;
+use googletest::assert_that;
+use googletest::prelude::near;
+
+const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02;
+
+#[test]
+fn test_empty() {
+    let union = CpcUnion::new(11);
+    let sketch = union.get_result();
+    assert!(sketch.is_empty());
+    assert_eq!(sketch.estimate(), 0.0);
+}
+
+#[test]
+fn test_two_values() {
+    let mut sketch = CpcSketch::new(11);
+    sketch.update(1);
+    let mut union = CpcUnion::new(11);
+    union.update(&sketch);
+
+    let result = union.get_result();
+    assert!(!result.is_empty());
+    assert_eq!(result.estimate(), 1.0);
+
+    sketch.update(2);
+    union.update(&sketch);
+    let result = union.get_result();
+    assert!(!result.is_empty());
+    assert_that!(
+        sketch.estimate(),
+        near(2.0, RELATIVE_ERROR_FOR_LG_K_11 * 2.0)
+    );
+}
+
+#[test]
+fn test_custom_seed() {
+    let mut sketch = CpcSketch::with_seed(11, 123);
+    sketch.update(1);
+    sketch.update(2);
+    sketch.update(3);
+
+    let mut union = CpcUnion::with_seed(11, 123);
+    union.update(&sketch);
+    let result = union.get_result();
+    assert!(!result.is_empty());
+    assert_that!(
+        result.estimate(),
+        near(3.0, RELATIVE_ERROR_FOR_LG_K_11 * 3.0)
+    );
+}
+
+#[test]
+#[should_panic]
+fn test_custom_seed_mismatch() {
+    let mut sketch = CpcSketch::with_seed(11, 123);
+    sketch.update(1);
+    sketch.update(2);
+    sketch.update(3);
+
+    let mut union = CpcUnion::with_seed(11, 234);
+    union.update(&sketch);
+}
+
+#[test]
+fn test_large_values() {
+    let mut key = 0;
+    let mut sketch = CpcSketch::new(11);
+    let mut union = CpcUnion::new(11);
+    for _ in 0..1000 {
+        let mut tmp = CpcSketch::new(11);
+        for _ in 0..10000 {
+            sketch.update(key);
+            tmp.update(key);
+            key += 1;
+        }
+        union.update(&tmp);
+    }
+    let result = union.get_result();
+    assert!(!result.is_empty());
+    assert_eq!(result.num_coupons(), union.num_coupons());
+    let estimate = sketch.estimate();
+    assert_that!(
+        result.estimate(),
+        near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate)
+    );
+}

From 770ed74d4a7142a4c774f5f8654149c2026b65af Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 19:10:16 +0800
Subject: [PATCH 18/21] less cast

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/common/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs
index 4a1db92..a43373a 100644
--- a/datasketches/src/common/mod.rs
+++ b/datasketches/src/common/mod.rs
@@ -28,14 +28,14 @@ pub(crate) mod binomial_bounds;
 pub(crate) mod inv_pow2_table;
 
 /// Canonicalize double value for compatibility with Java
-pub(crate) fn canonical_double(value: f64) -> i64 {
+pub(crate) fn canonical_double(value: f64) -> u64 {
     if value.is_nan() {
         // Java's Double.doubleToLongBits() NaN value
-        0x7ff8000000000000i64
+        0x7ff8000000000000u64
     } else {
         // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
         // which Rust guarantees. Thus, by adding a positive zero we
         // canonicalize signed zero without any branches in one instruction.
-        (value + 0.0).to_bits() as i64
+        (value + 0.0).to_bits()
     }
 }

From 21ea3f5724884dd6b02ebbc4e183a6e0041e22d8 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Fri, 30 Jan 2026 22:38:21 +0800
Subject: [PATCH 19/21] more tests

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/tests/cpc_union_test.rs  | 64 +++++++++++++++++++++++++++
 datasketches/tests/cpc_update_test.rs |  7 +--
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/datasketches/tests/cpc_union_test.rs b/datasketches/tests/cpc_union_test.rs
index 0a2d1dd..6e8aa32 100644
--- a/datasketches/tests/cpc_union_test.rs
+++ b/datasketches/tests/cpc_union_test.rs
@@ -103,3 +103,67 @@ fn test_large_values() {
         near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate)
     );
 }
+
+#[test]
+fn test_reduce_k_empty() {
+    let mut sketch = CpcSketch::new(11);
+    for i in 0..10000 {
+        sketch.update(i);
+    }
+    let mut union = CpcUnion::new(12);
+    union.update(&sketch);
+    let result = union.get_result();
+    assert_eq!(result.lg_k(), 11);
+    assert_that!(
+        result.estimate(),
+        near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+    );
+}
+
+#[test]
+fn test_reduce_k_sparse() {
+    let mut union = CpcUnion::new(12);
+
+    let mut sketch12 = CpcSketch::new(12);
+    for i in 0..100 {
+        sketch12.update(i);
+    }
+    union.update(&sketch12);
+
+    let mut sketch11 = CpcSketch::new(11);
+    for i in 0..1000 {
+        sketch11.update(i);
+    }
+    union.update(&sketch11);
+
+    let result = union.get_result();
+    assert_eq!(result.lg_k(), 11);
+    assert_that!(
+        result.estimate(),
+        near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+    );
+}
+
+#[test]
+fn test_reduce_k_window() {
+    let mut union = CpcUnion::new(12);
+
+    let mut sketch12 = CpcSketch::new(12);
+    for i in 0..500 {
+        sketch12.update(i);
+    }
+    union.update(&sketch12);
+
+    let mut sketch11 = CpcSketch::new(11);
+    for i in 0..1000 {
+        sketch11.update(i);
+    }
+    union.update(&sketch11);
+
+    let result = union.get_result();
+    assert_eq!(result.lg_k(), 11);
+    assert_that!(
+        result.estimate(),
+        near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+    );
+}
diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs
index fb17573..7b814f7 100644
--- a/datasketches/tests/cpc_update_test.rs
+++ b/datasketches/tests/cpc_update_test.rs
@@ -47,17 +47,14 @@ fn test_one_value() {
 
 #[test]
 fn test_many_values() {
-    const N: usize = 10000;
-    const N_F64: f64 = N as f64;
-
     let mut sketch = CpcSketch::new(11);
-    for i in 0..N {
+    for i in 0..10000 {
         sketch.update(i);
     }
     assert!(!sketch.is_empty());
     assert_that!(
         sketch.estimate(),
-        near(N_F64, RELATIVE_ERROR_FOR_LG_K_11 * N_F64)
+        near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
     );
     assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
     assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));

From aa0fe4ad6c54db77823dc2dc2c29f336004ee7b1 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Sun, 1 Feb 2026 11:01:35 +0800
Subject: [PATCH 20/21] add more docs

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/cpc/mod.rs        | 20 +++++++++-
 datasketches/src/cpc/sketch.rs     | 42 +++++++++++++-------
 datasketches/src/cpc/union.rs      | 61 ++++++++++++++++++++++++++++--
 datasketches/src/hll/sketch.rs     |  2 +-
 datasketches/src/hll/union.rs      |  2 +-
 datasketches/src/tdigest/sketch.rs |  4 +-
 6 files changed, 109 insertions(+), 22 deletions(-)

diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
index 635c8c2..4c9fea2 100644
--- a/datasketches/src/cpc/mod.rs
+++ b/datasketches/src/cpc/mod.rs
@@ -17,7 +17,25 @@
 
 #![allow(dead_code)]
 
-//! Compressed Probabilistic Counting sketch family.
+//! Compressed Probabilistic Counting sketch.
+//!
+//! This is a unique-counting sketch that implements the Compressed Probabilistic Counting (CPC,
+//! a.k.a. FM85) algorithms developed by Kevin Lang in his paper [Back to the Future: an Even More
+//! Nearly Optimal Cardinality Estimation Algorithm](https://arxiv.org/abs/1708.06839).
+//!
+//! This sketch is extremely space-efficient when serialized. In an apples-to-apples empirical
+//! comparison against compressed HyperLogLog sketches, this new algorithm simultaneously wins on
+//! the two dimensions of the space/accuracy tradeoff and produces sketches that are smaller than
+//! the entropy of HLL, so no possible implementation of compressed HLL can match its space
+//! efficiency for a given accuracy. As described in the paper this sketch implements a newly
+//! developed ICON estimator algorithm that survives union operations, another well-known estimator,
+//! the [Historical Inverse Probability (HIP)](https://arxiv.org/abs/1306.3284) estimator does not.
+//!
+//! The update speed performance of this sketch is quite fast and is comparable to the speed of HLL.
+//! The union (merging) capability of this sketch also allows for merging of sketches with different
+//! configurations of K.
+//!
+//! For additional security this sketch can be configured with a user-specified hash seed.
 
 mod estimator;
 mod kxp_byte_lookup;
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index 43cfdf5..6af8bfc 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -38,6 +38,8 @@ use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
 
 /// A Compressed Probabilistic Counting sketch.
+///
+/// See the [module level documentation](super) for more.
 #[derive(Debug, Clone)]
 pub struct CpcSketch {
     // immutable config variables
@@ -77,11 +79,19 @@ impl Default for CpcSketch {
 
 impl CpcSketch {
     /// Creates a new `CpcSketch` with the given `lg_k` and default seed.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_k` is not in the range `[4, 16]`.
     pub fn new(lg_k: u8) -> Self {
         Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
     }
 
     /// Creates a new `CpcSketch` with the given `lg_k` and `seed`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_k` is not in the range `[4, 16]`.
     pub fn with_seed(lg_k: u8, seed: u64) -> Self {
         assert!(
             (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)),
@@ -222,7 +232,7 @@ impl CpcSketch {
     fn update_sparse(&mut self, row_col: u32) {
         let k = 1 << self.lg_k;
         let c32pre = (self.num_coupons as u64) << 5;
-        assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE
+        debug_assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE
         let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
         if is_novel {
             self.num_coupons += 1;
@@ -235,11 +245,11 @@ impl CpcSketch {
     }
 
     fn promote_sparse_to_windowed(&mut self) {
-        assert_eq!(self.window_offset, 0);
+        debug_assert_eq!(self.window_offset, 0);
 
         let k = 1 << self.lg_k;
         let c32 = (self.num_coupons as u64) << 5;
-        assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k))));
+        debug_assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k))));
 
         self.sliding_window.resize(k as usize, 0);
 
@@ -257,20 +267,20 @@ impl CpcSketch {
                 } else {
                     // cannot use must_insert(), because it doesn't provide for growth
                     let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
-                    assert!(is_novel);
+                    debug_assert!(is_novel);
                 }
             }
         }
     }
 
     fn update_windowed(&mut self, row_col: u32) {
-        assert!(self.window_offset <= 56);
+        debug_assert!(self.window_offset <= 56);
         let k = 1 << self.lg_k;
         let c32pre = (self.num_coupons as u64) << 5;
-        assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID
+        debug_assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID
         let c8pre = (self.num_coupons as u64) << 3;
         let w8pre = (self.window_offset as u64) << 3;
-        assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset)
+        debug_assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset)
 
         let mut is_novel = false; // novel if new coupon;
         let col = (row_col & 63) as u8;
@@ -297,17 +307,17 @@ impl CpcSketch {
             let c8post = (self.num_coupons as u64) << 3;
             if c8post >= (27 + w8pre) * k {
                 self.move_window();
-                assert!((1..=56).contains(&self.window_offset));
+                debug_assert!((1..=56).contains(&self.window_offset));
                 let w8post = (self.window_offset as u64) << 3;
-                assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset)
+                debug_assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset)
             }
         }
     }
 
     fn move_window(&mut self) {
         let new_offset = self.window_offset + 1;
-        assert!(new_offset <= 56);
-        assert_eq!(
+        debug_assert!(new_offset <= 56);
+        debug_assert_eq!(
             new_offset,
             determine_correct_offset(self.lg_k, self.num_coupons)
         );
@@ -341,7 +351,7 @@ impl CpcSketch {
                 pattern ^= 1 << col; // erase the 1
                 let row_col = ((i as u32) << 6) | col;
                 let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
-                assert!(is_novel);
+                debug_assert!(is_novel);
             }
         }
 
@@ -382,7 +392,7 @@ impl CpcSketch {
     pub(super) fn build_bit_matrix(&self) -> Vec<u64> {
         let k = 1 << self.lg_k;
         let offset = self.window_offset;
-        assert!(offset <= 56);
+        debug_assert!(offset <= 56);
 
         // Fill the matrix with default rows in which the "early zone" is filled with ones.
         // This is essential for the routine's O(k) time cost (as opposed to O(C)).
@@ -423,6 +433,10 @@ impl CpcSketch {
     /// empirically measured size should be large enough for at least 99.9 percent of sketches.
     ///
     /// For small values of `n` the size can be much smaller.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_k` is not in the range `[4, 26]`.
     pub fn max_serialized_bytes(lg_k: u8) -> usize {
         let lg_k = lg_k as usize;
         assert!(
@@ -469,7 +483,7 @@ impl CpcSketch {
 impl CpcSketch {
     /// Validate this sketch is valid.
     ///
-    /// This method is typically used for debugging and testing purposes.
+    /// This is primarily for testing and validation purposes.
     pub fn validate(&self) -> bool {
         let bit_matrix = self.build_bit_matrix();
         let num_bits_set = count_bits_set_in_matrix(&bit_matrix);
diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs
index 95ce433..1a5255f 100644
--- a/datasketches/src/cpc/union.rs
+++ b/datasketches/src/cpc/union.rs
@@ -15,6 +15,52 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! The merging logic is somewhat involved, so it will be summarized here.
+//!
+//! First, we compare the K values of the union and the source sketch.
+//!
+//! If `source.K < union.K`, we reduce the union's K to match, which requires downsampling the
+//! union's internal sketch.
+//!
+//! Here is how to perform the downsampling:
+//!
+//! If the union contains a bitMatrix, downsample it by row-wise ORing.
+//!
+//! If the union contains a sparse sketch, then create a new empty sketch, and walk the old target
+//! sketch updating the new one (with modulo). At the end, check whether the new target sketch is
+//! still in sparse mode (it might not be, because downsampling densifies the set of collected
+//! coupons). If it is NOT in sparse mode, immediately convert it to a bitMatrix.
+//!
+//! At this point, we have `source.K >= union.K`. (We won't keep mentioning this, but in all the
+//! following the source's row indices are used mod union.K while updating the union's sketch.
+//! That takes care of the situation where `source.K < union.K`.)
+//!
+//! Case A: union is Sparse and source is Sparse. We walk the source sketch updating the union's
+//! sketch. At the end, if the union's sketch is no longer in sparse mode, we convert it to a
+//! bitMatrix.
+//!
+//! Case B: union is bitMatrix and source is Sparse. We walk the source sketch, setting bits in the
+//! bitMatrix.
+//!
+//! In the remaining cases, we have flavor(source) > Sparse, so we immediately convert the union's
+//! sketch to a bitMatrix (even if the union contains very few coupons). Then:
+//!
+//! Case C: union is bitMatrix and source is Hybrid or Pinned. Then we OR the source's sliding
+//! window into the bitMatrix, and walk the source's table, setting bits in the bitMatrix.
+//!
+//! Case D: union is bitMatrix, and source is Sliding. Then we convert the source into a bitMatrix,
+//! and OR it into the union's bitMatrix. (Important note; merely walking the source wouldn't work
+//! because of the partially inverted Logic in the Sliding flavor, where the presence of coupons
+//! is sometimes indicated by the ABSENCE of row_col pairs in the surprises table.)
+//!
+//! How does [`CpcUnion::get_result`] work?
+//!
+//! If the union has an Accumulator state, make a copy of that sketch.
+//!
+//! If the union has a BitMatrix state, then we have to convert the bitMatrix back into a sketch,
+//! which requires doing some extra work to figure out the values of num_coupons, offset,
+//! first_interesting_column, and kxp.
+
 use crate::cpc::CpcSketch;
 use crate::cpc::DEFAULT_LG_K;
 use crate::cpc::Flavor;
@@ -42,11 +88,19 @@ impl Default for CpcUnion {
 
 impl CpcUnion {
     /// Creates a new `CpcUnion` with the given `lg_k` and default seed.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_k` is not in the range `[4, 16]`.
     pub fn new(lg_k: u8) -> Self {
         Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
     }
 
     /// Creates a new `CpcUnion` with the given `lg_k` and `seed`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_k` is not in the range `[4, 16]`.
     pub fn with_seed(lg_k: u8, seed: u64) -> Self {
         // We begin with the accumulator holding an EMPTY_MERGED sketch object.
         let sketch = CpcSketch::with_seed(lg_k, seed);
@@ -56,8 +110,8 @@ impl CpcUnion {
 
     /// Return the parameter lg_k.
     ///
-    /// Note that due to merging with source sketches that may have a lower value of LgK, this value
-    /// can be less than what the union object was configured with.
+    /// Note that due to merging with source sketches that may have a lower value of lg_k, this
+    /// value can be less than what the union object was configured with.
     pub fn lg_k(&self) -> u8 {
         self.lg_k
     }
@@ -121,7 +175,8 @@ impl CpcUnion {
                     sketch.first_interesting_column = offset; // corner case
                 }
 
-                // HIP-related fields will contain zeros, and that is okay since merge_flag is true
+                // HIP-related fields will contain zeros, and that is okay since merge_flag is true,
+                // and thus the HIP estimator will not be used.
                 sketch.sliding_window = sliding_window;
                 sketch.surprising_value_table = Some(table);
                 sketch.merge_flag = true;
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index 8c79625..bc1ce42 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -40,7 +40,7 @@ use crate::hll::serialization::*;
 
 /// A HyperLogLog sketch.
 ///
-/// See the [hll module level documentation](crate::hll) for more.
+/// See the [module level documentation](super) for more.
 #[derive(Debug, Clone, PartialEq)]
 pub struct HllSketch {
     lg_config_k: u8,
diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs
index 5dba0bb..870c8d8 100644
--- a/datasketches/src/hll/union.rs
+++ b/datasketches/src/hll/union.rs
@@ -45,7 +45,7 @@ use crate::hll::pack_coupon;
 /// the union of all input sketches. It automatically handles sketches with
 /// different configurations and modes.
 ///
-/// See the [hll module level documentation](crate::hll) for more.
+/// See the [module level documentation](super) for more.
 #[derive(Debug, Clone)]
 pub struct HllUnion {
     /// Maximum lg_k that this union can handle
diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs
index 91c90c7..651d44e 100644
--- a/datasketches/src/tdigest/sketch.rs
+++ b/datasketches/src/tdigest/sketch.rs
@@ -34,7 +34,7 @@ const DEFAULT_WEIGHT: NonZeroU64 = NonZeroU64::new(1).unwrap();
 
 /// T-Digest sketch for estimating quantiles and ranks.
 ///
-/// See the [tdigest module level documentation](crate::tdigest) for more.
+/// See the [module level documentation](super) for more.
 #[derive(Debug, Clone)]
 pub struct TDigestMut {
     k: u16,
@@ -788,7 +788,7 @@ impl TDigestMut {
 
 /// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks.
 ///
-/// See the [module documentation](super) for more details.
+/// See the [module level documentation](super) for more.
 pub struct TDigest {
     k: u16,
 

From 99e71ffd419bb8c2d34c3909e95dddeb3ea832ce Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Sun, 1 Feb 2026 11:11:25 +0800
Subject: [PATCH 21/21] tidy

Signed-off-by: tison <wander4096@gmail.com>
---
 datasketches/src/error.rs          | 2 +-
 datasketches/src/tdigest/sketch.rs | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/datasketches/src/error.rs b/datasketches/src/error.rs
index dcb0592..7e8b663 100644
--- a/datasketches/src/error.rs
+++ b/datasketches/src/error.rs
@@ -89,7 +89,7 @@ impl Error {
     }
 }
 
-// Convenience constructors for deserialization errors
+// Convenient constructors used within datasketches crate.
 impl Error {
     pub(crate) fn invalid_argument(msg: impl Into<String>) -> Self {
         Self::new(ErrorKind::InvalidArgument, msg)
diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs
index 651d44e..a790892 100644
--- a/datasketches/src/tdigest/sketch.rs
+++ b/datasketches/src/tdigest/sketch.rs
@@ -22,7 +22,6 @@ use std::num::NonZeroU64;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::error::Error;
-use crate::error::ErrorKind;
 use crate::tdigest::serialization::*;
 
 /// The default value of K if one is not specified.
@@ -100,10 +99,9 @@ impl TDigestMut {
     /// ```
     pub fn try_new(k: u16) -> Result<Self, Error> {
         if k < 10 {
-            return Err(Error::new(
-                ErrorKind::InvalidArgument,
-                format!("k must be at least 10, got {k}"),
-            ));
+            return Err(Error::invalid_argument(format!(
+                "k must be at least 10, got {k}"
+            )));
         }
 
         Ok(Self::make(