diff --git a/Cargo.lock b/Cargo.lock index c946f1e..1bbd4f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -81,9 +81,9 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "clap" -version = "4.5.53" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +checksum = "3e34525d5bbbd55da2bb745d34b36121baac88d07619a9a09cfcf4a6c0832785" dependencies = [ "clap_builder", "clap_derive", @@ -91,9 +91,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.53" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" +checksum = "59a20016a20a3da95bef50ec7238dbd09baeef4311dcdd38ec15aba69812fb61" dependencies = [ "anstream", "anstyle", @@ -103,9 +103,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ "heck", "proc-macro2", @@ -115,9 +115,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" [[package]] name = "colorchoice" @@ -143,6 +143,7 @@ version = "0.2.0" dependencies = [ "googletest", "insta", + "rand", ] [[package]] @@ -241,9 +242,9 @@ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "libc" -version = "0.2.178" +version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "linux-raw-sys" @@ -278,20 +279,29 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" -version = "1.0.104" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.42" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" dependencies = [ "proc-macro2", ] @@ -302,6 +312,35 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + [[package]] name = "regex" version = "1.12.2" @@ -364,9 +403,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.112" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -525,3 +564,23 @@ dependencies = [ "clap", "which", ] + +[[package]] +name = "zerocopy" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdea86ddd5568519879b8187e1cf04e24fce28f7fe046ceecbce472ff19a2572" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c15e1b46eff7c6c91195752e0eeed8ef040e391cdece7c25376957d5f15df22" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index d387fbf..cc776fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ datasketches = { path = "datasketches" } clap = { version = "4.5.20", features = ["derive"] } insta = { version = "1.46.1" } googletest = { version = "0.14.2" } +rand = { version = "0.9.2" } which = { version = "8.0.0" } [workspace.lints.rust] @@ -44,4 +45,6 @@ unused_must_use = "deny" [workspace.lints.clippy] dbg_macro = "deny" + too_many_arguments = "allow" +needless_range_loop = "allow" diff --git a/datasketches/Cargo.toml b/datasketches/Cargo.toml index 214288e..88b224c 100644 --- a/datasketches/Cargo.toml +++ b/datasketches/Cargo.toml @@ -36,6 +36,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dev-dependencies] googletest = { workspace = true } +rand = { workspace = true } insta = { workspace = true } [lints] diff --git a/datasketches/src/common/inv_pow2_table.rs b/datasketches/src/common/inv_pow2_table.rs new file mode 100644 index 0000000..580a2fc --- /dev/null +++ b/datasketches/src/common/inv_pow2_table.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[rustfmt::skip] +#[allow(clippy::excessive_precision)] +pub(crate) const INVERSE_POWERS_OF_2: [f64; 256] = [ + 1.0, 0.5, 0.25, 0.125, + 0.0625, 0.03125, 0.015625, 0.0078125, + 0.00390625, 0.001953125, 0.0009765625, 0.00048828125, + 0.000244140625, 0.0001220703125, 6.103515625e-05, 3.0517578125e-05, + 1.52587890625e-05, 7.62939453125e-06, 3.814697265625e-06, 1.9073486328125e-06, + 9.5367431640625e-07, 4.76837158203125e-07, 2.384185791015625e-07, 1.1920928955078125e-07, + 5.9604644775390625e-08, 2.9802322387695312e-08, 1.4901161193847656e-08, 7.4505805969238281e-09, + 3.7252902984619141e-09, 1.862645149230957e-09, 9.3132257461547852e-10, 4.6566128730773926e-10, + 2.3283064365386963e-10, 1.1641532182693481e-10, 5.8207660913467407e-11, 2.9103830456733704e-11, + 1.4551915228366852e-11, 7.2759576141834259e-12, 3.637978807091713e-12, 1.8189894035458565e-12, + 9.0949470177292824e-13, 4.5474735088646412e-13, 2.2737367544323206e-13, 1.1368683772161603e-13, + 5.6843418860808015e-14, 2.8421709430404007e-14, 1.4210854715202004e-14, 7.1054273576010019e-15, + 3.5527136788005009e-15, 1.7763568394002505e-15, 8.8817841970012523e-16, 4.4408920985006262e-16, + 2.2204460492503131e-16, 1.1102230246251565e-16, 5.5511151231257827e-17, 2.7755575615628914e-17, + 1.3877787807814457e-17, 6.9388939039072284e-18, 3.4694469519536142e-18, 1.7347234759768071e-18, + 8.6736173798840355e-19, 4.3368086899420177e-19, 2.1684043449710089e-19, 1.0842021724855044e-19, + 5.4210108624275222e-20, 2.7105054312137611e-20, 1.3552527156068805e-20, 6.7762635780344027e-21, + 3.3881317890172014e-21, 1.6940658945086007e-21, 8.4703294725430034e-22, 4.2351647362715017e-22, + 2.1175823681357508e-22, 1.0587911840678754e-22, 5.2939559203393771e-23, 2.6469779601696886e-23, + 1.3234889800848443e-23, 6.6174449004242214e-24, 3.3087224502121107e-24, 1.6543612251060553e-24, + 8.2718061255302767e-25, 4.1359030627651384e-25, 2.0679515313825692e-25, 1.0339757656912846e-25, + 5.169878828456423e-26, 2.5849394142282115e-26, 1.2924697071141057e-26, 6.4623485355705287e-27, + 3.2311742677852644e-27, 1.6155871338926322e-27, 8.0779356694631609e-28, 4.0389678347315804e-28, + 2.0194839173657902e-28, 1.0097419586828951e-28, 5.0487097934144756e-29, 2.5243548967072378e-29, + 1.2621774483536189e-29, 6.3108872417680944e-30, 3.1554436208840472e-30, 1.5777218104420236e-30, + 7.8886090522101181e-31, 3.944304526105059e-31, 1.9721522630525295e-31, 9.8607613152626476e-32, + 4.9303806576313238e-32, 2.4651903288156619e-32, 1.2325951644078309e-32, 6.1629758220391547e-33, + 3.0814879110195774e-33, 1.5407439555097887e-33, 7.7037197775489434e-34, 3.8518598887744717e-34, + 1.9259299443872359e-34, 9.6296497219361793e-35, 4.8148248609680896e-35, 2.4074124304840448e-35, + 1.2037062152420224e-35, 6.018531076210112e-36, 3.009265538105056e-36, 1.504632769052528e-36, + 7.5231638452626401e-37, 3.76158192263132e-37, 1.88079096131566e-37, 9.4039548065783001e-38, + 4.70197740328915e-38, 2.350988701644575e-38, 1.1754943508222875e-38, 5.8774717541114375e-39, + 2.9387358770557188e-39, 1.4693679385278594e-39, 7.3468396926392969e-40, 3.6734198463196485e-40, + 1.8367099231598242e-40, 9.1835496157991212e-41, 4.5917748078995606e-41, 2.2958874039497803e-41, + 1.1479437019748901e-41, 5.7397185098744507e-42, 2.8698592549372254e-42, 1.4349296274686127e-42, + 7.1746481373430634e-43, 3.5873240686715317e-43, 1.7936620343357659e-43, 8.9683101716788293e-44, + 4.4841550858394146e-44, 2.2420775429197073e-44, 1.1210387714598537e-44, 5.6051938572992683e-45, + 2.8025969286496341e-45, 1.4012984643248171e-45, 7.0064923216240854e-46, 3.5032461608120427e-46, + 1.7516230804060213e-46, 8.7581154020301067e-47, 4.3790577010150533e-47, 2.1895288505075267e-47, + 1.0947644252537633e-47, 5.4738221262688167e-48, 2.7369110631344083e-48, 1.3684555315672042e-48, + 6.8422776578360209e-49, 3.4211388289180104e-49, 1.7105694144590052e-49, 8.5528470722950261e-50, + 4.276423536147513e-50, 2.1382117680737565e-50, 1.0691058840368783e-50, 5.3455294201843913e-51, + 2.6727647100921956e-51, 1.3363823550460978e-51, 6.6819117752304891e-52, 3.3409558876152446e-52, + 1.6704779438076223e-52, 8.3523897190381114e-53, 4.1761948595190557e-53, 2.0880974297595278e-53, + 1.0440487148797639e-53, 5.2202435743988196e-54, 2.6101217871994098e-54, 1.3050608935997049e-54, + 6.5253044679985245e-55, 3.2626522339992623e-55, 1.6313261169996311e-55, 8.1566305849981557e-56, + 4.0783152924990778e-56, 2.0391576462495389e-56, 1.0195788231247695e-56, 5.0978941156238473e-57, + 2.5489470578119236e-57, 1.2744735289059618e-57, 6.3723676445298091e-58, 3.1861838222649046e-58, + 1.5930919111324523e-58, 7.9654595556622614e-59, 3.9827297778311307e-59, 1.9913648889155653e-59, + 9.9568244445778267e-60, 4.9784122222889134e-60, 2.4892061111444567e-60, 1.2446030555722283e-60, + 6.2230152778611417e-61, 3.1115076389305709e-61, 1.5557538194652854e-61, 7.7787690973264271e-62, + 3.8893845486632136e-62, 1.9446922743316068e-62, 9.7234613716580339e-63, 4.861730685829017e-63, + 2.4308653429145085e-63, 1.2154326714572542e-63, 6.0771633572862712e-64, 3.0385816786431356e-64, + 1.5192908393215678e-64, 7.596454196607839e-65, 3.7982270983039195e-65, 1.8991135491519597e-65, + 9.4955677457597987e-66, 4.7477838728798994e-66, 2.3738919364399497e-66, 1.1869459682199748e-66, + 5.9347298410998742e-67, 2.9673649205499371e-67, 1.4836824602749686e-67, 7.4184123013748428e-68, + 3.7092061506874214e-68, 1.8546030753437107e-68, 9.2730153767185535e-69, 4.6365076883592767e-69, + 2.3182538441796384e-69, 1.1591269220898192e-69, 5.7956346104490959e-70, 2.897817305224548e-70, + 1.448908652612274e-70, 7.2445432630613699e-71, 3.6222716315306849e-71, 1.8111358157653425e-71, + 9.0556790788267124e-72, 4.5278395394133562e-72, 2.2639197697066781e-72, 1.131959884853339e-72, + 5.6597994242666952e-73, 2.8298997121333476e-73, 1.4149498560666738e-73, 7.074749280333369e-74, + 3.5373746401666845e-74, 1.7686873200833423e-74, 8.8434366004167113e-75, 4.4217183002083556e-75, + 2.2108591501041778e-75, 1.1054295750520889e-75, 5.5271478752604446e-76, 2.7635739376302223e-76, + 1.3817869688151111e-76, 6.9089348440755557e-77, 3.4544674220377779e-77, 1.7272337110188889e-77 +]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn assert_inv_pow2_table() { + for i in 0u8..=255 { + // expected value + let expected = INVERSE_POWERS_OF_2[i as usize]; + + // computed value + let computed = 2f64.powf(-(i as f64)); + + assert_eq!(expected, computed); + } + } +} diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs index 017ddc9..a43373a 100644 --- a/datasketches/src/common/mod.rs +++ b/datasketches/src/common/mod.rs @@ -25,3 +25,17 @@ pub use self::resize::ResizeFactor; // private to datasketches crate pub(crate) mod binomial_bounds; +pub(crate) mod inv_pow2_table; + +/// Canonicalize double value for compatibility with Java +pub(crate) fn canonical_double(value: f64) -> u64 { + if value.is_nan() { + // Java's Double.doubleToLongBits() NaN value + 0x7ff8000000000000u64 + } else { + // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode, + // which Rust guarantees. Thus, by adding a positive zero we + // canonicalize signed zero without any branches in one instruction. + (value + 0.0).to_bits() + } +} diff --git a/datasketches/src/common/num_std_dev.rs b/datasketches/src/common/num_std_dev.rs index 5cae235..92cbddd 100644 --- a/datasketches/src/common/num_std_dev.rs +++ b/datasketches/src/common/num_std_dev.rs @@ -50,4 +50,9 @@ impl NumStdDev { pub const fn tail_probability(&self) -> f64 { DELTA_OF_NUM_STD_DEVS[*self as usize] } + + /// Returns the number of standard deviations as an `u8`. + pub const fn as_u8(&self) -> u8 { + *self as u8 + } } diff --git a/datasketches/src/cpc/estimator.rs b/datasketches/src/cpc/estimator.rs new file mode 100644 index 0000000..6d37e6e --- /dev/null +++ b/datasketches/src/cpc/estimator.rs @@ -0,0 +1,403 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::f64::consts::LN_2; + +use crate::common::NumStdDev; + +const ICON_ERROR_CONSTANT: f64 = LN_2; + +const ICON_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 6037, 5720, 5328, // 4 1000000 + 6411, 6262, 5682, // 5 1000000 + 6724, 6403, 6127, // 6 1000000 + 6665, 6411, 6208, // 7 1000000 + 6959, 6525, 6427, // 8 1000000 + 6892, 6665, 6619, // 9 1000000 + 6792, 6752, 6690, // 10 1000000 + 6899, 6818, 6708, // 11 1000000 + 6871, 6845, 6812, // 12 1046369 + 6909, 6861, 6828, // 13 1043411 + 6919, 6897, 6842, // 14 1000297 +]; + +const ICON_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 8031, 8559, 9309, // 4 1000000 + 7084, 7959, 8660, // 5 1000000 + 7141, 7514, 7876, // 6 1000000 + 7458, 7430, 7572, // 7 1000000 + 6892, 7141, 7497, // 8 1000000 + 6889, 7132, 7290, // 9 1000000 + 7075, 7118, 7185, // 10 1000000 + 7040, 7047, 7085, // 11 1000000 + 6993, 7019, 7053, // 12 1046369 + 6953, 7001, 6983, // 13 1043411 + 6944, 6966, 7004, // 14 1000297 +]; + +#[allow(clippy::excessive_precision)] +const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt() + +const HIP_LOW_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5871, 5247, 4826, // 4 1000000 + 5877, 5403, 5070, // 5 1000000 + 5873, 5533, 5304, // 6 1000000 + 5878, 5632, 5464, // 7 1000000 + 5874, 5690, 5564, // 8 1000000 + 5880, 5745, 5619, // 9 1000000 + 5875, 5784, 5701, // 10 1000000 + 5866, 5789, 5742, // 11 1000000 + 5869, 5827, 5784, // 12 1046369 + 5876, 5860, 5827, // 13 1043411 + 5881, 5853, 5842, // 14 1000297 +]; + +const HIP_HIGH_SIDE_DATA: [u16; 33] = [ + //1, 2, 3, kappa + // lgK num trials + 5855, 6688, 7391, // 4 1000000 + 5886, 6444, 6923, // 5 1000000 + 5885, 6254, 6594, // 6 1000000 + 5889, 6134, 6326, // 7 1000000 + 5900, 6072, 6203, // 8 1000000 + 5875, 6005, 6089, // 9 1000000 + 5871, 5980, 6040, // 10 1000000 + 5889, 5941, 6015, // 11 1000000 + 5871, 5926, 5973, // 12 1046369 + 5866, 5901, 5915, // 13 1043411 + 5880, 5914, 5953, // 14 1000297 +]; + +pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = ICON_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let est = icon_estimate(lg_k, num_coupons); + let result = est / (1.0 - eps); + result.ceil() // slight widening of interval to be conservative +} + +pub(super) fn hip_confidence_lb( + lg_k: u8, + num_coupons: u32, + hip_est_accum: f64, + kappa: NumStdDev, +) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_est_accum / (1.0 + eps); + if result < (num_coupons as f64) { + num_coupons as f64 + } else { + result + } +} + +pub(super) fn hip_confidence_ub( + lg_k: u8, + num_coupons: u32, + hip_est_accum: f64, + kappa: NumStdDev, +) -> f64 { + if num_coupons == 0 { + return 0.0; + } + + let k = (1 << lg_k) as f64; + let kappa = kappa.as_u8(); + + let mut x = HIP_ERROR_CONSTANT; + if lg_k <= 14 { + let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize; + x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0; + } + let rel = x / k.sqrt(); + let eps = (kappa as f64) * rel; + let result = hip_est_accum / (1.0 - eps); + result.ceil() // widening for coverage +} + +// The ICON estimator for FM85 sketches is defined by the arXiv paper. +// +// The current file provides exact and approximate implementations of this estimator. The exact +// version works for any value of K, but is quite slow. +// +// The much faster approximate version works for K values that are powers of two ranging from 2^4 +// to 2^32. +// +// At a high-level, this approximation can be described as using an exponential approximation when +// C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19 polynomial +// approximation of a pre-conditioned version of the true ICON mapping from C to N_hat. +// +// This file also provides a validation procedure that compares its approximate and exact +// implementations of the FM85 ICON estimator. + +const ICON_MIN_LOG_K: usize = 4; +const ICON_MAX_LOG_K: usize = 26; +const ICON_POLYNOMIAL_DEGREE: usize = 19; +const ICON_POLYNOMIAL_NUM_COEFFICIENTS: usize = 1 + ICON_POLYNOMIAL_DEGREE; +const ICON_TABLE_SIZE: usize = + ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K)); + +#[rustfmt::skip] +#[allow(clippy::excessive_precision)] +const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [ + // log K = 4 + 0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817, + 1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904, + -30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998, + 0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05, + + // log K = 5 + 0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854, + 1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069, + -28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962, + 0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05, + + // log K = 6 + 0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116, + 1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077, + -28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851, + 0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05, + + // log K = 7 + 0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863, + 1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936, + -28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037, + 0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05, + + // log K = 8 + 0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681, + 0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002, + -15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996, + 0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05, + + // log K = 9 + 0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214, + 0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059, + -15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002, + 0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05, + + // log K = 10 + 0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428, + 0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034, + -15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093, + 0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05, + + // log K = 11 + 0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211, + 0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958, + -14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051, + 0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05, + + // log K = 12 + 0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343, + 0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078, + -17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076, + 0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05, + + // log K = 13 + 0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548, + 0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962, + -17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975, + 0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05, + + // log K = 14 + 0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777, + 0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022, + -17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993, + 0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05, + + // log K = 15 + 0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128, + 0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048, + -18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953, + 0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05, + + // log K = 16 + 0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592, + 0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041, + -18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102, + 0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05, + + // log K = 17 + 0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043, + 0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058, + -18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063, + 0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05, + + // log K = 18 + 0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003, + 0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132, + -18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104, + 0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05, + + // log K = 19 + 0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954, + 0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001, + -16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915, + 0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05, + + // log K = 20 + 0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951, + 0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011, + -15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906, + 0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05, + + // log K = 21 + 0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961, + 0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888, + -19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987, + 0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05, + + // log K = 22 + 0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969, + 0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099, + -18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093, + 0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05, + + // log K = 23 + 0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059, + 0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843, + -18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987, + 0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05, + + // log K = 24 + 0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152, + 0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951, + -17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056, + 0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05, + + // log K = 25 + 0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451, + 0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923, + -17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892, + 0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05, + + // log K = 26 + 0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949, + 0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154, + -18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955, + 0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05, +]; + +fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64) -> f64 { + let end = start + num - 1; + let mut total = coefficients[end]; + for i in (start..end).rev() { + total *= x; + total += coefficients[i]; + } + total +} + +fn icon_exponential_approximation(k: f64, c: f64) -> f64 { + 0.7940236163830469 * k * 2f64.powf(c / k) +} + +pub(super) fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 { + let lg_k = lg_k as usize; + assert!( + (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k), + "lg_k out of range; got {lg_k}", + ); + + match num_coupons { + 0 => return 0.0, + 1 => return 1.0, + _ => {} + } + + let k = (1 << lg_k) as f64; + let c = num_coupons as f64; + + // Differing thresholds ensure that the approximated estimator is monotonically increasing. + let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 }; + if c > threshold_factor * k { + return icon_exponential_approximation(k, c); + } + + let factor = evaluate_polynomial( + &ICON_POLYNOMIAL_COEFFICIENTS, + ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K), + ICON_POLYNOMIAL_NUM_COEFFICIENTS, + // The constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS. + // This factor, although somewhat arbitrary, is based on extensive characterization studies + // and is considered a safe conservative factor. + c / (2.0 * k), + ); + let ratio = c / k; + // The constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS. + // This factor, although somewhat arbitrary, is based on extensive characterization studies + // and is considered a safe conservative factor. + let term = 1.0 + (ratio * ratio * ratio / 66.774757); + let result = c * factor * term; + if result >= c { result } else { c } +} diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs b/datasketches/src/cpc/kxp_byte_lookup.rs new file mode 100644 index 0000000..25fddf6 --- /dev/null +++ b/datasketches/src/cpc/kxp_byte_lookup.rs @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub(super) const KXP_BYTE_TABLE: [f64; 256] = [ + 0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375, + 0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375, + 0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375, + 0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375, + 0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875, + 0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875, + 0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875, + 0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875, + 0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125, + 0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125, + 0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125, + 0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125, + 0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625, + 0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625, + 0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625, + 0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625, + 0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875, + 0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875, + 0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375, + 0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375, + 0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625, + 0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625, + 0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125, + 0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125, + 0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375, 0.921875, + 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875, 0.953125, 0.453125, + 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125, 0.890625, 0.390625, 0.640625, + 0.140625, 0.765625, 0.265625, 0.515625, 0.015625, 0.96875, 0.46875, 0.71875, 0.21875, 0.84375, + 0.34375, 0.59375, 0.09375, 0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125, + 0.03125, 0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625, 0.875, 0.375, 0.625, + 0.125, 0.75, 0.25, 0.5, 0.0, +]; + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; + + #[test] + fn assert_kxp_byte_table() { + for byte in 0u8..=255 { + // expected value + let expected = KXP_BYTE_TABLE[byte as usize]; + + // computed value + let mut computed = 0.0; + for col in 0..8 { + let bit = (byte >> col) & 1; + if bit == 0 { + // note the inverted logic + computed += INVERSE_POWERS_OF_2[col + 1]; //note the "+1" + } + } + + assert_eq!(expected, computed); + } + } +} diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs new file mode 100644 index 0000000..4c9fea2 --- /dev/null +++ b/datasketches/src/cpc/mod.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#![allow(dead_code)] + +//! Compressed Probabilistic Counting sketch. +//! +//! This is a unique-counting sketch that implements the Compressed Probabilistic Counting (CPC, +//! a.k.a. FM85) algorithms developed by Kevin Lang in his paper [Back to the Future: an Even More +//! Nearly Optimal Cardinality Estimation Algorithm](https://arxiv.org/abs/1708.06839). +//! +//! This sketch is extremely space-efficient when serialized. In an apples-to-apples empirical +//! comparison against compressed HyperLogLog sketches, this new algorithm simultaneously wins on +//! the two dimensions of the space/accuracy tradeoff and produces sketches that are smaller than +//! the entropy of HLL, so no possible implementation of compressed HLL can match its space +//! efficiency for a given accuracy. As described in the paper this sketch implements a newly +//! developed ICON estimator algorithm that survives union operations, another well-known estimator, +//! the [Historical Inverse Probability (HIP)](https://arxiv.org/abs/1306.3284) estimator does not. +//! +//! The update speed performance of this sketch is quite fast and is comparable to the speed of HLL. +//! The union (merging) capability of this sketch also allows for merging of sketches with different +//! configurations of K. +//! +//! For additional security this sketch can be configured with a user-specified hash seed. + +mod estimator; +mod kxp_byte_lookup; +mod pair_table; +mod sketch; +mod union; + +pub use self::sketch::CpcSketch; +pub use self::union::CpcUnion; + +/// Default log2 of K. +const DEFAULT_LG_K: u8 = 11; +/// Min log2 of K. +const MIN_LG_K: usize = 4; +/// Max log2 of K. +const MAX_LG_K: usize = 26; + +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[expect(clippy::upper_case_acronyms)] +enum Flavor { + EMPTY, // 0 == C < 1 + SPARSE, // 1 <= C < 3K/32 + HYBRID, // 3K/32 <= C < K/2 + PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8] + SLIDING, // 27K/8 <= C +} + +fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 { + let mut count = 0; + for word in matrix { + count += word.count_ones(); + } + count +} + +fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor { + let k = 1 << lg_k; + let c2 = num_coupons << 1; + let c8 = num_coupons << 3; + let c32 = num_coupons << 5; + if num_coupons == 0 { + Flavor::EMPTY + } else if c32 < (3 * k) { + Flavor::SPARSE + } else if c2 < k { + Flavor::HYBRID + } else if c8 < (27 * k) { + Flavor::PINNED + } else { + Flavor::SLIDING + } +} + +fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 { + let k = 1 << lg_k; + let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K + if tmp < 0 { + 0 + } else { + (tmp >> (lg_k + 3)) as u8 // tmp / 8K + } +} diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs new file mode 100644 index 0000000..2af1859 --- /dev/null +++ b/datasketches/src/cpc/pair_table.rs @@ -0,0 +1,346 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const UPSIZE_NUMERATOR: u32 = 3; +const UPSIZE_DENOMINATOR: u32 = 4; +const DOWNSIZE_NUMERATOR: u32 = 1; +const DOWNSIZE_DENOMINATOR: u32 = 4; + +/// A highly specialized hash table used for sparse data. +/// +/// This table stores `(row, col)` pairs and uses linear probing for collision resolution. It is +/// optimized for scenarios where the cardinality of entries is low. +#[derive(Debug, Clone)] +pub(super) struct PairTable { + /// log2 of number of slots + lg_size: u8, + num_valid_bits: u8, + num_items: u32, + slots: Vec, +} + +impl PairTable { + pub fn new(lg_size: u8, num_valid_bits: u8) -> Self { + assert!( + (2..=26).contains(&lg_size), + "lg_size must be in [2, 26], got {lg_size}" + ); + assert!( + ((lg_size + 1)..=32).contains(&num_valid_bits), + "num_valid_bits must be in [lg_size + 1, 32], got {num_valid_bits} where lg_size = {lg_size}" + ); + Self { + lg_size, + num_valid_bits, + num_items: 0, + slots: vec![u32::MAX; 1 << lg_size], + } + } + + /// A constructor specifically tailored to be a part of FM85 decompression scheme. + pub fn from_slots(lg_size: u8, num_items: u32, slots: Vec) -> Self { + let mut lg_num_slots = 2; + while UPSIZE_DENOMINATOR * num_items > (UPSIZE_NUMERATOR * (1 << lg_num_slots)) { + lg_num_slots += 1; + } + + let mut table = Self::new(lg_num_slots, 6 + lg_size); + + // Note: there is a possible "snowplow effect" here because the caller is passing in a + // sorted pairs array. However, we are starting out with the correct final table size, so + // the problem might not occur. + + for slot in slots { + table.must_insert(slot); + } + table.num_items = num_items; + table + } + + pub fn num_items(&self) -> u32 { + self.num_items + } + + pub fn slots(&self) -> &[u32] { + &self.slots + } + + pub fn clear(&mut self) { + self.slots.fill(u32::MAX); + self.num_items = 0; + } + + pub fn maybe_delete(&mut self, item: u32) -> bool { + let index = self.lookup(item) as usize; + if self.slots[index] == u32::MAX { + return false; + } + assert_eq!( + self.slots[index], item, + "item {item} not found at index {index}" + ); + assert!(self.num_items > 0, "no items to delete"); + + // delete the item + self.slots[index] = u32::MAX; + self.num_items -= 1; + + // re-insert all items between the freed slot and the next empty slot + let mask = (1 << self.lg_size) - 1; + let mut probe = (index + 1) & mask; + let mut fetched = self.slots[probe]; + while fetched != u32::MAX { + self.slots[probe] = u32::MAX; + self.must_insert(fetched); + probe = (probe + 1) & mask; + fetched = self.slots[probe]; + } + + // shrink if necessary + while ((DOWNSIZE_DENOMINATOR * self.num_items) < (DOWNSIZE_NUMERATOR * (1 << self.lg_size))) + && (self.lg_size > 2) + { + self.rebuild(self.lg_size - 1); + } + + true + } + + pub fn maybe_insert(&mut self, item: u32) -> bool { + let index = self.lookup(item) as usize; + if self.slots[index] == item { + return false; + } + assert_eq!( + self.slots[index], + u32::MAX, + "no empty slot found for item {item} at index {index}" + ); + self.slots[index] = item; + self.num_items += 1; + while (UPSIZE_DENOMINATOR * self.num_items) > (UPSIZE_NUMERATOR * (1 << self.lg_size)) { + self.rebuild(self.lg_size + 1); + } + true + } + + /// While extracting the items from a linear probing hashtable, this will usually undo the + /// wrap-around provided that the table isn't too full. + /// + /// Experiments suggest that for sufficiently large tables the load factor would have to be + /// over 90 percent before this would fail frequently, and even then the subsequent sort would + /// fix things up. + /// + /// The result is nearly sorted, so make sure to use an efficient sort for that case. + pub fn unwrapping_get_items(&self) -> Vec { + if self.slots.is_empty() { + return vec![]; + } + + let table_size = 1usize << self.lg_size; + let mut result = vec![0; self.num_items as usize]; + let mut i = 0usize; + let mut l = 0usize; + let mut r = self.num_items as usize - 1; + + // special rules for the region before the first empty slot + let hi_bit = 1 << (self.num_valid_bits - 1); + while i < table_size && self.slots[i] != u32::MAX { + let item = self.slots[i]; + i += 1; + if (item & hi_bit) != 0 { + // this item was probably wrapped, so move to end + result[r] = item; + r -= 1; + } else { + result[l] = item; + l += 1; + } + } + + // the rest of the table is processed normally + while i < table_size { + let item = self.slots[i]; + i += 1; + if item != u32::MAX { + result[l] = item; + l += 1; + } + } + + assert_eq!(l, r + 1, "number of items mismatch during extraction"); + result + } + + fn must_insert(&mut self, item: u32) { + let index = self.lookup(item) as usize; + assert_ne!( + self.slots[index], item, + "item {item} already present in table" + ); + assert_eq!( + self.slots[index], + u32::MAX, + "no empty slot found for item {item} at index {index}" + ); + self.slots[index] = item; + // counts and resizing must be handled by the caller. + } + + fn lookup(&self, item: u32) -> u32 { + let size = 1 << self.lg_size; + let mask = size - 1; + + let shift = self.num_valid_bits - self.lg_size; + + // extract high table size bits + let mut probe = item >> shift; + assert!(probe <= mask, "probe = {probe}, mask = {mask}"); + + loop { + let slot = self.slots[probe as usize]; + if slot != item && slot != u32::MAX { + probe = (probe + 1) & mask; + } else { + break; + } + } + + probe + } + + /// Rebuilds to a larger size. `num_items` and `num_valid_bits` remain unchanged. + fn rebuild(&mut self, lg_size: u8) { + assert!( + (2..=26).contains(&lg_size), + "lg_size must be in [2, 26], got {lg_size}" + ); + assert!( + ((lg_size + 1)..=32).contains(&self.num_valid_bits), + "num_valid_bits must be in [lg_size + 1, 32], got {} where lg_size = {lg_size}", + self.num_valid_bits + ); + + let new_size = 1u32 << lg_size; + assert!( + new_size > self.num_items, + "new table size ({new_size}) must be larger than number of items {}", + self.num_items + ); + + let slots = std::mem::replace(&mut self.slots, vec![u32::MAX; new_size as usize]); + self.lg_size = lg_size; + for slot in slots { + if slot != u32::MAX { + self.must_insert(slot); + } + } + } +} + +/// This should be used pair with [`PairTable::unwrapping_get_items`]. +/// +/// In applications where the input array is already nearly sorted, insertion sort runs in linear +/// time with a very small constant. +/// +/// This introspective version of insertion sort protects against the quadratic cost of sorting bad +/// input arrays. It keeps track of how much work has been done, and if that exceeds a constant +/// times the array length, it switches to a different sorting algorithm. +pub fn introspective_insertion_sort(a: &mut [u32]) { + let cost_limit = 8 * a.len(); + + let mut cost = 0; + for i in 1..a.len() { + let mut j = i; + let v = a[i]; + while j >= 1 && v < a[j - 1] { + a[j] = a[j - 1]; + j -= 1; + } + a[j] = v; + cost += i - j; // distance moved is a measure of work + if cost > cost_limit { + knuth_shell_sort3(a); + return; + } + } +} + +fn knuth_shell_sort3(a: &mut [u32]) { + let len = a.len(); + + let mut h = 0; + while h < len / 9 { + h = 3 * h + 1; + } + + while h > 0 { + for i in h..len { + let v = a[i]; + let mut j = i; + while j >= h && v < a[j - h] { + a[j] = a[j - h]; + j -= h; + } + a[j] = v; + } + h /= 3; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sort_1() { + let data = (0..10) + .map(|_| rand::random_range(0..10000)) + .collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } + + #[test] + fn test_sort_2() { + let data = (0..10) + .map(|_| rand::random_range(0..10000) + 3_000_000_000) + .collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } + + #[test] + fn test_sort_3() { + let len = 20; + let data = (0..len).map(|i| len - i + 1).collect::>(); + let mut sorted = data.clone(); + introspective_insertion_sort(&mut sorted); + assert!( + sorted.is_sorted(), + "data was not sorted correctly: origin={data:?}, sorted={sorted:?}" + ); + } +} diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs new file mode 100644 index 0000000..6af8bfc --- /dev/null +++ b/datasketches/src/cpc/sketch.rs @@ -0,0 +1,499 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::hash::Hash; + +use crate::common::NumStdDev; +use crate::common::canonical_double; +use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; +use crate::cpc::DEFAULT_LG_K; +use crate::cpc::Flavor; +use crate::cpc::MAX_LG_K; +use crate::cpc::MIN_LG_K; +use crate::cpc::count_bits_set_in_matrix; +use crate::cpc::determine_correct_offset; +use crate::cpc::determine_flavor; +use crate::cpc::estimator::hip_confidence_lb; +use crate::cpc::estimator::hip_confidence_ub; +use crate::cpc::estimator::icon_confidence_lb; +use crate::cpc::estimator::icon_confidence_ub; +use crate::cpc::estimator::icon_estimate; +use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE; +use crate::cpc::pair_table::PairTable; +use crate::hash::DEFAULT_UPDATE_SEED; +use crate::hash::MurmurHash3X64128; + +/// A Compressed Probabilistic Counting sketch. +/// +/// See the [module level documentation](super) for more. +#[derive(Debug, Clone)] +pub struct CpcSketch { + // immutable config variables + lg_k: u8, + seed: u64, + + // sketch state + /// Part of a speed optimization. + pub(super) first_interesting_column: u8, + /// The number of coupons collected so far. + pub(super) num_coupons: u32, + /// Sparse and surprising values. + pub(super) surprising_value_table: Option, + /// Derivable from num_coupons, but made explicit for speed. + pub(super) window_offset: u8, + /// Size K bytes in dense mode (flavor >= HYBRID). + pub(super) sliding_window: Vec, + + // estimator state + /// Whether the sketch is a result of merging. + /// + /// If `false`, the HIP (Historical Inverse Probability) estimator is used. + /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in use. + pub(super) merge_flag: bool, + // the following variables are only valid in HIP estimator + /// A pre-calculated probability factor (`k * p`) used to compute the increment delta. + kxp: f64, + /// The accumulated cardinality estimate. + hip_est_accum: f64, +} + +impl Default for CpcSketch { + fn default() -> Self { + Self::new(DEFAULT_LG_K) + } +} + +impl CpcSketch { + /// Creates a new `CpcSketch` with the given `lg_k` and default seed. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. + pub fn new(lg_k: u8) -> Self { + Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) + } + + /// Creates a new `CpcSketch` with the given `lg_k` and `seed`. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. + pub fn with_seed(lg_k: u8, seed: u64) -> Self { + assert!( + (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)), + "lg_k out of range; got {lg_k}", + ); + + Self { + lg_k, + seed, + first_interesting_column: 0, + num_coupons: 0, + surprising_value_table: None, + window_offset: 0, + sliding_window: vec![], + merge_flag: false, + kxp: (1 << lg_k) as f64, + hip_est_accum: 0.0, + } + } + + /// Return the parameter lg_k. + pub fn lg_k(&self) -> u8 { + self.lg_k + } + + /// Returns the best estimate of the cardinality of the sketch. + pub fn estimate(&self) -> f64 { + if !self.merge_flag { + self.hip_est_accum + } else { + icon_estimate(self.lg_k, self.num_coupons) + } + } + + /// Returns the best estimate of the lower bound of the confidence interval given `kappa`. + pub fn lower_bound(&self, kappa: NumStdDev) -> f64 { + if !self.merge_flag { + hip_confidence_lb(self.lg_k, self.num_coupons, self.hip_est_accum, kappa) + } else { + icon_confidence_lb(self.lg_k, self.num_coupons, kappa) + } + } + + /// Returns the best estimate of the upper bound of the confidence interval given `kappa`. + pub fn upper_bound(&self, kappa: NumStdDev) -> f64 { + if !self.merge_flag { + hip_confidence_ub(self.lg_k, self.num_coupons, self.hip_est_accum, kappa) + } else { + icon_confidence_ub(self.lg_k, self.num_coupons, kappa) + } + } + + /// Returns true if the sketch is empty. + pub fn is_empty(&self) -> bool { + self.num_coupons == 0 + } + + /// Update the sketch with a hashable value. + /// + /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead. + pub fn update(&mut self, value: T) { + let mut hasher = MurmurHash3X64128::with_seed(self.seed); + value.hash(&mut hasher); + let (h1, h2) = hasher.finish128(); + + let k = 1 << self.lg_k; + let col = h2.leading_zeros(); // 0 <= col <= 64 + let col = if col > 63 { 63 } else { col as u8 }; // clip so that 0 <= col <= 63 + let row = (h1 & (k - 1)) as u32; + let mut row_col = (row << 6) | (col as u32); + // To avoid the hash table's "empty" value, we change the row of the following pair. + // This case is extremely unlikely, but we might as well handle it. + if row_col == u32::MAX { + row_col ^= 1 << 6; + } + self.row_col_update(row_col); + } + + /// Update the sketch with a f64 value. + pub fn update_f64(&mut self, value: f64) { + // Canonicalize double for compatibility with Java + let canonical = canonical_double(value); + self.update(canonical); + } + + /// Update the sketch with a f32 value. + pub fn update_f32(&mut self, value: f32) { + self.update_f64(value as f64); + } + + pub(super) fn flavor(&self) -> Flavor { + determine_flavor(self.lg_k, self.num_coupons) + } + + pub(super) fn row_col_update(&mut self, row_col: u32) { + let col = (row_col & 63) as u8; + if col < self.first_interesting_column { + // important speed optimization + return; + } + + if self.num_coupons == 0 { + // promote EMPTY to SPARSE + self.surprising_value_table = Some(PairTable::new(2, 6 + self.lg_k)); + } + + if self.sliding_window.is_empty() { + self.update_sparse(row_col); + } else { + self.update_windowed(row_col); + } + } + + pub(super) fn seed(&self) -> u64 { + self.seed + } + + pub(super) fn surprising_value_table(&self) -> &PairTable { + self.surprising_value_table + .as_ref() + .expect("surprising value table must be initialized") + } + + pub(super) fn mut_surprising_value_table(&mut self) -> &mut PairTable { + self.surprising_value_table + .as_mut() + .expect("surprising value table must be initialized") + } + + fn update_hip(&mut self, row_col: u32) { + let k = 1 << self.lg_k; + let col = (row_col & 63) as usize; + let one_over_p = (k as f64) / self.kxp; + self.hip_est_accum += one_over_p; + self.kxp -= INVERSE_POWERS_OF_2[col + 1] // notice the "+1" + } + + fn update_sparse(&mut self, row_col: u32) { + let k = 1 << self.lg_k; + let c32pre = (self.num_coupons as u64) << 5; + debug_assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor == SPARSE + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + if is_novel { + self.num_coupons += 1; + self.update_hip(row_col); + let c32post = (self.num_coupons as u64) << 5; + if c32post >= 3 * k { + self.promote_sparse_to_windowed(); + } + } + } + + fn promote_sparse_to_windowed(&mut self) { + debug_assert_eq!(self.window_offset, 0); + + let k = 1 << self.lg_k; + let c32 = (self.num_coupons as u64) << 5; + debug_assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 * k)))); + + self.sliding_window.resize(k as usize, 0); + + let old_table = self + .surprising_value_table + .replace(PairTable::new(2, 6 + self.lg_k)) + .expect("surprising value table must be initialized"); + let old_slots = old_table.slots(); + for &row_col in old_slots { + if row_col != u32::MAX { + let col = (row_col & 63) as u8; + if col < 8 { + let row = (row_col >> 6) as usize; + self.sliding_window[row] |= 1 << col; + } else { + // cannot use must_insert(), because it doesn't provide for growth + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + debug_assert!(is_novel); + } + } + } + } + + fn update_windowed(&mut self, row_col: u32) { + debug_assert!(self.window_offset <= 56); + let k = 1 << self.lg_k; + let c32pre = (self.num_coupons as u64) << 5; + debug_assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor >= HYBRID + let c8pre = (self.num_coupons as u64) << 3; + let w8pre = (self.window_offset as u64) << 3; + debug_assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K * windowOffset) + + let mut is_novel = false; // novel if new coupon; + let col = (row_col & 63) as u8; + if col < self.window_offset { + // track the surprising 0's "before" the window + is_novel = self.mut_surprising_value_table().maybe_delete(row_col); // inverted logic + } else if col < self.window_offset + 8 { + // track the 8 bits inside the window + let row = (row_col >> 6) as usize; + let old_bits = self.sliding_window[row]; + let new_bits = old_bits | (1 << (col - self.window_offset)); + if old_bits != new_bits { + self.sliding_window[row] = new_bits; + is_novel = true; + } + } else { + // track the surprising 1's "after" the window + is_novel = self.mut_surprising_value_table().maybe_insert(row_col); // normal logic + } + + if is_novel { + self.num_coupons += 1; + self.update_hip(row_col); + let c8post = (self.num_coupons as u64) << 3; + if c8post >= (27 + w8pre) * k { + self.move_window(); + debug_assert!((1..=56).contains(&self.window_offset)); + let w8post = (self.window_offset as u64) << 3; + debug_assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8) + (K * windowOffset) + } + } + } + + fn move_window(&mut self) { + let new_offset = self.window_offset + 1; + debug_assert!(new_offset <= 56); + debug_assert_eq!( + new_offset, + determine_correct_offset(self.lg_k, self.num_coupons) + ); + + let k = 1 << self.lg_k; + + // Construct the full-sized bit matrix that corresponds to the sketch + let bit_matrix = self.build_bit_matrix(); + + // refresh the KXP register on every 8th window shift. + if (new_offset & 0x7) == 0 { + self.refresh_kxp(&bit_matrix); + } + + self.mut_surprising_value_table().clear(); // the new number of surprises will be about the same + + let mask_for_clearing_window = (0xFF << new_offset) ^ u64::MAX; + let mask_for_flipping_early_zone = (1u64 << new_offset) - 1; + + let mut all_surprises_ored = 0u64; + for i in 0..k { + let mut pattern = bit_matrix[i]; + self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8; + pattern &= mask_for_clearing_window; + // The following line converts surprising 0's to 1's in the "early zone", + // (and vice versa, which is essential for this procedure's O(k) time cost). + pattern ^= mask_for_flipping_early_zone; + all_surprises_ored |= pattern; // a cheap way to recalculate first_interesting_column + while pattern != 0 { + let col = pattern.trailing_zeros(); + pattern ^= 1 << col; // erase the 1 + let row_col = ((i as u32) << 6) | col; + let is_novel = self.mut_surprising_value_table().maybe_insert(row_col); + debug_assert!(is_novel); + } + } + + self.window_offset = new_offset; + self.first_interesting_column = all_surprises_ored.trailing_zeros() as u8; + if self.first_interesting_column > new_offset { + self.first_interesting_column = new_offset; // corner case + } + } + + /// The KXP register is a double with roughly 50 bits of precision, but it might need roughly + /// 90 bits to track the value with perfect accuracy. + /// + /// Therefore, we recalculate KXP occasionally from the sketch's full bit_matrix so that it + /// will reflect changes that were previously outside the mantissa. + fn refresh_kxp(&mut self, bit_matrix: &[u64]) { + // for improved numerical accuracy, we separately sum the bytes of the u64's + let mut byte_sums = [0.0; 8]; + for &bit in bit_matrix { + let mut word = bit; + for sum in byte_sums.iter_mut() { + let byte = (word & 0xFF) as usize; + *sum += KXP_BYTE_TABLE[byte]; + word >>= 8; + } + } + + let mut total = 0.0; + for i in (0..8).rev() { + // the reverse order is important + let factor = INVERSE_POWERS_OF_2[i * 8]; // pow (256.0, (-1.0 * ((double) j))) + total += factor * byte_sums[i]; + } + + self.kxp = total; + } + + pub(super) fn build_bit_matrix(&self) -> Vec { + let k = 1 << self.lg_k; + let offset = self.window_offset; + debug_assert!(offset <= 56); + + // Fill the matrix with default rows in which the "early zone" is filled with ones. + // This is essential for the routine's O(k) time cost (as opposed to O(C)). + let default_row = (1u64 << offset) - 1; + + let mut matrix = vec![default_row; k]; + if self.num_coupons == 0 { + return matrix; + } + + if !self.sliding_window.is_empty() { + // In other words, we are in window mode, not sparse mode + for i in 0..k { + // set the window bits, trusting the sketch's current offset + matrix[i] |= (self.sliding_window[i] as u64) << offset; + } + } + + for &row_col in self.surprising_value_table().slots() { + if row_col != u32::MAX { + let col = (row_col & 63) as u8; + let row = (row_col >> 6) as usize; + // Flip the specified matrix bit from its default value. + // In the "early" zone the bit changes from 1 to 0. + // In the "late" zone the bit changes from 0 to 1. + matrix[row] ^= 1 << col; + } + } + + matrix + } +} + +impl CpcSketch { + /// Returns the estimated maximum compressed serialized size of a sketch. + /// + /// The actual size of a compressed CPC sketch has a small random variance, but the following + /// empirically measured size should be large enough for at least 99.9 percent of sketches. + /// + /// For small values of `n` the size can be much smaller. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 26]`. + pub fn max_serialized_bytes(lg_k: u8) -> usize { + let lg_k = lg_k as usize; + assert!( + (MIN_LG_K..=MAX_LG_K).contains(&lg_k), + "lg_k out of range; got {lg_k}", + ); + + // These empirical values for the 99.9th percentile of size in bytes were measured using + // 100,000 trials. The value for each trial is the maximum of 5*16=80 measurements + // that were equally spaced over values of the quantity C/K between 3.0 and 8.0. + // This table does not include the worst-case space for the preamble, which is added + // by the function. + const EMPIRICAL_SIZE_MAX_LGK: usize = 19; + const EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [ + 24, // lg_k = 4 + 36, // lg_k = 5 + 56, // lg_k = 6 + 100, // lg_k = 7 + 180, // lg_k = 8 + 344, // lg_k = 9 + 660, // lg_k = 10 + 1292, // lg_k = 11 + 2540, // lg_k = 12 + 5020, // lg_k = 13 + 9968, // lg_k = 14 + 19836, // lg_k = 15 + 39532, // lg_k = 16 + 78880, // lg_k = 17 + 157516, // lg_k = 18 + 314656, // lg_k = 19 + ]; + const EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0 + const MAX_PREAMBLE_SIZE_BYTES: usize = 40; + + if lg_k <= EMPIRICAL_SIZE_MAX_LGK { + return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] + MAX_PREAMBLE_SIZE_BYTES; + } + let k = 1 << lg_k; + ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) + MAX_PREAMBLE_SIZE_BYTES + } +} + +// testing methods +impl CpcSketch { + /// Validate this sketch is valid. + /// + /// This is primarily for testing and validation purposes. + pub fn validate(&self) -> bool { + let bit_matrix = self.build_bit_matrix(); + let num_bits_set = count_bits_set_in_matrix(&bit_matrix); + num_bits_set == self.num_coupons + } + + /// Returns the number of coupons in the sketch. + /// + /// This is primarily for testing and validation purposes. + pub fn num_coupons(&self) -> u32 { + self.num_coupons + } +} diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs new file mode 100644 index 0000000..1a5255f --- /dev/null +++ b/datasketches/src/cpc/union.rs @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The merging logic is somewhat involved, so it will be summarized here. +//! +//! First, we compare the K values of the union and the source sketch. +//! +//! If `source.K < union.K`, we reduce the union's K to match, which requires downsampling the +//! union's internal sketch. +//! +//! Here is how to perform the downsampling: +//! +//! If the union contains a bitMatrix, downsample it by row-wise ORing. +//! +//! If the union contains a sparse sketch, then create a new empty sketch, and walk the old target +//! sketch updating the new one (with modulo). At the end, check whether the new target sketch is +//! still in sparse mode (it might not be, because downsampling densifies the set of collected +//! coupons). If it is NOT in sparse mode, immediately convert it to a bitMatrix. +//! +//! At this point, we have `source.K >= union.K`. (We won't keep mentioning this, but in all the +//! following the source's row indices are used mod union.K while updating the union's sketch. +//! That takes care of the situation where `source.K < union.K`.) +//! +//! Case A: union is Sparse and source is Sparse. We walk the source sketch updating the union's +//! sketch. At the end, if the union's sketch is no longer in sparse mode, we convert it to a +//! bitMatrix. +//! +//! Case B: union is bitMatrix and source is Sparse. We walk the source sketch, setting bits in the +//! bitMatrix. +//! +//! In the remaining cases, we have flavor(source) > Sparse, so we immediately convert the union's +//! sketch to a bitMatrix (even if the union contains very few coupons). Then: +//! +//! Case C: union is bitMatrix and source is Hybrid or Pinned. Then we OR the source's sliding +//! window into the bitMatrix, and walk the source's table, setting bits in the bitMatrix. +//! +//! Case D: union is bitMatrix, and source is Sliding. Then we convert the source into a bitMatrix, +//! and OR it into the union's bitMatrix. (Important note; merely walking the source wouldn't work +//! because of the partially inverted Logic in the Sliding flavor, where the presence of coupons +//! is sometimes indicated by the ABSENCE of row_col pairs in the surprises table.) +//! +//! How does [`CpcUnion::get_result`] work? +//! +//! If the union has an Accumulator state, make a copy of that sketch. +//! +//! If the union has a BitMatrix state, then we have to convert the bitMatrix back into a sketch, +//! which requires doing some extra work to figure out the values of num_coupons, offset, +//! first_interesting_column, and kxp. + +use crate::cpc::CpcSketch; +use crate::cpc::DEFAULT_LG_K; +use crate::cpc::Flavor; +use crate::cpc::count_bits_set_in_matrix; +use crate::cpc::determine_correct_offset; +use crate::cpc::pair_table::PairTable; +use crate::hash::DEFAULT_UPDATE_SEED; + +/// The union (merge) operation for the CPC sketches. +#[derive(Debug, Clone)] +pub struct CpcUnion { + // immutable config variables + lg_k: u8, + seed: u64, + + // union state + state: UnionState, +} + +impl Default for CpcUnion { + fn default() -> Self { + Self::new(DEFAULT_LG_K) + } +} + +impl CpcUnion { + /// Creates a new `CpcUnion` with the given `lg_k` and default seed. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. + pub fn new(lg_k: u8) -> Self { + Self::with_seed(lg_k, DEFAULT_UPDATE_SEED) + } + + /// Creates a new `CpcUnion` with the given `lg_k` and `seed`. + /// + /// # Panics + /// + /// Panics if `lg_k` is not in the range `[4, 16]`. + pub fn with_seed(lg_k: u8, seed: u64) -> Self { + // We begin with the accumulator holding an EMPTY_MERGED sketch object. + let sketch = CpcSketch::with_seed(lg_k, seed); + let state = UnionState::Accumulator(sketch); + Self { lg_k, seed, state } + } + + /// Return the parameter lg_k. + /// + /// Note that due to merging with source sketches that may have a lower value of lg_k, this + /// value can be less than what the union object was configured with. + pub fn lg_k(&self) -> u8 { + self.lg_k + } + + /// Returns the result of union operations as a CPC sketch. + pub fn get_result(&self) -> CpcSketch { + match &self.state { + UnionState::Accumulator(sketch) => { + if sketch.is_empty() { + CpcSketch::with_seed(self.lg_k, self.seed) + } else { + let mut sketch = sketch.clone(); + assert_eq!(sketch.flavor(), Flavor::SPARSE); + sketch.merge_flag = true; + sketch + } + } + UnionState::BitMatrix(matrix) => { + let lg_k = self.lg_k; + + let mut sketch = CpcSketch::with_seed(lg_k, self.seed); + let num_coupons = count_bits_set_in_matrix(matrix); + sketch.num_coupons = num_coupons; + let offset = determine_correct_offset(lg_k, num_coupons); + sketch.window_offset = offset; + + // Build the window and pair table + let k = 1 << lg_k; + let mut sliding_window = vec![0u8; k]; + + // LgSize = K/16; in some cases this will end up being oversized + let new_table_lg_size = (lg_k - 4).max(2); + let mut table = PairTable::new(new_table_lg_size, 6 + lg_k); + + // the following should work even when the offset is zero + let mask_for_clearing_window = (0xFFu64 << offset) ^ u64::MAX; + let mask_for_flipping_early_zone = (1u64 << offset) - 1; + let mut all_surprises_ored = 0; + + // The snowplow effect was caused by processing the rows in order, + // but we have fixed it by using a sufficiently large hash table. + for i in 0..k { + let mut pattern = matrix[i]; + sliding_window[i] = ((pattern >> offset) & 0xFF) as u8; + pattern &= mask_for_clearing_window; + pattern ^= mask_for_flipping_early_zone; // this flipping converts surprising 0's to 1's + all_surprises_ored |= pattern; + while pattern != 0 { + let col = pattern.trailing_zeros(); + pattern ^= 1u64 << col; // erase the 1 + let row_col = ((i as u32) << 6) | col; + let is_novel = table.maybe_insert(row_col); + assert!(is_novel); + } + } + + // at this point we could shrink an oversized hash table, but the relative waste + // isn't very big + sketch.first_interesting_column = all_surprises_ored.trailing_zeros() as u8; + if sketch.first_interesting_column > offset { + sketch.first_interesting_column = offset; // corner case + } + + // HIP-related fields will contain zeros, and that is okay since merge_flag is true, + // and thus the HIP estimator will not be used. + sketch.sliding_window = sliding_window; + sketch.surprising_value_table = Some(table); + sketch.merge_flag = true; + + sketch + } + } + } + + /// Update this union with a CpcSketch. + /// + /// # Panics + /// + /// Panics if the seed of the provided sketch does not match the seed of this union. + pub fn update(&mut self, sketch: &CpcSketch) { + assert_eq!(self.seed, sketch.seed()); + + let flavor = sketch.flavor(); + if flavor == Flavor::EMPTY { + return; + } + + if sketch.lg_k() < self.lg_k { + self.reduce_k(sketch.lg_k()); + } + + // if source is past SPARSE mode, make sure that union is a bitMatrix. + if flavor > Flavor::SPARSE { + if let UnionState::Accumulator(old_sketch) = &self.state { + // convert the accumulator to a bit matrix + let bit_matrix = old_sketch.build_bit_matrix(); + self.state = UnionState::BitMatrix(bit_matrix); + } + } + + match &mut self.state { + UnionState::Accumulator(old_sketch) => { + // [Case A] Sparse, bitMatrix == null, accumulator valid + if flavor == Flavor::SPARSE { + let old_flavor = old_sketch.flavor(); + if old_flavor != Flavor::SPARSE && old_flavor != Flavor::EMPTY { + unreachable!( + "unexpected old flavor in union accumulator: {:?}", + old_flavor + ); + } + + // The following partially fixes the snowplow problem provided that the K's + // are equal. + if old_flavor == Flavor::EMPTY && self.lg_k == sketch.lg_k() { + *old_sketch = sketch.clone(); + return; + } + + walk_table_updating_sketch(old_sketch, sketch.surprising_value_table()); + let final_flavor = old_sketch.flavor(); + + // if the accumulator has graduated beyond sparse, switch to a bit matrix + // representation + if final_flavor > Flavor::SPARSE { + let bit_matrix = old_sketch.build_bit_matrix(); + self.state = UnionState::BitMatrix(bit_matrix); + } + + return; + } + + // If flavor is past SPARSE mode, the state must have been converted to bitMatrix. + // Otherwise, the EMPTY case was handled at the start of this method, and the + // SPARSE case was handled above. + unreachable!("unexpected flavor in union accumulator: {:?}", flavor); + } + UnionState::BitMatrix(old_matrix) => { + if flavor == Flavor::SPARSE { + // [Case B] Sparse, bitMatrix valid, accumulator == null + or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table()); + return; + } + + if matches!(flavor, Flavor::HYBRID | Flavor::PINNED) { + // [Case C] Hybrid, bitMatrix valid, accumulator == null + // [Case C] Pinned, bitMatrix valid, accumulator == null + or_window_into_matrix( + old_matrix, + self.lg_k, + &sketch.sliding_window, + sketch.window_offset, + sketch.lg_k(), + ); + or_table_into_matrix(old_matrix, self.lg_k, sketch.surprising_value_table()); + return; + } + + // [Case D] Sliding, bitMatrix valid, accumulator == null + // SLIDING mode involves inverted logic, so we cannot just walk the source sketch. + // Instead, we convert it to a bitMatrix that can be ORed into the destination. + assert_eq!(flavor, Flavor::SLIDING); + let src_matrix = sketch.build_bit_matrix(); + or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix, sketch.lg_k()); + } + } + } + + fn reduce_k(&mut self, new_lg_k: u8) { + match &mut self.state { + UnionState::Accumulator(sketch) => { + if sketch.is_empty() { + self.lg_k = new_lg_k; + self.state = UnionState::Accumulator(CpcSketch::with_seed(new_lg_k, self.seed)); + return; + } + + let mut new_sketch = CpcSketch::with_seed(new_lg_k, self.seed); + walk_table_updating_sketch(&mut new_sketch, sketch.surprising_value_table()); + + let final_new_flavor = new_sketch.flavor(); + // SV table had to have something in it + assert_ne!(final_new_flavor, Flavor::EMPTY); + if final_new_flavor == Flavor::SPARSE { + self.lg_k = new_lg_k; + self.state = UnionState::Accumulator(new_sketch); + return; + } + + // the new sketch has graduated beyond sparse, so convert to bitMatrix + self.lg_k = new_lg_k; + self.state = UnionState::BitMatrix(new_sketch.build_bit_matrix()); + } + UnionState::BitMatrix(matrix) => { + let new_k = 1 << new_lg_k; + let mut new_matrix = vec![0; new_k]; + or_matrix_into_matrix(&mut new_matrix, new_lg_k, matrix, self.lg_k); + self.lg_k = new_lg_k; + self.state = UnionState::BitMatrix(new_matrix); + } + } + } +} + +// testing methods +impl CpcUnion { + /// Returns the number of coupons in the union. + /// + /// This is primarily for testing and validation purposes. + pub fn num_coupons(&self) -> u32 { + match &self.state { + UnionState::Accumulator(sketch) => sketch.num_coupons, + UnionState::BitMatrix(matrix) => count_bits_set_in_matrix(matrix), + } + } +} + +fn or_window_into_matrix( + dst_matrix: &mut [u64], + dst_lg_k: u8, + src_window: &[u8], + src_offset: u8, + src_lg_k: u8, +) { + assert!(dst_lg_k <= src_lg_k); + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let src_k = 1 << src_lg_k; + for src_row in 0..src_k { + dst_matrix[src_row & dst_mask] |= (src_window[src_row] as u64) << src_offset; + } +} + +fn or_table_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_table: &PairTable) { + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let slots = src_table.slots(); + for &row_col in slots.iter() { + if row_col != u32::MAX { + let src_row = (row_col >> 6) as usize; + let src_col = (row_col & 63) as usize; + let dst_row = src_row & dst_mask; + dst_matrix[dst_row] |= 1u64 << src_col; + } + } +} + +fn or_matrix_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_matrix: &[u64], src_lg_k: u8) { + assert!(dst_lg_k <= src_lg_k); + let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k + let src_k = 1 << src_lg_k; + for src_row in 0..src_k { + let dst_row = src_row & dst_mask; + dst_matrix[dst_row] |= src_matrix[src_row]; + } +} + +fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) { + assert!(sketch.lg_k() <= 26); + + let slots = table.slots(); + let num_slots = slots.len() as u32; + + // downsamples when dst lgK < src LgK + let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63; + // Using a golden ratio stride fixes the snowplow effect. + let mut stride = (0.6180339887498949 * (num_slots as f64)) as u32; + assert!(stride >= 2); + if stride == ((stride >> 1) << 1) { + // force the stride to be odd + stride += 1; + } + assert!((stride >= 3) && (stride < num_slots)); + + let mut k = 0; + for _ in 0..num_slots { + k &= num_slots - 1; + let row_col = slots[k as usize]; + if row_col != u32::MAX { + sketch.row_col_update(row_col & (dst_mask as u32)); + } + k += stride; + } +} + +/// The internal State of the Union operation. +/// +/// At most one of BitMatrix and accumulator will be non-null at any given moment. Accumulator is a +/// sketch object that is employed until it graduates out of Sparse mode. +/// +/// At that point, it is converted into a full-sized bitMatrix, which is mathematically a sketch, +/// but doesn't maintain any of the "extra" fields of our sketch objects. +#[derive(Debug, Clone)] +enum UnionState { + Accumulator(CpcSketch), + BitMatrix(Vec), +} diff --git a/datasketches/src/error.rs b/datasketches/src/error.rs index dcb0592..7e8b663 100644 --- a/datasketches/src/error.rs +++ b/datasketches/src/error.rs @@ -89,7 +89,7 @@ impl Error { } } -// Convenience constructors for deserialization errors +// Convenient constructors used within datasketches crate. impl Error { pub(crate) fn invalid_argument(msg: impl Into) -> Self { Self::new(ErrorKind::InvalidArgument, msg) diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index 8c79625..bc1ce42 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -40,7 +40,7 @@ use crate::hll::serialization::*; /// A HyperLogLog sketch. /// -/// See the [hll module level documentation](crate::hll) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone, PartialEq)] pub struct HllSketch { lg_config_k: u8, diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs index 5dba0bb..870c8d8 100644 --- a/datasketches/src/hll/union.rs +++ b/datasketches/src/hll/union.rs @@ -45,7 +45,7 @@ use crate::hll::pack_coupon; /// the union of all input sketches. It automatically handles sketches with /// different configurations and modes. /// -/// See the [hll module level documentation](crate::hll) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone)] pub struct HllUnion { /// Maximum lg_k that this union can handle diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 009fd9e..17701ab 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -33,6 +33,7 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; pub mod common; pub mod countmin; +pub mod cpc; pub mod error; pub mod frequencies; pub mod hll; diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index 91c90c7..a790892 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -22,7 +22,6 @@ use std::num::NonZeroU64; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; -use crate::error::ErrorKind; use crate::tdigest::serialization::*; /// The default value of K if one is not specified. @@ -34,7 +33,7 @@ const DEFAULT_WEIGHT: NonZeroU64 = NonZeroU64::new(1).unwrap(); /// T-Digest sketch for estimating quantiles and ranks. /// -/// See the [tdigest module level documentation](crate::tdigest) for more. +/// See the [module level documentation](super) for more. #[derive(Debug, Clone)] pub struct TDigestMut { k: u16, @@ -100,10 +99,9 @@ impl TDigestMut { /// ``` pub fn try_new(k: u16) -> Result { if k < 10 { - return Err(Error::new( - ErrorKind::InvalidArgument, - format!("k must be at least 10, got {k}"), - )); + return Err(Error::invalid_argument(format!( + "k must be at least 10, got {k}" + ))); } Ok(Self::make( @@ -788,7 +786,7 @@ impl TDigestMut { /// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks. /// -/// See the [module documentation](super) for more details. +/// See the [module level documentation](super) for more. pub struct TDigest { k: u16, diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs index 506b2b6..07110d4 100644 --- a/datasketches/src/theta/sketch.rs +++ b/datasketches/src/theta/sketch.rs @@ -25,6 +25,7 @@ use std::hash::Hash; use crate::common::NumStdDev; use crate::common::ResizeFactor; use crate::common::binomial_bounds; +use crate::common::canonical_double; use crate::hash::DEFAULT_UPDATE_SEED; use crate::theta::hash_table::DEFAULT_LG_K; use crate::theta::hash_table::MAX_LG_K; @@ -52,7 +53,9 @@ impl ThetaSketch { ThetaSketchBuilder::default() } - /// Update the sketch with a hashable value + /// Update the sketch with a hashable value. + /// + /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead. /// /// # Examples /// @@ -69,7 +72,7 @@ impl ThetaSketch { } } - /// Update the sketch with a f64 value + /// Update the sketch with a f64 value. /// /// # Examples /// @@ -85,7 +88,7 @@ impl ThetaSketch { self.update(canonical); } - /// Update the sketch with a f32 value + /// Update the sketch with a f32 value. /// /// # Examples /// @@ -356,16 +359,3 @@ impl ThetaSketchBuilder { ThetaSketch { table } } } - -/// Canonicalize double value for compatibility with Java -fn canonical_double(value: f64) -> i64 { - if value.is_nan() { - // Java's Double.doubleToLongBits() NaN value - 0x7ff8000000000000i64 - } else { - // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode, - // which Rust guarantees. Thus, by adding a positive zero we - // canonicalize signed zero without any branches in one instruction. - (value + 0.0).to_bits() as i64 - } -} diff --git a/datasketches/tests/cpc_union_test.rs b/datasketches/tests/cpc_union_test.rs new file mode 100644 index 0000000..6e8aa32 --- /dev/null +++ b/datasketches/tests/cpc_union_test.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::cpc::CpcSketch; +use datasketches::cpc::CpcUnion; +use googletest::assert_that; +use googletest::prelude::near; + +const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02; + +#[test] +fn test_empty() { + let union = CpcUnion::new(11); + let sketch = union.get_result(); + assert!(sketch.is_empty()); + assert_eq!(sketch.estimate(), 0.0); +} + +#[test] +fn test_two_values() { + let mut sketch = CpcSketch::new(11); + sketch.update(1); + let mut union = CpcUnion::new(11); + union.update(&sketch); + + let result = union.get_result(); + assert!(!result.is_empty()); + assert_eq!(result.estimate(), 1.0); + + sketch.update(2); + union.update(&sketch); + let result = union.get_result(); + assert!(!result.is_empty()); + assert_that!( + sketch.estimate(), + near(2.0, RELATIVE_ERROR_FOR_LG_K_11 * 2.0) + ); +} + +#[test] +fn test_custom_seed() { + let mut sketch = CpcSketch::with_seed(11, 123); + sketch.update(1); + sketch.update(2); + sketch.update(3); + + let mut union = CpcUnion::with_seed(11, 123); + union.update(&sketch); + let result = union.get_result(); + assert!(!result.is_empty()); + assert_that!( + result.estimate(), + near(3.0, RELATIVE_ERROR_FOR_LG_K_11 * 3.0) + ); +} + +#[test] +#[should_panic] +fn test_custom_seed_mismatch() { + let mut sketch = CpcSketch::with_seed(11, 123); + sketch.update(1); + sketch.update(2); + sketch.update(3); + + let mut union = CpcUnion::with_seed(11, 234); + union.update(&sketch); +} + +#[test] +fn test_large_values() { + let mut key = 0; + let mut sketch = CpcSketch::new(11); + let mut union = CpcUnion::new(11); + for _ in 0..1000 { + let mut tmp = CpcSketch::new(11); + for _ in 0..10000 { + sketch.update(key); + tmp.update(key); + key += 1; + } + union.update(&tmp); + } + let result = union.get_result(); + assert!(!result.is_empty()); + assert_eq!(result.num_coupons(), union.num_coupons()); + let estimate = sketch.estimate(); + assert_that!( + result.estimate(), + near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate) + ); +} + +#[test] +fn test_reduce_k_empty() { + let mut sketch = CpcSketch::new(11); + for i in 0..10000 { + sketch.update(i); + } + let mut union = CpcUnion::new(12); + union.update(&sketch); + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} + +#[test] +fn test_reduce_k_sparse() { + let mut union = CpcUnion::new(12); + + let mut sketch12 = CpcSketch::new(12); + for i in 0..100 { + sketch12.update(i); + } + union.update(&sketch12); + + let mut sketch11 = CpcSketch::new(11); + for i in 0..1000 { + sketch11.update(i); + } + union.update(&sketch11); + + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} + +#[test] +fn test_reduce_k_window() { + let mut union = CpcUnion::new(12); + + let mut sketch12 = CpcSketch::new(12); + for i in 0..500 { + sketch12.update(i); + } + union.update(&sketch12); + + let mut sketch11 = CpcSketch::new(11); + for i in 0..1000 { + sketch11.update(i); + } + union.update(&sketch11); + + let result = union.get_result(); + assert_eq!(result.lg_k(), 11); + assert_that!( + result.estimate(), + near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); +} diff --git a/datasketches/tests/cpc_update_test.rs b/datasketches/tests/cpc_update_test.rs new file mode 100644 index 0000000..7b814f7 --- /dev/null +++ b/datasketches/tests/cpc_update_test.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::common::NumStdDev; +use datasketches::cpc::CpcSketch; +use googletest::assert_that; +use googletest::prelude::ge; +use googletest::prelude::le; +use googletest::prelude::near; + +const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02; + +#[test] +fn test_empty() { + let sketch = CpcSketch::new(11); + assert!(sketch.is_empty()); + assert_eq!(sketch.estimate(), 0.0); + assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0); + assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0); + assert!(sketch.validate()); +} + +#[test] +fn test_one_value() { + let mut sketch = CpcSketch::new(11); + sketch.update(1); + assert!(!sketch.is_empty()); + assert_eq!(sketch.estimate(), 1.0); + assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); + assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); + assert!(sketch.validate()); +} + +#[test] +fn test_many_values() { + let mut sketch = CpcSketch::new(11); + for i in 0..10000 { + sketch.update(i); + } + assert!(!sketch.is_empty()); + assert_that!( + sketch.estimate(), + near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0) + ); + assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One))); + assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One))); + assert!(sketch.validate()); +}