diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..28c204a5a
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,76 @@
+{
+    "files.associations": {
+        "chrono": "cpp",
+        "shared_mutex": "cpp",
+        "cctype": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "any": "cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "strstream": "cpp",
+        "bit": "cpp",
+        "*.tcc": "cpp",
+        "bitset": "cpp",
+        "cinttypes": "cpp",
+        "codecvt": "cpp",
+        "complex": "cpp",
+        "condition_variable": "cpp",
+        "cstdint": "cpp",
+        "deque": "cpp",
+        "list": "cpp",
+        "map": "cpp",
+        "set": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "fstream": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "ostream": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "thread": "cpp",
+        "cfenv": "cpp",
+        "typeindex": "cpp",
+        "typeinfo": "cpp",
+        "variant": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "future": "cpp",
+        "numbers": "cpp",
+        "semaphore": "cpp",
+        "stop_token": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d3d2b860..fba71c919 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,6 +52,7 @@ if (MSVC)
 endif()
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/roaring)
 
 if(NOT PYBIND)
     set(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS ON)
diff --git a/apps/build_memory_index.cpp b/apps/build_memory_index.cpp
index 544e42dee..faf322d9f 100644
--- a/apps/build_memory_index.cpp
+++ b/apps/build_memory_index.cpp
@@ -27,6 +27,7 @@ int main(int argc, char **argv)
     std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type;
     uint32_t num_threads, R, L, Lf, build_PQ_bytes;
     float alpha;
+    uint32_t inter_size;
     bool use_pq_build, use_opq;
 
     po::options_description desc{
@@ -70,6 +71,9 @@ int main(int argc, char **argv)
                                        program_options_utils::FILTERED_LBUILD);
         optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
                                        program_options_utils::LABEL_TYPE_DESCRIPTION);
+        optional_configs.add_options()(
+            "min_inter", po::value<uint32_t>(&inter_size)->default_value(1),
+            "Sets the minimum intersection size between filter sets. Defaults to 1, giving filtered-diskann");
 
         // Merge required and optional parameters
         desc.add(required_configs).add(optional_configs);
@@ -119,6 +123,7 @@ int main(int argc, char **argv)
 
         size_t data_num, data_dim;
         diskann::get_bin_metadata(data_path, data_num, data_dim);
+        min_inter_size = inter_size;
 
         auto index_build_params = diskann::IndexWriteParametersBuilder(L, R)
                                       .with_filter_list_size(Lf)
diff --git a/apps/search_disk_index.cpp b/apps/search_disk_index.cpp
index 7e2a7ac6d..1c1b43680 100644
--- a/apps/search_disk_index.cpp
+++ b/apps/search_disk_index.cpp
@@ -53,7 +53,7 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre
                       const uint32_t num_threads, const uint32_t recall_at, const uint32_t beamwidth,
                       const uint32_t num_nodes_to_cache, const uint32_t search_io_limit,
                       const std::vector<uint32_t> &Lvec, const float fail_if_recall_below,
-                      const std::vector<std::string> &query_filters, const bool use_reorder_data = false)
+                      const std::vector<std::vector<std::string>> &query_filters, const bool use_reorder_data = false)
 {
     diskann::cout << "Search parameters: #threads: " << num_threads << ", ";
     if (beamwidth <= 0)
@@ -239,11 +239,11 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre
                 LabelT label_for_search;
                 if (query_filters.size() == 1)
                 { // one label for all queries
-                    label_for_search = _pFlashIndex->get_converted_label(query_filters[0]);
+                    label_for_search = _pFlashIndex->get_converted_label(query_filters[0][0]);
                 }
                 else
                 { // one label for each query
-                    label_for_search = _pFlashIndex->get_converted_label(query_filters[i]);
+                    label_for_search = _pFlashIndex->get_converted_label(query_filters[i][0]);
                 }
                 _pFlashIndex->cached_beam_search(
                     query + (i * query_aligned_dim), recall_at, L, query_result_ids_64.data() + (i * recall_at),
@@ -434,10 +434,12 @@ int main(int argc, char **argv)
         return -1;
     }
 
-    std::vector<std::string> query_filters;
+    std::vector<std::vector<std::string>> query_filters;
     if (filter_label != "")
     {
-        query_filters.push_back(filter_label);
+        std::vector<std::string> tmp;
+        tmp.push_back(filter_label);
+        query_filters.push_back(tmp);
     }
     else if (query_filters_file != "")
     {
diff --git a/apps/search_memory_index.cpp b/apps/search_memory_index.cpp
index 1a9acc285..00f6473a9 100644
--- a/apps/search_memory_index.cpp
+++ b/apps/search_memory_index.cpp
@@ -9,6 +9,7 @@
 #include <set>
 #include <string.h>
 #include <boost/program_options.hpp>
+#include <random>
 
 #ifndef _WINDOWS
 #include <sys/mman.h>
@@ -18,6 +19,7 @@
 #endif
 
 #include "index.h"
+#include "roaring.h"
 #include "memory_mapper.h"
 #include "utils.h"
 #include "program_options_utils.hpp"
@@ -30,7 +32,10 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
                         const std::string &query_file, const std::string &truthset_file, const uint32_t num_threads,
                         const uint32_t recall_at, const bool print_all_recalls, const std::vector<uint32_t> &Lvec,
                         const bool dynamic, const bool tags, const bool show_qps_per_thread,
-                        const std::vector<std::string> &query_filters, const float fail_if_recall_below)
+                        const std::vector<std::vector<std::vector<std::string>>> &query_filters,
+                        const uint32_t filter_penalty_threshold, const uint32_t bruteforce_threshold,
+                        uint32_t L_for_print, const float fail_if_recall_below,
+                        uint32_t maxN = 10000000, float p1 = 0.1, float p2 = 0.1)
 {
     using TagT = uint32_t;
     // Load the query file
@@ -40,6 +45,9 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
     size_t query_num, query_dim, query_aligned_dim, gt_num, gt_dim;
     diskann::load_aligned_bin<T>(query_file, query, query_num, query_dim, query_aligned_dim);
 
+    std::vector<double> filter_match_time(query_num);
+    std::vector<double> dist_cmp_time(query_num);
+
     bool calc_recall_flag = false;
     if (truthset_file != std::string("null") && file_exists(truthset_file))
     {
@@ -70,6 +78,10 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
 
     const size_t num_frozen_pts = diskann::get_graph_num_frozen_points(index_path);
 
+    std::cout << filter_penalty_threshold << " is value of filter_penalty_threshold at driver file" << std::endl;
+    auto search_params =
+        diskann::IndexSearchParams(*(std::max_element(Lvec.begin(), Lvec.end())), num_threads, filter_penalty_threshold,
+                                   bruteforce_threshold);
     auto config = diskann::IndexConfigBuilder()
                       .with_metric(metric)
                       .with_dimension(query_dim)
@@ -86,6 +98,7 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
                       .is_use_opq(false)
                       .with_num_pq_chunks(0)
                       .with_num_frozen_pts(num_frozen_pts)
+                      .with_index_search_params(search_params)
                       .build();
 
     auto index_factory = diskann::IndexFactory(config);
@@ -110,25 +123,43 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
     else
     {
         std::cout << std::setw(4) << "Ls" << std::setw(12) << qps_title << std::setw(18) << "Avg dist cmps"
-                  << std::setw(20) << "Mean Latency (mus)" << std::setw(15) << "99.9 Latency";
-        table_width += 4 + 12 + 18 + 20 + 15;
+                  << std::setw(20) << "Mean Latency (mus)" << std::setw(15) << "Recall" 
+                  #ifdef INSTRUMENT
+                  << std::setw(20) << "Brute Latency (mus)" << std::setw(20) << "Brute Recall" <<  std::setw(20) << "Graph Latency (mus)" << std::setw(20)
+                  << "Graph Recall" 
+                  #else
+                  << std::endl;
+                  #endif
+        table_width += 4 + 12 + 18 + 20 + 15 
+        #ifdef INSTRUMENT
+        + 20 + 20 + 20 + 20 + 20 + 20;
+        #else
+        ;
+        #endif
     }
-    uint32_t recalls_to_print = 0;
-    const uint32_t first_recall = print_all_recalls ? 1 : recall_at;
-    if (calc_recall_flag)
-    {
-        for (uint32_t curr_recall = first_recall; curr_recall <= recall_at; curr_recall++)
+    /*    uint32_t recalls_to_print = 0;
+        const uint32_t first_recall = print_all_recalls ? 1 : recall_at;
+        if (calc_recall_flag)
         {
-            std::cout << std::setw(12) << ("Recall@" + std::to_string(curr_recall));
-        }
-        recalls_to_print = recall_at + 1 - first_recall;
-        table_width += recalls_to_print * 12;
-    }
+            for (uint32_t curr_recall = first_recall; curr_recall <= recall_at; curr_recall++)
+            {
+                std::cout << std::setw(12) << ("Recall@" + std::to_string(curr_recall));
+            }
+            recalls_to_print = recall_at + 1 - first_recall;
+            table_width += recalls_to_print * 12;
+        } */
     std::cout << std::endl;
     std::cout << std::string(table_width, '=') << std::endl;
 
     std::vector<std::vector<uint32_t>> query_result_ids(Lvec.size());
     std::vector<std::vector<float>> query_result_dists(Lvec.size());
+    std::vector<std::vector<uint32_t>> query_result_class(Lvec.size());
+    std::vector<float> brute_recalls(Lvec.size(), 0);
+    std::vector<float> graph_recalls(Lvec.size(), 0);
+    std::vector<float> brute_lat(Lvec.size(), 0);
+    std::vector<float> graph_lat(Lvec.size(), 0);
+    for (auto &x : query_result_class)
+        x.resize(query_num, 0);
     std::vector<float> latency_stats(query_num, 0);
     std::vector<uint32_t> cmp_stats;
     if (not tags || filtered_search)
@@ -141,36 +172,65 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
     {
         query_result_tags.resize(recall_at * query_num);
     }
-
+     //query_num = 4;
+    //query_num = 1;
     double best_recall = 0.0;
 
     for (uint32_t test_id = 0; test_id < Lvec.size(); test_id++)
     {
+        query_result_class[test_id].resize(query_num, 0);
+        time_to_get_valid = 0;
+        time_to_intersect = 0;
+        time_to_filter_check_and_compare = 0;
+        time_to_detect_penalty = 0;
+        num_brutes = 0;
+        num_graphs = 0;
         uint32_t L = Lvec[test_id];
-        if (L < recall_at)
-        {
-            diskann::cout << "Ignoring search with L:" << L << " since it's smaller than K:" << recall_at << std::endl;
-            continue;
-        }
+        /*        if (L < recall_at)
+                {
+                    diskann::cout << "Ignoring search with L:" << L << " since it's smaller than K:" << recall_at <<
+           std::endl; continue;
+                }*/
 
         query_result_ids[test_id].resize(recall_at * query_num);
         query_result_dists[test_id].resize(recall_at * query_num);
         std::vector<T *> res = std::vector<T *>();
-
+        int method_used = 0;
         auto s = std::chrono::high_resolution_clock::now();
         omp_set_num_threads(num_threads);
 #pragma omp parallel for schedule(dynamic, 1)
         for (int64_t i = 0; i < (int64_t)query_num; i++)
         {
+            curr_query = i;
+/*            std::cout<<"\n\nQuery #" <<i <<"\n*******************\n";
+            for (uint32_t rnr =  0; rnr < L; rnr++) {
+                std::cout<<std::setw(10)<< gt_ids[i*gt_dim+rnr]<<":" << gt_dists[i*gt_dim+rnr]<<"\t";
+            }
+            std::cout<<std::endl;*/
+            if (L_for_print == L)
+                print_qstats = true;
+            else
+                print_qstats = false;
+
             auto qs = std::chrono::high_resolution_clock::now();
             if (filtered_search && !tags)
             {
-                std::string raw_filter = query_filters.size() == 1 ? query_filters[0] : query_filters[i];
+                uint32_t old_b, old_g, old_c;
+                old_b = num_brutes;
+                old_g = num_graphs;
+                method_used = 0;
+                std::vector<std::vector<std::string>> raw_filter = query_filters.size() == 1 ? query_filters[0] : query_filters[i];
 
                 auto retval = index->search_with_filters(query + i * query_aligned_dim, raw_filter, recall_at, L,
                                                          query_result_ids[test_id].data() + i * recall_at,
                                                          query_result_dists[test_id].data() + i * recall_at);
+                if (num_graphs > old_g)
+                    method_used = 1;
+                else
+                    method_used = 0;
                 cmp_stats[i] = retval.second;
+//                                filter_match_time[i] = time_to_get_valid*1000000;
+                //                dist_cmp_time[i] = time_to_compare*1000000;
             }
             else if (metric == diskann::FAST_L2)
             {
@@ -186,10 +246,12 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
                 }
                 else
                 {
-                    std::string raw_filter = query_filters.size() == 1 ? query_filters[0] : query_filters[i];
+                    std::vector<std::string> raw_filter =
+                        query_filters.size() == 1 ? query_filters[0][0] : query_filters[i][0];
 
                     index->search_with_tags(query + i * query_aligned_dim, recall_at, L,
-                                            query_result_tags.data() + i * recall_at, nullptr, res, true, raw_filter);
+                                            query_result_tags.data() + i * recall_at, nullptr, res, true,
+                                            raw_filter[0]);
                 }
 
                 for (int64_t r = 0; r < (int64_t)recall_at; r++)
@@ -207,6 +269,17 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
             auto qe = std::chrono::high_resolution_clock::now();
             std::chrono::duration<double> diff = qe - qs;
             latency_stats[i] = (float)(diff.count() * 1000000);
+            switch (method_used)
+            {
+            case 0:
+                query_result_class[test_id][i] = 0;
+                brute_lat[test_id] += latency_stats[i];
+                break;
+            case 1:
+                query_result_class[test_id][i] = 1;
+                graph_lat[test_id] += latency_stats[i];
+                break;
+            }
         }
         std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
 
@@ -218,8 +291,86 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
         std::vector<double> recalls;
         if (calc_recall_flag)
         {
-            recalls.reserve(recalls_to_print);
-            for (uint32_t curr_recall = first_recall; curr_recall <= recall_at; curr_recall++)
+            if (L == L_for_print)
+            {
+                std::ofstream query_stats_file;
+                query_stats_file.open(result_path_prefix + "_query_stats.txt");
+                query_stats_file << "cmps\tnum correct\tfilt time\tcmp time\tlatency" << std::endl;
+                for (size_t i = 0; i < query_num; i++)
+                {
+                    std::set<uint32_t> gt, res;
+                    uint32_t *gt_vec = gt_ids + gt_dim * i;
+                    uint32_t *res_vec = query_result_ids[test_id].data() + recall_at * i;
+                    size_t tie_breaker = recall_at;
+                    if (gt_dists != nullptr)
+                    {
+                        tie_breaker = recall_at - 1;
+                        float *gt_dist_vec = gt_dists + gt_dim * i;
+                        while (tie_breaker < gt_dim && gt_dist_vec[tie_breaker] == gt_dist_vec[recall_at - 1])
+                            tie_breaker++;
+                    }
+
+                    gt.insert(gt_vec, gt_vec + tie_breaker);
+                    res.insert(res_vec,
+                               res_vec + recall_at); // change to recall_at for recall k@k
+                                                     // or dim_or for k@dim_or
+                    uint32_t cur_recall = 0;
+                    for (auto &v : gt)
+                    {
+                        if (res.find(v) != res.end())
+                        {
+                            cur_recall++;
+                        }
+                    }
+                    query_stats_file << cmp_stats[i] << "\t" << cur_recall << "\t" << filter_match_time[i] << "\t"
+                                     << dist_cmp_time[i] << "\t" << latency_stats[i] << "\t";
+                    for (auto const &r : res)
+                        query_stats_file << r << " ";
+                    query_stats_file << std::endl;
+                }
+                query_stats_file.close();
+            }
+
+            recalls.reserve(1);
+
+            for (size_t i = 0; i < query_num; i++)
+            {
+                std::set<uint32_t> gt, res;
+                uint32_t *gt_vec = gt_ids + gt_dim * i;
+                uint32_t *res_vec = query_result_ids[test_id].data() + recall_at * i;
+                size_t tie_breaker = recall_at;
+                if (gt_dists != nullptr)
+                {
+                    tie_breaker = recall_at - 1;
+                    float *gt_dist_vec = gt_dists + gt_dim * i;
+                    while (tie_breaker < gt_dim && gt_dist_vec[tie_breaker] == gt_dist_vec[recall_at - 1])
+                        tie_breaker++;
+                }
+
+                gt.insert(gt_vec, gt_vec + tie_breaker);
+                res.insert(res_vec,
+                           res_vec + recall_at); // change to recall_at for recall k@k
+                                                 // or dim_or for k@dim_or
+                uint32_t cur_recall = 0;
+                for (auto &v : gt)
+                {
+                    if (res.find(v) != res.end())
+                    {
+                        cur_recall++;
+                    }
+                }
+                switch (query_result_class[test_id][i])
+                {
+                case 0:
+                    brute_recalls[test_id] += cur_recall;
+                    break;
+                case 1:
+                    graph_recalls[test_id] += cur_recall;
+                    break;
+                }
+            }
+
+            for (uint32_t curr_recall = recall_at; curr_recall <= recall_at; curr_recall++)
             {
                 recalls.push_back(diskann::calculate_recall((uint32_t)query_num, gt_ids, gt_dists, (uint32_t)gt_dim,
                                                             query_result_ids[test_id].data(), recall_at, curr_recall));
@@ -240,16 +391,24 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
         else
         {
             std::cout << std::setw(4) << L << std::setw(12) << displayed_qps << std::setw(18) << avg_cmps
-                      << std::setw(20) << (float)mean_latency << std::setw(15)
-                      << (float)latency_stats[(uint64_t)(0.999 * query_num)];
-        }
-        for (double recall : recalls)
-        {
-            std::cout << std::setw(12) << recall;
-            best_recall = std::max(recall, best_recall);
+                      << std::setw(20) << (float)mean_latency << std::setw(15) << (float)recalls[0] 
+                      #ifdef INSTRUMENT
+                      << std::setw(20) << (float)(brute_lat[test_id] * 1.0) / (num_brutes * 1.0) << std::setw(20)
+                      << (float)(brute_recalls[test_id] * 100.0) / (num_brutes * recall_at * 1.0) << std::setw(20)
+                      << (float)(graph_lat[test_id] * 1.0) / (num_graphs * 1.0) << std::setw(20)
+                      << (float)(graph_recalls[test_id] * 100.0) / (num_graphs * recall_at * 1.0) << " " << (1000000*time_to_detect_penalty) / query_num << "\t" << (1000000*time_to_get_valid) / query_num 
+                      //                      << std::setw(20) << (float)(brute_lat[test_id]*1.0) << std::setw(20) <<
+                      //                      (float)(brute_recalls[test_id]*100.0)
+                      //                     << std::setw(20) << (float)(graph_lat[test_id]*1.0) << std::setw(20) <<
+                      //                     (float)(graph_recalls[test_id]*100.0)
+                      << std::endl;
+                      #else
+                      << std::endl;
+                      #endif
         }
-        std::cout << std::endl;
     }
+    std::cout << "num_graphs " << num_graphs << std::endl;
+    std::cout << "num_brutes " << num_brutes << std::endl;
 
     std::cout << "Done searching. Now saving results " << std::endl;
     uint64_t test_id = 0;
@@ -279,11 +438,14 @@ int main(int argc, char **argv)
 {
     std::string data_type, dist_fn, index_path_prefix, result_path, query_file, gt_file, filter_label, label_type,
         query_filters_file;
-    uint32_t num_threads, K;
+    uint32_t num_threads, K, filter_penalty_threshold, bruteforce_threshold, L_for_print, num_local;
     std::vector<uint32_t> Lvec;
-    bool print_all_recalls, dynamic, tags, show_qps_per_thread;
+    bool print_all_recalls, dynamic, tags, show_qps_per_thread, global_start;
     float fail_if_recall_below = 0.0f;
 
+    uint32_t maxN;
+    float p1, p2;
+
     po::options_description desc{
         program_options_utils::make_program_description("search_memory_index", "Searches in-memory DiskANN indexes")};
     try
@@ -316,6 +478,23 @@ int main(int argc, char **argv)
         optional_configs.add_options()("query_filters_file",
                                        po::value<std::string>(&query_filters_file)->default_value(std::string("")),
                                        program_options_utils::FILTERS_FILE_DESCRIPTION);
+        optional_configs.add_options()("filter_penalty_threshold",
+                                       po::value<uint32_t>(&filter_penalty_threshold)->default_value(0),
+                                       "What penalty threshold to tolerate for multiple filter search");
+        optional_configs.add_options()("bruteforce_threshold",
+                                       po::value<uint32_t>(&bruteforce_threshold)->default_value(0),
+                                       "Threshold under which we bruteforce the filtered search");
+        optional_configs.add_options()("use_global_start",
+                                       po::value<bool>(&global_start)->default_value(false),
+                                       "Whether or not to use global start or predicate-aware starting point in graph search");
+        optional_configs.add_options()("expand_two_hops",
+                                       po::value<bool>(&expand_two_hops)->default_value(false),
+                                       "Whether or not to use ACORN-like idea of two hops at search");                                       
+        optional_configs.add_options()("num_local_start",
+                                       po::value<uint32_t>(&num_local)->default_value(0),
+                                       "How many local start points to use");
+
+
         optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
                                        program_options_utils::LABEL_TYPE_DESCRIPTION);
         optional_configs.add_options()("gt_file", po::value<std::string>(&gt_file)->default_value(std::string("null")),
@@ -323,6 +502,9 @@ int main(int argc, char **argv)
         optional_configs.add_options()("num_threads,T",
                                        po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
                                        program_options_utils::NUMBER_THREADS_DESCRIPTION);
+        optional_configs.add_options()(
+            "L_to_print", po::value<uint32_t>(&L_for_print)->default_value(0),
+            "Which of the given L's to provide query statistics for (written to index_path + \"_query_stats.txt\")");
         optional_configs.add_options()(
             "dynamic", po::value<bool>(&dynamic)->default_value(false),
             "Whether the index is dynamic. Dynamic indices must have associated tags.  Default false.");
@@ -332,6 +514,10 @@ int main(int argc, char **argv)
                                        po::value<float>(&fail_if_recall_below)->default_value(0.0f),
                                        program_options_utils::FAIL_IF_RECALL_BELOW);
 
+        optional_configs.add_options()("maxN", po::value<uint32_t>(&maxN)->default_value(10000000), "maxN");
+        optional_configs.add_options()("p1", po::value<float>(&p1)->default_value(0.1), "p1");
+        optional_configs.add_options()("p2", po::value<float>(&p2)->default_value(0.1), "p2");
+
         // Output controls
         po::options_description output_controls("Output controls");
         output_controls.add_options()("print_all_recalls", po::bool_switch(&print_all_recalls),
@@ -403,16 +589,32 @@ int main(int argc, char **argv)
         return -1;
     }
 
-    std::vector<std::string> query_filters;
+    std::vector<std::vector<std::vector<std::string>>> query_filters;
     if (filter_label != "")
     {
-        query_filters.push_back(filter_label);
+        std::vector<std::vector<std::string>> single_filter;
+        std::vector<std::string> tmp;
+        tmp.push_back(filter_label);
+        single_filter.push_back(tmp);
+        query_filters.push_back(single_filter);
     }
     else if (query_filters_file != "")
     {
-        query_filters = read_file_to_vector_of_strings(query_filters_file);
+        query_filters = read_file_to_vector_of_vector_of_strings(query_filters_file);
+        for (auto &x : query_filters[0]) {
+            std::cout<<"(";
+            for (auto &y : x) {
+                std::cout<<y<<"|";
+            }
+            std::cout<<")&";
+        }
     }
 
+    use_global_start = global_start;
+    num_start_points = num_local;
+
+    std::cout<<"Num local start points: " << num_start_points << std::endl;
+
     try
     {
         if (!query_filters.empty() && label_type == "ushort")
@@ -421,19 +623,22 @@ int main(int argc, char **argv)
             {
                 return search_memory_index<int8_t, uint16_t>(
                     metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
-                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, fail_if_recall_below);
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below);
             }
             else if (data_type == std::string("uint8"))
             {
                 return search_memory_index<uint8_t, uint16_t>(
                     metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
-                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, fail_if_recall_below);
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below);
             }
             else if (data_type == std::string("float"))
             {
-                return search_memory_index<float, uint16_t>(metric, index_path_prefix, result_path, query_file, gt_file,
-                                                            num_threads, K, print_all_recalls, Lvec, dynamic, tags,
-                                                            show_qps_per_thread, query_filters, fail_if_recall_below);
+                return search_memory_index<float, uint16_t>(
+                    metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below);
             }
             else
             {
@@ -445,21 +650,24 @@ int main(int argc, char **argv)
         {
             if (data_type == std::string("int8"))
             {
-                return search_memory_index<int8_t>(metric, index_path_prefix, result_path, query_file, gt_file,
-                                                   num_threads, K, print_all_recalls, Lvec, dynamic, tags,
-                                                   show_qps_per_thread, query_filters, fail_if_recall_below);
+                return search_memory_index<int8_t>(
+                    metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below, maxN, p1, p2);
             }
             else if (data_type == std::string("uint8"))
             {
-                return search_memory_index<uint8_t>(metric, index_path_prefix, result_path, query_file, gt_file,
-                                                    num_threads, K, print_all_recalls, Lvec, dynamic, tags,
-                                                    show_qps_per_thread, query_filters, fail_if_recall_below);
+                return search_memory_index<uint8_t>(
+                    metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below);
             }
             else if (data_type == std::string("float"))
             {
-                return search_memory_index<float>(metric, index_path_prefix, result_path, query_file, gt_file,
-                                                  num_threads, K, print_all_recalls, Lvec, dynamic, tags,
-                                                  show_qps_per_thread, query_filters, fail_if_recall_below);
+                return search_memory_index<float>(
+                    metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls,
+                    Lvec, dynamic, tags, show_qps_per_thread, query_filters, filter_penalty_threshold,
+                    bruteforce_threshold, L_for_print, fail_if_recall_below);
             }
             else
             {
diff --git a/apps/utils/CMakeLists.txt b/apps/utils/CMakeLists.txt
index 3b8cf223c..dc5c0fb06 100644
--- a/apps/utils/CMakeLists.txt
+++ b/apps/utils/CMakeLists.txt
@@ -5,6 +5,9 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_COMPILE_WARNING_AS_ERROR ON)
 
 
+add_executable(bitmap_vs_list bitmap_vs_list.cpp)
+target_link_libraries(bitmap_vs_list ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
+
 add_executable(fvecs_to_bin fvecs_to_bin.cpp)
 
 add_executable(fvecs_to_bvecs fvecs_to_bvecs.cpp)
@@ -51,9 +54,13 @@ add_executable(compute_groundtruth compute_groundtruth.cpp)
 target_include_directories(compute_groundtruth PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
 target_link_libraries(compute_groundtruth ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
 
-add_executable(compute_groundtruth_for_filters compute_groundtruth_for_filters.cpp)
-target_include_directories(compute_groundtruth_for_filters PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
-target_link_libraries(compute_groundtruth_for_filters ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
+add_executable(compute_filtered_groundtruth compute_filtered_groundtruth.cpp)
+target_include_directories(compute_filtered_groundtruth PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
+target_link_libraries(compute_filtered_groundtruth ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
+
+add_executable(get_false_positive_rate get_false_positive_rate.cpp)
+target_include_directories(get_false_positive_rate PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
+target_link_libraries(get_false_positive_rate ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
 
 
 add_executable(generate_pq generate_pq.cpp)
@@ -97,7 +104,7 @@ if (NOT MSVC)
             simulate_aggregate_recall
             calculate_recall
             compute_groundtruth
-            compute_groundtruth_for_filters
+            compute_filtered_groundtruth
             generate_pq
             partition_data
             partition_with_ram_budget
@@ -107,4 +114,4 @@ if (NOT MSVC)
             stats_label_data
             RUNTIME
     )
-endif()
\ No newline at end of file
+endif()
diff --git a/apps/utils/bitmap_vs_list.cpp b/apps/utils/bitmap_vs_list.cpp
new file mode 100644
index 000000000..b3ba386d8
--- /dev/null
+++ b/apps/utils/bitmap_vs_list.cpp
@@ -0,0 +1,201 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <cstring>
+#include <iomanip>
+#include <algorithm>
+#include <numeric>
+#include <omp.h>
+#include <set>
+#include <string.h>
+#include <boost/program_options.hpp>
+#include <random>
+
+#ifndef _WINDOWS
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+#endif
+
+#include "id_list.h"
+#include "index.h"
+#include "roaring.h"
+#include "memory_mapper.h"
+#include "utils.h"
+#include "program_options_utils.hpp"
+#include "index_factory.h"
+
+namespace po = boost::program_options;
+
+int main(int argc, char **argv)
+{
+
+    uint32_t maxN;
+    float p1, p2;
+
+    po::options_description desc{
+        program_options_utils::make_program_description("search_memory_index", "Searches in-memory DiskANN indexes")};
+    try
+    {
+        desc.add_options()("help,h", "Print this information on arguments");
+
+        // Required parameters
+        po::options_description required_configs("Required");
+        po::options_description optional_configs("Optional");
+
+        required_configs.add_options()("maxN", po::value<uint32_t>(&maxN)->default_value(10000000), "maxN");
+        required_configs.add_options()("p1", po::value<float>(&p1)->default_value(0.1), "p1");
+        required_configs.add_options()("p2", po::value<float>(&p2)->default_value(0.1), "p2");
+
+        // Merge required and optional parameters
+        desc.add(required_configs).add(optional_configs);
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+
+    std::random_device rd;  // Will be used to obtain a seed for the random number engine
+    std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
+    std::uniform_real_distribution<> dis(0, 1.0);
+
+    //    uint32_t maxN = 10000000;
+    //    float p1_val = 0.1;
+    //    float p2_val = 0.1;
+    std::vector<uint32_t> v1;
+    std::vector<uint32_t> v2;
+    for (uint32_t i = 0; i < maxN; i++)
+    {
+        float tau = dis(gen);
+        if (tau < p1)
+            v1.push_back(i);
+    }
+    for (uint32_t i = 0; i < maxN; i++)
+    {
+        float tau = dis(gen);
+        if (tau < p2)
+            v2.push_back(i);
+    }
+
+    std::cout << "sizes of v1, v2: " << v1.size() << " " << v2.size() << std::endl;
+
+    roaring_bitmap_t *r1 = roaring_bitmap_create();
+    roaring_bitmap_t *r2 = roaring_bitmap_create();
+    for (auto &x : v1)
+        roaring_bitmap_add(r1, x);
+    for (auto &x : v2)
+        roaring_bitmap_add(r2, x);
+
+    diskann::RoaringIdList dr1, dr2;
+    for (auto &x : v1)
+        dr1.add(x);
+    for (auto &x : v2)
+        dr2.add(x);
+
+    auto s = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < 100; i++)
+    {
+        //        auto intersect = roaring_bitmap_create();
+        auto intersect = roaring_bitmap_and(r1, r2);
+        //        if (i == 99) std::cout<<roaring_bitmap_get_cardinality(intersect) << " is the size of intersection
+        //        computed using roaring." << std::endl;
+        roaring_bitmap_free(intersect);
+    }
+
+    std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "set intersection computation using roaring bitmap time:" << diff.count() / 100 << std::endl;
+
+    s = std::chrono::high_resolution_clock::now();
+    dr1.intersect_list(dr2);
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << dr1.size() << " is the size of intersection computed using diskann::roaring." << std::endl;
+    std::cout << "set intersection computation using diskann::roaring bitmap time:" << diff.count() << std::endl;
+
+    s = std::chrono::high_resolution_clock::now();
+    uint64_t count = 0;
+    //    for (int i = 0; i < 100; i++) {
+    for (auto &y : v2)
+    {
+        bool flag = roaring_bitmap_contains(r1, y);
+        if (flag)
+            count++;
+    }
+    //   }
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "found " << count << " matches of set 2 in set 1" << std::endl;
+    std::cout << "membership lookup time using roaring:" << diff.count() / (v2.size()) << std::endl;
+
+    s = std::chrono::high_resolution_clock::now();
+    //   diff = std::chrono::high_resolution_clock::now() - s;
+    //    for (int i = 0; i < 100; i++) {
+    auto intersect = roaring_bitmap_create();
+    for (auto &y : v2)
+    {
+        bool flag = roaring_bitmap_contains(r1, y);
+        if (flag)
+            roaring_bitmap_add(intersect, y);
+    }
+    roaring_bitmap_free(intersect);
+    //    }
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "set intersection computation using roaring lookups time:" << diff.count() << std::endl;
+
+    tsl::robin_set<uint32_t> a;
+    for (auto &x : v1)
+        a.insert(x);
+
+    s = std::chrono::high_resolution_clock::now();
+    count = 0;
+    //    for (int i = 0; i < 100; i++) {
+    for (auto &y : v2)
+    {
+        bool flag = a.find(y) != a.end() ? true : false;
+        if (flag)
+            count++;
+    }
+    //    }
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "intersection count per robin_set: " << count << std::endl;
+    std::cout << "tsl robin membership check time:" << diff.count() / (v2.size()) << std::endl;
+
+    s = std::chrono::high_resolution_clock::now();
+    //    for (int i = 0; i < 100; i++) {
+    tsl::robin_set<uint32_t> robin_intersect;
+    for (auto &y : v2)
+    {
+        bool flag = a.find(y) != a.end() ? true : false;
+        if (flag)
+            robin_intersect.insert(y);
+    }
+    //            if (i == 99)
+    std::cout << robin_intersect.size() << std::endl;
+    //    }
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "intersection computation time via iterative tsl lookups:" << diff.count() << std::endl;
+
+    s = std::chrono::high_resolution_clock::now();
+    //    for (int i = 0; i < 100; i++)
+    {
+        std::vector<uint32_t> common_filters;
+        std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(common_filters));
+        //      if (i==99)
+        std::cout << common_filters.size() << std::endl;
+    }
+    diff = std::chrono::high_resolution_clock::now() - s;
+    std::cout << "intersection computation using std set intersection time:" << diff.count() << std::endl;
+
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+    return 0;
+}
diff --git a/apps/utils/calculate_recall.cpp b/apps/utils/calculate_recall.cpp
index dc76252cc..8a78863d1 100644
--- a/apps/utils/calculate_recall.cpp
+++ b/apps/utils/calculate_recall.cpp
@@ -14,9 +14,9 @@
 
 int main(int argc, char **argv)
 {
-    if (argc != 4)
+    if (argc < 4 || argc > 5)
     {
-        std::cout << argv[0] << " <ground_truth_bin> <our_results_bin>  <r> " << std::endl;
+        std::cout << argv[0] << " <ground_truth_bin> <our_results_bin>  <r1> <r2> {for r1-recall@r2} " << std::endl;
         return -1;
     }
     uint32_t *gold_std = NULL;
@@ -39,16 +39,19 @@ int main(int argc, char **argv)
     points_num = points_num_gs;
 
     uint32_t recall_at = std::atoi(argv[3]);
+    uint32_t r2 = recall_at;
+    if (argc == 5)
+        r2 = std::atoi(argv[4]);
 
-    if ((dim_or < recall_at) || (recall_at > dim_gs))
+    if ((dim_or < r2) || (recall_at > dim_gs))
     {
         std::cout << "ground truth has size " << dim_gs << "; our set has " << dim_or << " points. Asking for recall "
-                  << recall_at << std::endl;
+                  << recall_at << "@" << r2 << ". Fix the mismatch in requirements." << std::endl;
         return -1;
     }
-    std::cout << "Calculating recall@" << recall_at << std::endl;
+    std::cout << "Calculating " << recall_at << "-recall@" << r2 << std::endl;
     double recall_val = diskann::calculate_recall((uint32_t)points_num, gold_std, gs_dist, (uint32_t)dim_gs,
-                                                  our_results, (uint32_t)dim_or, (uint32_t)recall_at);
+                                                  our_results, (uint32_t)dim_or, (uint32_t)recall_at, (uint32_t)r2);
 
     //  double avg_recall = (recall*1.0)/(points_num*1.0);
     std::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n";
diff --git a/apps/utils/compute_filtered_groundtruth.cpp b/apps/utils/compute_filtered_groundtruth.cpp
new file mode 100644
index 000000000..13a47e207
--- /dev/null
+++ b/apps/utils/compute_filtered_groundtruth.cpp
@@ -0,0 +1,784 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <chrono>
+#include <vector>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <queue>
+#include <omp.h>
+#include <mkl.h>
+#include <boost/program_options.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <unordered_map>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
+
+#ifdef _WINDOWS
+#include <malloc.h>
+#else
+#include <stdlib.h>
+#endif
+#include "filter_utils.h"
+#include "utils.h"
+
+// WORKS FOR UPTO 2 BILLION POINTS (as we use INT INSTEAD OF UNSIGNED)
+
+#define PARTSIZE 10000000
+#define ALIGNMENT 512
+
+// custom types (for readability)
+typedef tsl::robin_set<std::string> label_set;
+typedef std::string path;
+
+namespace po = boost::program_options;
+
+template <class T> T div_round_up(const T numerator, const T denominator)
+{
+    return (numerator % denominator == 0) ? (numerator / denominator) : 1 + (numerator / denominator);
+}
+
+using pairIF = std::pair<size_t, float>;
+struct cmpmaxstruct
+{
+    bool operator()(const pairIF &l, const pairIF &r)
+    {
+        return l.second < r.second;
+    };
+};
+
+using maxPQIFCS = std::priority_queue<pairIF, std::vector<pairIF>, cmpmaxstruct>;
+
+template <class T> T *aligned_malloc(const size_t n, const size_t alignment)
+{
+#ifdef _WINDOWS
+    return (T *)_aligned_malloc(sizeof(T) * n, alignment);
+#else
+    return static_cast<T *>(aligned_alloc(alignment, sizeof(T) * n));
+#endif
+}
+
+inline bool custom_dist(const std::pair<uint32_t, float> &a, const std::pair<uint32_t, float> &b)
+{
+    return a.second < b.second;
+}
+
+void compute_l2sq(float *const points_l2sq, const float *const matrix, const int64_t num_points, const uint64_t dim)
+{
+    assert(points_l2sq != NULL);
+#pragma omp parallel for schedule(static, 65536)
+    for (int64_t d = 0; d < num_points; ++d)
+        points_l2sq[d] = cblas_sdot((int64_t)dim, matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1,
+                                    matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1);
+}
+
+void distsq_to_points(const size_t dim,
+                      float *dist_matrix, // Col Major, cols are queries, rows are points
+                      size_t npoints, const float *const points,
+                      const float *const points_l2sq, // points in Col major
+                      size_t nqueries, const float *const queries,
+                      const float *const queries_l2sq, // queries in Col major
+                      float *ones_vec = NULL)          // Scratchspace of num_data size and init to 1.0
+{
+    bool ones_vec_alloc = false;
+    if (ones_vec == NULL)
+    {
+        ones_vec = new float[nqueries > npoints ? nqueries : npoints];
+        std::fill_n(ones_vec, nqueries > npoints ? nqueries : npoints, (float)1.0);
+        ones_vec_alloc = true;
+    }
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-2.0, points, dim, queries, dim,
+                (float)0.0, dist_matrix, npoints);
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, points_l2sq, npoints,
+                ones_vec, nqueries, (float)1.0, dist_matrix, npoints);
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, ones_vec, npoints,
+                queries_l2sq, nqueries, (float)1.0, dist_matrix, npoints);
+    if (ones_vec_alloc)
+        delete[] ones_vec;
+}
+
+void inner_prod_to_points(const size_t dim,
+                          float *dist_matrix, // Col Major, cols are queries, rows are points
+                          size_t npoints, const float *const points, size_t nqueries, const float *const queries,
+                          float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0
+{
+    bool ones_vec_alloc = false;
+    if (ones_vec == NULL)
+    {
+        ones_vec = new float[nqueries > npoints ? nqueries : npoints];
+        std::fill_n(ones_vec, nqueries > npoints ? nqueries : npoints, (float)1.0);
+        ones_vec_alloc = true;
+    }
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-1.0, points, dim, queries, dim,
+                (float)0.0, dist_matrix, npoints);
+
+    if (ones_vec_alloc)
+        delete[] ones_vec;
+}
+
+void exact_knn(const size_t dim, const size_t k,
+               size_t *const closest_points,     // k * num_queries preallocated, col
+                                                 // major, queries columns
+               float *const dist_closest_points, // k * num_queries
+                                                 // preallocated, Dist to
+                                                 // corresponding closes_points
+               size_t npoints,
+               float *points_in, // points in Col major
+               size_t nqueries, float *queries_in, diskann::Metric metric,
+               std::vector<boost::dynamic_bitset<>> &matching_points) // queries in Col major
+{
+    float *points_l2sq = new float[npoints];
+    float *queries_l2sq = new float[nqueries];
+    compute_l2sq(points_l2sq, points_in, npoints, dim);
+    compute_l2sq(queries_l2sq, queries_in, nqueries, dim);
+
+    float *points = points_in;
+    float *queries = queries_in;
+
+    if (metric == diskann::Metric::COSINE)
+    { // we convert cosine distance as
+      // normalized L2 distnace
+        points = new float[npoints * dim];
+        queries = new float[nqueries * dim];
+#pragma omp parallel for schedule(static, 4096)
+        for (int64_t i = 0; i < (int64_t)npoints; i++)
+        {
+            float norm = std::sqrt(points_l2sq[i]);
+            if (norm == 0)
+            {
+                norm = std::numeric_limits<float>::epsilon();
+            }
+            for (uint32_t j = 0; j < dim; j++)
+            {
+                points[i * dim + j] = points_in[i * dim + j] / norm;
+            }
+        }
+
+#pragma omp parallel for schedule(static, 4096)
+        for (int64_t i = 0; i < (int64_t)nqueries; i++)
+        {
+            float norm = std::sqrt(queries_l2sq[i]);
+            if (norm == 0)
+            {
+                norm = std::numeric_limits<float>::epsilon();
+            }
+            for (uint32_t j = 0; j < dim; j++)
+            {
+                queries[i * dim + j] = queries_in[i * dim + j] / norm;
+            }
+        }
+        // recalculate norms after normalizing, they should all be one.
+        compute_l2sq(points_l2sq, points, npoints, dim);
+        compute_l2sq(queries_l2sq, queries, nqueries, dim);
+    }
+
+    std::cout << "Going to compute " << k << " NNs for " << nqueries << " queries over " << npoints << " points in "
+              << dim << " dimensions using";
+    if (metric == diskann::Metric::INNER_PRODUCT)
+        std::cout << " MIPS ";
+    else if (metric == diskann::Metric::COSINE)
+        std::cout << " Cosine ";
+    else
+        std::cout << " L2 ";
+    std::cout << "distance fn. " << std::endl;
+
+    size_t q_batch_size = (1 << 9);
+    float *dist_matrix = new float[(size_t)q_batch_size * (size_t)npoints];
+
+    for (size_t b = 0; b < div_round_up(nqueries, q_batch_size); ++b)
+    {
+        int64_t q_b = b * q_batch_size;
+        int64_t q_e = ((b + 1) * q_batch_size > nqueries) ? nqueries : (b + 1) * q_batch_size;
+
+        if (metric == diskann::Metric::L2 || metric == diskann::Metric::COSINE)
+        {
+            distsq_to_points(dim, dist_matrix, npoints, points, points_l2sq, q_e - q_b,
+                             queries + (ptrdiff_t)q_b * (ptrdiff_t)dim, queries_l2sq + q_b);
+        }
+        else
+        {
+            inner_prod_to_points(dim, dist_matrix, npoints, points, q_e - q_b,
+                                 queries + (ptrdiff_t)q_b * (ptrdiff_t)dim);
+        }
+        std::cout << "Computed distances for queries: [" << q_b << "," << q_e << ")" << std::endl;
+
+#pragma omp parallel for schedule(dynamic, 16)
+        for (long long q = q_b; q < q_e; q++)
+        {
+            maxPQIFCS point_dist;
+            //            for (size_t p = 0; p < k; p++) {
+            //                if (matching_points[q][p] == true)
+            //                point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) *
+            //                (ptrdiff_t)npoints]);
+            //            }
+            for (size_t p = 0; p < npoints; p++)
+            {
+                if (matching_points[q][p] == false)
+                    continue;
+                if (point_dist.size() < k ||
+                    point_dist.top().second > dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints])
+                    point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]);
+                if (point_dist.size() > k)
+                    point_dist.pop();
+            }
+            //            for (ptrdiff_t l = 0; l < (ptrdiff_t)k; ++l)
+            ptrdiff_t l = 0;
+            while (point_dist.size() > 0)
+            {
+                closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().first;
+                dist_closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().second;
+                point_dist.pop();
+                l++;
+            }
+            while (l < k)
+            {
+                closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] =
+                    std::numeric_limits<uint64_t>::max();
+                dist_closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] =
+                    std::numeric_limits<float>::max();
+                l++;
+            }
+//            assert(std::is_sorted(dist_closest_points + (ptrdiff_t)q * (ptrdiff_t)k,
+  //                                dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t)k));
+        }
+        std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e << ")" << std::endl;
+    }
+
+    delete[] dist_matrix;
+
+    delete[] points_l2sq;
+    delete[] queries_l2sq;
+
+    if (metric == diskann::Metric::COSINE)
+    {
+        delete[] points;
+        delete[] queries;
+    }
+}
+
+template <typename T> inline int get_num_parts(const char *filename)
+{
+    std::ifstream reader;
+    reader.exceptions(std::ios::failbit | std::ios::badbit);
+    reader.open(filename, std::ios::binary);
+    std::cout << "Reading bin file " << filename << " ...\n";
+    int npts_i32, ndims_i32;
+    reader.read((char *)&npts_i32, sizeof(int));
+    reader.read((char *)&ndims_i32, sizeof(int));
+    std::cout << "#pts = " << npts_i32 << ", #dims = " << ndims_i32 << std::endl;
+    reader.close();
+    uint32_t num_parts =
+        (npts_i32 % PARTSIZE) == 0 ? npts_i32 / PARTSIZE : (uint32_t)std::floor(npts_i32 / PARTSIZE) + 1;
+    std::cout << "Number of parts: " << num_parts << std::endl;
+    return num_parts;
+}
+
+template <typename T>
+inline void load_bin_as_float(const char *filename, float *&data, size_t &npts, size_t &ndims, int part_num)
+{
+    std::ifstream reader;
+    reader.exceptions(std::ios::failbit | std::ios::badbit);
+    reader.open(filename, std::ios::binary);
+    std::cout << "Reading bin file " << filename << " ...\n";
+    int npts_i32, ndims_i32;
+    reader.read((char *)&npts_i32, sizeof(int));
+    reader.read((char *)&ndims_i32, sizeof(int));
+    uint64_t start_id = part_num * PARTSIZE;
+    uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32);
+    npts = end_id - start_id;
+    ndims = (uint64_t)ndims_i32;
+    std::cout << "#pts in part = " << npts << ", #dims = " << ndims << ", size = " << npts * ndims * sizeof(T) << "B"
+              << std::endl;
+
+    reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg);
+    T *data_T = new T[npts * ndims];
+    reader.read((char *)data_T, sizeof(T) * npts * ndims);
+    std::cout << "Finished reading part of the bin file." << std::endl;
+    reader.close();
+    data = aligned_malloc<float>(npts * ndims, ALIGNMENT);
+#pragma omp parallel for schedule(dynamic, 32768)
+    for (int64_t i = 0; i < (int64_t)npts; i++)
+    {
+        for (int64_t j = 0; j < (int64_t)ndims; j++)
+        {
+            float cur_val_float = (float)data_T[i * ndims + j];
+            std::memcpy((char *)(data + i * ndims + j), (char *)&cur_val_float, sizeof(float));
+        }
+    }
+    delete[] data_T;
+    std::cout << "Finished converting part data to float." << std::endl;
+}
+
+template <typename T> inline void save_bin(const std::string filename, T *data, size_t npts, size_t ndims)
+{
+    std::ofstream writer;
+    writer.exceptions(std::ios::failbit | std::ios::badbit);
+    writer.open(filename, std::ios::binary | std::ios::out);
+    std::cout << "Writing bin: " << filename << "\n";
+    int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
+    writer.write((char *)&npts_i32, sizeof(int));
+    writer.write((char *)&ndims_i32, sizeof(int));
+    std::cout << "bin: #pts = " << npts << ", #dims = " << ndims
+              << ", size = " << npts * ndims * sizeof(T) + 2 * sizeof(int) << "B" << std::endl;
+
+    writer.write((char *)data, npts * ndims * sizeof(T));
+    writer.close();
+    std::cout << "Finished writing bin" << std::endl;
+}
+
+inline void save_groundtruth_as_one_file(const std::string filename, int32_t *data, float *distances, size_t npts,
+                                         size_t ndims)
+{
+    std::ofstream writer(filename, std::ios::binary | std::ios::out);
+    int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
+    writer.write((char *)&npts_i32, sizeof(int));
+    writer.write((char *)&ndims_i32, sizeof(int));
+    std::cout << "Saving truthset in one file (npts, dim, npts*dim id-matrix, "
+                 "npts*dim dist-matrix) with npts = "
+              << npts << ", dim = " << ndims << ", size = " << 2 * npts * ndims * sizeof(uint32_t) + 2 * sizeof(int)
+              << "B" << std::endl;
+
+    writer.write((char *)data, npts * ndims * sizeof(uint32_t));
+    writer.write((char *)distances, npts * ndims * sizeof(float));
+    writer.close();
+    std::cout << "Finished writing truthset" << std::endl;
+}
+
+inline void parse_base_label_file(const std::string &map_file, std::vector<tsl::robin_set<std::string>> &pts_to_labels,
+                                  uint32_t start_id = 0)
+{
+    pts_to_labels.clear();
+    std::ifstream infile(map_file);
+    std::string line, token;
+    std::set<std::string> labels;
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    uint32_t line_no = 0;
+    while (std::getline(infile, line))
+    {
+        if (line_no < start_id)
+        {
+            line_no++;
+            continue;
+        }
+        line_no++;
+        std::istringstream iss(line);
+        tsl::robin_set<std::string> lbls;
+
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, ','))
+        {
+            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+            lbls.insert(token);
+            labels.insert(token);
+        }
+        //        std::sort(lbls.begin(), lbls.end());
+        pts_to_labels.push_back(lbls);
+        if (pts_to_labels.size() >= PARTSIZE)
+            break;
+    }
+    std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for "
+              << pts_to_labels.size() << " points" << std::endl;
+}
+
+// outer vector is # queries,  inner vector is size of the AND predicate
+inline void parse_query_label_file(const std::string &query_label_file,
+                                   std::vector<std::vector<std::vector<std::string>>> &query_labels)
+{
+    query_labels.clear();
+    std::ifstream infile(query_label_file);
+    std::string line, token;
+    std::set<std::string> labels;
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    uint32_t line_cnt = 0;
+    bool print_flag = true;
+    while (std::getline(infile, line))
+    {
+        std::istringstream iss(line);
+        std::vector<std::vector<std::string>> lbls(0);
+
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, '&'))
+        {
+            std::vector<std::string> or_clause(0);
+            std::istringstream inner_iss(token);
+            while (getline(inner_iss, token, '|'))
+            {
+//                if (print_flag)
+//                    std::cout<<token<<" || ";
+                token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+                token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+                or_clause.push_back(token);
+                labels.insert(token);
+            }
+            if (print_flag)
+                std::cout<<" && ";
+            lbls.push_back(or_clause);
+        }
+        //        std::sort(lbls.begin(), lbls.end());
+        query_labels.push_back(lbls);
+        line_cnt++;
+        if (line_cnt>10)
+                print_flag = false;
+    }
+    std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for "
+              << query_labels.size() << " queries" << std::endl;
+}
+
+void print_query_stats(std::vector<std::pair<uint32_t, uint32_t>> &v)
+{
+
+    std::sort(v.begin(), v.end(), [](const std::pair<uint32_t, uint32_t> &a, const std::pair<uint32_t, uint32_t> &b) {
+        return a.second < b.second;
+    });
+
+    for (uint32_t pct = 0; pct < 100; pct += 1)
+    {
+        std::cout << v[(v.size() * pct * 1.0) / 100].second << " is pass-rate of query with percentile " << pct
+                  << std::endl;
+    }
+    std::cout<<"\n Top 10 pass-rates" << std::endl;
+    for (uint32_t i = 0; i < 10; i += 1)
+    {
+        if (i == v.size())
+            break;
+        std::cout << v[v.size() - (i + 1)].second << " is pass-rate of query of rank " << (i+1)<< std::endl;
+    }
+
+    return;
+}
+
+// template<typename A, typename B>
+// add UNIVERSAL LABEL SUPPORT
+int identify_matching_points(const std::string &base, const size_t start_id, const std::string &query,
+                             const std::string &unv_label, std::vector<boost::dynamic_bitset<>> &matching_points,
+                             std::vector<std::pair<uint32_t, uint32_t>> &query_stats)
+{
+    std::vector<tsl::robin_set<std::string>> base_labels;
+    std::vector<std::vector<std::vector<std::string>>> query_labels;
+    parse_base_label_file(base, base_labels, start_id);
+    parse_query_label_file(query, query_labels);
+    matching_points.clear();
+    uint32_t num_query = query_labels.size();
+    uint32_t num_base = base_labels.size();
+    matching_points.resize(num_query);
+    for (auto &x : matching_points)
+        x.resize(num_base);
+    std::cout << "Starting to identify matching points " << std::endl;
+    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
+
+#pragma omp parallel for schedule(dynamic, 128)
+    for (uint32_t i = 0; i < num_query; i++)
+    {
+        //        if (i % 100 == 0)
+        //        std::cout<<"."<< std::flush;
+        //        tsl::robin_set<uint32_t> matches;
+        for (uint32_t j = 0; j < num_base; j++)
+        {
+            bool pass = true;
+            if (unv_label.empty() || (base_labels[j].find(unv_label) == base_labels[j].end()))
+            {
+                for (uint32_t k = 0; k < query_labels[i].size(); k++)
+                {
+                    bool or_pass = false;
+                for (uint32_t l = 0; l < query_labels[i][k].size(); l++)
+                {
+                    if (base_labels[j].find(query_labels[i][k][l]) != base_labels[j].end())
+                    {
+                        or_pass = true;
+                        break;
+                    }
+                }
+                if (or_pass == false) {
+                    pass = false;
+                    break;
+                }
+                }
+            }
+            if (pass)
+            {
+                matching_points[i][j] = 1;
+                query_stats[i].second++;
+            }
+        }
+    }
+    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+
+    std::cout << "It took me " << time_span.count() << " seconds.";
+    return 0;
+}
+
+template <typename T>
+std::vector<std::vector<std::pair<uint32_t, float>>> processUnfilteredParts(
+    const std::string &base_file, const std::string &base_labels, const std::string &query_labels,
+    const std::string &unv_label, size_t &nqueries, size_t &npoints, size_t &dim, size_t &k, float *query_data,
+    const diskann::Metric &metric, std::vector<uint32_t> &location_to_tag)
+{
+    float *base_data = nullptr;
+    int num_parts = get_num_parts<T>(base_file.c_str());
+    std::vector<std::vector<std::pair<uint32_t, float>>> res(nqueries);
+    std::vector<std::pair<uint32_t, uint32_t>> query_stats(nqueries);
+    for (uint32_t i = 0; i < nqueries; i++)
+    {
+        query_stats[i].first = i;
+        query_stats[i].second = 0;
+    }
+
+    for (int p = 0; p < num_parts; p++)
+    {
+        size_t start_id = p * PARTSIZE;
+        load_bin_as_float<T>(base_file.c_str(), base_data, npoints, dim, p);
+        size_t end_id = start_id + npoints;
+
+        std::vector<boost::dynamic_bitset<>> matching_points;
+        identify_matching_points(base_labels, start_id, query_labels, unv_label, matching_points, query_stats);
+
+        size_t *closest_points_part = new size_t[nqueries * k];
+        float *dist_closest_points_part = new float[nqueries * k];
+
+        auto part_k = k < npoints ? k : npoints;
+        exact_knn(dim, part_k, closest_points_part, dist_closest_points_part, npoints, base_data, nqueries, query_data,
+                  metric, matching_points);
+
+        for (size_t i = 0; i < nqueries; i++)
+        {
+            for (size_t j = 0; j < part_k; j++)
+            {
+                if (!location_to_tag.empty())
+                    if (location_to_tag[closest_points_part[i * k + j] + start_id] == 0)
+                        continue;
+
+                res[i].push_back(std::make_pair((uint32_t)(closest_points_part[i * part_k + j] + start_id),
+                                                dist_closest_points_part[i * part_k + j]));
+            }
+        }
+
+        delete[] closest_points_part;
+        delete[] dist_closest_points_part;
+
+        diskann::aligned_free(base_data);
+    }
+    print_query_stats(query_stats);
+
+    return res;
+};
+
+// add UNIVERSAL LABEL SUPPORT
+template <typename T>
+int aux_main(const std::string &base_file, const std::string &query_file, const std::string &gt_file, size_t k,
+             const diskann::Metric &metric, const std::string &base_labels, const std::string &query_labels,
+             const std::string &unv_label, const std::string &tags_file = std::string(""))
+{
+    size_t npoints, nqueries, dim;
+
+    float *query_data;
+    load_bin_as_float<T>(query_file.c_str(), query_data, nqueries, dim, 0);
+    if (nqueries > PARTSIZE)
+        std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE
+                  << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl;
+
+    // load tags
+    const bool tags_enabled = tags_file.empty() ? false : true;
+    std::vector<uint32_t> location_to_tag = diskann::loadTags(tags_file, base_file);
+
+    int *closest_points = new int[nqueries * k];
+    float *dist_closest_points = new float[nqueries * k];
+
+    std::vector<std::vector<std::pair<uint32_t, float>>> results =
+        processUnfilteredParts<T>(base_file, base_labels, query_labels, unv_label, nqueries, npoints, dim, k,
+                                  query_data, metric, location_to_tag);
+
+    for (size_t i = 0; i < nqueries; i++)
+    {
+        std::vector<std::pair<uint32_t, float>> &cur_res = results[i];
+        std::sort(cur_res.begin(), cur_res.end(), custom_dist);
+        size_t j = 0;
+        for (auto iter : cur_res)
+        {
+            if (j == k)
+                break;
+            if (tags_enabled)
+            {
+                std::uint32_t index_with_tag = location_to_tag[iter.first];
+                closest_points[i * k + j] = (int32_t)index_with_tag;
+            }
+            else
+            {
+                closest_points[i * k + j] = (int32_t)iter.first;
+            }
+
+            if (metric == diskann::Metric::INNER_PRODUCT)
+                dist_closest_points[i * k + j] = -iter.second;
+            else
+                dist_closest_points[i * k + j] = iter.second;
+
+            ++j;
+        }
+        if (j < k)
+            std::cout << "WARNING: found less than k GT entries for query " << i << std::endl;
+    }
+
+    save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points, nqueries, k);
+    delete[] closest_points;
+    delete[] dist_closest_points;
+    diskann::aligned_free(query_data);
+
+    return 0;
+}
+
+void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim)
+{
+    size_t read_blk_size = 64 * 1024 * 1024;
+    cached_ifstream reader(bin_file, read_blk_size);
+    diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl;
+    size_t actual_file_size = reader.get_file_size();
+
+    int npts_i32, dim_i32;
+    reader.read((char *)&npts_i32, sizeof(int));
+    reader.read((char *)&dim_i32, sizeof(int));
+    npts = (uint32_t)npts_i32;
+    dim = (uint32_t)dim_i32;
+
+    diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl;
+
+    int truthset_type = -1; // 1 means truthset has ids and distances, 2 means
+                            // only ids, -1 is error
+    size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t);
+
+    if (actual_file_size == expected_file_size_with_dists)
+        truthset_type = 1;
+
+    size_t expected_file_size_just_ids = npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t);
+
+    if (actual_file_size == expected_file_size_just_ids)
+        truthset_type = 2;
+
+    if (truthset_type == -1)
+    {
+        std::stringstream stream;
+        stream << "Error. File size mismatch. File should have bin format, with "
+                  "npts followed by ngt followed by npts*ngt ids and optionally "
+                  "followed by npts*ngt distance values; actual size: "
+               << actual_file_size << ", expected: " << expected_file_size_with_dists << " or "
+               << expected_file_size_just_ids;
+        diskann::cout << stream.str();
+        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
+    }
+
+    ids = new uint32_t[npts * dim];
+    reader.read((char *)ids, npts * dim * sizeof(uint32_t));
+
+    if (truthset_type == 1)
+    {
+        dists = new float[npts * dim];
+        reader.read((char *)dists, npts * dim * sizeof(float));
+    }
+}
+
+int main(int argc, char **argv)
+{
+    std::string data_type, dist_fn, base_file, query_file, gt_file, tags_file, base_labels, query_labels, unv_label;
+    uint64_t K;
+
+    try
+    {
+        po::options_description desc{"Arguments"};
+
+        desc.add_options()("help,h", "Print information on arguments");
+
+        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
+        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
+                           "distance function <l2/mips/cosine>");
+        desc.add_options()("base_file", po::value<std::string>(&base_file)->required(),
+                           "File containing the base vectors in binary format");
+        desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),
+                           "File containing the query vectors in binary format");
+        desc.add_options()("base_labels", po::value<std::string>(&base_labels)->required(),
+                           "File containing the base labels");
+        desc.add_options()("query_labels", po::value<std::string>(&query_labels)->required(),
+                           "File containing the query labels");
+        desc.add_options()("gt_file", po::value<std::string>(&gt_file)->required(),
+                           "File name for the writing ground truth in binary "
+                           "format, please don' append .bin at end if "
+                           "no filter_label or filter_label_file is provided it "
+                           "will save the file with '.bin' at end."
+                           "else it will save the file as filename_label.bin");
+        desc.add_options()("K", po::value<uint64_t>(&K)->required(),
+                           "Number of ground truth nearest neighbors to compute");
+        desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
+                           "File containing the tags in binary format");
+        desc.add_options()("universal_label", po::value<std::string>(&unv_label)->default_value(std::string()),
+                           "universal_label value");
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+
+    if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8"))
+    {
+        std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
+        return -1;
+    }
+
+    diskann::Metric metric;
+    if (dist_fn == std::string("l2"))
+    {
+        metric = diskann::Metric::L2;
+    }
+    else if (dist_fn == std::string("mips"))
+    {
+        metric = diskann::Metric::INNER_PRODUCT;
+    }
+    else if (dist_fn == std::string("cosine"))
+    {
+        metric = diskann::Metric::COSINE;
+    }
+    else
+    {
+        std::cerr << "Unsupported distance function. Use l2/mips/cosine." << std::endl;
+        return -1;
+    }
+
+    try
+    {
+        if (data_type == std::string("float"))
+            aux_main<float>(base_file, query_file, gt_file, K, metric, base_labels, query_labels, unv_label, tags_file);
+        if (data_type == std::string("int8"))
+            aux_main<int8_t>(base_file, query_file, gt_file, K, metric, base_labels, query_labels, unv_label,
+                             tags_file);
+        if (data_type == std::string("uint8"))
+            aux_main<uint8_t>(base_file, query_file, gt_file, K, metric, base_labels, query_labels, unv_label,
+                              tags_file);
+    }
+    catch (const std::exception &e)
+    {
+        std::cout << std::string(e.what()) << std::endl;
+        diskann::cerr << "Compute GT failed." << std::endl;
+        return -1;
+    }
+}
diff --git a/apps/utils/gen_random_slice.cpp b/apps/utils/gen_random_slice.cpp
index a4cd96e0a..f7c4c7a1a 100644
--- a/apps/utils/gen_random_slice.cpp
+++ b/apps/utils/gen_random_slice.cpp
@@ -20,37 +20,42 @@
 #include <time.h>
 #include <typeinfo>
 
-template <typename T> int aux_main(char **argv)
+template <typename T> int aux_main(int argc, char **argv)
 {
     std::string base_file(argv[2]);
     std::string output_prefix(argv[3]);
     float sampling_rate = (float)(std::atof(argv[4]));
-    gen_random_slice<T>(base_file, output_prefix, sampling_rate);
+
+    std::string label_file = "";
+    if (argc == 6)
+        label_file = std::string(argv[5]);
+
+    gen_random_slice<T>(base_file, output_prefix, sampling_rate, label_file);
     return 0;
 }
 
 int main(int argc, char **argv)
 {
-    if (argc != 5)
+    if (argc < 5 || argc > 6)
     {
         std::cout << argv[0]
                   << " data_type [float/int8/uint8] base_bin_file "
-                     "sample_output_prefix sampling_probability"
+                     "sample_output_prefix sampling_probability <label_file (optional)>"
                   << std::endl;
         exit(-1);
     }
 
     if (std::string(argv[1]) == std::string("float"))
     {
-        aux_main<float>(argv);
+        aux_main<float>(argc, argv);
     }
     else if (std::string(argv[1]) == std::string("int8"))
     {
-        aux_main<int8_t>(argv);
+        aux_main<int8_t>(argc, argv);
     }
     else if (std::string(argv[1]) == std::string("uint8"))
     {
-        aux_main<uint8_t>(argv);
+        aux_main<uint8_t>(argc, argv);
     }
     else
         std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
diff --git a/apps/utils/generate_pq.cpp b/apps/utils/generate_pq.cpp
index a881b1104..ba39a740a 100644
--- a/apps/utils/generate_pq.cpp
+++ b/apps/utils/generate_pq.cpp
@@ -28,10 +28,10 @@ bool generate_pq(const std::string &data_path, const std::string &index_prefix_p
     else
     {
         diskann::generate_pq_pivots(train_data, train_size, (uint32_t)train_dim, (uint32_t)num_pq_centers,
-                                    (uint32_t)num_pq_chunks, KMEANS_ITERS_FOR_PQ, pq_pivots_path);
+                                    (uint32_t)num_pq_chunks, KMEANS_ITERS_FOR_PQ, pq_pivots_path, false);
     }
     diskann::generate_pq_data_from_pivots<T>(data_path, (uint32_t)num_pq_centers, (uint32_t)num_pq_chunks,
-                                             pq_pivots_path, pq_compressed_vectors_path, true);
+                                             pq_pivots_path, pq_compressed_vectors_path, opq);
 
     delete[] train_data;
 
diff --git a/apps/utils/get_false_positive_rate.cpp b/apps/utils/get_false_positive_rate.cpp
new file mode 100644
index 000000000..f6cfa96a6
--- /dev/null
+++ b/apps/utils/get_false_positive_rate.cpp
@@ -0,0 +1,309 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <chrono>
+#include <vector>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <queue>
+#include <omp.h>
+#include <mkl.h>
+#include <boost/program_options.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <unordered_map>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
+
+#ifdef _WINDOWS
+#include <malloc.h>
+#else
+#include <stdlib.h>
+#endif
+#include "filter_utils.h"
+#include "utils.h"
+
+// WORKS FOR UPTO 2 BILLION POINTS (as we use INT INSTEAD OF UNSIGNED)
+
+#define PARTSIZE 10000000
+#define ALIGNMENT 512
+
+// custom types (for readability)
+typedef tsl::robin_set<std::string> label_set;
+typedef std::string path;
+
+namespace po = boost::program_options;
+
+template <class T> T div_round_up(const T numerator, const T denominator)
+{
+    return (numerator % denominator == 0) ? (numerator / denominator) : 1 + (numerator / denominator);
+}
+
+inline void parse_base_label_file(const std::string &map_file, std::vector<tsl::robin_set<std::string>> &pts_to_labels,
+                                  uint32_t start_id = 0)
+{
+    pts_to_labels.clear();
+    std::ifstream infile(map_file);
+    std::string line, token;
+    std::set<std::string> labels;
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    uint32_t line_no = 0;
+    while (std::getline(infile, line))
+    {
+        if (line_no < start_id)
+        {
+            line_no++;
+            continue;
+        }
+        line_no++;
+        std::istringstream iss(line);
+        tsl::robin_set<std::string> lbls;
+
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, ','))
+        {
+            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+            lbls.insert(token);
+            labels.insert(token);
+        }
+        //        std::sort(lbls.begin(), lbls.end());
+        pts_to_labels.push_back(lbls);
+        if (pts_to_labels.size() >= PARTSIZE)
+            break;
+    }
+    std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for "
+              << pts_to_labels.size() << " points" << std::endl;
+}
+
+// outer vector is # queries,  inner vector is size of the AND predicate
+inline void parse_query_label_file(const std::string &query_label_file,
+                                   std::vector<std::vector<std::string>> &query_labels)
+{
+    query_labels.clear();
+    std::ifstream infile(query_label_file);
+    std::string line, token;
+    std::set<std::string> labels;
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    while (std::getline(infile, line))
+    {
+        std::istringstream iss(line);
+        std::vector<std::string> lbls(0);
+
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, '&'))
+        {
+            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+            lbls.push_back(token);
+            labels.insert(token);
+        }
+        //        std::sort(lbls.begin(), lbls.end());
+        query_labels.push_back(lbls);
+    }
+    std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for "
+              << query_labels.size() << " queries" << std::endl;
+}
+
+// template<typename A, typename B>
+// add UNIVERSAL LABEL SUPPORT
+int identify_matching_points(const std::string &base, const size_t start_id, const std::string &query,
+                             const std::string &unv_label, std::vector<boost::dynamic_bitset<>> &matching_points,
+                             std::vector<std::pair<uint32_t, uint32_t>> &query_stats)
+{
+    std::vector<tsl::robin_set<std::string>> base_labels;
+    std::vector<std::vector<std::string>> query_labels;
+    parse_base_label_file(base, base_labels, start_id);
+    parse_query_label_file(query, query_labels);
+    matching_points.clear();
+    uint32_t num_query = query_labels.size();
+    uint32_t num_base = base_labels.size();
+    matching_points.resize(num_query);
+    for (auto &x : matching_points)
+        x.resize(num_base);
+    std::cout << "Starting to identify matching points " << std::endl;
+    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
+
+#pragma omp parallel for schedule(dynamic, 128)
+    for (uint32_t i = 0; i < num_query; i++)
+    {
+        //        if (i % 100 == 0)
+        //        std::cout<<"."<< std::flush;
+        //        tsl::robin_set<uint32_t> matches;
+        for (uint32_t j = 0; j < num_base; j++)
+        {
+            bool pass = true;
+            if (unv_label.empty() || (base_labels[j].find(unv_label) == base_labels[j].end()))
+            {
+                for (uint32_t k = 0; k < query_labels[i].size(); k++)
+                {
+                    if (base_labels[j].find(query_labels[i][k]) == base_labels[j].end())
+                    {
+                        pass = false;
+                        break;
+                    }
+                }
+            }
+            if (pass)
+            {
+                matching_points[i][j] = 1;
+                query_stats[i].second++;
+            }
+        }
+    }
+    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+
+    std::cout << "It took me " << time_span.count() << " seconds." << std::endl;
+    return 0;
+}
+
+void get_fp_rate(const std::string &base_labels, const std::string &base_labels_bloom, const std::string &query_labels,
+                 const std::string &query_labels_bloom, const std::string &unv_label, size_t &nqueries, size_t &npoints)
+{
+    //    int num_parts = get_num_parts<T>(base_labels.c_str());
+    uint32_t num_parts = (npoints % PARTSIZE) == 0 ? npoints / PARTSIZE : (uint32_t)std::floor(npoints / PARTSIZE) + 1;
+
+    std::vector<std::vector<std::pair<uint32_t, float>>> res(nqueries);
+    std::vector<std::pair<uint32_t, uint32_t>> query_stats(nqueries);
+    for (uint32_t i = 0; i < nqueries; i++)
+    {
+        query_stats[i].first = i;
+        query_stats[i].second = 0;
+    }
+
+    std::vector<uint32_t> corrects(nqueries, 0);
+    std::vector<uint32_t> positives(nqueries, 0);
+    for (int p = 0; p < num_parts; p++)
+    {
+        size_t start_id = p * PARTSIZE;
+        //        load_bin_as_float<T>(base_file.c_str(), base_data, npoints, dim, p);
+        size_t end_id = start_id + npoints;
+
+        std::vector<boost::dynamic_bitset<>> matching_points;
+        std::vector<boost::dynamic_bitset<>> matching_points_bloom;
+        identify_matching_points(base_labels, start_id, query_labels, unv_label, matching_points, query_stats);
+        identify_matching_points(base_labels_bloom, start_id, query_labels_bloom, unv_label, matching_points_bloom,
+                                 query_stats);
+        for (size_t i = 0; i < nqueries; i++)
+        {
+            corrects[i] += matching_points[i].count();
+            positives[i] += matching_points_bloom[i].count();
+        }
+    }
+    float fp_rate = 0;
+    uint32_t good_queries = 0;
+    for (size_t i = 0; i < nqueries; i++)
+    {
+        if (corrects[i] > 0)
+        {
+            fp_rate += (1.0 * positives[i]) / (1.0 * corrects[i]);
+            good_queries++;
+        }
+        if (corrects[i] > positives[i])
+        {
+            std::cout << "ERROR! False negative(s) detected." << std::endl;
+        }
+    }
+
+    fp_rate /= good_queries;
+    std::cout << "FP rate: " << fp_rate << std::endl;
+};
+
+uint64_t get_label_file_num_pts(std::string file)
+{
+    uint64_t num_lines = 0;
+    std::ifstream infile(file);
+    std::string line, token;
+    std::set<std::string> labels;
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    while (std::getline(infile, line))
+    {
+        num_lines++;
+    }
+    std::cout << "Identified " << num_lines << " lines in " << file << std::endl;
+    return num_lines;
+}
+
+// add UNIVERSAL LABEL SUPPORT
+int aux_main(const std::string &base_labels, const std::string &base_labels_bloom, const std::string &query_labels,
+             const std::string &query_labels_bloom, const std::string &unv_label,
+             const std::string &tags_file = std::string(""))
+{
+    size_t npoints, nqueries, dim;
+
+    nqueries = get_label_file_num_pts(query_labels);
+    npoints = get_label_file_num_pts(base_labels);
+
+    if (nqueries > PARTSIZE)
+        std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE
+                  << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl;
+
+    // load tags
+    const bool tags_enabled = tags_file.empty() ? false : true;
+
+    get_fp_rate(base_labels, base_labels_bloom, query_labels, query_labels_bloom, unv_label, nqueries, npoints);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    std::string base_labels, query_labels, unv_label, base_labels_bloom, query_labels_bloom;
+
+    try
+    {
+        po::options_description desc{"Arguments"};
+
+        desc.add_options()("help,h", "Print information on arguments");
+
+        desc.add_options()("base_labels", po::value<std::string>(&base_labels)->required(),
+                           "File containing the base labels");
+        desc.add_options()("query_labels", po::value<std::string>(&query_labels)->required(),
+                           "File containing the query labels");
+        desc.add_options()("base_labels_bloom", po::value<std::string>(&base_labels_bloom)->required(),
+                           "File containing the base labels bloom");
+        desc.add_options()("query_labels_bloom", po::value<std::string>(&query_labels_bloom)->required(),
+                           "File containing the query labels bloom");
+        desc.add_options()("universal_label", po::value<std::string>(&unv_label)->default_value(std::string()),
+                           "universal_label value");
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+
+    try
+    {
+        aux_main(base_labels, base_labels_bloom, query_labels, query_labels_bloom, unv_label);
+    }
+    catch (const std::exception &e)
+    {
+        std::cout << std::string(e.what()) << std::endl;
+        diskann::cerr << "Failed." << std::endl;
+        return -1;
+    }
+}
diff --git a/include/abstract_index.h b/include/abstract_index.h
index 059866f7c..fdb85585c 100644
--- a/include/abstract_index.h
+++ b/include/abstract_index.h
@@ -78,6 +78,10 @@ class AbstractIndex
     std::pair<uint32_t, uint32_t> search_with_filters(const DataType &query, const std::string &raw_label,
                                                       const size_t K, const uint32_t L, IndexType *indices,
                                                       float *distances);
+    template <typename IndexType>
+    std::pair<uint32_t, uint32_t> search_with_filters(const DataType &query, const std::vector<std::vector<std::string>> &raw_label,
+                                                      const size_t K, const uint32_t L, IndexType *indices,
+                                                      float *distances);
 
     // insert points with labels, labels should be present for filtered index
     template <typename data_type, typename tag_type, typename label_type>
@@ -110,7 +114,8 @@ class AbstractIndex
     virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0;
     virtual std::pair<uint32_t, uint32_t> _search(const DataType &query, const size_t K, const uint32_t L,
                                                   std::any &indices, float *distances = nullptr) = 0;
-    virtual std::pair<uint32_t, uint32_t> _search_with_filters(const DataType &query, const std::string &filter_label,
+    virtual std::pair<uint32_t, uint32_t> _search_with_filters(const DataType &query,
+                                                               const std::vector<std::vector<std::string>> &filter_label,
                                                                const size_t K, const uint32_t L, std::any &indices,
                                                                float *distances) = 0;
     virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) = 0;
diff --git a/include/cluster_store.h b/include/cluster_store.h
new file mode 100644
index 000000000..fc377688f
--- /dev/null
+++ b/include/cluster_store.h
@@ -0,0 +1,101 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "types.h"
+#include "windows_customizations.h"
+#include "id_list.h"
+#include "distance.h"
+#include "scratch.h"
+
+namespace diskann
+{
+
+template <typename data_t> class AbstractClusterStore
+{
+  public:
+    AbstractClusterStore(const size_t dim)
+    {
+        _dim = dim;
+    }
+
+    virtual ~AbstractClusterStore() = default;
+
+    // Return number of clusters in the index
+    virtual uint32_t load(const std::string &filename) = 0;
+
+    virtual size_t save(const std::string &filename) = 0;
+
+    //    DISKANN_DLLEXPORT virtual location_t capacity() const;
+
+    size_t get_dims()
+    {
+        return _dim;
+    };
+
+    size_t get_num_clusters()
+    {
+        return _num_clusters;
+    };
+
+    virtual void add_cetroids(float *clusters, uint32_t num_clusters) = 0;
+
+    // populate the store with vectors (either from a pointer or bin file),
+    // potentially after pre-processing the vectors if the metric deems so
+    // e.g., normalizing vectors for cosine distance over floating-point vectors
+    // useful for bulk or static index building.
+    virtual void assign_data_to_clusters(data_t *vectors, std::vector<uint32_t> &ids) = 0;
+
+    // operations on vectors
+    // like populate_data function, but over one vector at a time useful for
+    // streaming setting
+    virtual void get_closest_clusters(data_t *query, const uint32_t num_closest,
+                                      InMemQueryScratch<data_t> *scratch) = 0;
+    virtual void get_cluster_members(const uint32_t cluster_id, AbstractIdList &output_list) = 0;
+
+  protected:
+    float *_cluster_centroids = nullptr;
+    float *_cluster_norms = nullptr;
+    size_t _num_clusters = 0;
+    size_t _dim;
+};
+
+template <typename data_t> class InMemClusterStore : public AbstractClusterStore<data_t>
+{
+  public:
+    InMemClusterStore(const size_t dim);
+
+    virtual ~InMemClusterStore();
+
+    // Return number of clusters in the index
+    virtual uint32_t load(const std::string &filename) override;
+
+    virtual size_t save(const std::string &filename) override;
+
+    //    DISKANN_DLLEXPORT virtual location_t capacity() const;
+
+    virtual void add_cetroids(float *clusters, uint32_t num_clusters) override;
+
+    // populate the store with vectors (either from a pointer or bin file),
+    // potentially after pre-processing the vectors if the metric deems so
+    // e.g., normalizing vectors for cosine distance over floating-point vectors
+    // useful for bulk or static index building.
+    virtual void assign_data_to_clusters(data_t *vectors, std::vector<uint32_t> &ids) override;
+
+    // operations on vectors
+    // like populate_data function, but over one vector at a time useful for
+    // streaming setting
+    virtual void get_closest_clusters(data_t *query, const uint32_t num_closest,
+                                      InMemQueryScratch<data_t> *scratch) override;
+
+    virtual void get_cluster_members(const uint32_t cluster_id, AbstractIdList &output_list) override;
+
+  protected:
+    std::vector<diskann::RoaringIdList> _posting_lists;
+};
+
+} // namespace diskann
diff --git a/include/defaults.h b/include/defaults.h
index ef1750fcf..3b8af5ce5 100644
--- a/include/defaults.h
+++ b/include/defaults.h
@@ -13,6 +13,8 @@ const uint32_t NUM_THREADS = 0;
 const uint32_t MAX_OCCLUSION_SIZE = 750;
 const bool HAS_LABELS = false;
 const uint32_t FILTER_LIST_SIZE = 0;
+const uint32_t FILTER_PENALTY_THRESHOLD = std::numeric_limits<uint32_t>::max();
+const uint32_t BRUTEFORCE_THRESHOLD = 0;
 const uint32_t NUM_FROZEN_POINTS_STATIC = 0;
 const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1;
 
diff --git a/include/id_list.h b/include/id_list.h
new file mode 100644
index 000000000..164f9c27b
--- /dev/null
+++ b/include/id_list.h
@@ -0,0 +1,106 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "roaring.hh"
+#include "types.h"
+#include "windows_customizations.h"
+#include "distance.h"
+#include <iterator>
+
+namespace diskann
+{
+
+class AbstractIdList
+{
+  public:
+    AbstractIdList()
+    {
+    }
+
+    AbstractIdList(uint32_t size, uint32_t *vals)
+    {
+    }
+
+    virtual ~AbstractIdList() = default;
+
+    virtual uint64_t size() = 0;
+
+    virtual void add(const uint32_t val) = 0;
+
+    virtual void copy_from(const AbstractIdList &other) = 0;
+
+    virtual void intersect_list(const AbstractIdList &other) = 0;
+
+    virtual void union_list(const AbstractIdList &other) = 0;
+
+    virtual roaring::Roaring get_list() = 0;
+
+  protected:
+};
+
+class RoaringIdList : public AbstractIdList
+{
+  public:
+    RoaringIdList()
+    {
+    }
+
+    RoaringIdList(const RoaringIdList &d) : AbstractIdList(d)
+    {
+        list = d.list;
+        //    std::cout<<"here" ;
+    }
+
+    RoaringIdList(uint32_t size, uint32_t *vals)
+    {
+        list = roaring::Roaring(size, vals);
+    }
+
+    ~RoaringIdList()
+    {
+    }
+
+    uint64_t size()
+    {
+        return list.cardinality();
+    }
+
+    void add(const uint32_t val)
+    {
+        list.add(val);
+    }
+
+    void copy_from(const AbstractIdList &other)
+    {
+        const RoaringIdList &other_r = dynamic_cast<const RoaringIdList &>(other);
+        list = other_r.list;
+    }
+
+    void intersect_list(const AbstractIdList &other)
+    {
+        const RoaringIdList &other_r = dynamic_cast<const RoaringIdList &>(other);
+        list &= other_r.list;
+    }
+
+    void union_list(const AbstractIdList &other)
+    {
+        const RoaringIdList &other_r = dynamic_cast<const RoaringIdList &>(other);
+        list |= other_r.list;
+    }
+
+    roaring::Roaring get_list()
+    {
+        return list;
+    }
+
+    // TODO: make this protected again and remove external usage of list
+    //  protected:
+    roaring::Roaring list;
+};
+
+} // namespace diskann
diff --git a/include/index.h b/include/index.h
index b9bf4f384..36ed41c25 100644
--- a/include/index.h
+++ b/include/index.h
@@ -11,10 +11,12 @@
 
 #include "distance.h"
 #include "locking.h"
+#include "math_utils.h"
 #include "natural_number_map.h"
 #include "natural_number_set.h"
 #include "neighbor.h"
 #include "parameters.h"
+#include "partition.h"
 #include "utils.h"
 #include "windows_customizations.h"
 #include "scratch.h"
@@ -24,10 +26,30 @@
 
 #include "quantized_distance.h"
 #include "pq_data_store.h"
+#include "roaring.hh"
 
 #define OVERHEAD_FACTOR 1.1
 #define EXPAND_IF_FULL 0
 #define DEFAULT_MAXC 750
+//#define INSTRUMENT true
+
+inline double time_to_intersect = 0.;
+inline double time_to_cluster = 0.;
+inline double time_to_filter_check_and_compare = 0.;
+inline double time_to_get_valid = 0.;
+inline double time_to_detect_penalty = 0.;
+inline double time_to_estimate = 0;
+inline uint32_t num_brutes = 0;
+inline uint32_t num_clusters = 0;
+inline uint32_t num_graphs = 0;
+inline uint32_t min_inter_size = 2;
+inline bool print_qstats = false;
+inline int64_t curr_query = -1;
+inline uint32_t penalty_scale = 10;
+inline uint32_t num_sp = 2;
+inline bool use_global_start = false;
+inline uint32_t num_start_points = 1;
+inline bool expand_two_hops = false;
 
 namespace diskann
 {
@@ -87,8 +109,11 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
     DISKANN_DLLEXPORT size_t get_num_points();
     DISKANN_DLLEXPORT size_t get_max_points();
 
-    DISKANN_DLLEXPORT bool detect_common_filters(uint32_t point_id, bool search_invocation,
-                                                 const std::vector<LabelT> &incoming_labels);
+    DISKANN_DLLEXPORT uint32_t detect_common_filters(uint32_t point_id, bool search_invocation,
+                                                     const std::vector<LabelT> &incoming_labels);
+
+    DISKANN_DLLEXPORT inline uint32_t detect_filter_penalty(uint32_t point_id, bool search_invocation,
+                                                            const std::vector<std::vector<LabelT>> &incoming_labels);
 
     // Batch build from a file. Optionally pass tags vector.
     DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load,
@@ -139,9 +164,9 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
                                               float *distances, std::vector<T *> &res_vectors, bool use_filters = false,
                                               const std::string filter_label = "");
 
-    // Filter support search
     template <typename IndexType>
-    DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_filters(const T *query, const LabelT &filter_label,
+    DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_filters(const T *query,
+                                                                        const std::vector<std::vector<LabelT>> &filter_label,
                                                                         const size_t K, const uint32_t L,
                                                                         IndexType *indices, float *distances);
 
@@ -207,8 +232,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
     virtual std::pair<uint32_t, uint32_t> _search(const DataType &query, const size_t K, const uint32_t L,
                                                   std::any &indices, float *distances = nullptr) override;
     virtual std::pair<uint32_t, uint32_t> _search_with_filters(const DataType &query,
-                                                               const std::string &filter_label_raw, const size_t K,
-                                                               const uint32_t L, std::any &indices,
+                                                               const std::vector<std::vector<std::string>> &filter_label_raw,
+                                                               const size_t K, const uint32_t L, std::any &indices,
                                                                float *distances) override;
 
     virtual int _insert_point(const DataType &data_point, const TagType tag) override;
@@ -249,17 +274,33 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
 
     void parse_label_file(const std::string &label_file, size_t &num_pts_labels);
 
+    void parse_sample_label_file(const std::string &label_file, size_t &num_samples);
+
+    std::vector<std::pair<LabelT, uint32_t>> sort_filter_counts(const std::vector<LabelT> &filter_label);
+
+    std::pair<uint32_t, std::vector<uint32_t>> sample_intersection(roaring::Roaring &intersection_bitmap, roaring::Roaring &tmp_bitmap,
+                                                      const std::vector<std::vector<LabelT>> &filter_labels);
+
     std::unordered_map<std::string, LabelT> load_label_map(const std::string &map_file);
 
     // Returns the locations of start point and frozen points suitable for use
     // with iterate_to_fixed_point.
     std::vector<uint32_t> get_init_ids();
 
+    std::pair<uint32_t, uint32_t> brute_force_filters(const T *query, const uint32_t Lsize,
+                                                      const roaring::Roaring &init_ids, InMemQueryScratch<T> *scratch);
+
     // The query to use is placed in scratch->aligned_query
     std::pair<uint32_t, uint32_t> iterate_to_fixed_point(InMemQueryScratch<T> *scratch, const uint32_t Lindex,
                                                          const std::vector<uint32_t> &init_ids, bool use_filter,
                                                          const std::vector<LabelT> &filters, bool search_invocation);
 
+    // The query to use is placed in scratch->aligned_query
+    std::pair<uint32_t, uint32_t> iterate_to_fixed_point(InMemQueryScratch<T> *scratch, const uint32_t Lindex,
+        const std::vector<uint32_t> &init_ids, bool use_filter,
+        const std::vector<std::vector<LabelT>> &filters, bool search_invocation);
+
+
     void search_for_point_and_prune(int location, uint32_t Lindex, std::vector<uint32_t> &pruned_list,
                                     InMemQueryScratch<T> *scratch, bool use_filter = false,
                                     uint32_t filteredLindex = 0);
@@ -379,14 +420,24 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
     // Location to label is only updated during insert_point(), all other reads are protected by
     // default as a location can only be released at end of consolidate deletes
     std::vector<std::vector<LabelT>> _location_to_labels;
+    std::vector<tsl::robin_set<LabelT>> _location_to_labels_robin;
+    std::vector<roaring::Roaring> _location_to_labels_bitmap;
     tsl::robin_set<LabelT> _labels;
     std::string _labels_file;
     std::unordered_map<LabelT, uint32_t> _label_to_start_id;
+    std::vector<roaring::Roaring> _labels_to_points;
+    std::unordered_map<LabelT, tsl::robin_set<uint32_t>> _labels_to_points_set;
+    std::vector<roaring::Roaring> _labels_to_points_sample;
+    uint32_t *_sample_map = nullptr;
+    float _sample_prob = 0;
     std::unordered_map<uint32_t, uint32_t> _medoid_counts;
 
     bool _use_universal_label = false;
     LabelT _universal_label = 0;
     uint32_t _filterIndexingQueueSize;
+    uint32_t _filter_penalty_threshold = 0;
+    uint32_t _bruteforce_threshold = 0;
+    float _prob = 0.1;
     std::unordered_map<std::string, LabelT> _label_map;
 
     // Indexing parameters
diff --git a/include/parameters.h b/include/parameters.h
index 0206814bd..8b7aadac3 100644
--- a/include/parameters.h
+++ b/include/parameters.h
@@ -39,12 +39,20 @@ class IndexWriteParameters
 class IndexSearchParams
 {
   public:
-    IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads)
-        : initial_search_list_size(initial_search_list_size), num_search_threads(num_search_threads)
+    IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads,
+                      const uint32_t filter_penalty_threshold = 0, const uint32_t bruteforce_threshold = 0,
+                      const uint32_t clustering_threshold = 0)
+        : initial_search_list_size(initial_search_list_size), num_search_threads(num_search_threads),
+          filter_penalty_threshold(filter_penalty_threshold), bruteforce_threshold(bruteforce_threshold),
+          clustering_threshold(clustering_threshold)
     {
     }
+
     const uint32_t initial_search_list_size; // search L
     const uint32_t num_search_threads;       // search threads
+    const uint32_t filter_penalty_threshold{defaults::FILTER_PENALTY_THRESHOLD};
+    const uint32_t bruteforce_threshold{defaults::BRUTEFORCE_THRESHOLD};
+    const uint32_t clustering_threshold{defaults::BRUTEFORCE_THRESHOLD};
 };
 
 class IndexWriteParametersBuilder
diff --git a/include/partition.h b/include/partition.h
index c2c4c76ad..48f3f33fc 100644
--- a/include/partition.h
+++ b/include/partition.h
@@ -16,7 +16,8 @@
 #include "windows_customizations.h"
 
 template <typename T>
-void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate);
+void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate,
+                      std::string label_file = "");
 
 template <typename T>
 void gen_random_slice(const std::string data_file, double p_val, float *&sampled_data, size_t &slice_size,
diff --git a/include/scratch.h b/include/scratch.h
index 2f43e3365..7b092d187 100644
--- a/include/scratch.h
+++ b/include/scratch.h
@@ -5,11 +5,12 @@
 
 #include <vector>
 
-#include "boost_dynamic_bitset_fwd.h"
-// #include "boost/dynamic_bitset.hpp"
+//#include "boost_dynamic_bitset_fwd.h"
+#include "boost/dynamic_bitset.hpp"
 #include "tsl/robin_set.h"
 #include "tsl/robin_map.h"
 #include "tsl/sparse_map.h"
+#include "roaring.hh"
 
 #include "aligned_file_reader.h"
 #include "abstract_scratch.h"
@@ -17,6 +18,8 @@
 #include "defaults.h"
 #include "concurrent_queue.h"
 
+#define MAX_NUM_CLUSTERS 500
+
 namespace diskann
 {
 template <typename T> class PQScratch;
@@ -37,6 +40,14 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
     {
         return _L;
     }
+    inline float *get_query_float()
+    {
+        return _aligned_query_float;
+    }
+    inline std::vector<float> &get_cluster_distance_vector()
+    {
+        return _cluster_distances;
+    }
     inline uint32_t get_R()
     {
         return _R;
@@ -61,6 +72,11 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
     {
         return _best_l_nodes;
     }
+    inline std::vector<uint32_t> &closest_clusters()
+    {
+        return _closest_clusters;
+    }
+
     inline std::vector<float> &occlude_factor()
     {
         return _occlude_factor;
@@ -94,6 +110,19 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
         return _occlude_list_output;
     }
 
+    inline roaring::Roaring &get_valid_bitmap()
+    {
+        _last_intersection.removeRangeClosed(_last_intersection.minimum(), _last_intersection.maximum());
+        return _last_intersection;
+    }
+
+    inline roaring::Roaring &get_tmp_bitmap()
+    {
+        _tmp_intersection.removeRangeClosed(_tmp_intersection.minimum(), _tmp_intersection.maximum());
+        return _tmp_intersection;
+    }
+
+
   private:
     uint32_t _L;
     uint32_t _R;
@@ -132,6 +161,14 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
     tsl::robin_set<uint32_t> _expanded_nodes_set;
     std::vector<Neighbor> _expanded_nghrs_vec;
     std::vector<uint32_t> _occlude_list_output;
+
+    roaring::Roaring _tmp_intersection;
+
+    roaring::Roaring _last_intersection;
+    // _to calculate the closest clusters during filtered search in clustered index
+    std::vector<uint32_t> _closest_clusters;
+    std::vector<float> _cluster_distances;
+    float *_aligned_query_float;
 };
 
 //
diff --git a/include/utils.h b/include/utils.h
index d3af5c3a9..677490e49 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -673,7 +673,7 @@ inline void copy_file(std::string in_file, std::string out_file)
 }
 
 DISKANN_DLLEXPORT double calculate_recall(unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
-                                          unsigned *our_results, unsigned dim_or, unsigned recall_at);
+                                          unsigned *our_results, unsigned dim_or, unsigned recall_at, unsigned r2 = 0);
 
 DISKANN_DLLEXPORT double calculate_recall(unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
                                           unsigned *our_results, unsigned dim_or, unsigned recall_at,
@@ -1083,51 +1083,109 @@ template <typename T = float> inline void normalize(T *arr, const size_t dim)
     }
 }
 
-inline std::vector<std::string> read_file_to_vector_of_strings(const std::string &filename, bool unique = false)
+inline std::vector<std::vector<std::vector<std::string>>> read_file_to_vector_of_vector_of_strings(const std::string &filename,
+                                                                            bool unique = false)
 {
-    std::vector<std::string> result;
-    std::set<std::string> elementSet;
-    if (filename != "")
+    std::vector<std::vector<std::vector<std::string>>> query_filters;
+    std::ifstream file(filename);
+    std::string line, token;
+
+    if (file.fail())
     {
-        std::ifstream file(filename);
-        if (file.fail())
-        {
-            throw diskann::ANNException(std::string("Failed to open file ") + filename, -1);
-        }
-        std::string line;
-        while (std::getline(file, line))
+        throw diskann::ANNException(std::string("Failed to open file ") + filename, -1);
+    }
+
+    while (std::getline(file, line))
+    {
+
+        std::istringstream iss(line);
+        std::vector<std::vector<std::string>> lbls(0);
+
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, '&'))
         {
-            if (line.empty())
+            std::vector<std::string> or_clause(0);
+            std::istringstream inner_iss(token);
+            while (getline(inner_iss, token, '|'))
             {
-                break;
-            }
-            if (line.find(',') != std::string::npos)
-            {
-                std::cerr << "Every query must have exactly one filter" << std::endl;
-                exit(-1);
-            }
-            if (!line.empty() && (line.back() == '\r' || line.back() == '\n'))
-            {
-                line.erase(line.size() - 1);
-            }
-            if (!elementSet.count(line))
-            {
-                result.push_back(line);
-            }
-            if (unique)
-            {
-                elementSet.insert(line);
+//                if (print_flag)
+//                    std::cout<<token<<" || ";
+                token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+                token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+                or_clause.push_back(token);
+//                labels.insert(token);
             }
+            lbls.push_back(or_clause);
         }
-        file.close();
-    }
-    else
-    {
-        throw diskann::ANNException(std::string("Failed to open file. filename can not be blank"), -1);
+        //        std::sort(lbls.begin(), lbls.end());
+        query_filters.push_back(lbls);
+
+
+/*        std::istringstream iss(line);
+        std::vector<std::string> lbls(0);
+        while (getline(iss, token, '&'))
+        {
+            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+            lbls.push_back(token);
+        }
+        query_filters.push_back(lbls);*/ 
     }
-    return result;
+    std::cout << "Populated labels for " << query_filters.size() << " queries" << std::endl;
+    return query_filters;
+}
+
+
+inline std::vector<std::vector<std::string>> read_file_to_vector_of_strings(const std::string &filename,
+    bool unique = false)
+{
+std::vector<std::vector<std::string>> query_filters;
+std::ifstream file(filename);
+std::string line, token;
+
+if (file.fail())
+{
+throw diskann::ANNException(std::string("Failed to open file ") + filename, -1);
 }
 
+while (std::getline(file, line))
+{
+
+std::istringstream iss(line);
+std::vector<std::string> lbls(0);
+
+getline(iss, token, '\t');
+std::istringstream new_iss(token);
+while (getline(new_iss, token, '&'))
+{
+
+//                if (print_flag)
+//                    std::cout<<token<<" || ";
+token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+//                labels.insert(token);
+lbls.push_back(token);
+}
+//        std::sort(lbls.begin(), lbls.end());
+query_filters.push_back(lbls);
+
+
+/*        std::istringstream iss(line);
+std::vector<std::string> lbls(0);
+while (getline(iss, token, '&'))
+{
+token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+lbls.push_back(token);
+}
+query_filters.push_back(lbls);*/ 
+}
+std::cout << "Populated labels for " << query_filters.size() << " queries" << std::endl;
+return query_filters;
+}
+
+
 inline void clean_up_artifacts(tsl::robin_set<std::string> paths_to_clean, tsl::robin_set<std::string> path_suffixes)
 {
     try
diff --git a/packages-microsoft-prod.deb b/packages-microsoft-prod.deb
new file mode 100644
index 000000000..40874bc7f
Binary files /dev/null and b/packages-microsoft-prod.deb differ
diff --git a/roaring/roaring.c b/roaring/roaring.c
new file mode 100644
index 000000000..eb74f389a
--- /dev/null
+++ b/roaring/roaring.c
@@ -0,0 +1,25870 @@
+// !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!!
+// Created by amalgamation.sh on 2024-04-02T07:40:50Z
+
+/*
+ * The CRoaring project is under a dual license (Apache/MIT).
+ * Users of the library may choose one or the other license.
+ */
+/*
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * MIT License
+ *
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Permission is hereby granted, free of charge, to any
+ * person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the
+ * Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software
+ * is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice
+ * shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ * TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ * SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "roaring.h"
+
+/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
+#ifdef DMALLOC
+#include "dmalloc.h"
+#endif
+
+#include "roaring.h"  /* include public API definitions */
+/* begin file include/roaring/isadetection.h */
+#ifndef ROARING_ISADETECTION_H
+#define ROARING_ISADETECTION_H
+#if defined(__x86_64__) || defined(_M_AMD64)  // x64
+
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#ifdef __has_include
+// We want to make sure that the AVX-512 functions are only built on compilers
+// fully supporting AVX-512.
+#if __has_include(<avx512vbmi2intrin.h>)
+#define CROARING_COMPILER_SUPPORTS_AVX512 1
+#endif  // #if __has_include(<avx512vbmi2intrin.h>)
+#endif  // #ifdef __has_include
+
+// Visual Studio 2019 and up support AVX-512
+#ifdef _MSC_VER
+#if _MSC_VER >= 1920
+#define CROARING_COMPILER_SUPPORTS_AVX512 1
+#endif  // #if _MSC_VER >= 1920
+#endif  // #ifdef _MSC_VER
+
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#define CROARING_COMPILER_SUPPORTS_AVX512 0
+#endif  // #ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#endif  // #ifndef CROARING_COMPILER_SUPPORTS_AVX512
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+enum {
+    ROARING_SUPPORTS_AVX2 = 1,
+    ROARING_SUPPORTS_AVX512 = 2,
+};
+int croaring_hardware_support(void);
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+#endif  // x64
+#endif  // ROARING_ISADETECTION_H
+/* end file include/roaring/isadetection.h */
+/* begin file include/roaring/containers/perfparameters.h */
+#ifndef PERFPARAMETERS_H_
+#define PERFPARAMETERS_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/**
+During lazy computations, we can transform array containers into bitset
+containers as
+long as we can expect them to have  ARRAY_LAZY_LOWERBOUND values.
+*/
+enum { ARRAY_LAZY_LOWERBOUND = 1024 };
+
+/* default initial size of a run container
+   setting it to zero delays the malloc.*/
+enum { RUN_DEFAULT_INIT_SIZE = 0 };
+
+/* default initial size of an array container
+   setting it to zero delays the malloc */
+enum { ARRAY_DEFAULT_INIT_SIZE = 0 };
+
+/* automatic bitset conversion during lazy or */
+#ifndef LAZY_OR_BITSET_CONVERSION
+#define LAZY_OR_BITSET_CONVERSION true
+#endif
+
+/* automatically attempt to convert a bitset to a full run during lazy
+ * evaluation */
+#ifndef LAZY_OR_BITSET_CONVERSION_TO_FULL
+#define LAZY_OR_BITSET_CONVERSION_TO_FULL true
+#endif
+
+/* automatically attempt to convert a bitset to a full run */
+#ifndef OR_BITSET_CONVERSION_TO_FULL
+#define OR_BITSET_CONVERSION_TO_FULL true
+#endif
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif
+/* end file include/roaring/containers/perfparameters.h */
+/* begin file include/roaring/containers/container_defs.h */
+/*
+ * container_defs.h
+ *
+ * Unlike containers.h (which is a file aggregating all the container includes,
+ * like array.h, bitset.h, and run.h) this is a file included BY those headers
+ * to do things like define the container base class `container_t`.
+ */
+
+#ifndef INCLUDE_CONTAINERS_CONTAINER_DEFS_H_
+#define INCLUDE_CONTAINERS_CONTAINER_DEFS_H_
+
+#ifdef __cplusplus
+#include <type_traits>  // used by casting helper for compile-time check
+#endif
+
+// The preferences are a separate file to separate out tweakable parameters
+
+#ifdef __cplusplus
+namespace roaring {
+namespace internal {  // No extern "C" (contains template)
+#endif
+
+/*
+ * Since roaring_array_t's definition is not opaque, the container type is
+ * part of the API.  If it's not going to be `void*` then it needs a name, and
+ * expectations are to prefix C library-exported names with `roaring_` etc.
+ *
+ * Rather than force the whole codebase to use the name `roaring_container_t`,
+ * the few API appearances use the macro ROARING_CONTAINER_T.  Those includes
+ * are prior to containers.h, so make a short private alias of `container_t`.
+ * Then undefine the awkward macro so it's not used any more than it has to be.
+ */
+typedef ROARING_CONTAINER_T container_t;
+#undef ROARING_CONTAINER_T
+
+/*
+ * See ROARING_CONTAINER_T for notes on using container_t as a base class.
+ * This macro helps make the following pattern look nicer:
+ *
+ *     #ifdef __cplusplus
+ *     struct roaring_array_s : public container_t {
+ *     #else
+ *     struct roaring_array_s {
+ *     #endif
+ *         int32_t cardinality;
+ *         int32_t capacity;
+ *         uint16_t *array;
+ *     }
+ */
+#if defined(__cplusplus)
+#define STRUCT_CONTAINER(name) struct name : public container_t /* { ... } */
+#else
+#define STRUCT_CONTAINER(name) struct name /* { ... } */
+#endif
+
+/**
+ * Since container_t* is not void* in C++, "dangerous" casts are not needed to
+ * downcast; only a static_cast<> is needed.  Define a macro for static casting
+ * which helps make casts more visible, and catches problems at compile-time
+ * when building the C sources in C++ mode:
+ *
+ *     void some_func(container_t **c, ...) {  // double pointer, not single
+ *         array_container_t *ac1 = (array_container_t *)(c);  // uncaught!!
+ *
+ *         array_container_t *ac2 = CAST(array_container_t *, c)  // C++ errors
+ *         array_container_t *ac3 = CAST_array(c);  // shorthand for #2, errors
+ *     }
+ *
+ * Trickier to do is a cast from `container**` to `array_container_t**`.  This
+ * needs a reinterpret_cast<>, which sacrifices safety...so a template is used
+ * leveraging <type_traits> to make sure it's legal in the C++ build.
+ */
+#ifdef __cplusplus
+#define CAST(type, value) static_cast<type>(value)
+#define movable_CAST(type, value) movable_CAST_HELPER<type>(value)
+
+template <typename PPDerived, typename Base>
+PPDerived movable_CAST_HELPER(Base **ptr_to_ptr) {
+    typedef typename std::remove_pointer<PPDerived>::type PDerived;
+    typedef typename std::remove_pointer<PDerived>::type Derived;
+    static_assert(std::is_base_of<Base, Derived>::value,
+                  "use movable_CAST() for container_t** => xxx_container_t**");
+    return reinterpret_cast<Derived **>(ptr_to_ptr);
+}
+#else
+#define CAST(type, value) ((type)value)
+#define movable_CAST(type, value) ((type)value)
+#endif
+
+// Use for converting e.g. an `array_container_t**` to a `container_t**`
+//
+#define movable_CAST_base(c) movable_CAST(container_t **, c)
+
+#ifdef __cplusplus
+}
+}  // namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_CONTAINER_DEFS_H_ */
+/* end file include/roaring/containers/container_defs.h */
+/* begin file include/roaring/array_util.h */
+#ifndef ARRAY_UTIL_H
+#define ARRAY_UTIL_H
+
+#include <stddef.h>  // for size_t
+#include <stdint.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/*
+ *  Good old binary search.
+ *  Assumes that array is sorted, has logarithmic complexity.
+ *  if the result is x, then:
+ *     if ( x>0 )  you have array[x] = ikey
+ *     if ( x<0 ) then inserting ikey at position -x-1 in array (insuring that
+ * array[-x-1]=ikey) keys the array sorted.
+ */
+inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
+                            uint16_t ikey) {
+    int32_t low = 0;
+    int32_t high = lenarray - 1;
+    while (low <= high) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t middleValue = array[middleIndex];
+        if (middleValue < ikey) {
+            low = middleIndex + 1;
+        } else if (middleValue > ikey) {
+            high = middleIndex - 1;
+        } else {
+            return middleIndex;
+        }
+    }
+    return -(low + 1);
+}
+
+/**
+ * Galloping search
+ * Assumes that array is sorted, has logarithmic complexity.
+ * if the result is x, then if x = length, you have that all values in array
+ * between pos and length are smaller than min. otherwise returns the first
+ * index x such that array[x] >= min.
+ */
+static inline int32_t advanceUntil(const uint16_t *array, int32_t pos,
+                                   int32_t length, uint16_t min) {
+    int32_t lower = pos + 1;
+
+    if ((lower >= length) || (array[lower] >= min)) {
+        return lower;
+    }
+
+    int32_t spansize = 1;
+
+    while ((lower + spansize < length) && (array[lower + spansize] < min)) {
+        spansize <<= 1;
+    }
+    int32_t upper = (lower + spansize < length) ? lower + spansize : length - 1;
+
+    if (array[upper] == min) {
+        return upper;
+    }
+    if (array[upper] < min) {
+        // means
+        // array
+        // has no
+        // item
+        // >= min
+        // pos = array.length;
+        return length;
+    }
+
+    // we know that the next-smallest span was too small
+    lower += (spansize >> 1);
+
+    int32_t mid = 0;
+    while (lower + 1 != upper) {
+        mid = (lower + upper) >> 1;
+        if (array[mid] == min) {
+            return mid;
+        } else if (array[mid] < min) {
+            lower = mid;
+        } else {
+            upper = mid;
+        }
+    }
+    return upper;
+}
+
+/**
+ * Returns number of elements which are less than ikey.
+ * Array elements must be unique and sorted.
+ */
+static inline int32_t count_less(const uint16_t *array, int32_t lenarray,
+                                 uint16_t ikey) {
+    if (lenarray == 0) return 0;
+    int32_t pos = binarySearch(array, lenarray, ikey);
+    return pos >= 0 ? pos : -(pos + 1);
+}
+
+/**
+ * Returns number of elements which are greater than ikey.
+ * Array elements must be unique and sorted.
+ */
+static inline int32_t count_greater(const uint16_t *array, int32_t lenarray,
+                                    uint16_t ikey) {
+    if (lenarray == 0) return 0;
+    int32_t pos = binarySearch(array, lenarray, ikey);
+    if (pos >= 0) {
+        return lenarray - (pos + 1);
+    } else {
+        return lenarray - (-pos - 1);
+    }
+}
+
+/**
+ * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
+ * Optimized by D. Lemire on May 3rd 2013
+ *
+ * C should have capacity greater than the minimum of s_1 and s_b + 8
+ * where 8 is sizeof(__m128i)/sizeof(uint16_t).
+ */
+int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b,
+                           uint16_t *C);
+
+int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a,
+                                   const uint16_t *__restrict__ B, size_t s_b);
+
+/**
+ * Take an array container and write it out to a 32-bit array, using base
+ * as the offset.
+ */
+int array_container_to_uint32_array_vector16(void *vout, const uint16_t *array,
+                                             size_t cardinality, uint32_t base);
+#if CROARING_COMPILER_SUPPORTS_AVX512
+int avx512_array_container_to_uint32_array(void *vout, const uint16_t *array,
+                                           size_t cardinality, uint32_t base);
+#endif
+/**
+ * Compute the cardinality of the intersection using SSE4 instructions
+ */
+int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
+                                       size_t s_a,
+                                       const uint16_t *__restrict__ B,
+                                       size_t s_b);
+
+/* Computes the intersection between one small and one large set of uint16_t.
+ * Stores the result into buffer and return the number of elements. */
+int32_t intersect_skewed_uint16(const uint16_t *smallarray, size_t size_s,
+                                const uint16_t *largearray, size_t size_l,
+                                uint16_t *buffer);
+
+/* Computes the size of the intersection between one small and one large set of
+ * uint16_t. */
+int32_t intersect_skewed_uint16_cardinality(const uint16_t *smallarray,
+                                            size_t size_s,
+                                            const uint16_t *largearray,
+                                            size_t size_l);
+
+/* Check whether the size of the intersection between one small and one large
+ * set of uint16_t is non-zero. */
+bool intersect_skewed_uint16_nonempty(const uint16_t *smallarray, size_t size_s,
+                                      const uint16_t *largearray,
+                                      size_t size_l);
+/**
+ * Generic intersection function.
+ */
+int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB, uint16_t *out);
+/**
+ * Compute the size of the intersection (generic).
+ */
+int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
+                                     const uint16_t *B, const size_t lenB);
+
+/**
+ * Checking whether the size of the intersection  is non-zero.
+ */
+bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
+                               const uint16_t *B, const size_t lenB);
+/**
+ * Generic union function.
+ */
+size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer);
+
+/**
+ * Generic XOR function.
+ */
+int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
+                   const uint16_t *array_2, int32_t card_2, uint16_t *out);
+
+/**
+ * Generic difference function (ANDNOT).
+ */
+int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
+                      int length2, uint16_t *a_out);
+
+/**
+ * Generic intersection function.
+ */
+size_t intersection_uint32(const uint32_t *A, const size_t lenA,
+                           const uint32_t *B, const size_t lenB, uint32_t *out);
+
+/**
+ * Generic intersection function, returns just the cardinality.
+ */
+size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
+                                const uint32_t *B, const size_t lenB);
+
+/**
+ * Generic union function.
+ */
+size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
+                    size_t size_2, uint32_t *buffer);
+
+/**
+ * A fast SSE-based union function.
+ */
+uint32_t union_vector16(const uint16_t *__restrict__ set_1, uint32_t size_1,
+                        const uint16_t *__restrict__ set_2, uint32_t size_2,
+                        uint16_t *__restrict__ buffer);
+/**
+ * A fast SSE-based XOR function.
+ */
+uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                      const uint16_t *__restrict__ array2, uint32_t length2,
+                      uint16_t *__restrict__ output);
+
+/**
+ * A fast SSE-based difference function.
+ */
+int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                            const uint16_t *__restrict__ B, size_t s_b,
+                            uint16_t *C);
+
+/**
+ * Generic union function, returns just the cardinality.
+ */
+size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
+                         const uint32_t *set_2, size_t size_2);
+
+/**
+ * combines union_uint16 and  union_vector16 optimally
+ */
+size_t fast_union_uint16(const uint16_t *set_1, size_t size_1,
+                         const uint16_t *set_2, size_t size_2,
+                         uint16_t *buffer);
+
+bool memequals(const void *s1, const void *s2, size_t n);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+#endif
+/* end file include/roaring/array_util.h */
+/* begin file include/roaring/utilasm.h */
+/*
+ * utilasm.h
+ *
+ */
+
+#ifndef INCLUDE_UTILASM_H_
+#define INCLUDE_UTILASM_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+#endif
+
+#if defined(CROARING_INLINE_ASM)
+#define CROARING_ASMBITMANIPOPTIMIZATION  // optimization flag
+
+#define ASM_SHIFT_RIGHT(srcReg, bitsReg, destReg) \
+    __asm volatile("shrx %1, %2, %0"              \
+                   : "=r"(destReg)                \
+                   :             /* write */      \
+                   "r"(bitsReg), /* read only */  \
+                   "r"(srcReg)   /* read only */  \
+    )
+
+#define ASM_INPLACESHIFT_RIGHT(srcReg, bitsReg)  \
+    __asm volatile("shrx %1, %0, %0"             \
+                   : "+r"(srcReg)                \
+                   :            /* read/write */ \
+                   "r"(bitsReg) /* read only */  \
+    )
+
+#define ASM_SHIFT_LEFT(srcReg, bitsReg, destReg) \
+    __asm volatile("shlx %1, %2, %0"             \
+                   : "=r"(destReg)               \
+                   :             /* write */     \
+                   "r"(bitsReg), /* read only */ \
+                   "r"(srcReg)   /* read only */ \
+    )
+// set bit at position testBit within testByte to 1 and
+// copy cmovDst to cmovSrc if that bit was previously clear
+#define ASM_SET_BIT_INC_WAS_CLEAR(testByte, testBit, count) \
+    __asm volatile(                                         \
+        "bts %2, %0\n"                                      \
+        "sbb $-1, %1\n"                                     \
+        : "+r"(testByte), /* read/write */                  \
+          "+r"(count)                                       \
+        :            /* read/write */                       \
+        "r"(testBit) /* read only */                        \
+    )
+
+#define ASM_CLEAR_BIT_DEC_WAS_SET(testByte, testBit, count) \
+    __asm volatile(                                         \
+        "btr %2, %0\n"                                      \
+        "sbb $0, %1\n"                                      \
+        : "+r"(testByte), /* read/write */                  \
+          "+r"(count)                                       \
+        :            /* read/write */                       \
+        "r"(testBit) /* read only */                        \
+    )
+
+#define ASM_BT64(testByte, testBit, count) \
+    __asm volatile(                        \
+        "bt %2,%1\n"                       \
+        "sbb %0,%0" /*could use setb */    \
+        : "=r"(count)                      \
+        :              /* write */         \
+        "r"(testByte), /* read only */     \
+        "r"(testBit)   /* read only */     \
+    )
+
+#endif
+
+#ifdef __cplusplus
+}
+}  // extern "C" { namespace roaring {
+#endif
+
+#endif /* INCLUDE_UTILASM_H_ */
+/* end file include/roaring/utilasm.h */
+/* begin file include/roaring/bitset_util.h */
+#ifndef BITSET_UTIL_H
+#define BITSET_UTIL_H
+
+#include <stdint.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/*
+ * Set all bits in indexes [begin,end) to true.
+ */
+static inline void bitset_set_range(uint64_t *words, uint32_t start,
+                                    uint32_t end) {
+    if (start == end) return;
+    uint32_t firstword = start / 64;
+    uint32_t endword = (end - 1) / 64;
+    if (firstword == endword) {
+        words[firstword] |= ((~UINT64_C(0)) << (start % 64)) &
+                            ((~UINT64_C(0)) >> ((~end + 1) % 64));
+        return;
+    }
+    words[firstword] |= (~UINT64_C(0)) << (start % 64);
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+        words[i] = ~UINT64_C(0);
+    }
+    words[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64);
+}
+
+/*
+ * Find the cardinality of the bitset in [begin,begin+lenminusone]
+ */
+static inline int bitset_lenrange_cardinality(const uint64_t *words,
+                                              uint32_t start,
+                                              uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {
+        return roaring_hamming(words[firstword] &
+                               ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
+                                   << (start % 64));
+    }
+    int answer =
+        roaring_hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64)));
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+        answer += roaring_hamming(words[i]);
+    }
+    answer += roaring_hamming(words[endword] &
+                              (~UINT64_C(0)) >>
+                                  (((~start + 1) - lenminusone - 1) % 64));
+    return answer;
+}
+
+/*
+ * Check whether the cardinality of the bitset in [begin,begin+lenminusone] is 0
+ */
+static inline bool bitset_lenrange_empty(const uint64_t *words, uint32_t start,
+                                         uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {
+        return (words[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
+                                       << (start % 64)) == 0;
+    }
+    if (((words[firstword] & ((~UINT64_C(0)) << (start % 64)))) != 0) {
+        return false;
+    }
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+        if (words[i] != 0) {
+            return false;
+        }
+    }
+    if ((words[endword] &
+         (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) {
+        return false;
+    }
+    return true;
+}
+
+/*
+ * Set all bits in indexes [begin,begin+lenminusone] to true.
+ */
+static inline void bitset_set_lenrange(uint64_t *words, uint32_t start,
+                                       uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {
+        words[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
+                            << (start % 64);
+        return;
+    }
+    uint64_t temp = words[endword];
+    words[firstword] |= (~UINT64_C(0)) << (start % 64);
+    for (uint32_t i = firstword + 1; i < endword; i += 2)
+        words[i] = words[i + 1] = ~UINT64_C(0);
+    words[endword] =
+        temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64);
+}
+
+/*
+ * Flip all the bits in indexes [begin,end).
+ */
+static inline void bitset_flip_range(uint64_t *words, uint32_t start,
+                                     uint32_t end) {
+    if (start == end) return;
+    uint32_t firstword = start / 64;
+    uint32_t endword = (end - 1) / 64;
+    words[firstword] ^= ~((~UINT64_C(0)) << (start % 64));
+    for (uint32_t i = firstword; i < endword; i++) {
+        words[i] = ~words[i];
+    }
+    words[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64));
+}
+
+/*
+ * Set all bits in indexes [begin,end) to false.
+ */
+static inline void bitset_reset_range(uint64_t *words, uint32_t start,
+                                      uint32_t end) {
+    if (start == end) return;
+    uint32_t firstword = start / 64;
+    uint32_t endword = (end - 1) / 64;
+    if (firstword == endword) {
+        words[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) &
+                              ((~UINT64_C(0)) >> ((~end + 1) % 64)));
+        return;
+    }
+    words[firstword] &= ~((~UINT64_C(0)) << (start % 64));
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+        words[i] = UINT64_C(0);
+    }
+    words[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64));
+}
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out", values start at "base".
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function should only be expected to be faster than
+ * bitset_extract_setbits
+ * when the density of the bitset is high.
+ *
+ * This function uses AVX2 decoding.
+ */
+size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
+                                   uint32_t *out, size_t outcapacity,
+                                   uint32_t base);
+
+size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
+                                     uint32_t *out, size_t outcapacity,
+                                     uint32_t base);
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out", values start at "base".
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ *set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_setbits(const uint64_t *words, size_t length,
+                              uint32_t *out, uint32_t base);
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out" as 16-bit integers, values start at "base" (can
+ *be set to zero)
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ *set.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function should only be expected to be faster than
+ *bitset_extract_setbits_uint16
+ * when the density of the bitset is high.
+ *
+ * This function uses SSE decoding.
+ */
+size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
+                                         uint16_t *out, size_t outcapacity,
+                                         uint16_t base);
+
+size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words,
+                                            size_t length, uint16_t *out,
+                                            size_t outcapacity, uint16_t base);
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out",  values start at "base"
+ * (can be set to zero)
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ *set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
+                                     uint16_t *out, uint16_t base);
+
+/*
+ * Given two bitsets containing "length" 64-bit words, write out the position
+ * of all the common set bits to "out", values start at "base"
+ * (can be set to zero)
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_intersection_setbits_uint16(
+    const uint64_t *__restrict__ words1, const uint64_t *__restrict__ words2,
+    size_t length, uint16_t *out, uint16_t base);
+
+/*
+ * Given a bitset having cardinality card, set all bit values in the list (there
+ * are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
+                                  const uint16_t *list, uint64_t length);
+/*
+ * Given a bitset, set all bit values in the list (there
+ * are length of them).
+ */
+void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length);
+
+/*
+ * Given a bitset having cardinality card, unset all bit values in the list
+ * (there are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
+                           uint64_t length);
+
+/*
+ * Given a bitset having cardinality card, toggle all bit values in the list
+ * (there are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+
+uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
+                                   const uint16_t *list, uint64_t length);
+
+void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length);
+
+#if CROARING_IS_X64
+/***
+ * BEGIN Harley-Seal popcount functions.
+ */
+CROARING_TARGET_AVX2
+/**
+ * Compute the population count of a 256-bit word
+ * This is not especially fast, but it is convenient as part of other functions.
+ */
+static inline __m256i popcount256(__m256i v) {
+    const __m256i lookuppos = _mm256_setr_epi8(
+        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
+        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
+        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
+        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4,
+
+        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
+        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
+        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
+        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4);
+    const __m256i lookupneg = _mm256_setr_epi8(
+        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
+        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
+        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
+        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4,
+
+        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
+        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
+        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
+        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4);
+    const __m256i low_mask = _mm256_set1_epi8(0x0f);
+
+    const __m256i lo = _mm256_and_si256(v, low_mask);
+    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
+    const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo);
+    const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);
+    return _mm256_sad_epu8(popcnt1, popcnt2);
+}
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+/**
+ * Simple CSA over 256 bits
+ */
+static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
+                       __m256i c) {
+    const __m256i u = _mm256_xor_si256(a, b);
+    *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
+    *l = _mm256_xor_si256(u, c);
+}
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+/**
+ * Fast Harley-Seal AVX population count function
+ */
+inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
+                                                    const uint64_t size) {
+    __m256i total = _mm256_setzero_si256();
+    __m256i ones = _mm256_setzero_si256();
+    __m256i twos = _mm256_setzero_si256();
+    __m256i fours = _mm256_setzero_si256();
+    __m256i eights = _mm256_setzero_si256();
+    __m256i sixteens = _mm256_setzero_si256();
+    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
+
+    const uint64_t limit = size - size % 16;
+    uint64_t i = 0;
+
+    for (; i < limit; i += 16) {
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
+            _mm256_lddqu_si256(data + i + 1));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
+            _mm256_lddqu_si256(data + i + 3));
+        CSA(&foursA, &twos, twos, twosA, twosB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
+            _mm256_lddqu_si256(data + i + 5));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
+            _mm256_lddqu_si256(data + i + 7));
+        CSA(&foursB, &twos, twos, twosA, twosB);
+        CSA(&eightsA, &fours, fours, foursA, foursB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
+            _mm256_lddqu_si256(data + i + 9));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
+            _mm256_lddqu_si256(data + i + 11));
+        CSA(&foursA, &twos, twos, twosA, twosB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
+            _mm256_lddqu_si256(data + i + 13));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
+            _mm256_lddqu_si256(data + i + 15));
+        CSA(&foursB, &twos, twos, twosA, twosB);
+        CSA(&eightsB, &fours, fours, foursA, foursB);
+        CSA(&sixteens, &eights, eights, eightsA, eightsB);
+
+        total = _mm256_add_epi64(total, popcount256(sixteens));
+    }
+
+    total = _mm256_slli_epi64(total, 4);  // * 16
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(eights), 3));  // += 8 * ...
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(fours), 2));  // += 4 * ...
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(twos), 1));  // += 2 * ...
+    total = _mm256_add_epi64(total, popcount256(ones));
+    for (; i < size; i++)
+        total =
+            _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));
+
+    return (uint64_t)(_mm256_extract_epi64(total, 0)) +
+           (uint64_t)(_mm256_extract_epi64(total, 1)) +
+           (uint64_t)(_mm256_extract_epi64(total, 2)) +
+           (uint64_t)(_mm256_extract_epi64(total, 3));
+}
+CROARING_UNTARGET_AVX2
+
+#define AVXPOPCNTFNC(opname, avx_intrinsic)                                    \
+    static inline uint64_t avx2_harley_seal_popcount256_##opname(              \
+        const __m256i *data1, const __m256i *data2, const uint64_t size) {     \
+        __m256i total = _mm256_setzero_si256();                                \
+        __m256i ones = _mm256_setzero_si256();                                 \
+        __m256i twos = _mm256_setzero_si256();                                 \
+        __m256i fours = _mm256_setzero_si256();                                \
+        __m256i eights = _mm256_setzero_si256();                               \
+        __m256i sixteens = _mm256_setzero_si256();                             \
+        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;                \
+        __m256i A1, A2;                                                        \
+        const uint64_t limit = size - size % 16;                               \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 16) {                                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),              \
+                               _mm256_lddqu_si256(data2 + i + 1));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),              \
+                               _mm256_lddqu_si256(data2 + i + 2));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),              \
+                               _mm256_lddqu_si256(data2 + i + 3));             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),              \
+                               _mm256_lddqu_si256(data2 + i + 4));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),              \
+                               _mm256_lddqu_si256(data2 + i + 5));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),              \
+                               _mm256_lddqu_si256(data2 + i + 6));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),              \
+                               _mm256_lddqu_si256(data2 + i + 7));             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsA, &fours, fours, foursA, foursB);                      \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),              \
+                               _mm256_lddqu_si256(data2 + i + 8));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),              \
+                               _mm256_lddqu_si256(data2 + i + 9));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),             \
+                               _mm256_lddqu_si256(data2 + i + 10));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),             \
+                               _mm256_lddqu_si256(data2 + i + 11));            \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),             \
+                               _mm256_lddqu_si256(data2 + i + 12));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),             \
+                               _mm256_lddqu_si256(data2 + i + 13));            \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),             \
+                               _mm256_lddqu_si256(data2 + i + 14));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),             \
+                               _mm256_lddqu_si256(data2 + i + 15));            \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsB, &fours, fours, foursA, foursB);                      \
+            CSA(&sixteens, &eights, eights, eightsA, eightsB);                 \
+            total = _mm256_add_epi64(total, popcount256(sixteens));            \
+        }                                                                      \
+        total = _mm256_slli_epi64(total, 4);                                   \
+        total = _mm256_add_epi64(total,                                        \
+                                 _mm256_slli_epi64(popcount256(eights), 3));   \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1));  \
+        total = _mm256_add_epi64(total, popcount256(ones));                    \
+        for (; i < size; i++) {                                                \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            total = _mm256_add_epi64(total, popcount256(A1));                  \
+        }                                                                      \
+        return (uint64_t)(_mm256_extract_epi64(total, 0)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 1)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 2)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 3));                     \
+    }                                                                          \
+    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname(      \
+        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2,  \
+        __m256i *__restrict__ out, const uint64_t size) {                      \
+        __m256i total = _mm256_setzero_si256();                                \
+        __m256i ones = _mm256_setzero_si256();                                 \
+        __m256i twos = _mm256_setzero_si256();                                 \
+        __m256i fours = _mm256_setzero_si256();                                \
+        __m256i eights = _mm256_setzero_si256();                               \
+        __m256i sixteens = _mm256_setzero_si256();                             \
+        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;                \
+        __m256i A1, A2;                                                        \
+        const uint64_t limit = size - size % 16;                               \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 16) {                                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            _mm256_storeu_si256(out + i, A1);                                  \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),              \
+                               _mm256_lddqu_si256(data2 + i + 1));             \
+            _mm256_storeu_si256(out + i + 1, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),              \
+                               _mm256_lddqu_si256(data2 + i + 2));             \
+            _mm256_storeu_si256(out + i + 2, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),              \
+                               _mm256_lddqu_si256(data2 + i + 3));             \
+            _mm256_storeu_si256(out + i + 3, A2);                              \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),              \
+                               _mm256_lddqu_si256(data2 + i + 4));             \
+            _mm256_storeu_si256(out + i + 4, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),              \
+                               _mm256_lddqu_si256(data2 + i + 5));             \
+            _mm256_storeu_si256(out + i + 5, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),              \
+                               _mm256_lddqu_si256(data2 + i + 6));             \
+            _mm256_storeu_si256(out + i + 6, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),              \
+                               _mm256_lddqu_si256(data2 + i + 7));             \
+            _mm256_storeu_si256(out + i + 7, A2);                              \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsA, &fours, fours, foursA, foursB);                      \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),              \
+                               _mm256_lddqu_si256(data2 + i + 8));             \
+            _mm256_storeu_si256(out + i + 8, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),              \
+                               _mm256_lddqu_si256(data2 + i + 9));             \
+            _mm256_storeu_si256(out + i + 9, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),             \
+                               _mm256_lddqu_si256(data2 + i + 10));            \
+            _mm256_storeu_si256(out + i + 10, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),             \
+                               _mm256_lddqu_si256(data2 + i + 11));            \
+            _mm256_storeu_si256(out + i + 11, A2);                             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),             \
+                               _mm256_lddqu_si256(data2 + i + 12));            \
+            _mm256_storeu_si256(out + i + 12, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),             \
+                               _mm256_lddqu_si256(data2 + i + 13));            \
+            _mm256_storeu_si256(out + i + 13, A2);                             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),             \
+                               _mm256_lddqu_si256(data2 + i + 14));            \
+            _mm256_storeu_si256(out + i + 14, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),             \
+                               _mm256_lddqu_si256(data2 + i + 15));            \
+            _mm256_storeu_si256(out + i + 15, A2);                             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsB, &fours, fours, foursA, foursB);                      \
+            CSA(&sixteens, &eights, eights, eightsA, eightsB);                 \
+            total = _mm256_add_epi64(total, popcount256(sixteens));            \
+        }                                                                      \
+        total = _mm256_slli_epi64(total, 4);                                   \
+        total = _mm256_add_epi64(total,                                        \
+                                 _mm256_slli_epi64(popcount256(eights), 3));   \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1));  \
+        total = _mm256_add_epi64(total, popcount256(ones));                    \
+        for (; i < size; i++) {                                                \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            _mm256_storeu_si256(out + i, A1);                                  \
+            total = _mm256_add_epi64(total, popcount256(A1));                  \
+        }                                                                      \
+        return (uint64_t)(_mm256_extract_epi64(total, 0)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 1)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 2)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 3));                     \
+    }
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(or, _mm256_or_si256)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(union, _mm256_or_si256)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(and, _mm256_and_si256)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(intersection, _mm256_and_si256)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(xor, _mm256_xor_si256)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
+CROARING_UNTARGET_AVX2
+
+#define VPOPCNT_AND_ADD(ptr, i, accu)                                  \
+    const __m512i v##i = _mm512_loadu_si512((const __m512i *)ptr + i); \
+    const __m512i p##i = _mm512_popcnt_epi64(v##i);                    \
+    accu = _mm512_add_epi64(accu, p##i);
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+static inline uint64_t sum_epu64_256(const __m256i v) {
+    return (uint64_t)(_mm256_extract_epi64(v, 0)) +
+           (uint64_t)(_mm256_extract_epi64(v, 1)) +
+           (uint64_t)(_mm256_extract_epi64(v, 2)) +
+           (uint64_t)(_mm256_extract_epi64(v, 3));
+}
+
+static inline uint64_t simd_sum_epu64(const __m512i v) {
+    __m256i lo = _mm512_extracti64x4_epi64(v, 0);
+    __m256i hi = _mm512_extracti64x4_epi64(v, 1);
+
+    return sum_epu64_256(lo) + sum_epu64_256(hi);
+}
+
+static inline uint64_t avx512_vpopcount(const __m512i *data,
+                                        const uint64_t size) {
+    const uint64_t limit = size - size % 4;
+    __m512i total = _mm512_setzero_si512();
+    uint64_t i = 0;
+
+    for (; i < limit; i += 4) {
+        VPOPCNT_AND_ADD(data + i, 0, total);
+        VPOPCNT_AND_ADD(data + i, 1, total);
+        VPOPCNT_AND_ADD(data + i, 2, total);
+        VPOPCNT_AND_ADD(data + i, 3, total);
+    }
+
+    for (; i < size; i++) {
+        total = _mm512_add_epi64(
+            total, _mm512_popcnt_epi64(_mm512_loadu_si512(data + i)));
+    }
+
+    return simd_sum_epu64(total);
+}
+CROARING_UNTARGET_AVX512
+#endif
+
+#define AVXPOPCNTFNC512(opname, avx_intrinsic)                                \
+    static inline uint64_t avx512_harley_seal_popcount512_##opname(           \
+        const __m512i *data1, const __m512i *data2, const uint64_t size) {    \
+        __m512i total = _mm512_setzero_si512();                               \
+        const uint64_t limit = size - size % 4;                               \
+        uint64_t i = 0;                                                       \
+        for (; i < limit; i += 4) {                                           \
+            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i),         \
+                                       _mm512_loadu_si512(data2 + i));        \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1));         \
+            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1),     \
+                                       _mm512_loadu_si512(data2 + i + 1));    \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2));         \
+            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2),     \
+                                       _mm512_loadu_si512(data2 + i + 2));    \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3));         \
+            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3),     \
+                                       _mm512_loadu_si512(data2 + i + 3));    \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4));         \
+        }                                                                     \
+        for (; i < size; i++) {                                               \
+            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i),          \
+                                      _mm512_loadu_si512(data2 + i));         \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a));          \
+        }                                                                     \
+        return simd_sum_epu64(total);                                         \
+    }                                                                         \
+    static inline uint64_t avx512_harley_seal_popcount512andstore_##opname(   \
+        const __m512i *__restrict__ data1, const __m512i *__restrict__ data2, \
+        __m512i *__restrict__ out, const uint64_t size) {                     \
+        __m512i total = _mm512_setzero_si512();                               \
+        const uint64_t limit = size - size % 4;                               \
+        uint64_t i = 0;                                                       \
+        for (; i < limit; i += 4) {                                           \
+            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i),         \
+                                       _mm512_loadu_si512(data2 + i));        \
+            _mm512_storeu_si512(out + i, a1);                                 \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1));         \
+            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1),     \
+                                       _mm512_loadu_si512(data2 + i + 1));    \
+            _mm512_storeu_si512(out + i + 1, a2);                             \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2));         \
+            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2),     \
+                                       _mm512_loadu_si512(data2 + i + 2));    \
+            _mm512_storeu_si512(out + i + 2, a3);                             \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3));         \
+            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3),     \
+                                       _mm512_loadu_si512(data2 + i + 3));    \
+            _mm512_storeu_si512(out + i + 3, a4);                             \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4));         \
+        }                                                                     \
+        for (; i < size; i++) {                                               \
+            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i),          \
+                                      _mm512_loadu_si512(data2 + i));         \
+            _mm512_storeu_si512(out + i, a);                                  \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a));          \
+        }                                                                     \
+        return simd_sum_epu64(total);                                         \
+    }
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+AVXPOPCNTFNC512(or, _mm512_or_si512)
+AVXPOPCNTFNC512(union, _mm512_or_si512)
+AVXPOPCNTFNC512(and, _mm512_and_si512)
+AVXPOPCNTFNC512(intersection, _mm512_and_si512)
+AVXPOPCNTFNC512(xor, _mm512_xor_si512)
+AVXPOPCNTFNC512(andnot, _mm512_andnot_si512)
+CROARING_UNTARGET_AVX512
+#endif
+/***
+ * END Harley-Seal popcount functions.
+ */
+
+#endif  // CROARING_IS_X64
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+#endif
+/* end file include/roaring/bitset_util.h */
+/* begin file include/roaring/containers/array.h */
+/*
+ * array.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_ARRAY_H_
+#define INCLUDE_CONTAINERS_ARRAY_H_
+
+#include <string.h>
+
+
+// Include other headers after roaring_types.h
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+
+// Note: in pure C++ code, you should avoid putting `using` in header files
+using api::roaring_iterator;
+using api::roaring_iterator64;
+
+namespace internal {
+#endif
+
+/* Containers with DEFAULT_MAX_SIZE or less integers should be arrays */
+enum { DEFAULT_MAX_SIZE = 4096 };
+
+/* struct array_container - sparse representation of a bitmap
+ *
+ * @cardinality: number of indices in `array` (and the bitmap)
+ * @capacity:    allocated size of `array`
+ * @array:       sorted list of integers
+ */
+STRUCT_CONTAINER(array_container_s) {
+    int32_t cardinality;
+    int32_t capacity;
+    uint16_t *array;
+};
+
+typedef struct array_container_s array_container_t;
+
+#define CAST_array(c) CAST(array_container_t *, c)  // safer downcast
+#define const_CAST_array(c) CAST(const array_container_t *, c)
+#define movable_CAST_array(c) movable_CAST(array_container_t **, c)
+
+/* Create a new array with default. Return NULL in case of failure. See also
+ * array_container_create_given_capacity. */
+array_container_t *array_container_create(void);
+
+/* Create a new array with a specified capacity size. Return NULL in case of
+ * failure. */
+array_container_t *array_container_create_given_capacity(int32_t size);
+
+/* Create a new array containing all values in [min,max). */
+array_container_t *array_container_create_range(uint32_t min, uint32_t max);
+
+/*
+ * Shrink the capacity to the actual size, return the number of bytes saved.
+ */
+int array_container_shrink_to_fit(array_container_t *src);
+
+/* Free memory owned by `array'. */
+void array_container_free(array_container_t *array);
+
+/* Duplicate container */
+array_container_t *array_container_clone(const array_container_t *src);
+
+/* Get the cardinality of `array'. */
+ALLOW_UNALIGNED
+static inline int array_container_cardinality(const array_container_t *array) {
+    return array->cardinality;
+}
+
+static inline bool array_container_nonzero_cardinality(
+    const array_container_t *array) {
+    return array->cardinality > 0;
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void array_container_copy(const array_container_t *src, array_container_t *dst);
+
+/*  Add all the values in [min,max) (included) at a distance k*step from min.
+    The container must have a size less or equal to DEFAULT_MAX_SIZE after this
+   addition. */
+void array_container_add_from_range(array_container_t *arr, uint32_t min,
+                                    uint32_t max, uint16_t step);
+
+static inline bool array_container_empty(const array_container_t *array) {
+    return array->cardinality == 0;
+}
+
+/* check whether the cardinality is equal to the capacity (this does not mean
+ * that it contains 1<<16 elements) */
+static inline bool array_container_full(const array_container_t *array) {
+    return array->cardinality == array->capacity;
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void array_container_union(const array_container_t *src_1,
+                           const array_container_t *src_2,
+                           array_container_t *dst);
+
+/* symmetric difference, see array_container_union */
+void array_container_xor(const array_container_t *array_1,
+                         const array_container_t *array_2,
+                         array_container_t *out);
+
+/* Computes the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void array_container_intersection(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst);
+
+/* Check whether src_1 and src_2 intersect. */
+bool array_container_intersect(const array_container_t *src_1,
+                               const array_container_t *src_2);
+
+/* computers the size of the intersection between two arrays.
+ */
+int array_container_intersection_cardinality(const array_container_t *src_1,
+                                             const array_container_t *src_2);
+
+/* computes the intersection of array1 and array2 and write the result to
+ * array1.
+ * */
+void array_container_intersection_inplace(array_container_t *src_1,
+                                          const array_container_t *src_2);
+
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ */
+int array_container_to_uint32_array(void *vout, const array_container_t *cont,
+                                    uint32_t base);
+
+/* Compute the number of runs */
+int32_t array_container_number_of_runs(const array_container_t *ac);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void array_container_printf(const array_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void array_container_printf_as_uint32_array(const array_container_t *v,
+                                            uint32_t base);
+
+bool array_container_validate(const array_container_t *v, const char **reason);
+
+/**
+ * Return the serialized size in bytes of a container having cardinality "card".
+ */
+static inline int32_t array_container_serialized_size_in_bytes(int32_t card) {
+    return card * 2 + 2;
+}
+
+/**
+ * Increase capacity to at least min.
+ * Whether the existing data needs to be copied over depends on the "preserve"
+ * parameter. If preserve is false, then the new content will be uninitialized,
+ * otherwise the old content is copied.
+ */
+void array_container_grow(array_container_t *container, int32_t min,
+                          bool preserve);
+
+bool array_container_iterate(const array_container_t *cont, uint32_t base,
+                             roaring_iterator iterator, void *ptr);
+bool array_container_iterate64(const array_container_t *cont, uint32_t base,
+                               roaring_iterator64 iterator, uint64_t high_bits,
+                               void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * array_container_size_in_bytes(container).
+ *
+ */
+int32_t array_container_write(const array_container_t *container, char *buf);
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be array_container_size_in_bytes(container).
+ * You need to provide the (known) cardinality.
+ */
+int32_t array_container_read(int32_t cardinality, array_container_t *container,
+                             const char *buf);
+
+/**
+ * Return the serialized size in bytes of a container (see
+ * bitset_container_write)
+ * This is meant to be compatible with the Java and Go versions of Roaring and
+ * assumes
+ * that the cardinality of the container is already known.
+ *
+ */
+ALLOW_UNALIGNED
+static inline int32_t array_container_size_in_bytes(
+    const array_container_t *container) {
+    return container->cardinality * sizeof(uint16_t);
+}
+
+/**
+ * Return true if the two arrays have the same content.
+ */
+ALLOW_UNALIGNED
+static inline bool array_container_equals(const array_container_t *container1,
+                                          const array_container_t *container2) {
+    if (container1->cardinality != container2->cardinality) {
+        return false;
+    }
+    return memequals(container1->array, container2->array,
+                     container1->cardinality * 2);
+}
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset(const array_container_t *container1,
+                               const array_container_t *container2);
+
+/**
+ * If the element of given rank is in this container, supposing that the first
+ * element has rank start_rank, then the function returns true and sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+static inline bool array_container_select(const array_container_t *container,
+                                          uint32_t *start_rank, uint32_t rank,
+                                          uint32_t *element) {
+    int card = array_container_cardinality(container);
+    if (*start_rank + card <= rank) {
+        *start_rank += card;
+        return false;
+    } else {
+        *element = container->array[rank - *start_rank];
+        return true;
+    }
+}
+
+/* Computes the  difference of array1 and array2 and write the result
+ * to array out.
+ * Array out does not need to be distinct from array_1
+ */
+void array_container_andnot(const array_container_t *array_1,
+                            const array_container_t *array_2,
+                            array_container_t *out);
+
+/* Append x to the set. Assumes that the value is larger than any preceding
+ * values.  */
+static inline void array_container_append(array_container_t *arr,
+                                          uint16_t pos) {
+    const int32_t capacity = arr->capacity;
+
+    if (array_container_full(arr)) {
+        array_container_grow(arr, capacity + 1, true);
+    }
+
+    arr->array[arr->cardinality++] = pos;
+}
+
+/**
+ * Add value to the set if final cardinality doesn't exceed max_cardinality.
+ * Return code:
+ * 1  -- value was added
+ * 0  -- value was already present
+ * -1 -- value was not added because cardinality would exceed max_cardinality
+ */
+static inline int array_container_try_add(array_container_t *arr,
+                                          uint16_t value,
+                                          int32_t max_cardinality) {
+    const int32_t cardinality = arr->cardinality;
+
+    // best case, we can append.
+    if ((array_container_empty(arr) || arr->array[cardinality - 1] < value) &&
+        cardinality < max_cardinality) {
+        array_container_append(arr, value);
+        return 1;
+    }
+
+    const int32_t loc = binarySearch(arr->array, cardinality, value);
+
+    if (loc >= 0) {
+        return 0;
+    } else if (cardinality < max_cardinality) {
+        if (array_container_full(arr)) {
+            array_container_grow(arr, arr->capacity + 1, true);
+        }
+        const int32_t insert_idx = -loc - 1;
+        memmove(arr->array + insert_idx + 1, arr->array + insert_idx,
+                (cardinality - insert_idx) * sizeof(uint16_t));
+        arr->array[insert_idx] = value;
+        arr->cardinality++;
+        return 1;
+    } else {
+        return -1;
+    }
+}
+
+/* Add value to the set. Returns true if x was not already present.  */
+static inline bool array_container_add(array_container_t *arr, uint16_t value) {
+    return array_container_try_add(arr, value, INT32_MAX) == 1;
+}
+
+/* Remove x from the set. Returns true if x was present.  */
+static inline bool array_container_remove(array_container_t *arr,
+                                          uint16_t pos) {
+    const int32_t idx = binarySearch(arr->array, arr->cardinality, pos);
+    const bool is_present = idx >= 0;
+    if (is_present) {
+        memmove(arr->array + idx, arr->array + idx + 1,
+                (arr->cardinality - idx - 1) * sizeof(uint16_t));
+        arr->cardinality--;
+    }
+
+    return is_present;
+}
+
+/* Check whether x is present.  */
+inline bool array_container_contains(const array_container_t *arr,
+                                     uint16_t pos) {
+    //    return binarySearch(arr->array, arr->cardinality, pos) >= 0;
+    // binary search with fallback to linear search for short ranges
+    int32_t low = 0;
+    const uint16_t *carr = (const uint16_t *)arr->array;
+    int32_t high = arr->cardinality - 1;
+    //    while (high - low >= 0) {
+    while (high >= low + 16) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t middleValue = carr[middleIndex];
+        if (middleValue < pos) {
+            low = middleIndex + 1;
+        } else if (middleValue > pos) {
+            high = middleIndex - 1;
+        } else {
+            return true;
+        }
+    }
+
+    for (int i = low; i <= high; i++) {
+        uint16_t v = carr[i];
+        if (v == pos) {
+            return true;
+        }
+        if (v > pos) return false;
+    }
+    return false;
+}
+
+void array_container_offset(const array_container_t *c, container_t **loc,
+                            container_t **hic, uint16_t offset);
+
+//* Check whether a range of values from range_start (included) to range_end
+//(excluded) is present. */
+static inline bool array_container_contains_range(const array_container_t *arr,
+                                                  uint32_t range_start,
+                                                  uint32_t range_end) {
+    const int32_t range_count = range_end - range_start;
+    const uint16_t rs_included = (uint16_t)range_start;
+    const uint16_t re_included = (uint16_t)(range_end - 1);
+
+    // Empty range is always included
+    if (range_count <= 0) {
+        return true;
+    }
+    if (range_count > arr->cardinality) {
+        return false;
+    }
+
+    const int32_t start =
+        binarySearch(arr->array, arr->cardinality, rs_included);
+    // If this sorted array contains all items in the range:
+    // * the start item must be found
+    // * the last item in range range_count must exist, and be the expected end
+    // value
+    return (start >= 0) && (arr->cardinality >= start + range_count) &&
+           (arr->array[start + range_count - 1] == re_included);
+}
+
+/* Returns the smallest value (assumes not empty) */
+inline uint16_t array_container_minimum(const array_container_t *arr) {
+    if (arr->cardinality == 0) return 0;
+    return arr->array[0];
+}
+
+/* Returns the largest value (assumes not empty) */
+inline uint16_t array_container_maximum(const array_container_t *arr) {
+    if (arr->cardinality == 0) return 0;
+    return arr->array[arr->cardinality - 1];
+}
+
+/* Returns the number of values equal or smaller than x */
+inline int array_container_rank(const array_container_t *arr, uint16_t x) {
+    const int32_t idx = binarySearch(arr->array, arr->cardinality, x);
+    const bool is_present = idx >= 0;
+    if (is_present) {
+        return idx + 1;
+    } else {
+        return -idx - 1;
+    }
+}
+
+/*  bulk version of array_container_rank(); return number of consumed elements
+ */
+inline uint32_t array_container_rank_many(const array_container_t *arr,
+                                          uint64_t start_rank,
+                                          const uint32_t *begin,
+                                          const uint32_t *end, uint64_t *ans) {
+    const uint16_t high = (uint16_t)((*begin) >> 16);
+    uint32_t pos = 0;
+    const uint32_t *iter = begin;
+    for (; iter != end; iter++) {
+        uint32_t x = *iter;
+        uint16_t xhigh = (uint16_t)(x >> 16);
+        if (xhigh != high) return iter - begin;  // stop at next container
+
+        const int32_t idx =
+            binarySearch(arr->array + pos, arr->cardinality - pos, (uint16_t)x);
+        const bool is_present = idx >= 0;
+        if (is_present) {
+            *(ans++) = start_rank + pos + (idx + 1);
+            pos = idx + 1;
+        } else {
+            *(ans++) = start_rank + pos + (-idx - 1);
+        }
+    }
+    return iter - begin;
+}
+
+/* Returns the index of x , if not exsist return -1 */
+inline int array_container_get_index(const array_container_t *arr, uint16_t x) {
+    const int32_t idx = binarySearch(arr->array, arr->cardinality, x);
+    const bool is_present = idx >= 0;
+    if (is_present) {
+        return idx;
+    } else {
+        return -1;
+    }
+}
+
+/* Returns the index of the first value equal or larger than x, or -1 */
+inline int array_container_index_equalorlarger(const array_container_t *arr,
+                                               uint16_t x) {
+    const int32_t idx = binarySearch(arr->array, arr->cardinality, x);
+    const bool is_present = idx >= 0;
+    if (is_present) {
+        return idx;
+    } else {
+        int32_t candidate = -idx - 1;
+        if (candidate < arr->cardinality) return candidate;
+        return -1;
+    }
+}
+
+/*
+ * Adds all values in range [min,max] using hint:
+ *   nvals_less is the number of array values less than $min
+ *   nvals_greater is the number of array values greater than $max
+ */
+static inline void array_container_add_range_nvals(array_container_t *array,
+                                                   uint32_t min, uint32_t max,
+                                                   int32_t nvals_less,
+                                                   int32_t nvals_greater) {
+    int32_t union_cardinality = nvals_less + (max - min + 1) + nvals_greater;
+    if (union_cardinality > array->capacity) {
+        array_container_grow(array, union_cardinality, true);
+    }
+    memmove(&(array->array[union_cardinality - nvals_greater]),
+            &(array->array[array->cardinality - nvals_greater]),
+            nvals_greater * sizeof(uint16_t));
+    for (uint32_t i = 0; i <= max - min; i++) {
+        array->array[nvals_less + i] = (uint16_t)(min + i);
+    }
+    array->cardinality = union_cardinality;
+}
+
+/**
+ * Adds all values in range [min,max]. This function is currently unused
+ * and left as a documentation.
+ */
+/*static inline void array_container_add_range(array_container_t *array,
+                                             uint32_t min, uint32_t max) {
+    int32_t nvals_greater = count_greater(array->array, array->cardinality,
+max); int32_t nvals_less = count_less(array->array, array->cardinality -
+nvals_greater, min); array_container_add_range_nvals(array, min, max,
+nvals_less, nvals_greater);
+}*/
+
+/*
+ * Removes all elements array[pos] .. array[pos+count-1]
+ */
+static inline void array_container_remove_range(array_container_t *array,
+                                                uint32_t pos, uint32_t count) {
+    if (count != 0) {
+        memmove(&(array->array[pos]), &(array->array[pos + count]),
+                (array->cardinality - pos - count) * sizeof(uint16_t));
+        array->cardinality -= count;
+    }
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_ARRAY_H_ */
+/* end file include/roaring/containers/array.h */
+/* begin file include/roaring/containers/bitset.h */
+/*
+ * bitset.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_BITSET_H_
+#define INCLUDE_CONTAINERS_BITSET_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+
+// Include other headers after roaring_types.h
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+
+// Note: in pure C++ code, you should avoid putting `using` in header files
+using api::roaring_iterator;
+using api::roaring_iterator64;
+
+namespace internal {
+#endif
+
+enum {
+    BITSET_CONTAINER_SIZE_IN_WORDS = (1 << 16) / 64,
+    BITSET_UNKNOWN_CARDINALITY = -1
+};
+
+STRUCT_CONTAINER(bitset_container_s) {
+    int32_t cardinality;
+    uint64_t *words;
+};
+
+typedef struct bitset_container_s bitset_container_t;
+
+#define CAST_bitset(c) CAST(bitset_container_t *, c)  // safer downcast
+#define const_CAST_bitset(c) CAST(const bitset_container_t *, c)
+#define movable_CAST_bitset(c) movable_CAST(bitset_container_t **, c)
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_container_t *bitset_container_create(void);
+
+/* Free memory. */
+void bitset_container_free(bitset_container_t *bitset);
+
+/* Clear bitset (sets bits to 0). */
+void bitset_container_clear(bitset_container_t *bitset);
+
+/* Set all bits to 1. */
+void bitset_container_set_all(bitset_container_t *bitset);
+
+/* Duplicate bitset */
+bitset_container_t *bitset_container_clone(const bitset_container_t *src);
+
+/* Set the bit in [begin,end). WARNING: as of April 2016, this method is slow
+ * and
+ * should not be used in performance-sensitive code. Ever.  */
+void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
+                                uint32_t end);
+
+#if defined(CROARING_ASMBITMANIPOPTIMIZATION) && defined(__AVX2__)
+/* Set the ith bit.  */
+static inline void bitset_container_set(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t shift = 6;
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);
+    uint64_t load = bitset->words[offset];
+    ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality);
+    bitset->words[offset] = load;
+}
+
+/* Unset the ith bit. Currently unused. Could be used for optimization. */
+/*static inline void bitset_container_unset(bitset_container_t *bitset,
+                                          uint16_t pos) {
+    uint64_t shift = 6;
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);
+    uint64_t load = bitset->words[offset];
+    ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality);
+    bitset->words[offset] = load;
+}*/
+
+/* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower
+ * than bitset_container_set.  */
+static inline bool bitset_container_add(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t shift = 6;
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);
+    uint64_t load = bitset->words[offset];
+    // could be possibly slightly further optimized
+    const int32_t oldcard = bitset->cardinality;
+    ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality);
+    bitset->words[offset] = load;
+    return bitset->cardinality - oldcard;
+}
+
+/* Remove `pos' from `bitset'. Returns true if `pos' was present.  Might be
+ * slower than bitset_container_unset.  */
+static inline bool bitset_container_remove(bitset_container_t *bitset,
+                                           uint16_t pos) {
+    uint64_t shift = 6;
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);
+    uint64_t load = bitset->words[offset];
+    // could be possibly slightly further optimized
+    const int32_t oldcard = bitset->cardinality;
+    ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality);
+    bitset->words[offset] = load;
+    return oldcard - bitset->cardinality;
+}
+
+/* Get the value of the ith bit.  */
+inline bool bitset_container_get(const bitset_container_t *bitset,
+                                 uint16_t pos) {
+    uint64_t word = bitset->words[pos >> 6];
+    const uint64_t p = pos;
+    ASM_INPLACESHIFT_RIGHT(word, p);
+    return word & 1;
+}
+
+#else
+
+/* Set the ith bit.  */
+static inline void bitset_container_set(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    const uint64_t old_word = bitset->words[pos >> 6];
+    const int index = pos & 63;
+    const uint64_t new_word = old_word | (UINT64_C(1) << index);
+    bitset->cardinality += (uint32_t)((old_word ^ new_word) >> index);
+    bitset->words[pos >> 6] = new_word;
+}
+
+/* Unset the ith bit. Currently unused.  */
+/*static inline void bitset_container_unset(bitset_container_t *bitset,
+                                          uint16_t pos) {
+    const uint64_t old_word = bitset->words[pos >> 6];
+    const int index = pos & 63;
+    const uint64_t new_word = old_word & (~(UINT64_C(1) << index));
+    bitset->cardinality -= (uint32_t)((old_word ^ new_word) >> index);
+    bitset->words[pos >> 6] = new_word;
+}*/
+
+/* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower
+ * than bitset_container_set.  */
+static inline bool bitset_container_add(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    const uint64_t old_word = bitset->words[pos >> 6];
+    const int index = pos & 63;
+    const uint64_t new_word = old_word | (UINT64_C(1) << index);
+    const uint64_t increment = (old_word ^ new_word) >> index;
+    bitset->cardinality += (uint32_t)increment;
+    bitset->words[pos >> 6] = new_word;
+    return increment > 0;
+}
+
+/* Remove `pos' from `bitset'. Returns true if `pos' was present.  Might be
+ * slower than bitset_container_unset.  */
+static inline bool bitset_container_remove(bitset_container_t *bitset,
+                                           uint16_t pos) {
+    const uint64_t old_word = bitset->words[pos >> 6];
+    const int index = pos & 63;
+    const uint64_t new_word = old_word & (~(UINT64_C(1) << index));
+    const uint64_t increment = (old_word ^ new_word) >> index;
+    bitset->cardinality -= (uint32_t)increment;
+    bitset->words[pos >> 6] = new_word;
+    return increment > 0;
+}
+
+/* Get the value of the ith bit.  */
+inline bool bitset_container_get(const bitset_container_t *bitset,
+                                 uint16_t pos) {
+    const uint64_t word = bitset->words[pos >> 6];
+    return (word >> (pos & 63)) & 1;
+}
+
+#endif
+
+/*
+ * Check if all bits are set in a range of positions from pos_start (included)
+ * to pos_end (excluded).
+ */
+static inline bool bitset_container_get_range(const bitset_container_t *bitset,
+                                              uint32_t pos_start,
+                                              uint32_t pos_end) {
+    const uint32_t start = pos_start >> 6;
+    const uint32_t end = pos_end >> 6;
+
+    const uint64_t first = ~((1ULL << (pos_start & 0x3F)) - 1);
+    const uint64_t last = (1ULL << (pos_end & 0x3F)) - 1;
+
+    if (start == end)
+        return ((bitset->words[end] & first & last) == (first & last));
+    if ((bitset->words[start] & first) != first) return false;
+
+    if ((end < BITSET_CONTAINER_SIZE_IN_WORDS) &&
+        ((bitset->words[end] & last) != last)) {
+        return false;
+    }
+
+    for (uint32_t i = start + 1;
+         (i < BITSET_CONTAINER_SIZE_IN_WORDS) && (i < end); ++i) {
+        if (bitset->words[i] != UINT64_C(0xFFFFFFFFFFFFFFFF)) return false;
+    }
+
+    return true;
+}
+
+/* Check whether `bitset' is present in `array'.  Calls bitset_container_get. */
+inline bool bitset_container_contains(const bitset_container_t *bitset,
+                                      uint16_t pos) {
+    return bitset_container_get(bitset, pos);
+}
+
+/*
+ * Check whether a range of bits from position `pos_start' (included) to
+ * `pos_end' (excluded) is present in `bitset'.  Calls bitset_container_get_all.
+ */
+static inline bool bitset_container_contains_range(
+    const bitset_container_t *bitset, uint32_t pos_start, uint32_t pos_end) {
+    return bitset_container_get_range(bitset, pos_start, pos_end);
+}
+
+/* Get the number of bits set */
+ALLOW_UNALIGNED
+static inline int bitset_container_cardinality(
+    const bitset_container_t *bitset) {
+    return bitset->cardinality;
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void bitset_container_copy(const bitset_container_t *source,
+                           bitset_container_t *dest);
+
+/*  Add all the values [min,max) at a distance k*step from min: min,
+ * min+step,.... */
+void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
+                                     uint32_t max, uint16_t step);
+
+/* Get the number of bits set (force computation). This does not modify bitset.
+ * To update the cardinality, you should do
+ * bitset->cardinality =  bitset_container_compute_cardinality(bitset).*/
+int bitset_container_compute_cardinality(const bitset_container_t *bitset);
+
+/* Check whether this bitset is empty,
+ *  it never modifies the bitset struct. */
+static inline bool bitset_container_empty(const bitset_container_t *bitset) {
+    if (bitset->cardinality == BITSET_UNKNOWN_CARDINALITY) {
+        for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {
+            if ((bitset->words[i]) != 0) return false;
+        }
+        return true;
+    }
+    return bitset->cardinality == 0;
+}
+
+/* Get whether there is at least one bit set  (see bitset_container_empty for
+   the reverse), the bitset is never modified */
+static inline bool bitset_container_const_nonzero_cardinality(
+    const bitset_container_t *bitset) {
+    return !bitset_container_empty(bitset);
+}
+
+/*
+ * Check whether the two bitsets intersect
+ */
+bool bitset_container_intersect(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst'  and return the
+ * cardinality. */
+int bitset_container_or(const bitset_container_t *src_1,
+                        const bitset_container_t *src_2,
+                        bitset_container_t *dst);
+
+/* Computes the union of bitsets `src_1' and `src_2' and return the cardinality.
+ */
+int bitset_container_or_justcard(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst' and return the
+ * cardinality. Same as bitset_container_or. */
+int bitset_container_union(const bitset_container_t *src_1,
+                           const bitset_container_t *src_2,
+                           bitset_container_t *dst);
+
+/* Computes the union of bitsets `src_1' and `src_2'  and return the
+ * cardinality. Same as bitset_container_or_justcard. */
+int bitset_container_union_justcard(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_union_nocard(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does not
+ * update the cardinality. Provided to optimize chained operations. */
+int bitset_container_or_nocard(const bitset_container_t *src_1,
+                               const bitset_container_t *src_2,
+                               bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. */
+int bitset_container_and(const bitset_container_t *src_1,
+                         const bitset_container_t *src_2,
+                         bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2'  and return the
+ * cardinality. */
+int bitset_container_and_justcard(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. Same as bitset_container_and. */
+int bitset_container_intersection(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' and return the
+ * cardinality. Same as bitset_container_and_justcard. */
+int bitset_container_intersection_justcard(const bitset_container_t *src_1,
+                                           const bitset_container_t *src_2);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_intersection_nocard(const bitset_container_t *src_1,
+                                         const bitset_container_t *src_2,
+                                         bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_and_nocard(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. */
+int bitset_container_xor(const bitset_container_t *src_1,
+                         const bitset_container_t *src_2,
+                         bitset_container_t *dst);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' and return the
+ * cardinality. */
+int bitset_container_xor_justcard(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_xor_nocard(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Computes the and not of bitsets `src_1' and `src_2' into `dst' and return the
+ * cardinality. */
+int bitset_container_andnot(const bitset_container_t *src_1,
+                            const bitset_container_t *src_2,
+                            bitset_container_t *dst);
+
+/* Computes the and not of bitsets `src_1' and `src_2'  and return the
+ * cardinality. */
+int bitset_container_andnot_justcard(const bitset_container_t *src_1,
+                                     const bitset_container_t *src_2);
+
+/* Computes the and not or of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_andnot_nocard(const bitset_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst);
+
+void bitset_container_offset(const bitset_container_t *c, container_t **loc,
+                             container_t **hic, uint16_t offset);
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ * The out pointer should point to enough memory (the cardinality times 32
+ * bits).
+ */
+int bitset_container_to_uint32_array(uint32_t *out,
+                                     const bitset_container_t *bc,
+                                     uint32_t base);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void bitset_container_printf(const bitset_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void bitset_container_printf_as_uint32_array(const bitset_container_t *v,
+                                             uint32_t base);
+
+bool bitset_container_validate(const bitset_container_t *v,
+                               const char **reason);
+
+/**
+ * Return the serialized size in bytes of a container.
+ */
+static inline int32_t bitset_container_serialized_size_in_bytes(void) {
+    return BITSET_CONTAINER_SIZE_IN_WORDS * 8;
+}
+
+/**
+ * Return the the number of runs.
+ */
+int bitset_container_number_of_runs(bitset_container_t *bc);
+
+bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base,
+                              roaring_iterator iterator, void *ptr);
+bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base,
+                                roaring_iterator64 iterator, uint64_t high_bits,
+                                void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * bitset_container_size_in_bytes(container).
+ */
+int32_t bitset_container_write(const bitset_container_t *container, char *buf);
+
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be bitset_container_size_in_bytes(container).
+ * You need to provide the (known) cardinality.
+ */
+int32_t bitset_container_read(int32_t cardinality,
+                              bitset_container_t *container, const char *buf);
+/**
+ * Return the serialized size in bytes of a container (see
+ * bitset_container_write).
+ * This is meant to be compatible with the Java and Go versions of Roaring and
+ * assumes
+ * that the cardinality of the container is already known or can be computed.
+ */
+static inline int32_t bitset_container_size_in_bytes(
+    const bitset_container_t *container) {
+    (void)container;
+    return BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+}
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool bitset_container_equals(const bitset_container_t *container1,
+                             const bitset_container_t *container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool bitset_container_is_subset(const bitset_container_t *container1,
+                                const bitset_container_t *container2);
+
+/**
+ * If the element of given rank is in this container, supposing that the first
+ * element has rank start_rank, then the function returns true and sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+bool bitset_container_select(const bitset_container_t *container,
+                             uint32_t *start_rank, uint32_t rank,
+                             uint32_t *element);
+
+/* Returns the smallest value (assumes not empty) */
+uint16_t bitset_container_minimum(const bitset_container_t *container);
+
+/* Returns the largest value (assumes not empty) */
+uint16_t bitset_container_maximum(const bitset_container_t *container);
+
+/* Returns the number of values equal or smaller than x */
+int bitset_container_rank(const bitset_container_t *container, uint16_t x);
+
+/* bulk version of bitset_container_rank(); return number of consumed elements
+ */
+uint32_t bitset_container_rank_many(const bitset_container_t *container,
+                                    uint64_t start_rank, const uint32_t *begin,
+                                    const uint32_t *end, uint64_t *ans);
+
+/* Returns the index of x , if not exsist return -1 */
+int bitset_container_get_index(const bitset_container_t *container, uint16_t x);
+
+/* Returns the index of the first value equal or larger than x, or -1 */
+int bitset_container_index_equalorlarger(const bitset_container_t *container,
+                                         uint16_t x);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_BITSET_H_ */
+/* end file include/roaring/containers/bitset.h */
+/* begin file include/roaring/containers/run.h */
+/*
+ * run.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_RUN_H_
+#define INCLUDE_CONTAINERS_RUN_H_
+
+
+// Include other headers after roaring_types.h
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+
+// Note: in pure C++ code, you should avoid putting `using` in header files
+using api::roaring_iterator;
+using api::roaring_iterator64;
+
+namespace internal {
+#endif
+
+/* struct rle16_s - run length pair
+ *
+ * @value:  start position of the run
+ * @length: length of the run is `length + 1`
+ *
+ * An RLE pair {v, l} would represent the integers between the interval
+ * [v, v+l+1], e.g. {3, 2} = [3, 4, 5].
+ */
+struct rle16_s {
+    uint16_t value;
+    uint16_t length;
+};
+
+typedef struct rle16_s rle16_t;
+
+#ifdef __cplusplus
+#define MAKE_RLE16(val, len) \
+    { (uint16_t)(val), (uint16_t)(len) }  // no tagged structs until c++20
+#else
+#define MAKE_RLE16(val, len) \
+    (rle16_t) { .value = (uint16_t)(val), .length = (uint16_t)(len) }
+#endif
+
+/* struct run_container_s - run container bitmap
+ *
+ * @n_runs:   number of rle_t pairs in `runs`.
+ * @capacity: capacity in rle_t pairs `runs` can hold.
+ * @runs:     pairs of rle_t.
+ */
+STRUCT_CONTAINER(run_container_s) {
+    int32_t n_runs;
+    int32_t capacity;
+    rle16_t *runs;
+};
+
+typedef struct run_container_s run_container_t;
+
+#define CAST_run(c) CAST(run_container_t *, c)  // safer downcast
+#define const_CAST_run(c) CAST(const run_container_t *, c)
+#define movable_CAST_run(c) movable_CAST(run_container_t **, c)
+
+/* Create a new run container. Return NULL in case of failure. */
+run_container_t *run_container_create(void);
+
+/* Create a new run container with given capacity. Return NULL in case of
+ * failure. */
+run_container_t *run_container_create_given_capacity(int32_t size);
+
+/*
+ * Shrink the capacity to the actual size, return the number of bytes saved.
+ */
+int run_container_shrink_to_fit(run_container_t *src);
+
+/* Free memory owned by `run'. */
+void run_container_free(run_container_t *run);
+
+/* Duplicate container */
+run_container_t *run_container_clone(const run_container_t *src);
+
+/*
+ * Effectively deletes the value at index index, repacking data.
+ */
+static inline void recoverRoomAtIndex(run_container_t *run, uint16_t index) {
+    memmove(run->runs + index, run->runs + (1 + index),
+            (run->n_runs - index - 1) * sizeof(rle16_t));
+    run->n_runs--;
+}
+
+/**
+ * Good old binary search through rle data
+ */
+inline int32_t interleavedBinarySearch(const rle16_t *array, int32_t lenarray,
+                                       uint16_t ikey) {
+    int32_t low = 0;
+    int32_t high = lenarray - 1;
+    while (low <= high) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t middleValue = array[middleIndex].value;
+        if (middleValue < ikey) {
+            low = middleIndex + 1;
+        } else if (middleValue > ikey) {
+            high = middleIndex - 1;
+        } else {
+            return middleIndex;
+        }
+    }
+    return -(low + 1);
+}
+
+/*
+ * Returns index of the run which contains $ikey
+ */
+static inline int32_t rle16_find_run(const rle16_t *array, int32_t lenarray,
+                                     uint16_t ikey) {
+    int32_t low = 0;
+    int32_t high = lenarray - 1;
+    while (low <= high) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t min = array[middleIndex].value;
+        uint16_t max = array[middleIndex].value + array[middleIndex].length;
+        if (ikey > max) {
+            low = middleIndex + 1;
+        } else if (ikey < min) {
+            high = middleIndex - 1;
+        } else {
+            return middleIndex;
+        }
+    }
+    return -(low + 1);
+}
+
+/**
+ * Returns number of runs which can'be be merged with the key because they
+ * are less than the key.
+ * Note that [5,6,7,8] can be merged with the key 9 and won't be counted.
+ */
+static inline int32_t rle16_count_less(const rle16_t *array, int32_t lenarray,
+                                       uint16_t key) {
+    if (lenarray == 0) return 0;
+    int32_t low = 0;
+    int32_t high = lenarray - 1;
+    while (low <= high) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t min_value = array[middleIndex].value;
+        uint16_t max_value =
+            array[middleIndex].value + array[middleIndex].length;
+        if (max_value + UINT32_C(1) < key) {  // uint32 arithmetic
+            low = middleIndex + 1;
+        } else if (key < min_value) {
+            high = middleIndex - 1;
+        } else {
+            return middleIndex;
+        }
+    }
+    return low;
+}
+
+static inline int32_t rle16_count_greater(const rle16_t *array,
+                                          int32_t lenarray, uint16_t key) {
+    if (lenarray == 0) return 0;
+    int32_t low = 0;
+    int32_t high = lenarray - 1;
+    while (low <= high) {
+        int32_t middleIndex = (low + high) >> 1;
+        uint16_t min_value = array[middleIndex].value;
+        uint16_t max_value =
+            array[middleIndex].value + array[middleIndex].length;
+        if (max_value < key) {
+            low = middleIndex + 1;
+        } else if (key + UINT32_C(1) < min_value) {  // uint32 arithmetic
+            high = middleIndex - 1;
+        } else {
+            return lenarray - (middleIndex + 1);
+        }
+    }
+    return lenarray - low;
+}
+
+/**
+ * increase capacity to at least min. Whether the
+ * existing data needs to be copied over depends on copy. If "copy" is false,
+ * then the new content will be uninitialized, otherwise a copy is made.
+ */
+void run_container_grow(run_container_t *run, int32_t min, bool copy);
+
+/**
+ * Moves the data so that we can write data at index
+ */
+static inline void makeRoomAtIndex(run_container_t *run, uint16_t index) {
+    /* This function calls realloc + memmove sequentially to move by one index.
+     * Potentially copying twice the array.
+     */
+    if (run->n_runs + 1 > run->capacity)
+        run_container_grow(run, run->n_runs + 1, true);
+    memmove(run->runs + 1 + index, run->runs + index,
+            (run->n_runs - index) * sizeof(rle16_t));
+    run->n_runs++;
+}
+
+/* Add `pos' to `run'. Returns true if `pos' was not present. */
+bool run_container_add(run_container_t *run, uint16_t pos);
+
+/* Remove `pos' from `run'. Returns true if `pos' was present. */
+static inline bool run_container_remove(run_container_t *run, uint16_t pos) {
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (index >= 0) {
+        int32_t le = run->runs[index].length;
+        if (le == 0) {
+            recoverRoomAtIndex(run, (uint16_t)index);
+        } else {
+            run->runs[index].value++;
+            run->runs[index].length--;
+        }
+        return true;
+    }
+    index = -index - 2;  // points to preceding value, possibly -1
+    if (index >= 0) {    // possible match
+        int32_t offset = pos - run->runs[index].value;
+        int32_t le = run->runs[index].length;
+        if (offset < le) {
+            // need to break in two
+            run->runs[index].length = (uint16_t)(offset - 1);
+            // need to insert
+            uint16_t newvalue = pos + 1;
+            int32_t newlength = le - offset - 1;
+            makeRoomAtIndex(run, (uint16_t)(index + 1));
+            run->runs[index + 1].value = newvalue;
+            run->runs[index + 1].length = (uint16_t)newlength;
+            return true;
+
+        } else if (offset == le) {
+            run->runs[index].length--;
+            return true;
+        }
+    }
+    // no match
+    return false;
+}
+
+/* Check whether `pos' is present in `run'.  */
+inline bool run_container_contains(const run_container_t *run, uint16_t pos) {
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (index >= 0) return true;
+    index = -index - 2;  // points to preceding value, possibly -1
+    if (index != -1) {   // possible match
+        int32_t offset = pos - run->runs[index].value;
+        int32_t le = run->runs[index].length;
+        if (offset <= le) return true;
+    }
+    return false;
+}
+
+/*
+ * Check whether all positions in a range of positions from pos_start (included)
+ * to pos_end (excluded) is present in `run'.
+ */
+static inline bool run_container_contains_range(const run_container_t *run,
+                                                uint32_t pos_start,
+                                                uint32_t pos_end) {
+    uint32_t count = 0;
+    int32_t index =
+        interleavedBinarySearch(run->runs, run->n_runs, (uint16_t)pos_start);
+    if (index < 0) {
+        index = -index - 2;
+        if ((index == -1) ||
+            ((pos_start - run->runs[index].value) > run->runs[index].length)) {
+            return false;
+        }
+    }
+    for (int32_t i = index; i < run->n_runs; ++i) {
+        const uint32_t stop = run->runs[i].value + run->runs[i].length;
+        if (run->runs[i].value >= pos_end) break;
+        if (stop >= pos_end) {
+            count += (((pos_end - run->runs[i].value) > 0)
+                          ? (pos_end - run->runs[i].value)
+                          : 0);
+            break;
+        }
+        const uint32_t min = (stop - pos_start) > 0 ? (stop - pos_start) : 0;
+        count += (min < run->runs[i].length) ? min : run->runs[i].length;
+    }
+    return count >= (pos_end - pos_start - 1);
+}
+
+/* Get the cardinality of `run'. Requires an actual computation. */
+int run_container_cardinality(const run_container_t *run);
+
+/* Card > 0?, see run_container_empty for the reverse */
+static inline bool run_container_nonzero_cardinality(
+    const run_container_t *run) {
+    return run->n_runs > 0;  // runs never empty
+}
+
+/* Card == 0?, see run_container_nonzero_cardinality for the reverse */
+static inline bool run_container_empty(const run_container_t *run) {
+    return run->n_runs == 0;  // runs never empty
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void run_container_copy(const run_container_t *src, run_container_t *dst);
+
+/**
+ * Append run described by vl to the run container, possibly merging.
+ * It is assumed that the run would be inserted at the end of the container, no
+ * check is made.
+ * It is assumed that the run container has the necessary capacity: caller is
+ * responsible for checking memory capacity.
+ *
+ *
+ * This is not a safe function, it is meant for performance: use with care.
+ */
+static inline void run_container_append(run_container_t *run, rle16_t vl,
+                                        rle16_t *previousrl) {
+    const uint32_t previousend = previousrl->value + previousrl->length;
+    if (vl.value > previousend + 1) {  // we add a new one
+        run->runs[run->n_runs] = vl;
+        run->n_runs++;
+        *previousrl = vl;
+    } else {
+        uint32_t newend = vl.value + vl.length + UINT32_C(1);
+        if (newend > previousend) {  // we merge
+            previousrl->length = (uint16_t)(newend - 1 - previousrl->value);
+            run->runs[run->n_runs - 1] = *previousrl;
+        }
+    }
+}
+
+/**
+ * Like run_container_append but it is assumed that the content of run is empty.
+ */
+static inline rle16_t run_container_append_first(run_container_t *run,
+                                                 rle16_t vl) {
+    run->runs[run->n_runs] = vl;
+    run->n_runs++;
+    return vl;
+}
+
+/**
+ * append a single value  given by val to the run container, possibly merging.
+ * It is assumed that the value would be inserted at the end of the container,
+ * no check is made.
+ * It is assumed that the run container has the necessary capacity: caller is
+ * responsible for checking memory capacity.
+ *
+ * This is not a safe function, it is meant for performance: use with care.
+ */
+static inline void run_container_append_value(run_container_t *run,
+                                              uint16_t val,
+                                              rle16_t *previousrl) {
+    const uint32_t previousend = previousrl->value + previousrl->length;
+    if (val > previousend + 1) {  // we add a new one
+        *previousrl = MAKE_RLE16(val, 0);
+        run->runs[run->n_runs] = *previousrl;
+        run->n_runs++;
+    } else if (val == previousend + 1) {  // we merge
+        previousrl->length++;
+        run->runs[run->n_runs - 1] = *previousrl;
+    }
+}
+
+/**
+ * Like run_container_append_value but it is assumed that the content of run is
+ * empty.
+ */
+static inline rle16_t run_container_append_value_first(run_container_t *run,
+                                                       uint16_t val) {
+    rle16_t newrle = MAKE_RLE16(val, 0);
+    run->runs[run->n_runs] = newrle;
+    run->n_runs++;
+    return newrle;
+}
+
+/* Check whether the container spans the whole chunk (cardinality = 1<<16).
+ * This check can be done in constant time (inexpensive). */
+static inline bool run_container_is_full(const run_container_t *run) {
+    rle16_t vl = run->runs[0];
+    return (run->n_runs == 1) && (vl.value == 0) && (vl.length == 0xFFFF);
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_union(const run_container_t *src_1,
+                         const run_container_t *src_2, run_container_t *dst);
+
+/* Compute the union of `src_1' and `src_2' and write the result to `src_1' */
+void run_container_union_inplace(run_container_t *src_1,
+                                 const run_container_t *src_2);
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void run_container_intersection(const run_container_t *src_1,
+                                const run_container_t *src_2,
+                                run_container_t *dst);
+
+/* Compute the size of the intersection of src_1 and src_2 . */
+int run_container_intersection_cardinality(const run_container_t *src_1,
+                                           const run_container_t *src_2);
+
+/* Check whether src_1 and src_2 intersect. */
+bool run_container_intersect(const run_container_t *src_1,
+                             const run_container_t *src_2);
+
+/* Compute the symmetric difference of `src_1' and `src_2' and write the result
+ * to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_xor(const run_container_t *src_1,
+                       const run_container_t *src_2, run_container_t *dst);
+
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ */
+int run_container_to_uint32_array(void *vout, const run_container_t *cont,
+                                  uint32_t base);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void run_container_printf(const run_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void run_container_printf_as_uint32_array(const run_container_t *v,
+                                          uint32_t base);
+
+bool run_container_validate(const run_container_t *run, const char **reason);
+
+/**
+ * Return the serialized size in bytes of a container having "num_runs" runs.
+ */
+static inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs) {
+    return sizeof(uint16_t) +
+           sizeof(rle16_t) * num_runs;  // each run requires 2 2-byte entries.
+}
+
+bool run_container_iterate(const run_container_t *cont, uint32_t base,
+                           roaring_iterator iterator, void *ptr);
+bool run_container_iterate64(const run_container_t *cont, uint32_t base,
+                             roaring_iterator64 iterator, uint64_t high_bits,
+                             void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be run_container_size_in_bytes(container).
+ */
+int32_t run_container_write(const run_container_t *container, char *buf);
+
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be bitset_container_size_in_bytes(container).
+ * The cardinality parameter is provided for consistency with other containers,
+ * but
+ * it might be effectively ignored..
+ */
+int32_t run_container_read(int32_t cardinality, run_container_t *container,
+                           const char *buf);
+
+/**
+ * Return the serialized size in bytes of a container (see run_container_write).
+ * This is meant to be compatible with the Java and Go versions of Roaring.
+ */
+ALLOW_UNALIGNED
+static inline int32_t run_container_size_in_bytes(
+    const run_container_t *container) {
+    return run_container_serialized_size_in_bytes(container->n_runs);
+}
+
+/**
+ * Return true if the two containers have the same content.
+ */
+ALLOW_UNALIGNED
+static inline bool run_container_equals(const run_container_t *container1,
+                                        const run_container_t *container2) {
+    if (container1->n_runs != container2->n_runs) {
+        return false;
+    }
+    return memequals(container1->runs, container2->runs,
+                     container1->n_runs * sizeof(rle16_t));
+}
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool run_container_is_subset(const run_container_t *container1,
+                             const run_container_t *container2);
+
+/**
+ * Used in a start-finish scan that appends segments, for XOR and NOT
+ */
+
+void run_container_smart_append_exclusive(run_container_t *src,
+                                          const uint16_t start,
+                                          const uint16_t length);
+
+/**
+ * The new container consists of a single run [start,stop).
+ * It is required that stop>start, the caller is responsability for this check.
+ * It is required that stop <= (1<<16), the caller is responsability for this
+ * check. The cardinality of the created container is stop - start. Returns NULL
+ * on failure
+ */
+static inline run_container_t *run_container_create_range(uint32_t start,
+                                                          uint32_t stop) {
+    run_container_t *rc = run_container_create_given_capacity(1);
+    if (rc) {
+        rle16_t r;
+        r.value = (uint16_t)start;
+        r.length = (uint16_t)(stop - start - 1);
+        run_container_append_first(rc, r);
+    }
+    return rc;
+}
+
+/**
+ * If the element of given rank is in this container, supposing that the first
+ * element has rank start_rank, then the function returns true and sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+bool run_container_select(const run_container_t *container,
+                          uint32_t *start_rank, uint32_t rank,
+                          uint32_t *element);
+
+/* Compute the difference of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+
+void run_container_andnot(const run_container_t *src_1,
+                          const run_container_t *src_2, run_container_t *dst);
+
+void run_container_offset(const run_container_t *c, container_t **loc,
+                          container_t **hic, uint16_t offset);
+
+/* Returns the smallest value (assumes not empty) */
+inline uint16_t run_container_minimum(const run_container_t *run) {
+    if (run->n_runs == 0) return 0;
+    return run->runs[0].value;
+}
+
+/* Returns the largest value (assumes not empty) */
+inline uint16_t run_container_maximum(const run_container_t *run) {
+    if (run->n_runs == 0) return 0;
+    return run->runs[run->n_runs - 1].value + run->runs[run->n_runs - 1].length;
+}
+
+/* Returns the number of values equal or smaller than x */
+int run_container_rank(const run_container_t *arr, uint16_t x);
+
+/* bulk version of run_container_rank(); return number of consumed elements */
+uint32_t run_container_rank_many(const run_container_t *arr,
+                                 uint64_t start_rank, const uint32_t *begin,
+                                 const uint32_t *end, uint64_t *ans);
+
+/* Returns the index of x, if not exsist return -1 */
+int run_container_get_index(const run_container_t *arr, uint16_t x);
+
+/* Returns the index of the first run containing a value at least as large as x,
+ * or -1 */
+inline int run_container_index_equalorlarger(const run_container_t *arr,
+                                             uint16_t x) {
+    int32_t index = interleavedBinarySearch(arr->runs, arr->n_runs, x);
+    if (index >= 0) return index;
+    index = -index - 2;  // points to preceding run, possibly -1
+    if (index != -1) {   // possible match
+        int32_t offset = x - arr->runs[index].value;
+        int32_t le = arr->runs[index].length;
+        if (offset <= le) return index;
+    }
+    index += 1;
+    if (index < arr->n_runs) {
+        return index;
+    }
+    return -1;
+}
+
+/*
+ * Add all values in range [min, max] using hint.
+ */
+static inline void run_container_add_range_nruns(run_container_t *run,
+                                                 uint32_t min, uint32_t max,
+                                                 int32_t nruns_less,
+                                                 int32_t nruns_greater) {
+    int32_t nruns_common = run->n_runs - nruns_less - nruns_greater;
+    if (nruns_common == 0) {
+        makeRoomAtIndex(run, (uint16_t)nruns_less);
+        run->runs[nruns_less].value = (uint16_t)min;
+        run->runs[nruns_less].length = (uint16_t)(max - min);
+    } else {
+        uint32_t common_min = run->runs[nruns_less].value;
+        uint32_t common_max = run->runs[nruns_less + nruns_common - 1].value +
+                              run->runs[nruns_less + nruns_common - 1].length;
+        uint32_t result_min = (common_min < min) ? common_min : min;
+        uint32_t result_max = (common_max > max) ? common_max : max;
+
+        run->runs[nruns_less].value = (uint16_t)result_min;
+        run->runs[nruns_less].length = (uint16_t)(result_max - result_min);
+
+        memmove(&(run->runs[nruns_less + 1]),
+                &(run->runs[run->n_runs - nruns_greater]),
+                nruns_greater * sizeof(rle16_t));
+        run->n_runs = nruns_less + 1 + nruns_greater;
+    }
+}
+
+/**
+ * Add all values in range [min, max]. This function is currently unused
+ * and left as documentation.
+ */
+/*static inline void run_container_add_range(run_container_t* run,
+                                           uint32_t min, uint32_t max) {
+    int32_t nruns_greater = rle16_count_greater(run->runs, run->n_runs, max);
+    int32_t nruns_less = rle16_count_less(run->runs, run->n_runs -
+nruns_greater, min); run_container_add_range_nruns(run, min, max, nruns_less,
+nruns_greater);
+}*/
+
+/**
+ * Shifts last $count elements either left (distance < 0) or right (distance >
+ * 0)
+ */
+static inline void run_container_shift_tail(run_container_t *run, int32_t count,
+                                            int32_t distance) {
+    if (distance > 0) {
+        if (run->capacity < count + distance) {
+            run_container_grow(run, count + distance, true);
+        }
+    }
+    int32_t srcpos = run->n_runs - count;
+    int32_t dstpos = srcpos + distance;
+    memmove(&(run->runs[dstpos]), &(run->runs[srcpos]),
+            sizeof(rle16_t) * count);
+    run->n_runs += distance;
+}
+
+/**
+ * Remove all elements in range [min, max]
+ */
+static inline void run_container_remove_range(run_container_t *run,
+                                              uint32_t min, uint32_t max) {
+    int32_t first = rle16_find_run(run->runs, run->n_runs, (uint16_t)min);
+    int32_t last = rle16_find_run(run->runs, run->n_runs, (uint16_t)max);
+
+    if (first >= 0 && min > run->runs[first].value &&
+        max < ((uint32_t)run->runs[first].value +
+               (uint32_t)run->runs[first].length)) {
+        // split this run into two adjacent runs
+
+        // right subinterval
+        makeRoomAtIndex(run, (uint16_t)(first + 1));
+        run->runs[first + 1].value = (uint16_t)(max + 1);
+        run->runs[first + 1].length =
+            (uint16_t)((run->runs[first].value + run->runs[first].length) -
+                       (max + 1));
+
+        // left subinterval
+        run->runs[first].length =
+            (uint16_t)((min - 1) - run->runs[first].value);
+
+        return;
+    }
+
+    // update left-most partial run
+    if (first >= 0) {
+        if (min > run->runs[first].value) {
+            run->runs[first].length =
+                (uint16_t)((min - 1) - run->runs[first].value);
+            first++;
+        }
+    } else {
+        first = -first - 1;
+    }
+
+    // update right-most run
+    if (last >= 0) {
+        uint16_t run_max = run->runs[last].value + run->runs[last].length;
+        if (run_max > max) {
+            run->runs[last].value = (uint16_t)(max + 1);
+            run->runs[last].length = (uint16_t)(run_max - (max + 1));
+            last--;
+        }
+    } else {
+        last = (-last - 1) - 1;
+    }
+
+    // remove intermediate runs
+    if (first <= last) {
+        run_container_shift_tail(run, run->n_runs - (last + 1),
+                                 -(last - first + 1));
+    }
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_RUN_H_ */
+/* end file include/roaring/containers/run.h */
+/* begin file include/roaring/containers/convert.h */
+/*
+ * convert.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_CONVERT_H_
+#define INCLUDE_CONTAINERS_CONVERT_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Convert an array into a bitset. The input container is not freed or modified.
+ */
+bitset_container_t *bitset_container_from_array(const array_container_t *arr);
+
+/* Convert a run into a bitset. The input container is not freed or modified. */
+bitset_container_t *bitset_container_from_run(const run_container_t *arr);
+
+/* Convert a run into an array. The input container is not freed or modified. */
+array_container_t *array_container_from_run(const run_container_t *arr);
+
+/* Convert a bitset into an array. The input container is not freed or modified.
+ */
+array_container_t *array_container_from_bitset(const bitset_container_t *bits);
+
+/* Convert an array into a run. The input container is not freed or modified.
+ */
+run_container_t *run_container_from_array(const array_container_t *c);
+
+/* convert a run into either an array or a bitset
+ * might free the container. This does not free the input run container. */
+container_t *convert_to_bitset_or_array_container(run_container_t *rc,
+                                                  int32_t card,
+                                                  uint8_t *resulttype);
+
+/* convert containers to and from runcontainers, as is most space efficient.
+ * The container might be freed. */
+container_t *convert_run_optimize(container_t *c, uint8_t typecode_original,
+                                  uint8_t *typecode_after);
+
+/* converts a run container to either an array or a bitset, IF it saves space.
+ */
+/* If a conversion occurs, the caller is responsible to free the original
+ * container and
+ * he becomes reponsible to free the new one. */
+container_t *convert_run_to_efficient_container(run_container_t *c,
+                                                uint8_t *typecode_after);
+
+// like convert_run_to_efficient_container but frees the old result if needed
+container_t *convert_run_to_efficient_container_and_free(
+    run_container_t *c, uint8_t *typecode_after);
+
+/**
+ * Create new container which is a union of run container and
+ * range [min, max]. Caller is responsible for freeing run container.
+ */
+container_t *container_from_run_range(const run_container_t *run, uint32_t min,
+                                      uint32_t max, uint8_t *typecode_after);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_CONVERT_H_ */
+/* end file include/roaring/containers/convert.h */
+/* begin file include/roaring/containers/mixed_equal.h */
+/*
+ * mixed_equal.h
+ *
+ */
+
+#ifndef CONTAINERS_MIXED_EQUAL_H_
+#define CONTAINERS_MIXED_EQUAL_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool array_container_equal_bitset(const array_container_t* container1,
+                                  const bitset_container_t* container2);
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool run_container_equals_array(const run_container_t* container1,
+                                const array_container_t* container2);
+/**
+ * Return true if the two containers have the same content.
+ */
+bool run_container_equals_bitset(const run_container_t* container1,
+                                 const bitset_container_t* container2);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* CONTAINERS_MIXED_EQUAL_H_ */
+/* end file include/roaring/containers/mixed_equal.h */
+/* begin file include/roaring/containers/mixed_subset.h */
+/*
+ * mixed_subset.h
+ *
+ */
+
+#ifndef CONTAINERS_MIXED_SUBSET_H_
+#define CONTAINERS_MIXED_SUBSET_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset_bitset(const array_container_t* container1,
+                                      const bitset_container_t* container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool run_container_is_subset_array(const run_container_t* container1,
+                                   const array_container_t* container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset_run(const array_container_t* container1,
+                                   const run_container_t* container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool run_container_is_subset_bitset(const run_container_t* container1,
+                                    const bitset_container_t* container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool bitset_container_is_subset_run(const bitset_container_t* container1,
+                                    const run_container_t* container2);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* CONTAINERS_MIXED_SUBSET_H_ */
+/* end file include/roaring/containers/mixed_subset.h */
+/* begin file include/roaring/containers/mixed_andnot.h */
+/*
+ * mixed_andnot.h
+ */
+#ifndef INCLUDE_CONTAINERS_MIXED_ANDNOT_H_
+#define INCLUDE_CONTAINERS_MIXED_ANDNOT_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, a valid array container that could be the same as dst.*/
+void array_bitset_container_andnot(const array_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   array_container_t *dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * src_1 */
+
+void array_bitset_container_iandnot(array_container_t *src_1,
+                                    const bitset_container_t *src_2);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, which does not initially have a valid container.
+ * Return true for a bitset result; false for array
+ */
+
+bool bitset_array_container_andnot(const bitset_container_t *src_1,
+                                   const array_container_t *src_2,
+                                   container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_iandnot(bitset_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_andnot(const run_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_iandnot(run_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset").  dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool bitset_run_container_andnot(const bitset_container_t *src_1,
+                                 const run_container_t *src_2,
+                                 container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_run_container_iandnot(bitset_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  container_t **dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any type of container.
+ */
+
+int run_array_container_andnot(const run_container_t *src_1,
+                               const array_container_t *src_2,
+                               container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_array_container_iandnot(run_container_t *src_1,
+                                const array_container_t *src_2,
+                                container_t **dst);
+
+/* dst must be a valid array container, allowed to be src_1 */
+
+void array_run_container_andnot(const array_container_t *src_1,
+                                const run_container_t *src_2,
+                                array_container_t *dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+void array_run_container_iandnot(array_container_t *src_1,
+                                 const run_container_t *src_2);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_andnot(const run_container_t *src_1,
+                             const run_container_t *src_2, container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_run_container_iandnot(run_container_t *src_1,
+                              const run_container_t *src_2, container_t **dst);
+
+/*
+ * dst is a valid array container and may be the same as src_1
+ */
+
+void array_array_container_andnot(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst);
+
+/* inplace array-array andnot will always be able to reuse the space of
+ * src_1 */
+void array_array_container_iandnot(array_container_t *src_1,
+                                   const array_container_t *src_2);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2,
+                                    container_t **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     container_t **dst);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif
+/* end file include/roaring/containers/mixed_andnot.h */
+/* begin file include/roaring/containers/mixed_intersection.h */
+/*
+ * mixed_intersection.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_
+#define INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, array intersection
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. */
+void array_bitset_container_intersection(const array_container_t *src_1,
+                                         const bitset_container_t *src_2,
+                                         array_container_t *dst);
+
+/* Compute the size of the intersection of src_1 and src_2. */
+int array_bitset_container_intersection_cardinality(
+    const array_container_t *src_1, const bitset_container_t *src_2);
+
+/* Checking whether src_1 and src_2 intersect. */
+bool array_bitset_container_intersect(const array_container_t *src_1,
+                                      const bitset_container_t *src_2);
+
+/*
+ * Compute the intersection between src_1 and src_2 and write the result
+ * to *dst. If the return function is true, the result is a bitset_container_t
+ * otherwise is a array_container_t. We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
+                                          const bitset_container_t *src_2,
+                                          container_t **dst);
+
+/* Compute the intersection between src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. */
+void array_run_container_intersection(const array_container_t *src_1,
+                                      const run_container_t *src_2,
+                                      array_container_t *dst);
+
+/* Compute the intersection between src_1 and src_2 and write the result to
+ * *dst. If the result is true then the result is a bitset_container_t
+ * otherwise is a array_container_t.
+ * If *dst == src_2, then an in-place intersection is attempted
+ **/
+bool run_bitset_container_intersection(const run_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       container_t **dst);
+
+/* Compute the size of the intersection between src_1 and src_2 . */
+int array_run_container_intersection_cardinality(const array_container_t *src_1,
+                                                 const run_container_t *src_2);
+
+/* Compute the size of the intersection  between src_1 and src_2
+ **/
+int run_bitset_container_intersection_cardinality(
+    const run_container_t *src_1, const bitset_container_t *src_2);
+
+/* Check that src_1 and src_2 intersect. */
+bool array_run_container_intersect(const array_container_t *src_1,
+                                   const run_container_t *src_2);
+
+/* Check that src_1 and src_2 intersect.
+ **/
+bool run_bitset_container_intersect(const run_container_t *src_1,
+                                    const bitset_container_t *src_2);
+
+/*
+ * Same as bitset_bitset_container_intersection except that if the output is to
+ * be a
+ * bitset_container_t, then src_1 is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_bitset_container_intersection_inplace(
+    bitset_container_t *src_1, const bitset_container_t *src_2,
+    container_t **dst);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_ */
+/* end file include/roaring/containers/mixed_intersection.h */
+/* begin file include/roaring/containers/mixed_negation.h */
+/*
+ * mixed_negation.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_NEGATION_H_
+#define INCLUDE_CONTAINERS_MIXED_NEGATION_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Negation across the entire range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. The complement of a
+ * sufficiently sparse set will always be dense and a hence a bitmap
+ * We assume that dst is pre-allocated and a valid bitset container
+ * There can be no in-place version.
+ */
+void array_container_negation(const array_container_t *src,
+                              bitset_container_t *dst);
+
+/* Negation across the entire range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation(const bitset_container_t *src,
+                               container_t **dst);
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_inplace(bitset_container_t *src,
+                                       container_t **dst);
+
+/* Negation across the entire range of container
+ * Compute the  negation of src  and write the result
+ * to *dst.
+ * Return values are the *_TYPECODES as defined * in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation(const run_container_t *src, container_t **dst);
+
+/*
+ * Same as run_container_negation except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_inplace(run_container_t *src, container_t **dst);
+
+/* Negation across a range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. Returns true if the result is a bitset container
+ * and false for an array container.  *dst is not preallocated.
+ */
+bool array_container_negation_range(const array_container_t *src,
+                                    const int range_start, const int range_end,
+                                    container_t **dst);
+
+/* Even when the result would fit, it is unclear how to make an
+ * inplace version without inefficient copying.  Thus this routine
+ * may be a wrapper for the non-in-place version
+ */
+bool array_container_negation_range_inplace(array_container_t *src,
+                                            const int range_start,
+                                            const int range_end,
+                                            container_t **dst);
+
+/* Negation across a range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation_range(const bitset_container_t *src,
+                                     const int range_start, const int range_end,
+                                     container_t **dst);
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_range_inplace(bitset_container_t *src,
+                                             const int range_start,
+                                             const int range_end,
+                                             container_t **dst);
+
+/* Negation across a range of container
+ * Compute the  negation of src  and write the result
+ * to *dst.  Return values are the *_TYPECODES as defined * in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation_range(const run_container_t *src,
+                                 const int range_start, const int range_end,
+                                 container_t **dst);
+
+/*
+ * Same as run_container_negation except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_range_inplace(run_container_t *src,
+                                         const int range_start,
+                                         const int range_end,
+                                         container_t **dst);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_MIXED_NEGATION_H_ */
+/* end file include/roaring/containers/mixed_negation.h */
+/* begin file include/roaring/containers/mixed_union.h */
+/*
+ * mixed_intersection.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_UNION_H_
+#define INCLUDE_CONTAINERS_MIXED_UNION_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, bitset unions
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.   */
+void array_bitset_container_union(const array_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */
+void array_bitset_container_lazy_union(const array_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       bitset_container_t *dst);
+
+/*
+ * Compute the union between src_1 and src_2 and write the result
+ * to *dst. If the return function is true, the result is a bitset_container_t
+ * otherwise is a array_container_t. We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool array_array_container_union(const array_container_t *src_1,
+                                 const array_container_t *src_2,
+                                 container_t **dst);
+
+/*
+ * Compute the union between src_1 and src_2 and write the result
+ * to *dst if it cannot be written to src_1. If the return function is true,
+ * the result is a bitset_container_t
+ * otherwise is a array_container_t. When the result is an array_container_t, it
+ * it either written to src_1 (if *dst is null) or to *dst.
+ * If the result is a bitset_container_t and *dst is null, then there was a
+ * failure.
+ */
+bool array_array_container_inplace_union(array_container_t *src_1,
+                                         const array_container_t *src_2,
+                                         container_t **dst);
+
+/*
+ * Same as array_array_container_union except that it will more eagerly produce
+ * a bitset.
+ */
+bool array_array_container_lazy_union(const array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      container_t **dst);
+
+/*
+ * Same as array_array_container_inplace_union except that it will more eagerly
+ * produce a bitset.
+ */
+bool array_array_container_lazy_inplace_union(array_container_t *src_1,
+                                              const array_container_t *src_2,
+                                              container_t **dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. We assume that dst is a
+ * valid container. The result might need to be further converted to array or
+ * bitset container,
+ * the caller is responsible for the eventual conversion. */
+void array_run_container_union(const array_container_t *src_1,
+                               const run_container_t *src_2,
+                               run_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * src2. The result might need to be further converted to array or
+ * bitset container,
+ * the caller is responsible for the eventual conversion. */
+void array_run_container_inplace_union(const array_container_t *src_1,
+                                       run_container_t *src_2);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be src_2.
+ * If run_container_is_full(src_1) is true, you must not be calling this
+ *function.
+ **/
+void run_bitset_container_union(const run_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be src_2.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
+ * If run_container_is_full(src_1) is true, you must not be calling this
+ * function.
+ * */
+void run_bitset_container_lazy_union(const run_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif /* INCLUDE_CONTAINERS_MIXED_UNION_H_ */
+/* end file include/roaring/containers/mixed_union.h */
+/* begin file include/roaring/containers/mixed_xor.h */
+/*
+ * mixed_xor.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_XOR_H_
+#define INCLUDE_CONTAINERS_MIXED_XOR_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, bitset unions
+ */
+
+/*
+ * Java implementation (as of May 2016) for array_run, run_run
+ * and  bitset_run don't do anything different for inplace.
+ * (They are not truly in place.)
+ */
+
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).
+ * Result is true iff dst is a bitset  */
+bool array_bitset_container_xor(const array_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                container_t **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
+ */
+
+void array_bitset_container_lazy_xor(const array_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst);
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_xor(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_xor(const run_container_t *src_1,
+                              const bitset_container_t *src_2,
+                              container_t **dst);
+
+/* lazy xor.  Dst is initialized and may be equal to src_2.
+ *  Result is left as a bitset container, even if actual
+ *  cardinality would dictate an array container.
+ */
+
+void run_bitset_container_lazy_xor(const run_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_xor(const array_container_t *src_1,
+                            const run_container_t *src_2, container_t **dst);
+
+/* dst does not initially have a valid container.  Creates either
+ * an array or a bitset container, indicated by return code
+ */
+
+bool array_array_container_xor(const array_container_t *src_1,
+                               const array_container_t *src_2,
+                               container_t **dst);
+
+/* dst does not initially have a valid container.  Creates either
+ * an array or a bitset container, indicated by return code.
+ * A bitset container will not have a valid cardinality and the
+ * container type might not be correct for the actual cardinality
+ */
+
+bool array_array_container_lazy_xor(const array_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    container_t **dst);
+
+/* Dst is a valid run container. (Can it be src_2? Let's say not.)
+ * Leaves result as run container, even if other options are
+ * smaller.
+ */
+
+void array_run_container_lazy_xor(const array_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  run_container_t *dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_xor(const run_container_t *src_1,
+                          const run_container_t *src_2, container_t **dst);
+
+/* INPLACE versions (initial implementation may not exploit all inplace
+ * opportunities (if any...)
+ */
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_ixor(bitset_container_t *src_1,
+                                 const array_container_t *src_2,
+                                 container_t **dst);
+
+bool bitset_bitset_container_ixor(bitset_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  container_t **dst);
+
+bool array_bitset_container_ixor(array_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_ixor(run_container_t *src_1,
+                               const bitset_container_t *src_2,
+                               container_t **dst);
+
+bool bitset_run_container_ixor(bitset_container_t *src_1,
+                               const run_container_t *src_2, container_t **dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_ixor(array_container_t *src_1,
+                             const run_container_t *src_2, container_t **dst);
+
+int run_array_container_ixor(run_container_t *src_1,
+                             const array_container_t *src_2, container_t **dst);
+
+bool array_array_container_ixor(array_container_t *src_1,
+                                const array_container_t *src_2,
+                                container_t **dst);
+
+int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
+                           container_t **dst);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif
+/* end file include/roaring/containers/mixed_xor.h */
+/* begin file include/roaring/containers/containers.h */
+#ifndef CONTAINERS_CONTAINERS_H
+#define CONTAINERS_CONTAINERS_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+// would enum be possible or better?
+
+/**
+ * The switch case statements follow
+ * BITSET_CONTAINER_TYPE -- ARRAY_CONTAINER_TYPE -- RUN_CONTAINER_TYPE
+ * so it makes more sense to number them 1, 2, 3 (in the vague hope that the
+ * compiler might exploit this ordering).
+ */
+
+#define BITSET_CONTAINER_TYPE 1
+#define ARRAY_CONTAINER_TYPE 2
+#define RUN_CONTAINER_TYPE 3
+#define SHARED_CONTAINER_TYPE 4
+
+/**
+ * Macros for pairing container type codes, suitable for switch statements.
+ * Use PAIR_CONTAINER_TYPES() for the switch, CONTAINER_PAIR() for the cases:
+ *
+ *     switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+ *        case CONTAINER_PAIR(BITSET,ARRAY):
+ *        ...
+ *     }
+ */
+#define PAIR_CONTAINER_TYPES(type1, type2) (4 * (type1) + (type2))
+
+#define CONTAINER_PAIR(name1, name2) \
+    (4 * (name1##_CONTAINER_TYPE) + (name2##_CONTAINER_TYPE))
+
+/**
+ * A shared container is a wrapper around a container
+ * with reference counting.
+ */
+STRUCT_CONTAINER(shared_container_s) {
+    container_t *container;
+    uint8_t typecode;
+    croaring_refcount_t counter;  // to be managed atomically
+};
+
+typedef struct shared_container_s shared_container_t;
+
+#define CAST_shared(c) CAST(shared_container_t *, c)  // safer downcast
+#define const_CAST_shared(c) CAST(const shared_container_t *, c)
+#define movable_CAST_shared(c) movable_CAST(shared_container_t **, c)
+
+/*
+ * With copy_on_write = true
+ *  Create a new shared container if the typecode is not SHARED_CONTAINER_TYPE,
+ * otherwise, increase the count
+ * If copy_on_write = false, then clone.
+ * Return NULL in case of failure.
+ **/
+container_t *get_copy_of_container(container_t *container, uint8_t *typecode,
+                                   bool copy_on_write);
+
+/* Frees a shared container (actually decrement its counter and only frees when
+ * the counter falls to zero). */
+void shared_container_free(shared_container_t *container);
+
+/* extract a copy from the shared container, freeing the shared container if
+there is just one instance left,
+clone instances when the counter is higher than one
+*/
+container_t *shared_container_extract_copy(shared_container_t *container,
+                                           uint8_t *typecode);
+
+/* access to container underneath */
+static inline const container_t *container_unwrap_shared(
+    const container_t *candidate_shared_container, uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE) {
+        *type = const_CAST_shared(candidate_shared_container)->typecode;
+        assert(*type != SHARED_CONTAINER_TYPE);
+        return const_CAST_shared(candidate_shared_container)->container;
+    } else {
+        return candidate_shared_container;
+    }
+}
+
+/* access to container underneath */
+static inline container_t *container_mutable_unwrap_shared(container_t *c,
+                                                           uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE) {  // the passed in container is shared
+        *type = CAST_shared(c)->typecode;
+        assert(*type != SHARED_CONTAINER_TYPE);
+        return CAST_shared(c)->container;  // return the enclosed container
+    } else {
+        return c;  // wasn't shared, so return as-is
+    }
+}
+
+/* access to container underneath and queries its type */
+static inline uint8_t get_container_type(const container_t *c, uint8_t type) {
+    if (type == SHARED_CONTAINER_TYPE) {
+        return const_CAST_shared(c)->typecode;
+    } else {
+        return type;
+    }
+}
+
+/**
+ * Copies a container, requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation. If the container is not shared, then it is
+ * physically cloned. Sharable containers are not cloneable.
+ */
+container_t *container_clone(const container_t *container, uint8_t typecode);
+
+/* access to container underneath, cloning it if needed */
+static inline container_t *get_writable_copy_if_shared(container_t *c,
+                                                       uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE) {  // shared, return enclosed container
+        return shared_container_extract_copy(CAST_shared(c), type);
+    } else {
+        return c;  // not shared, so return as-is
+    }
+}
+
+/**
+ * End of shared container code
+ */
+
+static const char *container_names[] = {"bitset", "array", "run", "shared"};
+static const char *shared_container_names[] = {
+    "bitset (shared)", "array (shared)", "run (shared)"};
+
+// no matter what the initial container was, convert it to a bitset
+// if a new container is produced, caller responsible for freeing the previous
+// one
+// container should not be a shared container
+static inline bitset_container_t *container_to_bitset(container_t *c,
+                                                      uint8_t typecode) {
+    bitset_container_t *result = NULL;
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return CAST_bitset(c);  // nothing to do
+        case ARRAY_CONTAINER_TYPE:
+            result = bitset_container_from_array(CAST_array(c));
+            return result;
+        case RUN_CONTAINER_TYPE:
+            result = bitset_container_from_run(CAST_run(c));
+            return result;
+        case SHARED_CONTAINER_TYPE:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * Get the container name from the typecode
+ * (unused at time of writing)
+ */
+/*static inline const char *get_container_name(uint8_t typecode) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return container_names[0];
+        case ARRAY_CONTAINER_TYPE:
+            return container_names[1];
+        case RUN_CONTAINER_TYPE:
+            return container_names[2];
+        case SHARED_CONTAINER_TYPE:
+            return container_names[3];
+        default:
+            assert(false);
+            roaring_unreachable;
+            return "unknown";
+    }
+}*/
+
+static inline const char *get_full_container_name(const container_t *c,
+                                                  uint8_t typecode) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return container_names[0];
+        case ARRAY_CONTAINER_TYPE:
+            return container_names[1];
+        case RUN_CONTAINER_TYPE:
+            return container_names[2];
+        case SHARED_CONTAINER_TYPE:
+            switch (const_CAST_shared(c)->typecode) {
+                case BITSET_CONTAINER_TYPE:
+                    return shared_container_names[0];
+                case ARRAY_CONTAINER_TYPE:
+                    return shared_container_names[1];
+                case RUN_CONTAINER_TYPE:
+                    return shared_container_names[2];
+                default:
+                    assert(false);
+                    roaring_unreachable;
+                    return "unknown";
+            }
+            break;
+        default:
+            assert(false);
+            roaring_unreachable;
+            return "unknown";
+    }
+    roaring_unreachable;
+    return NULL;
+}
+
+/**
+ * Get the container cardinality (number of elements), requires a  typecode
+ */
+static inline int container_get_cardinality(const container_t *c,
+                                            uint8_t typecode) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_cardinality(const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_cardinality(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_cardinality(const_CAST_run(c));
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+// returns true if a container is known to be full. Note that a lazy bitset
+// container
+// might be full without us knowing
+static inline bool container_is_full(const container_t *c, uint8_t typecode) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_cardinality(const_CAST_bitset(c)) ==
+                   (1 << 16);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_cardinality(const_CAST_array(c)) ==
+                   (1 << 16);
+        case RUN_CONTAINER_TYPE:
+            return run_container_is_full(const_CAST_run(c));
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+static inline int container_shrink_to_fit(container_t *c, uint8_t type) {
+    c = container_mutable_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return 0;  // no shrinking possible
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_shrink_to_fit(CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_shrink_to_fit(CAST_run(c));
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * make a container with a run of ones
+ */
+/* initially always use a run container, even if an array might be
+ * marginally
+ * smaller */
+static inline container_t *container_range_of_ones(uint32_t range_start,
+                                                   uint32_t range_end,
+                                                   uint8_t *result_type) {
+    assert(range_end >= range_start);
+    uint64_t cardinality = range_end - range_start + 1;
+    if (cardinality <= 2) {
+        *result_type = ARRAY_CONTAINER_TYPE;
+        return array_container_create_range(range_start, range_end);
+    } else {
+        *result_type = RUN_CONTAINER_TYPE;
+        return run_container_create_range(range_start, range_end);
+    }
+}
+
+/*  Create a container with all the values between in [min,max) at a
+    distance k*step from min. */
+static inline container_t *container_from_range(uint8_t *type, uint32_t min,
+                                                uint32_t max, uint16_t step) {
+    if (step == 0) return NULL;  // being paranoid
+    if (step == 1) {
+        return container_range_of_ones(min, max, type);
+        // Note: the result is not always a run (need to check the cardinality)
+        //*type = RUN_CONTAINER_TYPE;
+        // return run_container_create_range(min, max);
+    }
+    int size = (max - min + step - 1) / step;
+    if (size <= DEFAULT_MAX_SIZE) {  // array container
+        *type = ARRAY_CONTAINER_TYPE;
+        array_container_t *array = array_container_create_given_capacity(size);
+        array_container_add_from_range(array, min, max, step);
+        assert(array->cardinality == size);
+        return array;
+    } else {  // bitset container
+        *type = BITSET_CONTAINER_TYPE;
+        bitset_container_t *bitset = bitset_container_create();
+        bitset_container_add_from_range(bitset, min, max, step);
+        assert(bitset->cardinality == size);
+        return bitset;
+    }
+}
+
+/**
+ * "repair" the container after lazy operations.
+ */
+static inline container_t *container_repair_after_lazy(container_t *c,
+                                                       uint8_t *type) {
+    c = get_writable_copy_if_shared(c, type);  // !!! unnecessary cloning
+    container_t *result = NULL;
+    switch (*type) {
+        case BITSET_CONTAINER_TYPE: {
+            bitset_container_t *bc = CAST_bitset(c);
+            bc->cardinality = bitset_container_compute_cardinality(bc);
+            if (bc->cardinality <= DEFAULT_MAX_SIZE) {
+                result = array_container_from_bitset(bc);
+                bitset_container_free(bc);
+                *type = ARRAY_CONTAINER_TYPE;
+                return result;
+            }
+            return c;
+        }
+        case ARRAY_CONTAINER_TYPE:
+            return c;  // nothing to do
+        case RUN_CONTAINER_TYPE:
+            return convert_run_to_efficient_container_and_free(CAST_run(c),
+                                                               type);
+        case SHARED_CONTAINER_TYPE:
+            assert(false);
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * container_write(container, buf).
+ *
+ */
+static inline int32_t container_write(const container_t *c, uint8_t typecode,
+                                      char *buf) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_write(const_CAST_bitset(c), buf);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_write(const_CAST_array(c), buf);
+        case RUN_CONTAINER_TYPE:
+            return run_container_write(const_CAST_run(c), buf);
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * Get the container size in bytes under portable serialization (see
+ * container_write), requires a
+ * typecode
+ */
+static inline int32_t container_size_in_bytes(const container_t *c,
+                                              uint8_t typecode) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_size_in_bytes(const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_size_in_bytes(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_size_in_bytes(const_CAST_run(c));
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * print the container (useful for debugging), requires a  typecode
+ */
+void container_printf(const container_t *container, uint8_t typecode);
+
+/**
+ * print the content of the container as a comma-separated list of 32-bit values
+ * starting at base, requires a  typecode
+ */
+void container_printf_as_uint32_array(const container_t *container,
+                                      uint8_t typecode, uint32_t base);
+
+bool container_internal_validate(const container_t *container, uint8_t typecode,
+                                 const char **reason);
+
+/**
+ * Checks whether a container is not empty, requires a  typecode
+ */
+static inline bool container_nonzero_cardinality(const container_t *c,
+                                                 uint8_t typecode) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_const_nonzero_cardinality(
+                const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_nonzero_cardinality(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_nonzero_cardinality(const_CAST_run(c));
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * Recover memory from a container, requires a  typecode
+ */
+void container_free(container_t *container, uint8_t typecode);
+
+/**
+ * Convert a container to an array of values, requires a  typecode as well as a
+ * "base" (most significant values)
+ * Returns number of ints added.
+ */
+static inline int container_to_uint32_array(uint32_t *output,
+                                            const container_t *c,
+                                            uint8_t typecode, uint32_t base) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_to_uint32_array(output,
+                                                    const_CAST_bitset(c), base);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_to_uint32_array(output, const_CAST_array(c),
+                                                   base);
+        case RUN_CONTAINER_TYPE:
+            return run_container_to_uint32_array(output, const_CAST_run(c),
+                                                 base);
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;  // unreached
+}
+
+/**
+ * Add a value to a container, requires a  typecode, fills in new_typecode and
+ * return (possibly different) container.
+ * This function may allocate a new container, and caller is responsible for
+ * memory deallocation
+ */
+static inline container_t *container_add(
+    container_t *c, uint16_t val,
+    uint8_t typecode,  // !!! should be second argument?
+    uint8_t *new_typecode) {
+    c = get_writable_copy_if_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            bitset_container_set(CAST_bitset(c), val);
+            *new_typecode = BITSET_CONTAINER_TYPE;
+            return c;
+        case ARRAY_CONTAINER_TYPE: {
+            array_container_t *ac = CAST_array(c);
+            if (array_container_try_add(ac, val, DEFAULT_MAX_SIZE) != -1) {
+                *new_typecode = ARRAY_CONTAINER_TYPE;
+                return ac;
+            } else {
+                bitset_container_t *bitset = bitset_container_from_array(ac);
+                bitset_container_add(bitset, val);
+                *new_typecode = BITSET_CONTAINER_TYPE;
+                return bitset;
+            }
+        } break;
+        case RUN_CONTAINER_TYPE:
+            // per Java, no container type adjustments are done (revisit?)
+            run_container_add(CAST_run(c), val);
+            *new_typecode = RUN_CONTAINER_TYPE;
+            return c;
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Remove a value from a container, requires a  typecode, fills in new_typecode
+ * and
+ * return (possibly different) container.
+ * This function may allocate a new container, and caller is responsible for
+ * memory deallocation
+ */
+static inline container_t *container_remove(
+    container_t *c, uint16_t val,
+    uint8_t typecode,  // !!! should be second argument?
+    uint8_t *new_typecode) {
+    c = get_writable_copy_if_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            if (bitset_container_remove(CAST_bitset(c), val)) {
+                int card = bitset_container_cardinality(CAST_bitset(c));
+                if (card <= DEFAULT_MAX_SIZE) {
+                    *new_typecode = ARRAY_CONTAINER_TYPE;
+                    return array_container_from_bitset(CAST_bitset(c));
+                }
+            }
+            *new_typecode = typecode;
+            return c;
+        case ARRAY_CONTAINER_TYPE:
+            *new_typecode = typecode;
+            array_container_remove(CAST_array(c), val);
+            return c;
+        case RUN_CONTAINER_TYPE:
+            // per Java, no container type adjustments are done (revisit?)
+            run_container_remove(CAST_run(c), val);
+            *new_typecode = RUN_CONTAINER_TYPE;
+            return c;
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Check whether a value is in a container, requires a  typecode
+ */
+static inline bool container_contains(
+    const container_t *c, uint16_t val,
+    uint8_t typecode  // !!! should be second argument?
+) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_get(const_CAST_bitset(c), val);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_contains(const_CAST_array(c), val);
+        case RUN_CONTAINER_TYPE:
+            return run_container_contains(const_CAST_run(c), val);
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+/**
+ * Check whether a range of values from range_start (included) to range_end
+ * (excluded) is in a container, requires a typecode
+ */
+static inline bool container_contains_range(
+    const container_t *c, uint32_t range_start, uint32_t range_end,
+    uint8_t typecode  // !!! should be second argument?
+) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_get_range(const_CAST_bitset(c), range_start,
+                                              range_end);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_contains_range(const_CAST_array(c),
+                                                  range_start, range_end);
+        case RUN_CONTAINER_TYPE:
+            return run_container_contains_range(const_CAST_run(c), range_start,
+                                                range_end);
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+/**
+ * Returns true if the two containers have the same content. Note that
+ * two containers having different types can be "equal" in this sense.
+ */
+static inline bool container_equals(const container_t *c1, uint8_t type1,
+                                    const container_t *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            return bitset_container_equals(const_CAST_bitset(c1),
+                                           const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            return run_container_equals_bitset(const_CAST_run(c2),
+                                               const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            return run_container_equals_bitset(const_CAST_run(c1),
+                                               const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            // java would always return false?
+            return array_container_equal_bitset(const_CAST_array(c2),
+                                                const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            // java would always return false?
+            return array_container_equal_bitset(const_CAST_array(c1),
+                                                const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            return run_container_equals_array(const_CAST_run(c2),
+                                              const_CAST_array(c1));
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            return run_container_equals_array(const_CAST_run(c1),
+                                              const_CAST_array(c2));
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            return array_container_equals(const_CAST_array(c1),
+                                          const_CAST_array(c2));
+
+        case CONTAINER_PAIR(RUN, RUN):
+            return run_container_equals(const_CAST_run(c1), const_CAST_run(c2));
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+/**
+ * Returns true if the container c1 is a subset of the container c2. Note that
+ * c1 can be a subset of c2 even if they have a different type.
+ */
+static inline bool container_is_subset(const container_t *c1, uint8_t type1,
+                                       const container_t *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            return bitset_container_is_subset(const_CAST_bitset(c1),
+                                              const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            return bitset_container_is_subset_run(const_CAST_bitset(c1),
+                                                  const_CAST_run(c2));
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            return run_container_is_subset_bitset(const_CAST_run(c1),
+                                                  const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            return false;  // by construction, size(c1) > size(c2)
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            return array_container_is_subset_bitset(const_CAST_array(c1),
+                                                    const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            return array_container_is_subset_run(const_CAST_array(c1),
+                                                 const_CAST_run(c2));
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            return run_container_is_subset_array(const_CAST_run(c1),
+                                                 const_CAST_array(c2));
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            return array_container_is_subset(const_CAST_array(c1),
+                                             const_CAST_array(c2));
+
+        case CONTAINER_PAIR(RUN, RUN):
+            return run_container_is_subset(const_CAST_run(c1),
+                                           const_CAST_run(c2));
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+// macro-izations possibilities for generic non-inplace binary-op dispatch
+
+/**
+ * Compute intersection between two containers, generate a new container (having
+ * type result_type), requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ */
+static inline container_t *container_and(const container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type =
+                bitset_bitset_container_intersection(
+                    const_CAST_bitset(c1), const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            result = array_container_create();
+            array_container_intersection(
+                const_CAST_array(c1), const_CAST_array(c2), CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            result = run_container_create();
+            run_container_intersection(const_CAST_run(c1), const_CAST_run(c2),
+                                       CAST_run(result));
+            return convert_run_to_efficient_container_and_free(CAST_run(result),
+                                                               result_type);
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            result = array_container_create();
+            array_bitset_container_intersection(const_CAST_array(c2),
+                                                const_CAST_bitset(c1),
+                                                CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_bitset_container_intersection(const_CAST_array(c1),
+                                                const_CAST_bitset(c2),
+                                                CAST_array(result));
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            *result_type =
+                run_bitset_container_intersection(
+                    const_CAST_run(c2), const_CAST_bitset(c1), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type =
+                run_bitset_container_intersection(
+                    const_CAST_run(c1), const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_run_container_intersection(
+                const_CAST_array(c1), const_CAST_run(c2), CAST_array(result));
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_run_container_intersection(
+                const_CAST_array(c2), const_CAST_run(c1), CAST_array(result));
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Compute the size of the intersection between two containers.
+ */
+static inline int container_and_cardinality(const container_t *c1,
+                                            uint8_t type1,
+                                            const container_t *c2,
+                                            uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            return bitset_container_and_justcard(const_CAST_bitset(c1),
+                                                 const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            return array_container_intersection_cardinality(
+                const_CAST_array(c1), const_CAST_array(c2));
+
+        case CONTAINER_PAIR(RUN, RUN):
+            return run_container_intersection_cardinality(const_CAST_run(c1),
+                                                          const_CAST_run(c2));
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            return array_bitset_container_intersection_cardinality(
+                const_CAST_array(c2), const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            return array_bitset_container_intersection_cardinality(
+                const_CAST_array(c1), const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            return run_bitset_container_intersection_cardinality(
+                const_CAST_run(c2), const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            return run_bitset_container_intersection_cardinality(
+                const_CAST_run(c1), const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            return array_run_container_intersection_cardinality(
+                const_CAST_array(c1), const_CAST_run(c2));
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            return array_run_container_intersection_cardinality(
+                const_CAST_array(c2), const_CAST_run(c1));
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return 0;
+    }
+}
+
+/**
+ * Check whether two containers intersect.
+ */
+static inline bool container_intersect(const container_t *c1, uint8_t type1,
+                                       const container_t *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            return bitset_container_intersect(const_CAST_bitset(c1),
+                                              const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            return array_container_intersect(const_CAST_array(c1),
+                                             const_CAST_array(c2));
+
+        case CONTAINER_PAIR(RUN, RUN):
+            return run_container_intersect(const_CAST_run(c1),
+                                           const_CAST_run(c2));
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            return array_bitset_container_intersect(const_CAST_array(c2),
+                                                    const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            return array_bitset_container_intersect(const_CAST_array(c1),
+                                                    const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            return run_bitset_container_intersect(const_CAST_run(c2),
+                                                  const_CAST_bitset(c1));
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            return run_bitset_container_intersect(const_CAST_run(c1),
+                                                  const_CAST_bitset(c2));
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            return array_run_container_intersect(const_CAST_array(c1),
+                                                 const_CAST_run(c2));
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            return array_run_container_intersect(const_CAST_array(c2),
+                                                 const_CAST_run(c1));
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return 0;
+    }
+}
+
+/**
+ * Compute intersection between two containers, with result in the first
+ container if possible. If the returned pointer is identical to c1,
+ then the container has been modified. If the returned pointer is different
+ from c1, then a new container has been created and the caller is responsible
+ for freeing it.
+ The type of the first container may change. Returns the modified
+ (and possibly new) container.
+*/
+static inline container_t *container_iand(container_t *c1, uint8_t type1,
+                                          const container_t *c2, uint8_t type2,
+                                          uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type = bitset_bitset_container_intersection_inplace(
+                               CAST_bitset(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            array_container_intersection_inplace(CAST_array(c1),
+                                                 const_CAST_array(c2));
+            *result_type = ARRAY_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            result = run_container_create();
+            run_container_intersection(const_CAST_run(c1), const_CAST_run(c2),
+                                       CAST_run(result));
+            // as of January 2016, Java code used non-in-place intersection for
+            // two runcontainers
+            return convert_run_to_efficient_container_and_free(CAST_run(result),
+                                                               result_type);
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            // c1 is a bitmap so no inplace possible
+            result = array_container_create();
+            array_bitset_container_intersection(const_CAST_array(c2),
+                                                const_CAST_bitset(c1),
+                                                CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_bitset_container_intersection(
+                const_CAST_array(c1), const_CAST_bitset(c2),
+                CAST_array(c1));  // result is allowed to be same as c1
+            return c1;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            // will attempt in-place computation
+            *result_type = run_bitset_container_intersection(
+                               const_CAST_run(c2), const_CAST_bitset(c1), &c1)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type =
+                run_bitset_container_intersection(
+                    const_CAST_run(c1), const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_run_container_intersection(
+                const_CAST_array(c1), const_CAST_run(c2), CAST_array(result));
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE;  // never bitset
+            array_run_container_intersection(
+                const_CAST_array(c2), const_CAST_run(c1), CAST_array(result));
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Compute union between two containers, generate a new container (having type
+ * result_type), requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ */
+static inline container_t *container_or(const container_t *c1, uint8_t type1,
+                                        const container_t *c2, uint8_t type2,
+                                        uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            result = bitset_container_create();
+            bitset_container_or(const_CAST_bitset(c1), const_CAST_bitset(c2),
+                                CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type =
+                array_array_container_union(const_CAST_array(c1),
+                                            const_CAST_array(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            result = run_container_create();
+            run_container_union(const_CAST_run(c1), const_CAST_run(c2),
+                                CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // todo: could be optimized since will never convert to array
+            result = convert_run_to_efficient_container_and_free(
+                CAST_run(result), result_type);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            result = bitset_container_create();
+            array_bitset_container_union(const_CAST_array(c2),
+                                         const_CAST_bitset(c1),
+                                         CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            result = bitset_container_create();
+            array_bitset_container_union(const_CAST_array(c1),
+                                         const_CAST_bitset(c2),
+                                         CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c2), CAST_run(result));
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union(
+                const_CAST_run(c2), const_CAST_bitset(c1), CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            if (run_container_is_full(const_CAST_run(c1))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c1), CAST_run(result));
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union(
+                const_CAST_run(c1), const_CAST_bitset(c2), CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c1), const_CAST_run(c2),
+                                      CAST_run(result));
+            result = convert_run_to_efficient_container_and_free(
+                CAST_run(result), result_type);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c2), const_CAST_run(c1),
+                                      CAST_run(result));
+            result = convert_run_to_efficient_container_and_free(
+                CAST_run(result), result_type);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute union between two containers, generate a new container (having type
+ * result_type), requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline container_t *container_lazy_or(const container_t *c1,
+                                             uint8_t type1,
+                                             const container_t *c2,
+                                             uint8_t type2,
+                                             uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            result = bitset_container_create();
+            bitset_container_or_nocard(const_CAST_bitset(c1),
+                                       const_CAST_bitset(c2),
+                                       CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type =
+                array_array_container_lazy_union(const_CAST_array(c1),
+                                                 const_CAST_array(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            result = run_container_create();
+            run_container_union(const_CAST_run(c1), const_CAST_run(c2),
+                                CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // we are being lazy
+            result = convert_run_to_efficient_container_and_free(
+                CAST_run(result), result_type);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            result = bitset_container_create();
+            array_bitset_container_lazy_union(const_CAST_array(c2),
+                                              const_CAST_bitset(c1),
+                                              CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            result = bitset_container_create();
+            array_bitset_container_lazy_union(const_CAST_array(c1),
+                                              const_CAST_bitset(c2),
+                                              CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c2), CAST_run(result));
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(const_CAST_run(c2),
+                                            const_CAST_bitset(c1),
+                                            CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            if (run_container_is_full(const_CAST_run(c1))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c1), CAST_run(result));
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(const_CAST_run(c1),
+                                            const_CAST_bitset(c2),
+                                            CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c1), const_CAST_run(c2),
+                                      CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container(result, result_type);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c2), const_CAST_run(c1),
+                                      CAST_run(result));  // TODO make lazy
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container(result, result_type);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute the union between two containers, with result in the first container.
+ * If the returned pointer is identical to c1, then the container has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change. Returns the modified
+ * (and possibly new) container
+ */
+static inline container_t *container_ior(container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            bitset_container_or(const_CAST_bitset(c1), const_CAST_bitset(c2),
+                                CAST_bitset(c1));
+#ifdef OR_BITSET_CONVERSION_TO_FULL
+            if (CAST_bitset(c1)->cardinality == (1 << 16)) {  // we convert
+                result = run_container_create_range(0, (1 << 16));
+                *result_type = RUN_CONTAINER_TYPE;
+                return result;
+            }
+#endif
+            *result_type = BITSET_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type = array_array_container_inplace_union(
+                               CAST_array(c1), const_CAST_array(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            if ((result == NULL) && (*result_type == ARRAY_CONTAINER_TYPE)) {
+                return c1;  // the computation was done in-place!
+            }
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            run_container_union_inplace(CAST_run(c1), const_CAST_run(c2));
+            return convert_run_to_efficient_container(CAST_run(c1),
+                                                      result_type);
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            array_bitset_container_union(
+                const_CAST_array(c2), const_CAST_bitset(c1), CAST_bitset(c1));
+            *result_type = BITSET_CONTAINER_TYPE;  // never array
+            return c1;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            // c1 is an array, so no in-place possible
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_bitset_container_union(const_CAST_array(c1),
+                                         const_CAST_bitset(c2),
+                                         CAST_bitset(result));
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c2), CAST_run(result));
+                return result;
+            }
+            run_bitset_container_union(const_CAST_run(c2),
+                                       const_CAST_bitset(c1),
+                                       CAST_bitset(c1));  // allowed
+            *result_type = BITSET_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            if (run_container_is_full(const_CAST_run(c1))) {
+                *result_type = RUN_CONTAINER_TYPE;
+                return c1;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union(
+                const_CAST_run(c1), const_CAST_bitset(c2), CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c1), const_CAST_run(c2),
+                                      CAST_run(result));
+            result = convert_run_to_efficient_container_and_free(
+                CAST_run(result), result_type);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            array_run_container_inplace_union(const_CAST_array(c2),
+                                              CAST_run(c1));
+            c1 = convert_run_to_efficient_container(CAST_run(c1), result_type);
+            return c1;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Compute the union between two containers, with result in the first container.
+ * If the returned pointer is identical to c1, then the container has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change. Returns the modified
+ * (and possibly new) container
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline container_t *container_lazy_ior(container_t *c1, uint8_t type1,
+                                              const container_t *c2,
+                                              uint8_t type2,
+                                              uint8_t *result_type) {
+    assert(type1 != SHARED_CONTAINER_TYPE);
+    // c1 = get_writable_copy_if_shared(c1,&type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+#ifdef LAZY_OR_BITSET_CONVERSION_TO_FULL
+            // if we have two bitsets, we might as well compute the cardinality
+            bitset_container_or(const_CAST_bitset(c1), const_CAST_bitset(c2),
+                                CAST_bitset(c1));
+            // it is possible that two bitsets can lead to a full container
+            if (CAST_bitset(c1)->cardinality == (1 << 16)) {  // we convert
+                result = run_container_create_range(0, (1 << 16));
+                *result_type = RUN_CONTAINER_TYPE;
+                return result;
+            }
+#else
+            bitset_container_or_nocard(const_CAST_bitset(c1),
+                                       const_CAST_bitset(c2), CAST_bitset(c1));
+
+#endif
+            *result_type = BITSET_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type = array_array_container_lazy_inplace_union(
+                               CAST_array(c1), const_CAST_array(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            if ((result == NULL) && (*result_type == ARRAY_CONTAINER_TYPE)) {
+                return c1;  // the computation was done in-place!
+            }
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            run_container_union_inplace(CAST_run(c1), const_CAST_run(c2));
+            *result_type = RUN_CONTAINER_TYPE;
+            return convert_run_to_efficient_container(CAST_run(c1),
+                                                      result_type);
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            array_bitset_container_lazy_union(const_CAST_array(c2),
+                                              const_CAST_bitset(c1),
+                                              CAST_bitset(c1));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;                // never array
+            return c1;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            // c1 is an array, so no in-place possible
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_bitset_container_lazy_union(const_CAST_array(c1),
+                                              const_CAST_bitset(c2),
+                                              CAST_bitset(result));  // is lazy
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE;
+                run_container_copy(const_CAST_run(c2), CAST_run(result));
+                return result;
+            }
+            run_bitset_container_lazy_union(
+                const_CAST_run(c2), const_CAST_bitset(c1),
+                CAST_bitset(c1));  // allowed //  lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            if (run_container_is_full(const_CAST_run(c1))) {
+                *result_type = RUN_CONTAINER_TYPE;
+                return c1;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(const_CAST_run(c1),
+                                            const_CAST_bitset(c2),
+                                            CAST_bitset(result));  //  lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = run_container_create();
+            array_run_container_union(const_CAST_array(c1), const_CAST_run(c2),
+                                      CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container_and_free(result,
+            // result_type);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            array_run_container_inplace_union(const_CAST_array(c2),
+                                              CAST_run(c1));
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container_and_free(result,
+            // result_type);
+            return c1;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Compute symmetric difference (xor) between two containers, generate a new
+ * container (having type result_type), requires a typecode. This allocates new
+ * memory, caller is responsible for deallocation.
+ */
+static inline container_t *container_xor(const container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type =
+                bitset_bitset_container_xor(const_CAST_bitset(c1),
+                                            const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type =
+                array_array_container_xor(const_CAST_array(c1),
+                                          const_CAST_array(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            *result_type = (uint8_t)run_run_container_xor(
+                const_CAST_run(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            *result_type =
+                array_bitset_container_xor(const_CAST_array(c2),
+                                           const_CAST_bitset(c1), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            *result_type =
+                array_bitset_container_xor(const_CAST_array(c1),
+                                           const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            *result_type =
+                run_bitset_container_xor(const_CAST_run(c2),
+                                         const_CAST_bitset(c1), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type =
+                run_bitset_container_xor(const_CAST_run(c1),
+                                         const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            *result_type = (uint8_t)array_run_container_xor(
+                const_CAST_array(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            *result_type = (uint8_t)array_run_container_xor(
+                const_CAST_array(c2), const_CAST_run(c1), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;  // unreached
+    }
+}
+
+/* Applies an offset to the non-empty container 'c'.
+ * The results are stored in new containers returned via 'lo' and 'hi', for the
+ * low and high halves of the result (where the low half matches the original
+ * key and the high one corresponds to values for the following key). Either one
+ * of 'lo' and 'hi' are allowed to be 'NULL', but not both. Whenever one of them
+ * is not 'NULL', it should point to a 'NULL' container. Whenever one of them is
+ * 'NULL' the shifted elements for that part will not be computed. If either of
+ * the resulting containers turns out to be empty, the pointed container will
+ * remain 'NULL'.
+ */
+static inline void container_add_offset(const container_t *c, uint8_t type,
+                                        container_t **lo, container_t **hi,
+                                        uint16_t offset) {
+    assert(offset != 0);
+    assert(container_nonzero_cardinality(c, type));
+    assert(lo != NULL || hi != NULL);
+    assert(lo == NULL || *lo == NULL);
+    assert(hi == NULL || *hi == NULL);
+
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            bitset_container_offset(const_CAST_bitset(c), lo, hi, offset);
+            break;
+        case ARRAY_CONTAINER_TYPE:
+            array_container_offset(const_CAST_array(c), lo, hi, offset);
+            break;
+        case RUN_CONTAINER_TYPE:
+            run_container_offset(const_CAST_run(c), lo, hi, offset);
+            break;
+        default:
+            assert(false);
+            roaring_unreachable;
+            break;
+    }
+}
+
+/**
+ * Compute xor between two containers, generate a new container (having type
+ * result_type), requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline container_t *container_lazy_xor(const container_t *c1,
+                                              uint8_t type1,
+                                              const container_t *c2,
+                                              uint8_t type2,
+                                              uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            result = bitset_container_create();
+            bitset_container_xor_nocard(const_CAST_bitset(c1),
+                                        const_CAST_bitset(c2),
+                                        CAST_bitset(result));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type =
+                array_array_container_lazy_xor(const_CAST_array(c1),
+                                               const_CAST_array(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            // nothing special done yet.
+            *result_type = (uint8_t)run_run_container_xor(
+                const_CAST_run(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_bitset_container_lazy_xor(const_CAST_array(c2),
+                                            const_CAST_bitset(c1),
+                                            CAST_bitset(result));
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_bitset_container_lazy_xor(const_CAST_array(c1),
+                                            const_CAST_bitset(c2),
+                                            CAST_bitset(result));
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            result = bitset_container_create();
+            run_bitset_container_lazy_xor(
+                const_CAST_run(c2), const_CAST_bitset(c1), CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            result = bitset_container_create();
+            run_bitset_container_lazy_xor(
+                const_CAST_run(c1), const_CAST_bitset(c2), CAST_bitset(result));
+            *result_type = BITSET_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            result = run_container_create();
+            array_run_container_lazy_xor(const_CAST_array(c1),
+                                         const_CAST_run(c2), CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container(result, result_type);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            result = run_container_create();
+            array_run_container_lazy_xor(const_CAST_array(c2),
+                                         const_CAST_run(c1), CAST_run(result));
+            *result_type = RUN_CONTAINER_TYPE;
+            // next line skipped since we are lazy
+            // result = convert_run_to_efficient_container(result, result_type);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute the xor between two containers, with result in the first container.
+ * If the returned pointer is identical to c1, then the container has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created. The original container is freed by container_ixor.
+ * The type of the first container may change. Returns the modified (and
+ * possibly new) container.
+ */
+static inline container_t *container_ixor(container_t *c1, uint8_t type1,
+                                          const container_t *c2, uint8_t type2,
+                                          uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type = bitset_bitset_container_ixor(
+                               CAST_bitset(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            *result_type = array_array_container_ixor(
+                               CAST_array(c1), const_CAST_array(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            *result_type = (uint8_t)run_run_container_ixor(
+                CAST_run(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            *result_type = bitset_array_container_ixor(
+                               CAST_bitset(c1), const_CAST_array(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            *result_type = array_bitset_container_ixor(
+                               CAST_array(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            *result_type = bitset_run_container_ixor(
+                               CAST_bitset(c1), const_CAST_run(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type = run_bitset_container_ixor(
+                               CAST_run(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            *result_type = (uint8_t)array_run_container_ixor(
+                CAST_array(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            *result_type = (uint8_t)run_array_container_ixor(
+                CAST_run(c1), const_CAST_array(c2), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Compute the xor between two containers, with result in the first container.
+ * If the returned pointer is identical to c1, then the container has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change. Returns the modified
+ * (and possibly new) container
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline container_t *container_lazy_ixor(container_t *c1, uint8_t type1,
+                                               const container_t *c2,
+                                               uint8_t type2,
+                                               uint8_t *result_type) {
+    assert(type1 != SHARED_CONTAINER_TYPE);
+    // c1 = get_writable_copy_if_shared(c1,&type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            bitset_container_xor_nocard(CAST_bitset(c1), const_CAST_bitset(c2),
+                                        CAST_bitset(c1));  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE;
+            return c1;
+
+        // TODO: other cases being lazy, esp. when we know inplace not likely
+        // could see the corresponding code for union
+        default:
+            // we may have a dirty bitset (without a precomputed cardinality)
+            // and calling container_ixor on it might be unsafe.
+            if (type1 == BITSET_CONTAINER_TYPE) {
+                bitset_container_t *bc = CAST_bitset(c1);
+                if (bc->cardinality == BITSET_UNKNOWN_CARDINALITY) {
+                    bc->cardinality = bitset_container_compute_cardinality(bc);
+                }
+            }
+            return container_ixor(c1, type1, c2, type2, result_type);
+    }
+}
+
+/**
+ * Compute difference (andnot) between two containers, generate a new
+ * container (having type result_type), requires a typecode. This allocates new
+ * memory, caller is responsible for deallocation.
+ */
+static inline container_t *container_andnot(const container_t *c1,
+                                            uint8_t type1,
+                                            const container_t *c2,
+                                            uint8_t type2,
+                                            uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type =
+                bitset_bitset_container_andnot(const_CAST_bitset(c1),
+                                               const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            result = array_container_create();
+            array_array_container_andnot(
+                const_CAST_array(c1), const_CAST_array(c2), CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = array_container_create();
+                *result_type = ARRAY_CONTAINER_TYPE;
+                return result;
+            }
+            *result_type = (uint8_t)run_run_container_andnot(
+                const_CAST_run(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            *result_type =
+                bitset_array_container_andnot(const_CAST_bitset(c1),
+                                              const_CAST_array(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            result = array_container_create();
+            array_bitset_container_andnot(const_CAST_array(c1),
+                                          const_CAST_bitset(c2),
+                                          CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = array_container_create();
+                *result_type = ARRAY_CONTAINER_TYPE;
+                return result;
+            }
+            *result_type =
+                bitset_run_container_andnot(const_CAST_bitset(c1),
+                                            const_CAST_run(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type =
+                run_bitset_container_andnot(const_CAST_run(c1),
+                                            const_CAST_bitset(c2), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            if (run_container_is_full(const_CAST_run(c2))) {
+                result = array_container_create();
+                *result_type = ARRAY_CONTAINER_TYPE;
+                return result;
+            }
+            result = array_container_create();
+            array_run_container_andnot(const_CAST_array(c1), const_CAST_run(c2),
+                                       CAST_array(result));
+            *result_type = ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            *result_type = (uint8_t)run_array_container_andnot(
+                const_CAST_run(c1), const_CAST_array(c2), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute the andnot between two containers, with result in the first
+ * container.
+ * If the returned pointer is identical to c1, then the container has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created. The original container is freed by container_iandnot.
+ * The type of the first container may change. Returns the modified (and
+ * possibly new) container.
+ */
+static inline container_t *container_iandnot(container_t *c1, uint8_t type1,
+                                             const container_t *c2,
+                                             uint8_t type2,
+                                             uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    container_t *result = NULL;
+    switch (PAIR_CONTAINER_TYPES(type1, type2)) {
+        case CONTAINER_PAIR(BITSET, BITSET):
+            *result_type = bitset_bitset_container_iandnot(
+                               CAST_bitset(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, ARRAY):
+            array_array_container_iandnot(CAST_array(c1), const_CAST_array(c2));
+            *result_type = ARRAY_CONTAINER_TYPE;
+            return c1;
+
+        case CONTAINER_PAIR(RUN, RUN):
+            *result_type = (uint8_t)run_run_container_iandnot(
+                CAST_run(c1), const_CAST_run(c2), &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET, ARRAY):
+            *result_type = bitset_array_container_iandnot(
+                               CAST_bitset(c1), const_CAST_array(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, BITSET):
+            *result_type = ARRAY_CONTAINER_TYPE;
+            array_bitset_container_iandnot(CAST_array(c1),
+                                           const_CAST_bitset(c2));
+            return c1;
+
+        case CONTAINER_PAIR(BITSET, RUN):
+            *result_type = bitset_run_container_iandnot(
+                               CAST_bitset(c1), const_CAST_run(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(RUN, BITSET):
+            *result_type = run_bitset_container_iandnot(
+                               CAST_run(c1), const_CAST_bitset(c2), &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY, RUN):
+            *result_type = ARRAY_CONTAINER_TYPE;
+            array_run_container_iandnot(CAST_array(c1), const_CAST_run(c2));
+            return c1;
+
+        case CONTAINER_PAIR(RUN, ARRAY):
+            *result_type = (uint8_t)run_array_container_iandnot(
+                CAST_run(c1), const_CAST_array(c2), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+/**
+ * Visit all values x of the container once, passing (base+x,ptr)
+ * to iterator. You need to specify a container and its type.
+ * Returns true if the iteration should continue.
+ */
+static inline bool container_iterate(const container_t *c, uint8_t type,
+                                     uint32_t base, roaring_iterator iterator,
+                                     void *ptr) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_iterate(const_CAST_bitset(c), base,
+                                            iterator, ptr);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_iterate(const_CAST_array(c), base, iterator,
+                                           ptr);
+        case RUN_CONTAINER_TYPE:
+            return run_container_iterate(const_CAST_run(c), base, iterator,
+                                         ptr);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+static inline bool container_iterate64(const container_t *c, uint8_t type,
+                                       uint32_t base,
+                                       roaring_iterator64 iterator,
+                                       uint64_t high_bits, void *ptr) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_iterate64(const_CAST_bitset(c), base,
+                                              iterator, high_bits, ptr);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_iterate64(const_CAST_array(c), base,
+                                             iterator, high_bits, ptr);
+        case RUN_CONTAINER_TYPE:
+            return run_container_iterate64(const_CAST_run(c), base, iterator,
+                                           high_bits, ptr);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+static inline container_t *container_not(const container_t *c, uint8_t type,
+                                         uint8_t *result_type) {
+    c = container_unwrap_shared(c, &type);
+    container_t *result = NULL;
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            *result_type =
+                bitset_container_negation(const_CAST_bitset(c), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+        case ARRAY_CONTAINER_TYPE:
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_container_negation(const_CAST_array(c), CAST_bitset(result));
+            return result;
+        case RUN_CONTAINER_TYPE:
+            *result_type =
+                (uint8_t)run_container_negation(const_CAST_run(c), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return NULL;
+}
+
+static inline container_t *container_not_range(const container_t *c,
+                                               uint8_t type,
+                                               uint32_t range_start,
+                                               uint32_t range_end,
+                                               uint8_t *result_type) {
+    c = container_unwrap_shared(c, &type);
+    container_t *result = NULL;
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            *result_type =
+                bitset_container_negation_range(const_CAST_bitset(c),
+                                                range_start, range_end, &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+        case ARRAY_CONTAINER_TYPE:
+            *result_type =
+                array_container_negation_range(const_CAST_array(c), range_start,
+                                               range_end, &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+        case RUN_CONTAINER_TYPE:
+            *result_type = (uint8_t)run_container_negation_range(
+                const_CAST_run(c), range_start, range_end, &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return NULL;
+}
+
+static inline container_t *container_inot(container_t *c, uint8_t type,
+                                          uint8_t *result_type) {
+    c = get_writable_copy_if_shared(c, &type);
+    container_t *result = NULL;
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            *result_type =
+                bitset_container_negation_inplace(CAST_bitset(c), &result)
+                    ? BITSET_CONTAINER_TYPE
+                    : ARRAY_CONTAINER_TYPE;
+            return result;
+        case ARRAY_CONTAINER_TYPE:
+            // will never be inplace
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE;
+            array_container_negation(CAST_array(c), CAST_bitset(result));
+            array_container_free(CAST_array(c));
+            return result;
+        case RUN_CONTAINER_TYPE:
+            *result_type =
+                (uint8_t)run_container_negation_inplace(CAST_run(c), &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return NULL;
+}
+
+static inline container_t *container_inot_range(container_t *c, uint8_t type,
+                                                uint32_t range_start,
+                                                uint32_t range_end,
+                                                uint8_t *result_type) {
+    c = get_writable_copy_if_shared(c, &type);
+    container_t *result = NULL;
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            *result_type = bitset_container_negation_range_inplace(
+                               CAST_bitset(c), range_start, range_end, &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+        case ARRAY_CONTAINER_TYPE:
+            *result_type = array_container_negation_range_inplace(
+                               CAST_array(c), range_start, range_end, &result)
+                               ? BITSET_CONTAINER_TYPE
+                               : ARRAY_CONTAINER_TYPE;
+            return result;
+        case RUN_CONTAINER_TYPE:
+            *result_type = (uint8_t)run_container_negation_range_inplace(
+                CAST_run(c), range_start, range_end, &result);
+            return result;
+
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return NULL;
+}
+
+/**
+ * If the element of given rank is in this container, supposing that
+ * the first
+ * element has rank start_rank, then the function returns true and
+ * sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+static inline bool container_select(const container_t *c, uint8_t type,
+                                    uint32_t *start_rank, uint32_t rank,
+                                    uint32_t *element) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_select(const_CAST_bitset(c), start_rank,
+                                           rank, element);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_select(const_CAST_array(c), start_rank, rank,
+                                          element);
+        case RUN_CONTAINER_TYPE:
+            return run_container_select(const_CAST_run(c), start_rank, rank,
+                                        element);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+static inline uint16_t container_maximum(const container_t *c, uint8_t type) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_maximum(const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_maximum(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_maximum(const_CAST_run(c));
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+static inline uint16_t container_minimum(const container_t *c, uint8_t type) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_minimum(const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_minimum(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_minimum(const_CAST_run(c));
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+// number of values smaller or equal to x
+static inline int container_rank(const container_t *c, uint8_t type,
+                                 uint16_t x) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_rank(const_CAST_bitset(c), x);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_rank(const_CAST_array(c), x);
+        case RUN_CONTAINER_TYPE:
+            return run_container_rank(const_CAST_run(c), x);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+// bulk version of container_rank(); return number of consumed elements
+static inline uint32_t container_rank_many(const container_t *c, uint8_t type,
+                                           uint64_t start_rank,
+                                           const uint32_t *begin,
+                                           const uint32_t *end, uint64_t *ans) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_rank_many(const_CAST_bitset(c), start_rank,
+                                              begin, end, ans);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_rank_many(const_CAST_array(c), start_rank,
+                                             begin, end, ans);
+        case RUN_CONTAINER_TYPE:
+            return run_container_rank_many(const_CAST_run(c), start_rank, begin,
+                                           end, ans);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return 0;
+}
+
+// return the index of x, if not exsist return -1
+static inline int container_get_index(const container_t *c, uint8_t type,
+                                      uint16_t x) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_get_index(const_CAST_bitset(c), x);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_get_index(const_CAST_array(c), x);
+        case RUN_CONTAINER_TYPE:
+            return run_container_get_index(const_CAST_run(c), x);
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+/**
+ * Add all values in range [min, max] to a given container.
+ *
+ * If the returned pointer is different from $container, then a new container
+ * has been created and the caller is responsible for freeing it.
+ * The type of the first container may change. Returns the modified
+ * (and possibly new) container.
+ */
+static inline container_t *container_add_range(container_t *c, uint8_t type,
+                                               uint32_t min, uint32_t max,
+                                               uint8_t *result_type) {
+    // NB: when selecting new container type, we perform only inexpensive checks
+    switch (type) {
+        case BITSET_CONTAINER_TYPE: {
+            bitset_container_t *bitset = CAST_bitset(c);
+
+            int32_t union_cardinality = 0;
+            union_cardinality += bitset->cardinality;
+            union_cardinality += max - min + 1;
+            union_cardinality -=
+                bitset_lenrange_cardinality(bitset->words, min, max - min);
+
+            if (union_cardinality == INT32_C(0x10000)) {
+                *result_type = RUN_CONTAINER_TYPE;
+                return run_container_create_range(0, INT32_C(0x10000));
+            } else {
+                *result_type = BITSET_CONTAINER_TYPE;
+                bitset_set_lenrange(bitset->words, min, max - min);
+                bitset->cardinality = union_cardinality;
+                return bitset;
+            }
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            array_container_t *array = CAST_array(c);
+
+            int32_t nvals_greater =
+                count_greater(array->array, array->cardinality, (uint16_t)max);
+            int32_t nvals_less =
+                count_less(array->array, array->cardinality - nvals_greater,
+                           (uint16_t)min);
+            int32_t union_cardinality =
+                nvals_less + (max - min + 1) + nvals_greater;
+
+            if (union_cardinality == INT32_C(0x10000)) {
+                *result_type = RUN_CONTAINER_TYPE;
+                return run_container_create_range(0, INT32_C(0x10000));
+            } else if (union_cardinality <= DEFAULT_MAX_SIZE) {
+                *result_type = ARRAY_CONTAINER_TYPE;
+                array_container_add_range_nvals(array, min, max, nvals_less,
+                                                nvals_greater);
+                return array;
+            } else {
+                *result_type = BITSET_CONTAINER_TYPE;
+                bitset_container_t *bitset = bitset_container_from_array(array);
+                bitset_set_lenrange(bitset->words, min, max - min);
+                bitset->cardinality = union_cardinality;
+                return bitset;
+            }
+        }
+        case RUN_CONTAINER_TYPE: {
+            run_container_t *run = CAST_run(c);
+
+            int32_t nruns_greater =
+                rle16_count_greater(run->runs, run->n_runs, (uint16_t)max);
+            int32_t nruns_less = rle16_count_less(
+                run->runs, run->n_runs - nruns_greater, (uint16_t)min);
+
+            int32_t run_size_bytes =
+                (nruns_less + 1 + nruns_greater) * sizeof(rle16_t);
+            int32_t bitset_size_bytes =
+                BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+
+            if (run_size_bytes <= bitset_size_bytes) {
+                run_container_add_range_nruns(run, min, max, nruns_less,
+                                              nruns_greater);
+                *result_type = RUN_CONTAINER_TYPE;
+                return run;
+            } else {
+                return container_from_run_range(run, min, max, result_type);
+            }
+        }
+        default:
+            roaring_unreachable;
+    }
+}
+
+/*
+ * Removes all elements in range [min, max].
+ * Returns one of:
+ *   - NULL if no elements left
+ *   - pointer to the original container
+ *   - pointer to a newly-allocated container (if it is more efficient)
+ *
+ * If the returned pointer is different from $container, then a new container
+ * has been created and the caller is responsible for freeing the original
+ * container.
+ */
+static inline container_t *container_remove_range(container_t *c, uint8_t type,
+                                                  uint32_t min, uint32_t max,
+                                                  uint8_t *result_type) {
+    switch (type) {
+        case BITSET_CONTAINER_TYPE: {
+            bitset_container_t *bitset = CAST_bitset(c);
+
+            int32_t result_cardinality =
+                bitset->cardinality -
+                bitset_lenrange_cardinality(bitset->words, min, max - min);
+
+            if (result_cardinality == 0) {
+                return NULL;
+            } else if (result_cardinality <= DEFAULT_MAX_SIZE) {
+                *result_type = ARRAY_CONTAINER_TYPE;
+                bitset_reset_range(bitset->words, min, max + 1);
+                bitset->cardinality = result_cardinality;
+                return array_container_from_bitset(bitset);
+            } else {
+                *result_type = BITSET_CONTAINER_TYPE;
+                bitset_reset_range(bitset->words, min, max + 1);
+                bitset->cardinality = result_cardinality;
+                return bitset;
+            }
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            array_container_t *array = CAST_array(c);
+
+            int32_t nvals_greater =
+                count_greater(array->array, array->cardinality, (uint16_t)max);
+            int32_t nvals_less =
+                count_less(array->array, array->cardinality - nvals_greater,
+                           (uint16_t)min);
+            int32_t result_cardinality = nvals_less + nvals_greater;
+
+            if (result_cardinality == 0) {
+                return NULL;
+            } else {
+                *result_type = ARRAY_CONTAINER_TYPE;
+                array_container_remove_range(
+                    array, nvals_less, array->cardinality - result_cardinality);
+                return array;
+            }
+        }
+        case RUN_CONTAINER_TYPE: {
+            run_container_t *run = CAST_run(c);
+
+            if (run->n_runs == 0) {
+                return NULL;
+            }
+            if (min <= run_container_minimum(run) &&
+                max >= run_container_maximum(run)) {
+                return NULL;
+            }
+
+            run_container_remove_range(run, min, max);
+            return convert_run_to_efficient_container(run, result_type);
+        }
+        default:
+            roaring_unreachable;
+    }
+}
+
+#ifdef __cplusplus
+using api::roaring_container_iterator_t;
+#endif
+
+/**
+ * Initializes the iterator at the first entry in the container.
+ */
+roaring_container_iterator_t container_init_iterator(const container_t *c,
+                                                     uint8_t typecode,
+                                                     uint16_t *value);
+
+/**
+ * Initializes the iterator at the last entry in the container.
+ */
+roaring_container_iterator_t container_init_iterator_last(const container_t *c,
+                                                          uint8_t typecode,
+                                                          uint16_t *value);
+
+/**
+ * Moves the iterator to the next entry. Returns true and sets `value` if a
+ * value is present.
+ */
+bool container_iterator_next(const container_t *c, uint8_t typecode,
+                             roaring_container_iterator_t *it, uint16_t *value);
+
+/**
+ * Moves the iterator to the previous entry. Returns true and sets `value` if a
+ * value is present.
+ */
+bool container_iterator_prev(const container_t *c, uint8_t typecode,
+                             roaring_container_iterator_t *it, uint16_t *value);
+
+/**
+ * Moves the iterator to the smallest entry that is greater than or equal to
+ * `val`. Returns true and sets `value_out` if a value is present. `value_out`
+ * should be initialized to a value.
+ */
+bool container_iterator_lower_bound(const container_t *c, uint8_t typecode,
+                                    roaring_container_iterator_t *it,
+                                    uint16_t *value_out, uint16_t val);
+
+/**
+ * Reads up to `count` entries from the container, and writes them into `buf`
+ * as `high16 | entry`. Returns true and sets `value_out` if a value is present
+ * after reading the entries. Sets `consumed` to the number of values read.
+ * `count` should be greater than zero.
+ */
+bool container_iterator_read_into_uint32(const container_t *c, uint8_t typecode,
+                                         roaring_container_iterator_t *it,
+                                         uint32_t high16, uint32_t *buf,
+                                         uint32_t count, uint32_t *consumed,
+                                         uint16_t *value_out);
+
+/**
+ * Reads up to `count` entries from the container, and writes them into `buf`
+ * as `high48 | entry`. Returns true and sets `value_out` if a value is present
+ * after reading the entries. Sets `consumed` to the number of values read.
+ * `count` should be greater than zero.
+ */
+bool container_iterator_read_into_uint64(const container_t *c, uint8_t typecode,
+                                         roaring_container_iterator_t *it,
+                                         uint64_t high48, uint64_t *buf,
+                                         uint32_t count, uint32_t *consumed,
+                                         uint16_t *value_out);
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#endif
+/* end file include/roaring/containers/containers.h */
+/* begin file include/roaring/roaring_array.h */
+#ifndef INCLUDE_ROARING_ARRAY_H
+#define INCLUDE_ROARING_ARRAY_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+
+// Note: in pure C++ code, you should avoid putting `using` in header files
+using api::roaring_array_t;
+
+namespace internal {
+#endif
+
+enum {
+    SERIAL_COOKIE_NO_RUNCONTAINER = 12346,
+    SERIAL_COOKIE = 12347,
+    FROZEN_COOKIE = 13766,
+    NO_OFFSET_THRESHOLD = 4
+};
+
+/**
+ * Create a new roaring array
+ */
+roaring_array_t *ra_create(void);
+
+/**
+ * Initialize an existing roaring array with the specified capacity (in number
+ * of containers)
+ */
+bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap);
+
+/**
+ * Initialize with zero capacity
+ */
+void ra_init(roaring_array_t *t);
+
+/**
+ * Copies this roaring array, we assume that dest is not initialized
+ */
+bool ra_copy(const roaring_array_t *source, roaring_array_t *dest,
+             bool copy_on_write);
+
+/*
+ * Shrinks the capacity, returns the number of bytes saved.
+ */
+int ra_shrink_to_fit(roaring_array_t *ra);
+
+/**
+ * Copies this roaring array, we assume that dest is initialized
+ */
+bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
+                  bool copy_on_write);
+
+/**
+ * Frees the memory used by a roaring array
+ */
+void ra_clear(roaring_array_t *r);
+
+/**
+ * Frees the memory used by a roaring array, but does not free the containers
+ */
+void ra_clear_without_containers(roaring_array_t *r);
+
+/**
+ * Frees just the containers
+ */
+void ra_clear_containers(roaring_array_t *ra);
+
+/**
+ * Get the index corresponding to a 16-bit key
+ */
+inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x) {
+    if ((ra->size == 0) || ra->keys[ra->size - 1] == x) return ra->size - 1;
+    return binarySearch(ra->keys, (int32_t)ra->size, x);
+}
+
+/**
+ * Retrieves the container at index i, filling in the typecode
+ */
+inline container_t *ra_get_container_at_index(const roaring_array_t *ra,
+                                              uint16_t i, uint8_t *typecode) {
+    *typecode = ra->typecodes[i];
+    return ra->containers[i];
+}
+
+/**
+ * Retrieves the key at index i
+ */
+inline uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
+    return ra->keys[i];
+}
+
+/**
+ * Add a new key-value pair at index i
+ */
+void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
+                                container_t *c, uint8_t typecode);
+
+/**
+ * Append a new key-value pair
+ */
+void ra_append(roaring_array_t *ra, uint16_t key, container_t *c,
+               uint8_t typecode);
+
+/**
+ * Append a new key-value pair to ra, cloning (in COW sense) a value from sa
+ * at index index
+ */
+void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
+                    uint16_t index, bool copy_on_write);
+
+/**
+ * Append new key-value pairs to ra, cloning (in COW sense)  values from sa
+ * at indexes
+ * [start_index, end_index)
+ */
+void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index,
+                          bool copy_on_write);
+
+/** appends from sa to ra, ending with the greatest key that is
+ * is less or equal stopping_key
+ */
+void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t stopping_key, bool copy_on_write);
+
+/** appends from sa to ra, starting with the smallest key that is
+ * is strictly greater than before_start
+ */
+
+void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t before_start, bool copy_on_write);
+
+/**
+ * Move the key-value pairs to ra from sa at indexes
+ * [start_index, end_index), old array should not be freed
+ * (use ra_clear_without_containers)
+ **/
+void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index);
+/**
+ * Append new key-value pairs to ra,  from sa at indexes
+ * [start_index, end_index)
+ */
+void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
+                     int32_t start_index, int32_t end_index,
+                     bool copy_on_write);
+
+/**
+ * Set the container at the corresponding index using the specified
+ * typecode.
+ */
+inline void ra_set_container_at_index(const roaring_array_t *ra, int32_t i,
+                                      container_t *c, uint8_t typecode) {
+    assert(i < ra->size);
+    ra->containers[i] = c;
+    ra->typecodes[i] = typecode;
+}
+
+container_t *ra_get_container(roaring_array_t *ra, uint16_t x,
+                              uint8_t *typecode);
+
+/**
+ * If needed, increase the capacity of the array so that it can fit k values
+ * (at
+ * least);
+ */
+bool extend_array(roaring_array_t *ra, int32_t k);
+
+inline int32_t ra_get_size(const roaring_array_t *ra) { return ra->size; }
+
+static inline int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x,
+                                       int32_t pos) {
+    return advanceUntil(ra->keys, pos, ra->size, x);
+}
+
+int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos);
+
+void ra_downsize(roaring_array_t *ra, int32_t new_length);
+
+inline void ra_replace_key_and_container_at_index(roaring_array_t *ra,
+                                                  int32_t i, uint16_t key,
+                                                  container_t *c,
+                                                  uint8_t typecode) {
+    assert(i < ra->size);
+
+    ra->keys[i] = key;
+    ra->containers[i] = c;
+    ra->typecodes[i] = typecode;
+}
+
+// write set bits to an array
+void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans);
+
+bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset,
+                           size_t limit, uint32_t *ans);
+
+/**
+ * write a bitmap to a buffer. This is meant to be compatible with
+ * the
+ * Java and Go versions. Return the size in bytes of the serialized
+ * output (which should be ra_portable_size_in_bytes(ra)).
+ */
+size_t ra_portable_serialize(const roaring_array_t *ra, char *buf);
+
+/**
+ * read a bitmap from a serialized version. This is meant to be compatible
+ * with the Java and Go versions.
+ * maxbytes  indicates how many bytes available from buf.
+ * When the function returns true, roaring_array_t is populated with the data
+ * and *readbytes indicates how many bytes were read. In all cases, if the
+ * function returns true, then maxbytes >= *readbytes.
+ */
+bool ra_portable_deserialize(roaring_array_t *ra, const char *buf,
+                             const size_t maxbytes, size_t *readbytes);
+
+/**
+ * Quickly checks whether there is a serialized bitmap at the pointer,
+ * not exceeding size "maxbytes" in bytes. This function does not allocate
+ * memory dynamically.
+ *
+ * This function returns 0 if and only if no valid bitmap is found.
+ * Otherwise, it returns how many bytes are occupied by the bitmap data.
+ */
+size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes);
+
+/**
+ * How many bytes are required to serialize this bitmap (meant to be
+ * compatible
+ * with Java and Go versions)
+ */
+size_t ra_portable_size_in_bytes(const roaring_array_t *ra);
+
+/**
+ * return true if it contains at least one run container.
+ */
+bool ra_has_run_container(const roaring_array_t *ra);
+
+/**
+ * Size of the header when serializing (meant to be compatible
+ * with Java and Go versions)
+ */
+uint32_t ra_portable_header_size(const roaring_array_t *ra);
+
+/**
+ * If the container at the index i is share, unshare it (creating a local
+ * copy if needed).
+ */
+static inline void ra_unshare_container_at_index(roaring_array_t *ra,
+                                                 uint16_t i) {
+    assert(i < ra->size);
+    ra->containers[i] =
+        get_writable_copy_if_shared(ra->containers[i], &ra->typecodes[i]);
+}
+
+/**
+ * remove at index i, sliding over all entries after i
+ */
+void ra_remove_at_index(roaring_array_t *ra, int32_t i);
+
+/**
+ * clears all containers, sets the size at 0 and shrinks the memory usage.
+ */
+void ra_reset(roaring_array_t *ra);
+
+/**
+ * remove at index i, sliding over all entries after i. Free removed container.
+ */
+void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i);
+
+/**
+ * remove a chunk of indices, sliding over entries after it
+ */
+// void ra_remove_index_range(roaring_array_t *ra, int32_t begin, int32_t end);
+
+// used in inplace andNot only, to slide left the containers from
+// the mutated RoaringBitmap that are after the largest container of
+// the argument RoaringBitmap.  It is followed by a call to resize.
+//
+void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
+                   uint32_t new_begin);
+
+/**
+ * Shifts rightmost $count containers to the left (distance < 0) or
+ * to the right (distance > 0).
+ * Allocates memory if necessary.
+ * This function doesn't free or create new containers.
+ * Caller is responsible for that.
+ */
+void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance);
+
+#ifdef __cplusplus
+}  // namespace internal
+}
+}  // extern "C" { namespace roaring {
+#endif
+
+#endif
+/* end file include/roaring/roaring_array.h */
+/* begin file include/roaring/art/art.h */
+#ifndef ART_ART_H
+#define ART_ART_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * This file contains an implementation of an Adaptive Radix Tree as described
+ * in https://db.in.tum.de/~leis/papers/ART.pdf.
+ *
+ * The ART contains the keys in _byte lexographical_ order.
+ *
+ * Other features:
+ *  * Fixed 48 bit key length: all keys are assumed to be be 48 bits in size.
+ *    This allows us to put the key and key prefixes directly in nodes, reducing
+ *    indirection at no additional memory overhead.
+ *  * Key compression: the only inner nodes created are at points where key
+ *    chunks _differ_. This means that if there are two entries with different
+ *    high 48 bits, then there is only one inner node containing the common key
+ *    prefix, and two leaves.
+ *  * Intrusive leaves: the leaf struct is included in user values. This removes
+ *    a layer of indirection.
+ */
+
+// Fixed length of keys in the ART. All keys are assumed to be of this length.
+#define ART_KEY_BYTES 6
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+typedef uint8_t art_key_chunk_t;
+typedef struct art_node_s art_node_t;
+
+/**
+ * Wrapper to allow an empty tree.
+ */
+typedef struct art_s {
+    art_node_t *root;
+} art_t;
+
+/**
+ * Values inserted into the tree have to be cast-able to art_val_t. This
+ * improves performance by reducing indirection.
+ *
+ * NOTE: Value pointers must be unique! This is because each value struct
+ * contains the key corresponding to the value.
+ */
+typedef struct art_val_s {
+    art_key_chunk_t key[ART_KEY_BYTES];
+} art_val_t;
+
+/**
+ * Compares two keys, returns their relative order:
+ *  * Key 1 <  key 2: returns a negative value
+ *  * Key 1 == key 2: returns 0
+ *  * Key 1 >  key 2: returns a positive value
+ */
+int art_compare_keys(const art_key_chunk_t key1[],
+                     const art_key_chunk_t key2[]);
+
+/**
+ * Inserts the given key and value.
+ */
+void art_insert(art_t *art, const art_key_chunk_t *key, art_val_t *val);
+
+/**
+ * Returns the value erased, NULL if not found.
+ */
+art_val_t *art_erase(art_t *art, const art_key_chunk_t *key);
+
+/**
+ * Returns the value associated with the given key, NULL if not found.
+ */
+art_val_t *art_find(const art_t *art, const art_key_chunk_t *key);
+
+/**
+ * Returns true if the ART is empty.
+ */
+bool art_is_empty(const art_t *art);
+
+/**
+ * Frees the nodes of the ART except the values, which the user is expected to
+ * free.
+ */
+void art_free(art_t *art);
+
+/**
+ * Returns the size in bytes of the ART. Includes size of pointers to values,
+ * but not the values themselves.
+ */
+size_t art_size_in_bytes(const art_t *art);
+
+/**
+ * Prints the ART using printf, useful for debugging.
+ */
+void art_printf(const art_t *art);
+
+/**
+ * Callback for validating the value stored in a leaf.
+ *
+ * Should return true if the value is valid, false otherwise
+ * If false is returned, `*reason` should be set to a static string describing
+ * the reason for the failure.
+ */
+typedef bool (*art_validate_cb_t)(const art_val_t *val, const char **reason);
+
+/**
+ * Validate the ART tree, ensuring it is internally consistent.
+ */
+bool art_internal_validate(const art_t *art, const char **reason,
+                           art_validate_cb_t validate_cb);
+
+/**
+ * ART-internal iterator bookkeeping. Users should treat this as an opaque type.
+ */
+typedef struct art_iterator_frame_s {
+    art_node_t *node;
+    uint8_t index_in_node;
+} art_iterator_frame_t;
+
+/**
+ * Users should only access `key` and `value` in iterators. The iterator is
+ * valid when `value != NULL`.
+ */
+typedef struct art_iterator_s {
+    art_key_chunk_t key[ART_KEY_BYTES];
+    art_val_t *value;
+
+    uint8_t depth;  // Key depth
+    uint8_t frame;  // Node depth
+
+    // State for each node in the ART the iterator has travelled from the root.
+    // This is `ART_KEY_BYTES + 1` because it includes state for the leaf too.
+    art_iterator_frame_t frames[ART_KEY_BYTES + 1];
+} art_iterator_t;
+
+/**
+ * Creates an iterator initialzed to the first or last entry in the ART,
+ * depending on `first`. The iterator is not valid if there are no entries in
+ * the ART.
+ */
+art_iterator_t art_init_iterator(const art_t *art, bool first);
+
+/**
+ * Returns an initialized iterator positioned at a key equal to or greater than
+ * the given key, if it exists.
+ */
+art_iterator_t art_lower_bound(const art_t *art, const art_key_chunk_t *key);
+
+/**
+ * Returns an initialized iterator positioned at a key greater than the given
+ * key, if it exists.
+ */
+art_iterator_t art_upper_bound(const art_t *art, const art_key_chunk_t *key);
+
+/**
+ * The following iterator movement functions return true if a new entry was
+ * encountered.
+ */
+bool art_iterator_move(art_iterator_t *iterator, bool forward);
+bool art_iterator_next(art_iterator_t *iterator);
+bool art_iterator_prev(art_iterator_t *iterator);
+
+/**
+ * Moves the iterator forward to a key equal to or greater than the given key.
+ */
+bool art_iterator_lower_bound(art_iterator_t *iterator,
+                              const art_key_chunk_t *key);
+
+/**
+ * Insert the value and positions the iterator at the key.
+ */
+void art_iterator_insert(art_t *art, art_iterator_t *iterator,
+                         const art_key_chunk_t *key, art_val_t *val);
+
+/**
+ * Erase the value pointed at by the iterator. Moves the iterator to the next
+ * leaf. Returns the value erased or NULL if nothing was erased.
+ */
+art_val_t *art_iterator_erase(art_t *art, art_iterator_t *iterator);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace roaring
+}  // namespace internal
+#endif
+
+#endif
+/* end file include/roaring/art/art.h */
+/* begin file src/array_util.c */
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+using namespace ::roaring::internal;
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+extern inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
+                                   uint16_t ikey);
+
+#if CROARING_IS_X64
+// used by intersect_vector16
+ALIGNED(0x1000)
+static const uint8_t shuffle_mask16[] = {
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 6,    7,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
+    12,   13,   14,   15};
+
+/**
+ * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
+ * Optimized by D. Lemire on May 3rd 2013
+ */
+CROARING_TARGET_AVX2
+int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b,
+                           uint16_t *C) {
+    size_t count = 0;
+    size_t i_a = 0, i_b = 0;
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;
+    const size_t st_b = (s_b / vectorlength) * vectorlength;
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
+            count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b))
+            while (true) {
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);
+                __m128i sm16 =
+                    _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
+                count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+    }
+    // intersect the tail using scalar intersection
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            C[count] = a;  //==b;
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+
+ALLOW_UNALIGNED
+int array_container_to_uint32_array_vector16(void *vout, const uint16_t *array,
+                                             size_t cardinality,
+                                             uint32_t base) {
+    int outpos = 0;
+    uint32_t *out = (uint32_t *)vout;
+    size_t i = 0;
+    for (; i + sizeof(__m128i) / sizeof(uint16_t) <= cardinality;
+         i += sizeof(__m128i) / sizeof(uint16_t)) {
+        __m128i vinput = _mm_loadu_si128((const __m128i *)(array + i));
+        __m256i voutput = _mm256_add_epi32(_mm256_cvtepu16_epi32(vinput),
+                                           _mm256_set1_epi32(base));
+        _mm256_storeu_si256((__m256i *)(out + outpos), voutput);
+        outpos += sizeof(__m256i) / sizeof(uint32_t);
+    }
+    for (; i < cardinality; ++i) {
+        const uint32_t val = base + array[i];
+        memcpy(out + outpos, &val,
+               sizeof(uint32_t));  // should be compiled as a MOV on x64
+        outpos++;
+    }
+    return outpos;
+}
+
+int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a,
+                                   const uint16_t *__restrict__ B, size_t s_b) {
+    size_t count = 0;
+    size_t i_a = 0, i_b = 0;
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;
+    const size_t st_b = (s_b / vectorlength) * vectorlength;
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        __m128i tmp[2] = {_mm_setzero_si128()};
+        size_t tmp_count = 0;
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i *)&((uint16_t *)tmp)[tmp_count], p);
+            tmp_count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {
+                _mm_storeu_si128((__m128i *)&A[count], tmp[0]);
+                _mm_storeu_si128(tmp, _mm_setzero_si128());
+                count += tmp_count;
+                tmp_count = 0;
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b)) {
+            while (true) {
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);
+                __m128i sm16 =
+                    _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i *)&((uint16_t *)tmp)[tmp_count], p);
+                tmp_count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {
+                    _mm_storeu_si128((__m128i *)&A[count], tmp[0]);
+                    _mm_storeu_si128(tmp, _mm_setzero_si128());
+                    count += tmp_count;
+                    tmp_count = 0;
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+        }
+        // tmp_count <= 8, so this does not affect efficiency so much
+        for (size_t i = 0; i < tmp_count; i++) {
+            A[count] = ((uint16_t *)tmp)[i];
+            count++;
+        }
+        i_a += tmp_count;  // We can at least jump pass $tmp_count elements in A
+    }
+    // intersect the tail using scalar intersection
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            A[count] = a;  //==b;
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
+                                       size_t s_a,
+                                       const uint16_t *__restrict__ B,
+                                       size_t s_b) {
+    size_t count = 0;
+    size_t i_a = 0, i_b = 0;
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;
+    const size_t st_b = (s_b / vectorlength) * vectorlength;
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);
+            count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b))
+            while (true) {
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);
+                count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+    }
+    // intersect the tail using scalar intersection
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+/////////
+// Warning:
+// This function may not be safe if A == C or B == C.
+/////////
+int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                            const uint16_t *__restrict__ B, size_t s_b,
+                            uint16_t *C) {
+    // we handle the degenerate case
+    if (s_a == 0) return 0;
+    if (s_b == 0) {
+        if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a);
+        return (int32_t)s_a;
+    }
+    // handle the leading zeroes, it is messy but it allows us to use the fast
+    // _mm_cmpistrm instrinsic safely
+    int32_t count = 0;
+    if ((A[0] == 0) || (B[0] == 0)) {
+        if ((A[0] == 0) && (B[0] == 0)) {
+            A++;
+            s_a--;
+            B++;
+            s_b--;
+        } else if (A[0] == 0) {
+            C[count++] = 0;
+            A++;
+            s_a--;
+        } else {
+            B++;
+            s_b--;
+        }
+    }
+    // at this point, we have two non-empty arrays, made of non-zero
+    // increasing values.
+    size_t i_a = 0, i_b = 0;
+    const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;
+    const size_t st_b = (s_b / vectorlength) * vectorlength;
+    if ((i_a < st_a) && (i_b < st_b)) {  // this is the vectorized code path
+        __m128i v_a, v_b;                //, v_bmax;
+        // we load a vector from A and a vector from B
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        // we have a runningmask which indicates which values from A have been
+        // spotted in B, these don't get written out.
+        __m128i runningmask_a_found_in_b = _mm_setzero_si128();
+        /****
+         * start of the main vectorized loop
+         *****/
+        while (true) {
+            // afoundinb will contain a mask indicate for each entry in A
+            // whether it is seen
+            // in B
+            const __m128i a_found_in_b = _mm_cmpistrm(
+                v_b, v_a,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            runningmask_a_found_in_b =
+                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
+            // we always compare the last values of A and B
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {
+                // Ok. In this code path, we are ready to write our v_a
+                // because there is no need to read more from B, they will
+                // all be large values.
+                const int bitmask_belongs_to_difference =
+                    _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
+                /*** next few lines are probably expensive *****/
+                __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 +
+                                               bitmask_belongs_to_difference);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
+                count += _mm_popcnt_u32(bitmask_belongs_to_difference);
+                // we advance a
+                i_a += vectorlength;
+                if (i_a == st_a)  // no more
+                    break;
+                runningmask_a_found_in_b = _mm_setzero_si128();
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {
+                // in this code path, the current v_b has become useless
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        // at this point, either we have i_a == st_a, which is the end of the
+        // vectorized processing,
+        // or we have i_b == st_b,  and we are not done processing the vector...
+        // so we need to finish it off.
+        if (i_a < st_a) {        // we have unfinished business...
+            uint16_t buffer[8];  // buffer to do a masked load
+            memset(buffer, 0, 8 * sizeof(uint16_t));
+            memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t));
+            v_b = _mm_lddqu_si128((__m128i *)buffer);
+            const __m128i a_found_in_b = _mm_cmpistrm(
+                v_b, v_a,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            runningmask_a_found_in_b =
+                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
+            const int bitmask_belongs_to_difference =
+                _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 +
+                                           bitmask_belongs_to_difference);
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
+            count += _mm_popcnt_u32(bitmask_belongs_to_difference);
+            i_a += vectorlength;
+        }
+        // at this point we should have i_a == st_a and i_b == st_b
+    }
+    // do the tail using scalar code
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (b < a) {
+            i_b++;
+        } else if (a < b) {
+            C[count] = a;
+            count++;
+            i_a++;
+        } else {  //==
+            i_a++;
+            i_b++;
+        }
+    }
+    if (i_a < s_a) {
+        if (C == A) {
+            assert((size_t)count <= i_a);
+            if ((size_t)count < i_a) {
+                memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a));
+            }
+        } else {
+            for (size_t i = 0; i < (s_a - i_a); i++) {
+                C[count + i] = A[i + i_a];
+            }
+        }
+        count += (int32_t)(s_a - i_a);
+    }
+    return count;
+}
+CROARING_UNTARGET_AVX2
+#endif  // CROARING_IS_X64
+
+/**
+ * Branchless binary search going after 4 values at once.
+ * Assumes that array is sorted.
+ * You have that array[*index1] >= target1, array[*index12] >= target2, ...
+ * except when *index1 = n, in which case you know that all values in array are
+ * smaller than target1, and so forth.
+ * It has logarithmic complexity.
+ */
+static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1,
+                          uint16_t target2, uint16_t target3, uint16_t target4,
+                          int32_t *index1, int32_t *index2, int32_t *index3,
+                          int32_t *index4) {
+    const uint16_t *base1 = array;
+    const uint16_t *base2 = array;
+    const uint16_t *base3 = array;
+    const uint16_t *base4 = array;
+    if (n == 0) return;
+    while (n > 1) {
+        int32_t half = n >> 1;
+        base1 = (base1[half] < target1) ? &base1[half] : base1;
+        base2 = (base2[half] < target2) ? &base2[half] : base2;
+        base3 = (base3[half] < target3) ? &base3[half] : base3;
+        base4 = (base4[half] < target4) ? &base4[half] : base4;
+        n -= half;
+    }
+    *index1 = (int32_t)((*base1 < target1) + base1 - array);
+    *index2 = (int32_t)((*base2 < target2) + base2 - array);
+    *index3 = (int32_t)((*base3 < target3) + base3 - array);
+    *index4 = (int32_t)((*base4 < target4) + base4 - array);
+}
+
+/**
+ * Branchless binary search going after 2 values at once.
+ * Assumes that array is sorted.
+ * You have that array[*index1] >= target1, array[*index12] >= target2.
+ * except when *index1 = n, in which case you know that all values in array are
+ * smaller than target1, and so forth.
+ * It has logarithmic complexity.
+ */
+static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1,
+                          uint16_t target2, int32_t *index1, int32_t *index2) {
+    const uint16_t *base1 = array;
+    const uint16_t *base2 = array;
+    if (n == 0) return;
+    while (n > 1) {
+        int32_t half = n >> 1;
+        base1 = (base1[half] < target1) ? &base1[half] : base1;
+        base2 = (base2[half] < target2) ? &base2[half] : base2;
+        n -= half;
+    }
+    *index1 = (int32_t)((*base1 < target1) + base1 - array);
+    *index2 = (int32_t)((*base2 < target2) + base2 - array);
+}
+
+/* Computes the intersection between one small and one large set of uint16_t.
+ * Stores the result into buffer and return the number of elements.
+ * Processes the small set in blocks of 4 values calling binarySearch4
+ * and binarySearch2. This approach can be slightly superior to a conventional
+ * galloping search in some instances.
+ */
+int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
+                                const uint16_t *large, size_t size_l,
+                                uint16_t *buffer) {
+    size_t pos = 0, idx_l = 0, idx_s = 0;
+
+    if (0 == size_s) {
+        return 0;
+    }
+    int32_t index1 = 0, index2 = 0, index3 = 0, index4 = 0;
+    while ((idx_s + 4 <= size_s) && (idx_l < size_l)) {
+        uint16_t target1 = small[idx_s];
+        uint16_t target2 = small[idx_s + 1];
+        uint16_t target3 = small[idx_s + 2];
+        uint16_t target4 = small[idx_s + 3];
+        binarySearch4(large + idx_l, (int32_t)(size_l - idx_l), target1,
+                      target2, target3, target4, &index1, &index2, &index3,
+                      &index4);
+        if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) {
+            buffer[pos++] = target1;
+        }
+        if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) {
+            buffer[pos++] = target2;
+        }
+        if ((index3 + idx_l < size_l) && (large[idx_l + index3] == target3)) {
+            buffer[pos++] = target3;
+        }
+        if ((index4 + idx_l < size_l) && (large[idx_l + index4] == target4)) {
+            buffer[pos++] = target4;
+        }
+        idx_s += 4;
+        idx_l += index4;
+    }
+    if ((idx_s + 2 <= size_s) && (idx_l < size_l)) {
+        uint16_t target1 = small[idx_s];
+        uint16_t target2 = small[idx_s + 1];
+        binarySearch2(large + idx_l, (int32_t)(size_l - idx_l), target1,
+                      target2, &index1, &index2);
+        if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) {
+            buffer[pos++] = target1;
+        }
+        if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) {
+            buffer[pos++] = target2;
+        }
+        idx_s += 2;
+        idx_l += index2;
+    }
+    if ((idx_s < size_s) && (idx_l < size_l)) {
+        uint16_t val_s = small[idx_s];
+        int32_t index =
+            binarySearch(large + idx_l, (int32_t)(size_l - idx_l), val_s);
+        if (index >= 0) buffer[pos++] = val_s;
+    }
+    return (int32_t)pos;
+}
+
+// TODO: this could be accelerated, possibly, by using binarySearch4 as above.
+int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
+                                            size_t size_s,
+                                            const uint16_t *large,
+                                            size_t size_l) {
+    size_t pos = 0, idx_l = 0, idx_s = 0;
+
+    if (0 == size_s) {
+        return 0;
+    }
+
+    uint16_t val_l = large[idx_l], val_s = small[idx_s];
+
+    while (true) {
+        if (val_l < val_s) {
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;
+            val_l = large[idx_l];
+        } else if (val_s < val_l) {
+            idx_s++;
+            if (idx_s == size_s) break;
+            val_s = small[idx_s];
+        } else {
+            pos++;
+            idx_s++;
+            if (idx_s == size_s) break;
+            val_s = small[idx_s];
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;
+            val_l = large[idx_l];
+        }
+    }
+
+    return (int32_t)pos;
+}
+
+bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
+                                      const uint16_t *large, size_t size_l) {
+    size_t idx_l = 0, idx_s = 0;
+
+    if (0 == size_s) {
+        return false;
+    }
+
+    uint16_t val_l = large[idx_l], val_s = small[idx_s];
+
+    while (true) {
+        if (val_l < val_s) {
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;
+            val_l = large[idx_l];
+        } else if (val_s < val_l) {
+            idx_s++;
+            if (idx_s == size_s) break;
+            val_s = small[idx_s];
+        } else {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/**
+ * Generic intersection function.
+ */
+int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB, uint16_t *out) {
+    const uint16_t *initout = out;
+    if (lenA == 0 || lenB == 0) return 0;
+    const uint16_t *endA = A + lenA;
+    const uint16_t *endB = B + lenB;
+
+    while (1) {
+        while (*A < *B) {
+        SKIP_FIRST_COMPARE:
+            if (++A == endA) return (int32_t)(out - initout);
+        }
+        while (*A > *B) {
+            if (++B == endB) return (int32_t)(out - initout);
+        }
+        if (*A == *B) {
+            *out++ = *A;
+            if (++A == endA || ++B == endB) return (int32_t)(out - initout);
+        } else {
+            goto SKIP_FIRST_COMPARE;
+        }
+    }
+    // return (int32_t)(out - initout);  // NOTREACHED
+}
+
+int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
+                                     const uint16_t *B, const size_t lenB) {
+    int32_t answer = 0;
+    if (lenA == 0 || lenB == 0) return 0;
+    const uint16_t *endA = A + lenA;
+    const uint16_t *endB = B + lenB;
+
+    while (1) {
+        while (*A < *B) {
+        SKIP_FIRST_COMPARE:
+            if (++A == endA) return answer;
+        }
+        while (*A > *B) {
+            if (++B == endB) return answer;
+        }
+        if (*A == *B) {
+            ++answer;
+            if (++A == endA || ++B == endB) return answer;
+        } else {
+            goto SKIP_FIRST_COMPARE;
+        }
+    }
+    // return answer;  // NOTREACHED
+}
+
+bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
+                               const uint16_t *B, const size_t lenB) {
+    if (lenA == 0 || lenB == 0) return 0;
+    const uint16_t *endA = A + lenA;
+    const uint16_t *endB = B + lenB;
+
+    while (1) {
+        while (*A < *B) {
+        SKIP_FIRST_COMPARE:
+            if (++A == endA) return false;
+        }
+        while (*A > *B) {
+            if (++B == endB) return false;
+        }
+        if (*A == *B) {
+            return true;
+        } else {
+            goto SKIP_FIRST_COMPARE;
+        }
+    }
+    return false;  // NOTREACHED
+}
+
+/**
+ * Generic intersection function.
+ */
+size_t intersection_uint32(const uint32_t *A, const size_t lenA,
+                           const uint32_t *B, const size_t lenB,
+                           uint32_t *out) {
+    const uint32_t *initout = out;
+    if (lenA == 0 || lenB == 0) return 0;
+    const uint32_t *endA = A + lenA;
+    const uint32_t *endB = B + lenB;
+
+    while (1) {
+        while (*A < *B) {
+        SKIP_FIRST_COMPARE:
+            if (++A == endA) return (out - initout);
+        }
+        while (*A > *B) {
+            if (++B == endB) return (out - initout);
+        }
+        if (*A == *B) {
+            *out++ = *A;
+            if (++A == endA || ++B == endB) return (out - initout);
+        } else {
+            goto SKIP_FIRST_COMPARE;
+        }
+    }
+    // return (out - initout);  // NOTREACHED
+}
+
+size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
+                                const uint32_t *B, const size_t lenB) {
+    if (lenA == 0 || lenB == 0) return 0;
+    size_t card = 0;
+    const uint32_t *endA = A + lenA;
+    const uint32_t *endB = B + lenB;
+
+    while (1) {
+        while (*A < *B) {
+        SKIP_FIRST_COMPARE:
+            if (++A == endA) return card;
+        }
+        while (*A > *B) {
+            if (++B == endB) return card;
+        }
+        if (*A == *B) {
+            card++;
+            if (++A == endA || ++B == endB) return card;
+        } else {
+            goto SKIP_FIRST_COMPARE;
+        }
+    }
+    // return card;  // NOTREACHED
+}
+
+// can one vectorize the computation of the union? (Update: Yes! See
+// union_vector16).
+
+size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer) {
+    size_t pos = 0, idx_1 = 0, idx_2 = 0;
+
+    if (0 == size_2) {
+        memmove(buffer, set_1, size_1 * sizeof(uint16_t));
+        return size_1;
+    }
+    if (0 == size_1) {
+        memmove(buffer, set_2, size_2 * sizeof(uint16_t));
+        return size_2;
+    }
+
+    uint16_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
+
+    while (true) {
+        if (val_1 < val_2) {
+            buffer[pos++] = val_1;
+            ++idx_1;
+            if (idx_1 >= size_1) break;
+            val_1 = set_1[idx_1];
+        } else if (val_2 < val_1) {
+            buffer[pos++] = val_2;
+            ++idx_2;
+            if (idx_2 >= size_2) break;
+            val_2 = set_2[idx_2];
+        } else {
+            buffer[pos++] = val_1;
+            ++idx_1;
+            ++idx_2;
+            if (idx_1 >= size_1 || idx_2 >= size_2) break;
+            val_1 = set_1[idx_1];
+            val_2 = set_2[idx_2];
+        }
+    }
+
+    if (idx_1 < size_1) {
+        const size_t n_elems = size_1 - idx_1;
+        memmove(buffer + pos, set_1 + idx_1, n_elems * sizeof(uint16_t));
+        pos += n_elems;
+    } else if (idx_2 < size_2) {
+        const size_t n_elems = size_2 - idx_2;
+        memmove(buffer + pos, set_2 + idx_2, n_elems * sizeof(uint16_t));
+        pos += n_elems;
+    }
+
+    return pos;
+}
+
+int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
+                      int length2, uint16_t *a_out) {
+    int out_card = 0;
+    int k1 = 0, k2 = 0;
+    if (length1 == 0) return 0;
+    if (length2 == 0) {
+        if (a1 != a_out) memcpy(a_out, a1, sizeof(uint16_t) * length1);
+        return length1;
+    }
+    uint16_t s1 = a1[k1];
+    uint16_t s2 = a2[k2];
+    while (true) {
+        if (s1 < s2) {
+            a_out[out_card++] = s1;
+            ++k1;
+            if (k1 >= length1) {
+                break;
+            }
+            s1 = a1[k1];
+        } else if (s1 == s2) {
+            ++k1;
+            ++k2;
+            if (k1 >= length1) {
+                break;
+            }
+            if (k2 >= length2) {
+                memmove(a_out + out_card, a1 + k1,
+                        sizeof(uint16_t) * (length1 - k1));
+                return out_card + length1 - k1;
+            }
+            s1 = a1[k1];
+            s2 = a2[k2];
+        } else {  // if (val1>val2)
+            ++k2;
+            if (k2 >= length2) {
+                memmove(a_out + out_card, a1 + k1,
+                        sizeof(uint16_t) * (length1 - k1));
+                return out_card + length1 - k1;
+            }
+            s2 = a2[k2];
+        }
+    }
+    return out_card;
+}
+
+int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
+                   const uint16_t *array_2, int32_t card_2, uint16_t *out) {
+    int32_t pos1 = 0, pos2 = 0, pos_out = 0;
+    while (pos1 < card_1 && pos2 < card_2) {
+        const uint16_t v1 = array_1[pos1];
+        const uint16_t v2 = array_2[pos2];
+        if (v1 == v2) {
+            ++pos1;
+            ++pos2;
+            continue;
+        }
+        if (v1 < v2) {
+            out[pos_out++] = v1;
+            ++pos1;
+        } else {
+            out[pos_out++] = v2;
+            ++pos2;
+        }
+    }
+    if (pos1 < card_1) {
+        const size_t n_elems = card_1 - pos1;
+        memcpy(out + pos_out, array_1 + pos1, n_elems * sizeof(uint16_t));
+        pos_out += (int32_t)n_elems;
+    } else if (pos2 < card_2) {
+        const size_t n_elems = card_2 - pos2;
+        memcpy(out + pos_out, array_2 + pos2, n_elems * sizeof(uint16_t));
+        pos_out += (int32_t)n_elems;
+    }
+    return pos_out;
+}
+
+#if CROARING_IS_X64
+
+/***
+ * start of the SIMD 16-bit union code
+ *
+ */
+CROARING_TARGET_AVX2
+
+// Assuming that vInput1 and vInput2 are sorted, produces a sorted output going
+// from vecMin all the way to vecMax
+// developed originally for merge sort using SIMD instructions.
+// Standard merge. See, e.g., Inoue and Taura, SIMD- and Cache-Friendly
+// Algorithm for Sorting an Array of Structures
+static inline void sse_merge(const __m128i *vInput1,
+                             const __m128i *vInput2,              // input 1 & 2
+                             __m128i *vecMin, __m128i *vecMax) {  // output
+    __m128i vecTmp;
+    vecTmp = _mm_min_epu16(*vInput1, *vInput2);
+    *vecMax = _mm_max_epu16(*vInput1, *vInput2);
+    vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
+    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
+    *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
+}
+CROARING_UNTARGET_AVX2
+// used by store_unique, generated by simdunion.py
+static uint8_t uniqshuf[] = {
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF};
+CROARING_TARGET_AVX2
+// write vector new, while omitting repeated values assuming that previously
+// written vector was "old"
+static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
+    __m128i vecTmp = _mm_alignr_epi8(newval, old, 16 - 2);
+    // lots of high latency instructions follow (optimize?)
+    int M = _mm_movemask_epi8(
+        _mm_packs_epi16(_mm_cmpeq_epi16(vecTmp, newval), _mm_setzero_si128()));
+    int numberofnewvalues = 8 - _mm_popcnt_u32(M);
+    __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
+    __m128i val = _mm_shuffle_epi8(newval, key);
+    _mm_storeu_si128((__m128i *)output, val);
+    return numberofnewvalues;
+}
+CROARING_UNTARGET_AVX2
+
+// working in-place, this function overwrites the repeated values
+// could be avoided?
+static inline uint32_t unique(uint16_t *out, uint32_t len) {
+    uint32_t pos = 1;
+    for (uint32_t i = 1; i < len; ++i) {
+        if (out[i] != out[i - 1]) {
+            out[pos++] = out[i];
+        }
+    }
+    return pos;
+}
+
+// use with qsort, could be avoided
+static int uint16_compare(const void *a, const void *b) {
+    return (*(uint16_t *)a - *(uint16_t *)b);
+}
+
+CROARING_TARGET_AVX2
+// a one-pass SSE union algorithm
+// This function may not be safe if array1 == output or array2 == output.
+uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                        const uint16_t *__restrict__ array2, uint32_t length2,
+                        uint16_t *__restrict__ output) {
+    if ((length1 < 8) || (length2 < 8)) {
+        return (uint32_t)union_uint16(array1, length1, array2, length2, output);
+    }
+    __m128i vA, vB, V, vecMin, vecMax;
+    __m128i laststore;
+    uint16_t *initoutput = output;
+    uint32_t len1 = length1 / 8;
+    uint32_t len2 = length2 / 8;
+    uint32_t pos1 = 0;
+    uint32_t pos2 = 0;
+    // we start the machine
+    vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+    pos1++;
+    vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+    pos2++;
+    sse_merge(&vA, &vB, &vecMin, &vecMax);
+    laststore = _mm_set1_epi16(-1);
+    output += store_unique(laststore, vecMin, output);
+    laststore = vecMin;
+    if ((pos1 < len1) && (pos2 < len2)) {
+        uint16_t curA, curB;
+        curA = array1[8 * pos1];
+        curB = array2[8 * pos2];
+        while (true) {
+            if (curA <= curB) {
+                V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+                pos1++;
+                if (pos1 < len1) {
+                    curA = array1[8 * pos1];
+                } else {
+                    break;
+                }
+            } else {
+                V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+                pos2++;
+                if (pos2 < len2) {
+                    curB = array2[8 * pos2];
+                } else {
+                    break;
+                }
+            }
+            sse_merge(&V, &vecMax, &vecMin, &vecMax);
+            output += store_unique(laststore, vecMin, output);
+            laststore = vecMin;
+        }
+        sse_merge(&V, &vecMax, &vecMin, &vecMax);
+        output += store_unique(laststore, vecMin, output);
+        laststore = vecMin;
+    }
+    // we finish the rest off using a scalar algorithm
+    // could be improved?
+    //
+    // copy the small end on a tmp buffer
+    uint32_t len = (uint32_t)(output - initoutput);
+    uint16_t buffer[16];
+    uint32_t leftoversize = store_unique(laststore, vecMax, buffer);
+    if (pos1 == len1) {
+        memcpy(buffer + leftoversize, array1 + 8 * pos1,
+               (length1 - 8 * len1) * sizeof(uint16_t));
+        leftoversize += length1 - 8 * len1;
+        qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+
+        leftoversize = unique(buffer, leftoversize);
+        len += (uint32_t)union_uint16(buffer, leftoversize, array2 + 8 * pos2,
+                                      length2 - 8 * pos2, output);
+    } else {
+        memcpy(buffer + leftoversize, array2 + 8 * pos2,
+               (length2 - 8 * len2) * sizeof(uint16_t));
+        leftoversize += length2 - 8 * len2;
+        qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+        leftoversize = unique(buffer, leftoversize);
+        len += (uint32_t)union_uint16(buffer, leftoversize, array1 + 8 * pos1,
+                                      length1 - 8 * pos1, output);
+    }
+    return len;
+}
+CROARING_UNTARGET_AVX2
+
+/**
+ * End of the SIMD 16-bit union code
+ *
+ */
+
+/**
+ * Start of SIMD 16-bit XOR code
+ */
+
+CROARING_TARGET_AVX2
+// write vector new, while omitting repeated values assuming that previously
+// written vector was "old"
+static inline int store_unique_xor(__m128i old, __m128i newval,
+                                   uint16_t *output) {
+    __m128i vecTmp1 = _mm_alignr_epi8(newval, old, 16 - 4);
+    __m128i vecTmp2 = _mm_alignr_epi8(newval, old, 16 - 2);
+    __m128i equalleft = _mm_cmpeq_epi16(vecTmp2, vecTmp1);
+    __m128i equalright = _mm_cmpeq_epi16(vecTmp2, newval);
+    __m128i equalleftoright = _mm_or_si128(equalleft, equalright);
+    int M = _mm_movemask_epi8(
+        _mm_packs_epi16(equalleftoright, _mm_setzero_si128()));
+    int numberofnewvalues = 8 - _mm_popcnt_u32(M);
+    __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
+    __m128i val = _mm_shuffle_epi8(vecTmp2, key);
+    _mm_storeu_si128((__m128i *)output, val);
+    return numberofnewvalues;
+}
+CROARING_UNTARGET_AVX2
+
+// working in-place, this function overwrites the repeated values
+// could be avoided? Warning: assumes len > 0
+static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
+    uint32_t pos = 1;
+    for (uint32_t i = 1; i < len; ++i) {
+        if (out[i] != out[i - 1]) {
+            out[pos++] = out[i];
+        } else
+            pos--;  // if it is identical to previous, delete it
+    }
+    return pos;
+}
+CROARING_TARGET_AVX2
+// a one-pass SSE xor algorithm
+uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                      const uint16_t *__restrict__ array2, uint32_t length2,
+                      uint16_t *__restrict__ output) {
+    if ((length1 < 8) || (length2 < 8)) {
+        return xor_uint16(array1, length1, array2, length2, output);
+    }
+    __m128i vA, vB, V, vecMin, vecMax;
+    __m128i laststore;
+    uint16_t *initoutput = output;
+    uint32_t len1 = length1 / 8;
+    uint32_t len2 = length2 / 8;
+    uint32_t pos1 = 0;
+    uint32_t pos2 = 0;
+    // we start the machine
+    vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+    pos1++;
+    vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+    pos2++;
+    sse_merge(&vA, &vB, &vecMin, &vecMax);
+    laststore = _mm_set1_epi16(-1);
+    uint16_t buffer[17];
+    output += store_unique_xor(laststore, vecMin, output);
+
+    laststore = vecMin;
+    if ((pos1 < len1) && (pos2 < len2)) {
+        uint16_t curA, curB;
+        curA = array1[8 * pos1];
+        curB = array2[8 * pos2];
+        while (true) {
+            if (curA <= curB) {
+                V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+                pos1++;
+                if (pos1 < len1) {
+                    curA = array1[8 * pos1];
+                } else {
+                    break;
+                }
+            } else {
+                V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+                pos2++;
+                if (pos2 < len2) {
+                    curB = array2[8 * pos2];
+                } else {
+                    break;
+                }
+            }
+            sse_merge(&V, &vecMax, &vecMin, &vecMax);
+            // conditionally stores the last value of laststore as well as all
+            // but the
+            // last value of vecMin
+            output += store_unique_xor(laststore, vecMin, output);
+            laststore = vecMin;
+        }
+        sse_merge(&V, &vecMax, &vecMin, &vecMax);
+        // conditionally stores the last value of laststore as well as all but
+        // the
+        // last value of vecMin
+        output += store_unique_xor(laststore, vecMin, output);
+        laststore = vecMin;
+    }
+    uint32_t len = (uint32_t)(output - initoutput);
+
+    // we finish the rest off using a scalar algorithm
+    // could be improved?
+    // conditionally stores the last value of laststore as well as all but the
+    // last value of vecMax,
+    // we store to "buffer"
+    int leftoversize = store_unique_xor(laststore, vecMax, buffer);
+    uint16_t vec7 = (uint16_t)_mm_extract_epi16(vecMax, 7);
+    uint16_t vec6 = (uint16_t)_mm_extract_epi16(vecMax, 6);
+    if (vec7 != vec6) buffer[leftoversize++] = vec7;
+    if (pos1 == len1) {
+        memcpy(buffer + leftoversize, array1 + 8 * pos1,
+               (length1 - 8 * len1) * sizeof(uint16_t));
+        leftoversize += length1 - 8 * len1;
+        if (leftoversize == 0) {  // trivial case
+            memcpy(output, array2 + 8 * pos2,
+                   (length2 - 8 * pos2) * sizeof(uint16_t));
+            len += (length2 - 8 * pos2);
+        } else {
+            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+            leftoversize = unique_xor(buffer, leftoversize);
+            len += xor_uint16(buffer, leftoversize, array2 + 8 * pos2,
+                              length2 - 8 * pos2, output);
+        }
+    } else {
+        memcpy(buffer + leftoversize, array2 + 8 * pos2,
+               (length2 - 8 * len2) * sizeof(uint16_t));
+        leftoversize += length2 - 8 * len2;
+        if (leftoversize == 0) {  // trivial case
+            memcpy(output, array1 + 8 * pos1,
+                   (length1 - 8 * pos1) * sizeof(uint16_t));
+            len += (length1 - 8 * pos1);
+        } else {
+            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+            leftoversize = unique_xor(buffer, leftoversize);
+            len += xor_uint16(buffer, leftoversize, array1 + 8 * pos1,
+                              length1 - 8 * pos1, output);
+        }
+    }
+    return len;
+}
+CROARING_UNTARGET_AVX2
+/**
+ * End of SIMD 16-bit XOR code
+ */
+
+#endif  // CROARING_IS_X64
+
+size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
+                    size_t size_2, uint32_t *buffer) {
+    size_t pos = 0, idx_1 = 0, idx_2 = 0;
+
+    if (0 == size_2) {
+        memmove(buffer, set_1, size_1 * sizeof(uint32_t));
+        return size_1;
+    }
+    if (0 == size_1) {
+        memmove(buffer, set_2, size_2 * sizeof(uint32_t));
+        return size_2;
+    }
+
+    uint32_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
+
+    while (true) {
+        if (val_1 < val_2) {
+            buffer[pos++] = val_1;
+            ++idx_1;
+            if (idx_1 >= size_1) break;
+            val_1 = set_1[idx_1];
+        } else if (val_2 < val_1) {
+            buffer[pos++] = val_2;
+            ++idx_2;
+            if (idx_2 >= size_2) break;
+            val_2 = set_2[idx_2];
+        } else {
+            buffer[pos++] = val_1;
+            ++idx_1;
+            ++idx_2;
+            if (idx_1 >= size_1 || idx_2 >= size_2) break;
+            val_1 = set_1[idx_1];
+            val_2 = set_2[idx_2];
+        }
+    }
+
+    if (idx_1 < size_1) {
+        const size_t n_elems = size_1 - idx_1;
+        memmove(buffer + pos, set_1 + idx_1, n_elems * sizeof(uint32_t));
+        pos += n_elems;
+    } else if (idx_2 < size_2) {
+        const size_t n_elems = size_2 - idx_2;
+        memmove(buffer + pos, set_2 + idx_2, n_elems * sizeof(uint32_t));
+        pos += n_elems;
+    }
+
+    return pos;
+}
+
+size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
+                         const uint32_t *set_2, size_t size_2) {
+    size_t pos = 0, idx_1 = 0, idx_2 = 0;
+
+    if (0 == size_2) {
+        return size_1;
+    }
+    if (0 == size_1) {
+        return size_2;
+    }
+
+    uint32_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
+
+    while (true) {
+        if (val_1 < val_2) {
+            ++idx_1;
+            ++pos;
+            if (idx_1 >= size_1) break;
+            val_1 = set_1[idx_1];
+        } else if (val_2 < val_1) {
+            ++idx_2;
+            ++pos;
+            if (idx_2 >= size_2) break;
+            val_2 = set_2[idx_2];
+        } else {
+            ++idx_1;
+            ++idx_2;
+            ++pos;
+            if (idx_1 >= size_1 || idx_2 >= size_2) break;
+            val_1 = set_1[idx_1];
+            val_2 = set_2[idx_2];
+        }
+    }
+
+    if (idx_1 < size_1) {
+        const size_t n_elems = size_1 - idx_1;
+        pos += n_elems;
+    } else if (idx_2 < size_2) {
+        const size_t n_elems = size_2 - idx_2;
+        pos += n_elems;
+    }
+    return pos;
+}
+
+size_t fast_union_uint16(const uint16_t *set_1, size_t size_1,
+                         const uint16_t *set_2, size_t size_2,
+                         uint16_t *buffer) {
+#if CROARING_IS_X64
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        // compute union with smallest array first
+        if (size_1 < size_2) {
+            return union_vector16(set_1, (uint32_t)size_1, set_2,
+                                  (uint32_t)size_2, buffer);
+        } else {
+            return union_vector16(set_2, (uint32_t)size_2, set_1,
+                                  (uint32_t)size_1, buffer);
+        }
+    } else {
+        // compute union with smallest array first
+        if (size_1 < size_2) {
+            return union_uint16(set_1, size_1, set_2, size_2, buffer);
+        } else {
+            return union_uint16(set_2, size_2, set_1, size_1, buffer);
+        }
+    }
+#else
+    // compute union with smallest array first
+    if (size_1 < size_2) {
+        return union_uint16(set_1, size_1, set_2, size_2, buffer);
+    } else {
+        return union_uint16(set_2, size_2, set_1, size_1, buffer);
+    }
+#endif
+}
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+static inline bool _avx512_memequals(const void *s1, const void *s2, size_t n) {
+    const uint8_t *ptr1 = (const uint8_t *)s1;
+    const uint8_t *ptr2 = (const uint8_t *)s2;
+    const uint8_t *end1 = ptr1 + n;
+    const uint8_t *end8 = ptr1 + ((n >> 3) << 3);
+    const uint8_t *end32 = ptr1 + ((n >> 5) << 5);
+    const uint8_t *end64 = ptr1 + ((n >> 6) << 6);
+
+    while (ptr1 < end64) {
+        __m512i r1 = _mm512_loadu_si512((const __m512i *)ptr1);
+        __m512i r2 = _mm512_loadu_si512((const __m512i *)ptr2);
+
+        uint64_t mask = _mm512_cmpeq_epi8_mask(r1, r2);
+
+        if (mask != UINT64_MAX) {
+            return false;
+        }
+
+        ptr1 += 64;
+        ptr2 += 64;
+    }
+
+    while (ptr1 < end32) {
+        __m256i r1 = _mm256_loadu_si256((const __m256i *)ptr1);
+        __m256i r2 = _mm256_loadu_si256((const __m256i *)ptr2);
+        int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
+        if ((uint32_t)mask != UINT32_MAX) {
+            return false;
+        }
+        ptr1 += 32;
+        ptr2 += 32;
+    }
+
+    while (ptr1 < end8) {
+        uint64_t v1, v2;
+        memcpy(&v1, ptr1, sizeof(uint64_t));
+        memcpy(&v2, ptr2, sizeof(uint64_t));
+        if (v1 != v2) {
+            return false;
+        }
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+
+    while (ptr1 < end1) {
+        if (*ptr1 != *ptr2) {
+            return false;
+        }
+        ptr1++;
+        ptr2++;
+    }
+
+    return true;
+}
+CROARING_UNTARGET_AVX512
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+
+CROARING_TARGET_AVX2
+static inline bool _avx2_memequals(const void *s1, const void *s2, size_t n) {
+    const uint8_t *ptr1 = (const uint8_t *)s1;
+    const uint8_t *ptr2 = (const uint8_t *)s2;
+    const uint8_t *end1 = ptr1 + n;
+    const uint8_t *end8 = ptr1 + n / 8 * 8;
+    const uint8_t *end32 = ptr1 + n / 32 * 32;
+
+    while (ptr1 < end32) {
+        __m256i r1 = _mm256_loadu_si256((const __m256i *)ptr1);
+        __m256i r2 = _mm256_loadu_si256((const __m256i *)ptr2);
+        int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
+        if ((uint32_t)mask != UINT32_MAX) {
+            return false;
+        }
+        ptr1 += 32;
+        ptr2 += 32;
+    }
+
+    while (ptr1 < end8) {
+        uint64_t v1, v2;
+        memcpy(&v1, ptr1, sizeof(uint64_t));
+        memcpy(&v2, ptr2, sizeof(uint64_t));
+        if (v1 != v2) {
+            return false;
+        }
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+
+    while (ptr1 < end1) {
+        if (*ptr1 != *ptr2) {
+            return false;
+        }
+        ptr1++;
+        ptr2++;
+    }
+
+    return true;
+}
+CROARING_UNTARGET_AVX2
+#endif
+
+bool memequals(const void *s1, const void *s2, size_t n) {
+    if (n == 0) {
+        return true;
+    }
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (support & ROARING_SUPPORTS_AVX512) {
+        return _avx512_memequals(s1, s2, n);
+    } else
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+        if (support & ROARING_SUPPORTS_AVX2) {
+            return _avx2_memequals(s1, s2, n);
+        } else {
+            return memcmp(s1, s2, n) == 0;
+        }
+#else
+    return memcmp(s1, s2, n) == 0;
+#endif
+}
+
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+int avx512_array_container_to_uint32_array(void *vout, const uint16_t *array,
+                                           size_t cardinality, uint32_t base) {
+    int outpos = 0;
+    uint32_t *out = (uint32_t *)vout;
+    size_t i = 0;
+    for (; i + sizeof(__m256i) / sizeof(uint16_t) <= cardinality;
+         i += sizeof(__m256i) / sizeof(uint16_t)) {
+        __m256i vinput = _mm256_loadu_si256((const __m256i *)(array + i));
+        __m512i voutput = _mm512_add_epi32(_mm512_cvtepu16_epi32(vinput),
+                                           _mm512_set1_epi32(base));
+        _mm512_storeu_si512((__m512i *)(out + outpos), voutput);
+        outpos += sizeof(__m512i) / sizeof(uint32_t);
+    }
+    for (; i < cardinality; ++i) {
+        const uint32_t val = base + array[i];
+        memcpy(out + outpos, &val,
+               sizeof(uint32_t));  // should be compiled as a MOV on x64
+        outpos++;
+    }
+    return outpos;
+}
+CROARING_UNTARGET_AVX512
+#endif  // #if CROARING_COMPILER_SUPPORTS_AVX512
+#endif  // #if CROARING_IS_X64
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif/* end file src/array_util.c */
+/* begin file src/art/art.c */
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#define ART_NODE4_TYPE 0
+#define ART_NODE16_TYPE 1
+#define ART_NODE48_TYPE 2
+#define ART_NODE256_TYPE 3
+#define ART_NUM_TYPES 4
+
+// Node48 placeholder value to indicate no child is present at this key index.
+#define ART_NODE48_EMPTY_VAL 48
+
+// We use the least significant bit of node pointers to indicate whether a node
+// is a leaf or an inner node. This is never surfaced to the user.
+//
+// Using pointer tagging to indicate leaves not only saves a bit of memory by
+// sparing the typecode, but also allows us to use an intrusive leaf struct.
+// Using an intrusive leaf struct leaves leaf allocation up to the user. Upon
+// deallocation of the ART, we know not to free the leaves without having to
+// dereference the leaf pointers.
+//
+// All internal operations on leaves should use CAST_LEAF before using the leaf.
+// The only places that use SET_LEAF are locations where a field is directly
+// assigned to a leaf pointer. After using SET_LEAF, the leaf should be treated
+// as a node of unknown type.
+#define IS_LEAF(p) (((uintptr_t)(p) & 1))
+#define SET_LEAF(p) ((art_node_t *)((uintptr_t)(p) | 1))
+#define CAST_LEAF(p) ((art_leaf_t *)((void *)((uintptr_t)(p) & ~1)))
+
+#define NODE48_AVAILABLE_CHILDREN_MASK ((UINT64_C(1) << 48) - 1)
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+typedef uint8_t art_typecode_t;
+
+// Aliasing with a "leaf" naming so that its purpose is clearer in the context
+// of the trie internals.
+typedef art_val_t art_leaf_t;
+
+typedef struct art_internal_validate_s {
+    const char **reason;
+    art_validate_cb_t validate_cb;
+
+    int depth;
+    art_key_chunk_t current_key[ART_KEY_BYTES];
+} art_internal_validate_t;
+
+// Set the reason message, and return false for convenience.
+static inline bool art_validate_fail(const art_internal_validate_t *validate,
+                                     const char *msg) {
+    *validate->reason = msg;
+    return false;
+}
+
+// Inner node, with prefix.
+//
+// We use a fixed-length array as a pointer would be larger than the array.
+typedef struct art_inner_node_s {
+    art_typecode_t typecode;
+    uint8_t prefix_size;
+    uint8_t prefix[ART_KEY_BYTES - 1];
+} art_inner_node_t;
+
+// Inner node types.
+
+// Node4: key[i] corresponds with children[i]. Keys are sorted.
+typedef struct art_node4_s {
+    art_inner_node_t base;
+    uint8_t count;
+    uint8_t keys[4];
+    art_node_t *children[4];
+} art_node4_t;
+
+// Node16: key[i] corresponds with children[i]. Keys are sorted.
+typedef struct art_node16_s {
+    art_inner_node_t base;
+    uint8_t count;
+    uint8_t keys[16];
+    art_node_t *children[16];
+} art_node16_t;
+
+// Node48: key[i] corresponds with children[key[i]] if key[i] !=
+// ART_NODE48_EMPTY_VAL. Keys are naturally sorted due to direct indexing.
+typedef struct art_node48_s {
+    art_inner_node_t base;
+    uint8_t count;
+    // Bitset where the ith bit is set if children[i] is available
+    // Because there are at most 48 children, only the bottom 48 bits are used.
+    uint64_t available_children;
+    uint8_t keys[256];
+    art_node_t *children[48];
+} art_node48_t;
+
+// Node256: children[i] is directly indexed by key chunk. A child is present if
+// children[i] != NULL.
+typedef struct art_node256_s {
+    art_inner_node_t base;
+    uint16_t count;
+    art_node_t *children[256];
+} art_node256_t;
+
+// Helper struct to refer to a child within a node at a specific index.
+typedef struct art_indexed_child_s {
+    art_node_t *child;
+    uint8_t index;
+    art_key_chunk_t key_chunk;
+} art_indexed_child_t;
+
+static inline bool art_is_leaf(const art_node_t *node) { return IS_LEAF(node); }
+
+static void art_leaf_populate(art_leaf_t *leaf, const art_key_chunk_t key[]) {
+    memcpy(leaf->key, key, ART_KEY_BYTES);
+}
+
+static inline uint8_t art_get_type(const art_inner_node_t *node) {
+    return node->typecode;
+}
+
+static inline void art_init_inner_node(art_inner_node_t *node,
+                                       art_typecode_t typecode,
+                                       const art_key_chunk_t prefix[],
+                                       uint8_t prefix_size) {
+    node->typecode = typecode;
+    node->prefix_size = prefix_size;
+    memcpy(node->prefix, prefix, prefix_size * sizeof(art_key_chunk_t));
+}
+
+static void art_free_node(art_node_t *node);
+
+// ===================== Start of node-specific functions ======================
+
+static art_node4_t *art_node4_create(const art_key_chunk_t prefix[],
+                                     uint8_t prefix_size);
+static art_node16_t *art_node16_create(const art_key_chunk_t prefix[],
+                                       uint8_t prefix_size);
+static art_node48_t *art_node48_create(const art_key_chunk_t prefix[],
+                                       uint8_t prefix_size);
+static art_node256_t *art_node256_create(const art_key_chunk_t prefix[],
+                                         uint8_t prefix_size);
+
+static art_node_t *art_node4_insert(art_node4_t *node, art_node_t *child,
+                                    uint8_t key);
+static art_node_t *art_node16_insert(art_node16_t *node, art_node_t *child,
+                                     uint8_t key);
+static art_node_t *art_node48_insert(art_node48_t *node, art_node_t *child,
+                                     uint8_t key);
+static art_node_t *art_node256_insert(art_node256_t *node, art_node_t *child,
+                                      uint8_t key);
+
+static art_node4_t *art_node4_create(const art_key_chunk_t prefix[],
+                                     uint8_t prefix_size) {
+    art_node4_t *node = (art_node4_t *)roaring_malloc(sizeof(art_node4_t));
+    art_init_inner_node(&node->base, ART_NODE4_TYPE, prefix, prefix_size);
+    node->count = 0;
+    return node;
+}
+
+static void art_free_node4(art_node4_t *node) {
+    for (size_t i = 0; i < node->count; ++i) {
+        art_free_node(node->children[i]);
+    }
+    roaring_free(node);
+}
+
+static inline art_node_t *art_node4_find_child(const art_node4_t *node,
+                                               art_key_chunk_t key) {
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key) {
+            return node->children[i];
+        }
+    }
+    return NULL;
+}
+
+static art_node_t *art_node4_insert(art_node4_t *node, art_node_t *child,
+                                    uint8_t key) {
+    if (node->count < 4) {
+        size_t idx = 0;
+        for (; idx < node->count; ++idx) {
+            if (node->keys[idx] > key) {
+                break;
+            }
+        }
+        size_t after = node->count - idx;
+        // Shift other keys to maintain sorted order.
+        memmove(node->keys + idx + 1, node->keys + idx,
+                after * sizeof(art_key_chunk_t));
+        memmove(node->children + idx + 1, node->children + idx,
+                after * sizeof(art_node_t *));
+
+        node->children[idx] = child;
+        node->keys[idx] = key;
+        node->count++;
+        return (art_node_t *)node;
+    }
+    art_node16_t *new_node =
+        art_node16_create(node->base.prefix, node->base.prefix_size);
+    // Instead of calling insert, this could be specialized to 2x memcpy and
+    // setting the count.
+    for (size_t i = 0; i < 4; ++i) {
+        art_node16_insert(new_node, node->children[i], node->keys[i]);
+    }
+    roaring_free(node);
+    return art_node16_insert(new_node, child, key);
+}
+
+static inline art_node_t *art_node4_erase(art_node4_t *node,
+                                          art_key_chunk_t key_chunk) {
+    int idx = -1;
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key_chunk) {
+            idx = i;
+        }
+    }
+    if (idx == -1) {
+        return (art_node_t *)node;
+    }
+    if (node->count == 2) {
+        // Only one child remains after erasing, so compress the path by
+        // removing this node.
+        uint8_t other_idx = idx ^ 1;
+        art_node_t *remaining_child = node->children[other_idx];
+        art_key_chunk_t remaining_child_key = node->keys[other_idx];
+        if (!art_is_leaf(remaining_child)) {
+            // Correct the prefix of the child node.
+            art_inner_node_t *inner_node = (art_inner_node_t *)remaining_child;
+            memmove(inner_node->prefix + node->base.prefix_size + 1,
+                    inner_node->prefix, inner_node->prefix_size);
+            memcpy(inner_node->prefix, node->base.prefix,
+                   node->base.prefix_size);
+            inner_node->prefix[node->base.prefix_size] = remaining_child_key;
+            inner_node->prefix_size += node->base.prefix_size + 1;
+        }
+        roaring_free(node);
+        return remaining_child;
+    }
+    // Shift other keys to maintain sorted order.
+    size_t after_next = node->count - idx - 1;
+    memmove(node->keys + idx, node->keys + idx + 1,
+            after_next * sizeof(art_key_chunk_t));
+    memmove(node->children + idx, node->children + idx + 1,
+            after_next * sizeof(art_node_t *));
+    node->count--;
+    return (art_node_t *)node;
+}
+
+static inline void art_node4_replace(art_node4_t *node,
+                                     art_key_chunk_t key_chunk,
+                                     art_node_t *new_child) {
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key_chunk) {
+            node->children[i] = new_child;
+            return;
+        }
+    }
+}
+
+static inline art_indexed_child_t art_node4_next_child(const art_node4_t *node,
+                                                       int index) {
+    art_indexed_child_t indexed_child;
+    index++;
+    if (index >= node->count) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node4_prev_child(const art_node4_t *node,
+                                                       int index) {
+    if (index > node->count) {
+        index = node->count;
+    }
+    index--;
+    art_indexed_child_t indexed_child;
+    if (index < 0) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node4_child_at(const art_node4_t *node,
+                                                     int index) {
+    art_indexed_child_t indexed_child;
+    if (index < 0 || index >= node->count) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node4_lower_bound(
+    art_node4_t *node, art_key_chunk_t key_chunk) {
+    art_indexed_child_t indexed_child;
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] >= key_chunk) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[i];
+            indexed_child.key_chunk = node->keys[i];
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static bool art_internal_validate_at(const art_node_t *node,
+                                     art_internal_validate_t validator);
+
+static bool art_node4_internal_validate(const art_node4_t *node,
+                                        art_internal_validate_t validator) {
+    if (node->count == 0) {
+        return art_validate_fail(&validator, "Node4 has no children");
+    }
+    if (node->count > 4) {
+        return art_validate_fail(&validator, "Node4 has too many children");
+    }
+    if (node->count == 1) {
+        return art_validate_fail(
+            &validator, "Node4 and child node should have been combined");
+    }
+    validator.depth++;
+    for (int i = 0; i < node->count; ++i) {
+        if (i > 0) {
+            if (node->keys[i - 1] >= node->keys[i]) {
+                return art_validate_fail(
+                    &validator, "Node4 keys are not strictly increasing");
+            }
+        }
+        for (int j = i + 1; j < node->count; ++j) {
+            if (node->children[i] == node->children[j]) {
+                return art_validate_fail(&validator,
+                                         "Node4 has duplicate children");
+            }
+        }
+        validator.current_key[validator.depth - 1] = node->keys[i];
+        if (!art_internal_validate_at(node->children[i], validator)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static art_node16_t *art_node16_create(const art_key_chunk_t prefix[],
+                                       uint8_t prefix_size) {
+    art_node16_t *node = (art_node16_t *)roaring_malloc(sizeof(art_node16_t));
+    art_init_inner_node(&node->base, ART_NODE16_TYPE, prefix, prefix_size);
+    node->count = 0;
+    return node;
+}
+
+static void art_free_node16(art_node16_t *node) {
+    for (size_t i = 0; i < node->count; ++i) {
+        art_free_node(node->children[i]);
+    }
+    roaring_free(node);
+}
+
+static inline art_node_t *art_node16_find_child(const art_node16_t *node,
+                                                art_key_chunk_t key) {
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key) {
+            return node->children[i];
+        }
+    }
+    return NULL;
+}
+
+static art_node_t *art_node16_insert(art_node16_t *node, art_node_t *child,
+                                     uint8_t key) {
+    if (node->count < 16) {
+        size_t idx = 0;
+        for (; idx < node->count; ++idx) {
+            if (node->keys[idx] > key) {
+                break;
+            }
+        }
+        size_t after = node->count - idx;
+        // Shift other keys to maintain sorted order.
+        memmove(node->keys + idx + 1, node->keys + idx,
+                after * sizeof(art_key_chunk_t));
+        memmove(node->children + idx + 1, node->children + idx,
+                after * sizeof(art_node_t *));
+
+        node->children[idx] = child;
+        node->keys[idx] = key;
+        node->count++;
+        return (art_node_t *)node;
+    }
+    art_node48_t *new_node =
+        art_node48_create(node->base.prefix, node->base.prefix_size);
+    for (size_t i = 0; i < 16; ++i) {
+        art_node48_insert(new_node, node->children[i], node->keys[i]);
+    }
+    roaring_free(node);
+    return art_node48_insert(new_node, child, key);
+}
+
+static inline art_node_t *art_node16_erase(art_node16_t *node,
+                                           uint8_t key_chunk) {
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key_chunk) {
+            // Shift other keys to maintain sorted order.
+            size_t after_next = node->count - i - 1;
+            memmove(node->keys + i, node->keys + i + 1,
+                    after_next * sizeof(key_chunk));
+            memmove(node->children + i, node->children + i + 1,
+                    after_next * sizeof(art_node_t *));
+            node->count--;
+            break;
+        }
+    }
+    if (node->count > 4) {
+        return (art_node_t *)node;
+    }
+    art_node4_t *new_node =
+        art_node4_create(node->base.prefix, node->base.prefix_size);
+    // Instead of calling insert, this could be specialized to 2x memcpy and
+    // setting the count.
+    for (size_t i = 0; i < 4; ++i) {
+        art_node4_insert(new_node, node->children[i], node->keys[i]);
+    }
+    roaring_free(node);
+    return (art_node_t *)new_node;
+}
+
+static inline void art_node16_replace(art_node16_t *node,
+                                      art_key_chunk_t key_chunk,
+                                      art_node_t *new_child) {
+    for (uint8_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] == key_chunk) {
+            node->children[i] = new_child;
+            return;
+        }
+    }
+}
+
+static inline art_indexed_child_t art_node16_next_child(
+    const art_node16_t *node, int index) {
+    art_indexed_child_t indexed_child;
+    index++;
+    if (index >= node->count) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node16_prev_child(
+    const art_node16_t *node, int index) {
+    if (index > node->count) {
+        index = node->count;
+    }
+    index--;
+    art_indexed_child_t indexed_child;
+    if (index < 0) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node16_child_at(const art_node16_t *node,
+                                                      int index) {
+    art_indexed_child_t indexed_child;
+    if (index < 0 || index >= node->count) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = node->keys[index];
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node16_lower_bound(
+    art_node16_t *node, art_key_chunk_t key_chunk) {
+    art_indexed_child_t indexed_child;
+    for (size_t i = 0; i < node->count; ++i) {
+        if (node->keys[i] >= key_chunk) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[i];
+            indexed_child.key_chunk = node->keys[i];
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static bool art_node16_internal_validate(const art_node16_t *node,
+                                         art_internal_validate_t validator) {
+    if (node->count <= 4) {
+        return art_validate_fail(&validator, "Node16 has too few children");
+    }
+    if (node->count > 16) {
+        return art_validate_fail(&validator, "Node16 has too many children");
+    }
+    validator.depth++;
+    for (int i = 0; i < node->count; ++i) {
+        if (i > 0) {
+            if (node->keys[i - 1] >= node->keys[i]) {
+                return art_validate_fail(
+                    &validator, "Node16 keys are not strictly increasing");
+            }
+        }
+        for (int j = i + 1; j < node->count; ++j) {
+            if (node->children[i] == node->children[j]) {
+                return art_validate_fail(&validator,
+                                         "Node16 has duplicate children");
+            }
+        }
+        validator.current_key[validator.depth - 1] = node->keys[i];
+        if (!art_internal_validate_at(node->children[i], validator)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static art_node48_t *art_node48_create(const art_key_chunk_t prefix[],
+                                       uint8_t prefix_size) {
+    art_node48_t *node = (art_node48_t *)roaring_malloc(sizeof(art_node48_t));
+    art_init_inner_node(&node->base, ART_NODE48_TYPE, prefix, prefix_size);
+    node->count = 0;
+    node->available_children = NODE48_AVAILABLE_CHILDREN_MASK;
+    for (size_t i = 0; i < 256; ++i) {
+        node->keys[i] = ART_NODE48_EMPTY_VAL;
+    }
+    return node;
+}
+
+static void art_free_node48(art_node48_t *node) {
+    uint64_t used_children =
+        (node->available_children) ^ NODE48_AVAILABLE_CHILDREN_MASK;
+    while (used_children != 0) {
+        // We checked above that used_children is not zero
+        uint8_t child_idx = roaring_trailing_zeroes(used_children);
+        art_free_node(node->children[child_idx]);
+        used_children &= ~(UINT64_C(1) << child_idx);
+    }
+    roaring_free(node);
+}
+
+static inline art_node_t *art_node48_find_child(const art_node48_t *node,
+                                                art_key_chunk_t key) {
+    uint8_t val_idx = node->keys[key];
+    if (val_idx != ART_NODE48_EMPTY_VAL) {
+        return node->children[val_idx];
+    }
+    return NULL;
+}
+
+static art_node_t *art_node48_insert(art_node48_t *node, art_node_t *child,
+                                     uint8_t key) {
+    if (node->count < 48) {
+        // node->available_children is only zero when the node is full (count ==
+        // 48), we just checked count < 48
+        uint8_t val_idx = roaring_trailing_zeroes(node->available_children);
+        node->keys[key] = val_idx;
+        node->children[val_idx] = child;
+        node->count++;
+        node->available_children &= ~(UINT64_C(1) << val_idx);
+        return (art_node_t *)node;
+    }
+    art_node256_t *new_node =
+        art_node256_create(node->base.prefix, node->base.prefix_size);
+    for (size_t i = 0; i < 256; ++i) {
+        uint8_t val_idx = node->keys[i];
+        if (val_idx != ART_NODE48_EMPTY_VAL) {
+            art_node256_insert(new_node, node->children[val_idx], i);
+        }
+    }
+    roaring_free(node);
+    return art_node256_insert(new_node, child, key);
+}
+
+static inline art_node_t *art_node48_erase(art_node48_t *node,
+                                           uint8_t key_chunk) {
+    uint8_t val_idx = node->keys[key_chunk];
+    if (val_idx == ART_NODE48_EMPTY_VAL) {
+        return (art_node_t *)node;
+    }
+    node->keys[key_chunk] = ART_NODE48_EMPTY_VAL;
+    node->available_children |= UINT64_C(1) << val_idx;
+    node->count--;
+    if (node->count > 16) {
+        return (art_node_t *)node;
+    }
+
+    art_node16_t *new_node =
+        art_node16_create(node->base.prefix, node->base.prefix_size);
+    for (size_t i = 0; i < 256; ++i) {
+        val_idx = node->keys[i];
+        if (val_idx != ART_NODE48_EMPTY_VAL) {
+            art_node16_insert(new_node, node->children[val_idx], i);
+        }
+    }
+    roaring_free(node);
+    return (art_node_t *)new_node;
+}
+
+static inline void art_node48_replace(art_node48_t *node,
+                                      art_key_chunk_t key_chunk,
+                                      art_node_t *new_child) {
+    uint8_t val_idx = node->keys[key_chunk];
+    assert(val_idx != ART_NODE48_EMPTY_VAL);
+    node->children[val_idx] = new_child;
+}
+
+static inline art_indexed_child_t art_node48_next_child(
+    const art_node48_t *node, int index) {
+    art_indexed_child_t indexed_child;
+    index++;
+    for (size_t i = index; i < 256; ++i) {
+        if (node->keys[i] != ART_NODE48_EMPTY_VAL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[node->keys[i]];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node48_prev_child(
+    const art_node48_t *node, int index) {
+    if (index > 256) {
+        index = 256;
+    }
+    index--;
+    art_indexed_child_t indexed_child;
+    for (int i = index; i >= 0; --i) {
+        if (node->keys[i] != ART_NODE48_EMPTY_VAL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[node->keys[i]];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node48_child_at(const art_node48_t *node,
+                                                      int index) {
+    art_indexed_child_t indexed_child;
+    if (index < 0 || index >= 256) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[node->keys[index]];
+    indexed_child.key_chunk = index;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node48_lower_bound(
+    art_node48_t *node, art_key_chunk_t key_chunk) {
+    art_indexed_child_t indexed_child;
+    for (size_t i = key_chunk; i < 256; ++i) {
+        if (node->keys[i] != ART_NODE48_EMPTY_VAL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[node->keys[i]];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static bool art_node48_internal_validate(const art_node48_t *node,
+                                         art_internal_validate_t validator) {
+    if (node->count <= 16) {
+        return art_validate_fail(&validator, "Node48 has too few children");
+    }
+    if (node->count > 48) {
+        return art_validate_fail(&validator, "Node48 has too many children");
+    }
+    uint64_t used_children = 0;
+    for (int i = 0; i < 256; ++i) {
+        uint8_t child_idx = node->keys[i];
+        if (child_idx != ART_NODE48_EMPTY_VAL) {
+            if (used_children & (UINT64_C(1) << child_idx)) {
+                return art_validate_fail(
+                    &validator, "Node48 keys point to the same child index");
+            }
+
+            art_node_t *child = node->children[child_idx];
+            if (child == NULL) {
+                return art_validate_fail(&validator, "Node48 has a NULL child");
+            }
+            used_children |= UINT64_C(1) << child_idx;
+        }
+    }
+    uint64_t expected_used_children =
+        (node->available_children) ^ NODE48_AVAILABLE_CHILDREN_MASK;
+    if (used_children != expected_used_children) {
+        return art_validate_fail(
+            &validator,
+            "Node48 available_children does not match actual children");
+    }
+    while (used_children != 0) {
+        uint8_t child_idx = roaring_trailing_zeroes(used_children);
+        used_children &= used_children - 1;
+
+        uint64_t other_children = used_children;
+        while (other_children != 0) {
+            uint8_t other_child_idx = roaring_trailing_zeroes(other_children);
+            if (node->children[child_idx] == node->children[other_child_idx]) {
+                return art_validate_fail(&validator,
+                                         "Node48 has duplicate children");
+            }
+            other_children &= other_children - 1;
+        }
+    }
+
+    validator.depth++;
+    for (int i = 0; i < 256; ++i) {
+        if (node->keys[i] != ART_NODE48_EMPTY_VAL) {
+            validator.current_key[validator.depth - 1] = i;
+            if (!art_internal_validate_at(node->children[node->keys[i]],
+                                          validator)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static art_node256_t *art_node256_create(const art_key_chunk_t prefix[],
+                                         uint8_t prefix_size) {
+    art_node256_t *node =
+        (art_node256_t *)roaring_malloc(sizeof(art_node256_t));
+    art_init_inner_node(&node->base, ART_NODE256_TYPE, prefix, prefix_size);
+    node->count = 0;
+    for (size_t i = 0; i < 256; ++i) {
+        node->children[i] = NULL;
+    }
+    return node;
+}
+
+static void art_free_node256(art_node256_t *node) {
+    for (size_t i = 0; i < 256; ++i) {
+        if (node->children[i] != NULL) {
+            art_free_node(node->children[i]);
+        }
+    }
+    roaring_free(node);
+}
+
+static inline art_node_t *art_node256_find_child(const art_node256_t *node,
+                                                 art_key_chunk_t key) {
+    return node->children[key];
+}
+
+static art_node_t *art_node256_insert(art_node256_t *node, art_node_t *child,
+                                      uint8_t key) {
+    node->children[key] = child;
+    node->count++;
+    return (art_node_t *)node;
+}
+
+static inline art_node_t *art_node256_erase(art_node256_t *node,
+                                            uint8_t key_chunk) {
+    node->children[key_chunk] = NULL;
+    node->count--;
+    if (node->count > 48) {
+        return (art_node_t *)node;
+    }
+
+    art_node48_t *new_node =
+        art_node48_create(node->base.prefix, node->base.prefix_size);
+    for (size_t i = 0; i < 256; ++i) {
+        if (node->children[i] != NULL) {
+            art_node48_insert(new_node, node->children[i], i);
+        }
+    }
+    roaring_free(node);
+    return (art_node_t *)new_node;
+}
+
+static inline void art_node256_replace(art_node256_t *node,
+                                       art_key_chunk_t key_chunk,
+                                       art_node_t *new_child) {
+    node->children[key_chunk] = new_child;
+}
+
+static inline art_indexed_child_t art_node256_next_child(
+    const art_node256_t *node, int index) {
+    art_indexed_child_t indexed_child;
+    index++;
+    for (size_t i = index; i < 256; ++i) {
+        if (node->children[i] != NULL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[i];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node256_prev_child(
+    const art_node256_t *node, int index) {
+    if (index > 256) {
+        index = 256;
+    }
+    index--;
+    art_indexed_child_t indexed_child;
+    for (int i = index; i >= 0; --i) {
+        if (node->children[i] != NULL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[i];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node256_child_at(
+    const art_node256_t *node, int index) {
+    art_indexed_child_t indexed_child;
+    if (index < 0 || index >= 256) {
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    indexed_child.index = index;
+    indexed_child.child = node->children[index];
+    indexed_child.key_chunk = index;
+    return indexed_child;
+}
+
+static inline art_indexed_child_t art_node256_lower_bound(
+    art_node256_t *node, art_key_chunk_t key_chunk) {
+    art_indexed_child_t indexed_child;
+    for (size_t i = key_chunk; i < 256; ++i) {
+        if (node->children[i] != NULL) {
+            indexed_child.index = i;
+            indexed_child.child = node->children[i];
+            indexed_child.key_chunk = i;
+            return indexed_child;
+        }
+    }
+    indexed_child.child = NULL;
+    return indexed_child;
+}
+
+static bool art_node256_internal_validate(const art_node256_t *node,
+                                          art_internal_validate_t validator) {
+    if (node->count <= 48) {
+        return art_validate_fail(&validator, "Node256 has too few children");
+    }
+    if (node->count > 256) {
+        return art_validate_fail(&validator, "Node256 has too many children");
+    }
+    validator.depth++;
+    int actual_count = 0;
+    for (int i = 0; i < 256; ++i) {
+        if (node->children[i] != NULL) {
+            actual_count++;
+
+            for (int j = i + 1; j < 256; ++j) {
+                if (node->children[i] == node->children[j]) {
+                    return art_validate_fail(&validator,
+                                             "Node256 has duplicate children");
+                }
+            }
+
+            validator.current_key[validator.depth - 1] = i;
+            if (!art_internal_validate_at(node->children[i], validator)) {
+                return false;
+            }
+        }
+    }
+    if (actual_count != node->count) {
+        return art_validate_fail(
+            &validator, "Node256 count does not match actual children");
+    }
+    return true;
+}
+
+// Finds the child with the given key chunk in the inner node, returns NULL if
+// no such child is found.
+static art_node_t *art_find_child(const art_inner_node_t *node,
+                                  art_key_chunk_t key_chunk) {
+    switch (art_get_type(node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_find_child((art_node4_t *)node, key_chunk);
+        case ART_NODE16_TYPE:
+            return art_node16_find_child((art_node16_t *)node, key_chunk);
+        case ART_NODE48_TYPE:
+            return art_node48_find_child((art_node48_t *)node, key_chunk);
+        case ART_NODE256_TYPE:
+            return art_node256_find_child((art_node256_t *)node, key_chunk);
+        default:
+            assert(false);
+            return NULL;
+    }
+}
+
+// Replaces the child with the given key chunk in the inner node.
+static void art_replace(art_inner_node_t *node, art_key_chunk_t key_chunk,
+                        art_node_t *new_child) {
+    switch (art_get_type(node)) {
+        case ART_NODE4_TYPE:
+            art_node4_replace((art_node4_t *)node, key_chunk, new_child);
+            break;
+        case ART_NODE16_TYPE:
+            art_node16_replace((art_node16_t *)node, key_chunk, new_child);
+            break;
+        case ART_NODE48_TYPE:
+            art_node48_replace((art_node48_t *)node, key_chunk, new_child);
+            break;
+        case ART_NODE256_TYPE:
+            art_node256_replace((art_node256_t *)node, key_chunk, new_child);
+            break;
+        default:
+            assert(false);
+    }
+}
+
+// Erases the child with the given key chunk from the inner node, returns the
+// updated node (the same as the initial node if it was not shrunk).
+static art_node_t *art_node_erase(art_inner_node_t *node,
+                                  art_key_chunk_t key_chunk) {
+    switch (art_get_type(node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_erase((art_node4_t *)node, key_chunk);
+        case ART_NODE16_TYPE:
+            return art_node16_erase((art_node16_t *)node, key_chunk);
+        case ART_NODE48_TYPE:
+            return art_node48_erase((art_node48_t *)node, key_chunk);
+        case ART_NODE256_TYPE:
+            return art_node256_erase((art_node256_t *)node, key_chunk);
+        default:
+            assert(false);
+            return NULL;
+    }
+}
+
+// Inserts the leaf with the given key chunk in the inner node, returns a
+// pointer to the (possibly expanded) node.
+static art_node_t *art_node_insert_leaf(art_inner_node_t *node,
+                                        art_key_chunk_t key_chunk,
+                                        art_leaf_t *leaf) {
+    art_node_t *child = (art_node_t *)(SET_LEAF(leaf));
+    switch (art_get_type(node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_insert((art_node4_t *)node, child, key_chunk);
+        case ART_NODE16_TYPE:
+            return art_node16_insert((art_node16_t *)node, child, key_chunk);
+        case ART_NODE48_TYPE:
+            return art_node48_insert((art_node48_t *)node, child, key_chunk);
+        case ART_NODE256_TYPE:
+            return art_node256_insert((art_node256_t *)node, child, key_chunk);
+        default:
+            assert(false);
+            return NULL;
+    }
+}
+
+// Frees the node and its children. Leaves are freed by the user.
+static void art_free_node(art_node_t *node) {
+    if (art_is_leaf(node)) {
+        // We leave it up to the user to free leaves.
+        return;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            art_free_node4((art_node4_t *)node);
+            break;
+        case ART_NODE16_TYPE:
+            art_free_node16((art_node16_t *)node);
+            break;
+        case ART_NODE48_TYPE:
+            art_free_node48((art_node48_t *)node);
+            break;
+        case ART_NODE256_TYPE:
+            art_free_node256((art_node256_t *)node);
+            break;
+        default:
+            assert(false);
+    }
+}
+
+// Returns the next child in key order, or NULL if called on a leaf.
+// Provided index may be in the range [-1, 255].
+static art_indexed_child_t art_node_next_child(const art_node_t *node,
+                                               int index) {
+    if (art_is_leaf(node)) {
+        art_indexed_child_t indexed_child;
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_next_child((art_node4_t *)node, index);
+        case ART_NODE16_TYPE:
+            return art_node16_next_child((art_node16_t *)node, index);
+        case ART_NODE48_TYPE:
+            return art_node48_next_child((art_node48_t *)node, index);
+        case ART_NODE256_TYPE:
+            return art_node256_next_child((art_node256_t *)node, index);
+        default:
+            assert(false);
+            return (art_indexed_child_t){0};
+    }
+}
+
+// Returns the previous child in key order, or NULL if called on a leaf.
+// Provided index may be in the range [0, 256].
+static art_indexed_child_t art_node_prev_child(const art_node_t *node,
+                                               int index) {
+    if (art_is_leaf(node)) {
+        art_indexed_child_t indexed_child;
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_prev_child((art_node4_t *)node, index);
+        case ART_NODE16_TYPE:
+            return art_node16_prev_child((art_node16_t *)node, index);
+        case ART_NODE48_TYPE:
+            return art_node48_prev_child((art_node48_t *)node, index);
+        case ART_NODE256_TYPE:
+            return art_node256_prev_child((art_node256_t *)node, index);
+        default:
+            assert(false);
+            return (art_indexed_child_t){0};
+    }
+}
+
+// Returns the child found at the provided index, or NULL if called on a leaf.
+// Provided index is only valid if returned by art_node_(next|prev)_child.
+static art_indexed_child_t art_node_child_at(const art_node_t *node,
+                                             int index) {
+    if (art_is_leaf(node)) {
+        art_indexed_child_t indexed_child;
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_child_at((art_node4_t *)node, index);
+        case ART_NODE16_TYPE:
+            return art_node16_child_at((art_node16_t *)node, index);
+        case ART_NODE48_TYPE:
+            return art_node48_child_at((art_node48_t *)node, index);
+        case ART_NODE256_TYPE:
+            return art_node256_child_at((art_node256_t *)node, index);
+        default:
+            assert(false);
+            return (art_indexed_child_t){0};
+    }
+}
+
+// Returns the child with the smallest key equal to or greater than the given
+// key chunk, NULL if called on a leaf or no such child was found.
+static art_indexed_child_t art_node_lower_bound(const art_node_t *node,
+                                                art_key_chunk_t key_chunk) {
+    if (art_is_leaf(node)) {
+        art_indexed_child_t indexed_child;
+        indexed_child.child = NULL;
+        return indexed_child;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            return art_node4_lower_bound((art_node4_t *)node, key_chunk);
+        case ART_NODE16_TYPE:
+            return art_node16_lower_bound((art_node16_t *)node, key_chunk);
+        case ART_NODE48_TYPE:
+            return art_node48_lower_bound((art_node48_t *)node, key_chunk);
+        case ART_NODE256_TYPE:
+            return art_node256_lower_bound((art_node256_t *)node, key_chunk);
+        default:
+            assert(false);
+            return (art_indexed_child_t){0};
+    }
+}
+
+// ====================== End of node-specific functions =======================
+
+// Compares the given ranges of two keys, returns their relative order:
+// * Key range 1 <  key range 2: a negative value
+// * Key range 1 == key range 2: 0
+// * Key range 1 >  key range 2: a positive value
+static inline int art_compare_prefix(const art_key_chunk_t key1[],
+                                     uint8_t key1_from,
+                                     const art_key_chunk_t key2[],
+                                     uint8_t key2_from, uint8_t length) {
+    return memcmp(key1 + key1_from, key2 + key2_from, length);
+}
+
+// Compares two keys in full, see art_compare_prefix.
+int art_compare_keys(const art_key_chunk_t key1[],
+                     const art_key_chunk_t key2[]) {
+    return art_compare_prefix(key1, 0, key2, 0, ART_KEY_BYTES);
+}
+
+// Returns the length of the common prefix between two key ranges.
+static uint8_t art_common_prefix(const art_key_chunk_t key1[],
+                                 uint8_t key1_from, uint8_t key1_to,
+                                 const art_key_chunk_t key2[],
+                                 uint8_t key2_from, uint8_t key2_to) {
+    uint8_t min_len = key1_to - key1_from;
+    uint8_t key2_len = key2_to - key2_from;
+    if (key2_len < min_len) {
+        min_len = key2_len;
+    }
+    uint8_t offset = 0;
+    for (; offset < min_len; ++offset) {
+        if (key1[key1_from + offset] != key2[key2_from + offset]) {
+            return offset;
+        }
+    }
+    return offset;
+}
+
+// Returns a pointer to the rootmost node where the value was inserted, may not
+// be equal to `node`.
+static art_node_t *art_insert_at(art_node_t *node, const art_key_chunk_t key[],
+                                 uint8_t depth, art_leaf_t *new_leaf) {
+    if (art_is_leaf(node)) {
+        art_leaf_t *leaf = CAST_LEAF(node);
+        uint8_t common_prefix = art_common_prefix(
+            leaf->key, depth, ART_KEY_BYTES, key, depth, ART_KEY_BYTES);
+
+        // Previously this was a leaf, create an inner node instead and add both
+        // the existing and new leaf to it.
+        art_node_t *new_node =
+            (art_node_t *)art_node4_create(key + depth, common_prefix);
+
+        new_node = art_node_insert_leaf((art_inner_node_t *)new_node,
+                                        leaf->key[depth + common_prefix], leaf);
+        new_node = art_node_insert_leaf((art_inner_node_t *)new_node,
+                                        key[depth + common_prefix], new_leaf);
+
+        // The new inner node is now the rootmost node.
+        return new_node;
+    }
+    art_inner_node_t *inner_node = (art_inner_node_t *)node;
+    // Not a leaf: inner node
+    uint8_t common_prefix =
+        art_common_prefix(inner_node->prefix, 0, inner_node->prefix_size, key,
+                          depth, ART_KEY_BYTES);
+    if (common_prefix != inner_node->prefix_size) {
+        // Partial prefix match.  Create a new internal node to hold the common
+        // prefix.
+        art_node4_t *node4 =
+            art_node4_create(inner_node->prefix, common_prefix);
+
+        // Make the existing internal node a child of the new internal node.
+        node4 = (art_node4_t *)art_node4_insert(
+            node4, node, inner_node->prefix[common_prefix]);
+
+        // Correct the prefix of the moved internal node, trimming off the chunk
+        // inserted into the new internal node.
+        inner_node->prefix_size = inner_node->prefix_size - common_prefix - 1;
+        if (inner_node->prefix_size > 0) {
+            // Move the remaining prefix to the correct position.
+            memmove(inner_node->prefix, inner_node->prefix + common_prefix + 1,
+                    inner_node->prefix_size);
+        }
+
+        // Insert the value in the new internal node.
+        return art_node_insert_leaf(&node4->base, key[common_prefix + depth],
+                                    new_leaf);
+    }
+    // Prefix matches entirely or node has no prefix. Look for an existing
+    // child.
+    art_key_chunk_t key_chunk = key[depth + common_prefix];
+    art_node_t *child = art_find_child(inner_node, key_chunk);
+    if (child != NULL) {
+        art_node_t *new_child =
+            art_insert_at(child, key, depth + common_prefix + 1, new_leaf);
+        if (new_child != child) {
+            // Node type changed.
+            art_replace(inner_node, key_chunk, new_child);
+        }
+        return node;
+    }
+    return art_node_insert_leaf(inner_node, key_chunk, new_leaf);
+}
+
+// Erase helper struct.
+typedef struct art_erase_result_s {
+    // The rootmost node where the value was erased, may not be equal to `node`.
+    // If no value was removed, this is null.
+    art_node_t *rootmost_node;
+
+    // Value removed, null if not removed.
+    art_val_t *value_erased;
+} art_erase_result_t;
+
+// Searches for the given key starting at `node`, erases it if found.
+static art_erase_result_t art_erase_at(art_node_t *node,
+                                       const art_key_chunk_t *key,
+                                       uint8_t depth) {
+    art_erase_result_t result;
+    result.rootmost_node = NULL;
+    result.value_erased = NULL;
+
+    if (art_is_leaf(node)) {
+        art_leaf_t *leaf = CAST_LEAF(node);
+        uint8_t common_prefix = art_common_prefix(leaf->key, 0, ART_KEY_BYTES,
+                                                  key, 0, ART_KEY_BYTES);
+        if (common_prefix != ART_KEY_BYTES) {
+            // Leaf key mismatch.
+            return result;
+        }
+        result.value_erased = (art_val_t *)leaf;
+        return result;
+    }
+    art_inner_node_t *inner_node = (art_inner_node_t *)node;
+    uint8_t common_prefix =
+        art_common_prefix(inner_node->prefix, 0, inner_node->prefix_size, key,
+                          depth, ART_KEY_BYTES);
+    if (common_prefix != inner_node->prefix_size) {
+        // Prefix mismatch.
+        return result;
+    }
+    art_key_chunk_t key_chunk = key[depth + common_prefix];
+    art_node_t *child = art_find_child(inner_node, key_chunk);
+    if (child == NULL) {
+        // No child with key chunk.
+        return result;
+    }
+    // Try to erase the key further down. Skip the key chunk associated with the
+    // child in the node.
+    art_erase_result_t child_result =
+        art_erase_at(child, key, depth + common_prefix + 1);
+    if (child_result.value_erased == NULL) {
+        return result;
+    }
+    result.value_erased = child_result.value_erased;
+    result.rootmost_node = node;
+    if (child_result.rootmost_node == NULL) {
+        // Child node was fully erased, erase it from this node's children.
+        result.rootmost_node = art_node_erase(inner_node, key_chunk);
+    } else if (child_result.rootmost_node != child) {
+        // Child node was not fully erased, update the pointer to it in this
+        // node.
+        art_replace(inner_node, key_chunk, child_result.rootmost_node);
+    }
+    return result;
+}
+
+// Searches for the given key starting at `node`, returns NULL if the key was
+// not found.
+static art_val_t *art_find_at(const art_node_t *node,
+                              const art_key_chunk_t *key, uint8_t depth) {
+    while (!art_is_leaf(node)) {
+        art_inner_node_t *inner_node = (art_inner_node_t *)node;
+        uint8_t common_prefix =
+            art_common_prefix(inner_node->prefix, 0, inner_node->prefix_size,
+                              key, depth, ART_KEY_BYTES);
+        if (common_prefix != inner_node->prefix_size) {
+            return NULL;
+        }
+        art_node_t *child =
+            art_find_child(inner_node, key[depth + inner_node->prefix_size]);
+        if (child == NULL) {
+            return NULL;
+        }
+        node = child;
+        // Include both the prefix and the child key chunk in the depth.
+        depth += inner_node->prefix_size + 1;
+    }
+    art_leaf_t *leaf = CAST_LEAF(node);
+    if (depth >= ART_KEY_BYTES) {
+        return (art_val_t *)leaf;
+    }
+    uint8_t common_prefix =
+        art_common_prefix(leaf->key, 0, ART_KEY_BYTES, key, 0, ART_KEY_BYTES);
+    if (common_prefix == ART_KEY_BYTES) {
+        return (art_val_t *)leaf;
+    }
+    return NULL;
+}
+
+// Returns the size in bytes of the subtrie.
+size_t art_size_in_bytes_at(const art_node_t *node) {
+    if (art_is_leaf(node)) {
+        return 0;
+    }
+    size_t size = 0;
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE: {
+            size += sizeof(art_node4_t);
+        } break;
+        case ART_NODE16_TYPE: {
+            size += sizeof(art_node16_t);
+        } break;
+        case ART_NODE48_TYPE: {
+            size += sizeof(art_node48_t);
+        } break;
+        case ART_NODE256_TYPE: {
+            size += sizeof(art_node256_t);
+        } break;
+        default:
+            assert(false);
+            break;
+    }
+    art_indexed_child_t indexed_child = art_node_next_child(node, -1);
+    while (indexed_child.child != NULL) {
+        size += art_size_in_bytes_at(indexed_child.child);
+        indexed_child = art_node_next_child(node, indexed_child.index);
+    }
+    return size;
+}
+
+static void art_node_print_type(const art_node_t *node) {
+    if (art_is_leaf(node)) {
+        printf("Leaf");
+        return;
+    }
+    switch (art_get_type((art_inner_node_t *)node)) {
+        case ART_NODE4_TYPE:
+            printf("Node4");
+            return;
+        case ART_NODE16_TYPE:
+            printf("Node16");
+            return;
+        case ART_NODE48_TYPE:
+            printf("Node48");
+            return;
+        case ART_NODE256_TYPE:
+            printf("Node256");
+            return;
+        default:
+            assert(false);
+            return;
+    }
+}
+
+void art_node_printf(const art_node_t *node, uint8_t depth) {
+    if (art_is_leaf(node)) {
+        printf("{ type: Leaf, key: ");
+        art_leaf_t *leaf = CAST_LEAF(node);
+        for (size_t i = 0; i < ART_KEY_BYTES; ++i) {
+            printf("%02x", leaf->key[i]);
+        }
+        printf(" }\n");
+        return;
+    }
+    printf("{\n");
+    depth++;
+
+    printf("%*s", depth, "");
+    printf("type: ");
+    art_node_print_type(node);
+    printf("\n");
+
+    art_inner_node_t *inner_node = (art_inner_node_t *)node;
+    printf("%*s", depth, "");
+    printf("prefix_size: %d\n", inner_node->prefix_size);
+
+    printf("%*s", depth, "");
+    printf("prefix: ");
+    for (uint8_t i = 0; i < inner_node->prefix_size; ++i) {
+        printf("%02x", inner_node->prefix[i]);
+    }
+    printf("\n");
+
+    switch (art_get_type(inner_node)) {
+        case ART_NODE4_TYPE: {
+            art_node4_t *node4 = (art_node4_t *)node;
+            for (uint8_t i = 0; i < node4->count; ++i) {
+                printf("%*s", depth, "");
+                printf("key: %02x ", node4->keys[i]);
+                art_node_printf(node4->children[i], depth);
+            }
+        } break;
+        case ART_NODE16_TYPE: {
+            art_node16_t *node16 = (art_node16_t *)node;
+            for (uint8_t i = 0; i < node16->count; ++i) {
+                printf("%*s", depth, "");
+                printf("key: %02x ", node16->keys[i]);
+                art_node_printf(node16->children[i], depth);
+            }
+        } break;
+        case ART_NODE48_TYPE: {
+            art_node48_t *node48 = (art_node48_t *)node;
+            for (int i = 0; i < 256; ++i) {
+                if (node48->keys[i] != ART_NODE48_EMPTY_VAL) {
+                    printf("%*s", depth, "");
+                    printf("key: %02x ", i);
+                    printf("child: %02x ", node48->keys[i]);
+                    art_node_printf(node48->children[node48->keys[i]], depth);
+                }
+            }
+        } break;
+        case ART_NODE256_TYPE: {
+            art_node256_t *node256 = (art_node256_t *)node;
+            for (int i = 0; i < 256; ++i) {
+                if (node256->children[i] != NULL) {
+                    printf("%*s", depth, "");
+                    printf("key: %02x ", i);
+                    art_node_printf(node256->children[i], depth);
+                }
+            }
+        } break;
+        default:
+            assert(false);
+            break;
+    }
+    depth--;
+    printf("%*s", depth, "");
+    printf("}\n");
+}
+
+void art_insert(art_t *art, const art_key_chunk_t *key, art_val_t *val) {
+    art_leaf_t *leaf = (art_leaf_t *)val;
+    art_leaf_populate(leaf, key);
+    if (art->root == NULL) {
+        art->root = (art_node_t *)SET_LEAF(leaf);
+        return;
+    }
+    art->root = art_insert_at(art->root, key, 0, leaf);
+}
+
+art_val_t *art_erase(art_t *art, const art_key_chunk_t *key) {
+    if (art->root == NULL) {
+        return NULL;
+    }
+    art_erase_result_t result = art_erase_at(art->root, key, 0);
+    if (result.value_erased == NULL) {
+        return NULL;
+    }
+    art->root = result.rootmost_node;
+    return result.value_erased;
+}
+
+art_val_t *art_find(const art_t *art, const art_key_chunk_t *key) {
+    if (art->root == NULL) {
+        return NULL;
+    }
+    return art_find_at(art->root, key, 0);
+}
+
+bool art_is_empty(const art_t *art) { return art->root == NULL; }
+
+void art_free(art_t *art) {
+    if (art->root == NULL) {
+        return;
+    }
+    art_free_node(art->root);
+}
+
+size_t art_size_in_bytes(const art_t *art) {
+    size_t size = sizeof(art_t);
+    if (art->root != NULL) {
+        size += art_size_in_bytes_at(art->root);
+    }
+    return size;
+}
+
+void art_printf(const art_t *art) {
+    if (art->root == NULL) {
+        return;
+    }
+    art_node_printf(art->root, 0);
+}
+
+// Returns the current node that the iterator is positioned at.
+static inline art_node_t *art_iterator_node(art_iterator_t *iterator) {
+    return iterator->frames[iterator->frame].node;
+}
+
+// Sets the iterator key and value to the leaf's key and value. Always returns
+// true for convenience.
+static inline bool art_iterator_valid_loc(art_iterator_t *iterator,
+                                          art_leaf_t *leaf) {
+    iterator->frames[iterator->frame].node = SET_LEAF(leaf);
+    iterator->frames[iterator->frame].index_in_node = 0;
+    memcpy(iterator->key, leaf->key, ART_KEY_BYTES);
+    iterator->value = (art_val_t *)leaf;
+    return true;
+}
+
+// Invalidates the iterator key and value. Always returns false for convenience.
+static inline bool art_iterator_invalid_loc(art_iterator_t *iterator) {
+    memset(iterator->key, 0, ART_KEY_BYTES);
+    iterator->value = NULL;
+    return false;
+}
+
+// Moves the iterator one level down in the tree, given a node at the current
+// level and the index of the child that we're going down to.
+//
+// Note: does not set the index at the new level.
+static void art_iterator_down(art_iterator_t *iterator,
+                              const art_inner_node_t *node,
+                              uint8_t index_in_node) {
+    iterator->frames[iterator->frame].node = (art_node_t *)node;
+    iterator->frames[iterator->frame].index_in_node = index_in_node;
+    iterator->frame++;
+    art_indexed_child_t indexed_child =
+        art_node_child_at((art_node_t *)node, index_in_node);
+    assert(indexed_child.child != NULL);
+    iterator->frames[iterator->frame].node = indexed_child.child;
+    iterator->depth += node->prefix_size + 1;
+}
+
+// Moves the iterator to the next/previous child of the current node. Returns
+// the child moved to, or NULL if there is no neighboring child.
+static art_node_t *art_iterator_neighbor_child(
+    art_iterator_t *iterator, const art_inner_node_t *inner_node,
+    bool forward) {
+    art_iterator_frame_t frame = iterator->frames[iterator->frame];
+    art_indexed_child_t indexed_child;
+    if (forward) {
+        indexed_child = art_node_next_child(frame.node, frame.index_in_node);
+    } else {
+        indexed_child = art_node_prev_child(frame.node, frame.index_in_node);
+    }
+    if (indexed_child.child != NULL) {
+        art_iterator_down(iterator, inner_node, indexed_child.index);
+    }
+    return indexed_child.child;
+}
+
+// Moves the iterator one level up in the tree, returns false if not possible.
+static bool art_iterator_up(art_iterator_t *iterator) {
+    if (iterator->frame == 0) {
+        return false;
+    }
+    iterator->frame--;
+    // We went up, so we are at an inner node.
+    iterator->depth -=
+        ((art_inner_node_t *)art_iterator_node(iterator))->prefix_size + 1;
+    return true;
+}
+
+// Moves the iterator one level, followed by a move to the next / previous leaf.
+// Sets the status of the iterator.
+static bool art_iterator_up_and_move(art_iterator_t *iterator, bool forward) {
+    if (!art_iterator_up(iterator)) {
+        // We're at the root.
+        return art_iterator_invalid_loc(iterator);
+    }
+    return art_iterator_move(iterator, forward);
+}
+
+// Initializes the iterator at the first / last leaf of the given node.
+// Returns true for convenience.
+static bool art_node_init_iterator(const art_node_t *node,
+                                   art_iterator_t *iterator, bool first) {
+    while (!art_is_leaf(node)) {
+        art_indexed_child_t indexed_child;
+        if (first) {
+            indexed_child = art_node_next_child(node, -1);
+        } else {
+            indexed_child = art_node_prev_child(node, 256);
+        }
+        art_iterator_down(iterator, (art_inner_node_t *)node,
+                          indexed_child.index);
+        node = indexed_child.child;
+    }
+    // We're at a leaf.
+    iterator->frames[iterator->frame].node = (art_node_t *)node;
+    iterator->frames[iterator->frame].index_in_node = 0;  // Should not matter.
+    return art_iterator_valid_loc(iterator, CAST_LEAF(node));
+}
+
+bool art_iterator_move(art_iterator_t *iterator, bool forward) {
+    if (art_is_leaf(art_iterator_node(iterator))) {
+        bool went_up = art_iterator_up(iterator);
+        if (!went_up) {
+            // This leaf is the root, we're done.
+            return art_iterator_invalid_loc(iterator);
+        }
+    }
+    // Advance within inner node.
+    art_node_t *neighbor_child = art_iterator_neighbor_child(
+        iterator, (art_inner_node_t *)art_iterator_node(iterator), forward);
+    if (neighbor_child != NULL) {
+        // There is another child at this level, go down to the first or last
+        // leaf.
+        return art_node_init_iterator(neighbor_child, iterator, forward);
+    }
+    // No more children at this level, go up.
+    return art_iterator_up_and_move(iterator, forward);
+}
+
+// Assumes the iterator is positioned at a node with an equal prefix path up to
+// the depth of the iterator.
+static bool art_node_iterator_lower_bound(const art_node_t *node,
+                                          art_iterator_t *iterator,
+                                          const art_key_chunk_t key[]) {
+    while (!art_is_leaf(node)) {
+        art_inner_node_t *inner_node = (art_inner_node_t *)node;
+        int prefix_comparison =
+            art_compare_prefix(inner_node->prefix, 0, key, iterator->depth,
+                               inner_node->prefix_size);
+        if (prefix_comparison < 0) {
+            // Prefix so far has been equal, but we've found a smaller key.
+            // Since we take the lower bound within each node, we can return the
+            // next leaf.
+            return art_iterator_up_and_move(iterator, true);
+        } else if (prefix_comparison > 0) {
+            // No key equal to the key we're looking for, return the first leaf.
+            return art_node_init_iterator(node, iterator, true);
+        }
+        // Prefix is equal, move to lower bound child.
+        art_key_chunk_t key_chunk =
+            key[iterator->depth + inner_node->prefix_size];
+        art_indexed_child_t indexed_child =
+            art_node_lower_bound(node, key_chunk);
+        if (indexed_child.child == NULL) {
+            // Only smaller keys among children.
+            return art_iterator_up_and_move(iterator, true);
+        }
+        if (indexed_child.key_chunk > key_chunk) {
+            // Only larger children, return the first larger child.
+            art_iterator_down(iterator, inner_node, indexed_child.index);
+            return art_node_init_iterator(indexed_child.child, iterator, true);
+        }
+        // We found a child with an equal prefix.
+        art_iterator_down(iterator, inner_node, indexed_child.index);
+        node = indexed_child.child;
+    }
+    art_leaf_t *leaf = CAST_LEAF(node);
+    if (art_compare_keys(leaf->key, key) >= 0) {
+        // Leaf has an equal or larger key.
+        return art_iterator_valid_loc(iterator, leaf);
+    }
+    // Leaf has an equal prefix, but the full key is smaller. Move to the next
+    // leaf.
+    return art_iterator_up_and_move(iterator, true);
+}
+
+art_iterator_t art_init_iterator(const art_t *art, bool first) {
+    art_iterator_t iterator = {0};
+    if (art->root == NULL) {
+        return iterator;
+    }
+    art_node_init_iterator(art->root, &iterator, first);
+    return iterator;
+}
+
+bool art_iterator_next(art_iterator_t *iterator) {
+    return art_iterator_move(iterator, true);
+}
+
+bool art_iterator_prev(art_iterator_t *iterator) {
+    return art_iterator_move(iterator, false);
+}
+
+bool art_iterator_lower_bound(art_iterator_t *iterator,
+                              const art_key_chunk_t *key) {
+    if (iterator->value == NULL) {
+        // We're beyond the end / start of the ART so the iterator does not have
+        // a valid key. Start from the root.
+        iterator->frame = 0;
+        iterator->depth = 0;
+        return art_node_iterator_lower_bound(art_iterator_node(iterator),
+                                             iterator, key);
+    }
+    int compare_result =
+        art_compare_prefix(iterator->key, 0, key, 0, ART_KEY_BYTES);
+    // Move up until we have an equal prefix, after which we can do a normal
+    // lower bound search.
+    while (compare_result != 0) {
+        if (!art_iterator_up(iterator)) {
+            if (compare_result < 0) {
+                // Only smaller keys found.
+                return art_iterator_invalid_loc(iterator);
+            } else {
+                return art_node_init_iterator(art_iterator_node(iterator),
+                                              iterator, true);
+            }
+        }
+        // Since we're only moving up, we can keep comparing against the
+        // iterator key.
+        art_inner_node_t *inner_node =
+            (art_inner_node_t *)art_iterator_node(iterator);
+        compare_result =
+            art_compare_prefix(iterator->key, 0, key, 0,
+                               iterator->depth + inner_node->prefix_size);
+    }
+    if (compare_result > 0) {
+        return art_node_init_iterator(art_iterator_node(iterator), iterator,
+                                      true);
+    }
+    return art_node_iterator_lower_bound(art_iterator_node(iterator), iterator,
+                                         key);
+}
+
+art_iterator_t art_lower_bound(const art_t *art, const art_key_chunk_t *key) {
+    art_iterator_t iterator = {0};
+    if (art->root != NULL) {
+        art_node_iterator_lower_bound(art->root, &iterator, key);
+    }
+    return iterator;
+}
+
+art_iterator_t art_upper_bound(const art_t *art, const art_key_chunk_t *key) {
+    art_iterator_t iterator = {0};
+    if (art->root != NULL) {
+        if (art_node_iterator_lower_bound(art->root, &iterator, key) &&
+            art_compare_keys(iterator.key, key) == 0) {
+            art_iterator_next(&iterator);
+        }
+    }
+    return iterator;
+}
+
+void art_iterator_insert(art_t *art, art_iterator_t *iterator,
+                         const art_key_chunk_t *key, art_val_t *val) {
+    // TODO: This can likely be faster.
+    art_insert(art, key, val);
+    assert(art->root != NULL);
+    iterator->frame = 0;
+    iterator->depth = 0;
+    art_node_iterator_lower_bound(art->root, iterator, key);
+}
+
+// TODO: consider keeping `art_t *art` in the iterator.
+art_val_t *art_iterator_erase(art_t *art, art_iterator_t *iterator) {
+    if (iterator->value == NULL) {
+        return NULL;
+    }
+    art_key_chunk_t initial_key[ART_KEY_BYTES];
+    memcpy(initial_key, iterator->key, ART_KEY_BYTES);
+
+    art_val_t *value_erased = iterator->value;
+    bool went_up = art_iterator_up(iterator);
+    if (!went_up) {
+        // We're erasing the root.
+        art->root = NULL;
+        art_iterator_invalid_loc(iterator);
+        return value_erased;
+    }
+
+    // Erase the leaf.
+    art_inner_node_t *parent_node =
+        (art_inner_node_t *)art_iterator_node(iterator);
+    art_key_chunk_t key_chunk_in_parent =
+        iterator->key[iterator->depth + parent_node->prefix_size];
+    art_node_t *new_parent_node =
+        art_node_erase(parent_node, key_chunk_in_parent);
+
+    if (new_parent_node != ((art_node_t *)parent_node)) {
+        // Replace the pointer to the inner node we erased from in its
+        // parent (it may be a leaf now).
+        iterator->frames[iterator->frame].node = new_parent_node;
+        went_up = art_iterator_up(iterator);
+        if (went_up) {
+            art_inner_node_t *grandparent_node =
+                (art_inner_node_t *)art_iterator_node(iterator);
+            art_key_chunk_t key_chunk_in_grandparent =
+                iterator->key[iterator->depth + grandparent_node->prefix_size];
+            art_replace(grandparent_node, key_chunk_in_grandparent,
+                        new_parent_node);
+        } else {
+            // We were already at the rootmost node.
+            art->root = new_parent_node;
+        }
+    }
+
+    iterator->frame = 0;
+    iterator->depth = 0;
+    // Do a lower bound search for the initial key, which will find the first
+    // greater key if it exists. This can likely be mildly faster if we instead
+    // start from the current position.
+    art_node_iterator_lower_bound(art->root, iterator, initial_key);
+    return value_erased;
+}
+
+static bool art_internal_validate_at(const art_node_t *node,
+                                     art_internal_validate_t validator) {
+    if (node == NULL) {
+        return art_validate_fail(&validator, "node is null");
+    }
+    if (art_is_leaf(node)) {
+        art_leaf_t *leaf = CAST_LEAF(node);
+        if (art_compare_prefix(leaf->key, 0, validator.current_key, 0,
+                               validator.depth) != 0) {
+            return art_validate_fail(
+                &validator,
+                "leaf key does not match its position's prefix in the tree");
+        }
+        if (validator.validate_cb != NULL &&
+            !validator.validate_cb(leaf, validator.reason)) {
+            if (*validator.reason == NULL) {
+                *validator.reason = "leaf validation failed";
+            }
+            return false;
+        }
+    } else {
+        art_inner_node_t *inner_node = (art_inner_node_t *)node;
+
+        if (validator.depth + inner_node->prefix_size + 1 > ART_KEY_BYTES) {
+            return art_validate_fail(&validator,
+                                     "node has too much prefix at given depth");
+        }
+        memcpy(validator.current_key + validator.depth, inner_node->prefix,
+               inner_node->prefix_size);
+        validator.depth += inner_node->prefix_size;
+
+        switch (inner_node->typecode) {
+            case ART_NODE4_TYPE:
+                if (!art_node4_internal_validate((art_node4_t *)inner_node,
+                                                 validator)) {
+                    return false;
+                }
+                break;
+            case ART_NODE16_TYPE:
+                if (!art_node16_internal_validate((art_node16_t *)inner_node,
+                                                  validator)) {
+                    return false;
+                }
+                break;
+            case ART_NODE48_TYPE:
+                if (!art_node48_internal_validate((art_node48_t *)inner_node,
+                                                  validator)) {
+                    return false;
+                }
+                break;
+            case ART_NODE256_TYPE:
+                if (!art_node256_internal_validate((art_node256_t *)inner_node,
+                                                   validator)) {
+                    return false;
+                }
+                break;
+            default:
+                return art_validate_fail(&validator, "invalid node type");
+        }
+    }
+    return true;
+}
+
+bool art_internal_validate(const art_t *art, const char **reason,
+                           art_validate_cb_t validate_cb) {
+    const char *reason_local;
+    if (reason == NULL) {
+        // Always allow assigning through *reason
+        reason = &reason_local;
+    }
+    *reason = NULL;
+    if (art->root == NULL) {
+        return true;
+    }
+    art_internal_validate_t validator = {
+        .reason = reason,
+        .validate_cb = validate_cb,
+        .depth = 0,
+        .current_key = {0},
+    };
+    return art_internal_validate_at(art->root, validator);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace roaring
+}  // namespace internal
+#endif
+/* end file src/art/art.c */
+/* begin file src/bitset.c */
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+extern inline void bitset_print(const bitset_t *b);
+extern inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator,
+                                   void *ptr);
+extern inline size_t bitset_next_set_bits(const bitset_t *bitset,
+                                          size_t *buffer, size_t capacity,
+                                          size_t *startfrom);
+extern inline void bitset_set_to_value(bitset_t *bitset, size_t i, bool flag);
+extern inline bool bitset_next_set_bit(const bitset_t *bitset, size_t *i);
+extern inline void bitset_set(bitset_t *bitset, size_t i);
+extern inline bool bitset_get(const bitset_t *bitset, size_t i);
+extern inline size_t bitset_size_in_words(const bitset_t *bitset);
+extern inline size_t bitset_size_in_bits(const bitset_t *bitset);
+extern inline size_t bitset_size_in_bytes(const bitset_t *bitset);
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_t *bitset_create(void) {
+    bitset_t *bitset = NULL;
+    /* Allocate the bitset itself. */
+    if ((bitset = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) {
+        return NULL;
+    }
+    bitset->array = NULL;
+    bitset->arraysize = 0;
+    bitset->capacity = 0;
+    return bitset;
+}
+
+/* Create a new bitset able to contain size bits. Return NULL in case of
+ * failure. */
+bitset_t *bitset_create_with_capacity(size_t size) {
+    bitset_t *bitset = NULL;
+    /* Allocate the bitset itself. */
+    if ((bitset = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) {
+        return NULL;
+    }
+    bitset->arraysize =
+        (size + sizeof(uint64_t) * 8 - 1) / (sizeof(uint64_t) * 8);
+    bitset->capacity = bitset->arraysize;
+    if ((bitset->array = (uint64_t *)roaring_calloc(
+             bitset->arraysize, sizeof(uint64_t))) == NULL) {
+        roaring_free(bitset);
+        return NULL;
+    }
+    return bitset;
+}
+
+/* Create a copy */
+bitset_t *bitset_copy(const bitset_t *bitset) {
+    bitset_t *copy = NULL;
+    /* Allocate the bitset itself. */
+    if ((copy = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) {
+        return NULL;
+    }
+    memcpy(copy, bitset, sizeof(bitset_t));
+    copy->capacity = copy->arraysize;
+    if ((copy->array = (uint64_t *)roaring_malloc(sizeof(uint64_t) *
+                                                  bitset->arraysize)) == NULL) {
+        roaring_free(copy);
+        return NULL;
+    }
+    memcpy(copy->array, bitset->array, sizeof(uint64_t) * bitset->arraysize);
+    return copy;
+}
+
+void bitset_clear(bitset_t *bitset) {
+    memset(bitset->array, 0, sizeof(uint64_t) * bitset->arraysize);
+}
+
+void bitset_fill(bitset_t *bitset) {
+    memset(bitset->array, 0xff, sizeof(uint64_t) * bitset->arraysize);
+}
+
+void bitset_shift_left(bitset_t *bitset, size_t s) {
+    size_t extra_words = s / 64;
+    int inword_shift = s % 64;
+    size_t as = bitset->arraysize;
+    if (inword_shift == 0) {
+        bitset_resize(bitset, as + extra_words, false);
+        // could be done with a memmove
+        for (size_t i = as + extra_words; i > extra_words; i--) {
+            bitset->array[i - 1] = bitset->array[i - 1 - extra_words];
+        }
+    } else {
+        bitset_resize(bitset, as + extra_words + 1, true);
+        bitset->array[as + extra_words] =
+            bitset->array[as - 1] >> (64 - inword_shift);
+        for (size_t i = as + extra_words; i >= extra_words + 2; i--) {
+            bitset->array[i - 1] =
+                (bitset->array[i - 1 - extra_words] << inword_shift) |
+                (bitset->array[i - 2 - extra_words] >> (64 - inword_shift));
+        }
+        bitset->array[extra_words] = bitset->array[0] << inword_shift;
+    }
+    for (size_t i = 0; i < extra_words; i++) {
+        bitset->array[i] = 0;
+    }
+}
+
+void bitset_shift_right(bitset_t *bitset, size_t s) {
+    size_t extra_words = s / 64;
+    int inword_shift = s % 64;
+    size_t as = bitset->arraysize;
+    if (inword_shift == 0) {
+        // could be done with a memmove
+        for (size_t i = 0; i < as - extra_words; i++) {
+            bitset->array[i] = bitset->array[i + extra_words];
+        }
+        bitset_resize(bitset, as - extra_words, false);
+
+    } else {
+        for (size_t i = 0; i + extra_words + 1 < as; i++) {
+            bitset->array[i] =
+                (bitset->array[i + extra_words] >> inword_shift) |
+                (bitset->array[i + extra_words + 1] << (64 - inword_shift));
+        }
+        bitset->array[as - extra_words - 1] =
+            (bitset->array[as - 1] >> inword_shift);
+        bitset_resize(bitset, as - extra_words, false);
+    }
+}
+
+/* Free memory. */
+void bitset_free(bitset_t *bitset) {
+    if (bitset == NULL) {
+        return;
+    }
+    roaring_free(bitset->array);
+    roaring_free(bitset);
+}
+
+/* Resize the bitset so that it can support newarraysize * 64 bits. Return true
+ * in case of success, false for failure. */
+bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes) {
+    if (newarraysize > SIZE_MAX / 64) {
+        return false;
+    }
+    size_t smallest =
+        newarraysize < bitset->arraysize ? newarraysize : bitset->arraysize;
+    if (bitset->capacity < newarraysize) {
+        uint64_t *newarray;
+        size_t newcapacity = bitset->capacity;
+        if (newcapacity == 0) {
+            newcapacity = 1;
+        }
+        while (newcapacity < newarraysize) {
+            newcapacity *= 2;
+        }
+        if ((newarray = (uint64_t *)roaring_realloc(
+                 bitset->array, sizeof(uint64_t) * newcapacity)) == NULL) {
+            return false;
+        }
+        bitset->capacity = newcapacity;
+        bitset->array = newarray;
+    }
+    if (padwithzeroes && (newarraysize > smallest))
+        memset(bitset->array + smallest, 0,
+               sizeof(uint64_t) * (newarraysize - smallest));
+    bitset->arraysize = newarraysize;
+    return true;  // success!
+}
+
+size_t bitset_count(const bitset_t *bitset) {
+    size_t card = 0;
+    size_t k = 0;
+    for (; k + 7 < bitset->arraysize; k += 8) {
+        card += roaring_hamming(bitset->array[k]);
+        card += roaring_hamming(bitset->array[k + 1]);
+        card += roaring_hamming(bitset->array[k + 2]);
+        card += roaring_hamming(bitset->array[k + 3]);
+        card += roaring_hamming(bitset->array[k + 4]);
+        card += roaring_hamming(bitset->array[k + 5]);
+        card += roaring_hamming(bitset->array[k + 6]);
+        card += roaring_hamming(bitset->array[k + 7]);
+    }
+    for (; k + 3 < bitset->arraysize; k += 4) {
+        card += roaring_hamming(bitset->array[k]);
+        card += roaring_hamming(bitset->array[k + 1]);
+        card += roaring_hamming(bitset->array[k + 2]);
+        card += roaring_hamming(bitset->array[k + 3]);
+    }
+    for (; k < bitset->arraysize; k++) {
+        card += roaring_hamming(bitset->array[k]);
+    }
+    return card;
+}
+
+bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    for (size_t k = 0; k < minlength; ++k) {
+        b1->array[k] |= b2->array[k];
+    }
+    if (b2->arraysize > b1->arraysize) {
+        size_t oldsize = b1->arraysize;
+        if (!bitset_resize(b1, b2->arraysize, false)) return false;
+        memcpy(b1->array + oldsize, b2->array + oldsize,
+               (b2->arraysize - oldsize) * sizeof(uint64_t));
+    }
+    return true;
+}
+
+size_t bitset_minimum(const bitset_t *bitset) {
+    for (size_t k = 0; k < bitset->arraysize; k++) {
+        uint64_t w = bitset->array[k];
+        if (w != 0) {
+            return roaring_trailing_zeroes(w) + k * 64;
+        }
+    }
+    return 0;
+}
+
+bool bitset_grow(bitset_t *bitset, size_t newarraysize) {
+    if (newarraysize < bitset->arraysize) {
+        return false;
+    }
+    if (newarraysize > SIZE_MAX / 64) {
+        return false;
+    }
+    if (bitset->capacity < newarraysize) {
+        uint64_t *newarray;
+        size_t newcapacity = (UINT64_C(0xFFFFFFFFFFFFFFFF) >>
+                              roaring_leading_zeroes(newarraysize)) +
+                             1;
+        while (newcapacity < newarraysize) {
+            newcapacity *= 2;
+        }
+        if ((newarray = (uint64_t *)roaring_realloc(
+                 bitset->array, sizeof(uint64_t) * newcapacity)) == NULL) {
+            return false;
+        }
+        bitset->capacity = newcapacity;
+        bitset->array = newarray;
+    }
+    memset(bitset->array + bitset->arraysize, 0,
+           sizeof(uint64_t) * (newarraysize - bitset->arraysize));
+    bitset->arraysize = newarraysize;
+    return true;  // success!
+}
+
+size_t bitset_maximum(const bitset_t *bitset) {
+    for (size_t k = bitset->arraysize; k > 0; k--) {
+        uint64_t w = bitset->array[k - 1];
+        if (w != 0) {
+            return 63 - roaring_leading_zeroes(w) + (k - 1) * 64;
+        }
+    }
+    return 0;
+}
+
+/* Returns true if bitsets share no common elements, false otherwise.
+ *
+ * Performs early-out if common element found. */
+bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1,
+                      const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+
+    for (size_t k = 0; k < minlength; k++) {
+        if ((b1->array[k] & b2->array[k]) != 0) return false;
+    }
+    return true;
+}
+
+/* Returns true if bitsets contain at least 1 common element, false if they are
+ * disjoint.
+ *
+ * Performs early-out if common element found. */
+bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1,
+                       const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+
+    for (size_t k = 0; k < minlength; k++) {
+        if ((b1->array[k] & b2->array[k]) != 0) return true;
+    }
+    return false;
+}
+
+/* Returns true if b has any bits set in or after b->array[starting_loc]. */
+static bool any_bits_set(const bitset_t *b, size_t starting_loc) {
+    if (starting_loc >= b->arraysize) {
+        return false;
+    }
+    for (size_t k = starting_loc; k < b->arraysize; k++) {
+        if (b->array[k] != 0) return true;
+    }
+    return false;
+}
+
+/* Returns true if b1 has all of b2's bits set.
+ *
+ * Performs early out if a bit is found in b2 that is not found in b1. */
+bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1,
+                         const bitset_t *CBITSET_RESTRICT b2) {
+    size_t min_size = b1->arraysize;
+    if (b1->arraysize > b2->arraysize) {
+        min_size = b2->arraysize;
+    }
+    for (size_t k = 0; k < min_size; k++) {
+        if ((b1->array[k] & b2->array[k]) != b2->array[k]) {
+            return false;
+        }
+    }
+    if (b2->arraysize > b1->arraysize) {
+        /* Need to check if b2 has any bits set beyond b1's array */
+        return !any_bits_set(b2, b1->arraysize);
+    }
+    return true;
+}
+
+size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2) {
+    size_t answer = 0;
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    for (; k + 3 < minlength; k += 4) {
+        answer += roaring_hamming(b1->array[k] | b2->array[k]);
+        answer += roaring_hamming(b1->array[k + 1] | b2->array[k + 1]);
+        answer += roaring_hamming(b1->array[k + 2] | b2->array[k + 2]);
+        answer += roaring_hamming(b1->array[k + 3] | b2->array[k + 3]);
+    }
+    for (; k < minlength; ++k) {
+        answer += roaring_hamming(b1->array[k] | b2->array[k]);
+    }
+    if (b2->arraysize > b1->arraysize) {
+        // k is equal to b1->arraysize
+        for (; k + 3 < b2->arraysize; k += 4) {
+            answer += roaring_hamming(b2->array[k]);
+            answer += roaring_hamming(b2->array[k + 1]);
+            answer += roaring_hamming(b2->array[k + 2]);
+            answer += roaring_hamming(b2->array[k + 3]);
+        }
+        for (; k < b2->arraysize; ++k) {
+            answer += roaring_hamming(b2->array[k]);
+        }
+    } else {
+        // k is equal to b2->arraysize
+        for (; k + 3 < b1->arraysize; k += 4) {
+            answer += roaring_hamming(b1->array[k]);
+            answer += roaring_hamming(b1->array[k + 1]);
+            answer += roaring_hamming(b1->array[k + 2]);
+            answer += roaring_hamming(b1->array[k + 3]);
+        }
+        for (; k < b1->arraysize; ++k) {
+            answer += roaring_hamming(b1->array[k]);
+        }
+    }
+    return answer;
+}
+
+void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    for (; k < minlength; ++k) {
+        b1->array[k] &= b2->array[k];
+    }
+    for (; k < b1->arraysize; ++k) {
+        b1->array[k] = 0;  // memset could, maybe, be a tiny bit faster
+    }
+}
+
+size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2) {
+    size_t answer = 0;
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    for (size_t k = 0; k < minlength; ++k) {
+        answer += roaring_hamming(b1->array[k] & b2->array[k]);
+    }
+    return answer;
+}
+
+void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    for (; k < minlength; ++k) {
+        b1->array[k] &= ~(b2->array[k]);
+    }
+}
+
+size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    size_t answer = 0;
+    for (; k < minlength; ++k) {
+        answer += roaring_hamming(b1->array[k] & ~(b2->array[k]));
+    }
+    for (; k < b1->arraysize; ++k) {
+        answer += roaring_hamming(b1->array[k]);
+    }
+    return answer;
+}
+
+bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    for (; k < minlength; ++k) {
+        b1->array[k] ^= b2->array[k];
+    }
+    if (b2->arraysize > b1->arraysize) {
+        size_t oldsize = b1->arraysize;
+        if (!bitset_resize(b1, b2->arraysize, false)) return false;
+        memcpy(b1->array + oldsize, b2->array + oldsize,
+               (b2->arraysize - oldsize) * sizeof(uint64_t));
+    }
+    return true;
+}
+
+size_t bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2) {
+    size_t minlength =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t k = 0;
+    size_t answer = 0;
+    for (; k < minlength; ++k) {
+        answer += roaring_hamming(b1->array[k] ^ b2->array[k]);
+    }
+    if (b2->arraysize > b1->arraysize) {
+        for (; k < b2->arraysize; ++k) {
+            answer += roaring_hamming(b2->array[k]);
+        }
+    } else {
+        for (; k < b1->arraysize; ++k) {
+            answer += roaring_hamming(b1->array[k]);
+        }
+    }
+    return answer;
+}
+
+bool bitset_trim(bitset_t *bitset) {
+    size_t newsize = bitset->arraysize;
+    while (newsize > 0) {
+        if (bitset->array[newsize - 1] == 0)
+            newsize -= 1;
+        else
+            break;
+    }
+    if (bitset->capacity == newsize) return true;  // nothing to do
+    uint64_t *newarray;
+    if ((newarray = (uint64_t *)roaring_realloc(
+             bitset->array, sizeof(uint64_t) * newsize)) == NULL) {
+        return false;
+    }
+    bitset->array = newarray;
+    bitset->capacity = newsize;
+    bitset->arraysize = newsize;
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/bitset.c */
+/* begin file src/bitset_util.c */
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+using namespace ::roaring::internal;
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+#if CROARING_IS_X64
+static uint8_t lengthTable[256] = {
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
+    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+#endif
+
+#if CROARING_IS_X64
+ALIGNED(32)
+static uint32_t vecDecodeTable[256][8] = {
+    {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
+    {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
+    {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
+    {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
+    {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
+    {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
+    {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
+    {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
+    {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
+    {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
+    {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
+    {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
+    {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
+    {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
+    {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
+    {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
+    {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
+    {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
+    {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
+    {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
+    {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
+    {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
+    {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
+    {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
+    {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
+    {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
+    {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
+    {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
+    {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
+    {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
+    {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
+    {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
+    {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
+    {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
+    {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
+    {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
+    {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
+    {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
+    {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
+    {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
+    {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
+    {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
+    {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
+    {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
+    {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
+    {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
+    {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
+    {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
+    {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
+    {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
+    {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
+    {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
+    {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
+    {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
+    {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
+    {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
+    {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
+    {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
+    {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
+    {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
+    {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
+    {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
+    {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
+    {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
+    {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
+    {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
+    {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
+    {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
+    {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
+    {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
+    {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
+    {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
+    {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
+    {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
+    {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
+    {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
+    {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
+    {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
+    {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
+    {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
+    {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
+    {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
+    {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
+    {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
+    {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
+    {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
+    {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
+    {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
+    {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
+    {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
+    {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
+    {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
+    {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
+    {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
+    {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
+    {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
+    {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
+    {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
+    {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
+    {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
+    {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
+    {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
+    {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
+    {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
+    {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
+    {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
+    {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
+    {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
+    {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
+    {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
+    {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
+    {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
+    {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
+    {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
+    {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
+    {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
+    {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
+    {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
+    {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
+    {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
+    {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
+    {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
+    {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
+    {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
+    {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
+    {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
+    {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
+    {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
+    {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
+    {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
+    {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
+    {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
+    {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
+    {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
+    {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
+    {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
+    {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
+    {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
+    {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
+    {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
+    {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
+    {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
+    {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
+    {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
+    {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
+    {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
+    {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
+    {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
+    {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
+    {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
+    {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
+    {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
+    {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
+    {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
+    {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
+    {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
+    {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
+    {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
+    {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
+    {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
+    {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
+    {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
+    {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
+    {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
+    {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
+    {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
+    {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
+    {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
+    {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
+    {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
+    {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
+    {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
+    {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
+    {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
+    {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
+    {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
+    {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
+    {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
+    {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
+    {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
+    {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
+    {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
+    {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
+    {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
+    {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
+    {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
+    {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
+    {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
+    {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
+    {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
+    {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
+    {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
+    {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
+    {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
+    {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
+    {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
+    {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
+    {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
+    {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
+    {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
+    {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
+    {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
+    {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
+    {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
+    {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
+    {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
+    {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
+    {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
+    {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
+    {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
+    {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
+    {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
+    {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
+    {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
+    {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
+    {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
+    {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
+    {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
+    {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
+    {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
+    {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
+    {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
+    {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
+    {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
+    {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
+    {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
+    {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
+    {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
+    {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
+    {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
+    {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
+    {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
+    {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
+    {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
+    {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
+    {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
+    {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
+    {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
+    {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
+    {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
+    {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
+    {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
+    {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
+    {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
+    {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
+    {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
+    {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
+    {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
+    {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
+    {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
+    {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
+    {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
+    {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
+    {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
+    {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
+    {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
+};
+
+#endif  // #if CROARING_IS_X64
+
+#if CROARING_IS_X64
+// same as vecDecodeTable but in 16 bits
+ALIGNED(32)
+static uint16_t vecDecodeTable_uint16[256][8] = {
+    {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
+    {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
+    {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
+    {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
+    {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
+    {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
+    {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
+    {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
+    {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
+    {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
+    {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
+    {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
+    {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
+    {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
+    {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
+    {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
+    {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
+    {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
+    {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
+    {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
+    {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
+    {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
+    {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
+    {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
+    {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
+    {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
+    {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
+    {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
+    {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
+    {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
+    {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
+    {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
+    {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
+    {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
+    {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
+    {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
+    {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
+    {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
+    {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
+    {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
+    {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
+    {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
+    {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
+    {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
+    {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
+    {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
+    {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
+    {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
+    {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
+    {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
+    {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
+    {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
+    {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
+    {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
+    {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
+    {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
+    {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
+    {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
+    {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
+    {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
+    {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
+    {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
+    {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
+    {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
+    {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
+    {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
+    {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
+    {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
+    {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
+    {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
+    {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
+    {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
+    {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
+    {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
+    {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
+    {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
+    {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
+    {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
+    {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
+    {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
+    {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
+    {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
+    {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
+    {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
+    {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
+    {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
+    {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
+    {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
+    {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
+    {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
+    {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
+    {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
+    {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
+    {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
+    {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
+    {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
+    {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
+    {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
+    {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
+    {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
+    {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
+    {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
+    {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
+    {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
+    {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
+    {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
+    {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
+    {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
+    {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
+    {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
+    {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
+    {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
+    {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
+    {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
+    {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
+    {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
+    {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
+    {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
+    {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
+    {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
+    {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
+    {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
+    {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
+    {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
+    {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
+    {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
+    {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
+    {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
+    {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
+    {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
+    {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
+    {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
+    {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
+    {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
+    {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
+    {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
+    {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
+    {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
+    {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
+    {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
+    {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
+    {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
+    {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
+    {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
+    {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
+    {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
+    {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
+    {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
+    {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
+    {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
+    {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
+    {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
+    {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
+    {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
+    {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
+    {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
+    {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
+    {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
+    {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
+    {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
+    {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
+    {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
+    {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
+    {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
+    {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
+    {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
+    {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
+    {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
+    {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
+    {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
+    {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
+    {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
+    {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
+    {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
+    {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
+    {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
+    {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
+    {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
+    {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
+    {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
+    {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
+    {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
+    {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
+    {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
+    {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
+    {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
+    {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
+    {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
+    {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
+    {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
+    {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
+    {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
+    {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
+    {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
+    {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
+    {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
+    {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
+    {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
+    {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
+    {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
+    {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
+    {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
+    {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
+    {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
+    {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
+    {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
+    {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
+    {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
+    {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
+    {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
+    {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
+    {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
+    {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
+    {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
+    {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
+    {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
+    {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
+    {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
+    {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
+    {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
+    {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
+    {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
+    {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
+    {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
+    {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
+    {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
+    {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
+    {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
+    {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
+    {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
+    {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
+    {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
+    {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
+    {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
+    {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
+    {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
+    {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
+    {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
+    {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
+    {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
+    {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
+    {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
+    {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
+    {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
+    {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
+    {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
+    {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
+    {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
+    {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
+    {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
+    {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
+    {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
+    {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
+    {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
+    {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
+    {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
+};
+
+#endif
+
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+const uint8_t vbmi2_table[64] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
+                                     uint32_t *vout, size_t outcapacity,
+                                     uint32_t base) {
+    uint32_t *out = (uint32_t *)vout;
+    uint32_t *initout = out;
+    uint32_t *safeout = out + outcapacity;
+    __m512i base_v = _mm512_set1_epi32(base);
+    __m512i index_table = _mm512_loadu_si512(vbmi2_table);
+    size_t i = 0;
+
+    for (; (i < length) && ((out + 64) < safeout); i += 1) {
+        uint64_t v = words[i];
+        __m512i vec = _mm512_maskz_compress_epi8(v, index_table);
+
+        uint8_t advance = (uint8_t)roaring_hamming(v);
+
+        __m512i vbase =
+            _mm512_add_epi32(base_v, _mm512_set1_epi32((int)(i * 64)));
+        __m512i r1 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec, 0));
+        __m512i r2 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec, 1));
+        __m512i r3 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec, 2));
+        __m512i r4 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec, 3));
+
+        r1 = _mm512_add_epi32(r1, vbase);
+        r2 = _mm512_add_epi32(r2, vbase);
+        r3 = _mm512_add_epi32(r3, vbase);
+        r4 = _mm512_add_epi32(r4, vbase);
+        _mm512_storeu_si512((__m512i *)out, r1);
+        _mm512_storeu_si512((__m512i *)(out + 16), r2);
+        _mm512_storeu_si512((__m512i *)(out + 32), r3);
+        _mm512_storeu_si512((__m512i *)(out + 48), r4);
+
+        out += advance;
+    }
+
+    base += i * 64;
+
+    for (; (i < length) && (out < safeout); ++i) {
+        uint64_t w = words[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t =
+                w & (~w + 1);  // on x64, should compile to BLSI (careful: the
+                               // Intel compiler seems to fail)
+            int r =
+                roaring_trailing_zeroes(w);  // on x64, should compile to TZCNT
+            uint32_t val = r + base;
+            memcpy(out, &val,
+                   sizeof(uint32_t));  // should be compiled as a MOV on x64
+            out++;
+            w ^= t;
+        }
+        base += 64;
+    }
+
+    return out - initout;
+}
+
+// Reference:
+// https://lemire.me/blog/2022/05/10/faster-bitset-decoding-using-intel-avx-512/
+size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array,
+                                            size_t length, uint16_t *vout,
+                                            size_t capacity, uint16_t base) {
+    uint16_t *out = (uint16_t *)vout;
+    uint16_t *initout = out;
+    uint16_t *safeout = vout + capacity;
+
+    __m512i base_v = _mm512_set1_epi16(base);
+    __m512i index_table = _mm512_loadu_si512(vbmi2_table);
+    size_t i = 0;
+
+    for (; (i < length) && ((out + 64) < safeout); i++) {
+        uint64_t v = array[i];
+        __m512i vec = _mm512_maskz_compress_epi8(v, index_table);
+
+        uint8_t advance = (uint8_t)roaring_hamming(v);
+
+        __m512i vbase =
+            _mm512_add_epi16(base_v, _mm512_set1_epi16((short)(i * 64)));
+        __m512i r1 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec, 0));
+        __m512i r2 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec, 1));
+
+        r1 = _mm512_add_epi16(r1, vbase);
+        r2 = _mm512_add_epi16(r2, vbase);
+
+        _mm512_storeu_si512((__m512i *)out, r1);
+        _mm512_storeu_si512((__m512i *)(out + 32), r2);
+        out += advance;
+    }
+
+    base += i * 64;
+
+    for (; (i < length) && (out < safeout); ++i) {
+        uint64_t w = array[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t =
+                w & (~w + 1);  // on x64, should compile to BLSI (careful: the
+                               // Intel compiler seems to fail)
+            int r =
+                roaring_trailing_zeroes(w);  // on x64, should compile to TZCNT
+            uint32_t val = r + base;
+            memcpy(out, &val, sizeof(uint16_t));
+            out++;
+            w ^= t;
+        }
+        base += 64;
+    }
+
+    return out - initout;
+}
+CROARING_UNTARGET_AVX512
+#endif
+
+CROARING_TARGET_AVX2
+size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
+                                   uint32_t *out, size_t outcapacity,
+                                   uint32_t base) {
+    uint32_t *initout = out;
+    __m256i baseVec = _mm256_set1_epi32(base - 1);
+    __m256i incVec = _mm256_set1_epi32(64);
+    __m256i add8 = _mm256_set1_epi32(8);
+    uint32_t *safeout = out + outcapacity;
+    size_t i = 0;
+    for (; (i < length) && (out + 64 <= safeout); ++i) {
+        uint64_t w = words[i];
+        if (w == 0) {
+            baseVec = _mm256_add_epi32(baseVec, incVec);
+        } else {
+            for (int k = 0; k < 4; ++k) {
+                uint8_t byteA = (uint8_t)w;
+                uint8_t byteB = (uint8_t)(w >> 8);
+                w >>= 16;
+                __m256i vecA =
+                    _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteA]);
+                __m256i vecB =
+                    _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteB]);
+                uint8_t advanceA = lengthTable[byteA];
+                uint8_t advanceB = lengthTable[byteB];
+                vecA = _mm256_add_epi32(baseVec, vecA);
+                baseVec = _mm256_add_epi32(baseVec, add8);
+                vecB = _mm256_add_epi32(baseVec, vecB);
+                baseVec = _mm256_add_epi32(baseVec, add8);
+                _mm256_storeu_si256((__m256i *)out, vecA);
+                out += advanceA;
+                _mm256_storeu_si256((__m256i *)out, vecB);
+                out += advanceB;
+            }
+        }
+    }
+    base += i * 64;
+    for (; (i < length) && (out < safeout); ++i) {
+        uint64_t w = words[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t =
+                w & (~w + 1);  // on x64, should compile to BLSI (careful: the
+                               // Intel compiler seems to fail)
+            int r =
+                roaring_trailing_zeroes(w);  // on x64, should compile to TZCNT
+            uint32_t val = r + base;
+            memcpy(out, &val,
+                   sizeof(uint32_t));  // should be compiled as a MOV on x64
+            out++;
+            w ^= t;
+        }
+        base += 64;
+    }
+    return out - initout;
+}
+CROARING_UNTARGET_AVX2
+#endif  // CROARING_IS_X64
+
+size_t bitset_extract_setbits(const uint64_t *words, size_t length,
+                              uint32_t *out, uint32_t base) {
+    int outpos = 0;
+    for (size_t i = 0; i < length; ++i) {
+        uint64_t w = words[i];
+        while (w != 0) {
+            uint64_t t =
+                w & (~w + 1);  // on x64, should compile to BLSI (careful: the
+                               // Intel compiler seems to fail)
+            int r =
+                roaring_trailing_zeroes(w);  // on x64, should compile to TZCNT
+            uint32_t val = r + base;
+            memcpy(out + outpos, &val,
+                   sizeof(uint32_t));  // should be compiled as a MOV on x64
+            outpos++;
+            w ^= t;
+        }
+        base += 64;
+    }
+    return outpos;
+}
+
+size_t bitset_extract_intersection_setbits_uint16(
+    const uint64_t *__restrict__ words1, const uint64_t *__restrict__ words2,
+    size_t length, uint16_t *out, uint16_t base) {
+    int outpos = 0;
+    for (size_t i = 0; i < length; ++i) {
+        uint64_t w = words1[i] & words2[i];
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);
+            int r = roaring_trailing_zeroes(w);
+            out[outpos++] = (uint16_t)(r + base);
+            w ^= t;
+        }
+        base += 64;
+    }
+    return outpos;
+}
+
+#if CROARING_IS_X64
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out" as 16-bit integers, values start at "base" (can
+ *be set to zero).
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ *set.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function uses SSE decoding.
+ */
+CROARING_TARGET_AVX2
+size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
+                                         uint16_t *out, size_t outcapacity,
+                                         uint16_t base) {
+    uint16_t *initout = out;
+    __m128i baseVec = _mm_set1_epi16(base - 1);
+    __m128i incVec = _mm_set1_epi16(64);
+    __m128i add8 = _mm_set1_epi16(8);
+    uint16_t *safeout = out + outcapacity;
+    const int numberofbytes = 2;  // process two bytes at a time
+    size_t i = 0;
+    for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) {
+        uint64_t w = words[i];
+        if (w == 0) {
+            baseVec = _mm_add_epi16(baseVec, incVec);
+        } else {
+            for (int k = 0; k < 4; ++k) {
+                uint8_t byteA = (uint8_t)w;
+                uint8_t byteB = (uint8_t)(w >> 8);
+                w >>= 16;
+                __m128i vecA = _mm_loadu_si128(
+                    (const __m128i *)vecDecodeTable_uint16[byteA]);
+                __m128i vecB = _mm_loadu_si128(
+                    (const __m128i *)vecDecodeTable_uint16[byteB]);
+                uint8_t advanceA = lengthTable[byteA];
+                uint8_t advanceB = lengthTable[byteB];
+                vecA = _mm_add_epi16(baseVec, vecA);
+                baseVec = _mm_add_epi16(baseVec, add8);
+                vecB = _mm_add_epi16(baseVec, vecB);
+                baseVec = _mm_add_epi16(baseVec, add8);
+                _mm_storeu_si128((__m128i *)out, vecA);
+                out += advanceA;
+                _mm_storeu_si128((__m128i *)out, vecB);
+                out += advanceB;
+            }
+        }
+    }
+    base += (uint16_t)(i * 64);
+    for (; (i < length) && (out < safeout); ++i) {
+        uint64_t w = words[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t = w & (~w + 1);
+            int r = roaring_trailing_zeroes(w);
+            *out = (uint16_t)(r + base);
+            out++;
+            w ^= t;
+        }
+        base += 64;
+    }
+    return out - initout;
+}
+CROARING_UNTARGET_AVX2
+#endif
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out", values start at "base" (can be set to zero).
+ *
+ * The "out" pointer should be sufficient to store the actual number of bits
+ *set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
+                                     uint16_t *out, uint16_t base) {
+    int outpos = 0;
+    for (size_t i = 0; i < length; ++i) {
+        uint64_t w = words[i];
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);
+            int r = roaring_trailing_zeroes(w);
+            out[outpos++] = (uint16_t)(r + base);
+            w ^= t;
+        }
+        base += 64;
+    }
+    return outpos;
+}
+
+#if defined(CROARING_ASMBITMANIPOPTIMIZATION) && defined(CROARING_IS_X64)
+
+static inline uint64_t _asm_bitset_set_list_withcard(uint64_t *words,
+                                                     uint64_t card,
+                                                     const uint16_t *list,
+                                                     uint64_t length) {
+    uint64_t offset, load, pos;
+    uint64_t shift = 6;
+    const uint16_t *end = list + length;
+    if (!length) return card;
+    // TODO: could unroll for performance, see bitset_set_list
+    // bts is not available as an intrinsic in GCC
+    __asm volatile(
+        "1:\n"
+        "movzwq (%[list]), %[pos]\n"
+        "shrx %[shift], %[pos], %[offset]\n"
+        "mov (%[words],%[offset],8), %[load]\n"
+        "bts %[pos], %[load]\n"
+        "mov %[load], (%[words],%[offset],8)\n"
+        "sbb $-1, %[card]\n"
+        "add $2, %[list]\n"
+        "cmp %[list], %[end]\n"
+        "jnz 1b"
+        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
+          [pos] "=&r"(pos), [offset] "=&r"(offset)
+        : [end] "r"(end), [words] "r"(words), [shift] "r"(shift));
+    return card;
+}
+
+static inline void _asm_bitset_set_list(uint64_t *words, const uint16_t *list,
+                                        uint64_t length) {
+    uint64_t pos;
+    const uint16_t *end = list + length;
+
+    uint64_t shift = 6;
+    uint64_t offset;
+    uint64_t load;
+    for (; list + 3 < end; list += 4) {
+        pos = list[0];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[words],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[words],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos));
+        pos = list[1];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[words],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[words],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos));
+        pos = list[2];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[words],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[words],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos));
+        pos = list[3];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[words],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[words],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos));
+    }
+
+    while (list != end) {
+        pos = list[0];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[words],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[words],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos));
+        list++;
+    }
+}
+
+static inline uint64_t _asm_bitset_clear_list(uint64_t *words, uint64_t card,
+                                              const uint16_t *list,
+                                              uint64_t length) {
+    uint64_t offset, load, pos;
+    uint64_t shift = 6;
+    const uint16_t *end = list + length;
+    if (!length) return card;
+    // btr is not available as an intrinsic in GCC
+    __asm volatile(
+        "1:\n"
+        "movzwq (%[list]), %[pos]\n"
+        "shrx %[shift], %[pos], %[offset]\n"
+        "mov (%[words],%[offset],8), %[load]\n"
+        "btr %[pos], %[load]\n"
+        "mov %[load], (%[words],%[offset],8)\n"
+        "sbb $0, %[card]\n"
+        "add $2, %[list]\n"
+        "cmp %[list], %[end]\n"
+        "jnz 1b"
+        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
+          [pos] "=&r"(pos), [offset] "=&r"(offset)
+        : [end] "r"(end), [words] "r"(words), [shift] "r"(shift)
+        :
+        /* clobbers */ "memory");
+    return card;
+}
+
+static inline uint64_t _scalar_bitset_clear_list(uint64_t *words, uint64_t card,
+                                                 const uint16_t *list,
+                                                 uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *(const uint16_t *)list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load & ~(UINT64_C(1) << index);
+        card -= (load ^ newload) >> index;
+        words[offset] = newload;
+        list++;
+    }
+    return card;
+}
+
+static inline uint64_t _scalar_bitset_set_list_withcard(uint64_t *words,
+                                                        uint64_t card,
+                                                        const uint16_t *list,
+                                                        uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load | (UINT64_C(1) << index);
+        card += (load ^ newload) >> index;
+        words[offset] = newload;
+        list++;
+    }
+    return card;
+}
+
+static inline void _scalar_bitset_set_list(uint64_t *words,
+                                           const uint16_t *list,
+                                           uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load | (UINT64_C(1) << index);
+        words[offset] = newload;
+        list++;
+    }
+}
+
+uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
+                           uint64_t length) {
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        return _asm_bitset_clear_list(words, card, list, length);
+    } else {
+        return _scalar_bitset_clear_list(words, card, list, length);
+    }
+}
+
+uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
+                                  const uint16_t *list, uint64_t length) {
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        return _asm_bitset_set_list_withcard(words, card, list, length);
+    } else {
+        return _scalar_bitset_set_list_withcard(words, card, list, length);
+    }
+}
+
+void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) {
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        _asm_bitset_set_list(words, list, length);
+    } else {
+        _scalar_bitset_set_list(words, list, length);
+    }
+}
+#else
+uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
+                           uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *(const uint16_t *)list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load & ~(UINT64_C(1) << index);
+        card -= (load ^ newload) >> index;
+        words[offset] = newload;
+        list++;
+    }
+    return card;
+}
+
+uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
+                                  const uint16_t *list, uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load | (UINT64_C(1) << index);
+        card += (load ^ newload) >> index;
+        words[offset] = newload;
+        list++;
+    }
+    return card;
+}
+
+void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load | (UINT64_C(1) << index);
+        words[offset] = newload;
+        list++;
+    }
+}
+
+#endif
+
+/* flip specified bits */
+/* TODO: consider whether worthwhile to make an asm version */
+
+uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
+                                   const uint16_t *list, uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load ^ (UINT64_C(1) << index);
+        // todo: is a branch here all that bad?
+        card +=
+            (1 - 2 * (((UINT64_C(1) << index) & load) >> index));  // +1 or -1
+        words[offset] = newload;
+        list++;
+    }
+    return card;
+}
+
+void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) {
+    uint64_t offset, load, newload, pos, index;
+    const uint16_t *end = list + length;
+    while (list != end) {
+        pos = *list;
+        offset = pos >> 6;
+        index = pos % 64;
+        load = words[offset];
+        newload = load ^ (UINT64_C(1) << index);
+        words[offset] = newload;
+        list++;
+    }
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace api {
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif/* end file src/bitset_util.c */
+/* begin file src/containers/array.c */
+/*
+ * array.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+extern inline uint16_t array_container_minimum(const array_container_t *arr);
+extern inline uint16_t array_container_maximum(const array_container_t *arr);
+extern inline int array_container_index_equalorlarger(
+    const array_container_t *arr, uint16_t x);
+
+extern inline int array_container_rank(const array_container_t *arr,
+                                       uint16_t x);
+extern inline uint32_t array_container_rank_many(const array_container_t *arr,
+                                                 uint64_t start_rank,
+                                                 const uint32_t *begin,
+                                                 const uint32_t *end,
+                                                 uint64_t *ans);
+extern inline int array_container_get_index(const array_container_t *arr,
+                                            uint16_t x);
+extern inline bool array_container_contains(const array_container_t *arr,
+                                            uint16_t pos);
+extern inline int array_container_cardinality(const array_container_t *array);
+extern inline bool array_container_nonzero_cardinality(
+    const array_container_t *array);
+extern inline int32_t array_container_serialized_size_in_bytes(int32_t card);
+extern inline bool array_container_empty(const array_container_t *array);
+extern inline bool array_container_full(const array_container_t *array);
+
+/* Create a new array with capacity size. Return NULL in case of failure. */
+array_container_t *array_container_create_given_capacity(int32_t size) {
+    array_container_t *container;
+
+    if ((container = (array_container_t *)roaring_malloc(
+             sizeof(array_container_t))) == NULL) {
+        return NULL;
+    }
+
+    if (size <= 0) {  // we don't want to rely on malloc(0)
+        container->array = NULL;
+    } else if ((container->array = (uint16_t *)roaring_malloc(sizeof(uint16_t) *
+                                                              size)) == NULL) {
+        roaring_free(container);
+        return NULL;
+    }
+
+    container->capacity = size;
+    container->cardinality = 0;
+
+    return container;
+}
+
+/* Create a new array. Return NULL in case of failure. */
+array_container_t *array_container_create(void) {
+    return array_container_create_given_capacity(ARRAY_DEFAULT_INIT_SIZE);
+}
+
+/* Create a new array containing all values in [min,max). */
+array_container_t *array_container_create_range(uint32_t min, uint32_t max) {
+    array_container_t *answer =
+        array_container_create_given_capacity(max - min + 1);
+    if (answer == NULL) return answer;
+    answer->cardinality = 0;
+    for (uint32_t k = min; k < max; k++) {
+        answer->array[answer->cardinality++] = k;
+    }
+    return answer;
+}
+
+/* Duplicate container */
+ALLOW_UNALIGNED
+array_container_t *array_container_clone(const array_container_t *src) {
+    array_container_t *newcontainer =
+        array_container_create_given_capacity(src->capacity);
+    if (newcontainer == NULL) return NULL;
+
+    newcontainer->cardinality = src->cardinality;
+
+    memcpy(newcontainer->array, src->array,
+           src->cardinality * sizeof(uint16_t));
+
+    return newcontainer;
+}
+
+void array_container_offset(const array_container_t *c, container_t **loc,
+                            container_t **hic, uint16_t offset) {
+    array_container_t *lo = NULL, *hi = NULL;
+    int top, lo_cap, hi_cap;
+
+    top = (1 << 16) - offset;
+
+    lo_cap = count_less(c->array, c->cardinality, top);
+    if (loc && lo_cap) {
+        lo = array_container_create_given_capacity(lo_cap);
+        for (int i = 0; i < lo_cap; ++i) {
+            array_container_add(lo, c->array[i] + offset);
+        }
+        *loc = (container_t *)lo;
+    }
+
+    hi_cap = c->cardinality - lo_cap;
+    if (hic && hi_cap) {
+        hi = array_container_create_given_capacity(hi_cap);
+        for (int i = lo_cap; i < c->cardinality; ++i) {
+            array_container_add(hi, c->array[i] + offset);
+        }
+        *hic = (container_t *)hi;
+    }
+}
+
+int array_container_shrink_to_fit(array_container_t *src) {
+    if (src->cardinality == src->capacity) return 0;  // nothing to do
+    int savings = src->capacity - src->cardinality;
+    src->capacity = src->cardinality;
+    if (src->capacity ==
+        0) {  // we do not want to rely on realloc for zero allocs
+        roaring_free(src->array);
+        src->array = NULL;
+    } else {
+        uint16_t *oldarray = src->array;
+        src->array = (uint16_t *)roaring_realloc(
+            oldarray, src->capacity * sizeof(uint16_t));
+        if (src->array == NULL) roaring_free(oldarray);  // should never happen?
+    }
+    return savings;
+}
+
+/* Free memory. */
+void array_container_free(array_container_t *arr) {
+    if (arr->array !=
+        NULL) {  // Jon Strabala reports that some tools complain otherwise
+        roaring_free(arr->array);
+        arr->array = NULL;  // pedantic
+    }
+    roaring_free(arr);
+}
+
+static inline int32_t grow_capacity(int32_t capacity) {
+    return (capacity <= 0)   ? ARRAY_DEFAULT_INIT_SIZE
+           : capacity < 64   ? capacity * 2
+           : capacity < 1024 ? capacity * 3 / 2
+                             : capacity * 5 / 4;
+}
+
+static inline int32_t clamp(int32_t val, int32_t min, int32_t max) {
+    return ((val < min) ? min : (val > max) ? max : val);
+}
+
+void array_container_grow(array_container_t *container, int32_t min,
+                          bool preserve) {
+    int32_t max = (min <= DEFAULT_MAX_SIZE ? DEFAULT_MAX_SIZE : 65536);
+    int32_t new_capacity = clamp(grow_capacity(container->capacity), min, max);
+
+    container->capacity = new_capacity;
+    uint16_t *array = container->array;
+
+    if (preserve) {
+        container->array =
+            (uint16_t *)roaring_realloc(array, new_capacity * sizeof(uint16_t));
+        if (container->array == NULL) roaring_free(array);
+    } else {
+        // Jon Strabala reports that some tools complain otherwise
+        if (array != NULL) {
+            roaring_free(array);
+        }
+        container->array =
+            (uint16_t *)roaring_malloc(new_capacity * sizeof(uint16_t));
+    }
+
+    // if realloc fails, we have container->array == NULL.
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void array_container_copy(const array_container_t *src,
+                          array_container_t *dst) {
+    const int32_t cardinality = src->cardinality;
+    if (cardinality > dst->capacity) {
+        array_container_grow(dst, cardinality, false);
+    }
+
+    dst->cardinality = cardinality;
+    memcpy(dst->array, src->array, cardinality * sizeof(uint16_t));
+}
+
+void array_container_add_from_range(array_container_t *arr, uint32_t min,
+                                    uint32_t max, uint16_t step) {
+    for (uint32_t value = min; value < max; value += step) {
+        array_container_append(arr, value);
+    }
+}
+
+/* Computes the union of array1 and array2 and write the result to arrayout.
+ * It is assumed that arrayout is distinct from both array1 and array2.
+ */
+void array_container_union(const array_container_t *array_1,
+                           const array_container_t *array_2,
+                           array_container_t *out) {
+    const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality;
+    const int32_t max_cardinality = card_1 + card_2;
+
+    if (out->capacity < max_cardinality) {
+        array_container_grow(out, max_cardinality, false);
+    }
+    out->cardinality = (int32_t)fast_union_uint16(
+        array_1->array, card_1, array_2->array, card_2, out->array);
+}
+
+/* Computes the  difference of array1 and array2 and write the result
+ * to array out.
+ * Array out does not need to be distinct from array_1
+ */
+void array_container_andnot(const array_container_t *array_1,
+                            const array_container_t *array_2,
+                            array_container_t *out) {
+    if (out->capacity < array_1->cardinality)
+        array_container_grow(out, array_1->cardinality, false);
+#if CROARING_IS_X64
+    if ((croaring_hardware_support() & ROARING_SUPPORTS_AVX2) &&
+        (out != array_1) && (out != array_2)) {
+        out->cardinality = difference_vector16(
+            array_1->array, array_1->cardinality, array_2->array,
+            array_2->cardinality, out->array);
+    } else {
+        out->cardinality =
+            difference_uint16(array_1->array, array_1->cardinality,
+                              array_2->array, array_2->cardinality, out->array);
+    }
+#else
+    out->cardinality =
+        difference_uint16(array_1->array, array_1->cardinality, array_2->array,
+                          array_2->cardinality, out->array);
+#endif
+}
+
+/* Computes the symmetric difference of array1 and array2 and write the
+ * result
+ * to arrayout.
+ * It is assumed that arrayout is distinct from both array1 and array2.
+ */
+void array_container_xor(const array_container_t *array_1,
+                         const array_container_t *array_2,
+                         array_container_t *out) {
+    const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality;
+    const int32_t max_cardinality = card_1 + card_2;
+    if (out->capacity < max_cardinality) {
+        array_container_grow(out, max_cardinality, false);
+    }
+
+#if CROARING_IS_X64
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        out->cardinality =
+            xor_vector16(array_1->array, array_1->cardinality, array_2->array,
+                         array_2->cardinality, out->array);
+    } else {
+        out->cardinality =
+            xor_uint16(array_1->array, array_1->cardinality, array_2->array,
+                       array_2->cardinality, out->array);
+    }
+#else
+    out->cardinality =
+        xor_uint16(array_1->array, array_1->cardinality, array_2->array,
+                   array_2->cardinality, out->array);
+#endif
+}
+
+static inline int32_t minimum_int32(int32_t a, int32_t b) {
+    return (a < b) ? a : b;
+}
+
+/* computes the intersection of array1 and array2 and write the result to
+ * arrayout.
+ * It is assumed that arrayout is distinct from both array1 and array2.
+ * */
+void array_container_intersection(const array_container_t *array1,
+                                  const array_container_t *array2,
+                                  array_container_t *out) {
+    int32_t card_1 = array1->cardinality, card_2 = array2->cardinality,
+            min_card = minimum_int32(card_1, card_2);
+    const int threshold = 64;  // subject to tuning
+#if CROARING_IS_X64
+    if (out->capacity < min_card) {
+        array_container_grow(out, min_card + sizeof(__m128i) / sizeof(uint16_t),
+                             false);
+    }
+#else
+    if (out->capacity < min_card) {
+        array_container_grow(out, min_card, false);
+    }
+#endif
+
+    if (card_1 * threshold < card_2) {
+        out->cardinality = intersect_skewed_uint16(
+            array1->array, card_1, array2->array, card_2, out->array);
+    } else if (card_2 * threshold < card_1) {
+        out->cardinality = intersect_skewed_uint16(
+            array2->array, card_2, array1->array, card_1, out->array);
+    } else {
+#if CROARING_IS_X64
+        if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+            out->cardinality = intersect_vector16(
+                array1->array, card_1, array2->array, card_2, out->array);
+        } else {
+            out->cardinality = intersect_uint16(
+                array1->array, card_1, array2->array, card_2, out->array);
+        }
+#else
+        out->cardinality = intersect_uint16(array1->array, card_1,
+                                            array2->array, card_2, out->array);
+#endif
+    }
+}
+
+/* computes the size of the intersection of array1 and array2
+ * */
+int array_container_intersection_cardinality(const array_container_t *array1,
+                                             const array_container_t *array2) {
+    int32_t card_1 = array1->cardinality, card_2 = array2->cardinality;
+    const int threshold = 64;  // subject to tuning
+    if (card_1 * threshold < card_2) {
+        return intersect_skewed_uint16_cardinality(array1->array, card_1,
+                                                   array2->array, card_2);
+    } else if (card_2 * threshold < card_1) {
+        return intersect_skewed_uint16_cardinality(array2->array, card_2,
+                                                   array1->array, card_1);
+    } else {
+#if CROARING_IS_X64
+        if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+            return intersect_vector16_cardinality(array1->array, card_1,
+                                                  array2->array, card_2);
+        } else {
+            return intersect_uint16_cardinality(array1->array, card_1,
+                                                array2->array, card_2);
+        }
+#else
+        return intersect_uint16_cardinality(array1->array, card_1,
+                                            array2->array, card_2);
+#endif
+    }
+}
+
+bool array_container_intersect(const array_container_t *array1,
+                               const array_container_t *array2) {
+    int32_t card_1 = array1->cardinality, card_2 = array2->cardinality;
+    const int threshold = 64;  // subject to tuning
+    if (card_1 * threshold < card_2) {
+        return intersect_skewed_uint16_nonempty(array1->array, card_1,
+                                                array2->array, card_2);
+    } else if (card_2 * threshold < card_1) {
+        return intersect_skewed_uint16_nonempty(array2->array, card_2,
+                                                array1->array, card_1);
+    } else {
+        // we do not bother vectorizing
+        return intersect_uint16_nonempty(array1->array, card_1, array2->array,
+                                         card_2);
+    }
+}
+
+/* computes the intersection of array1 and array2 and write the result to
+ * array1.
+ * */
+void array_container_intersection_inplace(array_container_t *src_1,
+                                          const array_container_t *src_2) {
+    int32_t card_1 = src_1->cardinality, card_2 = src_2->cardinality;
+    const int threshold = 64;  // subject to tuning
+    if (card_1 * threshold < card_2) {
+        src_1->cardinality = intersect_skewed_uint16(
+            src_1->array, card_1, src_2->array, card_2, src_1->array);
+    } else if (card_2 * threshold < card_1) {
+        src_1->cardinality = intersect_skewed_uint16(
+            src_2->array, card_2, src_1->array, card_1, src_1->array);
+    } else {
+#if CROARING_IS_X64
+        if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+            src_1->cardinality = intersect_vector16_inplace(
+                src_1->array, card_1, src_2->array, card_2);
+        } else {
+            src_1->cardinality = intersect_uint16(
+                src_1->array, card_1, src_2->array, card_2, src_1->array);
+        }
+#else
+        src_1->cardinality = intersect_uint16(
+            src_1->array, card_1, src_2->array, card_2, src_1->array);
+#endif
+    }
+}
+
+ALLOW_UNALIGNED
+int array_container_to_uint32_array(void *vout, const array_container_t *cont,
+                                    uint32_t base) {
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (support & ROARING_SUPPORTS_AVX512) {
+        return avx512_array_container_to_uint32_array(vout, cont->array,
+                                                      cont->cardinality, base);
+    }
+#endif
+    if (support & ROARING_SUPPORTS_AVX2) {
+        return array_container_to_uint32_array_vector16(
+            vout, cont->array, cont->cardinality, base);
+    }
+#endif  // CROARING_IS_X64
+    int outpos = 0;
+    uint32_t *out = (uint32_t *)vout;
+    size_t i = 0;
+    for (; i < (size_t)cont->cardinality; ++i) {
+        const uint32_t val = base + cont->array[i];
+        memcpy(out + outpos, &val,
+               sizeof(uint32_t));  // should be compiled as a MOV on x64
+        outpos++;
+    }
+    return outpos;
+}
+
+void array_container_printf(const array_container_t *v) {
+    if (v->cardinality == 0) {
+        printf("{}");
+        return;
+    }
+    printf("{");
+    printf("%d", v->array[0]);
+    for (int i = 1; i < v->cardinality; ++i) {
+        printf(",%d", v->array[i]);
+    }
+    printf("}");
+}
+
+void array_container_printf_as_uint32_array(const array_container_t *v,
+                                            uint32_t base) {
+    if (v->cardinality == 0) {
+        return;
+    }
+    printf("%u", v->array[0] + base);
+    for (int i = 1; i < v->cardinality; ++i) {
+        printf(",%u", v->array[i] + base);
+    }
+}
+
+/*
+ * Validate the container. Returns true if valid.
+ */
+bool array_container_validate(const array_container_t *v, const char **reason) {
+    if (v->capacity < 0) {
+        *reason = "negative capacity";
+        return false;
+    }
+    if (v->cardinality < 0) {
+        *reason = "negative cardinality";
+        return false;
+    }
+    if (v->cardinality > v->capacity) {
+        *reason = "cardinality exceeds capacity";
+        return false;
+    }
+    if (v->cardinality > DEFAULT_MAX_SIZE) {
+        *reason = "cardinality exceeds DEFAULT_MAX_SIZE";
+        return false;
+    }
+    if (v->cardinality == 0) {
+        *reason = "zero cardinality";
+        return false;
+    }
+
+    if (v->array == NULL) {
+        *reason = "NULL array pointer";
+        return false;
+    }
+    uint16_t prev = v->array[0];
+    for (int i = 1; i < v->cardinality; ++i) {
+        if (v->array[i] <= prev) {
+            *reason = "array elements not strictly increasing";
+            return false;
+        }
+        prev = v->array[i];
+    }
+
+    return true;
+}
+
+/* Compute the number of runs */
+int32_t array_container_number_of_runs(const array_container_t *ac) {
+    // Can SIMD work here?
+    int32_t nr_runs = 0;
+    int32_t prev = -2;
+    for (const uint16_t *p = ac->array; p != ac->array + ac->cardinality; ++p) {
+        if (*p != prev + 1) nr_runs++;
+        prev = *p;
+    }
+    return nr_runs;
+}
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * The number of bytes written should be
+ * array_container_size_in_bytes(container).
+ *
+ */
+int32_t array_container_write(const array_container_t *container, char *buf) {
+    memcpy(buf, container->array, container->cardinality * sizeof(uint16_t));
+    return array_container_size_in_bytes(container);
+}
+
+bool array_container_is_subset(const array_container_t *container1,
+                               const array_container_t *container2) {
+    if (container1->cardinality > container2->cardinality) {
+        return false;
+    }
+    int i1 = 0, i2 = 0;
+    while (i1 < container1->cardinality && i2 < container2->cardinality) {
+        if (container1->array[i1] == container2->array[i2]) {
+            i1++;
+            i2++;
+        } else if (container1->array[i1] > container2->array[i2]) {
+            i2++;
+        } else {  // container1->array[i1] < container2->array[i2]
+            return false;
+        }
+    }
+    if (i1 == container1->cardinality) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+int32_t array_container_read(int32_t cardinality, array_container_t *container,
+                             const char *buf) {
+    if (container->capacity < cardinality) {
+        array_container_grow(container, cardinality, false);
+    }
+    container->cardinality = cardinality;
+    memcpy(container->array, buf, container->cardinality * sizeof(uint16_t));
+
+    return array_container_size_in_bytes(container);
+}
+
+bool array_container_iterate(const array_container_t *cont, uint32_t base,
+                             roaring_iterator iterator, void *ptr) {
+    for (int i = 0; i < cont->cardinality; i++)
+        if (!iterator(cont->array[i] + base, ptr)) return false;
+    return true;
+}
+
+bool array_container_iterate64(const array_container_t *cont, uint32_t base,
+                               roaring_iterator64 iterator, uint64_t high_bits,
+                               void *ptr) {
+    for (int i = 0; i < cont->cardinality; i++)
+        if (!iterator(high_bits | (uint64_t)(cont->array[i] + base), ptr))
+            return false;
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/array.c */
+/* begin file src/containers/bitset.c */
+/*
+ * bitset.c
+ *
+ */
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+extern inline int bitset_container_cardinality(
+    const bitset_container_t *bitset);
+extern inline void bitset_container_set(bitset_container_t *bitset,
+                                        uint16_t pos);
+// unused at this time:
+// extern inline void bitset_container_unset(bitset_container_t *bitset,
+// uint16_t pos);
+extern inline bool bitset_container_get(const bitset_container_t *bitset,
+                                        uint16_t pos);
+extern inline int32_t bitset_container_serialized_size_in_bytes(void);
+extern inline bool bitset_container_add(bitset_container_t *bitset,
+                                        uint16_t pos);
+extern inline bool bitset_container_remove(bitset_container_t *bitset,
+                                           uint16_t pos);
+extern inline bool bitset_container_contains(const bitset_container_t *bitset,
+                                             uint16_t pos);
+
+void bitset_container_clear(bitset_container_t *bitset) {
+    memset(bitset->words, 0, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    bitset->cardinality = 0;
+}
+
+void bitset_container_set_all(bitset_container_t *bitset) {
+    memset(bitset->words, INT64_C(-1),
+           sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    bitset->cardinality = (1 << 16);
+}
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_container_t *bitset_container_create(void) {
+    bitset_container_t *bitset =
+        (bitset_container_t *)roaring_malloc(sizeof(bitset_container_t));
+
+    if (!bitset) {
+        return NULL;
+    }
+
+    size_t align_size = 32;
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+    if (support & ROARING_SUPPORTS_AVX512) {
+        // sizeof(__m512i) == 64
+        align_size = 64;
+    } else {
+        // sizeof(__m256i) == 32
+        align_size = 32;
+    }
+#endif
+    bitset->words = (uint64_t *)roaring_aligned_malloc(
+        align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    if (!bitset->words) {
+        roaring_free(bitset);
+        return NULL;
+    }
+    bitset_container_clear(bitset);
+    return bitset;
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void bitset_container_copy(const bitset_container_t *source,
+                           bitset_container_t *dest) {
+    dest->cardinality = source->cardinality;
+    memcpy(dest->words, source->words,
+           sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+}
+
+void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
+                                     uint32_t max, uint16_t step) {
+    if (step == 0) return;   // refuse to crash
+    if ((64 % step) == 0) {  // step divides 64
+        uint64_t mask = 0;   // construct the repeated mask
+        for (uint32_t value = (min % step); value < 64; value += step) {
+            mask |= ((uint64_t)1 << value);
+        }
+        uint32_t firstword = min / 64;
+        uint32_t endword = (max - 1) / 64;
+        bitset->cardinality = (max - min + step - 1) / step;
+        if (firstword == endword) {
+            bitset->words[firstword] |=
+                mask & (((~UINT64_C(0)) << (min % 64)) &
+                        ((~UINT64_C(0)) >> ((~max + 1) % 64)));
+            return;
+        }
+        bitset->words[firstword] = mask & ((~UINT64_C(0)) << (min % 64));
+        for (uint32_t i = firstword + 1; i < endword; i++)
+            bitset->words[i] = mask;
+        bitset->words[endword] = mask & ((~UINT64_C(0)) >> ((~max + 1) % 64));
+    } else {
+        for (uint32_t value = min; value < max; value += step) {
+            bitset_container_add(bitset, value);
+        }
+    }
+}
+
+/* Free memory. */
+void bitset_container_free(bitset_container_t *bitset) {
+    if (bitset->words !=
+        NULL) {  // Jon Strabala reports that some tools complain otherwise
+        roaring_aligned_free(bitset->words);
+        bitset->words = NULL;  // pedantic
+    }
+    roaring_free(bitset);
+}
+
+/* duplicate container. */
+ALLOW_UNALIGNED
+bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
+    bitset_container_t *bitset =
+        (bitset_container_t *)roaring_malloc(sizeof(bitset_container_t));
+
+    if (!bitset) {
+        return NULL;
+    }
+
+    size_t align_size = 32;
+#if CROARING_IS_X64
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) {
+        // sizeof(__m512i) == 64
+        align_size = 64;
+    } else {
+        // sizeof(__m256i) == 32
+        align_size = 32;
+    }
+#endif
+    bitset->words = (uint64_t *)roaring_aligned_malloc(
+        align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    if (!bitset->words) {
+        roaring_free(bitset);
+        return NULL;
+    }
+    bitset->cardinality = src->cardinality;
+    memcpy(bitset->words, src->words,
+           sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    return bitset;
+}
+
+void bitset_container_offset(const bitset_container_t *c, container_t **loc,
+                             container_t **hic, uint16_t offset) {
+    bitset_container_t *bc = NULL;
+    uint64_t val;
+    uint16_t b, i, end;
+
+    b = offset >> 6;
+    i = offset % 64;
+    end = 1024 - b;
+
+    if (loc != NULL) {
+        bc = bitset_container_create();
+        if (i == 0) {
+            memcpy(bc->words + b, c->words, 8 * end);
+        } else {
+            bc->words[b] = c->words[0] << i;
+            for (uint32_t k = 1; k < end; ++k) {
+                val = c->words[k] << i;
+                val |= c->words[k - 1] >> (64 - i);
+                bc->words[b + k] = val;
+            }
+        }
+
+        bc->cardinality = bitset_container_compute_cardinality(bc);
+        if (bc->cardinality != 0) {
+            *loc = bc;
+        }
+        if (bc->cardinality == c->cardinality) {
+            return;
+        }
+    }
+
+    if (hic == NULL) {
+        // Both hic and loc can't be NULL, so bc is never NULL here
+        if (bc->cardinality == 0) {
+            bitset_container_free(bc);
+        }
+        return;
+    }
+
+    if (bc == NULL || bc->cardinality != 0) {
+        bc = bitset_container_create();
+    }
+
+    if (i == 0) {
+        memcpy(bc->words, c->words + end, 8 * b);
+    } else {
+        for (uint32_t k = end; k < 1024; ++k) {
+            val = c->words[k] << i;
+            val |= c->words[k - 1] >> (64 - i);
+            bc->words[k - end] = val;
+        }
+        bc->words[b] = c->words[1023] >> (64 - i);
+    }
+
+    bc->cardinality = bitset_container_compute_cardinality(bc);
+    if (bc->cardinality == 0) {
+        bitset_container_free(bc);
+        return;
+    }
+    *hic = bc;
+}
+
+void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
+                                uint32_t end) {
+    bitset_set_range(bitset->words, begin, end);
+    bitset->cardinality =
+        bitset_container_compute_cardinality(bitset);  // could be smarter
+}
+
+bool bitset_container_intersect(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2) {
+    // could vectorize, but this is probably already quite fast in practice
+    const uint64_t *__restrict__ words_1 = src_1->words;
+    const uint64_t *__restrict__ words_2 = src_2->words;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {
+        if ((words_1[i] & words_2[i]) != 0) return true;
+    }
+    return false;
+}
+
+#if CROARING_IS_X64
+#ifndef WORDS_IN_AVX2_REG
+#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
+#endif
+#ifndef WORDS_IN_AVX512_REG
+#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t)
+#endif
+/* Get the number of bits set (force computation) */
+static inline int _scalar_bitset_container_compute_cardinality(
+    const bitset_container_t *bitset) {
+    const uint64_t *words = bitset->words;
+    int32_t sum = 0;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
+        sum += roaring_hamming(words[i]);
+        sum += roaring_hamming(words[i + 1]);
+        sum += roaring_hamming(words[i + 2]);
+        sum += roaring_hamming(words[i + 3]);
+    }
+    return sum;
+}
+/* Get the number of bits set (force computation) */
+int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (support & ROARING_SUPPORTS_AVX512) {
+        return (int)avx512_vpopcount(
+            (const __m512i *)bitset->words,
+            BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));
+    } else
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+        if (support & ROARING_SUPPORTS_AVX2) {
+            return (int)avx2_harley_seal_popcount256(
+                (const __m256i *)bitset->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
+        } else {
+            return _scalar_bitset_container_compute_cardinality(bitset);
+        }
+}
+
+#elif defined(CROARING_USENEON)
+int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
+    uint16x8_t n0 = vdupq_n_u16(0);
+    uint16x8_t n1 = vdupq_n_u16(0);
+    uint16x8_t n2 = vdupq_n_u16(0);
+    uint16x8_t n3 = vdupq_n_u16(0);
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {
+        uint64x2_t c0 = vld1q_u64(&bitset->words[i + 0]);
+        n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));
+        uint64x2_t c1 = vld1q_u64(&bitset->words[i + 2]);
+        n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));
+        uint64x2_t c2 = vld1q_u64(&bitset->words[i + 4]);
+        n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));
+        uint64x2_t c3 = vld1q_u64(&bitset->words[i + 6]);
+        n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));
+    }
+    uint64x2_t n = vdupq_n_u64(0);
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));
+    return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);
+}
+
+#else  // CROARING_IS_X64
+
+/* Get the number of bits set (force computation) */
+int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
+    const uint64_t *words = bitset->words;
+    int32_t sum = 0;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
+        sum += roaring_hamming(words[i]);
+        sum += roaring_hamming(words[i + 1]);
+        sum += roaring_hamming(words[i + 2]);
+        sum += roaring_hamming(words[i + 3]);
+    }
+    return sum;
+}
+
+#endif  // CROARING_IS_X64
+
+#if CROARING_IS_X64
+
+#define BITSET_CONTAINER_FN_REPEAT 8
+#ifndef WORDS_IN_AVX512_REG
+#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t)
+#endif  // WORDS_IN_AVX512_REG
+
+/* Computes a binary operation (eg union) on bitset1 and bitset2 and write the
+   result to bitsetout */
+// clang-format off
+#define AVX512_BITSET_CONTAINER_FN1(before, opname, opsymbol, avx_intrinsic,   \
+                                neon_intrinsic, after)                         \
+  static inline int _avx512_bitset_container_##opname##_nocard(                \
+      const bitset_container_t *src_1, const bitset_container_t *src_2,        \
+      bitset_container_t *dst) {                                               \
+    const uint8_t * __restrict__ words_1 = (const uint8_t *)src_1->words;      \
+    const uint8_t * __restrict__ words_2 = (const uint8_t *)src_2->words;      \
+    /* not using the blocking optimization for some reason*/                   \
+    uint8_t *out = (uint8_t*)dst->words;                                       \
+    const int innerloop = 8;                                                   \
+    for (size_t i = 0;                                                         \
+        i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG);            \
+                                                         i+=innerloop) {       \
+        __m512i A1, A2, AO;                                                    \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1));                   \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2));                   \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)out, AO);                               \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 64));              \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 64));              \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+64), AO);                          \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 128));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 128));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+128), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 192));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 192));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+192), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 256));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 256));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+256), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 320));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 320));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+320), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 384));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 384));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+384), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 448));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 448));             \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm512_storeu_si512((__m512i *)(out+448), AO);                  \
+        out+=512;                                                       \
+        words_1 += 512;                                                 \
+        words_2 += 512;                                                 \
+    }                                                                   \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                      \
+    return dst->cardinality;                                            \
+  }
+
+#define AVX512_BITSET_CONTAINER_FN2(before, opname, opsymbol, avx_intrinsic,           \
+                                neon_intrinsic, after)                                 \
+  /* next, a version that updates cardinality*/                                        \
+  static inline int _avx512_bitset_container_##opname(const bitset_container_t *src_1, \
+                                      const bitset_container_t *src_2,                 \
+                                      bitset_container_t *dst) {                       \
+    const __m512i * __restrict__ words_1 = (const __m512i *) src_1->words;             \
+    const __m512i * __restrict__ words_2 = (const __m512i *) src_2->words;             \
+    __m512i *out = (__m512i *) dst->words;                                             \
+    dst->cardinality = (int32_t)avx512_harley_seal_popcount512andstore_##opname(words_2,\
+				words_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));           \
+    return dst->cardinality;                                                            \
+  }
+
+#define AVX512_BITSET_CONTAINER_FN3(before, opname, opsymbol, avx_intrinsic,            \
+                                neon_intrinsic, after)                                  \
+  /* next, a version that just computes the cardinality*/                               \
+  static inline int _avx512_bitset_container_##opname##_justcard(                       \
+      const bitset_container_t *src_1, const bitset_container_t *src_2) {               \
+    const __m512i * __restrict__ data1 = (const __m512i *) src_1->words;                \
+    const __m512i * __restrict__ data2 = (const __m512i *) src_2->words;                \
+    return (int)avx512_harley_seal_popcount512_##opname(data2,                          \
+				data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));                 \
+  }
+
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+
+#ifndef WORDS_IN_AVX2_REG
+#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
+#endif // WORDS_IN_AVX2_REG
+#define LOOP_SIZE                    \
+    BITSET_CONTAINER_SIZE_IN_WORDS / \
+        ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT)
+
+/* Computes a binary operation (eg union) on bitset1 and bitset2 and write the
+   result to bitsetout */
+// clang-format off
+#define AVX_BITSET_CONTAINER_FN1(before, opname, opsymbol, avx_intrinsic,               \
+                                neon_intrinsic, after)                                \
+  static inline int _avx2_bitset_container_##opname##_nocard(                                \
+      const bitset_container_t *src_1, const bitset_container_t *src_2,        \
+      bitset_container_t *dst) {                                               \
+    const uint8_t *__restrict__ words_1 = (const uint8_t *)src_1->words;       \
+    const uint8_t *__restrict__ words_2 = (const uint8_t *)src_2->words;       \
+    /* not using the blocking optimization for some reason*/                   \
+    uint8_t *out = (uint8_t *)dst->words;                                      \
+    const int innerloop = 8;                                                   \
+    for (size_t i = 0;                                                         \
+         i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG);             \
+         i += innerloop) {                                                     \
+      __m256i A1, A2, AO;                                                      \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1));                     \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2));                     \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)out, AO);                                 \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 32));                \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 32));                \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 32), AO);                          \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 64));                \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 64));                \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 64), AO);                          \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 96));                \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 96));                \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 96), AO);                          \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 128));               \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 128));               \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 128), AO);                         \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 160));               \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 160));               \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 160), AO);                         \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 192));               \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 192));               \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 192), AO);                         \
+      A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 224));               \
+      A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 224));               \
+      AO = avx_intrinsic(A2, A1);                                              \
+      _mm256_storeu_si256((__m256i *)(out + 224), AO);                         \
+      out += 256;                                                              \
+      words_1 += 256;                                                          \
+      words_2 += 256;                                                          \
+    }                                                                          \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                             \
+    return dst->cardinality;                                                   \
+  }
+
+#define AVX_BITSET_CONTAINER_FN2(before, opname, opsymbol, avx_intrinsic,               \
+                                neon_intrinsic, after)                                \
+  /* next, a version that updates cardinality*/                                \
+  static inline int _avx2_bitset_container_##opname(const bitset_container_t *src_1,         \
+                                      const bitset_container_t *src_2,         \
+                                      bitset_container_t *dst) {               \
+    const __m256i *__restrict__ words_1 = (const __m256i *)src_1->words;       \
+    const __m256i *__restrict__ words_2 = (const __m256i *)src_2->words;       \
+    __m256i *out = (__m256i *)dst->words;                                      \
+    dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname( \
+        words_2, words_1, out,                                                 \
+        BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));                 \
+    return dst->cardinality;                                                   \
+  }                                                                            \
+
+#define AVX_BITSET_CONTAINER_FN3(before, opname, opsymbol, avx_intrinsic,               \
+                                neon_intrinsic, after)                                \
+  /* next, a version that just computes the cardinality*/                      \
+  static inline int _avx2_bitset_container_##opname##_justcard(                              \
+      const bitset_container_t *src_1, const bitset_container_t *src_2) {      \
+    const __m256i *__restrict__ data1 = (const __m256i *)src_1->words;         \
+    const __m256i *__restrict__ data2 = (const __m256i *)src_2->words;         \
+    return (int)avx2_harley_seal_popcount256_##opname(                         \
+        data2, data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));   \
+  }
+
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+CROARING_TARGET_AVX2
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
+
+
+#define SCALAR_BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic,            \
+                                   neon_intrinsic)                             \
+  static inline int _scalar_bitset_container_##opname(const bitset_container_t *src_1,       \
+                                        const bitset_container_t *src_2,       \
+                                        bitset_container_t *dst) {             \
+    const uint64_t *__restrict__ words_1 = src_1->words;                       \
+    const uint64_t *__restrict__ words_2 = src_2->words;                       \
+    uint64_t *out = dst->words;                                                \
+    int32_t sum = 0;                                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {           \
+      const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),                \
+                     word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]);       \
+      out[i] = word_1;                                                         \
+      out[i + 1] = word_2;                                                     \
+      sum += roaring_hamming(word_1);                                                  \
+      sum += roaring_hamming(word_2);                                                  \
+    }                                                                          \
+    dst->cardinality = sum;                                                    \
+    return dst->cardinality;                                                   \
+  }                                                                            \
+  static inline int _scalar_bitset_container_##opname##_nocard(                              \
+      const bitset_container_t *src_1, const bitset_container_t *src_2,        \
+      bitset_container_t *dst) {                                               \
+    const uint64_t *__restrict__ words_1 = src_1->words;                       \
+    const uint64_t *__restrict__ words_2 = src_2->words;                       \
+    uint64_t *out = dst->words;                                                \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {              \
+      out[i] = (words_1[i])opsymbol(words_2[i]);                               \
+    }                                                                          \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                             \
+    return dst->cardinality;                                                   \
+  }                                                                            \
+  static inline int _scalar_bitset_container_##opname##_justcard(                            \
+      const bitset_container_t *src_1, const bitset_container_t *src_2) {      \
+    const uint64_t *__restrict__ words_1 = src_1->words;                       \
+    const uint64_t *__restrict__ words_2 = src_2->words;                       \
+    int32_t sum = 0;                                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {           \
+      const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),                \
+                     word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]);       \
+      sum += roaring_hamming(word_1);                                                  \
+      sum += roaring_hamming(word_2);                                                  \
+    }                                                                          \
+    return sum;                                                                \
+  }
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+SCALAR_BITSET_CONTAINER_FN(or,    |, _mm256_or_si256, vorrq_u64)
+SCALAR_BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64)
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+SCALAR_BITSET_CONTAINER_FN(and,          &, _mm256_and_si256, vandq_u64)
+SCALAR_BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)
+
+SCALAR_BITSET_CONTAINER_FN(xor,    ^,  _mm256_xor_si256,    veorq_u64)
+SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)   \
+  int bitset_container_##opname(const bitset_container_t *src_1,               \
+                                const bitset_container_t *src_2,               \
+                                bitset_container_t *dst) {                     \
+    int support = croaring_hardware_support();                                 \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname(src_1, src_2, dst);             \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
+      return _avx2_bitset_container_##opname(src_1, src_2, dst);               \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname(src_1, src_2, dst);             \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_nocard(const bitset_container_t *src_1,      \
+                                         const bitset_container_t *src_2,      \
+                                         bitset_container_t *dst) {            \
+    int support = croaring_hardware_support();                                 \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
+      return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst);      \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_justcard(const bitset_container_t *src_1,    \
+                                           const bitset_container_t *src_2) {  \
+     int support = croaring_hardware_support();                                \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname##_justcard(src_1, src_2);       \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
+      return _avx2_bitset_container_##opname##_justcard(src_1, src_2);         \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_justcard(src_1, src_2);       \
+    }                                                                          \
+  }
+
+#else // CROARING_COMPILER_SUPPORTS_AVX512
+
+
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)   \
+  int bitset_container_##opname(const bitset_container_t *src_1,               \
+                                const bitset_container_t *src_2,               \
+                                bitset_container_t *dst) {                     \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname(src_1, src_2, dst);               \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname(src_1, src_2, dst);             \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_nocard(const bitset_container_t *src_1,      \
+                                         const bitset_container_t *src_2,      \
+                                         bitset_container_t *dst) {            \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst);      \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_justcard(const bitset_container_t *src_1,    \
+                                           const bitset_container_t *src_2) {  \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname##_justcard(src_1, src_2);         \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_justcard(src_1, src_2);       \
+    }                                                                          \
+  }
+
+#endif //  CROARING_COMPILER_SUPPORTS_AVX512
+
+#elif defined(CROARING_USENEON)
+
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
+int bitset_container_##opname(const bitset_container_t *src_1,                \
+                              const bitset_container_t *src_2,                \
+                              bitset_container_t *dst) {                      \
+    const uint64_t * __restrict__ words_1 = src_1->words;                     \
+    const uint64_t * __restrict__ words_2 = src_2->words;                     \
+    uint64_t *out = dst->words;                                               \
+    uint16x8_t n0 = vdupq_n_u16(0);                                           \
+    uint16x8_t n1 = vdupq_n_u16(0);                                           \
+    uint16x8_t n2 = vdupq_n_u16(0);                                           \
+    uint16x8_t n3 = vdupq_n_u16(0);                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
+        uint64x2_t c0 = neon_intrinsic(vld1q_u64(&words_1[i + 0]),            \
+                                       vld1q_u64(&words_2[i + 0]));           \
+        n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));   \
+        vst1q_u64(&out[i + 0], c0);                                           \
+        uint64x2_t c1 = neon_intrinsic(vld1q_u64(&words_1[i + 2]),            \
+                                       vld1q_u64(&words_2[i + 2]));           \
+        n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));   \
+        vst1q_u64(&out[i + 2], c1);                                           \
+        uint64x2_t c2 = neon_intrinsic(vld1q_u64(&words_1[i + 4]),            \
+                                       vld1q_u64(&words_2[i + 4]));           \
+        n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));   \
+        vst1q_u64(&out[i + 4], c2);                                           \
+        uint64x2_t c3 = neon_intrinsic(vld1q_u64(&words_1[i + 6]),            \
+                                       vld1q_u64(&words_2[i + 6]));           \
+        n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));   \
+        vst1q_u64(&out[i + 6], c3);                                           \
+    }                                                                         \
+    uint64x2_t n = vdupq_n_u64(0);                                            \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));                           \
+    dst->cardinality = vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);           \
+    return dst->cardinality;                                                  \
+}                                                                             \
+int bitset_container_##opname##_nocard(const bitset_container_t *src_1,       \
+                                       const bitset_container_t *src_2,       \
+                                             bitset_container_t *dst) {       \
+    const uint64_t * __restrict__ words_1 = src_1->words;                     \
+    const uint64_t * __restrict__ words_2 = src_2->words;                     \
+    uint64_t *out = dst->words;                                               \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
+        vst1q_u64(&out[i + 0], neon_intrinsic(vld1q_u64(&words_1[i + 0]),     \
+                                              vld1q_u64(&words_2[i + 0])));   \
+        vst1q_u64(&out[i + 2], neon_intrinsic(vld1q_u64(&words_1[i + 2]),     \
+                                              vld1q_u64(&words_2[i + 2])));   \
+        vst1q_u64(&out[i + 4], neon_intrinsic(vld1q_u64(&words_1[i + 4]),     \
+                                              vld1q_u64(&words_2[i + 4])));   \
+        vst1q_u64(&out[i + 6], neon_intrinsic(vld1q_u64(&words_1[i + 6]),     \
+                                              vld1q_u64(&words_2[i + 6])));   \
+    }                                                                         \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                            \
+    return dst->cardinality;                                                  \
+}                                                                             \
+int bitset_container_##opname##_justcard(const bitset_container_t *src_1,     \
+                                         const bitset_container_t *src_2) {   \
+    const uint64_t * __restrict__ words_1 = src_1->words;                     \
+    const uint64_t * __restrict__ words_2 = src_2->words;                     \
+    uint16x8_t n0 = vdupq_n_u16(0);                                           \
+    uint16x8_t n1 = vdupq_n_u16(0);                                           \
+    uint16x8_t n2 = vdupq_n_u16(0);                                           \
+    uint16x8_t n3 = vdupq_n_u16(0);                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
+        uint64x2_t c0 = neon_intrinsic(vld1q_u64(&words_1[i + 0]),            \
+                                       vld1q_u64(&words_2[i + 0]));           \
+        n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));   \
+        uint64x2_t c1 = neon_intrinsic(vld1q_u64(&words_1[i + 2]),            \
+                                       vld1q_u64(&words_2[i + 2]));           \
+        n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));   \
+        uint64x2_t c2 = neon_intrinsic(vld1q_u64(&words_1[i + 4]),            \
+                                       vld1q_u64(&words_2[i + 4]));           \
+        n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));   \
+        uint64x2_t c3 = neon_intrinsic(vld1q_u64(&words_1[i + 6]),            \
+                                       vld1q_u64(&words_2[i + 6]));           \
+        n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));   \
+    }                                                                         \
+    uint64x2_t n = vdupq_n_u64(0);                                            \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));                           \
+    n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));                           \
+    return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);                       \
+}
+
+#else
+
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
+int bitset_container_##opname(const bitset_container_t *src_1,            \
+                              const bitset_container_t *src_2,            \
+                              bitset_container_t *dst) {                  \
+    const uint64_t * __restrict__ words_1 = src_1->words;                 \
+    const uint64_t * __restrict__ words_2 = src_2->words;                 \
+    uint64_t *out = dst->words;                                           \
+    int32_t sum = 0;                                                      \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
+        const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),         \
+                       word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \
+        out[i] = word_1;                                                  \
+        out[i + 1] = word_2;                                              \
+        sum += roaring_hamming(word_1);                                    \
+        sum += roaring_hamming(word_2);                                    \
+    }                                                                     \
+    dst->cardinality = sum;                                               \
+    return dst->cardinality;                                              \
+}                                                                         \
+int bitset_container_##opname##_nocard(const bitset_container_t *src_1,   \
+                                       const bitset_container_t *src_2,   \
+                                       bitset_container_t *dst) {         \
+    const uint64_t * __restrict__ words_1 = src_1->words;                 \
+    const uint64_t * __restrict__ words_2 = src_2->words;                 \
+    uint64_t *out = dst->words;                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {         \
+        out[i] = (words_1[i])opsymbol(words_2[i]);                        \
+    }                                                                     \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                        \
+    return dst->cardinality;                                              \
+}                                                                         \
+int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
+                              const bitset_container_t *src_2) {          \
+   printf("A1\n"); const uint64_t * __restrict__ words_1 = src_1->words;                 \
+    const uint64_t * __restrict__ words_2 = src_2->words;                 \
+    int32_t sum = 0;                                                      \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
+        const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),         \
+                       word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \
+        sum += roaring_hamming(word_1);                                    \
+        sum += roaring_hamming(word_2);                                    \
+    }                                                                     \
+    return sum;                                                           \
+}
+
+#endif // CROARING_IS_X64
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+BITSET_CONTAINER_FN(or,    |, _mm256_or_si256, vorrq_u64)
+BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64)
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+BITSET_CONTAINER_FN(and,          &, _mm256_and_si256, vandq_u64)
+BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)
+
+BITSET_CONTAINER_FN(xor,    ^,  _mm256_xor_si256,    veorq_u64)
+BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
+// clang-format On
+
+
+ALLOW_UNALIGNED
+int bitset_container_to_uint32_array(
+    uint32_t *out,
+    const bitset_container_t *bc,
+    uint32_t base
+){
+#if CROARING_IS_X64
+   int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+   if(( support & ROARING_SUPPORTS_AVX512 ) &&  (bc->cardinality >= 8192))  // heuristic
+		return (int) bitset_extract_setbits_avx512(bc->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base);
+   else
+#endif
+   if(( support & ROARING_SUPPORTS_AVX2 ) &&  (bc->cardinality >= 8192))  // heuristic
+		return (int) bitset_extract_setbits_avx2(bc->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base);
+	else
+		return (int) bitset_extract_setbits(bc->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS, out, base);
+#else
+	return (int) bitset_extract_setbits(bc->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS, out, base);
+#endif
+}
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void bitset_container_printf(const bitset_container_t * v) {
+	printf("{");
+	uint32_t base = 0;
+	bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
+	for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
+		uint64_t w = v->words[i];
+		while (w != 0) {
+			uint64_t t = w & (~w + 1);
+			int r = roaring_trailing_zeroes(w);
+			if(iamfirst) {// predicted to be false
+				printf("%u",base + r);
+				iamfirst = false;
+			} else {
+				printf(",%u",base + r);
+			}
+			w ^= t;
+		}
+		base += 64;
+	}
+	printf("}");
+}
+
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit integers starting at base.
+ */
+void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) {
+	bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
+	for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
+		uint64_t w = v->words[i];
+		while (w != 0) {
+			uint64_t t = w & (~w + 1);
+			int r = roaring_trailing_zeroes(w);
+			if(iamfirst) {// predicted to be false
+				printf("%u", r + base);
+				iamfirst = false;
+			} else {
+				printf(",%u",r + base);
+			}
+			w ^= t;
+		}
+		base += 64;
+	}
+}
+
+/*
+ * Validate the container. Returns true if valid.
+ */
+bool bitset_container_validate(const bitset_container_t *v, const char **reason) {
+    if (v->words == NULL) {
+        *reason = "words is NULL";
+        return false;
+    }
+    if (v->cardinality != bitset_container_compute_cardinality(v)) {
+        *reason = "cardinality is incorrect";
+        return false;
+    }
+    if (v->cardinality <= DEFAULT_MAX_SIZE) {
+        *reason = "cardinality is too small for a bitmap container";
+        return false;
+    }
+    // Attempt to forcibly load the first and last words, hopefully causing
+    // a segfault or an address sanitizer error if words is not allocated.
+    volatile uint64_t *words = v->words;
+    (void) words[0];
+    (void) words[BITSET_CONTAINER_SIZE_IN_WORDS - 1];
+    return true;
+}
+
+
+// TODO: use the fast lower bound, also
+int bitset_container_number_of_runs(bitset_container_t *bc) {
+  int num_runs = 0;
+  uint64_t next_word = bc->words[0];
+
+  for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) {
+    uint64_t word = next_word;
+    next_word = bc->words[i+1];
+    num_runs += roaring_hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
+  }
+
+  uint64_t word = next_word;
+  num_runs += roaring_hamming((~word) & (word << 1));
+  if((word & 0x8000000000000000ULL) != 0)
+    num_runs++;
+  return num_runs;
+}
+
+
+int32_t bitset_container_write(const bitset_container_t *container,
+                                  char *buf) {
+	memcpy(buf, container->words, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
+	return bitset_container_size_in_bytes(container);
+}
+
+
+int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container,
+		const char *buf)  {
+	container->cardinality = cardinality;
+	memcpy(container->words, buf, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
+	return bitset_container_size_in_bytes(container);
+}
+
+bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) {
+  for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
+    uint64_t w = cont->words[i];
+    while (w != 0) {
+      uint64_t t = w & (~w + 1);
+      int r = roaring_trailing_zeroes(w);
+      if(!iterator(r + base, ptr)) return false;
+      w ^= t;
+    }
+    base += 64;
+  }
+  return true;
+}
+
+bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) {
+  for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
+    uint64_t w = cont->words[i];
+    while (w != 0) {
+      uint64_t t = w & (~w + 1);
+      int r = roaring_trailing_zeroes(w);
+      if(!iterator(high_bits | (uint64_t)(r + base), ptr)) return false;
+      w ^= t;
+    }
+    base += 64;
+  }
+  return true;
+}
+
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+static inline bool _avx512_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
+  const __m512i *ptr1 = (const __m512i*)container1->words;
+  const __m512i *ptr2 = (const __m512i*)container2->words;
+  for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/64; i++) {
+      __m512i r1 = _mm512_loadu_si512(ptr1+i);
+      __m512i r2 = _mm512_loadu_si512(ptr2+i);
+      __mmask64 mask = _mm512_cmpeq_epi8_mask(r1, r2);
+      if ((uint64_t)mask != UINT64_MAX) {
+          return false;
+      }
+  }
+	return true;
+}
+CROARING_UNTARGET_AVX512
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX2
+ALLOW_UNALIGNED
+static inline bool _avx2_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
+    const __m256i *ptr1 = (const __m256i*)container1->words;
+    const __m256i *ptr2 = (const __m256i*)container2->words;
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32; i++) {
+      __m256i r1 = _mm256_loadu_si256(ptr1+i);
+      __m256i r2 = _mm256_loadu_si256(ptr2+i);
+      int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
+      if ((uint32_t)mask != UINT32_MAX) {
+          return false;
+      }
+  }
+	return true;
+}
+CROARING_UNTARGET_AVX2
+#endif // CROARING_IS_X64
+
+ALLOW_UNALIGNED
+bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
+  if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
+    if(container1->cardinality != container2->cardinality) {
+      return false;
+    }
+    if (container1->cardinality == INT32_C(0x10000)) {
+      return true;
+    }
+  }
+#if CROARING_IS_X64
+  int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+  if( support & ROARING_SUPPORTS_AVX512 ) {
+    return _avx512_bitset_container_equals(container1, container2);
+  }
+  else
+#endif
+  if( support & ROARING_SUPPORTS_AVX2 ) {
+    return _avx2_bitset_container_equals(container1, container2);
+  }
+#endif
+  return memcmp(container1->words,
+                container2->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)) == 0;
+}
+
+bool bitset_container_is_subset(const bitset_container_t *container1,
+                          const bitset_container_t *container2) {
+    if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
+        if(container1->cardinality > container2->cardinality) {
+            return false;
+        }
+    }
+    for(int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
+		if((container1->words[i] & container2->words[i]) != container1->words[i]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) {
+    int card = bitset_container_cardinality(container);
+    if(rank >= *start_rank + card) {
+        *start_rank += card;
+        return false;
+    }
+    const uint64_t *words = container->words;
+    int32_t size;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
+        size = roaring_hamming(words[i]);
+        if(rank <= *start_rank + size) {
+            uint64_t w = container->words[i];
+            uint16_t base = i*64;
+            while (w != 0) {
+                uint64_t t = w & (~w + 1);
+                int r = roaring_trailing_zeroes(w);
+                if(*start_rank == rank) {
+                    *element = r+base;
+                    return true;
+                }
+                w ^= t;
+                *start_rank += 1;
+            }
+        }
+        else
+            *start_rank += size;
+    }
+    assert(false);
+    roaring_unreachable;
+}
+
+
+/* Returns the smallest value (assumes not empty) */
+uint16_t bitset_container_minimum(const bitset_container_t *container) {
+  for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
+    uint64_t w = container->words[i];
+    if (w != 0) {
+      int r = roaring_trailing_zeroes(w);
+      return r + i * 64;
+    }
+  }
+  return UINT16_MAX;
+}
+
+/* Returns the largest value (assumes not empty) */
+uint16_t bitset_container_maximum(const bitset_container_t *container) {
+  for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i > 0; --i ) {
+    uint64_t w = container->words[i];
+    if (w != 0) {
+      int r = roaring_leading_zeroes(w);
+      return i * 64 + 63  - r;
+    }
+  }
+  return 0;
+}
+
+/* Returns the number of values equal or smaller than x */
+int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
+  // credit: aqrit
+  int sum = 0;
+  int i = 0;
+  for (int end = x / 64; i < end; i++){
+    sum += roaring_hamming(container->words[i]);
+  }
+  uint64_t lastword = container->words[i];
+  uint64_t lastpos = UINT64_C(1) << (x % 64);
+  uint64_t mask = lastpos + lastpos - 1; // smear right
+  sum += roaring_hamming(lastword & mask);
+  return sum;
+}
+
+uint32_t bitset_container_rank_many(const bitset_container_t *container, uint64_t start_rank, const uint32_t* begin, const uint32_t* end, uint64_t* ans){
+  const uint16_t high = (uint16_t)((*begin) >> 16);
+  int i = 0;
+  int sum = 0;
+  const uint32_t* iter = begin;
+  for(; iter != end; iter++) {
+      uint32_t x = *iter;
+      uint16_t xhigh = (uint16_t)(x >> 16);
+      if(xhigh != high) return iter - begin; // stop at next container
+
+      uint16_t xlow = (uint16_t)x;
+      for(int count = xlow / 64; i < count; i++){
+        sum += roaring_hamming(container->words[i]);
+      }
+      uint64_t lastword = container->words[i];
+      uint64_t lastpos = UINT64_C(1) << (xlow % 64);
+      uint64_t mask = lastpos + lastpos - 1; // smear right
+      *(ans++) = start_rank + sum + roaring_hamming(lastword & mask);
+  }
+  return iter - begin;
+}
+
+
+/* Returns the index of x , if not exsist return -1 */
+int bitset_container_get_index(const bitset_container_t *container, uint16_t x) {
+  if (bitset_container_get(container, x)) {
+    // credit: aqrit
+    int sum = 0;
+    int i = 0;
+    for (int end = x / 64; i < end; i++){
+      sum += roaring_hamming(container->words[i]);
+    }
+    uint64_t lastword = container->words[i];
+    uint64_t lastpos = UINT64_C(1) << (x % 64);
+    uint64_t mask = lastpos + lastpos - 1; // smear right
+    sum += roaring_hamming(lastword & mask);
+    return sum - 1;
+  } else {
+    return -1;
+  }
+}
+
+/* Returns the index of the first value equal or larger than x, or -1 */
+int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
+  uint32_t x32 = x;
+  uint32_t k = x32 / 64;
+  uint64_t word = container->words[k];
+  const int diff = x32 - k * 64; // in [0,64)
+  word = (word >> diff) << diff; // a mask is faster, but we don't care
+  while(word == 0) {
+    k++;
+    if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1;
+    word = container->words[k];
+  }
+  return k * 64 + roaring_trailing_zeroes(word);
+}
+
+#ifdef __cplusplus
+} } }  // extern "C" { namespace roaring { namespace internal {
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif/* end file src/containers/bitset.c */
+/* begin file src/containers/containers.c */
+
+
+#ifdef __cplusplus
+extern "C" {
+// In Windows MSVC C++ compiler, (type){init} does not compile,
+// it causes C4576: a parenthesized type followed by an initializer list is a
+// non-standard explicit type conversion syntax The correct syntax is type{init}
+#define ROARING_INIT_ROARING_CONTAINER_ITERATOR_T roaring_container_iterator_t
+namespace roaring {
+namespace internal {
+#else
+#define ROARING_INIT_ROARING_CONTAINER_ITERATOR_T (roaring_container_iterator_t)
+#endif
+
+static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) {
+    return (a < b) ? a : b;
+}
+
+extern inline const container_t *container_unwrap_shared(
+    const container_t *candidate_shared_container, uint8_t *type);
+
+extern inline container_t *container_mutable_unwrap_shared(
+    container_t *candidate_shared_container, uint8_t *type);
+
+extern inline int container_get_cardinality(const container_t *c,
+                                            uint8_t typecode);
+
+extern inline container_t *container_iand(container_t *c1, uint8_t type1,
+                                          const container_t *c2, uint8_t type2,
+                                          uint8_t *result_type);
+
+extern inline container_t *container_ior(container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type);
+
+extern inline container_t *container_ixor(container_t *c1, uint8_t type1,
+                                          const container_t *c2, uint8_t type2,
+                                          uint8_t *result_type);
+
+extern inline container_t *container_iandnot(container_t *c1, uint8_t type1,
+                                             const container_t *c2,
+                                             uint8_t type2,
+                                             uint8_t *result_type);
+
+void container_free(container_t *c, uint8_t type) {
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            bitset_container_free(CAST_bitset(c));
+            break;
+        case ARRAY_CONTAINER_TYPE:
+            array_container_free(CAST_array(c));
+            break;
+        case RUN_CONTAINER_TYPE:
+            run_container_free(CAST_run(c));
+            break;
+        case SHARED_CONTAINER_TYPE:
+            shared_container_free(CAST_shared(c));
+            break;
+        default:
+            assert(false);
+            roaring_unreachable;
+    }
+}
+
+void container_printf(const container_t *c, uint8_t type) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            bitset_container_printf(const_CAST_bitset(c));
+            return;
+        case ARRAY_CONTAINER_TYPE:
+            array_container_printf(const_CAST_array(c));
+            return;
+        case RUN_CONTAINER_TYPE:
+            run_container_printf(const_CAST_run(c));
+            return;
+        default:
+            roaring_unreachable;
+    }
+}
+
+void container_printf_as_uint32_array(const container_t *c, uint8_t typecode,
+                                      uint32_t base) {
+    c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            bitset_container_printf_as_uint32_array(const_CAST_bitset(c), base);
+            return;
+        case ARRAY_CONTAINER_TYPE:
+            array_container_printf_as_uint32_array(const_CAST_array(c), base);
+            return;
+        case RUN_CONTAINER_TYPE:
+            run_container_printf_as_uint32_array(const_CAST_run(c), base);
+            return;
+        default:
+            roaring_unreachable;
+    }
+}
+
+bool container_internal_validate(const container_t *container, uint8_t typecode,
+                                 const char **reason) {
+    if (container == NULL) {
+        *reason = "container is NULL";
+        return false;
+    }
+    // Not using container_unwrap_shared because it asserts if shared containers
+    // are nested
+    if (typecode == SHARED_CONTAINER_TYPE) {
+        const shared_container_t *shared_container =
+            const_CAST_shared(container);
+        if (croaring_refcount_get(&shared_container->counter) == 0) {
+            *reason = "shared container has zero refcount";
+            return false;
+        }
+        if (shared_container->typecode == SHARED_CONTAINER_TYPE) {
+            *reason = "shared container is nested";
+            return false;
+        }
+        if (shared_container->container == NULL) {
+            *reason = "shared container has NULL container";
+            return false;
+        }
+        container = shared_container->container;
+        typecode = shared_container->typecode;
+    }
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_validate(const_CAST_bitset(container),
+                                             reason);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_validate(const_CAST_array(container),
+                                            reason);
+        case RUN_CONTAINER_TYPE:
+            return run_container_validate(const_CAST_run(container), reason);
+        default:
+            *reason = "invalid typecode";
+            return false;
+    }
+}
+
+extern inline bool container_nonzero_cardinality(const container_t *c,
+                                                 uint8_t typecode);
+
+extern inline int container_to_uint32_array(uint32_t *output,
+                                            const container_t *c,
+                                            uint8_t typecode, uint32_t base);
+
+extern inline container_t *container_add(container_t *c, uint16_t val,
+                                         uint8_t typecode,  // !!! 2nd arg?
+                                         uint8_t *new_typecode);
+
+extern inline bool container_contains(const container_t *c, uint16_t val,
+                                      uint8_t typecode);  // !!! 2nd arg?
+
+extern inline container_t *container_and(const container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type);
+
+extern inline container_t *container_or(const container_t *c1, uint8_t type1,
+                                        const container_t *c2, uint8_t type2,
+                                        uint8_t *result_type);
+
+extern inline container_t *container_xor(const container_t *c1, uint8_t type1,
+                                         const container_t *c2, uint8_t type2,
+                                         uint8_t *result_type);
+
+container_t *get_copy_of_container(container_t *c, uint8_t *typecode,
+                                   bool copy_on_write) {
+    if (copy_on_write) {
+        shared_container_t *shared_container;
+        if (*typecode == SHARED_CONTAINER_TYPE) {
+            shared_container = CAST_shared(c);
+            croaring_refcount_inc(&shared_container->counter);
+            return shared_container;
+        }
+        assert(*typecode != SHARED_CONTAINER_TYPE);
+
+        if ((shared_container = (shared_container_t *)roaring_malloc(
+                 sizeof(shared_container_t))) == NULL) {
+            return NULL;
+        }
+
+        shared_container->container = c;
+        shared_container->typecode = *typecode;
+        // At this point, we are creating new shared container
+        // so there should be no other references, and setting
+        // the counter to 2 - even non-atomically - is safe as
+        // long as the value is set before the return statement.
+        shared_container->counter = 2;
+        *typecode = SHARED_CONTAINER_TYPE;
+
+        return shared_container;
+    }  // copy_on_write
+    // otherwise, no copy on write...
+    const container_t *actual_container = container_unwrap_shared(c, typecode);
+    assert(*typecode != SHARED_CONTAINER_TYPE);
+    return container_clone(actual_container, *typecode);
+}
+
+/**
+ * Copies a container, requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ */
+container_t *container_clone(const container_t *c, uint8_t typecode) {
+    // We do not want to allow cloning of shared containers.
+    // c = container_unwrap_shared(c, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_clone(const_CAST_bitset(c));
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_clone(const_CAST_array(c));
+        case RUN_CONTAINER_TYPE:
+            return run_container_clone(const_CAST_run(c));
+        case SHARED_CONTAINER_TYPE:
+            // Shared containers are not cloneable. Are you mixing COW and
+            // non-COW bitmaps?
+            return NULL;
+        default:
+            assert(false);
+            roaring_unreachable;
+            return NULL;
+    }
+}
+
+container_t *shared_container_extract_copy(shared_container_t *sc,
+                                           uint8_t *typecode) {
+    assert(sc->typecode != SHARED_CONTAINER_TYPE);
+    *typecode = sc->typecode;
+    container_t *answer;
+    if (croaring_refcount_dec(&sc->counter)) {
+        answer = sc->container;
+        sc->container = NULL;  // paranoid
+        roaring_free(sc);
+    } else {
+        answer = container_clone(sc->container, *typecode);
+    }
+    assert(*typecode != SHARED_CONTAINER_TYPE);
+    return answer;
+}
+
+void shared_container_free(shared_container_t *container) {
+    if (croaring_refcount_dec(&container->counter)) {
+        assert(container->typecode != SHARED_CONTAINER_TYPE);
+        container_free(container->container, container->typecode);
+        container->container = NULL;  // paranoid
+        roaring_free(container);
+    }
+}
+
+extern inline container_t *container_not(const container_t *c1, uint8_t type1,
+                                         uint8_t *result_type);
+
+extern inline container_t *container_not_range(const container_t *c1,
+                                               uint8_t type1,
+                                               uint32_t range_start,
+                                               uint32_t range_end,
+                                               uint8_t *result_type);
+
+extern inline container_t *container_inot(container_t *c1, uint8_t type1,
+                                          uint8_t *result_type);
+
+extern inline container_t *container_inot_range(container_t *c1, uint8_t type1,
+                                                uint32_t range_start,
+                                                uint32_t range_end,
+                                                uint8_t *result_type);
+
+extern inline container_t *container_range_of_ones(uint32_t range_start,
+                                                   uint32_t range_end,
+                                                   uint8_t *result_type);
+
+// where are the correponding things for union and intersection??
+extern inline container_t *container_lazy_xor(const container_t *c1,
+                                              uint8_t type1,
+                                              const container_t *c2,
+                                              uint8_t type2,
+                                              uint8_t *result_type);
+
+extern inline container_t *container_lazy_ixor(container_t *c1, uint8_t type1,
+                                               const container_t *c2,
+                                               uint8_t type2,
+                                               uint8_t *result_type);
+
+extern inline container_t *container_andnot(const container_t *c1,
+                                            uint8_t type1,
+                                            const container_t *c2,
+                                            uint8_t type2,
+                                            uint8_t *result_type);
+
+roaring_container_iterator_t container_init_iterator(const container_t *c,
+                                                     uint8_t typecode,
+                                                     uint16_t *value) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            uint32_t wordindex = 0;
+            uint64_t word;
+            while ((word = bc->words[wordindex]) == 0) {
+                wordindex++;
+            }
+            // word is non-zero
+            int32_t index = wordindex * 64 + roaring_trailing_zeroes(word);
+            *value = index;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = index,
+            };
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            *value = ac->array[0];
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = 0,
+            };
+        }
+        case RUN_CONTAINER_TYPE: {
+            const run_container_t *rc = const_CAST_run(c);
+            *value = rc->runs[0].value;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = 0,
+            };
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{0};
+    }
+}
+
+roaring_container_iterator_t container_init_iterator_last(const container_t *c,
+                                                          uint8_t typecode,
+                                                          uint16_t *value) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            uint32_t wordindex = BITSET_CONTAINER_SIZE_IN_WORDS - 1;
+            uint64_t word;
+            while ((word = bc->words[wordindex]) == 0) {
+                wordindex--;
+            }
+            // word is non-zero
+            int32_t index =
+                wordindex * 64 + (63 - roaring_leading_zeroes(word));
+            *value = index;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = index,
+            };
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            int32_t index = ac->cardinality - 1;
+            *value = ac->array[index];
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = index,
+            };
+        }
+        case RUN_CONTAINER_TYPE: {
+            const run_container_t *rc = const_CAST_run(c);
+            int32_t run_index = rc->n_runs - 1;
+            const rle16_t *last_run = &rc->runs[run_index];
+            *value = last_run->value + last_run->length;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{
+                .index = run_index,
+            };
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return ROARING_INIT_ROARING_CONTAINER_ITERATOR_T{0};
+    }
+}
+
+bool container_iterator_next(const container_t *c, uint8_t typecode,
+                             roaring_container_iterator_t *it,
+                             uint16_t *value) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            it->index++;
+
+            uint32_t wordindex = it->index / 64;
+            if (wordindex >= BITSET_CONTAINER_SIZE_IN_WORDS) {
+                return false;
+            }
+
+            uint64_t word =
+                bc->words[wordindex] & (UINT64_MAX << (it->index % 64));
+            // next part could be optimized/simplified
+            while (word == 0 &&
+                   (wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS)) {
+                wordindex++;
+                word = bc->words[wordindex];
+            }
+            if (word != 0) {
+                it->index = wordindex * 64 + roaring_trailing_zeroes(word);
+                *value = it->index;
+                return true;
+            }
+            return false;
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            it->index++;
+            if (it->index < ac->cardinality) {
+                *value = ac->array[it->index];
+                return true;
+            }
+            return false;
+        }
+        case RUN_CONTAINER_TYPE: {
+            if (*value == UINT16_MAX) {  // Avoid overflow to zero
+                return false;
+            }
+
+            const run_container_t *rc = const_CAST_run(c);
+            uint32_t limit =
+                rc->runs[it->index].value + rc->runs[it->index].length;
+            if (*value < limit) {
+                (*value)++;
+                return true;
+            }
+
+            it->index++;
+            if (it->index < rc->n_runs) {
+                *value = rc->runs[it->index].value;
+                return true;
+            }
+            return false;
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+bool container_iterator_prev(const container_t *c, uint8_t typecode,
+                             roaring_container_iterator_t *it,
+                             uint16_t *value) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            if (--it->index < 0) {
+                return false;
+            }
+
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            int32_t wordindex = it->index / 64;
+            uint64_t word =
+                bc->words[wordindex] & (UINT64_MAX >> (63 - (it->index % 64)));
+
+            while (word == 0 && --wordindex >= 0) {
+                word = bc->words[wordindex];
+            }
+            if (word == 0) {
+                return false;
+            }
+
+            it->index = (wordindex * 64) + (63 - roaring_leading_zeroes(word));
+            *value = it->index;
+            return true;
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            if (--it->index < 0) {
+                return false;
+            }
+            const array_container_t *ac = const_CAST_array(c);
+            *value = ac->array[it->index];
+            return true;
+        }
+        case RUN_CONTAINER_TYPE: {
+            if (*value == 0) {
+                return false;
+            }
+
+            const run_container_t *rc = const_CAST_run(c);
+            (*value)--;
+            if (*value >= rc->runs[it->index].value) {
+                return true;
+            }
+
+            if (--it->index < 0) {
+                return false;
+            }
+
+            *value = rc->runs[it->index].value + rc->runs[it->index].length;
+            return true;
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+bool container_iterator_lower_bound(const container_t *c, uint8_t typecode,
+                                    roaring_container_iterator_t *it,
+                                    uint16_t *value_out, uint16_t val) {
+    if (val > container_maximum(c, typecode)) {
+        return false;
+    }
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            it->index = bitset_container_index_equalorlarger(bc, val);
+            *value_out = it->index;
+            return true;
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            it->index = array_container_index_equalorlarger(ac, val);
+            *value_out = ac->array[it->index];
+            return true;
+        }
+        case RUN_CONTAINER_TYPE: {
+            const run_container_t *rc = const_CAST_run(c);
+            it->index = run_container_index_equalorlarger(rc, val);
+            if (rc->runs[it->index].value <= val) {
+                *value_out = val;
+            } else {
+                *value_out = rc->runs[it->index].value;
+            }
+            return true;
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return false;
+    }
+}
+
+bool container_iterator_read_into_uint32(const container_t *c, uint8_t typecode,
+                                         roaring_container_iterator_t *it,
+                                         uint32_t high16, uint32_t *buf,
+                                         uint32_t count, uint32_t *consumed,
+                                         uint16_t *value_out) {
+    *consumed = 0;
+    if (count == 0) {
+        return false;
+    }
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            uint32_t wordindex = it->index / 64;
+            uint64_t word =
+                bc->words[wordindex] & (UINT64_MAX << (it->index % 64));
+            do {
+                // Read set bits.
+                while (word != 0 && *consumed < count) {
+                    *buf = high16 |
+                           (wordindex * 64 + roaring_trailing_zeroes(word));
+                    word = word & (word - 1);
+                    buf++;
+                    (*consumed)++;
+                }
+                // Skip unset bits.
+                while (word == 0 &&
+                       wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
+                    wordindex++;
+                    word = bc->words[wordindex];
+                }
+            } while (word != 0 && *consumed < count);
+
+            if (word != 0) {
+                it->index = wordindex * 64 + roaring_trailing_zeroes(word);
+                *value_out = it->index;
+                return true;
+            }
+            return false;
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            uint32_t num_values =
+                minimum_uint32(ac->cardinality - it->index, count);
+            for (uint32_t i = 0; i < num_values; i++) {
+                buf[i] = high16 | ac->array[it->index + i];
+            }
+            *consumed += num_values;
+            it->index += num_values;
+            if (it->index < ac->cardinality) {
+                *value_out = ac->array[it->index];
+                return true;
+            }
+            return false;
+        }
+        case RUN_CONTAINER_TYPE: {
+            const run_container_t *rc = const_CAST_run(c);
+            do {
+                uint32_t largest_run_value =
+                    rc->runs[it->index].value + rc->runs[it->index].length;
+                uint32_t num_values = minimum_uint32(
+                    largest_run_value - *value_out + 1, count - *consumed);
+                for (uint32_t i = 0; i < num_values; i++) {
+                    buf[i] = high16 | (*value_out + i);
+                }
+                *value_out += num_values;
+                buf += num_values;
+                *consumed += num_values;
+
+                // We check for `value == 0` because `it->value += num_values`
+                // can overflow when `value == UINT16_MAX`, and `count >
+                // length`. In this case `value` will overflow to 0.
+                if (*value_out > largest_run_value || *value_out == 0) {
+                    it->index++;
+                    if (it->index < rc->n_runs) {
+                        *value_out = rc->runs[it->index].value;
+                    } else {
+                        return false;
+                    }
+                }
+            } while (*consumed < count);
+            return true;
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return 0;
+    }
+}
+
+bool container_iterator_read_into_uint64(const container_t *c, uint8_t typecode,
+                                         roaring_container_iterator_t *it,
+                                         uint64_t high48, uint64_t *buf,
+                                         uint32_t count, uint32_t *consumed,
+                                         uint16_t *value_out) {
+    *consumed = 0;
+    if (count == 0) {
+        return false;
+    }
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE: {
+            const bitset_container_t *bc = const_CAST_bitset(c);
+            uint32_t wordindex = it->index / 64;
+            uint64_t word =
+                bc->words[wordindex] & (UINT64_MAX << (it->index % 64));
+            do {
+                // Read set bits.
+                while (word != 0 && *consumed < count) {
+                    *buf = high48 |
+                           (wordindex * 64 + roaring_trailing_zeroes(word));
+                    word = word & (word - 1);
+                    buf++;
+                    (*consumed)++;
+                }
+                // Skip unset bits.
+                while (word == 0 &&
+                       wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
+                    wordindex++;
+                    word = bc->words[wordindex];
+                }
+            } while (word != 0 && *consumed < count);
+
+            if (word != 0) {
+                it->index = wordindex * 64 + roaring_trailing_zeroes(word);
+                *value_out = it->index;
+                return true;
+            }
+            return false;
+        }
+        case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *ac = const_CAST_array(c);
+            uint32_t num_values =
+                minimum_uint32(ac->cardinality - it->index, count);
+            for (uint32_t i = 0; i < num_values; i++) {
+                buf[i] = high48 | ac->array[it->index + i];
+            }
+            *consumed += num_values;
+            it->index += num_values;
+            if (it->index < ac->cardinality) {
+                *value_out = ac->array[it->index];
+                return true;
+            }
+            return false;
+        }
+        case RUN_CONTAINER_TYPE: {
+            const run_container_t *rc = const_CAST_run(c);
+            do {
+                uint32_t largest_run_value =
+                    rc->runs[it->index].value + rc->runs[it->index].length;
+                uint32_t num_values = minimum_uint32(
+                    largest_run_value - *value_out + 1, count - *consumed);
+                for (uint32_t i = 0; i < num_values; i++) {
+                    buf[i] = high48 | (*value_out + i);
+                }
+                *value_out += num_values;
+                buf += num_values;
+                *consumed += num_values;
+
+                // We check for `value == 0` because `it->value += num_values`
+                // can overflow when `value == UINT16_MAX`, and `count >
+                // length`. In this case `value` will overflow to 0.
+                if (*value_out > largest_run_value || *value_out == 0) {
+                    it->index++;
+                    if (it->index < rc->n_runs) {
+                        *value_out = rc->runs[it->index].value;
+                    } else {
+                        return false;
+                    }
+                }
+            } while (*consumed < count);
+            return true;
+        }
+        default:
+            assert(false);
+            roaring_unreachable;
+            return 0;
+    }
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+
+#undef ROARING_INIT_ROARING_CONTAINER_ITERATOR_T
+/* end file src/containers/containers.c */
+/* begin file src/containers/convert.c */
+#include <stdio.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+// file contains grubby stuff that must know impl. details of all container
+// types.
+bitset_container_t *bitset_container_from_array(const array_container_t *ac) {
+    bitset_container_t *ans = bitset_container_create();
+    int limit = array_container_cardinality(ac);
+    for (int i = 0; i < limit; ++i) bitset_container_set(ans, ac->array[i]);
+    return ans;
+}
+
+bitset_container_t *bitset_container_from_run(const run_container_t *arr) {
+    int card = run_container_cardinality(arr);
+    bitset_container_t *answer = bitset_container_create();
+    for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) {
+        rle16_t vl = arr->runs[rlepos];
+        bitset_set_lenrange(answer->words, vl.value, vl.length);
+    }
+    answer->cardinality = card;
+    return answer;
+}
+
+array_container_t *array_container_from_run(const run_container_t *arr) {
+    array_container_t *answer =
+        array_container_create_given_capacity(run_container_cardinality(arr));
+    answer->cardinality = 0;
+    for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) {
+        int run_start = arr->runs[rlepos].value;
+        int run_end = run_start + arr->runs[rlepos].length;
+
+        for (int run_value = run_start; run_value <= run_end; ++run_value) {
+            answer->array[answer->cardinality++] = (uint16_t)run_value;
+        }
+    }
+    return answer;
+}
+
+array_container_t *array_container_from_bitset(const bitset_container_t *bits) {
+    array_container_t *result =
+        array_container_create_given_capacity(bits->cardinality);
+    result->cardinality = bits->cardinality;
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) {
+        bitset_extract_setbits_avx512_uint16(
+            bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, result->array,
+            bits->cardinality, 0);
+    } else
+#endif
+    {
+        //  sse version ends up being slower here
+        // (bitset_extract_setbits_sse_uint16)
+        // because of the sparsity of the data
+        bitset_extract_setbits_uint16(
+            bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, result->array, 0);
+    }
+#else
+    // If the system is not x64, then we have no accelerated function.
+    bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+                                  result->array, 0);
+#endif
+
+    return result;
+}
+
+/* assumes that container has adequate space.  Run from [s,e] (inclusive) */
+static void add_run(run_container_t *rc, int s, int e) {
+    rc->runs[rc->n_runs].value = s;
+    rc->runs[rc->n_runs].length = e - s;
+    rc->n_runs++;
+}
+
+run_container_t *run_container_from_array(const array_container_t *c) {
+    int32_t n_runs = array_container_number_of_runs(c);
+    run_container_t *answer = run_container_create_given_capacity(n_runs);
+    int prev = -2;
+    int run_start = -1;
+    int32_t card = c->cardinality;
+    if (card == 0) return answer;
+    for (int i = 0; i < card; ++i) {
+        const uint16_t cur_val = c->array[i];
+        if (cur_val != prev + 1) {
+            // new run starts; flush old one, if any
+            if (run_start != -1) add_run(answer, run_start, prev);
+            run_start = cur_val;
+        }
+        prev = c->array[i];
+    }
+    // now prev is the last seen value
+    add_run(answer, run_start, prev);
+    // assert(run_container_cardinality(answer) == c->cardinality);
+    return answer;
+}
+
+/**
+ * Convert the runcontainer to either a Bitmap or an Array Container, depending
+ * on the cardinality.  Frees the container.
+ * Allocates and returns new container, which caller is responsible for freeing.
+ * It does not free the run container.
+ */
+container_t *convert_to_bitset_or_array_container(run_container_t *rc,
+                                                  int32_t card,
+                                                  uint8_t *resulttype) {
+    if (card <= DEFAULT_MAX_SIZE) {
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int rlepos = 0; rlepos < rc->n_runs; ++rlepos) {
+            uint16_t run_start = rc->runs[rlepos].value;
+            uint16_t run_end = run_start + rc->runs[rlepos].length;
+            for (uint16_t run_value = run_start; run_value < run_end;
+                 ++run_value) {
+                answer->array[answer->cardinality++] = run_value;
+            }
+            answer->array[answer->cardinality++] = run_end;
+        }
+        assert(card == answer->cardinality);
+        *resulttype = ARRAY_CONTAINER_TYPE;
+        // run_container_free(r);
+        return answer;
+    }
+    bitset_container_t *answer = bitset_container_create();
+    for (int rlepos = 0; rlepos < rc->n_runs; ++rlepos) {
+        uint16_t run_start = rc->runs[rlepos].value;
+        bitset_set_lenrange(answer->words, run_start, rc->runs[rlepos].length);
+    }
+    answer->cardinality = card;
+    *resulttype = BITSET_CONTAINER_TYPE;
+    // run_container_free(r);
+    return answer;
+}
+
+/* Converts a run container to either an array or a bitset, IF it saves space.
+ */
+/* If a conversion occurs, the caller is responsible to free the original
+ * container and
+ * he becomes responsible to free the new one. */
+container_t *convert_run_to_efficient_container(run_container_t *c,
+                                                uint8_t *typecode_after) {
+    int32_t size_as_run_container =
+        run_container_serialized_size_in_bytes(c->n_runs);
+
+    int32_t size_as_bitset_container =
+        bitset_container_serialized_size_in_bytes();
+    int32_t card = run_container_cardinality(c);
+    int32_t size_as_array_container =
+        array_container_serialized_size_in_bytes(card);
+
+    int32_t min_size_non_run =
+        size_as_bitset_container < size_as_array_container
+            ? size_as_bitset_container
+            : size_as_array_container;
+    if (size_as_run_container <= min_size_non_run) {  // no conversion
+        *typecode_after = RUN_CONTAINER_TYPE;
+        return c;
+    }
+    if (card <= DEFAULT_MAX_SIZE) {
+        // to array
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
+            int run_start = c->runs[rlepos].value;
+            int run_end = run_start + c->runs[rlepos].length;
+
+            for (int run_value = run_start; run_value <= run_end; ++run_value) {
+                answer->array[answer->cardinality++] = (uint16_t)run_value;
+            }
+        }
+        *typecode_after = ARRAY_CONTAINER_TYPE;
+        return answer;
+    }
+
+    // else to bitset
+    bitset_container_t *answer = bitset_container_create();
+
+    for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
+        int start = c->runs[rlepos].value;
+        int end = start + c->runs[rlepos].length;
+        bitset_set_range(answer->words, start, end + 1);
+    }
+    answer->cardinality = card;
+    *typecode_after = BITSET_CONTAINER_TYPE;
+    return answer;
+}
+
+// like convert_run_to_efficient_container but frees the old result if needed
+container_t *convert_run_to_efficient_container_and_free(
+    run_container_t *c, uint8_t *typecode_after) {
+    container_t *answer = convert_run_to_efficient_container(c, typecode_after);
+    if (answer != c) run_container_free(c);
+    return answer;
+}
+
+/* once converted, the original container is disposed here, rather than
+   in roaring_array
+*/
+
+// TODO: split into run-  array-  and bitset-  subfunctions for sanity;
+// a few function calls won't really matter.
+
+container_t *convert_run_optimize(container_t *c, uint8_t typecode_original,
+                                  uint8_t *typecode_after) {
+    if (typecode_original == RUN_CONTAINER_TYPE) {
+        container_t *newc =
+            convert_run_to_efficient_container(CAST_run(c), typecode_after);
+        if (newc != c) {
+            container_free(c, typecode_original);
+        }
+        return newc;
+    } else if (typecode_original == ARRAY_CONTAINER_TYPE) {
+        // it might need to be converted to a run container.
+        array_container_t *c_qua_array = CAST_array(c);
+        int32_t n_runs = array_container_number_of_runs(c_qua_array);
+        int32_t size_as_run_container =
+            run_container_serialized_size_in_bytes(n_runs);
+        int32_t card = array_container_cardinality(c_qua_array);
+        int32_t size_as_array_container =
+            array_container_serialized_size_in_bytes(card);
+
+        if (size_as_run_container >= size_as_array_container) {
+            *typecode_after = ARRAY_CONTAINER_TYPE;
+            return c;
+        }
+        // else convert array to run container
+        run_container_t *answer = run_container_create_given_capacity(n_runs);
+        int prev = -2;
+        int run_start = -1;
+
+        assert(card > 0);
+        for (int i = 0; i < card; ++i) {
+            uint16_t cur_val = c_qua_array->array[i];
+            if (cur_val != prev + 1) {
+                // new run starts; flush old one, if any
+                if (run_start != -1) add_run(answer, run_start, prev);
+                run_start = cur_val;
+            }
+            prev = c_qua_array->array[i];
+        }
+        assert(run_start >= 0);
+        // now prev is the last seen value
+        add_run(answer, run_start, prev);
+        *typecode_after = RUN_CONTAINER_TYPE;
+        array_container_free(c_qua_array);
+        return answer;
+    } else if (typecode_original ==
+               BITSET_CONTAINER_TYPE) {  // run conversions on bitset
+        // does bitset need conversion to run?
+        bitset_container_t *c_qua_bitset = CAST_bitset(c);
+        int32_t n_runs = bitset_container_number_of_runs(c_qua_bitset);
+        int32_t size_as_run_container =
+            run_container_serialized_size_in_bytes(n_runs);
+        int32_t size_as_bitset_container =
+            bitset_container_serialized_size_in_bytes();
+
+        if (size_as_bitset_container <= size_as_run_container) {
+            // no conversion needed.
+            *typecode_after = BITSET_CONTAINER_TYPE;
+            return c;
+        }
+        // bitset to runcontainer (ported from Java  RunContainer(
+        // BitmapContainer bc, int nbrRuns))
+        assert(n_runs > 0);  // no empty bitmaps
+        run_container_t *answer = run_container_create_given_capacity(n_runs);
+
+        int long_ctr = 0;
+        uint64_t cur_word = c_qua_bitset->words[0];
+        while (true) {
+            while (cur_word == UINT64_C(0) &&
+                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
+                cur_word = c_qua_bitset->words[++long_ctr];
+
+            if (cur_word == UINT64_C(0)) {
+                bitset_container_free(c_qua_bitset);
+                *typecode_after = RUN_CONTAINER_TYPE;
+                return answer;
+            }
+
+            int local_run_start = roaring_trailing_zeroes(cur_word);
+            int run_start = local_run_start + 64 * long_ctr;
+            uint64_t cur_word_with_1s = cur_word | (cur_word - 1);
+
+            int run_end = 0;
+            while (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF) &&
+                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
+                cur_word_with_1s = c_qua_bitset->words[++long_ctr];
+
+            if (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF)) {
+                run_end = 64 + long_ctr * 64;  // exclusive, I guess
+                add_run(answer, run_start, run_end - 1);
+                bitset_container_free(c_qua_bitset);
+                *typecode_after = RUN_CONTAINER_TYPE;
+                return answer;
+            }
+            int local_run_end = roaring_trailing_zeroes(~cur_word_with_1s);
+            run_end = local_run_end + long_ctr * 64;
+            add_run(answer, run_start, run_end - 1);
+            cur_word = cur_word_with_1s & (cur_word_with_1s + 1);
+        }
+        return answer;
+    } else {
+        assert(false);
+        roaring_unreachable;
+        return NULL;
+    }
+}
+
+container_t *container_from_run_range(const run_container_t *run, uint32_t min,
+                                      uint32_t max, uint8_t *typecode_after) {
+    // We expect most of the time to end up with a bitset container
+    bitset_container_t *bitset = bitset_container_create();
+    *typecode_after = BITSET_CONTAINER_TYPE;
+    int32_t union_cardinality = 0;
+    for (int32_t i = 0; i < run->n_runs; ++i) {
+        uint32_t rle_min = run->runs[i].value;
+        uint32_t rle_max = rle_min + run->runs[i].length;
+        bitset_set_lenrange(bitset->words, rle_min, rle_max - rle_min);
+        union_cardinality += run->runs[i].length + 1;
+    }
+    union_cardinality += max - min + 1;
+    union_cardinality -=
+        bitset_lenrange_cardinality(bitset->words, min, max - min);
+    bitset_set_lenrange(bitset->words, min, max - min);
+    bitset->cardinality = union_cardinality;
+    if (bitset->cardinality <= DEFAULT_MAX_SIZE) {
+        // we need to convert to an array container
+        array_container_t *array = array_container_from_bitset(bitset);
+        *typecode_after = ARRAY_CONTAINER_TYPE;
+        bitset_container_free(bitset);
+        return array;
+    }
+    return bitset;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/convert.c */
+/* begin file src/containers/mixed_andnot.c */
+/*
+ * mixed_andnot.c.  More methods since operation is not symmetric,
+ * except no "wide" andnot , so no lazy options motivated.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, a valid array container that could be the same as dst.*/
+void array_bitset_container_andnot(const array_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   array_container_t *dst) {
+    // follows Java implementation as of June 2016
+    if (dst->capacity < src_1->cardinality) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    int32_t newcard = 0;
+    const int32_t origcard = src_1->cardinality;
+    for (int i = 0; i < origcard; ++i) {
+        uint16_t key = src_1->array[i];
+        dst->array[newcard] = key;
+        newcard += 1 - bitset_container_contains(src_2, key);
+    }
+    dst->cardinality = newcard;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * src_1 */
+
+void array_bitset_container_iandnot(array_container_t *src_1,
+                                    const bitset_container_t *src_2) {
+    array_bitset_container_andnot(src_1, src_2, src_1);
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, which does not initially have a valid container.
+ * Return true for a bitset result; false for array
+ */
+
+bool bitset_array_container_andnot(const bitset_container_t *src_1,
+                                   const array_container_t *src_2,
+                                   container_t **dst) {
+    // Java did this directly, but we have option of asm or avx
+    bitset_container_t *result = bitset_container_create();
+    bitset_container_copy(src_1, result);
+    result->cardinality =
+        (int32_t)bitset_clear_list(result->words, (uint64_t)result->cardinality,
+                                   src_2->array, (uint64_t)src_2->cardinality);
+
+    // do required type conversions.
+    if (result->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(result);
+        bitset_container_free(result);
+        return false;
+    }
+    *dst = result;
+    return true;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_iandnot(bitset_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    container_t **dst) {
+    *dst = src_1;
+    src_1->cardinality =
+        (int32_t)bitset_clear_list(src_1->words, (uint64_t)src_1->cardinality,
+                                   src_2->array, (uint64_t)src_2->cardinality);
+
+    if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);
+        return false;  // not bitset
+    } else
+        return true;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_andnot(const run_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst) {
+    // follows the Java implementation as of June 2016
+    int card = run_container_cardinality(src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        // must be an array
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+            for (int run_value = rle.value; run_value <= rle.value + rle.length;
+                 ++run_value) {
+                if (!bitset_container_get(src_2, (uint16_t)run_value)) {
+                    answer->array[answer->cardinality++] = (uint16_t)run_value;
+                }
+            }
+        }
+        *dst = answer;
+        return false;
+    } else {  // we guess it will be a bitset, though have to check guess when
+              // done
+        bitset_container_t *answer = bitset_container_clone(src_2);
+
+        uint32_t last_pos = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+
+            uint32_t start = rle.value;
+            uint32_t end = start + rle.length + 1;
+            bitset_reset_range(answer->words, last_pos, start);
+            bitset_flip_range(answer->words, start, end);
+            last_pos = end;
+        }
+        bitset_reset_range(answer->words, last_pos, (uint32_t)(1 << 16));
+
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+
+        if (answer->cardinality <= DEFAULT_MAX_SIZE) {
+            *dst = array_container_from_bitset(answer);
+            bitset_container_free(answer);
+            return false;  // not bitset
+        }
+        *dst = answer;
+        return true;  // bitset
+    }
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_iandnot(run_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  container_t **dst) {
+    // dummy implementation
+    bool ans = run_bitset_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset").  dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool bitset_run_container_andnot(const bitset_container_t *src_1,
+                                 const run_container_t *src_2,
+                                 container_t **dst) {
+    // follows Java implementation
+    bitset_container_t *result = bitset_container_create();
+
+    bitset_container_copy(src_1, result);
+    for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) {
+        rle16_t rle = src_2->runs[rlepos];
+        bitset_reset_range(result->words, rle.value,
+                           rle.value + rle.length + UINT32_C(1));
+    }
+    result->cardinality = bitset_container_compute_cardinality(result);
+
+    if (result->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(result);
+        bitset_container_free(result);
+        return false;  // not bitset
+    }
+    *dst = result;
+    return true;  // bitset
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_run_container_iandnot(bitset_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  container_t **dst) {
+    *dst = src_1;
+
+    for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) {
+        rle16_t rle = src_2->runs[rlepos];
+        bitset_reset_range(src_1->words, rle.value,
+                           rle.value + rle.length + UINT32_C(1));
+    }
+    src_1->cardinality = bitset_container_compute_cardinality(src_1);
+
+    if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);
+        return false;  // not bitset
+    } else
+        return true;
+}
+
+/* helper. a_out must be a valid array container with adequate capacity.
+ * Returns the cardinality of the output container. Partly Based on Java
+ * implementation Util.unsignedDifference.
+ *
+ * TODO: Util.unsignedDifference does not use advanceUntil.  Is it cheaper
+ * to avoid advanceUntil?
+ */
+
+static int run_array_array_subtract(const run_container_t *rc,
+                                    const array_container_t *a_in,
+                                    array_container_t *a_out) {
+    int out_card = 0;
+    int32_t in_array_pos =
+        -1;  // since advanceUntil always assumes we start the search AFTER this
+
+    for (int rlepos = 0; rlepos < rc->n_runs; rlepos++) {
+        int32_t start = rc->runs[rlepos].value;
+        int32_t end = start + rc->runs[rlepos].length + 1;
+
+        in_array_pos = advanceUntil(a_in->array, in_array_pos,
+                                    a_in->cardinality, (uint16_t)start);
+
+        if (in_array_pos >= a_in->cardinality) {  // run has no items subtracted
+            for (int32_t i = start; i < end; ++i)
+                a_out->array[out_card++] = (uint16_t)i;
+        } else {
+            uint16_t next_nonincluded = a_in->array[in_array_pos];
+            if (next_nonincluded >= end) {
+                // another case when run goes unaltered
+                for (int32_t i = start; i < end; ++i)
+                    a_out->array[out_card++] = (uint16_t)i;
+                in_array_pos--;  // ensure we see this item again if necessary
+            } else {
+                for (int32_t i = start; i < end; ++i)
+                    if (i != next_nonincluded)
+                        a_out->array[out_card++] = (uint16_t)i;
+                    else  // 0 should ensure  we don't match
+                        next_nonincluded =
+                            (in_array_pos + 1 >= a_in->cardinality)
+                                ? 0
+                                : a_in->array[++in_array_pos];
+                in_array_pos--;  // see again
+            }
+        }
+    }
+    return out_card;
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any type of container.
+ */
+
+int run_array_container_andnot(const run_container_t *src_1,
+                               const array_container_t *src_2,
+                               container_t **dst) {
+    // follows the Java impl as of June 2016
+
+    int card = run_container_cardinality(src_1);
+    const int arbitrary_threshold = 32;
+
+    if (card <= arbitrary_threshold) {
+        if (src_2->cardinality == 0) {
+            *dst = run_container_clone(src_1);
+            return RUN_CONTAINER_TYPE;
+        }
+        // Java's "lazyandNot.toEfficientContainer" thing
+        run_container_t *answer = run_container_create_given_capacity(
+            card + array_container_cardinality(src_2));
+
+        int rlepos = 0;
+        int xrlepos = 0;  // "x" is src_2
+        rle16_t rle = src_1->runs[rlepos];
+        int32_t start = rle.value;
+        int32_t end = start + rle.length + 1;
+        int32_t xstart = src_2->array[xrlepos];
+
+        while ((rlepos < src_1->n_runs) && (xrlepos < src_2->cardinality)) {
+            if (end <= xstart) {
+                // output the first run
+                answer->runs[answer->n_runs++] =
+                    MAKE_RLE16(start, end - start - 1);
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+            } else if (xstart + 1 <= start) {
+                // exit the second run
+                xrlepos++;
+                if (xrlepos < src_2->cardinality) {
+                    xstart = src_2->array[xrlepos];
+                }
+            } else {
+                if (start < xstart) {
+                    answer->runs[answer->n_runs++] =
+                        MAKE_RLE16(start, xstart - start - 1);
+                }
+                if (xstart + 1 < end) {
+                    start = xstart + 1;
+                } else {
+                    rlepos++;
+                    if (rlepos < src_1->n_runs) {
+                        start = src_1->runs[rlepos].value;
+                        end = start + src_1->runs[rlepos].length + 1;
+                    }
+                }
+            }
+        }
+        if (rlepos < src_1->n_runs) {
+            answer->runs[answer->n_runs++] = MAKE_RLE16(start, end - start - 1);
+            rlepos++;
+            if (rlepos < src_1->n_runs) {
+                memcpy(answer->runs + answer->n_runs, src_1->runs + rlepos,
+                       (src_1->n_runs - rlepos) * sizeof(rle16_t));
+                answer->n_runs += (src_1->n_runs - rlepos);
+            }
+        }
+        uint8_t return_type;
+        *dst = convert_run_to_efficient_container(answer, &return_type);
+        if (answer != *dst) run_container_free(answer);
+        return return_type;
+    }
+    // else it's a bitmap or array
+
+    if (card <= DEFAULT_MAX_SIZE) {
+        array_container_t *ac = array_container_create_given_capacity(card);
+        // nb Java code used a generic iterator-based merge to compute
+        // difference
+        ac->cardinality = run_array_array_subtract(src_1, src_2, ac);
+        *dst = ac;
+        return ARRAY_CONTAINER_TYPE;
+    }
+    bitset_container_t *ans = bitset_container_from_run(src_1);
+    bool result_is_bitset = bitset_array_container_iandnot(ans, src_2, dst);
+    return (result_is_bitset ? BITSET_CONTAINER_TYPE : ARRAY_CONTAINER_TYPE);
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_array_container_iandnot(run_container_t *src_1,
+                                const array_container_t *src_2,
+                                container_t **dst) {
+    // dummy implementation same as June 2016 Java
+    int ans = run_array_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+/* dst must be a valid array container, allowed to be src_1 */
+
+void array_run_container_andnot(const array_container_t *src_1,
+                                const run_container_t *src_2,
+                                array_container_t *dst) {
+    // basically following Java impl as of June 2016
+    if (src_1->cardinality > dst->capacity) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+
+    if (src_2->n_runs == 0) {
+        memmove(dst->array, src_1->array,
+                sizeof(uint16_t) * src_1->cardinality);
+        dst->cardinality = src_1->cardinality;
+        return;
+    }
+    int32_t run_start = src_2->runs[0].value;
+    int32_t run_end = run_start + src_2->runs[0].length;
+    int which_run = 0;
+
+    uint16_t val = 0;
+    int dest_card = 0;
+    for (int i = 0; i < src_1->cardinality; ++i) {
+        val = src_1->array[i];
+        if (val < run_start)
+            dst->array[dest_card++] = val;
+        else if (val <= run_end) {
+            ;  // omitted item
+        } else {
+            do {
+                if (which_run + 1 < src_2->n_runs) {
+                    ++which_run;
+                    run_start = src_2->runs[which_run].value;
+                    run_end = run_start + src_2->runs[which_run].length;
+
+                } else
+                    run_start = run_end = (1 << 16) + 1;
+            } while (val > run_end);
+            --i;
+        }
+    }
+    dst->cardinality = dest_card;
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+void array_run_container_iandnot(array_container_t *src_1,
+                                 const run_container_t *src_2) {
+    array_run_container_andnot(src_1, src_2, src_1);
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_andnot(const run_container_t *src_1,
+                             const run_container_t *src_2, container_t **dst) {
+    run_container_t *ans = run_container_create();
+    run_container_andnot(src_1, src_2, ans);
+    uint8_t typecode_after;
+    *dst = convert_run_to_efficient_container_and_free(ans, &typecode_after);
+    return typecode_after;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_run_container_iandnot(run_container_t *src_1,
+                              const run_container_t *src_2, container_t **dst) {
+    // following Java impl as of June 2016 (dummy)
+    int ans = run_run_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+/*
+ * dst is a valid array container and may be the same as src_1
+ */
+
+void array_array_container_andnot(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst) {
+    array_container_andnot(src_1, src_2, dst);
+}
+
+/* inplace array-array andnot will always be able to reuse the space of
+ * src_1 */
+void array_array_container_iandnot(array_container_t *src_1,
+                                   const array_container_t *src_2) {
+    array_container_andnot(src_1, src_2, src_1);
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2,
+                                    container_t **dst) {
+    bitset_container_t *ans = bitset_container_create();
+    int card = bitset_container_andnot(src_1, src_2, ans);
+    if (card <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(ans);
+        bitset_container_free(ans);
+        return false;  // not bitset
+    } else {
+        *dst = ans;
+        return true;
+    }
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     container_t **dst) {
+    int card = bitset_container_andnot(src_1, src_2, src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);
+        return false;  // not bitset
+    } else {
+        *dst = src_1;
+        return true;
+    }
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_andnot.c */
+/* begin file src/containers/mixed_equal.c */
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+bool array_container_equal_bitset(const array_container_t* container1,
+                                  const bitset_container_t* container2) {
+    if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+        if (container2->cardinality != container1->cardinality) {
+            return false;
+        }
+    }
+    int32_t pos = 0;
+    for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
+        uint64_t w = container2->words[i];
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);
+            uint16_t r = i * 64 + roaring_trailing_zeroes(w);
+            if (pos >= container1->cardinality) {
+                return false;
+            }
+            if (container1->array[pos] != r) {
+                return false;
+            }
+            ++pos;
+            w ^= t;
+        }
+    }
+    return (pos == container1->cardinality);
+}
+
+bool run_container_equals_array(const run_container_t* container1,
+                                const array_container_t* container2) {
+    if (run_container_cardinality(container1) != container2->cardinality)
+        return false;
+    int32_t pos = 0;
+    for (int i = 0; i < container1->n_runs; ++i) {
+        const uint32_t run_start = container1->runs[i].value;
+        const uint32_t le = container1->runs[i].length;
+
+        if (container2->array[pos] != run_start) {
+            return false;
+        }
+
+        if (container2->array[pos + le] != run_start + le) {
+            return false;
+        }
+
+        pos += le + 1;
+    }
+    return true;
+}
+
+bool run_container_equals_bitset(const run_container_t* container1,
+                                 const bitset_container_t* container2) {
+    int run_card = run_container_cardinality(container1);
+    int bitset_card = (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)
+                          ? container2->cardinality
+                          : bitset_container_compute_cardinality(container2);
+    if (bitset_card != run_card) {
+        return false;
+    }
+
+    for (int32_t i = 0; i < container1->n_runs; i++) {
+        uint32_t begin = container1->runs[i].value;
+        if (container1->runs[i].length) {
+            uint32_t end = begin + container1->runs[i].length + 1;
+            if (!bitset_container_contains_range(container2, begin, end)) {
+                return false;
+            }
+        } else {
+            if (!bitset_container_contains(container2, begin)) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_equal.c */
+/* begin file src/containers/mixed_intersection.c */
+/*
+ * mixed_intersection.c
+ *
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst.  */
+void array_bitset_container_intersection(const array_container_t *src_1,
+                                         const bitset_container_t *src_2,
+                                         array_container_t *dst) {
+    if (dst->capacity < src_1->cardinality) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    int32_t newcard = 0;  // dst could be src_1
+    const int32_t origcard = src_1->cardinality;
+    for (int i = 0; i < origcard; ++i) {
+        uint16_t key = src_1->array[i];
+        // this branchless approach is much faster...
+        dst->array[newcard] = key;
+        newcard += bitset_container_contains(src_2, key);
+        /**
+         * we could do it this way instead...
+         * if (bitset_container_contains(src_2, key)) {
+         * dst->array[newcard++] = key;
+         * }
+         * but if the result is unpredictible, the processor generates
+         * many mispredicted branches.
+         * Difference can be huge (from 3 cycles when predictible all the way
+         * to 16 cycles when unpredictible.
+         * See
+         * https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/bitset/c/arraybitsetintersection.c
+         */
+    }
+    dst->cardinality = newcard;
+}
+
+/* Compute the size of the intersection of src_1 and src_2. */
+int array_bitset_container_intersection_cardinality(
+    const array_container_t *src_1, const bitset_container_t *src_2) {
+    int32_t newcard = 0;
+    const int32_t origcard = src_1->cardinality;
+    for (int i = 0; i < origcard; ++i) {
+        uint16_t key = src_1->array[i];
+        newcard += bitset_container_contains(src_2, key);
+    }
+    return newcard;
+}
+
+bool array_bitset_container_intersect(const array_container_t *src_1,
+                                      const bitset_container_t *src_2) {
+    const int32_t origcard = src_1->cardinality;
+    for (int i = 0; i < origcard; ++i) {
+        uint16_t key = src_1->array[i];
+        if (bitset_container_contains(src_2, key)) return true;
+    }
+    return false;
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. */
+void array_run_container_intersection(const array_container_t *src_1,
+                                      const run_container_t *src_2,
+                                      array_container_t *dst) {
+    if (run_container_is_full(src_2)) {
+        if (dst != src_1) array_container_copy(src_1, dst);
+        return;
+    }
+    if (dst->capacity < src_1->cardinality) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    if (src_2->n_runs == 0) {
+        return;
+    }
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    rle16_t rle = src_2->runs[rlepos];
+    int32_t newcard = 0;
+    while (arraypos < src_1->cardinality) {
+        const uint16_t arrayval = src_1->array[arraypos];
+        while (rle.value + rle.length <
+               arrayval) {  // this will frequently be false
+            ++rlepos;
+            if (rlepos == src_2->n_runs) {
+                dst->cardinality = newcard;
+                return;  // we are done
+            }
+            rle = src_2->runs[rlepos];
+        }
+        if (rle.value > arrayval) {
+            arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
+                                    rle.value);
+        } else {
+            dst->array[newcard] = arrayval;
+            newcard++;
+            arraypos++;
+        }
+    }
+    dst->cardinality = newcard;
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * *dst. If the result is true then the result is a bitset_container_t
+ * otherwise is a array_container_t. If *dst ==  src_2, an in-place processing
+ * is attempted.*/
+bool run_bitset_container_intersection(const run_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       container_t **dst) {
+    if (run_container_is_full(src_1)) {
+        if (*dst != src_2) *dst = bitset_container_clone(src_2);
+        return true;
+    }
+    int32_t card = run_container_cardinality(src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        // result can only be an array (assuming that we never make a
+        // RunContainer)
+        if (card > src_2->cardinality) {
+            card = src_2->cardinality;
+        }
+        array_container_t *answer = array_container_create_given_capacity(card);
+        *dst = answer;
+        if (*dst == NULL) {
+            return false;
+        }
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+            uint32_t endofrun = (uint32_t)rle.value + rle.length;
+            for (uint32_t runValue = rle.value; runValue <= endofrun;
+                 ++runValue) {
+                answer->array[answer->cardinality] = (uint16_t)runValue;
+                answer->cardinality +=
+                    bitset_container_contains(src_2, runValue);
+            }
+        }
+        return false;
+    }
+    if (*dst == src_2) {  // we attempt in-place
+        bitset_container_t *answer = CAST_bitset(*dst);
+        uint32_t start = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            const rle16_t rle = src_1->runs[rlepos];
+            uint32_t end = rle.value;
+            bitset_reset_range(src_2->words, start, end);
+
+            start = end + rle.length + 1;
+        }
+        bitset_reset_range(src_2->words, start, UINT32_C(1) << 16);
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+        if (src_2->cardinality > DEFAULT_MAX_SIZE) {
+            return true;
+        } else {
+            array_container_t *newanswer = array_container_from_bitset(src_2);
+            if (newanswer == NULL) {
+                *dst = NULL;
+                return false;
+            }
+            *dst = newanswer;
+            return false;
+        }
+    } else {  // no inplace
+        // we expect the answer to be a bitmap (if we are lucky)
+        bitset_container_t *answer = bitset_container_clone(src_2);
+
+        *dst = answer;
+        if (answer == NULL) {
+            return true;
+        }
+        uint32_t start = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            const rle16_t rle = src_1->runs[rlepos];
+            uint32_t end = rle.value;
+            bitset_reset_range(answer->words, start, end);
+            start = end + rle.length + 1;
+        }
+        bitset_reset_range(answer->words, start, UINT32_C(1) << 16);
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+
+        if (answer->cardinality > DEFAULT_MAX_SIZE) {
+            return true;
+        } else {
+            array_container_t *newanswer = array_container_from_bitset(answer);
+            bitset_container_free(CAST_bitset(*dst));
+            if (newanswer == NULL) {
+                *dst = NULL;
+                return false;
+            }
+            *dst = newanswer;
+            return false;
+        }
+    }
+}
+
+/* Compute the size of the intersection between src_1 and src_2 . */
+int array_run_container_intersection_cardinality(const array_container_t *src_1,
+                                                 const run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return src_1->cardinality;
+    }
+    if (src_2->n_runs == 0) {
+        return 0;
+    }
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    rle16_t rle = src_2->runs[rlepos];
+    int32_t newcard = 0;
+    while (arraypos < src_1->cardinality) {
+        const uint16_t arrayval = src_1->array[arraypos];
+        while (rle.value + rle.length <
+               arrayval) {  // this will frequently be false
+            ++rlepos;
+            if (rlepos == src_2->n_runs) {
+                return newcard;  // we are done
+            }
+            rle = src_2->runs[rlepos];
+        }
+        if (rle.value > arrayval) {
+            arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
+                                    rle.value);
+        } else {
+            newcard++;
+            arraypos++;
+        }
+    }
+    return newcard;
+}
+
+/* Compute the intersection  between src_1 and src_2
+ **/
+int run_bitset_container_intersection_cardinality(
+    const run_container_t *src_1, const bitset_container_t *src_2) {
+    if (run_container_is_full(src_1)) {
+        return bitset_container_cardinality(src_2);
+    }
+    int answer = 0;
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        answer +=
+            bitset_lenrange_cardinality(src_2->words, rle.value, rle.length);
+    }
+    return answer;
+}
+
+bool array_run_container_intersect(const array_container_t *src_1,
+                                   const run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return !array_container_empty(src_1);
+    }
+    if (src_2->n_runs == 0) {
+        return false;
+    }
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    rle16_t rle = src_2->runs[rlepos];
+    while (arraypos < src_1->cardinality) {
+        const uint16_t arrayval = src_1->array[arraypos];
+        while (rle.value + rle.length <
+               arrayval) {  // this will frequently be false
+            ++rlepos;
+            if (rlepos == src_2->n_runs) {
+                return false;  // we are done
+            }
+            rle = src_2->runs[rlepos];
+        }
+        if (rle.value > arrayval) {
+            arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
+                                    rle.value);
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Compute the intersection  between src_1 and src_2
+ **/
+bool run_bitset_container_intersect(const run_container_t *src_1,
+                                    const bitset_container_t *src_2) {
+    if (run_container_is_full(src_1)) {
+        return !bitset_container_empty(src_2);
+    }
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        if (!bitset_lenrange_empty(src_2->words, rle.value, rle.length))
+            return true;
+    }
+    return false;
+}
+
+/*
+ * Compute the intersection between src_1 and src_2 and write the result
+ * to *dst. If the return function is true, the result is a bitset_container_t
+ * otherwise is a array_container_t.
+ */
+bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
+                                          const bitset_container_t *src_2,
+                                          container_t **dst) {
+    const int newCardinality = bitset_container_and_justcard(src_1, src_2);
+    if (newCardinality > DEFAULT_MAX_SIZE) {
+        *dst = bitset_container_create();
+        if (*dst != NULL) {
+            bitset_container_and_nocard(src_1, src_2, CAST_bitset(*dst));
+            CAST_bitset(*dst)->cardinality = newCardinality;
+        }
+        return true;  // it is a bitset
+    }
+    *dst = array_container_create_given_capacity(newCardinality);
+    if (*dst != NULL) {
+        CAST_array(*dst)->cardinality = newCardinality;
+        bitset_extract_intersection_setbits_uint16(
+            src_1->words, src_2->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+            CAST_array(*dst)->array, 0);
+    }
+    return false;  // not a bitset
+}
+
+bool bitset_bitset_container_intersection_inplace(
+    bitset_container_t *src_1, const bitset_container_t *src_2,
+    container_t **dst) {
+    const int newCardinality = bitset_container_and_justcard(src_1, src_2);
+    if (newCardinality > DEFAULT_MAX_SIZE) {
+        *dst = src_1;
+        bitset_container_and_nocard(src_1, src_2, src_1);
+        CAST_bitset(*dst)->cardinality = newCardinality;
+        return true;  // it is a bitset
+    }
+    *dst = array_container_create_given_capacity(newCardinality);
+    if (*dst != NULL) {
+        CAST_array(*dst)->cardinality = newCardinality;
+        bitset_extract_intersection_setbits_uint16(
+            src_1->words, src_2->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+            CAST_array(*dst)->array, 0);
+    }
+    return false;  // not a bitset
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_intersection.c */
+/* begin file src/containers/mixed_negation.c */
+/*
+ * mixed_negation.c
+ *
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+// TODO: make simplified and optimized negation code across
+// the full range.
+
+/* Negation across the entire range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. The complement of a
+ * sufficiently sparse set will always be dense and a hence a bitmap
+' * We assume that dst is pre-allocated and a valid bitset container
+ * There can be no in-place version.
+ */
+void array_container_negation(const array_container_t *src,
+                              bitset_container_t *dst) {
+    uint64_t card = UINT64_C(1 << 16);
+    bitset_container_set_all(dst);
+
+    if (src->cardinality == 0) {
+        return;
+    }
+
+    dst->cardinality = (int32_t)bitset_clear_list(dst->words, card, src->array,
+                                                  (uint64_t)src->cardinality);
+}
+
+/* Negation across the entire range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation(const bitset_container_t *src,
+                               container_t **dst) {
+    return bitset_container_negation_range(src, 0, (1 << 16), dst);
+}
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_inplace(bitset_container_t *src,
+                                       container_t **dst) {
+    return bitset_container_negation_range_inplace(src, 0, (1 << 16), dst);
+}
+
+/* Negation across the entire range of container
+ * Compute the  negation of src  and write the result
+ * to *dst.  Return values are the *_TYPECODES as defined * in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation(const run_container_t *src, container_t **dst) {
+    return run_container_negation_range(src, 0, (1 << 16), dst);
+}
+
+/*
+ * Same as run_container_negation except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_inplace(run_container_t *src, container_t **dst) {
+    return run_container_negation_range_inplace(src, 0, (1 << 16), dst);
+}
+
+/* Negation across a range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. Returns true if the result is a bitset container
+ * and false for an array container.  *dst is not preallocated.
+ */
+bool array_container_negation_range(const array_container_t *src,
+                                    const int range_start, const int range_end,
+                                    container_t **dst) {
+    /* close port of the Java implementation */
+    if (range_start >= range_end) {
+        *dst = array_container_clone(src);
+        return false;
+    }
+
+    int32_t start_index =
+        binarySearch(src->array, src->cardinality, (uint16_t)range_start);
+    if (start_index < 0) start_index = -start_index - 1;
+
+    int32_t last_index =
+        binarySearch(src->array, src->cardinality, (uint16_t)(range_end - 1));
+    if (last_index < 0) last_index = -last_index - 2;
+
+    const int32_t current_values_in_range = last_index - start_index + 1;
+    const int32_t span_to_be_flipped = range_end - range_start;
+    const int32_t new_values_in_range =
+        span_to_be_flipped - current_values_in_range;
+    const int32_t cardinality_change =
+        new_values_in_range - current_values_in_range;
+    const int32_t new_cardinality = src->cardinality + cardinality_change;
+
+    if (new_cardinality > DEFAULT_MAX_SIZE) {
+        bitset_container_t *temp = bitset_container_from_array(src);
+        bitset_flip_range(temp->words, (uint32_t)range_start,
+                          (uint32_t)range_end);
+        temp->cardinality = new_cardinality;
+        *dst = temp;
+        return true;
+    }
+
+    array_container_t *arr =
+        array_container_create_given_capacity(new_cardinality);
+    *dst = (container_t *)arr;
+    if (new_cardinality == 0) {
+        arr->cardinality = new_cardinality;
+        return false;  // we are done.
+    }
+    // copy stuff before the active area
+    memcpy(arr->array, src->array, start_index * sizeof(uint16_t));
+
+    // work on the range
+    int32_t out_pos = start_index, in_pos = start_index;
+    int32_t val_in_range = range_start;
+    for (; val_in_range < range_end && in_pos <= last_index; ++val_in_range) {
+        if ((uint16_t)val_in_range != src->array[in_pos]) {
+            arr->array[out_pos++] = (uint16_t)val_in_range;
+        } else {
+            ++in_pos;
+        }
+    }
+    for (; val_in_range < range_end; ++val_in_range)
+        arr->array[out_pos++] = (uint16_t)val_in_range;
+
+    // content after the active range
+    memcpy(arr->array + out_pos, src->array + (last_index + 1),
+           (src->cardinality - (last_index + 1)) * sizeof(uint16_t));
+    arr->cardinality = new_cardinality;
+    return false;
+}
+
+/* Even when the result would fit, it is unclear how to make an
+ * inplace version without inefficient copying.
+ */
+
+bool array_container_negation_range_inplace(array_container_t *src,
+                                            const int range_start,
+                                            const int range_end,
+                                            container_t **dst) {
+    bool ans = array_container_negation_range(src, range_start, range_end, dst);
+    // TODO : try a real inplace version
+    array_container_free(src);
+    return ans;
+}
+
+/* Negation across a range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation_range(const bitset_container_t *src,
+                                     const int range_start, const int range_end,
+                                     container_t **dst) {
+    // TODO maybe consider density-based estimate
+    // and sometimes build result directly as array, with
+    // conversion back to bitset if wrong.  Or determine
+    // actual result cardinality, then go directly for the known final cont.
+
+    // keep computation using bitsets as long as possible.
+    bitset_container_t *t = bitset_container_clone(src);
+    bitset_flip_range(t->words, (uint32_t)range_start, (uint32_t)range_end);
+    t->cardinality = bitset_container_compute_cardinality(t);
+
+    if (t->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = t;
+        return true;
+    } else {
+        *dst = array_container_from_bitset(t);
+        bitset_container_free(t);
+        return false;
+    }
+}
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_range_inplace(bitset_container_t *src,
+                                             const int range_start,
+                                             const int range_end,
+                                             container_t **dst) {
+    bitset_flip_range(src->words, (uint32_t)range_start, (uint32_t)range_end);
+    src->cardinality = bitset_container_compute_cardinality(src);
+    if (src->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = src;
+        return true;
+    }
+    *dst = array_container_from_bitset(src);
+    bitset_container_free(src);
+    return false;
+}
+
+/* Negation across a range of container
+ * Compute the  negation of src  and write the result
+ * to *dst. Return values are the *_TYPECODES as defined * in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation_range(const run_container_t *src,
+                                 const int range_start, const int range_end,
+                                 container_t **dst) {
+    uint8_t return_typecode;
+
+    // follows the Java implementation
+    if (range_end <= range_start) {
+        *dst = run_container_clone(src);
+        return RUN_CONTAINER_TYPE;
+    }
+
+    run_container_t *ans = run_container_create_given_capacity(
+        src->n_runs + 1);  // src->n_runs + 1);
+    int k = 0;
+    for (; k < src->n_runs && src->runs[k].value < range_start; ++k) {
+        ans->runs[k] = src->runs[k];
+        ans->n_runs++;
+    }
+
+    run_container_smart_append_exclusive(
+        ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));
+
+    for (; k < src->n_runs; ++k) {
+        run_container_smart_append_exclusive(ans, src->runs[k].value,
+                                             src->runs[k].length);
+    }
+
+    *dst = convert_run_to_efficient_container(ans, &return_typecode);
+    if (return_typecode != RUN_CONTAINER_TYPE) run_container_free(ans);
+
+    return return_typecode;
+}
+
+/*
+ * Same as run_container_negation except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_range_inplace(run_container_t *src,
+                                         const int range_start,
+                                         const int range_end,
+                                         container_t **dst) {
+    uint8_t return_typecode;
+
+    if (range_end <= range_start) {
+        *dst = src;
+        return RUN_CONTAINER_TYPE;
+    }
+
+    // TODO: efficient special case when range is 0 to 65535 inclusive
+
+    if (src->capacity == src->n_runs) {
+        // no excess room.  More checking to see if result can fit
+        bool last_val_before_range = false;
+        bool first_val_in_range = false;
+        bool last_val_in_range = false;
+        bool first_val_past_range = false;
+
+        if (range_start > 0)
+            last_val_before_range =
+                run_container_contains(src, (uint16_t)(range_start - 1));
+        first_val_in_range = run_container_contains(src, (uint16_t)range_start);
+
+        if (last_val_before_range == first_val_in_range) {
+            last_val_in_range =
+                run_container_contains(src, (uint16_t)(range_end - 1));
+            if (range_end != 0x10000)
+                first_val_past_range =
+                    run_container_contains(src, (uint16_t)range_end);
+
+            if (last_val_in_range ==
+                first_val_past_range) {  // no space for inplace
+                int ans = run_container_negation_range(src, range_start,
+                                                       range_end, dst);
+                run_container_free(src);
+                return ans;
+            }
+        }
+    }
+    // all other cases: result will fit
+
+    run_container_t *ans = src;
+    int my_nbr_runs = src->n_runs;
+
+    ans->n_runs = 0;
+    int k = 0;
+    for (; (k < my_nbr_runs) && (src->runs[k].value < range_start); ++k) {
+        // ans->runs[k] = src->runs[k]; (would be self-copy)
+        ans->n_runs++;
+    }
+
+    // as with Java implementation, use locals to give self a buffer of depth 1
+    rle16_t buffered = MAKE_RLE16(0, 0);
+    rle16_t next = buffered;
+    if (k < my_nbr_runs) buffered = src->runs[k];
+
+    run_container_smart_append_exclusive(
+        ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));
+
+    for (; k < my_nbr_runs; ++k) {
+        if (k + 1 < my_nbr_runs) next = src->runs[k + 1];
+
+        run_container_smart_append_exclusive(ans, buffered.value,
+                                             buffered.length);
+        buffered = next;
+    }
+
+    *dst = convert_run_to_efficient_container(ans, &return_typecode);
+    if (return_typecode != RUN_CONTAINER_TYPE) run_container_free(ans);
+
+    return return_typecode;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_negation.c */
+/* begin file src/containers/mixed_subset.c */
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+bool array_container_is_subset_bitset(const array_container_t* container1,
+                                      const bitset_container_t* container2) {
+    if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+        if (container2->cardinality < container1->cardinality) {
+            return false;
+        }
+    }
+    for (int i = 0; i < container1->cardinality; ++i) {
+        if (!bitset_container_contains(container2, container1->array[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool run_container_is_subset_array(const run_container_t* container1,
+                                   const array_container_t* container2) {
+    if (run_container_cardinality(container1) > container2->cardinality)
+        return false;
+    int32_t start_pos = -1, stop_pos = -1;
+    for (int i = 0; i < container1->n_runs; ++i) {
+        int32_t start = container1->runs[i].value;
+        int32_t stop = start + container1->runs[i].length;
+        start_pos = advanceUntil(container2->array, stop_pos,
+                                 container2->cardinality, start);
+        stop_pos = advanceUntil(container2->array, stop_pos,
+                                container2->cardinality, stop);
+        if (stop_pos == container2->cardinality) {
+            return false;
+        } else if (stop_pos - start_pos != stop - start ||
+                   container2->array[start_pos] != start ||
+                   container2->array[stop_pos] != stop) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool array_container_is_subset_run(const array_container_t* container1,
+                                   const run_container_t* container2) {
+    if (container1->cardinality > run_container_cardinality(container2))
+        return false;
+    int i_array = 0, i_run = 0;
+    while (i_array < container1->cardinality && i_run < container2->n_runs) {
+        uint32_t start = container2->runs[i_run].value;
+        uint32_t stop = start + container2->runs[i_run].length;
+        if (container1->array[i_array] < start) {
+            return false;
+        } else if (container1->array[i_array] > stop) {
+            i_run++;
+        } else {  // the value of the array is in the run
+            i_array++;
+        }
+    }
+    if (i_array == container1->cardinality) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+bool run_container_is_subset_bitset(const run_container_t* container1,
+                                    const bitset_container_t* container2) {
+    // todo: this code could be much faster
+    if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+        if (container2->cardinality < run_container_cardinality(container1)) {
+            return false;
+        }
+    } else {
+        int32_t card = bitset_container_compute_cardinality(
+            container2);  // modify container2?
+        if (card < run_container_cardinality(container1)) {
+            return false;
+        }
+    }
+    for (int i = 0; i < container1->n_runs; ++i) {
+        uint32_t run_start = container1->runs[i].value;
+        uint32_t le = container1->runs[i].length;
+        for (uint32_t j = run_start; j <= run_start + le; ++j) {
+            if (!bitset_container_contains(container2, j)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+bool bitset_container_is_subset_run(const bitset_container_t* container1,
+                                    const run_container_t* container2) {
+    // todo: this code could be much faster
+    if (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+        if (container1->cardinality > run_container_cardinality(container2)) {
+            return false;
+        }
+    }
+    int32_t i_bitset = 0, i_run = 0;
+    while (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS &&
+           i_run < container2->n_runs) {
+        uint64_t w = container1->words[i_bitset];
+        while (w != 0 && i_run < container2->n_runs) {
+            uint32_t start = container2->runs[i_run].value;
+            uint32_t stop = start + container2->runs[i_run].length;
+            uint64_t t = w & (~w + 1);
+            uint16_t r = i_bitset * 64 + roaring_trailing_zeroes(w);
+            if (r < start) {
+                return false;
+            } else if (r > stop) {
+                i_run++;
+                continue;
+            } else {
+                w ^= t;
+            }
+        }
+        if (w == 0) {
+            i_bitset++;
+        } else {
+            return false;
+        }
+    }
+    if (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS) {
+        // terminated iterating on the run containers, check that rest of bitset
+        // is empty
+        for (; i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS; i_bitset++) {
+            if (container1->words[i_bitset] != 0) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_subset.c */
+/* begin file src/containers/mixed_union.c */
+/*
+ * mixed_union.c
+ *
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst.  */
+void array_bitset_container_union(const array_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst) {
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    dst->cardinality = (int32_t)bitset_set_list_withcard(
+        dst->words, dst->cardinality, src_1->array, src_1->cardinality);
+}
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */
+void array_bitset_container_lazy_union(const array_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       bitset_container_t *dst) {
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    bitset_set_list(dst->words, src_1->array, src_1->cardinality);
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+void run_bitset_container_union(const run_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst) {
+    assert(!run_container_is_full(src_1));  // catch this case upstream
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        bitset_set_lenrange(dst->words, rle.value, rle.length);
+    }
+    dst->cardinality = bitset_container_compute_cardinality(dst);
+}
+
+void run_bitset_container_lazy_union(const run_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst) {
+    assert(!run_container_is_full(src_1));  // catch this case upstream
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        bitset_set_lenrange(dst->words, rle.value, rle.length);
+    }
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+// why do we leave the result as a run container??
+void array_run_container_union(const array_container_t *src_1,
+                               const run_container_t *src_2,
+                               run_container_t *dst) {
+    if (run_container_is_full(src_2)) {
+        run_container_copy(src_2, dst);
+        return;
+    }
+    // TODO: see whether the "2*" is spurious
+    run_container_grow(dst, 2 * (src_1->cardinality + src_2->n_runs), false);
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    rle16_t previousrle;
+    if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
+        previousrle = run_container_append_first(dst, src_2->runs[rlepos]);
+        rlepos++;
+    } else {
+        previousrle =
+            run_container_append_value_first(dst, src_1->array[arraypos]);
+        arraypos++;
+    }
+    while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) {
+        if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
+            run_container_append(dst, src_2->runs[rlepos], &previousrle);
+            rlepos++;
+        } else {
+            run_container_append_value(dst, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    }
+    if (arraypos < src_1->cardinality) {
+        while (arraypos < src_1->cardinality) {
+            run_container_append_value(dst, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    } else {
+        while (rlepos < src_2->n_runs) {
+            run_container_append(dst, src_2->runs[rlepos], &previousrle);
+            rlepos++;
+        }
+    }
+}
+
+void array_run_container_inplace_union(const array_container_t *src_1,
+                                       run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return;
+    }
+    const int32_t maxoutput = src_1->cardinality + src_2->n_runs;
+    const int32_t neededcapacity = maxoutput + src_2->n_runs;
+    if (src_2->capacity < neededcapacity)
+        run_container_grow(src_2, neededcapacity, true);
+    memmove(src_2->runs + maxoutput, src_2->runs,
+            src_2->n_runs * sizeof(rle16_t));
+    rle16_t *inputsrc2 = src_2->runs + maxoutput;
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    int src2nruns = src_2->n_runs;
+    src_2->n_runs = 0;
+
+    rle16_t previousrle;
+
+    if (inputsrc2[rlepos].value <= src_1->array[arraypos]) {
+        previousrle = run_container_append_first(src_2, inputsrc2[rlepos]);
+        rlepos++;
+    } else {
+        previousrle =
+            run_container_append_value_first(src_2, src_1->array[arraypos]);
+        arraypos++;
+    }
+
+    while ((rlepos < src2nruns) && (arraypos < src_1->cardinality)) {
+        if (inputsrc2[rlepos].value <= src_1->array[arraypos]) {
+            run_container_append(src_2, inputsrc2[rlepos], &previousrle);
+            rlepos++;
+        } else {
+            run_container_append_value(src_2, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    }
+    if (arraypos < src_1->cardinality) {
+        while (arraypos < src_1->cardinality) {
+            run_container_append_value(src_2, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    } else {
+        while (rlepos < src2nruns) {
+            run_container_append(src_2, inputsrc2[rlepos], &previousrle);
+            rlepos++;
+        }
+    }
+}
+
+bool array_array_container_union(const array_container_t *src_1,
+                                 const array_container_t *src_2,
+                                 container_t **dst) {
+    int totalCardinality = src_1->cardinality + src_2->cardinality;
+    if (totalCardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_create_given_capacity(totalCardinality);
+        if (*dst != NULL) {
+            array_container_union(src_1, src_2, CAST_array(*dst));
+        } else {
+            return true;  // otherwise failure won't be caught
+        }
+        return false;  // not a bitset
+    }
+    *dst = bitset_container_create();
+    bool returnval = true;  // expect a bitset
+    if (*dst != NULL) {
+        bitset_container_t *ourbitset = CAST_bitset(*dst);
+        bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality);
+        ourbitset->cardinality = (int32_t)bitset_set_list_withcard(
+            ourbitset->words, src_1->cardinality, src_2->array,
+            src_2->cardinality);
+        if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) {
+            // need to convert!
+            *dst = array_container_from_bitset(ourbitset);
+            bitset_container_free(ourbitset);
+            returnval = false;  // not going to be a bitset
+        }
+    }
+    return returnval;
+}
+
+bool array_array_container_inplace_union(array_container_t *src_1,
+                                         const array_container_t *src_2,
+                                         container_t **dst) {
+    int totalCardinality = src_1->cardinality + src_2->cardinality;
+    *dst = NULL;
+    if (totalCardinality <= DEFAULT_MAX_SIZE) {
+        if (src_1->capacity < totalCardinality) {
+            *dst = array_container_create_given_capacity(
+                2 * totalCardinality);  // be purposefully generous
+            if (*dst != NULL) {
+                array_container_union(src_1, src_2, CAST_array(*dst));
+            } else {
+                return true;  // otherwise failure won't be caught
+            }
+            return false;  // not a bitset
+        } else {
+            memmove(src_1->array + src_2->cardinality, src_1->array,
+                    src_1->cardinality * sizeof(uint16_t));
+            // In theory, we could use fast_union_uint16, but it is unsafe. It
+            // fails with Intel compilers in particular.
+            // https://github.com/RoaringBitmap/CRoaring/pull/452
+            // See report https://github.com/RoaringBitmap/CRoaring/issues/476
+            src_1->cardinality = (int32_t)union_uint16(
+                src_1->array + src_2->cardinality, src_1->cardinality,
+                src_2->array, src_2->cardinality, src_1->array);
+            return false;  // not a bitset
+        }
+    }
+    *dst = bitset_container_create();
+    bool returnval = true;  // expect a bitset
+    if (*dst != NULL) {
+        bitset_container_t *ourbitset = CAST_bitset(*dst);
+        bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality);
+        ourbitset->cardinality = (int32_t)bitset_set_list_withcard(
+            ourbitset->words, src_1->cardinality, src_2->array,
+            src_2->cardinality);
+        if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) {
+            // need to convert!
+            if (src_1->capacity < ourbitset->cardinality) {
+                array_container_grow(src_1, ourbitset->cardinality, false);
+            }
+
+            bitset_extract_setbits_uint16(ourbitset->words,
+                                          BITSET_CONTAINER_SIZE_IN_WORDS,
+                                          src_1->array, 0);
+            src_1->cardinality = ourbitset->cardinality;
+            *dst = src_1;
+            bitset_container_free(ourbitset);
+            returnval = false;  // not going to be a bitset
+        }
+    }
+    return returnval;
+}
+
+bool array_array_container_lazy_union(const array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      container_t **dst) {
+    int totalCardinality = src_1->cardinality + src_2->cardinality;
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array
+    // containers are small. Indeed, for example, it is cheap to compute the
+    // union between an array and a bitset container, generally more so than
+    // between a large array and another array. So it is advantageous to favour
+    // bitset containers during the computation. Of course, if we convert array
+    // containers eagerly to bitset containers, we may later need to revert the
+    // bitset containers to array containerr to satisfy the Roaring format
+    // requirements, but such one-time conversions at the end may not be overly
+    // expensive. We arrived to this design based on extensive benchmarking.
+    //
+    if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
+        *dst = array_container_create_given_capacity(totalCardinality);
+        if (*dst != NULL) {
+            array_container_union(src_1, src_2, CAST_array(*dst));
+        } else {
+            return true;  // otherwise failure won't be caught
+        }
+        return false;  // not a bitset
+    }
+    *dst = bitset_container_create();
+    bool returnval = true;  // expect a bitset
+    if (*dst != NULL) {
+        bitset_container_t *ourbitset = CAST_bitset(*dst);
+        bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality);
+        bitset_set_list(ourbitset->words, src_2->array, src_2->cardinality);
+        ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    }
+    return returnval;
+}
+
+bool array_array_container_lazy_inplace_union(array_container_t *src_1,
+                                              const array_container_t *src_2,
+                                              container_t **dst) {
+    int totalCardinality = src_1->cardinality + src_2->cardinality;
+    *dst = NULL;
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array
+    // containers are small. Indeed, for example, it is cheap to compute the
+    // union between an array and a bitset container, generally more so than
+    // between a large array and another array. So it is advantageous to favour
+    // bitset containers during the computation. Of course, if we convert array
+    // containers eagerly to bitset containers, we may later need to revert the
+    // bitset containers to array containerr to satisfy the Roaring format
+    // requirements, but such one-time conversions at the end may not be overly
+    // expensive. We arrived to this design based on extensive benchmarking.
+    //
+    if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
+        if (src_1->capacity < totalCardinality) {
+            *dst = array_container_create_given_capacity(
+                2 * totalCardinality);  // be purposefully generous
+            if (*dst != NULL) {
+                array_container_union(src_1, src_2, CAST_array(*dst));
+            } else {
+                return true;  // otherwise failure won't be caught
+            }
+            return false;  // not a bitset
+        } else {
+            memmove(src_1->array + src_2->cardinality, src_1->array,
+                    src_1->cardinality * sizeof(uint16_t));
+            /*
+              Next line is safe:
+
+              We just need to focus on the reading and writing performed on
+              array1. In `union_vector16`, both vectorized and scalar code still
+              obey the basic rule: read from two inputs, do the union, and then
+              write the output.
+
+              Let's say the length(cardinality) of input2 is L2:
+              ```
+                  |<-  L2  ->|
+              array1: [output--- |input 1---|---]
+              array2: [input 2---]
+              ```
+              Let's define 3 __m128i pointers, `pos1` starts from `input1`,
+              `pos2` starts from `input2`, these 2 point at the next byte to
+              read, `out` starts from `output`, pointing at the next byte to
+              overwrite.
+              ```
+              array1: [output--- |input 1---|---]
+                          ^          ^
+                      out        pos1
+              array2: [input 2---]
+                          ^
+                          pos2
+              ```
+              The union output always contains less or equal number of elements
+              than all inputs added, so we have:
+              ```
+              out <= pos1 + pos2
+              ```
+              therefore:
+              ```
+              out <= pos1 + L2
+              ```
+              which means you will not overwrite data beyond pos1, so the data
+              haven't read is safe, and we don't care the data already read.
+            */
+            src_1->cardinality = (int32_t)fast_union_uint16(
+                src_1->array + src_2->cardinality, src_1->cardinality,
+                src_2->array, src_2->cardinality, src_1->array);
+            return false;  // not a bitset
+        }
+    }
+    *dst = bitset_container_create();
+    bool returnval = true;  // expect a bitset
+    if (*dst != NULL) {
+        bitset_container_t *ourbitset = CAST_bitset(*dst);
+        bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality);
+        bitset_set_list(ourbitset->words, src_2->array, src_2->cardinality);
+        ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    }
+    return returnval;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_union.c */
+/* begin file src/containers/mixed_xor.c */
+/*
+ * mixed_xor.c
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).
+ * Result is true iff dst is a bitset  */
+bool array_bitset_container_xor(const array_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                container_t **dst) {
+    bitset_container_t *result = bitset_container_create();
+    bitset_container_copy(src_2, result);
+    result->cardinality = (int32_t)bitset_flip_list_withcard(
+        result->words, result->cardinality, src_1->array, src_1->cardinality);
+
+    // do required type conversions.
+    if (result->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(result);
+        bitset_container_free(result);
+        return false;  // not bitset
+    }
+    *dst = result;
+    return true;  // bitset
+}
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
+ */
+
+void array_bitset_container_lazy_xor(const array_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst) {
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    bitset_flip_list(dst->words, src_1->array, src_1->cardinality);
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_xor(const run_container_t *src_1,
+                              const bitset_container_t *src_2,
+                              container_t **dst) {
+    bitset_container_t *result = bitset_container_create();
+
+    bitset_container_copy(src_2, result);
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        bitset_flip_range(result->words, rle.value,
+                          rle.value + rle.length + UINT32_C(1));
+    }
+    result->cardinality = bitset_container_compute_cardinality(result);
+
+    if (result->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(result);
+        bitset_container_free(result);
+        return false;  // not bitset
+    }
+    *dst = result;
+    return true;  // bitset
+}
+
+/* lazy xor.  Dst is initialized and may be equal to src_2.
+ *  Result is left as a bitset container, even if actual
+ *  cardinality would dictate an array container.
+ */
+
+void run_bitset_container_lazy_xor(const run_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst) {
+    if (src_2 != dst) bitset_container_copy(src_2, dst);
+    for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+        rle16_t rle = src_1->runs[rlepos];
+        bitset_flip_range(dst->words, rle.value,
+                          rle.value + rle.length + UINT32_C(1));
+    }
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_xor(const array_container_t *src_1,
+                            const run_container_t *src_2, container_t **dst) {
+    // semi following Java XOR implementation as of May 2016
+    // the C OR implementation works quite differently and can return a run
+    // container
+    // TODO could optimize for full run containers.
+
+    // use of lazy following Java impl.
+    const int arbitrary_threshold = 32;
+    if (src_1->cardinality < arbitrary_threshold) {
+        run_container_t *ans = run_container_create();
+        array_run_container_lazy_xor(src_1, src_2, ans);  // keeps runs.
+        uint8_t typecode_after;
+        *dst =
+            convert_run_to_efficient_container_and_free(ans, &typecode_after);
+        return typecode_after;
+    }
+
+    int card = run_container_cardinality(src_2);
+    if (card <= DEFAULT_MAX_SIZE) {
+        // Java implementation works with the array, xoring the run elements via
+        // iterator
+        array_container_t *temp = array_container_from_run(src_2);
+        bool ret_is_bitset = array_array_container_xor(temp, src_1, dst);
+        array_container_free(temp);
+        return ret_is_bitset ? BITSET_CONTAINER_TYPE : ARRAY_CONTAINER_TYPE;
+
+    } else {  // guess that it will end up as a bitset
+        bitset_container_t *result = bitset_container_from_run(src_2);
+        bool is_bitset = bitset_array_container_ixor(result, src_1, dst);
+        // any necessary type conversion has been done by the ixor
+        int retval = (is_bitset ? BITSET_CONTAINER_TYPE : ARRAY_CONTAINER_TYPE);
+        return retval;
+    }
+}
+
+/* Dst is a valid run container. (Can it be src_2? Let's say not.)
+ * Leaves result as run container, even if other options are
+ * smaller.
+ */
+
+void array_run_container_lazy_xor(const array_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  run_container_t *dst) {
+    run_container_grow(dst, src_1->cardinality + src_2->n_runs, false);
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    dst->n_runs = 0;
+
+    while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) {
+        if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
+            run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value,
+                                                 src_2->runs[rlepos].length);
+            rlepos++;
+        } else {
+            run_container_smart_append_exclusive(dst, src_1->array[arraypos],
+                                                 0);
+            arraypos++;
+        }
+    }
+    while (arraypos < src_1->cardinality) {
+        run_container_smart_append_exclusive(dst, src_1->array[arraypos], 0);
+        arraypos++;
+    }
+    while (rlepos < src_2->n_runs) {
+        run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value,
+                                             src_2->runs[rlepos].length);
+        rlepos++;
+    }
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_xor(const run_container_t *src_1,
+                          const run_container_t *src_2, container_t **dst) {
+    run_container_t *ans = run_container_create();
+    run_container_xor(src_1, src_2, ans);
+    uint8_t typecode_after;
+    *dst = convert_run_to_efficient_container_and_free(ans, &typecode_after);
+    return typecode_after;
+}
+
+/*
+ * Java implementation (as of May 2016) for array_run, run_run
+ * and  bitset_run don't do anything different for inplace.
+ * Could adopt the mixed_union.c approach instead (ie, using
+ * smart_append_exclusive)
+ *
+ */
+
+bool array_array_container_xor(const array_container_t *src_1,
+                               const array_container_t *src_2,
+                               container_t **dst) {
+    int totalCardinality =
+        src_1->cardinality + src_2->cardinality;  // upper bound
+    if (totalCardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_create_given_capacity(totalCardinality);
+        array_container_xor(src_1, src_2, CAST_array(*dst));
+        return false;  // not a bitset
+    }
+    *dst = bitset_container_from_array(src_1);
+    bool returnval = true;  // expect a bitset
+    bitset_container_t *ourbitset = CAST_bitset(*dst);
+    ourbitset->cardinality = (uint32_t)bitset_flip_list_withcard(
+        ourbitset->words, src_1->cardinality, src_2->array, src_2->cardinality);
+    if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) {
+        // need to convert!
+        *dst = array_container_from_bitset(ourbitset);
+        bitset_container_free(ourbitset);
+        returnval = false;  // not going to be a bitset
+    }
+
+    return returnval;
+}
+
+bool array_array_container_lazy_xor(const array_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    container_t **dst) {
+    int totalCardinality = src_1->cardinality + src_2->cardinality;
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array
+    // containers are small. Indeed, for example, it is cheap to compute the
+    // exclusive union between an array and a bitset container, generally more
+    // so than between a large array and another array. So it is advantageous to
+    // favour bitset containers during the computation. Of course, if we convert
+    // array containers eagerly to bitset containers, we may later need to
+    // revert the bitset containers to array containerr to satisfy the Roaring
+    // format requirements, but such one-time conversions at the end may not be
+    // overly expensive. We arrived to this design based on extensive
+    // benchmarking on unions. For XOR/exclusive union, we simply followed the
+    // heuristic used by the unions (see  mixed_union.c). Further tuning is
+    // possible.
+    //
+    if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
+        *dst = array_container_create_given_capacity(totalCardinality);
+        if (*dst != NULL) array_container_xor(src_1, src_2, CAST_array(*dst));
+        return false;  // not a bitset
+    }
+    *dst = bitset_container_from_array(src_1);
+    bool returnval = true;  // expect a bitset (maybe, for XOR??)
+    if (*dst != NULL) {
+        bitset_container_t *ourbitset = CAST_bitset(*dst);
+        bitset_flip_list(ourbitset->words, src_2->array, src_2->cardinality);
+        ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    }
+    return returnval;
+}
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_xor(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst) {
+    bitset_container_t *ans = bitset_container_create();
+    int card = bitset_container_xor(src_1, src_2, ans);
+    if (card <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(ans);
+        bitset_container_free(ans);
+        return false;  // not bitset
+    } else {
+        *dst = ans;
+        return true;
+    }
+}
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_ixor(bitset_container_t *src_1,
+                                 const array_container_t *src_2,
+                                 container_t **dst) {
+    *dst = src_1;
+    src_1->cardinality = (uint32_t)bitset_flip_list_withcard(
+        src_1->words, src_1->cardinality, src_2->array, src_2->cardinality);
+
+    if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);
+        return false;  // not bitset
+    } else
+        return true;
+}
+
+/* a bunch of in-place, some of which may not *really* be inplace.
+ * TODO: write actual inplace routine if efficiency warrants it
+ * Anything inplace with a bitset is a good candidate
+ */
+
+bool bitset_bitset_container_ixor(bitset_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  container_t **dst) {
+    int card = bitset_container_xor(src_1, src_2, src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);
+        return false;  // not bitset
+    } else {
+        *dst = src_1;
+        return true;
+    }
+}
+
+bool array_bitset_container_ixor(array_container_t *src_1,
+                                 const bitset_container_t *src_2,
+                                 container_t **dst) {
+    bool ans = array_bitset_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return ans;
+}
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_ixor(run_container_t *src_1,
+                               const bitset_container_t *src_2,
+                               container_t **dst) {
+    bool ans = run_bitset_container_xor(src_1, src_2, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+bool bitset_run_container_ixor(bitset_container_t *src_1,
+                               const run_container_t *src_2,
+                               container_t **dst) {
+    bool ans = run_bitset_container_xor(src_2, src_1, dst);
+    bitset_container_free(src_1);
+    return ans;
+}
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_ixor(array_container_t *src_1,
+                             const run_container_t *src_2, container_t **dst) {
+    int ans = array_run_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return ans;
+}
+
+int run_array_container_ixor(run_container_t *src_1,
+                             const array_container_t *src_2,
+                             container_t **dst) {
+    int ans = array_run_container_xor(src_2, src_1, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+bool array_array_container_ixor(array_container_t *src_1,
+                                const array_container_t *src_2,
+                                container_t **dst) {
+    bool ans = array_array_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return ans;
+}
+
+int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
+                           container_t **dst) {
+    int ans = run_run_container_xor(src_1, src_2, dst);
+    run_container_free(src_1);
+    return ans;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/containers/mixed_xor.c */
+/* begin file src/containers/run.c */
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+extern inline uint16_t run_container_minimum(const run_container_t *run);
+extern inline uint16_t run_container_maximum(const run_container_t *run);
+extern inline int32_t interleavedBinarySearch(const rle16_t *array,
+                                              int32_t lenarray, uint16_t ikey);
+extern inline bool run_container_contains(const run_container_t *run,
+                                          uint16_t pos);
+extern inline int run_container_index_equalorlarger(const run_container_t *arr,
+                                                    uint16_t x);
+extern inline bool run_container_is_full(const run_container_t *run);
+extern inline bool run_container_nonzero_cardinality(const run_container_t *rc);
+extern inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs);
+extern inline run_container_t *run_container_create_range(uint32_t start,
+                                                          uint32_t stop);
+extern inline int run_container_cardinality(const run_container_t *run);
+
+bool run_container_add(run_container_t *run, uint16_t pos) {
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (index >= 0) return false;  // already there
+    index = -index - 2;            // points to preceding value, possibly -1
+    if (index >= 0) {              // possible match
+        int32_t offset = pos - run->runs[index].value;
+        int32_t le = run->runs[index].length;
+        if (offset <= le) return false;  // already there
+        if (offset == le + 1) {
+            // we may need to fuse
+            if (index + 1 < run->n_runs) {
+                if (run->runs[index + 1].value == pos + 1) {
+                    // indeed fusion is needed
+                    run->runs[index].length = run->runs[index + 1].value +
+                                              run->runs[index + 1].length -
+                                              run->runs[index].value;
+                    recoverRoomAtIndex(run, (uint16_t)(index + 1));
+                    return true;
+                }
+            }
+            run->runs[index].length++;
+            return true;
+        }
+        if (index + 1 < run->n_runs) {
+            // we may need to fuse
+            if (run->runs[index + 1].value == pos + 1) {
+                // indeed fusion is needed
+                run->runs[index + 1].value = pos;
+                run->runs[index + 1].length = run->runs[index + 1].length + 1;
+                return true;
+            }
+        }
+    }
+    if (index == -1) {
+        // we may need to extend the first run
+        if (0 < run->n_runs) {
+            if (run->runs[0].value == pos + 1) {
+                run->runs[0].length++;
+                run->runs[0].value--;
+                return true;
+            }
+        }
+    }
+    makeRoomAtIndex(run, (uint16_t)(index + 1));
+    run->runs[index + 1].value = pos;
+    run->runs[index + 1].length = 0;
+    return true;
+}
+
+/* Create a new run container. Return NULL in case of failure. */
+run_container_t *run_container_create_given_capacity(int32_t size) {
+    run_container_t *run;
+    /* Allocate the run container itself. */
+    if ((run = (run_container_t *)roaring_malloc(sizeof(run_container_t))) ==
+        NULL) {
+        return NULL;
+    }
+    if (size <= 0) {  // we don't want to rely on malloc(0)
+        run->runs = NULL;
+    } else if ((run->runs = (rle16_t *)roaring_malloc(sizeof(rle16_t) *
+                                                      size)) == NULL) {
+        roaring_free(run);
+        return NULL;
+    }
+    run->capacity = size;
+    run->n_runs = 0;
+    return run;
+}
+
+int run_container_shrink_to_fit(run_container_t *src) {
+    if (src->n_runs == src->capacity) return 0;  // nothing to do
+    int savings = src->capacity - src->n_runs;
+    src->capacity = src->n_runs;
+    rle16_t *oldruns = src->runs;
+    src->runs =
+        (rle16_t *)roaring_realloc(oldruns, src->capacity * sizeof(rle16_t));
+    if (src->runs == NULL) roaring_free(oldruns);  // should never happen?
+    return savings;
+}
+/* Create a new run container. Return NULL in case of failure. */
+run_container_t *run_container_create(void) {
+    return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE);
+}
+
+ALLOW_UNALIGNED
+run_container_t *run_container_clone(const run_container_t *src) {
+    run_container_t *run = run_container_create_given_capacity(src->capacity);
+    if (run == NULL) return NULL;
+    run->capacity = src->capacity;
+    run->n_runs = src->n_runs;
+    memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t));
+    return run;
+}
+
+void run_container_offset(const run_container_t *c, container_t **loc,
+                          container_t **hic, uint16_t offset) {
+    run_container_t *lo = NULL, *hi = NULL;
+
+    bool split;
+    int lo_cap, hi_cap;
+    int top, pivot;
+
+    top = (1 << 16) - offset;
+    pivot = run_container_index_equalorlarger(c, top);
+
+    if (pivot == -1) {
+        split = false;
+        lo_cap = c->n_runs;
+        hi_cap = 0;
+    } else {
+        split = c->runs[pivot].value < top;
+        lo_cap = pivot + (split ? 1 : 0);
+        hi_cap = c->n_runs - pivot;
+    }
+
+    if (loc && lo_cap) {
+        lo = run_container_create_given_capacity(lo_cap);
+        memcpy(lo->runs, c->runs, lo_cap * sizeof(rle16_t));
+        lo->n_runs = lo_cap;
+        for (int i = 0; i < lo_cap; ++i) {
+            lo->runs[i].value += offset;
+        }
+        *loc = (container_t *)lo;
+    }
+
+    if (hic && hi_cap) {
+        hi = run_container_create_given_capacity(hi_cap);
+        memcpy(hi->runs, c->runs + pivot, hi_cap * sizeof(rle16_t));
+        hi->n_runs = hi_cap;
+        for (int i = 0; i < hi_cap; ++i) {
+            hi->runs[i].value += offset;
+        }
+        *hic = (container_t *)hi;
+    }
+
+    // Fix the split.
+    if (split) {
+        if (lo != NULL) {
+            // Add the missing run to 'lo', exhausting length.
+            lo->runs[lo->n_runs - 1].length =
+                (1 << 16) - lo->runs[lo->n_runs - 1].value - 1;
+        }
+
+        if (hi != NULL) {
+            // Fix the first run in 'hi'.
+            hi->runs[0].length -= UINT16_MAX - hi->runs[0].value + 1;
+            hi->runs[0].value = 0;
+        }
+    }
+}
+
+/* Free memory. */
+void run_container_free(run_container_t *run) {
+    if (run->runs !=
+        NULL) {  // Jon Strabala reports that some tools complain otherwise
+        roaring_free(run->runs);
+        run->runs = NULL;  // pedantic
+    }
+    roaring_free(run);
+}
+
+void run_container_grow(run_container_t *run, int32_t min, bool copy) {
+    int32_t newCapacity = (run->capacity == 0)   ? RUN_DEFAULT_INIT_SIZE
+                          : run->capacity < 64   ? run->capacity * 2
+                          : run->capacity < 1024 ? run->capacity * 3 / 2
+                                                 : run->capacity * 5 / 4;
+    if (newCapacity < min) newCapacity = min;
+    run->capacity = newCapacity;
+    assert(run->capacity >= min);
+    if (copy) {
+        rle16_t *oldruns = run->runs;
+        run->runs = (rle16_t *)roaring_realloc(oldruns,
+                                               run->capacity * sizeof(rle16_t));
+        if (run->runs == NULL) roaring_free(oldruns);
+    } else {
+        // Jon Strabala reports that some tools complain otherwise
+        if (run->runs != NULL) {
+            roaring_free(run->runs);
+        }
+        run->runs = (rle16_t *)roaring_malloc(run->capacity * sizeof(rle16_t));
+    }
+    // We may have run->runs == NULL.
+}
+
+/* copy one container into another */
+void run_container_copy(const run_container_t *src, run_container_t *dst) {
+    const int32_t n_runs = src->n_runs;
+    if (src->n_runs > dst->capacity) {
+        run_container_grow(dst, n_runs, false);
+    }
+    dst->n_runs = n_runs;
+    memcpy(dst->runs, src->runs, sizeof(rle16_t) * n_runs);
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_union(const run_container_t *src_1,
+                         const run_container_t *src_2, run_container_t *dst) {
+    // TODO: this could be a lot more efficient
+
+    // we start out with inexpensive checks
+    const bool if1 = run_container_is_full(src_1);
+    const bool if2 = run_container_is_full(src_2);
+    if (if1 || if2) {
+        if (if1) {
+            run_container_copy(src_1, dst);
+            return;
+        }
+        if (if2) {
+            run_container_copy(src_2, dst);
+            return;
+        }
+    }
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity)
+        run_container_grow(dst, neededcapacity, false);
+    dst->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+
+    rle16_t previousrle;
+    if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
+        previousrle = run_container_append_first(dst, src_1->runs[rlepos]);
+        rlepos++;
+    } else {
+        previousrle = run_container_append_first(dst, src_2->runs[xrlepos]);
+        xrlepos++;
+    }
+
+    while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) {
+        rle16_t newrl;
+        if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
+            newrl = src_1->runs[rlepos];
+            rlepos++;
+        } else {
+            newrl = src_2->runs[xrlepos];
+            xrlepos++;
+        }
+        run_container_append(dst, newrl, &previousrle);
+    }
+    while (xrlepos < src_2->n_runs) {
+        run_container_append(dst, src_2->runs[xrlepos], &previousrle);
+        xrlepos++;
+    }
+    while (rlepos < src_1->n_runs) {
+        run_container_append(dst, src_1->runs[rlepos], &previousrle);
+        rlepos++;
+    }
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `src_1'
+ */
+void run_container_union_inplace(run_container_t *src_1,
+                                 const run_container_t *src_2) {
+    // TODO: this could be a lot more efficient
+
+    // we start out with inexpensive checks
+    const bool if1 = run_container_is_full(src_1);
+    const bool if2 = run_container_is_full(src_2);
+    if (if1 || if2) {
+        if (if1) {
+            return;
+        }
+        if (if2) {
+            run_container_copy(src_2, src_1);
+            return;
+        }
+    }
+    // we move the data to the end of the current array
+    const int32_t maxoutput = src_1->n_runs + src_2->n_runs;
+    const int32_t neededcapacity = maxoutput + src_1->n_runs;
+    if (src_1->capacity < neededcapacity)
+        run_container_grow(src_1, neededcapacity, true);
+    memmove(src_1->runs + maxoutput, src_1->runs,
+            src_1->n_runs * sizeof(rle16_t));
+    rle16_t *inputsrc1 = src_1->runs + maxoutput;
+    const int32_t input1nruns = src_1->n_runs;
+    src_1->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+
+    rle16_t previousrle;
+    if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
+        previousrle = run_container_append_first(src_1, inputsrc1[rlepos]);
+        rlepos++;
+    } else {
+        previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]);
+        xrlepos++;
+    }
+    while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) {
+        rle16_t newrl;
+        if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
+            newrl = inputsrc1[rlepos];
+            rlepos++;
+        } else {
+            newrl = src_2->runs[xrlepos];
+            xrlepos++;
+        }
+        run_container_append(src_1, newrl, &previousrle);
+    }
+    while (xrlepos < src_2->n_runs) {
+        run_container_append(src_1, src_2->runs[xrlepos], &previousrle);
+        xrlepos++;
+    }
+    while (rlepos < input1nruns) {
+        run_container_append(src_1, inputsrc1[rlepos], &previousrle);
+        rlepos++;
+    }
+}
+
+/* Compute the symmetric difference of `src_1' and `src_2' and write the result
+ * to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_xor(const run_container_t *src_1,
+                       const run_container_t *src_2, run_container_t *dst) {
+    // don't bother to convert xor with full range into negation
+    // since negation is implemented similarly
+
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity)
+        run_container_grow(dst, neededcapacity, false);
+
+    int32_t pos1 = 0;
+    int32_t pos2 = 0;
+    dst->n_runs = 0;
+
+    while ((pos1 < src_1->n_runs) && (pos2 < src_2->n_runs)) {
+        if (src_1->runs[pos1].value <= src_2->runs[pos2].value) {
+            run_container_smart_append_exclusive(dst, src_1->runs[pos1].value,
+                                                 src_1->runs[pos1].length);
+            pos1++;
+        } else {
+            run_container_smart_append_exclusive(dst, src_2->runs[pos2].value,
+                                                 src_2->runs[pos2].length);
+            pos2++;
+        }
+    }
+    while (pos1 < src_1->n_runs) {
+        run_container_smart_append_exclusive(dst, src_1->runs[pos1].value,
+                                             src_1->runs[pos1].length);
+        pos1++;
+    }
+
+    while (pos2 < src_2->n_runs) {
+        run_container_smart_append_exclusive(dst, src_2->runs[pos2].value,
+                                             src_2->runs[pos2].length);
+        pos2++;
+    }
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void run_container_intersection(const run_container_t *src_1,
+                                const run_container_t *src_2,
+                                run_container_t *dst) {
+    const bool if1 = run_container_is_full(src_1);
+    const bool if2 = run_container_is_full(src_2);
+    if (if1 || if2) {
+        if (if1) {
+            run_container_copy(src_2, dst);
+            return;
+        }
+        if (if2) {
+            run_container_copy(src_1, dst);
+            return;
+        }
+    }
+    // TODO: this could be a lot more efficient, could use SIMD optimizations
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity)
+        run_container_grow(dst, neededcapacity, false);
+    dst->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            const int32_t lateststart = start > xstart ? start : xstart;
+            int32_t earliestend;
+            if (end == xend) {  // improbable
+                earliestend = end;
+                rlepos++;
+                xrlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            } else if (end < xend) {
+                earliestend = end;
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+
+            } else {  // end > xend
+                earliestend = xend;
+                xrlepos++;
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            }
+            dst->runs[dst->n_runs].value = (uint16_t)lateststart;
+            dst->runs[dst->n_runs].length =
+                (uint16_t)(earliestend - lateststart - 1);
+            dst->n_runs++;
+        }
+    }
+}
+
+/* Compute the size of the intersection of src_1 and src_2 . */
+int run_container_intersection_cardinality(const run_container_t *src_1,
+                                           const run_container_t *src_2) {
+    const bool if1 = run_container_is_full(src_1);
+    const bool if2 = run_container_is_full(src_2);
+    if (if1 || if2) {
+        if (if1) {
+            return run_container_cardinality(src_2);
+        }
+        if (if2) {
+            return run_container_cardinality(src_1);
+        }
+    }
+    int answer = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            const int32_t lateststart = start > xstart ? start : xstart;
+            int32_t earliestend;
+            if (end == xend) {  // improbable
+                earliestend = end;
+                rlepos++;
+                xrlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            } else if (end < xend) {
+                earliestend = end;
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+
+            } else {  // end > xend
+                earliestend = xend;
+                xrlepos++;
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            }
+            answer += earliestend - lateststart;
+        }
+    }
+    return answer;
+}
+
+bool run_container_intersect(const run_container_t *src_1,
+                             const run_container_t *src_2) {
+    const bool if1 = run_container_is_full(src_1);
+    const bool if2 = run_container_is_full(src_2);
+    if (if1 || if2) {
+        if (if1) {
+            return !run_container_empty(src_2);
+        }
+        if (if2) {
+            return !run_container_empty(src_1);
+        }
+    }
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Compute the difference of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void run_container_andnot(const run_container_t *src_1,
+                          const run_container_t *src_2, run_container_t *dst) {
+    // following Java implementation as of June 2016
+
+    if (dst->capacity < src_1->n_runs + src_2->n_runs)
+        run_container_grow(dst, src_1->n_runs + src_2->n_runs, false);
+
+    dst->n_runs = 0;
+
+    int rlepos1 = 0;
+    int rlepos2 = 0;
+    int32_t start = src_1->runs[rlepos1].value;
+    int32_t end = start + src_1->runs[rlepos1].length + 1;
+    int32_t start2 = src_2->runs[rlepos2].value;
+    int32_t end2 = start2 + src_2->runs[rlepos2].length + 1;
+
+    while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) {
+        if (end <= start2) {
+            // output the first run
+            dst->runs[dst->n_runs++] = MAKE_RLE16(start, end - start - 1);
+            rlepos1++;
+            if (rlepos1 < src_1->n_runs) {
+                start = src_1->runs[rlepos1].value;
+                end = start + src_1->runs[rlepos1].length + 1;
+            }
+        } else if (end2 <= start) {
+            // exit the second run
+            rlepos2++;
+            if (rlepos2 < src_2->n_runs) {
+                start2 = src_2->runs[rlepos2].value;
+                end2 = start2 + src_2->runs[rlepos2].length + 1;
+            }
+        } else {
+            if (start < start2) {
+                dst->runs[dst->n_runs++] =
+                    MAKE_RLE16(start, start2 - start - 1);
+            }
+            if (end2 < end) {
+                start = end2;
+            } else {
+                rlepos1++;
+                if (rlepos1 < src_1->n_runs) {
+                    start = src_1->runs[rlepos1].value;
+                    end = start + src_1->runs[rlepos1].length + 1;
+                }
+            }
+        }
+    }
+    if (rlepos1 < src_1->n_runs) {
+        dst->runs[dst->n_runs++] = MAKE_RLE16(start, end - start - 1);
+        rlepos1++;
+        if (rlepos1 < src_1->n_runs) {
+            memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1,
+                   sizeof(rle16_t) * (src_1->n_runs - rlepos1));
+            dst->n_runs += src_1->n_runs - rlepos1;
+        }
+    }
+}
+
+ALLOW_UNALIGNED
+int run_container_to_uint32_array(void *vout, const run_container_t *cont,
+                                  uint32_t base) {
+    int outpos = 0;
+    uint32_t *out = (uint32_t *)vout;
+    for (int i = 0; i < cont->n_runs; ++i) {
+        uint32_t run_start = base + cont->runs[i].value;
+        uint16_t le = cont->runs[i].length;
+        for (int j = 0; j <= le; ++j) {
+            uint32_t val = run_start + j;
+            memcpy(out + outpos, &val,
+                   sizeof(uint32_t));  // should be compiled as a MOV on x64
+            outpos++;
+        }
+    }
+    return outpos;
+}
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void run_container_printf(const run_container_t *cont) {
+    for (int i = 0; i < cont->n_runs; ++i) {
+        uint16_t run_start = cont->runs[i].value;
+        uint16_t le = cont->runs[i].length;
+        printf("[%d,%d]", run_start, run_start + le);
+    }
+}
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void run_container_printf_as_uint32_array(const run_container_t *cont,
+                                          uint32_t base) {
+    if (cont->n_runs == 0) return;
+    {
+        uint32_t run_start = base + cont->runs[0].value;
+        uint16_t le = cont->runs[0].length;
+        printf("%u", run_start);
+        for (uint32_t j = 1; j <= le; ++j) printf(",%u", run_start + j);
+    }
+    for (int32_t i = 1; i < cont->n_runs; ++i) {
+        uint32_t run_start = base + cont->runs[i].value;
+        uint16_t le = cont->runs[i].length;
+        for (uint32_t j = 0; j <= le; ++j) printf(",%u", run_start + j);
+    }
+}
+
+/*
+ * Validate the container. Returns true if valid.
+ */
+bool run_container_validate(const run_container_t *run, const char **reason) {
+    if (run->n_runs < 0) {
+        *reason = "negative run count";
+        return false;
+    }
+    if (run->capacity < 0) {
+        *reason = "negative run capacity";
+        return false;
+    }
+    if (run->capacity < run->n_runs) {
+        *reason = "capacity less than run count";
+        return false;
+    }
+
+    if (run->n_runs == 0) {
+        *reason = "zero run count";
+        return false;
+    }
+    if (run->runs == NULL) {
+        *reason = "NULL runs";
+        return false;
+    }
+
+    // Use uint32_t to avoid overflow issues on ranges that contain UINT16_MAX.
+    uint32_t last_end = 0;
+    for (int i = 0; i < run->n_runs; ++i) {
+        uint32_t start = run->runs[i].value;
+        uint32_t end = start + run->runs[i].length + 1;
+        if (end <= start) {
+            *reason = "run start + length overflow";
+            return false;
+        }
+        if (end > (1 << 16)) {
+            *reason = "run start + length too large";
+            return false;
+        }
+        if (start < last_end) {
+            *reason = "run start less than last end";
+            return false;
+        }
+        if (start == last_end && last_end != 0) {
+            *reason = "run start equal to last end, should have combined";
+            return false;
+        }
+        last_end = end;
+    }
+    return true;
+}
+
+int32_t run_container_write(const run_container_t *container, char *buf) {
+    uint16_t cast_16 = container->n_runs;
+    memcpy(buf, &cast_16, sizeof(uint16_t));
+    memcpy(buf + sizeof(uint16_t), container->runs,
+           container->n_runs * sizeof(rle16_t));
+    return run_container_size_in_bytes(container);
+}
+
+int32_t run_container_read(int32_t cardinality, run_container_t *container,
+                           const char *buf) {
+    (void)cardinality;
+    uint16_t cast_16;
+    memcpy(&cast_16, buf, sizeof(uint16_t));
+    container->n_runs = cast_16;
+    if (container->n_runs > container->capacity)
+        run_container_grow(container, container->n_runs, false);
+    if (container->n_runs > 0) {
+        memcpy(container->runs, buf + sizeof(uint16_t),
+               container->n_runs * sizeof(rle16_t));
+    }
+    return run_container_size_in_bytes(container);
+}
+
+bool run_container_iterate(const run_container_t *cont, uint32_t base,
+                           roaring_iterator iterator, void *ptr) {
+    for (int i = 0; i < cont->n_runs; ++i) {
+        uint32_t run_start = base + cont->runs[i].value;
+        uint16_t le = cont->runs[i].length;
+
+        for (int j = 0; j <= le; ++j)
+            if (!iterator(run_start + j, ptr)) return false;
+    }
+    return true;
+}
+
+bool run_container_iterate64(const run_container_t *cont, uint32_t base,
+                             roaring_iterator64 iterator, uint64_t high_bits,
+                             void *ptr) {
+    for (int i = 0; i < cont->n_runs; ++i) {
+        uint32_t run_start = base + cont->runs[i].value;
+        uint16_t le = cont->runs[i].length;
+
+        for (int j = 0; j <= le; ++j)
+            if (!iterator(high_bits | (uint64_t)(run_start + j), ptr))
+                return false;
+    }
+    return true;
+}
+
+bool run_container_is_subset(const run_container_t *container1,
+                             const run_container_t *container2) {
+    int i1 = 0, i2 = 0;
+    while (i1 < container1->n_runs && i2 < container2->n_runs) {
+        int start1 = container1->runs[i1].value;
+        int stop1 = start1 + container1->runs[i1].length;
+        int start2 = container2->runs[i2].value;
+        int stop2 = start2 + container2->runs[i2].length;
+        if (start1 < start2) {
+            return false;
+        } else {  // start1 >= start2
+            if (stop1 < stop2) {
+                i1++;
+            } else if (stop1 == stop2) {
+                i1++;
+                i2++;
+            } else {  // stop1 > stop2
+                i2++;
+            }
+        }
+    }
+    if (i1 == container1->n_runs) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+// TODO: write smart_append_exclusive version to match the overloaded 1 param
+// Java version (or  is it even used?)
+
+// follows the Java implementation closely
+// length is the rle-value.  Ie, run [10,12) uses a length value 1.
+void run_container_smart_append_exclusive(run_container_t *src,
+                                          const uint16_t start,
+                                          const uint16_t length) {
+    int old_end;
+    rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL;
+    rle16_t *appended_last_run = src->runs + src->n_runs;
+
+    if (!src->n_runs ||
+        (start > (old_end = last_run->value + last_run->length + 1))) {
+        *appended_last_run = MAKE_RLE16(start, length);
+        src->n_runs++;
+        return;
+    }
+    if (old_end == start) {
+        // we merge
+        last_run->length += (length + 1);
+        return;
+    }
+    int new_end = start + length + 1;
+
+    if (start == last_run->value) {
+        // wipe out previous
+        if (new_end < old_end) {
+            *last_run = MAKE_RLE16(new_end, old_end - new_end - 1);
+            return;
+        } else if (new_end > old_end) {
+            *last_run = MAKE_RLE16(old_end, new_end - old_end - 1);
+            return;
+        } else {
+            src->n_runs--;
+            return;
+        }
+    }
+    last_run->length = start - last_run->value - 1;
+    if (new_end < old_end) {
+        *appended_last_run = MAKE_RLE16(new_end, old_end - new_end - 1);
+        src->n_runs++;
+    } else if (new_end > old_end) {
+        *appended_last_run = MAKE_RLE16(old_end, new_end - old_end - 1);
+        src->n_runs++;
+    }
+}
+
+bool run_container_select(const run_container_t *container,
+                          uint32_t *start_rank, uint32_t rank,
+                          uint32_t *element) {
+    for (int i = 0; i < container->n_runs; i++) {
+        uint16_t length = container->runs[i].length;
+        if (rank <= *start_rank + length) {
+            uint16_t value = container->runs[i].value;
+            *element = value + rank - (*start_rank);
+            return true;
+        } else
+            *start_rank += length + 1;
+    }
+    return false;
+}
+
+int run_container_rank(const run_container_t *container, uint16_t x) {
+    int sum = 0;
+    uint32_t x32 = x;
+    for (int i = 0; i < container->n_runs; i++) {
+        uint32_t startpoint = container->runs[i].value;
+        uint32_t length = container->runs[i].length;
+        uint32_t endpoint = length + startpoint;
+        if (x <= endpoint) {
+            if (x < startpoint) break;
+            return sum + (x32 - startpoint) + 1;
+        } else {
+            sum += length + 1;
+        }
+    }
+    return sum;
+}
+uint32_t run_container_rank_many(const run_container_t *container,
+                                 uint64_t start_rank, const uint32_t *begin,
+                                 const uint32_t *end, uint64_t *ans) {
+    const uint16_t high = (uint16_t)((*begin) >> 16);
+    const uint32_t *iter = begin;
+    int sum = 0;
+    int i = 0;
+    for (; iter != end; iter++) {
+        uint32_t x = *iter;
+        uint16_t xhigh = (uint16_t)(x >> 16);
+        if (xhigh != high) return iter - begin;  // stop at next container
+
+        uint32_t x32 = x & 0xFFFF;
+        while (i < container->n_runs) {
+            uint32_t startpoint = container->runs[i].value;
+            uint32_t length = container->runs[i].length;
+            uint32_t endpoint = length + startpoint;
+            if (x32 <= endpoint) {
+                if (x32 < startpoint) {
+                    *(ans++) = start_rank + sum;
+                } else {
+                    *(ans++) = start_rank + sum + (x32 - startpoint) + 1;
+                }
+                break;
+            } else {
+                sum += length + 1;
+                i++;
+            }
+        }
+        if (i >= container->n_runs) *(ans++) = start_rank + sum;
+    }
+
+    return iter - begin;
+}
+
+int run_container_get_index(const run_container_t *container, uint16_t x) {
+    if (run_container_contains(container, x)) {
+        int sum = 0;
+        uint32_t x32 = x;
+        for (int i = 0; i < container->n_runs; i++) {
+            uint32_t startpoint = container->runs[i].value;
+            uint32_t length = container->runs[i].length;
+            uint32_t endpoint = length + startpoint;
+            if (x <= endpoint) {
+                if (x < startpoint) break;
+                return sum + (x32 - startpoint);
+            } else {
+                sum += length + 1;
+            }
+        }
+        return sum - 1;
+    } else {
+        return -1;
+    }
+}
+
+#if defined(CROARING_IS_X64) && CROARING_COMPILER_SUPPORTS_AVX512
+
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+/* Get the cardinality of `run'. Requires an actual computation. */
+static inline int _avx512_run_container_cardinality(
+    const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* by initializing with n_runs, we omit counting the +1 for each pair. */
+    int sum = n_runs;
+    int32_t k = 0;
+    const int32_t step = sizeof(__m512i) / sizeof(rle16_t);
+    if (n_runs > step) {
+        __m512i total = _mm512_setzero_si512();
+        for (; k + step <= n_runs; k += step) {
+            __m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k));
+            __m512i justlengths = _mm512_srli_epi32(ymm1, 16);
+            total = _mm512_add_epi32(total, justlengths);
+        }
+
+        __m256i lo = _mm512_extracti32x8_epi32(total, 0);
+        __m256i hi = _mm512_extracti32x8_epi32(total, 1);
+
+        // a store might be faster than extract?
+        uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)];
+        _mm256_storeu_si256((__m256i *)buffer, lo);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+
+        _mm256_storeu_si256((__m256i *)buffer, hi);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+    }
+    for (; k < n_runs; ++k) {
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX2
+ALLOW_UNALIGNED
+/* Get the cardinality of `run'. Requires an actual computation. */
+static inline int _avx2_run_container_cardinality(const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* by initializing with n_runs, we omit counting the +1 for each pair. */
+    int sum = n_runs;
+    int32_t k = 0;
+    const int32_t step = sizeof(__m256i) / sizeof(rle16_t);
+    if (n_runs > step) {
+        __m256i total = _mm256_setzero_si256();
+        for (; k + step <= n_runs; k += step) {
+            __m256i ymm1 = _mm256_lddqu_si256((const __m256i *)(runs + k));
+            __m256i justlengths = _mm256_srli_epi32(ymm1, 16);
+            total = _mm256_add_epi32(total, justlengths);
+        }
+        // a store might be faster than extract?
+        uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)];
+        _mm256_storeu_si256((__m256i *)buffer, total);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+    }
+    for (; k < n_runs; ++k) {
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+
+CROARING_UNTARGET_AVX2
+
+/* Get the cardinality of `run'. Requires an actual computation. */
+static inline int _scalar_run_container_cardinality(
+    const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* by initializing with n_runs, we omit counting the +1 for each pair. */
+    int sum = n_runs;
+    for (int k = 0; k < n_runs; ++k) {
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+
+int run_container_cardinality(const run_container_t *run) {
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) {
+        return _avx512_run_container_cardinality(run);
+    } else
+#endif
+        if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+        return _avx2_run_container_cardinality(run);
+    } else {
+        return _scalar_run_container_cardinality(run);
+    }
+}
+#else
+
+/* Get the cardinality of `run'. Requires an actual computation. */
+ALLOW_UNALIGNED
+int run_container_cardinality(const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* by initializing with n_runs, we omit counting the +1 for each pair. */
+    int sum = n_runs;
+    for (int k = 0; k < n_runs; ++k) {
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+#endif
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif/* end file src/containers/run.c */
+/* begin file src/isadetection.c */
+
+/* From
+https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
+Highly modified.
+
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
+(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
+Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
+America and IDIAP Research Institute nor the names of its contributors may be
+   used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+// Binaries produced by Visual Studio with solely AVX2 routines
+// can compile to AVX-512 thus causing crashes on non-AVX-512 systems.
+// This appears to affect VS 17.8 and 17.9. We disable AVX-512 and AVX2
+// on these systems. It seems that ClangCL is not affected.
+// https://github.com/RoaringBitmap/CRoaring/pull/603
+#ifndef __clang__
+#if _MSC_VER >= 1938
+#define ROARING_DISABLE_AVX 1
+#endif  // _MSC_VER >= 1938
+#endif  // __clang__
+
+// We need portability.h to be included first, see
+// https://github.com/RoaringBitmap/CRoaring/issues/394
+#if CROARING_REGULAR_VISUAL_STUDIO
+#include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+#include <cpuid.h>
+#endif  // CROARING_REGULAR_VISUAL_STUDIO
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+enum croaring_instruction_set {
+    CROARING_DEFAULT = 0x0,
+    CROARING_NEON = 0x1,
+    CROARING_AVX2 = 0x4,
+    CROARING_SSE42 = 0x8,
+    CROARING_PCLMULQDQ = 0x10,
+    CROARING_BMI1 = 0x20,
+    CROARING_BMI2 = 0x40,
+    CROARING_ALTIVEC = 0x80,
+    CROARING_AVX512F = 0x100,
+    CROARING_AVX512DQ = 0x200,
+    CROARING_AVX512BW = 0x400,
+    CROARING_AVX512VBMI2 = 0x800,
+    CROARING_AVX512BITALG = 0x1000,
+    CROARING_AVX512VPOPCNTDQ = 0x2000,
+    CROARING_UNINITIALIZED = 0x8000
+};
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+unsigned int CROARING_AVX512_REQUIRED =
+    (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW |
+     CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ);
+#endif
+
+#if defined(__x86_64__) || defined(_M_AMD64)  // x64
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+                         uint32_t *edx) {
+#if CROARING_REGULAR_VISUAL_STUDIO
+    int cpu_info[4];
+    __cpuidex(cpu_info, *eax, *ecx);
+    *eax = cpu_info[0];
+    *ebx = cpu_info[1];
+    *ecx = cpu_info[2];
+    *edx = cpu_info[3];
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+    uint32_t level = *eax;
+    __get_cpuid(level, eax, ebx, ecx, edx);
+#else
+    uint32_t a = *eax, b, c = *ecx, d;
+    __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
+    *eax = a;
+    *ebx = b;
+    *ecx = c;
+    *edx = d;
+#endif
+}
+
+static inline uint64_t xgetbv(void) {
+#if defined(_MSC_VER)
+    return _xgetbv(0);
+#else
+    uint32_t xcr0_lo, xcr0_hi;
+    __asm__("xgetbv\n\t" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
+    return xcr0_lo | ((uint64_t)xcr0_hi << 32);
+#endif
+}
+
+/**
+ * This is a relatively expensive function but it will get called at most
+ * *once* per compilation units. Normally, the CRoaring library is built
+ * as one compilation unit.
+ */
+static inline uint32_t dynamic_croaring_detect_supported_architectures(void) {
+    uint32_t eax, ebx, ecx, edx;
+    uint32_t host_isa = 0x0;
+    // Can be found on Intel ISA Reference for CPUID
+    static uint32_t cpuid_avx2_bit =
+        1 << 5;  ///< @private Bit 5 of EBX for EAX=0x7
+    static uint32_t cpuid_bmi1_bit =
+        1 << 3;  ///< @private bit 3 of EBX for EAX=0x7
+    static uint32_t cpuid_bmi2_bit =
+        1 << 8;  ///< @private bit 8 of EBX for EAX=0x7
+    static uint32_t cpuid_avx512f_bit =
+        1 << 16;  ///< @private bit 16 of EBX for EAX=0x7
+    static uint32_t cpuid_avx512dq_bit =
+        1 << 17;  ///< @private bit 17 of EBX for EAX=0x7
+    static uint32_t cpuid_avx512bw_bit =
+        1 << 30;  ///< @private bit 30 of EBX for EAX=0x7
+    static uint32_t cpuid_avx512vbmi2_bit =
+        1 << 6;  ///< @private bit 6 of ECX for EAX=0x7
+    static uint32_t cpuid_avx512bitalg_bit =
+        1 << 12;  ///< @private bit 12 of ECX for EAX=0x7
+    static uint32_t cpuid_avx512vpopcntdq_bit =
+        1 << 14;  ///< @private bit 14 of ECX for EAX=0x7
+    static uint64_t cpuid_avx256_saved = 1 << 2;  ///< @private bit 2 = AVX
+    static uint64_t cpuid_avx512_saved =
+        7 << 5;  ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
+    static uint32_t cpuid_sse42_bit =
+        1 << 20;  ///< @private bit 20 of ECX for EAX=0x1
+    static uint32_t cpuid_osxsave =
+        (1 << 26) | (1 << 27);  ///< @private bits 26+27 of ECX for EAX=0x1
+    static uint32_t cpuid_pclmulqdq_bit =
+        1 << 1;  ///< @private bit  1 of ECX for EAX=0x1
+
+    // EBX for EAX=0x1
+    eax = 0x1;
+    ecx = 0x0;
+    cpuid(&eax, &ebx, &ecx, &edx);
+
+    if (ecx & cpuid_sse42_bit) {
+        host_isa |= CROARING_SSE42;
+    } else {
+        return host_isa;  // everything after is redundant
+    }
+
+    if (ecx & cpuid_pclmulqdq_bit) {
+        host_isa |= CROARING_PCLMULQDQ;
+    }
+
+    if ((ecx & cpuid_osxsave) != cpuid_osxsave) {
+        return host_isa;
+    }
+
+    // xgetbv for checking if the OS saves registers
+    uint64_t xcr0 = xgetbv();
+
+    if ((xcr0 & cpuid_avx256_saved) == 0) {
+        return host_isa;
+    }
+
+    // ECX for EAX=0x7
+    eax = 0x7;
+    ecx = 0x0;
+    cpuid(&eax, &ebx, &ecx, &edx);
+    if (ebx & cpuid_avx2_bit) {
+        host_isa |= CROARING_AVX2;
+    }
+    if (ebx & cpuid_bmi1_bit) {
+        host_isa |= CROARING_BMI1;
+    }
+
+    if (ebx & cpuid_bmi2_bit) {
+        host_isa |= CROARING_BMI2;
+    }
+
+    if (!((xcr0 & cpuid_avx512_saved) == cpuid_avx512_saved)) {
+        return host_isa;
+    }
+
+    if (ebx & cpuid_avx512f_bit) {
+        host_isa |= CROARING_AVX512F;
+    }
+
+    if (ebx & cpuid_avx512bw_bit) {
+        host_isa |= CROARING_AVX512BW;
+    }
+
+    if (ebx & cpuid_avx512dq_bit) {
+        host_isa |= CROARING_AVX512DQ;
+    }
+
+    if (ecx & cpuid_avx512vbmi2_bit) {
+        host_isa |= CROARING_AVX512VBMI2;
+    }
+
+    if (ecx & cpuid_avx512bitalg_bit) {
+        host_isa |= CROARING_AVX512BITALG;
+    }
+
+    if (ecx & cpuid_avx512vpopcntdq_bit) {
+        host_isa |= CROARING_AVX512VPOPCNTDQ;
+    }
+
+    return host_isa;
+}
+
+#endif  // end SIMD extension detection code
+
+#if defined(__x86_64__) || defined(_M_AMD64)  // x64
+
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP
+static inline uint32_t croaring_detect_supported_architectures(void) {
+    // thread-safe as per the C++11 standard.
+    static uint32_t buffer = dynamic_croaring_detect_supported_architectures();
+    return buffer;
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+static uint32_t croaring_detect_supported_architectures(void) {
+    // we use an atomic for thread safety
+    static _Atomic uint32_t buffer = CROARING_UNINITIALIZED;
+    if (buffer == CROARING_UNINITIALIZED) {
+        // atomicity is sufficient
+        buffer = dynamic_croaring_detect_supported_architectures();
+    }
+    return buffer;
+}
+#else
+// If we do not have atomics, we do the best we can.
+static inline uint32_t croaring_detect_supported_architectures(void) {
+    static uint32_t buffer = CROARING_UNINITIALIZED;
+    if (buffer == CROARING_UNINITIALIZED) {
+        buffer = dynamic_croaring_detect_supported_architectures();
+    }
+    return buffer;
+}
+#endif  // CROARING_C_ATOMIC
+
+#ifdef ROARING_DISABLE_AVX
+
+int croaring_hardware_support(void) { return 0; }
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__) &&   \
+    defined(__AVX512BW__) && defined(__AVX512VBMI2__) && \
+    defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__)
+int croaring_hardware_support(void) {
+    return ROARING_SUPPORTS_AVX2 | ROARING_SUPPORTS_AVX512;
+}
+#elif defined(__AVX2__)
+
+int croaring_hardware_support(void) {
+    static
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+        _Atomic
+#endif
+        int support = 0xFFFFFFF;
+    if (support == 0xFFFFFFF) {
+        bool avx512_support = false;
+#if CROARING_COMPILER_SUPPORTS_AVX512
+        avx512_support =
+            ((croaring_detect_supported_architectures() &
+              CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED);
+#endif
+        support = ROARING_SUPPORTS_AVX2 |
+                  (avx512_support ? ROARING_SUPPORTS_AVX512 : 0);
+    }
+    return support;
+}
+#else
+
+int croaring_hardware_support(void) {
+    static
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+        _Atomic
+#endif
+        int support = 0xFFFFFFF;
+    if (support == 0xFFFFFFF) {
+        bool has_avx2 = (croaring_detect_supported_architectures() &
+                         CROARING_AVX2) == CROARING_AVX2;
+        bool has_avx512 = false;
+#if CROARING_COMPILER_SUPPORTS_AVX512
+        has_avx512 = (croaring_detect_supported_architectures() &
+                      CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED;
+#endif  // CROARING_COMPILER_SUPPORTS_AVX512
+        support = (has_avx2 ? ROARING_SUPPORTS_AVX2 : 0) |
+                  (has_avx512 ? ROARING_SUPPORTS_AVX512 : 0);
+    }
+    return support;
+}
+#endif
+
+#endif  // defined(__x86_64__) || defined(_M_AMD64) // x64
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/isadetection.c */
+/* begin file src/memory.c */
+#include <stdlib.h>
+
+
+// without the following, we get lots of warnings about posix_memalign
+#ifndef __cplusplus
+extern int posix_memalign(void** __memptr, size_t __alignment, size_t __size);
+#endif  //__cplusplus // C++ does not have a well defined signature
+
+// portable version of  posix_memalign
+static void* roaring_bitmap_aligned_malloc(size_t alignment, size_t size) {
+    void* p;
+#ifdef _MSC_VER
+    p = _aligned_malloc(size, alignment);
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+    p = __mingw_aligned_malloc(size, alignment);
+#else
+    // somehow, if this is used before including "x86intrin.h", it creates an
+    // implicit defined warning.
+    if (posix_memalign(&p, alignment, size) != 0) return NULL;
+#endif
+    return p;
+}
+
+static void roaring_bitmap_aligned_free(void* memblock) {
+#ifdef _MSC_VER
+    _aligned_free(memblock);
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+    __mingw_aligned_free(memblock);
+#else
+    free(memblock);
+#endif
+}
+
+static roaring_memory_t global_memory_hook = {
+    .malloc = malloc,
+    .realloc = realloc,
+    .calloc = calloc,
+    .free = free,
+    .aligned_malloc = roaring_bitmap_aligned_malloc,
+    .aligned_free = roaring_bitmap_aligned_free,
+};
+
+void roaring_init_memory_hook(roaring_memory_t memory_hook) {
+    global_memory_hook = memory_hook;
+}
+
+void* roaring_malloc(size_t n) { return global_memory_hook.malloc(n); }
+
+void* roaring_realloc(void* p, size_t new_sz) {
+    return global_memory_hook.realloc(p, new_sz);
+}
+
+void* roaring_calloc(size_t n_elements, size_t element_size) {
+    return global_memory_hook.calloc(n_elements, element_size);
+}
+
+void roaring_free(void* p) { global_memory_hook.free(p); }
+
+void* roaring_aligned_malloc(size_t alignment, size_t size) {
+    return global_memory_hook.aligned_malloc(alignment, size);
+}
+
+void roaring_aligned_free(void* p) { global_memory_hook.aligned_free(p); }
+/* end file src/memory.c */
+/* begin file src/roaring64.c */
+#include <assert.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <string.h>
+
+
+// For serialization / deserialization
+// containers.h last to avoid conflict with ROARING_CONTAINER_T.
+
+#ifdef __cplusplus
+using namespace ::roaring::internal;
+
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+// TODO: Copy on write.
+// TODO: Error on failed allocation.
+
+typedef struct roaring64_bitmap_s {
+    art_t art;
+    uint8_t flags;
+} roaring64_bitmap_t;
+
+// Leaf type of the ART used to keep the high 48 bits of each entry.
+typedef struct roaring64_leaf_s {
+    art_val_t _pad;
+    uint8_t typecode;
+    container_t *container;
+} roaring64_leaf_t;
+
+// Alias to make it easier to work with, since it's an internal-only type
+// anyway.
+typedef struct roaring64_leaf_s leaf_t;
+
+// Iterator struct to hold iteration state.
+typedef struct roaring64_iterator_s {
+    const roaring64_bitmap_t *parent;
+    art_iterator_t art_it;
+    roaring_container_iterator_t container_it;
+    uint64_t high48;  // Key that art_it points to.
+
+    uint64_t value;
+    bool has_value;
+
+    // If has_value is false, then the iterator is saturated. This field
+    // indicates the direction of saturation. If true, there are no more values
+    // in the forward direction. If false, there are no more values in the
+    // backward direction.
+    bool saturated_forward;
+} roaring64_iterator_t;
+
+// Splits the given uint64 key into high 48 bit and low 16 bit components.
+// Expects high48_out to be of length ART_KEY_BYTES.
+static inline uint16_t split_key(uint64_t key, uint8_t high48_out[]) {
+    uint64_t tmp = croaring_htobe64(key);
+    memcpy(high48_out, (uint8_t *)(&tmp), ART_KEY_BYTES);
+    return (uint16_t)key;
+}
+
+// Recombines the high 48 bit and low 16 bit components into a uint64 key.
+// Expects high48_out to be of length ART_KEY_BYTES.
+static inline uint64_t combine_key(const uint8_t high48[], uint16_t low16) {
+    uint64_t result = 0;
+    memcpy((uint8_t *)(&result), high48, ART_KEY_BYTES);
+    return croaring_be64toh(result) | low16;
+}
+
+static inline uint64_t minimum(uint64_t a, uint64_t b) {
+    return (a < b) ? a : b;
+}
+
+static inline leaf_t *create_leaf(container_t *container, uint8_t typecode) {
+    leaf_t *leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+    leaf->container = container;
+    leaf->typecode = typecode;
+    return leaf;
+}
+
+static inline leaf_t *copy_leaf_container(const leaf_t *leaf) {
+    leaf_t *result_leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+    result_leaf->typecode = leaf->typecode;
+    // get_copy_of_container modifies the typecode passed in.
+    result_leaf->container = get_copy_of_container(
+        leaf->container, &result_leaf->typecode, /*copy_on_write=*/false);
+    return result_leaf;
+}
+
+static inline void free_leaf(leaf_t *leaf) { roaring_free(leaf); }
+
+static inline int compare_high48(art_key_chunk_t key1[],
+                                 art_key_chunk_t key2[]) {
+    return art_compare_keys(key1, key2);
+}
+
+static inline bool roaring64_iterator_init_at_leaf_first(
+    roaring64_iterator_t *it) {
+    it->high48 = combine_key(it->art_it.key, 0);
+    leaf_t *leaf = (leaf_t *)it->art_it.value;
+    uint16_t low16 = 0;
+    it->container_it =
+        container_init_iterator(leaf->container, leaf->typecode, &low16);
+    it->value = it->high48 | low16;
+    return (it->has_value = true);
+}
+
+static inline bool roaring64_iterator_init_at_leaf_last(
+    roaring64_iterator_t *it) {
+    it->high48 = combine_key(it->art_it.key, 0);
+    leaf_t *leaf = (leaf_t *)it->art_it.value;
+    uint16_t low16 = 0;
+    it->container_it =
+        container_init_iterator_last(leaf->container, leaf->typecode, &low16);
+    it->value = it->high48 | low16;
+    return (it->has_value = true);
+}
+
+static inline roaring64_iterator_t *roaring64_iterator_init_at(
+    const roaring64_bitmap_t *r, roaring64_iterator_t *it, bool first) {
+    it->parent = r;
+    it->art_it = art_init_iterator(&r->art, first);
+    it->has_value = it->art_it.value != NULL;
+    if (it->has_value) {
+        if (first) {
+            roaring64_iterator_init_at_leaf_first(it);
+        } else {
+            roaring64_iterator_init_at_leaf_last(it);
+        }
+    } else {
+        it->saturated_forward = first;
+    }
+    return it;
+}
+
+roaring64_bitmap_t *roaring64_bitmap_create(void) {
+    roaring64_bitmap_t *r =
+        (roaring64_bitmap_t *)roaring_malloc(sizeof(roaring64_bitmap_t));
+    r->art.root = NULL;
+    r->flags = 0;
+    return r;
+}
+
+void roaring64_bitmap_free(roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        container_free(leaf->container, leaf->typecode);
+        free_leaf(leaf);
+        art_iterator_next(&it);
+    }
+    art_free(&r->art);
+    roaring_free(r);
+}
+
+roaring64_bitmap_t *roaring64_bitmap_copy(const roaring64_bitmap_t *r) {
+    roaring64_bitmap_t *result = roaring64_bitmap_create();
+
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        uint8_t result_typecode = leaf->typecode;
+        container_t *result_container = get_copy_of_container(
+            leaf->container, &result_typecode, /*copy_on_write=*/false);
+        leaf_t *result_leaf = create_leaf(result_container, result_typecode);
+        art_insert(&result->art, it.key, (art_val_t *)result_leaf);
+        art_iterator_next(&it);
+    }
+    return result;
+}
+
+roaring64_bitmap_t *roaring64_bitmap_from_range(uint64_t min, uint64_t max,
+                                                uint64_t step) {
+    if (step == 0 || max <= min) {
+        return NULL;
+    }
+    roaring64_bitmap_t *r = roaring64_bitmap_create();
+    if (step >= (1 << 16)) {
+        // Only one value per container.
+        for (uint64_t value = min; value < max; value += step) {
+            roaring64_bitmap_add(r, value);
+            if (value > UINT64_MAX - step) {
+                break;
+            }
+        }
+        return r;
+    }
+    do {
+        uint64_t high_bits = min & 0xFFFFFFFFFFFF0000;
+        uint16_t container_min = min & 0xFFFF;
+        uint32_t container_max = (uint32_t)minimum(max - high_bits, 1 << 16);
+
+        uint8_t typecode;
+        container_t *container = container_from_range(
+            &typecode, container_min, container_max, (uint16_t)step);
+
+        uint8_t high48[ART_KEY_BYTES];
+        split_key(min, high48);
+        leaf_t *leaf = create_leaf(container, typecode);
+        art_insert(&r->art, high48, (art_val_t *)leaf);
+
+        uint64_t gap = container_max - container_min + step - 1;
+        uint64_t increment = gap - (gap % step);
+        if (min > UINT64_MAX - increment) {
+            break;
+        }
+        min += increment;
+    } while (min < max);
+    return r;
+}
+
+roaring64_bitmap_t *roaring64_bitmap_of_ptr(size_t n_args,
+                                            const uint64_t *vals) {
+    roaring64_bitmap_t *r = roaring64_bitmap_create();
+    roaring64_bitmap_add_many(r, n_args, vals);
+    return r;
+}
+
+roaring64_bitmap_t *roaring64_bitmap_of(size_t n_args, ...) {
+    roaring64_bitmap_t *r = roaring64_bitmap_create();
+    roaring64_bulk_context_t context = {0};
+    va_list ap;
+    va_start(ap, n_args);
+    for (size_t i = 0; i < n_args; i++) {
+        uint64_t val = va_arg(ap, uint64_t);
+        roaring64_bitmap_add_bulk(r, &context, val);
+    }
+    va_end(ap);
+    return r;
+}
+
+static inline leaf_t *containerptr_roaring64_bitmap_add(roaring64_bitmap_t *r,
+                                                        uint8_t *high48,
+                                                        uint16_t low16,
+                                                        leaf_t *leaf) {
+    if (leaf != NULL) {
+        uint8_t typecode2;
+        container_t *container2 =
+            container_add(leaf->container, low16, leaf->typecode, &typecode2);
+        if (container2 != leaf->container) {
+            container_free(leaf->container, leaf->typecode);
+            leaf->container = container2;
+            leaf->typecode = typecode2;
+        }
+        return leaf;
+    } else {
+        array_container_t *ac = array_container_create();
+        uint8_t typecode;
+        container_t *container =
+            container_add(ac, low16, ARRAY_CONTAINER_TYPE, &typecode);
+        assert(ac == container);
+        leaf = create_leaf(container, typecode);
+        art_insert(&r->art, high48, (art_val_t *)leaf);
+        return leaf;
+    }
+}
+
+void roaring64_bitmap_add(roaring64_bitmap_t *r, uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+    containerptr_roaring64_bitmap_add(r, high48, low16, leaf);
+}
+
+bool roaring64_bitmap_add_checked(roaring64_bitmap_t *r, uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+
+    int old_cardinality = 0;
+    if (leaf != NULL) {
+        old_cardinality =
+            container_get_cardinality(leaf->container, leaf->typecode);
+    }
+    leaf = containerptr_roaring64_bitmap_add(r, high48, low16, leaf);
+    int new_cardinality =
+        container_get_cardinality(leaf->container, leaf->typecode);
+    return old_cardinality != new_cardinality;
+}
+
+void roaring64_bitmap_add_bulk(roaring64_bitmap_t *r,
+                               roaring64_bulk_context_t *context,
+                               uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    if (context->leaf != NULL &&
+        compare_high48(context->high_bytes, high48) == 0) {
+        // We're at a container with the correct high bits.
+        uint8_t typecode2;
+        container_t *container2 =
+            container_add(context->leaf->container, low16,
+                          context->leaf->typecode, &typecode2);
+        if (container2 != context->leaf->container) {
+            container_free(context->leaf->container, context->leaf->typecode);
+            context->leaf->container = container2;
+            context->leaf->typecode = typecode2;
+        }
+    } else {
+        // We're not positioned anywhere yet or the high bits of the key
+        // differ.
+        leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+        context->leaf =
+            containerptr_roaring64_bitmap_add(r, high48, low16, leaf);
+        memcpy(context->high_bytes, high48, ART_KEY_BYTES);
+    }
+}
+
+void roaring64_bitmap_add_many(roaring64_bitmap_t *r, size_t n_args,
+                               const uint64_t *vals) {
+    if (n_args == 0) {
+        return;
+    }
+    const uint64_t *end = vals + n_args;
+    roaring64_bulk_context_t context = {0};
+    for (const uint64_t *current_val = vals; current_val != end;
+         current_val++) {
+        roaring64_bitmap_add_bulk(r, &context, *current_val);
+    }
+}
+
+static inline void add_range_closed_at(art_t *art, uint8_t *high48,
+                                       uint16_t min, uint16_t max) {
+    leaf_t *leaf = (leaf_t *)art_find(art, high48);
+    if (leaf != NULL) {
+        uint8_t typecode2;
+        container_t *container2 = container_add_range(
+            leaf->container, leaf->typecode, min, max, &typecode2);
+        if (container2 != leaf->container) {
+            container_free(leaf->container, leaf->typecode);
+            leaf->container = container2;
+            leaf->typecode = typecode2;
+        }
+        return;
+    }
+    uint8_t typecode;
+    // container_add_range is inclusive, but `container_range_of_ones` is
+    // exclusive.
+    container_t *container = container_range_of_ones(min, max + 1, &typecode);
+    leaf = create_leaf(container, typecode);
+    art_insert(art, high48, (art_val_t *)leaf);
+}
+
+void roaring64_bitmap_add_range(roaring64_bitmap_t *r, uint64_t min,
+                                uint64_t max) {
+    if (min >= max) {
+        return;
+    }
+    roaring64_bitmap_add_range_closed(r, min, max - 1);
+}
+
+void roaring64_bitmap_add_range_closed(roaring64_bitmap_t *r, uint64_t min,
+                                       uint64_t max) {
+    if (min > max) {
+        return;
+    }
+
+    art_t *art = &r->art;
+    uint8_t min_high48[ART_KEY_BYTES];
+    uint16_t min_low16 = split_key(min, min_high48);
+    uint8_t max_high48[ART_KEY_BYTES];
+    uint16_t max_low16 = split_key(max, max_high48);
+    if (compare_high48(min_high48, max_high48) == 0) {
+        // Only populate range within one container.
+        add_range_closed_at(art, min_high48, min_low16, max_low16);
+        return;
+    }
+
+    // Populate a range across containers. Fill intermediate containers
+    // entirely.
+    add_range_closed_at(art, min_high48, min_low16, 0xffff);
+    uint64_t min_high_bits = min >> 16;
+    uint64_t max_high_bits = max >> 16;
+    for (uint64_t current = min_high_bits + 1; current < max_high_bits;
+         ++current) {
+        uint8_t current_high48[ART_KEY_BYTES];
+        split_key(current << 16, current_high48);
+        add_range_closed_at(art, current_high48, 0, 0xffff);
+    }
+    add_range_closed_at(art, max_high48, 0, max_low16);
+}
+
+bool roaring64_bitmap_contains(const roaring64_bitmap_t *r, uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+    if (leaf != NULL) {
+        return container_contains(leaf->container, low16, leaf->typecode);
+    }
+    return false;
+}
+
+bool roaring64_bitmap_contains_range(const roaring64_bitmap_t *r, uint64_t min,
+                                     uint64_t max) {
+    if (min >= max) {
+        return true;
+    }
+
+    uint8_t min_high48[ART_KEY_BYTES];
+    uint16_t min_low16 = split_key(min, min_high48);
+    uint8_t max_high48[ART_KEY_BYTES];
+    uint16_t max_low16 = split_key(max, max_high48);
+    uint64_t max_high48_bits = (max - 1) & 0xFFFFFFFFFFFF0000;  // Inclusive
+
+    art_iterator_t it = art_lower_bound(&r->art, min_high48);
+    if (it.value == NULL || combine_key(it.key, 0) > min) {
+        return false;
+    }
+    uint64_t prev_high48_bits = min & 0xFFFFFFFFFFFF0000;
+    while (it.value != NULL) {
+        uint64_t current_high48_bits = combine_key(it.key, 0);
+        if (current_high48_bits > max_high48_bits) {
+            // We've passed the end of the range with all containers containing
+            // the range.
+            return true;
+        }
+        if (current_high48_bits - prev_high48_bits > 0x10000) {
+            // There is a gap in the iterator that falls in the range.
+            return false;
+        }
+
+        leaf_t *leaf = (leaf_t *)it.value;
+        uint32_t container_min = 0;
+        if (compare_high48(it.key, min_high48) == 0) {
+            container_min = min_low16;
+        }
+        uint32_t container_max = 0xFFFF + 1;  // Exclusive
+        if (compare_high48(it.key, max_high48) == 0) {
+            container_max = max_low16;
+        }
+
+        // For the first and last containers we use container_contains_range,
+        // for the intermediate containers we can use container_is_full.
+        if (container_min == 0 && container_max == 0xFFFF + 1) {
+            if (!container_is_full(leaf->container, leaf->typecode)) {
+                return false;
+            }
+        } else if (!container_contains_range(leaf->container, container_min,
+                                             container_max, leaf->typecode)) {
+            return false;
+        }
+        prev_high48_bits = current_high48_bits;
+        art_iterator_next(&it);
+    }
+    return prev_high48_bits == max_high48_bits;
+}
+
+bool roaring64_bitmap_contains_bulk(const roaring64_bitmap_t *r,
+                                    roaring64_bulk_context_t *context,
+                                    uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+
+    if (context->leaf == NULL || context->high_bytes != high48) {
+        // We're not positioned anywhere yet or the high bits of the key
+        // differ.
+        leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+        if (leaf == NULL) {
+            return false;
+        }
+        context->leaf = leaf;
+        memcpy(context->high_bytes, high48, ART_KEY_BYTES);
+    }
+    return container_contains(context->leaf->container, low16,
+                              context->leaf->typecode);
+}
+
+bool roaring64_bitmap_select(const roaring64_bitmap_t *r, uint64_t rank,
+                             uint64_t *element) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint64_t start_rank = 0;
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        uint64_t cardinality =
+            container_get_cardinality(leaf->container, leaf->typecode);
+        if (start_rank + cardinality > rank) {
+            uint32_t uint32_start = 0;
+            uint32_t uint32_rank = rank - start_rank;
+            uint32_t uint32_element = 0;
+            if (container_select(leaf->container, leaf->typecode, &uint32_start,
+                                 uint32_rank, &uint32_element)) {
+                *element = combine_key(it.key, (uint16_t)uint32_element);
+                return true;
+            }
+            return false;
+        }
+        start_rank += cardinality;
+        art_iterator_next(&it);
+    }
+    return false;
+}
+
+uint64_t roaring64_bitmap_rank(const roaring64_bitmap_t *r, uint64_t val) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint64_t rank = 0;
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        int compare_result = compare_high48(it.key, high48);
+        if (compare_result < 0) {
+            rank += container_get_cardinality(leaf->container, leaf->typecode);
+        } else if (compare_result == 0) {
+            return rank +
+                   container_rank(leaf->container, leaf->typecode, low16);
+        } else {
+            return rank;
+        }
+        art_iterator_next(&it);
+    }
+    return rank;
+}
+
+bool roaring64_bitmap_get_index(const roaring64_bitmap_t *r, uint64_t val,
+                                uint64_t *out_index) {
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint64_t index = 0;
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        int compare_result = compare_high48(it.key, high48);
+        if (compare_result < 0) {
+            index += container_get_cardinality(leaf->container, leaf->typecode);
+        } else if (compare_result == 0) {
+            int index16 =
+                container_get_index(leaf->container, leaf->typecode, low16);
+            if (index16 < 0) {
+                return false;
+            }
+            *out_index = index + index16;
+            return true;
+        } else {
+            return false;
+        }
+        art_iterator_next(&it);
+    }
+    return false;
+}
+
+static inline leaf_t *containerptr_roaring64_bitmap_remove(
+    roaring64_bitmap_t *r, uint8_t *high48, uint16_t low16, leaf_t *leaf) {
+    if (leaf == NULL) {
+        return NULL;
+    }
+
+    container_t *container = leaf->container;
+    uint8_t typecode = leaf->typecode;
+    uint8_t typecode2;
+    container_t *container2 =
+        container_remove(container, low16, typecode, &typecode2);
+    if (container2 != container) {
+        container_free(container, typecode);
+        leaf->container = container2;
+        leaf->typecode = typecode2;
+    }
+    if (!container_nonzero_cardinality(container2, typecode2)) {
+        container_free(container2, typecode2);
+        leaf = (leaf_t *)art_erase(&r->art, high48);
+        if (leaf != NULL) {
+            free_leaf(leaf);
+        }
+        return NULL;
+    }
+    return leaf;
+}
+
+void roaring64_bitmap_remove(roaring64_bitmap_t *r, uint64_t val) {
+    art_t *art = &r->art;
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+
+    leaf_t *leaf = (leaf_t *)art_find(art, high48);
+    containerptr_roaring64_bitmap_remove(r, high48, low16, leaf);
+}
+
+bool roaring64_bitmap_remove_checked(roaring64_bitmap_t *r, uint64_t val) {
+    art_t *art = &r->art;
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    leaf_t *leaf = (leaf_t *)art_find(art, high48);
+
+    if (leaf == NULL) {
+        return false;
+    }
+    int old_cardinality =
+        container_get_cardinality(leaf->container, leaf->typecode);
+    leaf = containerptr_roaring64_bitmap_remove(r, high48, low16, leaf);
+    if (leaf == NULL) {
+        return true;
+    }
+    int new_cardinality =
+        container_get_cardinality(leaf->container, leaf->typecode);
+    return new_cardinality != old_cardinality;
+}
+
+void roaring64_bitmap_remove_bulk(roaring64_bitmap_t *r,
+                                  roaring64_bulk_context_t *context,
+                                  uint64_t val) {
+    art_t *art = &r->art;
+    uint8_t high48[ART_KEY_BYTES];
+    uint16_t low16 = split_key(val, high48);
+    if (context->leaf != NULL &&
+        compare_high48(context->high_bytes, high48) == 0) {
+        // We're at a container with the correct high bits.
+        uint8_t typecode2;
+        container_t *container2 =
+            container_remove(context->leaf->container, low16,
+                             context->leaf->typecode, &typecode2);
+        if (container2 != context->leaf->container) {
+            container_free(context->leaf->container, context->leaf->typecode);
+            context->leaf->container = container2;
+            context->leaf->typecode = typecode2;
+        }
+        if (!container_nonzero_cardinality(container2, typecode2)) {
+            leaf_t *leaf = (leaf_t *)art_erase(art, high48);
+            container_free(container2, typecode2);
+            free_leaf(leaf);
+        }
+    } else {
+        // We're not positioned anywhere yet or the high bits of the key
+        // differ.
+        leaf_t *leaf = (leaf_t *)art_find(art, high48);
+        context->leaf =
+            containerptr_roaring64_bitmap_remove(r, high48, low16, leaf);
+        memcpy(context->high_bytes, high48, ART_KEY_BYTES);
+    }
+}
+
+void roaring64_bitmap_remove_many(roaring64_bitmap_t *r, size_t n_args,
+                                  const uint64_t *vals) {
+    if (n_args == 0) {
+        return;
+    }
+    const uint64_t *end = vals + n_args;
+    roaring64_bulk_context_t context = {0};
+    for (const uint64_t *current_val = vals; current_val != end;
+         current_val++) {
+        roaring64_bitmap_remove_bulk(r, &context, *current_val);
+    }
+}
+
+static inline void remove_range_closed_at(art_t *art, uint8_t *high48,
+                                          uint16_t min, uint16_t max) {
+    leaf_t *leaf = (leaf_t *)art_find(art, high48);
+    if (leaf == NULL) {
+        return;
+    }
+    uint8_t typecode2;
+    container_t *container2 = container_remove_range(
+        leaf->container, leaf->typecode, min, max, &typecode2);
+    if (container2 != leaf->container) {
+        container_free(leaf->container, leaf->typecode);
+        if (container2 != NULL) {
+            leaf->container = container2;
+            leaf->typecode = typecode2;
+        } else {
+            art_erase(art, high48);
+            free_leaf(leaf);
+        }
+    }
+}
+
+void roaring64_bitmap_remove_range(roaring64_bitmap_t *r, uint64_t min,
+                                   uint64_t max) {
+    if (min >= max) {
+        return;
+    }
+    roaring64_bitmap_remove_range_closed(r, min, max - 1);
+}
+
+void roaring64_bitmap_remove_range_closed(roaring64_bitmap_t *r, uint64_t min,
+                                          uint64_t max) {
+    if (min > max) {
+        return;
+    }
+
+    art_t *art = &r->art;
+    uint8_t min_high48[ART_KEY_BYTES];
+    uint16_t min_low16 = split_key(min, min_high48);
+    uint8_t max_high48[ART_KEY_BYTES];
+    uint16_t max_low16 = split_key(max, max_high48);
+    if (compare_high48(min_high48, max_high48) == 0) {
+        // Only remove a range within one container.
+        remove_range_closed_at(art, min_high48, min_low16, max_low16);
+        return;
+    }
+
+    // Remove a range across containers. Remove intermediate containers
+    // entirely.
+    remove_range_closed_at(art, min_high48, min_low16, 0xffff);
+
+    art_iterator_t it = art_upper_bound(art, min_high48);
+    while (it.value != NULL && art_compare_keys(it.key, max_high48) < 0) {
+        leaf_t *leaf = (leaf_t *)art_iterator_erase(art, &it);
+        container_free(leaf->container, leaf->typecode);
+        free_leaf(leaf);
+    }
+    remove_range_closed_at(art, max_high48, 0, max_low16);
+}
+
+uint64_t roaring64_bitmap_get_cardinality(const roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint64_t cardinality = 0;
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        cardinality +=
+            container_get_cardinality(leaf->container, leaf->typecode);
+        art_iterator_next(&it);
+    }
+    return cardinality;
+}
+
+uint64_t roaring64_bitmap_range_cardinality(const roaring64_bitmap_t *r,
+                                            uint64_t min, uint64_t max) {
+    if (min >= max) {
+        return 0;
+    }
+    max--;  // A closed range is easier to work with.
+
+    uint64_t cardinality = 0;
+    uint8_t min_high48[ART_KEY_BYTES];
+    uint16_t min_low16 = split_key(min, min_high48);
+    uint8_t max_high48[ART_KEY_BYTES];
+    uint16_t max_low16 = split_key(max, max_high48);
+
+    art_iterator_t it = art_lower_bound(&r->art, min_high48);
+    while (it.value != NULL) {
+        int max_compare_result = compare_high48(it.key, max_high48);
+        if (max_compare_result > 0) {
+            // We're outside the range.
+            break;
+        }
+
+        leaf_t *leaf = (leaf_t *)it.value;
+        if (max_compare_result == 0) {
+            // We're at the max high key, add only the range up to the low
+            // 16 bits of max.
+            cardinality +=
+                container_rank(leaf->container, leaf->typecode, max_low16);
+        } else {
+            // We're not yet at the max high key, add the full container
+            // range.
+            cardinality +=
+                container_get_cardinality(leaf->container, leaf->typecode);
+        }
+        if (compare_high48(it.key, min_high48) == 0 && min_low16 > 0) {
+            // We're at the min high key, remove the range up to the low 16
+            // bits of min.
+            cardinality -=
+                container_rank(leaf->container, leaf->typecode, min_low16 - 1);
+        }
+        art_iterator_next(&it);
+    }
+    return cardinality;
+}
+
+bool roaring64_bitmap_is_empty(const roaring64_bitmap_t *r) {
+    return art_is_empty(&r->art);
+}
+
+uint64_t roaring64_bitmap_minimum(const roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    if (it.value == NULL) {
+        return UINT64_MAX;
+    }
+    leaf_t *leaf = (leaf_t *)it.value;
+    return combine_key(it.key,
+                       container_minimum(leaf->container, leaf->typecode));
+}
+
+uint64_t roaring64_bitmap_maximum(const roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/false);
+    if (it.value == NULL) {
+        return 0;
+    }
+    leaf_t *leaf = (leaf_t *)it.value;
+    return combine_key(it.key,
+                       container_maximum(leaf->container, leaf->typecode));
+}
+
+bool roaring64_bitmap_run_optimize(roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    bool has_run_container = false;
+    while (it.value != NULL) {
+        leaf_t *leaf = (leaf_t *)it.value;
+        uint8_t new_typecode;
+        // We don't need to free the existing container if a new one was
+        // created, convert_run_optimize does that internally.
+        leaf->container = convert_run_optimize(leaf->container, leaf->typecode,
+                                               &new_typecode);
+        leaf->typecode = new_typecode;
+        has_run_container |= new_typecode == RUN_CONTAINER_TYPE;
+        art_iterator_next(&it);
+    }
+    return has_run_container;
+}
+
+static bool roaring64_leaf_internal_validate(const art_val_t *val,
+                                             const char **reason) {
+    leaf_t *leaf = (leaf_t *)val;
+    return container_internal_validate(leaf->container, leaf->typecode, reason);
+}
+
+bool roaring64_bitmap_internal_validate(const roaring64_bitmap_t *r,
+                                        const char **reason) {
+    return art_internal_validate(&r->art, reason,
+                                 roaring64_leaf_internal_validate);
+}
+
+bool roaring64_bitmap_equals(const roaring64_bitmap_t *r1,
+                             const roaring64_bitmap_t *r2) {
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL && it2.value != NULL) {
+        if (compare_high48(it1.key, it2.key) != 0) {
+            return false;
+        }
+        leaf_t *leaf1 = (leaf_t *)it1.value;
+        leaf_t *leaf2 = (leaf_t *)it2.value;
+        if (!container_equals(leaf1->container, leaf1->typecode,
+                              leaf2->container, leaf2->typecode)) {
+            return false;
+        }
+        art_iterator_next(&it1);
+        art_iterator_next(&it2);
+    }
+    return it1.value == NULL && it2.value == NULL;
+}
+
+bool roaring64_bitmap_is_subset(const roaring64_bitmap_t *r1,
+                                const roaring64_bitmap_t *r2) {
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL) {
+        bool it2_present = it2.value != NULL;
+
+        int compare_result = 0;
+        if (it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                if (!container_is_subset(leaf1->container, leaf1->typecode,
+                                         leaf2->container, leaf2->typecode)) {
+                    return false;
+                }
+                art_iterator_next(&it1);
+                art_iterator_next(&it2);
+            }
+        }
+        if (!it2_present || compare_result < 0) {
+            return false;
+        } else if (compare_result > 0) {
+            art_iterator_lower_bound(&it2, it1.key);
+        }
+    }
+    return true;
+}
+
+bool roaring64_bitmap_is_strict_subset(const roaring64_bitmap_t *r1,
+                                       const roaring64_bitmap_t *r2) {
+    return roaring64_bitmap_get_cardinality(r1) <
+               roaring64_bitmap_get_cardinality(r2) &&
+           roaring64_bitmap_is_subset(r1, r2);
+}
+
+roaring64_bitmap_t *roaring64_bitmap_and(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2) {
+    roaring64_bitmap_t *result = roaring64_bitmap_create();
+
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL && it2.value != NULL) {
+        // Cases:
+        // 1. it1 <  it2 -> it1++
+        // 2. it1 == it1 -> output it1 & it2, it1++, it2++
+        // 3. it1 >  it2 -> it2++
+        int compare_result = compare_high48(it1.key, it2.key);
+        if (compare_result == 0) {
+            // Case 2: iterators at the same high key position.
+            leaf_t *result_leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+            leaf_t *leaf1 = (leaf_t *)it1.value;
+            leaf_t *leaf2 = (leaf_t *)it2.value;
+            result_leaf->container = container_and(
+                leaf1->container, leaf1->typecode, leaf2->container,
+                leaf2->typecode, &result_leaf->typecode);
+
+            if (container_nonzero_cardinality(result_leaf->container,
+                                              result_leaf->typecode)) {
+                art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+            } else {
+                container_free(result_leaf->container, result_leaf->typecode);
+                free_leaf(result_leaf);
+            }
+            art_iterator_next(&it1);
+            art_iterator_next(&it2);
+        } else if (compare_result < 0) {
+            // Case 1: it1 is before it2.
+            art_iterator_lower_bound(&it1, it2.key);
+        } else {
+            // Case 3: it2 is before it1.
+            art_iterator_lower_bound(&it2, it1.key);
+        }
+    }
+    return result;
+}
+
+uint64_t roaring64_bitmap_and_cardinality(const roaring64_bitmap_t *r1,
+                                          const roaring64_bitmap_t *r2) {
+    uint64_t result = 0;
+
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL && it2.value != NULL) {
+        // Cases:
+        // 1. it1 <  it2 -> it1++
+        // 2. it1 == it1 -> output cardinaltiy it1 & it2, it1++, it2++
+        // 3. it1 >  it2 -> it2++
+        int compare_result = compare_high48(it1.key, it2.key);
+        if (compare_result == 0) {
+            // Case 2: iterators at the same high key position.
+            leaf_t *leaf1 = (leaf_t *)it1.value;
+            leaf_t *leaf2 = (leaf_t *)it2.value;
+            result +=
+                container_and_cardinality(leaf1->container, leaf1->typecode,
+                                          leaf2->container, leaf2->typecode);
+            art_iterator_next(&it1);
+            art_iterator_next(&it2);
+        } else if (compare_result < 0) {
+            // Case 1: it1 is before it2.
+            art_iterator_lower_bound(&it1, it2.key);
+        } else {
+            // Case 3: it2 is before it1.
+            art_iterator_lower_bound(&it2, it1.key);
+        }
+    }
+    return result;
+}
+
+// Inplace and (modifies its first argument).
+void roaring64_bitmap_and_inplace(roaring64_bitmap_t *r1,
+                                  const roaring64_bitmap_t *r2) {
+    if (r1 == r2) {
+        return;
+    }
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL) {
+        // Cases:
+        // 1. !it2_present -> erase it1
+        // 2. it2_present
+        //    a. it1 <  it2 -> erase it1
+        //    b. it1 == it2 -> output it1 & it2, it1++, it2++
+        //    c. it1 >  it2 -> it2++
+        bool it2_present = it2.value != NULL;
+        int compare_result = 0;
+        if (it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 2a: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+
+                // We do the computation "in place" only when c1 is not a
+                // shared container. Rationale: using a shared container
+                // safely with in place computation would require making a
+                // copy and then doing the computation in place which is
+                // likely less efficient than avoiding in place entirely and
+                // always generating a new container.
+                uint8_t typecode2;
+                container_t *container2;
+                if (leaf1->typecode == SHARED_CONTAINER_TYPE) {
+                    container2 = container_and(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                } else {
+                    container2 = container_iand(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                }
+
+                if (container2 != leaf1->container) {
+                    container_free(leaf1->container, leaf1->typecode);
+                    leaf1->container = container2;
+                    leaf1->typecode = typecode2;
+                }
+                if (!container_nonzero_cardinality(container2, typecode2)) {
+                    container_free(container2, typecode2);
+                    art_iterator_erase(&r1->art, &it1);
+                    free_leaf(leaf1);
+                } else {
+                    // Only advance the iterator if we didn't delete the
+                    // leaf, as erasing advances by itself.
+                    art_iterator_next(&it1);
+                }
+                art_iterator_next(&it2);
+            }
+        }
+
+        if (!it2_present || compare_result < 0) {
+            // Cases 1 and 3a: it1 is the only iterator or is before it2.
+            leaf_t *leaf = (leaf_t *)art_iterator_erase(&r1->art, &it1);
+            assert(leaf != NULL);
+            container_free(leaf->container, leaf->typecode);
+            free_leaf(leaf);
+        } else if (compare_result > 0) {
+            // Case 2c: it1 is after it2.
+            art_iterator_lower_bound(&it2, it1.key);
+        }
+    }
+}
+
+bool roaring64_bitmap_intersect(const roaring64_bitmap_t *r1,
+                                const roaring64_bitmap_t *r2) {
+    bool intersect = false;
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL && it2.value != NULL) {
+        // Cases:
+        // 1. it1 <  it2 -> it1++
+        // 2. it1 == it1 -> intersect |= it1 & it2, it1++, it2++
+        // 3. it1 >  it2 -> it2++
+        int compare_result = compare_high48(it1.key, it2.key);
+        if (compare_result == 0) {
+            // Case 2: iterators at the same high key position.
+            leaf_t *leaf1 = (leaf_t *)it1.value;
+            leaf_t *leaf2 = (leaf_t *)it2.value;
+            intersect |= container_intersect(leaf1->container, leaf1->typecode,
+                                             leaf2->container, leaf2->typecode);
+            art_iterator_next(&it1);
+            art_iterator_next(&it2);
+        } else if (compare_result < 0) {
+            // Case 1: it1 is before it2.
+            art_iterator_lower_bound(&it1, it2.key);
+        } else {
+            // Case 3: it2 is before it1.
+            art_iterator_lower_bound(&it2, it1.key);
+        }
+    }
+    return intersect;
+}
+
+bool roaring64_bitmap_intersect_with_range(const roaring64_bitmap_t *r,
+                                           uint64_t min, uint64_t max) {
+    if (min >= max) {
+        return false;
+    }
+    roaring64_iterator_t it;
+    roaring64_iterator_init_at(r, &it, /*first=*/true);
+    if (!roaring64_iterator_move_equalorlarger(&it, min)) {
+        return false;
+    }
+    return roaring64_iterator_has_value(&it) &&
+           roaring64_iterator_value(&it) < max;
+}
+
+double roaring64_bitmap_jaccard_index(const roaring64_bitmap_t *r1,
+                                      const roaring64_bitmap_t *r2) {
+    uint64_t c1 = roaring64_bitmap_get_cardinality(r1);
+    uint64_t c2 = roaring64_bitmap_get_cardinality(r2);
+    uint64_t inter = roaring64_bitmap_and_cardinality(r1, r2);
+    return (double)inter / (double)(c1 + c2 - inter);
+}
+
+roaring64_bitmap_t *roaring64_bitmap_or(const roaring64_bitmap_t *r1,
+                                        const roaring64_bitmap_t *r2) {
+    roaring64_bitmap_t *result = roaring64_bitmap_create();
+
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL || it2.value != NULL) {
+        bool it1_present = it1.value != NULL;
+        bool it2_present = it2.value != NULL;
+
+        // Cases:
+        // 1. it1_present  && !it2_present -> output it1, it1++
+        // 2. !it1_present && it2_present  -> output it2, it2++
+        // 3. it1_present  && it2_present
+        //    a. it1 <  it2 -> output it1, it1++
+        //    b. it1 == it2 -> output it1 | it2, it1++, it2++
+        //    c. it1 >  it2 -> output it2, it2++
+        int compare_result = 0;
+        if (it1_present && it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 3b: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                leaf_t *result_leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+                result_leaf->container = container_or(
+                    leaf1->container, leaf1->typecode, leaf2->container,
+                    leaf2->typecode, &result_leaf->typecode);
+                art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+                art_iterator_next(&it1);
+                art_iterator_next(&it2);
+            }
+        }
+        if ((it1_present && !it2_present) || compare_result < 0) {
+            // Cases 1 and 3a: it1 is the only iterator or is before it2.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it1.value);
+            art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+            art_iterator_next(&it1);
+        } else if ((!it1_present && it2_present) || compare_result > 0) {
+            // Cases 2 and 3c: it2 is the only iterator or is before it1.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it2.value);
+            art_insert(&result->art, it2.key, (art_val_t *)result_leaf);
+            art_iterator_next(&it2);
+        }
+    }
+    return result;
+}
+
+uint64_t roaring64_bitmap_or_cardinality(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2) {
+    uint64_t c1 = roaring64_bitmap_get_cardinality(r1);
+    uint64_t c2 = roaring64_bitmap_get_cardinality(r2);
+    uint64_t inter = roaring64_bitmap_and_cardinality(r1, r2);
+    return c1 + c2 - inter;
+}
+
+void roaring64_bitmap_or_inplace(roaring64_bitmap_t *r1,
+                                 const roaring64_bitmap_t *r2) {
+    if (r1 == r2) {
+        return;
+    }
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL || it2.value != NULL) {
+        bool it1_present = it1.value != NULL;
+        bool it2_present = it2.value != NULL;
+
+        // Cases:
+        // 1. it1_present  && !it2_present -> it1++
+        // 2. !it1_present && it2_present  -> add it2, it2++
+        // 3. it1_present  && it2_present
+        //    a. it1 <  it2 -> it1++
+        //    b. it1 == it2 -> it1 | it2, it1++, it2++
+        //    c. it1 >  it2 -> add it2, it2++
+        int compare_result = 0;
+        if (it1_present && it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 3b: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                uint8_t typecode2;
+                container_t *container2;
+                if (leaf1->typecode == SHARED_CONTAINER_TYPE) {
+                    container2 = container_or(leaf1->container, leaf1->typecode,
+                                              leaf2->container, leaf2->typecode,
+                                              &typecode2);
+                } else {
+                    container2 = container_ior(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                }
+                if (container2 != leaf1->container) {
+                    container_free(leaf1->container, leaf1->typecode);
+                    leaf1->container = container2;
+                    leaf1->typecode = typecode2;
+                }
+                art_iterator_next(&it1);
+                art_iterator_next(&it2);
+            }
+        }
+        if ((it1_present && !it2_present) || compare_result < 0) {
+            // Cases 1 and 3a: it1 is the only iterator or is before it2.
+            art_iterator_next(&it1);
+        } else if ((!it1_present && it2_present) || compare_result > 0) {
+            // Cases 2 and 3c: it2 is the only iterator or is before it1.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it2.value);
+            art_iterator_insert(&r1->art, &it1, it2.key,
+                                (art_val_t *)result_leaf);
+            art_iterator_next(&it2);
+        }
+    }
+}
+
+roaring64_bitmap_t *roaring64_bitmap_xor(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2) {
+    roaring64_bitmap_t *result = roaring64_bitmap_create();
+
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL || it2.value != NULL) {
+        bool it1_present = it1.value != NULL;
+        bool it2_present = it2.value != NULL;
+
+        // Cases:
+        // 1. it1_present  && !it2_present -> output it1, it1++
+        // 2. !it1_present && it2_present  -> output it2, it2++
+        // 3. it1_present  && it2_present
+        //    a. it1 <  it2 -> output it1, it1++
+        //    b. it1 == it2 -> output it1 ^ it2, it1++, it2++
+        //    c. it1 >  it2 -> output it2, it2++
+        int compare_result = 0;
+        if (it1_present && it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 3b: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                leaf_t *result_leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+                result_leaf->container = container_xor(
+                    leaf1->container, leaf1->typecode, leaf2->container,
+                    leaf2->typecode, &result_leaf->typecode);
+                if (container_nonzero_cardinality(result_leaf->container,
+                                                  result_leaf->typecode)) {
+                    art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+                } else {
+                    container_free(result_leaf->container,
+                                   result_leaf->typecode);
+                    free_leaf(result_leaf);
+                }
+                art_iterator_next(&it1);
+                art_iterator_next(&it2);
+            }
+        }
+        if ((it1_present && !it2_present) || compare_result < 0) {
+            // Cases 1 and 3a: it1 is the only iterator or is before it2.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it1.value);
+            art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+            art_iterator_next(&it1);
+        } else if ((!it1_present && it2_present) || compare_result > 0) {
+            // Cases 2 and 3c: it2 is the only iterator or is before it1.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it2.value);
+            art_insert(&result->art, it2.key, (art_val_t *)result_leaf);
+            art_iterator_next(&it2);
+        }
+    }
+    return result;
+}
+
+uint64_t roaring64_bitmap_xor_cardinality(const roaring64_bitmap_t *r1,
+                                          const roaring64_bitmap_t *r2) {
+    uint64_t c1 = roaring64_bitmap_get_cardinality(r1);
+    uint64_t c2 = roaring64_bitmap_get_cardinality(r2);
+    uint64_t inter = roaring64_bitmap_and_cardinality(r1, r2);
+    return c1 + c2 - 2 * inter;
+}
+
+void roaring64_bitmap_xor_inplace(roaring64_bitmap_t *r1,
+                                  const roaring64_bitmap_t *r2) {
+    assert(r1 != r2);
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL || it2.value != NULL) {
+        bool it1_present = it1.value != NULL;
+        bool it2_present = it2.value != NULL;
+
+        // Cases:
+        // 1.  it1_present && !it2_present -> it1++
+        // 2. !it1_present &&  it2_present -> add it2, it2++
+        // 3.  it1_present &&  it2_present
+        //    a. it1 <  it2 -> it1++
+        //    b. it1 == it2 -> it1 ^ it2, it1++, it2++
+        //    c. it1 >  it2 -> add it2, it2++
+        int compare_result = 0;
+        if (it1_present && it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 3b: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                container_t *container1 = leaf1->container;
+                uint8_t typecode1 = leaf1->typecode;
+                uint8_t typecode2;
+                container_t *container2;
+                if (leaf1->typecode == SHARED_CONTAINER_TYPE) {
+                    container2 = container_xor(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                    if (container2 != container1) {
+                        // We only free when doing container_xor, not
+                        // container_ixor, as ixor frees the original
+                        // internally.
+                        container_free(container1, typecode1);
+                    }
+                } else {
+                    container2 = container_ixor(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                }
+                leaf1->container = container2;
+                leaf1->typecode = typecode2;
+
+                if (!container_nonzero_cardinality(container2, typecode2)) {
+                    container_free(container2, typecode2);
+                    art_iterator_erase(&r1->art, &it1);
+                    free_leaf(leaf1);
+                } else {
+                    // Only advance the iterator if we didn't delete the
+                    // leaf, as erasing advances by itself.
+                    art_iterator_next(&it1);
+                }
+                art_iterator_next(&it2);
+            }
+        }
+        if ((it1_present && !it2_present) || compare_result < 0) {
+            // Cases 1 and 3a: it1 is the only iterator or is before it2.
+            art_iterator_next(&it1);
+        } else if ((!it1_present && it2_present) || compare_result > 0) {
+            // Cases 2 and 3c: it2 is the only iterator or is before it1.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it2.value);
+            if (it1_present) {
+                art_iterator_insert(&r1->art, &it1, it2.key,
+                                    (art_val_t *)result_leaf);
+                art_iterator_next(&it1);
+            } else {
+                art_insert(&r1->art, it2.key, (art_val_t *)result_leaf);
+            }
+            art_iterator_next(&it2);
+        }
+    }
+}
+
+roaring64_bitmap_t *roaring64_bitmap_andnot(const roaring64_bitmap_t *r1,
+                                            const roaring64_bitmap_t *r2) {
+    roaring64_bitmap_t *result = roaring64_bitmap_create();
+
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL) {
+        // Cases:
+        // 1. it1_present && !it2_present -> output it1, it1++
+        // 2. it1_present && it2_present
+        //    a. it1 <  it2 -> output it1, it1++
+        //    b. it1 == it2 -> output it1 - it2, it1++, it2++
+        //    c. it1 >  it2 -> it2++
+        bool it2_present = it2.value != NULL;
+        int compare_result = 0;
+        if (it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 2b: iterators at the same high key position.
+                leaf_t *result_leaf = (leaf_t *)roaring_malloc(sizeof(leaf_t));
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                result_leaf->container = container_andnot(
+                    leaf1->container, leaf1->typecode, leaf2->container,
+                    leaf2->typecode, &result_leaf->typecode);
+
+                if (container_nonzero_cardinality(result_leaf->container,
+                                                  result_leaf->typecode)) {
+                    art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+                } else {
+                    container_free(result_leaf->container,
+                                   result_leaf->typecode);
+                    free_leaf(result_leaf);
+                }
+                art_iterator_next(&it1);
+                art_iterator_next(&it2);
+            }
+        }
+        if (!it2_present || compare_result < 0) {
+            // Cases 1 and 2a: it1 is the only iterator or is before it2.
+            leaf_t *result_leaf = copy_leaf_container((leaf_t *)it1.value);
+            art_insert(&result->art, it1.key, (art_val_t *)result_leaf);
+            art_iterator_next(&it1);
+        } else if (compare_result > 0) {
+            // Case 2c: it1 is after it2.
+            art_iterator_next(&it2);
+        }
+    }
+    return result;
+}
+
+uint64_t roaring64_bitmap_andnot_cardinality(const roaring64_bitmap_t *r1,
+                                             const roaring64_bitmap_t *r2) {
+    uint64_t c1 = roaring64_bitmap_get_cardinality(r1);
+    uint64_t inter = roaring64_bitmap_and_cardinality(r1, r2);
+    return c1 - inter;
+}
+
+void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1,
+                                     const roaring64_bitmap_t *r2) {
+    art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true);
+    art_iterator_t it2 = art_init_iterator(&r2->art, /*first=*/true);
+
+    while (it1.value != NULL) {
+        // Cases:
+        // 1. it1_present && !it2_present -> it1++
+        // 2. it1_present &&  it2_present
+        //    a. it1 <  it2 -> it1++
+        //    b. it1 == it2 -> it1 - it2, it1++, it2++
+        //    c. it1 >  it2 -> it2++
+        bool it2_present = it2.value != NULL;
+        int compare_result = 0;
+        if (it2_present) {
+            compare_result = compare_high48(it1.key, it2.key);
+            if (compare_result == 0) {
+                // Case 2b: iterators at the same high key position.
+                leaf_t *leaf1 = (leaf_t *)it1.value;
+                leaf_t *leaf2 = (leaf_t *)it2.value;
+                container_t *container1 = leaf1->container;
+                uint8_t typecode1 = leaf1->typecode;
+                uint8_t typecode2;
+                container_t *container2;
+                if (leaf1->typecode == SHARED_CONTAINER_TYPE) {
+                    container2 = container_andnot(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                    if (container2 != container1) {
+                        // We only free when doing container_andnot, not
+                        // container_iandnot, as iandnot frees the original
+                        // internally.
+                        container_free(container1, typecode1);
+                    }
+                } else {
+                    container2 = container_iandnot(
+                        leaf1->container, leaf1->typecode, leaf2->container,
+                        leaf2->typecode, &typecode2);
+                }
+                if (container2 != container1) {
+                    leaf1->container = container2;
+                    leaf1->typecode = typecode2;
+                }
+
+                if (!container_nonzero_cardinality(container2, typecode2)) {
+                    container_free(container2, typecode2);
+                    art_iterator_erase(&r1->art, &it1);
+                    free_leaf(leaf1);
+                } else {
+                    // Only advance the iterator if we didn't delete the
+                    // leaf, as erasing advances by itself.
+                    art_iterator_next(&it1);
+                }
+                art_iterator_next(&it2);
+            }
+        }
+        if (!it2_present || compare_result < 0) {
+            // Cases 1 and 2a: it1 is the only iterator or is before it2.
+            art_iterator_next(&it1);
+        } else if (compare_result > 0) {
+            // Case 2c: it1 is after it2.
+            art_iterator_next(&it2);
+        }
+    }
+}
+
+/**
+ * Flips the leaf at high48 in the range [min, max), returning a new leaf with a
+ * new container. If the high48 key is not found in the existing bitmap, a new
+ * container is created. Returns null if the negation results in an empty range.
+ */
+static leaf_t *roaring64_flip_leaf(const roaring64_bitmap_t *r,
+                                   uint8_t high48[], uint32_t min,
+                                   uint32_t max) {
+    leaf_t *leaf1 = (leaf_t *)art_find(&r->art, high48);
+    container_t *container2;
+    uint8_t typecode2;
+    if (leaf1 == NULL) {
+        // No container at this key, create a full container.
+        container2 = container_range_of_ones(min, max, &typecode2);
+    } else if (min == 0 && max > 0xFFFF) {
+        // Flip whole container.
+        container2 =
+            container_not(leaf1->container, leaf1->typecode, &typecode2);
+    } else {
+        // Partially flip a container.
+        container2 = container_not_range(leaf1->container, leaf1->typecode, min,
+                                         max, &typecode2);
+    }
+    if (container_nonzero_cardinality(container2, typecode2)) {
+        return create_leaf(container2, typecode2);
+    }
+    container_free(container2, typecode2);
+    return NULL;
+}
+
+/**
+ * Flips the leaf at high48 in the range [min, max). If the high48 key is not
+ * found in the bitmap, a new container is created. Deletes the leaf and
+ * associated container if the negation results in an empty range.
+ */
+static void roaring64_flip_leaf_inplace(roaring64_bitmap_t *r, uint8_t high48[],
+                                        uint32_t min, uint32_t max) {
+    leaf_t *leaf = (leaf_t *)art_find(&r->art, high48);
+    container_t *container2;
+    uint8_t typecode2;
+    if (leaf == NULL) {
+        // No container at this key, insert a full container.
+        container2 = container_range_of_ones(min, max, &typecode2);
+        art_insert(&r->art, high48,
+                   (art_val_t *)create_leaf(container2, typecode2));
+        return;
+    }
+
+    if (min == 0 && max > 0xFFFF) {
+        // Flip whole container.
+        container2 =
+            container_inot(leaf->container, leaf->typecode, &typecode2);
+    } else {
+        // Partially flip a container.
+        container2 = container_inot_range(leaf->container, leaf->typecode, min,
+                                          max, &typecode2);
+    }
+
+    leaf->container = container2;
+    leaf->typecode = typecode2;
+
+    if (!container_nonzero_cardinality(leaf->container, leaf->typecode)) {
+        art_erase(&r->art, high48);
+        container_free(leaf->container, leaf->typecode);
+        free_leaf(leaf);
+    }
+}
+
+roaring64_bitmap_t *roaring64_bitmap_flip(const roaring64_bitmap_t *r,
+                                          uint64_t min, uint64_t max) {
+    if (min >= max) {
+        return roaring64_bitmap_copy(r);
+    }
+    return roaring64_bitmap_flip_closed(r, min, max - 1);
+}
+
+roaring64_bitmap_t *roaring64_bitmap_flip_closed(const roaring64_bitmap_t *r1,
+                                                 uint64_t min, uint64_t max) {
+    if (min > max) {
+        return roaring64_bitmap_copy(r1);
+    }
+    uint8_t min_high48_key[ART_KEY_BYTES];
+    uint16_t min_low16 = split_key(min, min_high48_key);
+    uint8_t max_high48_key[ART_KEY_BYTES];
+    uint16_t max_low16 = split_key(max, max_high48_key);
+    uint64_t min_high48_bits = (min & 0xFFFFFFFFFFFF0000ULL) >> 16;
+    uint64_t max_high48_bits = (max & 0xFFFFFFFFFFFF0000ULL) >> 16;
+
+    roaring64_bitmap_t *r2 = roaring64_bitmap_create();
+    art_iterator_t it = art_init_iterator(&r1->art, /*first=*/true);
+
+    // Copy the containers before min unchanged.
+    while (it.value != NULL && compare_high48(it.key, min_high48_key) < 0) {
+        leaf_t *leaf1 = (leaf_t *)it.value;
+        uint8_t typecode2 = leaf1->typecode;
+        container_t *container2 = get_copy_of_container(
+            leaf1->container, &typecode2, /*copy_on_write=*/false);
+        art_insert(&r2->art, it.key,
+                   (art_val_t *)create_leaf(container2, typecode2));
+        art_iterator_next(&it);
+    }
+
+    // Flip the range (including non-existent containers!) between min and max.
+    for (uint64_t high48_bits = min_high48_bits; high48_bits <= max_high48_bits;
+         high48_bits++) {
+        uint8_t current_high48_key[ART_KEY_BYTES];
+        split_key(high48_bits << 16, current_high48_key);
+
+        uint32_t min_container = 0;
+        if (high48_bits == min_high48_bits) {
+            min_container = min_low16;
+        }
+        uint32_t max_container = 0xFFFF + 1;  // Exclusive range.
+        if (high48_bits == max_high48_bits) {
+            max_container = max_low16 + 1;  // Exclusive.
+        }
+
+        leaf_t *leaf = roaring64_flip_leaf(r1, current_high48_key,
+                                           min_container, max_container);
+        if (leaf != NULL) {
+            art_insert(&r2->art, current_high48_key, (art_val_t *)leaf);
+        }
+    }
+
+    // Copy the containers after max unchanged.
+    it = art_upper_bound(&r1->art, max_high48_key);
+    while (it.value != NULL) {
+        leaf_t *leaf1 = (leaf_t *)it.value;
+        uint8_t typecode2 = leaf1->typecode;
+        container_t *container2 = get_copy_of_container(
+            leaf1->container, &typecode2, /*copy_on_write=*/false);
+        art_insert(&r2->art, it.key,
+                   (art_val_t *)create_leaf(container2, typecode2));
+        art_iterator_next(&it);
+    }
+
+    return r2;
+}
+
+void roaring64_bitmap_flip_inplace(roaring64_bitmap_t *r, uint64_t min,
+                                   uint64_t max) {
+    if (min >= max) {
+        return;
+    }
+    roaring64_bitmap_flip_closed_inplace(r, min, max - 1);
+}
+
+void roaring64_bitmap_flip_closed_inplace(roaring64_bitmap_t *r, uint64_t min,
+                                          uint64_t max) {
+    if (min > max) {
+        return;
+    }
+    uint16_t min_low16 = (uint16_t)min;
+    uint16_t max_low16 = (uint16_t)max;
+    uint64_t min_high48_bits = (min & 0xFFFFFFFFFFFF0000ULL) >> 16;
+    uint64_t max_high48_bits = (max & 0xFFFFFFFFFFFF0000ULL) >> 16;
+
+    // Flip the range (including non-existent containers!) between min and max.
+    for (uint64_t high48_bits = min_high48_bits; high48_bits <= max_high48_bits;
+         high48_bits++) {
+        uint8_t current_high48_key[ART_KEY_BYTES];
+        split_key(high48_bits << 16, current_high48_key);
+
+        uint32_t min_container = 0;
+        if (high48_bits == min_high48_bits) {
+            min_container = min_low16;
+        }
+        uint32_t max_container = 0xFFFF + 1;  // Exclusive range.
+        if (high48_bits == max_high48_bits) {
+            max_container = max_low16 + 1;  // Exclusive.
+        }
+
+        roaring64_flip_leaf_inplace(r, current_high48_key, min_container,
+                                    max_container);
+    }
+}
+
+// Returns the number of distinct high 32-bit entries in the bitmap.
+static inline uint64_t count_high32(const roaring64_bitmap_t *r) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint64_t high32_count = 0;
+    uint32_t prev_high32 = 0;
+    while (it.value != NULL) {
+        uint32_t current_high32 = (uint32_t)(combine_key(it.key, 0) >> 32);
+        if (high32_count == 0 || prev_high32 != current_high32) {
+            high32_count++;
+            prev_high32 = current_high32;
+        }
+        art_iterator_next(&it);
+    }
+    return high32_count;
+}
+
+// Frees the (32-bit!) bitmap without freeing the containers.
+static inline void roaring_bitmap_free_without_containers(roaring_bitmap_t *r) {
+    ra_clear_without_containers(&r->high_low_container);
+    roaring_free(r);
+}
+
+size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r) {
+    // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+    size_t size = 0;
+
+    // Write as uint64 the distinct number of "buckets", where a bucket is
+    // defined as the most significant 32 bits of an element.
+    uint64_t high32_count;
+    size += sizeof(high32_count);
+
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint32_t prev_high32 = 0;
+    roaring_bitmap_t *bitmap32 = NULL;
+
+    // Iterate through buckets ordered by increasing keys.
+    while (it.value != NULL) {
+        uint32_t current_high32 = (uint32_t)(combine_key(it.key, 0) >> 32);
+        if (bitmap32 == NULL || prev_high32 != current_high32) {
+            if (bitmap32 != NULL) {
+                // Write as uint32 the most significant 32 bits of the bucket.
+                size += sizeof(prev_high32);
+
+                // Write the 32-bit Roaring bitmaps representing the least
+                // significant bits of a set of elements.
+                size += roaring_bitmap_portable_size_in_bytes(bitmap32);
+                roaring_bitmap_free_without_containers(bitmap32);
+            }
+
+            // Start a new 32-bit bitmap with the current high 32 bits.
+            art_iterator_t it2 = it;
+            uint32_t containers_with_high32 = 0;
+            while (it2.value != NULL && (uint32_t)(combine_key(it2.key, 0) >>
+                                                   32) == current_high32) {
+                containers_with_high32++;
+                art_iterator_next(&it2);
+            }
+            bitmap32 =
+                roaring_bitmap_create_with_capacity(containers_with_high32);
+
+            prev_high32 = current_high32;
+        }
+        leaf_t *leaf = (leaf_t *)it.value;
+        ra_append(&bitmap32->high_low_container,
+                  (uint16_t)(current_high32 >> 16), leaf->container,
+                  leaf->typecode);
+        art_iterator_next(&it);
+    }
+
+    if (bitmap32 != NULL) {
+        // Write as uint32 the most significant 32 bits of the bucket.
+        size += sizeof(prev_high32);
+
+        // Write the 32-bit Roaring bitmaps representing the least
+        // significant bits of a set of elements.
+        size += roaring_bitmap_portable_size_in_bytes(bitmap32);
+        roaring_bitmap_free_without_containers(bitmap32);
+    }
+
+    return size;
+}
+
+size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r,
+                                           char *buf) {
+    // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+    if (buf == NULL) {
+        return 0;
+    }
+    const char *initial_buf = buf;
+
+    // Write as uint64 the distinct number of "buckets", where a bucket is
+    // defined as the most significant 32 bits of an element.
+    uint64_t high32_count = count_high32(r);
+    memcpy(buf, &high32_count, sizeof(high32_count));
+    buf += sizeof(high32_count);
+
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    uint32_t prev_high32 = 0;
+    roaring_bitmap_t *bitmap32 = NULL;
+
+    // Iterate through buckets ordered by increasing keys.
+    while (it.value != NULL) {
+        uint64_t current_high48 = combine_key(it.key, 0);
+        uint32_t current_high32 = (uint32_t)(current_high48 >> 32);
+        if (bitmap32 == NULL || prev_high32 != current_high32) {
+            if (bitmap32 != NULL) {
+                // Write as uint32 the most significant 32 bits of the bucket.
+                memcpy(buf, &prev_high32, sizeof(prev_high32));
+                buf += sizeof(prev_high32);
+
+                // Write the 32-bit Roaring bitmaps representing the least
+                // significant bits of a set of elements.
+                buf += roaring_bitmap_portable_serialize(bitmap32, buf);
+                roaring_bitmap_free_without_containers(bitmap32);
+            }
+
+            // Start a new 32-bit bitmap with the current high 32 bits.
+            art_iterator_t it2 = it;
+            uint32_t containers_with_high32 = 0;
+            while (it2.value != NULL &&
+                   (uint32_t)combine_key(it2.key, 0) == current_high32) {
+                containers_with_high32++;
+                art_iterator_next(&it2);
+            }
+            bitmap32 =
+                roaring_bitmap_create_with_capacity(containers_with_high32);
+
+            prev_high32 = current_high32;
+        }
+        leaf_t *leaf = (leaf_t *)it.value;
+        ra_append(&bitmap32->high_low_container,
+                  (uint16_t)(current_high48 >> 16), leaf->container,
+                  leaf->typecode);
+        art_iterator_next(&it);
+    }
+
+    if (bitmap32 != NULL) {
+        // Write as uint32 the most significant 32 bits of the bucket.
+        memcpy(buf, &prev_high32, sizeof(prev_high32));
+        buf += sizeof(prev_high32);
+
+        // Write the 32-bit Roaring bitmaps representing the least
+        // significant bits of a set of elements.
+        buf += roaring_bitmap_portable_serialize(bitmap32, buf);
+        roaring_bitmap_free_without_containers(bitmap32);
+    }
+
+    return buf - initial_buf;
+}
+
+size_t roaring64_bitmap_portable_deserialize_size(const char *buf,
+                                                  size_t maxbytes) {
+    // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+    if (buf == NULL) {
+        return 0;
+    }
+    size_t read_bytes = 0;
+
+    // Read as uint64 the distinct number of "buckets", where a bucket is
+    // defined as the most significant 32 bits of an element.
+    uint64_t buckets;
+    if (read_bytes + sizeof(buckets) > maxbytes) {
+        return 0;
+    }
+    memcpy(&buckets, buf, sizeof(buckets));
+    buf += sizeof(buckets);
+    read_bytes += sizeof(buckets);
+
+    // Buckets should be 32 bits with 4 bits of zero padding.
+    if (buckets > UINT32_MAX) {
+        return 0;
+    }
+
+    // Iterate through buckets ordered by increasing keys.
+    for (uint64_t bucket = 0; bucket < buckets; ++bucket) {
+        // Read as uint32 the most significant 32 bits of the bucket.
+        uint32_t high32;
+        if (read_bytes + sizeof(high32) > maxbytes) {
+            return 0;
+        }
+        buf += sizeof(high32);
+        read_bytes += sizeof(high32);
+
+        // Read the 32-bit Roaring bitmaps representing the least significant
+        // bits of a set of elements.
+        size_t bitmap32_size = roaring_bitmap_portable_deserialize_size(
+            buf, maxbytes - read_bytes);
+        if (bitmap32_size == 0) {
+            return 0;
+        }
+        buf += bitmap32_size;
+        read_bytes += bitmap32_size;
+    }
+    return read_bytes;
+}
+
+roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe(
+    const char *buf, size_t maxbytes) {
+    // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+    if (buf == NULL) {
+        return NULL;
+    }
+    size_t read_bytes = 0;
+
+    // Read as uint64 the distinct number of "buckets", where a bucket is
+    // defined as the most significant 32 bits of an element.
+    uint64_t buckets;
+    if (read_bytes + sizeof(buckets) > maxbytes) {
+        return NULL;
+    }
+    memcpy(&buckets, buf, sizeof(buckets));
+    buf += sizeof(buckets);
+    read_bytes += sizeof(buckets);
+
+    // Buckets should be 32 bits with 4 bits of zero padding.
+    if (buckets > UINT32_MAX) {
+        return NULL;
+    }
+
+    roaring64_bitmap_t *r = roaring64_bitmap_create();
+    // Iterate through buckets ordered by increasing keys.
+    for (uint64_t bucket = 0; bucket < buckets; ++bucket) {
+        // Read as uint32 the most significant 32 bits of the bucket.
+        uint32_t high32;
+        if (read_bytes + sizeof(high32) > maxbytes) {
+            roaring64_bitmap_free(r);
+            return NULL;
+        }
+        memcpy(&high32, buf, sizeof(high32));
+        buf += sizeof(high32);
+        read_bytes += sizeof(high32);
+
+        // Read the 32-bit Roaring bitmaps representing the least significant
+        // bits of a set of elements.
+        size_t bitmap32_size = roaring_bitmap_portable_deserialize_size(
+            buf, maxbytes - read_bytes);
+        if (bitmap32_size == 0) {
+            roaring64_bitmap_free(r);
+            return NULL;
+        }
+
+        roaring_bitmap_t *bitmap32 = roaring_bitmap_portable_deserialize_safe(
+            buf, maxbytes - read_bytes);
+        if (bitmap32 == NULL) {
+            roaring64_bitmap_free(r);
+            return NULL;
+        }
+        buf += bitmap32_size;
+        read_bytes += bitmap32_size;
+
+        // Insert all containers of the 32-bit bitmap into the 64-bit bitmap.
+        uint32_t r32_size = ra_get_size(&bitmap32->high_low_container);
+        for (size_t i = 0; i < r32_size; ++i) {
+            uint16_t key16 =
+                ra_get_key_at_index(&bitmap32->high_low_container, (uint16_t)i);
+            uint8_t typecode;
+            container_t *container = ra_get_container_at_index(
+                &bitmap32->high_low_container, (uint16_t)i, &typecode);
+
+            uint64_t high48_bits =
+                (((uint64_t)high32) << 32) | (((uint64_t)key16) << 16);
+            uint8_t high48[ART_KEY_BYTES];
+            split_key(high48_bits, high48);
+            leaf_t *leaf = create_leaf(container, typecode);
+            art_insert(&r->art, high48, (art_val_t *)leaf);
+        }
+        roaring_bitmap_free_without_containers(bitmap32);
+    }
+    return r;
+}
+
+bool roaring64_bitmap_iterate(const roaring64_bitmap_t *r,
+                              roaring_iterator64 iterator, void *ptr) {
+    art_iterator_t it = art_init_iterator(&r->art, /*first=*/true);
+    while (it.value != NULL) {
+        uint64_t high48 = combine_key(it.key, 0);
+        uint64_t high32 = high48 & 0xFFFFFFFF00000000ULL;
+        uint32_t low32 = high48;
+        leaf_t *leaf = (leaf_t *)it.value;
+        if (!container_iterate64(leaf->container, leaf->typecode, low32,
+                                 iterator, high32, ptr)) {
+            return false;
+        }
+        art_iterator_next(&it);
+    }
+    return true;
+}
+
+void roaring64_bitmap_to_uint64_array(const roaring64_bitmap_t *r,
+                                      uint64_t *out) {
+    roaring64_iterator_t it = {0};
+    roaring64_iterator_init_at(r, &it, /*first=*/true);
+    roaring64_iterator_read(&it, out, UINT64_MAX);
+}
+
+roaring64_iterator_t *roaring64_iterator_create(const roaring64_bitmap_t *r) {
+    roaring64_iterator_t *it =
+        (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t));
+    return roaring64_iterator_init_at(r, it, /*first=*/true);
+}
+
+roaring64_iterator_t *roaring64_iterator_create_last(
+    const roaring64_bitmap_t *r) {
+    roaring64_iterator_t *it =
+        (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t));
+    return roaring64_iterator_init_at(r, it, /*first=*/false);
+}
+
+void roaring64_iterator_reinit(const roaring64_bitmap_t *r,
+                               roaring64_iterator_t *it) {
+    roaring64_iterator_init_at(r, it, /*first=*/true);
+}
+
+void roaring64_iterator_reinit_last(const roaring64_bitmap_t *r,
+                                    roaring64_iterator_t *it) {
+    roaring64_iterator_init_at(r, it, /*first=*/false);
+}
+
+roaring64_iterator_t *roaring64_iterator_copy(const roaring64_iterator_t *it) {
+    roaring64_iterator_t *new_it =
+        (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t));
+    memcpy(new_it, it, sizeof(*it));
+    return new_it;
+}
+
+void roaring64_iterator_free(roaring64_iterator_t *it) { roaring_free(it); }
+
+bool roaring64_iterator_has_value(const roaring64_iterator_t *it) {
+    return it->has_value;
+}
+
+uint64_t roaring64_iterator_value(const roaring64_iterator_t *it) {
+    return it->value;
+}
+
+bool roaring64_iterator_advance(roaring64_iterator_t *it) {
+    if (it->art_it.value == NULL) {
+        if (it->saturated_forward) {
+            return (it->has_value = false);
+        }
+        roaring64_iterator_init_at(it->parent, it, /*first=*/true);
+        return it->has_value;
+    }
+    leaf_t *leaf = (leaf_t *)it->art_it.value;
+    uint16_t low16 = (uint16_t)it->value;
+    if (container_iterator_next(leaf->container, leaf->typecode,
+                                &it->container_it, &low16)) {
+        it->value = it->high48 | low16;
+        return (it->has_value = true);
+    }
+    if (art_iterator_next(&it->art_it)) {
+        return roaring64_iterator_init_at_leaf_first(it);
+    }
+    it->saturated_forward = true;
+    return (it->has_value = false);
+}
+
+bool roaring64_iterator_previous(roaring64_iterator_t *it) {
+    if (it->art_it.value == NULL) {
+        if (!it->saturated_forward) {
+            // Saturated backward.
+            return (it->has_value = false);
+        }
+        roaring64_iterator_init_at(it->parent, it, /*first=*/false);
+        return it->has_value;
+    }
+    leaf_t *leaf = (leaf_t *)it->art_it.value;
+    uint16_t low16 = (uint16_t)it->value;
+    if (container_iterator_prev(leaf->container, leaf->typecode,
+                                &it->container_it, &low16)) {
+        it->value = it->high48 | low16;
+        return (it->has_value = true);
+    }
+    if (art_iterator_prev(&it->art_it)) {
+        return roaring64_iterator_init_at_leaf_last(it);
+    }
+    it->saturated_forward = false;  // Saturated backward.
+    return (it->has_value = false);
+}
+
+bool roaring64_iterator_move_equalorlarger(roaring64_iterator_t *it,
+                                           uint64_t val) {
+    uint8_t val_high48[ART_KEY_BYTES];
+    uint16_t val_low16 = split_key(val, val_high48);
+    if (!it->has_value || it->high48 != (val & 0xFFFFFFFFFFFF0000)) {
+        // The ART iterator is before or after the high48 bits of `val` (or
+        // beyond the ART altogether), so we need to move to a leaf with a key
+        // equal or greater.
+        if (!art_iterator_lower_bound(&it->art_it, val_high48)) {
+            // Only smaller keys found.
+            it->saturated_forward = true;
+            return (it->has_value = false);
+        }
+        it->high48 = combine_key(it->art_it.key, 0);
+        // Fall through to the next if statement.
+    }
+
+    if (it->high48 == (val & 0xFFFFFFFFFFFF0000)) {
+        // We're at equal high bits, check if a suitable value can be found in
+        // this container.
+        leaf_t *leaf = (leaf_t *)it->art_it.value;
+        uint16_t low16 = (uint16_t)it->value;
+        if (container_iterator_lower_bound(leaf->container, leaf->typecode,
+                                           &it->container_it, &low16,
+                                           val_low16)) {
+            it->value = it->high48 | low16;
+            return (it->has_value = true);
+        }
+        // Only smaller entries in this container, move to the next.
+        if (!art_iterator_next(&it->art_it)) {
+            it->saturated_forward = true;
+            return (it->has_value = false);
+        }
+    }
+
+    // We're at a leaf with high bits greater than `val`, so the first entry in
+    // this container is our result.
+    return roaring64_iterator_init_at_leaf_first(it);
+}
+
+uint64_t roaring64_iterator_read(roaring64_iterator_t *it, uint64_t *buf,
+                                 uint64_t count) {
+    uint64_t consumed = 0;
+    while (it->has_value && consumed < count) {
+        uint32_t container_consumed;
+        leaf_t *leaf = (leaf_t *)it->art_it.value;
+        uint16_t low16 = (uint16_t)it->value;
+        uint32_t container_count = UINT32_MAX;
+        if (count - consumed < (uint64_t)UINT32_MAX) {
+            container_count = count - consumed;
+        }
+        bool has_value = container_iterator_read_into_uint64(
+            leaf->container, leaf->typecode, &it->container_it, it->high48, buf,
+            container_count, &container_consumed, &low16);
+        consumed += container_consumed;
+        buf += container_consumed;
+        if (has_value) {
+            it->has_value = true;
+            it->value = it->high48 | low16;
+            assert(consumed == count);
+            return consumed;
+        }
+        it->has_value = art_iterator_next(&it->art_it);
+        if (it->has_value) {
+            roaring64_iterator_init_at_leaf_first(it);
+        }
+    }
+    return consumed;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace roaring
+}  // namespace api
+#endif
+/* end file src/roaring64.c */
+/* begin file src/roaring_array.c */
+#include <assert.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace internal {
+#endif
+
+// Convention: [0,ra->size) all elements are initialized
+//  [ra->size, ra->allocation_size) is junk and contains nothing needing freeing
+
+extern inline int32_t ra_get_size(const roaring_array_t *ra);
+extern inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x);
+
+extern inline container_t *ra_get_container_at_index(const roaring_array_t *ra,
+                                                     uint16_t i,
+                                                     uint8_t *typecode);
+
+extern inline void ra_unshare_container_at_index(roaring_array_t *ra,
+                                                 uint16_t i);
+
+extern inline void ra_replace_key_and_container_at_index(roaring_array_t *ra,
+                                                         int32_t i,
+                                                         uint16_t key,
+                                                         container_t *c,
+                                                         uint8_t typecode);
+
+extern inline void ra_set_container_at_index(const roaring_array_t *ra,
+                                             int32_t i, container_t *c,
+                                             uint8_t typecode);
+
+static bool realloc_array(roaring_array_t *ra, int32_t new_capacity) {
+    //
+    // Note: not implemented using C's realloc(), because the memory layout is
+    // Struct-of-Arrays vs. Array-of-Structs:
+    // https://github.com/RoaringBitmap/CRoaring/issues/256
+
+    if (new_capacity == 0) {
+        roaring_free(ra->containers);
+        ra->containers = NULL;
+        ra->keys = NULL;
+        ra->typecodes = NULL;
+        ra->allocation_size = 0;
+        return true;
+    }
+    const size_t memoryneeded =
+        new_capacity *
+        (sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t));
+    void *bigalloc = roaring_malloc(memoryneeded);
+    if (!bigalloc) return false;
+    void *oldbigalloc = ra->containers;
+    container_t **newcontainers = (container_t **)bigalloc;
+    uint16_t *newkeys = (uint16_t *)(newcontainers + new_capacity);
+    uint8_t *newtypecodes = (uint8_t *)(newkeys + new_capacity);
+    assert((char *)(newtypecodes + new_capacity) ==
+           (char *)bigalloc + memoryneeded);
+    if (ra->size > 0) {
+        memcpy(newcontainers, ra->containers, sizeof(container_t *) * ra->size);
+        memcpy(newkeys, ra->keys, sizeof(uint16_t) * ra->size);
+        memcpy(newtypecodes, ra->typecodes, sizeof(uint8_t) * ra->size);
+    }
+    ra->containers = newcontainers;
+    ra->keys = newkeys;
+    ra->typecodes = newtypecodes;
+    ra->allocation_size = new_capacity;
+    roaring_free(oldbigalloc);
+    return true;
+}
+
+bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap) {
+    if (!new_ra) return false;
+    ra_init(new_ra);
+
+    // Containers hold 64Ki elements, so 64Ki containers is enough to hold
+    // `0x10000 * 0x10000` (all 2^32) elements
+    if (cap > 0x10000) {
+        cap = 0x10000;
+    }
+
+    if (cap > 0) {
+        void *bigalloc = roaring_malloc(
+            cap * (sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t)));
+        if (bigalloc == NULL) return false;
+        new_ra->containers = (container_t **)bigalloc;
+        new_ra->keys = (uint16_t *)(new_ra->containers + cap);
+        new_ra->typecodes = (uint8_t *)(new_ra->keys + cap);
+        // Narrowing is safe because of above check
+        new_ra->allocation_size = (int32_t)cap;
+    }
+    return true;
+}
+
+int ra_shrink_to_fit(roaring_array_t *ra) {
+    int savings = (ra->allocation_size - ra->size) *
+                  (sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t));
+    if (!realloc_array(ra, ra->size)) {
+        return 0;
+    }
+    ra->allocation_size = ra->size;
+    return savings;
+}
+
+void ra_init(roaring_array_t *new_ra) {
+    if (!new_ra) {
+        return;
+    }
+    new_ra->keys = NULL;
+    new_ra->containers = NULL;
+    new_ra->typecodes = NULL;
+
+    new_ra->allocation_size = 0;
+    new_ra->size = 0;
+    new_ra->flags = 0;
+}
+
+bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
+                  bool copy_on_write) {
+    ra_clear_containers(dest);  // we are going to overwrite them
+    if (source->size == 0) {    // Note: can't call memcpy(NULL), even w/size
+        dest->size = 0;         // <--- This is important.
+        return true;            // output was just cleared, so they match
+    }
+    if (dest->allocation_size < source->size) {
+        if (!realloc_array(dest, source->size)) {
+            return false;
+        }
+    }
+    dest->size = source->size;
+    memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t));
+    // we go through the containers, turning them into shared containers...
+    if (copy_on_write) {
+        for (int32_t i = 0; i < dest->size; ++i) {
+            source->containers[i] = get_copy_of_container(
+                source->containers[i], &source->typecodes[i], copy_on_write);
+        }
+        // we do a shallow copy to the other bitmap
+        memcpy(dest->containers, source->containers,
+               dest->size * sizeof(container_t *));
+        memcpy(dest->typecodes, source->typecodes,
+               dest->size * sizeof(uint8_t));
+    } else {
+        memcpy(dest->typecodes, source->typecodes,
+               dest->size * sizeof(uint8_t));
+        for (int32_t i = 0; i < dest->size; i++) {
+            dest->containers[i] =
+                container_clone(source->containers[i], source->typecodes[i]);
+            if (dest->containers[i] == NULL) {
+                for (int32_t j = 0; j < i; j++) {
+                    container_free(dest->containers[j], dest->typecodes[j]);
+                }
+                ra_clear_without_containers(dest);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+void ra_clear_containers(roaring_array_t *ra) {
+    for (int32_t i = 0; i < ra->size; ++i) {
+        container_free(ra->containers[i], ra->typecodes[i]);
+    }
+}
+
+void ra_reset(roaring_array_t *ra) {
+    ra_clear_containers(ra);
+    ra->size = 0;
+    ra_shrink_to_fit(ra);
+}
+
+void ra_clear_without_containers(roaring_array_t *ra) {
+    roaring_free(
+        ra->containers);  // keys and typecodes are allocated with containers
+    ra->size = 0;
+    ra->allocation_size = 0;
+    ra->containers = NULL;
+    ra->keys = NULL;
+    ra->typecodes = NULL;
+}
+
+void ra_clear(roaring_array_t *ra) {
+    ra_clear_containers(ra);
+    ra_clear_without_containers(ra);
+}
+
+bool extend_array(roaring_array_t *ra, int32_t k) {
+    int32_t desired_size = ra->size + k;
+    const int32_t max_containers = 65536;
+    assert(desired_size <= max_containers);
+    if (desired_size > ra->allocation_size) {
+        int32_t new_capacity =
+            (ra->size < 1024) ? 2 * desired_size : 5 * desired_size / 4;
+        if (new_capacity > max_containers) {
+            new_capacity = max_containers;
+        }
+
+        return realloc_array(ra, new_capacity);
+    }
+    return true;
+}
+
+void ra_append(roaring_array_t *ra, uint16_t key, container_t *c,
+               uint8_t typecode) {
+    extend_array(ra, 1);
+    const int32_t pos = ra->size;
+
+    ra->keys[pos] = key;
+    ra->containers[pos] = c;
+    ra->typecodes[pos] = typecode;
+    ra->size++;
+}
+
+void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
+                    uint16_t index, bool copy_on_write) {
+    extend_array(ra, 1);
+    const int32_t pos = ra->size;
+
+    // old contents is junk that does not need freeing
+    ra->keys[pos] = sa->keys[index];
+    // the shared container will be in two bitmaps
+    if (copy_on_write) {
+        sa->containers[index] = get_copy_of_container(
+            sa->containers[index], &sa->typecodes[index], copy_on_write);
+        ra->containers[pos] = sa->containers[index];
+        ra->typecodes[pos] = sa->typecodes[index];
+    } else {
+        ra->containers[pos] =
+            container_clone(sa->containers[index], sa->typecodes[index]);
+        ra->typecodes[pos] = sa->typecodes[index];
+    }
+    ra->size++;
+}
+
+void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t stopping_key, bool copy_on_write) {
+    for (int32_t i = 0; i < sa->size; ++i) {
+        if (sa->keys[i] >= stopping_key) break;
+        ra_append_copy(ra, sa, (uint16_t)i, copy_on_write);
+    }
+}
+
+void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index,
+                          bool copy_on_write) {
+    extend_array(ra, end_index - start_index);
+    for (int32_t i = start_index; i < end_index; ++i) {
+        const int32_t pos = ra->size;
+        ra->keys[pos] = sa->keys[i];
+        if (copy_on_write) {
+            sa->containers[i] = get_copy_of_container(
+                sa->containers[i], &sa->typecodes[i], copy_on_write);
+            ra->containers[pos] = sa->containers[i];
+            ra->typecodes[pos] = sa->typecodes[i];
+        } else {
+            ra->containers[pos] =
+                container_clone(sa->containers[i], sa->typecodes[i]);
+            ra->typecodes[pos] = sa->typecodes[i];
+        }
+        ra->size++;
+    }
+}
+
+void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t before_start, bool copy_on_write) {
+    int start_location = ra_get_index(sa, before_start);
+    if (start_location >= 0)
+        ++start_location;
+    else
+        start_location = -start_location - 1;
+    ra_append_copy_range(ra, sa, start_location, sa->size, copy_on_write);
+}
+
+void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index) {
+    extend_array(ra, end_index - start_index);
+
+    for (int32_t i = start_index; i < end_index; ++i) {
+        const int32_t pos = ra->size;
+
+        ra->keys[pos] = sa->keys[i];
+        ra->containers[pos] = sa->containers[i];
+        ra->typecodes[pos] = sa->typecodes[i];
+        ra->size++;
+    }
+}
+
+void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
+                     int32_t start_index, int32_t end_index,
+                     bool copy_on_write) {
+    extend_array(ra, end_index - start_index);
+
+    for (int32_t i = start_index; i < end_index; ++i) {
+        const int32_t pos = ra->size;
+        ra->keys[pos] = sa->keys[i];
+        if (copy_on_write) {
+            sa->containers[i] = get_copy_of_container(
+                sa->containers[i], &sa->typecodes[i], copy_on_write);
+            ra->containers[pos] = sa->containers[i];
+            ra->typecodes[pos] = sa->typecodes[i];
+        } else {
+            ra->containers[pos] =
+                container_clone(sa->containers[i], sa->typecodes[i]);
+            ra->typecodes[pos] = sa->typecodes[i];
+        }
+        ra->size++;
+    }
+}
+
+container_t *ra_get_container(roaring_array_t *ra, uint16_t x,
+                              uint8_t *typecode) {
+    int i = binarySearch(ra->keys, (int32_t)ra->size, x);
+    if (i < 0) return NULL;
+    *typecode = ra->typecodes[i];
+    return ra->containers[i];
+}
+
+extern inline container_t *ra_get_container_at_index(const roaring_array_t *ra,
+                                                     uint16_t i,
+                                                     uint8_t *typecode);
+
+extern inline uint16_t ra_get_key_at_index(const roaring_array_t *ra,
+                                           uint16_t i);
+
+extern inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x);
+
+extern inline int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x,
+                                       int32_t pos);
+
+// everything skipped over is freed
+int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos) {
+    while (pos < ra->size && ra->keys[pos] < x) {
+        container_free(ra->containers[pos], ra->typecodes[pos]);
+        ++pos;
+    }
+    return pos;
+}
+
+void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
+                                container_t *c, uint8_t typecode) {
+    extend_array(ra, 1);
+    // May be an optimization opportunity with DIY memmove
+    memmove(&(ra->keys[i + 1]), &(ra->keys[i]),
+            sizeof(uint16_t) * (ra->size - i));
+    memmove(&(ra->containers[i + 1]), &(ra->containers[i]),
+            sizeof(container_t *) * (ra->size - i));
+    memmove(&(ra->typecodes[i + 1]), &(ra->typecodes[i]),
+            sizeof(uint8_t) * (ra->size - i));
+    ra->keys[i] = key;
+    ra->containers[i] = c;
+    ra->typecodes[i] = typecode;
+    ra->size++;
+}
+
+// note: Java routine set things to 0, enabling GC.
+// Java called it "resize" but it was always used to downsize.
+// Allowing upsize would break the conventions about
+// valid containers below ra->size.
+
+void ra_downsize(roaring_array_t *ra, int32_t new_length) {
+    assert(new_length <= ra->size);
+    ra->size = new_length;
+}
+
+void ra_remove_at_index(roaring_array_t *ra, int32_t i) {
+    memmove(&(ra->containers[i]), &(ra->containers[i + 1]),
+            sizeof(container_t *) * (ra->size - i - 1));
+    memmove(&(ra->keys[i]), &(ra->keys[i + 1]),
+            sizeof(uint16_t) * (ra->size - i - 1));
+    memmove(&(ra->typecodes[i]), &(ra->typecodes[i + 1]),
+            sizeof(uint8_t) * (ra->size - i - 1));
+    ra->size--;
+}
+
+void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i) {
+    container_free(ra->containers[i], ra->typecodes[i]);
+    ra_remove_at_index(ra, i);
+}
+
+// used in inplace andNot only, to slide left the containers from
+// the mutated RoaringBitmap that are after the largest container of
+// the argument RoaringBitmap.  In use it should be followed by a call to
+// downsize.
+//
+void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
+                   uint32_t new_begin) {
+    assert(begin <= end);
+    assert(new_begin < begin);
+
+    const int range = end - begin;
+
+    // We ensure to previously have freed overwritten containers
+    // that are not copied elsewhere
+
+    memmove(&(ra->containers[new_begin]), &(ra->containers[begin]),
+            sizeof(container_t *) * range);
+    memmove(&(ra->keys[new_begin]), &(ra->keys[begin]),
+            sizeof(uint16_t) * range);
+    memmove(&(ra->typecodes[new_begin]), &(ra->typecodes[begin]),
+            sizeof(uint8_t) * range);
+}
+
+void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance) {
+    if (distance > 0) {
+        extend_array(ra, distance);
+    }
+    int32_t srcpos = ra->size - count;
+    int32_t dstpos = srcpos + distance;
+    memmove(&(ra->keys[dstpos]), &(ra->keys[srcpos]), sizeof(uint16_t) * count);
+    memmove(&(ra->containers[dstpos]), &(ra->containers[srcpos]),
+            sizeof(container_t *) * count);
+    memmove(&(ra->typecodes[dstpos]), &(ra->typecodes[srcpos]),
+            sizeof(uint8_t) * count);
+    ra->size += distance;
+}
+
+void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans) {
+    size_t ctr = 0;
+    for (int32_t i = 0; i < ra->size; ++i) {
+        int num_added = container_to_uint32_array(
+            ans + ctr, ra->containers[i], ra->typecodes[i],
+            ((uint32_t)ra->keys[i]) << 16);
+        ctr += num_added;
+    }
+}
+
+bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset,
+                           size_t limit, uint32_t *ans) {
+    size_t ctr = 0;
+    size_t dtr = 0;
+
+    size_t t_limit = 0;
+
+    bool first = false;
+    size_t first_skip = 0;
+
+    uint32_t *t_ans = NULL;
+    size_t cur_len = 0;
+
+    for (int i = 0; i < ra->size; ++i) {
+        const container_t *c =
+            container_unwrap_shared(ra->containers[i], &ra->typecodes[i]);
+        switch (ra->typecodes[i]) {
+            case BITSET_CONTAINER_TYPE:
+                t_limit = (const_CAST_bitset(c))->cardinality;
+                break;
+            case ARRAY_CONTAINER_TYPE:
+                t_limit = (const_CAST_array(c))->cardinality;
+                break;
+            case RUN_CONTAINER_TYPE:
+                t_limit = run_container_cardinality(const_CAST_run(c));
+                break;
+        }
+        if (ctr + t_limit - 1 >= offset && ctr < offset + limit) {
+            if (!first) {
+                // first_skip = t_limit - (ctr + t_limit - offset);
+                first_skip = offset - ctr;
+                first = true;
+                t_ans = (uint32_t *)roaring_malloc(sizeof(*t_ans) *
+                                                   (first_skip + limit));
+                if (t_ans == NULL) {
+                    return false;
+                }
+                memset(t_ans, 0, sizeof(*t_ans) * (first_skip + limit));
+                cur_len = first_skip + limit;
+            }
+            if (dtr + t_limit > cur_len) {
+                uint32_t *append_ans = (uint32_t *)roaring_malloc(
+                    sizeof(*append_ans) * (cur_len + t_limit));
+                if (append_ans == NULL) {
+                    if (t_ans != NULL) roaring_free(t_ans);
+                    return false;
+                }
+                memset(append_ans, 0,
+                       sizeof(*append_ans) * (cur_len + t_limit));
+                cur_len = cur_len + t_limit;
+                memcpy(append_ans, t_ans, dtr * sizeof(uint32_t));
+                roaring_free(t_ans);
+                t_ans = append_ans;
+            }
+            switch (ra->typecodes[i]) {
+                case BITSET_CONTAINER_TYPE:
+                    container_to_uint32_array(t_ans + dtr, const_CAST_bitset(c),
+                                              ra->typecodes[i],
+                                              ((uint32_t)ra->keys[i]) << 16);
+                    break;
+                case ARRAY_CONTAINER_TYPE:
+                    container_to_uint32_array(t_ans + dtr, const_CAST_array(c),
+                                              ra->typecodes[i],
+                                              ((uint32_t)ra->keys[i]) << 16);
+                    break;
+                case RUN_CONTAINER_TYPE:
+                    container_to_uint32_array(t_ans + dtr, const_CAST_run(c),
+                                              ra->typecodes[i],
+                                              ((uint32_t)ra->keys[i]) << 16);
+                    break;
+            }
+            dtr += t_limit;
+        }
+        ctr += t_limit;
+        if (dtr - first_skip >= limit) break;
+    }
+    if (t_ans != NULL) {
+        memcpy(ans, t_ans + first_skip, limit * sizeof(uint32_t));
+        free(t_ans);
+    }
+    return true;
+}
+
+bool ra_has_run_container(const roaring_array_t *ra) {
+    for (int32_t k = 0; k < ra->size; ++k) {
+        if (get_container_type(ra->containers[k], ra->typecodes[k]) ==
+            RUN_CONTAINER_TYPE)
+            return true;
+    }
+    return false;
+}
+
+uint32_t ra_portable_header_size(const roaring_array_t *ra) {
+    if (ra_has_run_container(ra)) {
+        if (ra->size <
+            NO_OFFSET_THRESHOLD) {  // for small bitmaps, we omit the offsets
+            return 4 + (ra->size + 7) / 8 + 4 * ra->size;
+        }
+        return 4 + (ra->size + 7) / 8 +
+               8 * ra->size;  // - 4 because we pack the size with the cookie
+    } else {
+        return 4 + 4 + 8 * ra->size;
+    }
+}
+
+size_t ra_portable_size_in_bytes(const roaring_array_t *ra) {
+    size_t count = ra_portable_header_size(ra);
+
+    for (int32_t k = 0; k < ra->size; ++k) {
+        count += container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
+    }
+    return count;
+}
+
+// This function is endian-sensitive.
+size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) {
+    char *initbuf = buf;
+    uint32_t startOffset = 0;
+    bool hasrun = ra_has_run_container(ra);
+    if (hasrun) {
+        uint32_t cookie = SERIAL_COOKIE | ((uint32_t)(ra->size - 1) << 16);
+        memcpy(buf, &cookie, sizeof(cookie));
+        buf += sizeof(cookie);
+        uint32_t s = (ra->size + 7) / 8;
+        uint8_t *bitmapOfRunContainers = (uint8_t *)roaring_calloc(s, 1);
+        assert(bitmapOfRunContainers != NULL);  // todo: handle
+        for (int32_t i = 0; i < ra->size; ++i) {
+            if (get_container_type(ra->containers[i], ra->typecodes[i]) ==
+                RUN_CONTAINER_TYPE) {
+                bitmapOfRunContainers[i / 8] |= (1 << (i % 8));
+            }
+        }
+        memcpy(buf, bitmapOfRunContainers, s);
+        buf += s;
+        roaring_free(bitmapOfRunContainers);
+        if (ra->size < NO_OFFSET_THRESHOLD) {
+            startOffset = 4 + 4 * ra->size + s;
+        } else {
+            startOffset = 4 + 8 * ra->size + s;
+        }
+    } else {  // backwards compatibility
+        uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER;
+
+        memcpy(buf, &cookie, sizeof(cookie));
+        buf += sizeof(cookie);
+        memcpy(buf, &ra->size, sizeof(ra->size));
+        buf += sizeof(ra->size);
+
+        startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size;
+    }
+    for (int32_t k = 0; k < ra->size; ++k) {
+        memcpy(buf, &ra->keys[k], sizeof(ra->keys[k]));
+        buf += sizeof(ra->keys[k]);
+        // get_cardinality returns a value in [1,1<<16], subtracting one
+        // we get [0,1<<16 - 1] which fits in 16 bits
+        uint16_t card = (uint16_t)(container_get_cardinality(ra->containers[k],
+                                                             ra->typecodes[k]) -
+                                   1);
+        memcpy(buf, &card, sizeof(card));
+        buf += sizeof(card);
+    }
+    if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) {
+        // writing the containers offsets
+        for (int32_t k = 0; k < ra->size; k++) {
+            memcpy(buf, &startOffset, sizeof(startOffset));
+            buf += sizeof(startOffset);
+            startOffset =
+                startOffset +
+                container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
+        }
+    }
+    for (int32_t k = 0; k < ra->size; ++k) {
+        buf += container_write(ra->containers[k], ra->typecodes[k], buf);
+    }
+    return buf - initbuf;
+}
+
+// Quickly checks whether there is a serialized bitmap at the pointer,
+// not exceeding size "maxbytes" in bytes. This function does not allocate
+// memory dynamically.
+//
+// This function returns 0 if and only if no valid bitmap is found.
+// Otherwise, it returns how many bytes are occupied.
+//
+size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
+    size_t bytestotal = sizeof(int32_t);  // for cookie
+    if (bytestotal > maxbytes) return 0;
+    uint32_t cookie;
+    memcpy(&cookie, buf, sizeof(int32_t));
+    buf += sizeof(uint32_t);
+    if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
+        cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
+        return 0;
+    }
+    int32_t size;
+
+    if ((cookie & 0xFFFF) == SERIAL_COOKIE)
+        size = (cookie >> 16) + 1;
+    else {
+        bytestotal += sizeof(int32_t);
+        if (bytestotal > maxbytes) return 0;
+        memcpy(&size, buf, sizeof(int32_t));
+        buf += sizeof(uint32_t);
+    }
+    if (size > (1 << 16)) {
+        return 0;
+    }
+    char *bitmapOfRunContainers = NULL;
+    bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
+    if (hasrun) {
+        int32_t s = (size + 7) / 8;
+        bytestotal += s;
+        if (bytestotal > maxbytes) return 0;
+        bitmapOfRunContainers = (char *)buf;
+        buf += s;
+    }
+    bytestotal += size * 2 * sizeof(uint16_t);
+    if (bytestotal > maxbytes) return 0;
+    uint16_t *keyscards = (uint16_t *)buf;
+    buf += size * 2 * sizeof(uint16_t);
+    if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
+        // skipping the offsets
+        bytestotal += size * 4;
+        if (bytestotal > maxbytes) return 0;
+        buf += size * 4;
+    }
+    // Reading the containers
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        memcpy(&tmp, keyscards + 2 * k + 1, sizeof(tmp));
+        uint32_t thiscard = tmp + 1;
+        bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if (hasrun) {
+            if ((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
+                isbitmap = false;
+                isrun = true;
+            }
+        }
+        if (isbitmap) {
+            size_t containersize =
+                BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            bytestotal += containersize;
+            if (bytestotal > maxbytes) return 0;
+            buf += containersize;
+        } else if (isrun) {
+            bytestotal += sizeof(uint16_t);
+            if (bytestotal > maxbytes) return 0;
+            uint16_t n_runs;
+            memcpy(&n_runs, buf, sizeof(uint16_t));
+            buf += sizeof(uint16_t);
+            size_t containersize = n_runs * sizeof(rle16_t);
+            bytestotal += containersize;
+            if (bytestotal > maxbytes) return 0;
+            buf += containersize;
+        } else {
+            size_t containersize = thiscard * sizeof(uint16_t);
+            bytestotal += containersize;
+            if (bytestotal > maxbytes) return 0;
+            buf += containersize;
+        }
+    }
+    return bytestotal;
+}
+
+// This function populates answer from the content of buf (reading up to
+// maxbytes bytes). The function returns false if a properly serialized bitmap
+// cannot be found. If it returns true, readbytes is populated by how many bytes
+// were read, we have that *readbytes <= maxbytes.
+//
+// This function is endian-sensitive.
+bool ra_portable_deserialize(roaring_array_t *answer, const char *buf,
+                             const size_t maxbytes, size_t *readbytes) {
+    *readbytes = sizeof(int32_t);  // for cookie
+    if (*readbytes > maxbytes) {
+        // Ran out of bytes while reading first 4 bytes.
+        return false;
+    }
+    uint32_t cookie;
+    memcpy(&cookie, buf, sizeof(int32_t));
+    buf += sizeof(uint32_t);
+    if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
+        cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
+        // "I failed to find one of the right cookies.
+        return false;
+    }
+    int32_t size;
+
+    if ((cookie & 0xFFFF) == SERIAL_COOKIE)
+        size = (cookie >> 16) + 1;
+    else {
+        *readbytes += sizeof(int32_t);
+        if (*readbytes > maxbytes) {
+            // Ran out of bytes while reading second part of the cookie.
+            return false;
+        }
+        memcpy(&size, buf, sizeof(int32_t));
+        buf += sizeof(uint32_t);
+    }
+    if (size < 0) {
+        // You cannot have a negative number of containers, the data must be
+        // corrupted.
+        return false;
+    }
+    if (size > (1 << 16)) {
+        // You cannot have so many containers, the data must be corrupted.
+        return false;
+    }
+    const char *bitmapOfRunContainers = NULL;
+    bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
+    if (hasrun) {
+        int32_t s = (size + 7) / 8;
+        *readbytes += s;
+        if (*readbytes > maxbytes) {  // data is corrupted?
+            // Ran out of bytes while reading run bitmap.
+            return false;
+        }
+        bitmapOfRunContainers = buf;
+        buf += s;
+    }
+    uint16_t *keyscards = (uint16_t *)buf;
+
+    *readbytes += size * 2 * sizeof(uint16_t);
+    if (*readbytes > maxbytes) {
+        // Ran out of bytes while reading key-cardinality array.
+        return false;
+    }
+    buf += size * 2 * sizeof(uint16_t);
+
+    bool is_ok = ra_init_with_capacity(answer, size);
+    if (!is_ok) {
+        // Failed to allocate memory for roaring array. Bailing out.
+        return false;
+    }
+
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        memcpy(&tmp, keyscards + 2 * k, sizeof(tmp));
+        answer->keys[k] = tmp;
+    }
+    if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
+        *readbytes += size * 4;
+        if (*readbytes > maxbytes) {  // data is corrupted?
+            // Ran out of bytes while reading offsets.
+            ra_clear(answer);  // we need to clear the containers already
+                               // allocated, and the roaring array
+            return false;
+        }
+
+        // skipping the offsets
+        buf += size * 4;
+    }
+    // Reading the containers
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        memcpy(&tmp, keyscards + 2 * k + 1, sizeof(tmp));
+        uint32_t thiscard = tmp + 1;
+        bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if (hasrun) {
+            if ((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
+                isbitmap = false;
+                isrun = true;
+            }
+        }
+        if (isbitmap) {
+            // we check that the read is allowed
+            size_t containersize =
+                BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            *readbytes += containersize;
+            if (*readbytes > maxbytes) {
+                // Running out of bytes while reading a bitset container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            // it is now safe to read
+            bitset_container_t *c = bitset_container_create();
+            if (c == NULL) {  // memory allocation failure
+                // Failed to allocate memory for a bitset container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            answer->size++;
+            buf += bitset_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = BITSET_CONTAINER_TYPE;
+        } else if (isrun) {
+            // we check that the read is allowed
+            *readbytes += sizeof(uint16_t);
+            if (*readbytes > maxbytes) {
+                // Running out of bytes while reading a run container (header).
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            uint16_t n_runs;
+            memcpy(&n_runs, buf, sizeof(uint16_t));
+            size_t containersize = n_runs * sizeof(rle16_t);
+            *readbytes += containersize;
+            if (*readbytes > maxbytes) {  // data is corrupted?
+                // Running out of bytes while reading a run container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            // it is now safe to read
+
+            run_container_t *c = run_container_create();
+            if (c == NULL) {  // memory allocation failure
+                // Failed to allocate memory for a run container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            answer->size++;
+            buf += run_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = RUN_CONTAINER_TYPE;
+        } else {
+            // we check that the read is allowed
+            size_t containersize = thiscard * sizeof(uint16_t);
+            *readbytes += containersize;
+            if (*readbytes > maxbytes) {  // data is corrupted?
+                // Running out of bytes while reading an array container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            // it is now safe to read
+            array_container_t *c =
+                array_container_create_given_capacity(thiscard);
+            if (c == NULL) {  // memory allocation failure
+                // Failed to allocate memory for an array container.
+                ra_clear(answer);  // we need to clear the containers already
+                                   // allocated, and the roaring array
+                return false;
+            }
+            answer->size++;
+            buf += array_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = ARRAY_CONTAINER_TYPE;
+        }
+    }
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace internal {
+#endif
+/* end file src/roaring_array.c */
+/* begin file src/roaring.c */
+#include <assert.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+// Include after roaring.h
+
+#ifdef __cplusplus
+using namespace ::roaring::internal;
+
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+#define CROARING_SERIALIZATION_ARRAY_UINT32 1
+#define CROARING_SERIALIZATION_CONTAINER 2
+extern inline int roaring_trailing_zeroes(unsigned long long input_num);
+extern inline int roaring_leading_zeroes(unsigned long long input_num);
+extern inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r);
+extern inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t *r);
+extern inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t *r,
+                                                    bool cow);
+extern inline roaring_bitmap_t *roaring_bitmap_create(void);
+extern inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min,
+                                            uint64_t max);
+extern inline void roaring_bitmap_remove_range(roaring_bitmap_t *r,
+                                               uint64_t min, uint64_t max);
+
+static inline bool is_cow(const roaring_bitmap_t *r) {
+    return r->high_low_container.flags & ROARING_FLAG_COW;
+}
+static inline bool is_frozen(const roaring_bitmap_t *r) {
+    return r->high_low_container.flags & ROARING_FLAG_FROZEN;
+}
+
+// this is like roaring_bitmap_add, but it populates pointer arguments in such a
+// way
+// that we can recover the container touched, which, in turn can be used to
+// accelerate some functions (when you repeatedly need to add to the same
+// container)
+static inline container_t *containerptr_roaring_bitmap_add(roaring_bitmap_t *r,
+                                                           uint32_t val,
+                                                           uint8_t *type,
+                                                           int *index) {
+    roaring_array_t *ra = &r->high_low_container;
+
+    uint16_t hb = val >> 16;
+    const int i = ra_get_index(ra, hb);
+    if (i >= 0) {
+        ra_unshare_container_at_index(ra, (uint16_t)i);
+        container_t *c = ra_get_container_at_index(ra, (uint16_t)i, type);
+        uint8_t new_type = *type;
+        container_t *c2 = container_add(c, val & 0xFFFF, *type, &new_type);
+        *index = i;
+        if (c2 != c) {
+            container_free(c, *type);
+            ra_set_container_at_index(ra, i, c2, new_type);
+            *type = new_type;
+            return c2;
+        } else {
+            return c;
+        }
+    } else {
+        array_container_t *new_ac = array_container_create();
+        container_t *c =
+            container_add(new_ac, val & 0xFFFF, ARRAY_CONTAINER_TYPE, type);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(ra, -i - 1, hb, c, *type);
+        *index = -i - 1;
+        return c;
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap) {
+    roaring_bitmap_t *ans =
+        (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t));
+    if (!ans) {
+        return NULL;
+    }
+    bool is_ok = ra_init_with_capacity(&ans->high_low_container, cap);
+    if (!is_ok) {
+        roaring_free(ans);
+        return NULL;
+    }
+    return ans;
+}
+
+bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap) {
+    return ra_init_with_capacity(&r->high_low_container, cap);
+}
+
+static inline void add_bulk_impl(roaring_bitmap_t *r,
+                                 roaring_bulk_context_t *context,
+                                 uint32_t val) {
+    uint16_t key = val >> 16;
+    if (context->container == NULL || context->key != key) {
+        uint8_t typecode;
+        int idx;
+        context->container =
+            containerptr_roaring_bitmap_add(r, val, &typecode, &idx);
+        context->typecode = typecode;
+        context->idx = idx;
+        context->key = key;
+    } else {
+        // no need to seek the container, it is at hand
+        // because we already have the container at hand, we can do the
+        // insertion directly, bypassing the roaring_bitmap_add call
+        uint8_t new_typecode;
+        container_t *container2 = container_add(
+            context->container, val & 0xFFFF, context->typecode, &new_typecode);
+        if (container2 != context->container) {
+            // rare instance when we need to change the container type
+            container_free(context->container, context->typecode);
+            ra_set_container_at_index(&r->high_low_container, context->idx,
+                                      container2, new_typecode);
+            context->typecode = new_typecode;
+            context->container = container2;
+        }
+    }
+}
+
+void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
+                             const uint32_t *vals) {
+    uint32_t val;
+    const uint32_t *start = vals;
+    const uint32_t *end = vals + n_args;
+    const uint32_t *current_val = start;
+
+    if (n_args == 0) {
+        return;
+    }
+
+    uint8_t typecode;
+    int idx;
+    container_t *container;
+    val = *current_val;
+    container = containerptr_roaring_bitmap_add(r, val, &typecode, &idx);
+    roaring_bulk_context_t context = {container, idx, (uint16_t)(val >> 16),
+                                      typecode};
+
+    for (; current_val != end; current_val++) {
+        memcpy(&val, current_val, sizeof(val));
+        add_bulk_impl(r, &context, val);
+    }
+}
+
+void roaring_bitmap_add_bulk(roaring_bitmap_t *r,
+                             roaring_bulk_context_t *context, uint32_t val) {
+    add_bulk_impl(r, context, val);
+}
+
+bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r,
+                                  roaring_bulk_context_t *context,
+                                  uint32_t val) {
+    uint16_t key = val >> 16;
+    if (context->container == NULL || context->key != key) {
+        int32_t start_idx = -1;
+        if (context->container != NULL && context->key < key) {
+            start_idx = context->idx;
+        }
+        int idx = ra_advance_until(&r->high_low_container, key, start_idx);
+        if (idx == ra_get_size(&r->high_low_container)) {
+            return false;
+        }
+        uint8_t typecode;
+        context->container = ra_get_container_at_index(
+            &r->high_low_container, (uint16_t)idx, &typecode);
+        context->typecode = typecode;
+        context->idx = idx;
+        context->key =
+            ra_get_key_at_index(&r->high_low_container, (uint16_t)idx);
+        // ra_advance_until finds the next key >= the target, we found a later
+        // container.
+        if (context->key != key) {
+            return false;
+        }
+    }
+    // context is now set up
+    return container_contains(context->container, val & 0xFFFF,
+                              context->typecode);
+}
+
+roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) {
+    roaring_bitmap_t *answer = roaring_bitmap_create();
+    roaring_bitmap_add_many(answer, n_args, vals);
+    return answer;
+}
+
+roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) {
+    // todo: could be greatly optimized but we do not expect this call to ever
+    // include long lists
+    roaring_bitmap_t *answer = roaring_bitmap_create();
+    roaring_bulk_context_t context = {0};
+    va_list ap;
+    va_start(ap, n_args);
+    for (size_t i = 0; i < n_args; i++) {
+        uint32_t val = va_arg(ap, uint32_t);
+        roaring_bitmap_add_bulk(answer, &context, val);
+    }
+    va_end(ap);
+    return answer;
+}
+
+static inline uint64_t minimum_uint64(uint64_t a, uint64_t b) {
+    return (a < b) ? a : b;
+}
+
+roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
+                                            uint32_t step) {
+    if (max >= UINT64_C(0x100000000)) {
+        max = UINT64_C(0x100000000);
+    }
+    if (step == 0) return NULL;
+    if (max <= min) return NULL;
+    roaring_bitmap_t *answer = roaring_bitmap_create();
+    if (step >= (1 << 16)) {
+        for (uint32_t value = (uint32_t)min; value < max; value += step) {
+            roaring_bitmap_add(answer, value);
+        }
+        return answer;
+    }
+    uint64_t min_tmp = min;
+    do {
+        uint32_t key = (uint32_t)min_tmp >> 16;
+        uint32_t container_min = min_tmp & 0xFFFF;
+        uint32_t container_max =
+            (uint32_t)minimum_uint64(max - (key << 16), 1 << 16);
+        uint8_t type;
+        container_t *container = container_from_range(
+            &type, container_min, container_max, (uint16_t)step);
+        ra_append(&answer->high_low_container, (uint16_t)key, container, type);
+        uint32_t gap = container_max - container_min + step - 1;
+        min_tmp += gap - (gap % step);
+    } while (min_tmp < max);
+    // cardinality of bitmap will be ((uint64_t) max - min + step - 1 ) / step
+    return answer;
+}
+
+void roaring_bitmap_add_range_closed(roaring_bitmap_t *r, uint32_t min,
+                                     uint32_t max) {
+    if (min > max) {
+        return;
+    }
+
+    roaring_array_t *ra = &r->high_low_container;
+
+    uint32_t min_key = min >> 16;
+    uint32_t max_key = max >> 16;
+
+    int32_t num_required_containers = max_key - min_key + 1;
+    int32_t suffix_length =
+        count_greater(ra->keys, ra->size, (uint16_t)max_key);
+    int32_t prefix_length =
+        count_less(ra->keys, ra->size - suffix_length, (uint16_t)min_key);
+    int32_t common_length = ra->size - prefix_length - suffix_length;
+
+    if (num_required_containers > common_length) {
+        ra_shift_tail(ra, suffix_length,
+                      num_required_containers - common_length);
+    }
+
+    int32_t src = prefix_length + common_length - 1;
+    int32_t dst = ra->size - suffix_length - 1;
+    for (uint32_t key = max_key; key != min_key - 1;
+         key--) {  // beware of min_key==0
+        uint32_t container_min = (min_key == key) ? (min & 0xffff) : 0;
+        uint32_t container_max = (max_key == key) ? (max & 0xffff) : 0xffff;
+        container_t *new_container;
+        uint8_t new_type;
+
+        if (src >= 0 && ra->keys[src] == key) {
+            ra_unshare_container_at_index(ra, (uint16_t)src);
+            new_container =
+                container_add_range(ra->containers[src], ra->typecodes[src],
+                                    container_min, container_max, &new_type);
+            if (new_container != ra->containers[src]) {
+                container_free(ra->containers[src], ra->typecodes[src]);
+            }
+            src--;
+        } else {
+            new_container = container_from_range(&new_type, container_min,
+                                                 container_max + 1, 1);
+        }
+        ra_replace_key_and_container_at_index(ra, dst, (uint16_t)key,
+                                              new_container, new_type);
+        dst--;
+    }
+}
+
+void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r, uint32_t min,
+                                        uint32_t max) {
+    if (min > max) {
+        return;
+    }
+
+    roaring_array_t *ra = &r->high_low_container;
+
+    uint32_t min_key = min >> 16;
+    uint32_t max_key = max >> 16;
+
+    int32_t src = count_less(ra->keys, ra->size, (uint16_t)min_key);
+    int32_t dst = src;
+    while (src < ra->size && ra->keys[src] <= max_key) {
+        uint32_t container_min =
+            (min_key == ra->keys[src]) ? (min & 0xffff) : 0;
+        uint32_t container_max =
+            (max_key == ra->keys[src]) ? (max & 0xffff) : 0xffff;
+        ra_unshare_container_at_index(ra, (uint16_t)src);
+        container_t *new_container;
+        uint8_t new_type;
+        new_container =
+            container_remove_range(ra->containers[src], ra->typecodes[src],
+                                   container_min, container_max, &new_type);
+        if (new_container != ra->containers[src]) {
+            container_free(ra->containers[src], ra->typecodes[src]);
+        }
+        if (new_container) {
+            ra_replace_key_and_container_at_index(ra, dst, ra->keys[src],
+                                                  new_container, new_type);
+            dst++;
+        }
+        src++;
+    }
+    if (src > dst) {
+        ra_shift_tail(ra, ra->size - src, dst - src);
+    }
+}
+
+void roaring_bitmap_printf(const roaring_bitmap_t *r) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    printf("{");
+    for (int i = 0; i < ra->size; ++i) {
+        container_printf_as_uint32_array(ra->containers[i], ra->typecodes[i],
+                                         ((uint32_t)ra->keys[i]) << 16);
+
+        if (i + 1 < ra->size) {
+            printf(",");
+        }
+    }
+    printf("}");
+}
+
+void roaring_bitmap_printf_describe(const roaring_bitmap_t *r) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    printf("{");
+    for (int i = 0; i < ra->size; ++i) {
+        printf("%d: %s (%d)", ra->keys[i],
+               get_full_container_name(ra->containers[i], ra->typecodes[i]),
+               container_get_cardinality(ra->containers[i], ra->typecodes[i]));
+        if (ra->typecodes[i] == SHARED_CONTAINER_TYPE) {
+            printf("(shared count = %" PRIu32 " )",
+                   croaring_refcount_get(
+                       &(CAST_shared(ra->containers[i])->counter)));
+        }
+
+        if (i + 1 < ra->size) {
+            printf(", ");
+        }
+    }
+    printf("}");
+}
+
+typedef struct min_max_sum_s {
+    uint32_t min;
+    uint32_t max;
+    uint64_t sum;
+} min_max_sum_t;
+
+static bool min_max_sum_fnc(uint32_t value, void *param) {
+    min_max_sum_t *mms = (min_max_sum_t *)param;
+    if (value > mms->max) mms->max = value;
+    if (value < mms->min) mms->min = value;
+    mms->sum += value;
+    return true;  // we always process all data points
+}
+
+/**
+ *  (For advanced users.)
+ * Collect statistics about the bitmap
+ */
+void roaring_bitmap_statistics(const roaring_bitmap_t *r,
+                               roaring_statistics_t *stat) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    memset(stat, 0, sizeof(*stat));
+    stat->n_containers = ra->size;
+    stat->cardinality = roaring_bitmap_get_cardinality(r);
+    min_max_sum_t mms;
+    mms.min = UINT32_C(0xFFFFFFFF);
+    mms.max = UINT32_C(0);
+    mms.sum = 0;
+    roaring_iterate(r, &min_max_sum_fnc, &mms);
+    stat->min_value = mms.min;
+    stat->max_value = mms.max;
+    stat->sum_value = mms.sum;
+
+    for (int i = 0; i < ra->size; ++i) {
+        uint8_t truetype =
+            get_container_type(ra->containers[i], ra->typecodes[i]);
+        uint32_t card =
+            container_get_cardinality(ra->containers[i], ra->typecodes[i]);
+        uint32_t sbytes =
+            container_size_in_bytes(ra->containers[i], ra->typecodes[i]);
+        switch (truetype) {
+            case BITSET_CONTAINER_TYPE:
+                stat->n_bitset_containers++;
+                stat->n_values_bitset_containers += card;
+                stat->n_bytes_bitset_containers += sbytes;
+                break;
+            case ARRAY_CONTAINER_TYPE:
+                stat->n_array_containers++;
+                stat->n_values_array_containers += card;
+                stat->n_bytes_array_containers += sbytes;
+                break;
+            case RUN_CONTAINER_TYPE:
+                stat->n_run_containers++;
+                stat->n_values_run_containers += card;
+                stat->n_bytes_run_containers += sbytes;
+                break;
+            default:
+                assert(false);
+                roaring_unreachable;
+        }
+    }
+}
+
+/*
+ * Checks that:
+ * - Array containers are sorted and contain no duplicates
+ * - Range containers are sorted and contain no overlapping ranges
+ * - Roaring containers are sorted by key and there are no duplicate keys
+ * - The correct container type is use for each container (e.g. bitmaps aren't
+ * used for small containers)
+ */
+bool roaring_bitmap_internal_validate(const roaring_bitmap_t *r,
+                                      const char **reason) {
+    const char *reason_local;
+    if (reason == NULL) {
+        // Always allow assigning through *reason
+        reason = &reason_local;
+    }
+    *reason = NULL;
+    const roaring_array_t *ra = &r->high_low_container;
+    if (ra->size < 0) {
+        *reason = "negative size";
+        return false;
+    }
+    if (ra->allocation_size < 0) {
+        *reason = "negative allocation size";
+        return false;
+    }
+    if (ra->size > ra->allocation_size) {
+        *reason = "more containers than allocated space";
+        return false;
+    }
+    if (ra->flags & ~(ROARING_FLAG_COW | ROARING_FLAG_FROZEN)) {
+        *reason = "invalid flags";
+        return false;
+    }
+    if (ra->size == 0) {
+        return true;
+    }
+
+    if (ra->keys == NULL) {
+        *reason = "keys is NULL";
+        return false;
+    }
+    if (ra->typecodes == NULL) {
+        *reason = "typecodes is NULL";
+        return false;
+    }
+    if (ra->containers == NULL) {
+        *reason = "containers is NULL";
+        return false;
+    }
+
+    uint32_t prev_key = ra->keys[0];
+    for (int32_t i = 1; i < ra->size; ++i) {
+        if (ra->keys[i] <= prev_key) {
+            *reason = "keys not strictly increasing";
+            return false;
+        }
+        prev_key = ra->keys[i];
+    }
+
+    for (int32_t i = 0; i < ra->size; ++i) {
+        if (!container_internal_validate(ra->containers[i], ra->typecodes[i],
+                                         reason)) {
+            // reason should already be set
+            if (*reason == NULL) {
+                *reason = "container failed to validate but no reason given";
+            }
+            return false;
+        }
+    }
+
+    return true;
+}
+
+roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) {
+    roaring_bitmap_t *ans =
+        (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t));
+    if (!ans) {
+        return NULL;
+    }
+    if (!ra_init_with_capacity(  // allocation of list of containers can fail
+            &ans->high_low_container, r->high_low_container.size)) {
+        roaring_free(ans);
+        return NULL;
+    }
+    if (!ra_overwrite(  // memory allocation of individual containers may fail
+            &r->high_low_container, &ans->high_low_container, is_cow(r))) {
+        roaring_bitmap_free(ans);  // overwrite should leave in freeable state
+        return NULL;
+    }
+    roaring_bitmap_set_copy_on_write(ans, is_cow(r));
+    return ans;
+}
+
+bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
+                              const roaring_bitmap_t *src) {
+    roaring_bitmap_set_copy_on_write(dest, is_cow(src));
+    return ra_overwrite(&src->high_low_container, &dest->high_low_container,
+                        is_cow(src));
+}
+
+void roaring_bitmap_free(const roaring_bitmap_t *r) {
+    if (r == NULL) {
+        return;
+    }
+    if (!is_frozen(r)) {
+        ra_clear((roaring_array_t *)&r->high_low_container);
+    }
+    roaring_free((roaring_bitmap_t *)r);
+}
+
+void roaring_bitmap_clear(roaring_bitmap_t *r) {
+    ra_reset(&r->high_low_container);
+}
+
+void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t val) {
+    roaring_array_t *ra = &r->high_low_container;
+
+    const uint16_t hb = val >> 16;
+    const int i = ra_get_index(ra, hb);
+    uint8_t typecode;
+    if (i >= 0) {
+        ra_unshare_container_at_index(ra, (uint16_t)i);
+        container_t *container =
+            ra_get_container_at_index(ra, (uint16_t)i, &typecode);
+        uint8_t newtypecode = typecode;
+        container_t *container2 =
+            container_add(container, val & 0xFFFF, typecode, &newtypecode);
+        if (container2 != container) {
+            container_free(container, typecode);
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+        }
+    } else {
+        array_container_t *newac = array_container_create();
+        container_t *container =
+            container_add(newac, val & 0xFFFF, ARRAY_CONTAINER_TYPE, &typecode);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb,
+                                   container, typecode);
+    }
+}
+
+bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t val) {
+    const uint16_t hb = val >> 16;
+    const int i = ra_get_index(&r->high_low_container, hb);
+    uint8_t typecode;
+    bool result = false;
+    if (i >= 0) {
+        ra_unshare_container_at_index(&r->high_low_container, (uint16_t)i);
+        container_t *container = ra_get_container_at_index(
+            &r->high_low_container, (uint16_t)i, &typecode);
+
+        const int oldCardinality =
+            container_get_cardinality(container, typecode);
+
+        uint8_t newtypecode = typecode;
+        container_t *container2 =
+            container_add(container, val & 0xFFFF, typecode, &newtypecode);
+        if (container2 != container) {
+            container_free(container, typecode);
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+            result = true;
+        } else {
+            const int newCardinality =
+                container_get_cardinality(container, newtypecode);
+
+            result = oldCardinality != newCardinality;
+        }
+    } else {
+        array_container_t *newac = array_container_create();
+        container_t *container =
+            container_add(newac, val & 0xFFFF, ARRAY_CONTAINER_TYPE, &typecode);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb,
+                                   container, typecode);
+        result = true;
+    }
+
+    return result;
+}
+
+void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t val) {
+    const uint16_t hb = val >> 16;
+    const int i = ra_get_index(&r->high_low_container, hb);
+    uint8_t typecode;
+    if (i >= 0) {
+        ra_unshare_container_at_index(&r->high_low_container, (uint16_t)i);
+        container_t *container = ra_get_container_at_index(
+            &r->high_low_container, (uint16_t)i, &typecode);
+        uint8_t newtypecode = typecode;
+        container_t *container2 =
+            container_remove(container, val & 0xFFFF, typecode, &newtypecode);
+        if (container2 != container) {
+            container_free(container, typecode);
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+        }
+        if (container_get_cardinality(container2, newtypecode) != 0) {
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+        } else {
+            ra_remove_at_index_and_free(&r->high_low_container, i);
+        }
+    }
+}
+
+bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t val) {
+    const uint16_t hb = val >> 16;
+    const int i = ra_get_index(&r->high_low_container, hb);
+    uint8_t typecode;
+    bool result = false;
+    if (i >= 0) {
+        ra_unshare_container_at_index(&r->high_low_container, (uint16_t)i);
+        container_t *container = ra_get_container_at_index(
+            &r->high_low_container, (uint16_t)i, &typecode);
+
+        const int oldCardinality =
+            container_get_cardinality(container, typecode);
+
+        uint8_t newtypecode = typecode;
+        container_t *container2 =
+            container_remove(container, val & 0xFFFF, typecode, &newtypecode);
+        if (container2 != container) {
+            container_free(container, typecode);
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+        }
+
+        const int newCardinality =
+            container_get_cardinality(container2, newtypecode);
+
+        if (newCardinality != 0) {
+            ra_set_container_at_index(&r->high_low_container, i, container2,
+                                      newtypecode);
+        } else {
+            ra_remove_at_index_and_free(&r->high_low_container, i);
+        }
+
+        result = oldCardinality != newCardinality;
+    }
+    return result;
+}
+
+void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
+                                const uint32_t *vals) {
+    if (n_args == 0 || r->high_low_container.size == 0) {
+        return;
+    }
+    int32_t pos =
+        -1;  // position of the container used in the previous iteration
+    for (size_t i = 0; i < n_args; i++) {
+        uint16_t key = (uint16_t)(vals[i] >> 16);
+        if (pos < 0 || key != r->high_low_container.keys[pos]) {
+            pos = ra_get_index(&r->high_low_container, key);
+        }
+        if (pos >= 0) {
+            uint8_t new_typecode;
+            container_t *new_container;
+            new_container = container_remove(
+                r->high_low_container.containers[pos], vals[i] & 0xffff,
+                r->high_low_container.typecodes[pos], &new_typecode);
+            if (new_container != r->high_low_container.containers[pos]) {
+                container_free(r->high_low_container.containers[pos],
+                               r->high_low_container.typecodes[pos]);
+                ra_replace_key_and_container_at_index(&r->high_low_container,
+                                                      pos, key, new_container,
+                                                      new_typecode);
+            }
+            if (!container_nonzero_cardinality(new_container, new_typecode)) {
+                container_free(new_container, new_typecode);
+                ra_remove_at_index(&r->high_low_container, pos);
+                pos = -1;
+            }
+        }
+    }
+}
+
+// there should be some SIMD optimizations possible here
+roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    uint32_t neededcap = length1 > length2 ? length2 : length1;
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+
+    int pos1 = 0, pos2 = 0;
+
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 =
+            ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+        const uint16_t s2 =
+            ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2;
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c = container_and(c1, type1, c2, type2, &result_type);
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);  // otherwise: memory leak!
+            }
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2
+            pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+    return answer;
+}
+
+/**
+ * Compute the union of 'number' bitmaps.
+ */
+roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
+                                         const roaring_bitmap_t **x) {
+    if (number == 0) {
+        return roaring_bitmap_create();
+    }
+    if (number == 1) {
+        return roaring_bitmap_copy(x[0]);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_lazy_or(x[0], x[1], LAZY_OR_BITSET_CONVERSION);
+    for (size_t i = 2; i < number; i++) {
+        roaring_bitmap_lazy_or_inplace(answer, x[i], LAZY_OR_BITSET_CONVERSION);
+    }
+    roaring_bitmap_repair_after_lazy(answer);
+    return answer;
+}
+
+/**
+ * Compute the xor of 'number' bitmaps.
+ */
+roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
+                                          const roaring_bitmap_t **x) {
+    if (number == 0) {
+        return roaring_bitmap_create();
+    }
+    if (number == 1) {
+        return roaring_bitmap_copy(x[0]);
+    }
+    roaring_bitmap_t *answer = roaring_bitmap_lazy_xor(x[0], x[1]);
+    for (size_t i = 2; i < number; i++) {
+        roaring_bitmap_lazy_xor_inplace(answer, x[i]);
+    }
+    roaring_bitmap_repair_after_lazy(answer);
+    return answer;
+}
+
+// inplace and (modifies its first argument).
+void roaring_bitmap_and_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2) {
+    if (x1 == x2) return;
+    int pos1 = 0, pos2 = 0, intersection_size = 0;
+    const int length1 = ra_get_size(&x1->high_low_container);
+    const int length2 = ra_get_size(&x2->high_low_container);
+
+    // any skipped-over or newly emptied containers in x1
+    // have to be freed.
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 =
+            ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+        const uint16_t s2 =
+            ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2, result_type;
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared
+            // container. Rationale: using a shared container safely with in
+            // place computation would require making a copy and then doing the
+            // computation in place which is likely less efficient than avoiding
+            // in place entirely and always generating a new container.
+            container_t *c =
+                (type1 == SHARED_CONTAINER_TYPE)
+                    ? container_and(c1, type1, c2, type2, &result_type)
+                    : container_iand(c1, type1, c2, type2, &result_type);
+
+            if (c != c1) {  // in this instance a new container was created, and
+                            // we need to free the old one
+                container_free(c1, type1);
+            }
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_replace_key_and_container_at_index(&x1->high_low_container,
+                                                      intersection_size, s1, c,
+                                                      result_type);
+                intersection_size++;
+            } else {
+                container_free(c, result_type);
+            }
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {
+            pos1 = ra_advance_until_freeing(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+
+    // if we ended early because x2 ran out, then all remaining in x1 should be
+    // freed
+    while (pos1 < length1) {
+        container_free(x1->high_low_container.containers[pos1],
+                       x1->high_low_container.typecodes[pos1]);
+        ++pos1;
+    }
+
+    // all containers after this have either been copied or freed
+    ra_downsize(&x1->high_low_container, intersection_size);
+}
+
+roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c = container_or(c1, type1, c2, type2, &result_type);
+
+            // since we assume that the initial containers are non-empty, the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c, result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            // c1 = container_clone(c1, type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            // c2 = container_clone(c2, type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+// inplace or (modifies its first argument).
+void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
+                               const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            if (!container_is_full(c1, type1)) {
+                container_t *c2 = ra_get_container_at_index(
+                    &x2->high_low_container, (uint16_t)pos2, &type2);
+                container_t *c =
+                    (type1 == SHARED_CONTAINER_TYPE)
+                        ? container_or(c1, type1, c2, type2, &result_type)
+                        : container_ior(c1, type1, c2, type2, &result_type);
+
+                if (c != c1) {  // in this instance a new container was created,
+                                // and we need to free the old one
+                    container_free(c1, type1);
+                }
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+
+            // container_t *c2_clone = container_clone(c2, type2);
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c = container_xor(c1, type1, c2, type2, &result_type);
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+// inplace xor (modifies its first argument).
+
+void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+
+    // XOR can have new containers inserted from x2, but can also
+    // lose containers when x1 and x2 are nonempty and identical.
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared
+            // container. Rationale: using a shared container safely with in
+            // place computation would require making a copy and then doing the
+            // computation in place which is likely less efficient than avoiding
+            // in place entirely and always generating a new container.
+
+            container_t *c;
+            if (type1 == SHARED_CONTAINER_TYPE) {
+                c = container_xor(c1, type1, c2, type2, &result_type);
+                shared_container_free(CAST_shared(c1));  // so release
+            } else {
+                c = container_ixor(c1, type1, c2, type2, &result_type);
+            }
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+                ++pos1;
+            } else {
+                container_free(c, result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;
+            }
+
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        roaring_bitmap_t *empty_bitmap = roaring_bitmap_create();
+        roaring_bitmap_set_copy_on_write(empty_bitmap,
+                                         is_cow(x1) || is_cow(x2));
+        return empty_bitmap;
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(length1);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = 0;
+    uint16_t s2 = 0;
+    while (true) {
+        s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+        s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c =
+                container_andnot(c1, type1, c2, type2, &result_type);
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+        } else if (s1 < s2) {  // s1 < s2
+            const int next_pos1 =
+                ra_advance_until(&x1->high_low_container, s2, pos1);
+            ra_append_copy_range(&answer->high_low_container,
+                                 &x1->high_low_container, pos1, next_pos1,
+                                 is_cow(x1));
+            // TODO : perhaps some of the copy_on_write should be based on
+            // answer rather than x1 (more stringent?).  Many similar cases
+            pos1 = next_pos1;
+            if (pos1 == length1) break;
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+            if (pos2 == length2) break;
+        }
+    }
+    if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+// inplace andnot (modifies its first argument).
+
+void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1,
+                                   const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    int intersection_size = 0;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_clear(x1);
+        return;
+    }
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared
+            // container. Rationale: using a shared container safely with in
+            // place computation would require making a copy and then doing the
+            // computation in place which is likely less efficient than avoiding
+            // in place entirely and always generating a new container.
+
+            container_t *c;
+            if (type1 == SHARED_CONTAINER_TYPE) {
+                c = container_andnot(c1, type1, c2, type2, &result_type);
+                shared_container_free(CAST_shared(c1));  // release
+            } else {
+                c = container_iandnot(c1, type1, c2, type2, &result_type);
+            }
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_replace_key_and_container_at_index(&x1->high_low_container,
+                                                      intersection_size++, s1,
+                                                      c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            if (pos1 != intersection_size) {
+                container_t *c1 = ra_get_container_at_index(
+                    &x1->high_low_container, (uint16_t)pos1, &type1);
+
+                ra_replace_key_and_container_at_index(
+                    &x1->high_low_container, intersection_size, s1, c1, type1);
+            }
+            intersection_size++;
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+
+    if (pos1 < length1) {
+        // all containers between intersection_size and
+        // pos1 are junk.  However, they have either been moved
+        // (thus still referenced) or involved in an iandnot
+        // that will clean up all containers that could not be reused.
+        // Thus we should not free the junk containers between
+        // intersection_size and pos1.
+        if (pos1 > intersection_size) {
+            // left slide of remaining items
+            ra_copy_range(&x1->high_low_container, pos1, length1,
+                          intersection_size);
+        }
+        // else current placement is fine
+        intersection_size += (length1 - pos1);
+    }
+    ra_downsize(&x1->high_low_container, intersection_size);
+}
+
+uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *r) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    uint64_t card = 0;
+    for (int i = 0; i < ra->size; ++i)
+        card += container_get_cardinality(ra->containers[i], ra->typecodes[i]);
+    return card;
+}
+
+uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *r,
+                                          uint64_t range_start,
+                                          uint64_t range_end) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    if (range_end > UINT32_MAX) {
+        range_end = UINT32_MAX + UINT64_C(1);
+    }
+    if (range_start >= range_end) {
+        return 0;
+    }
+    range_end--;  // make range_end inclusive
+    // now we have: 0 <= range_start <= range_end <= UINT32_MAX
+
+    uint16_t minhb = (uint16_t)(range_start >> 16);
+    uint16_t maxhb = (uint16_t)(range_end >> 16);
+
+    uint64_t card = 0;
+
+    int i = ra_get_index(ra, minhb);
+    if (i >= 0) {
+        if (minhb == maxhb) {
+            card += container_rank(ra->containers[i], ra->typecodes[i],
+                                   range_end & 0xffff);
+        } else {
+            card +=
+                container_get_cardinality(ra->containers[i], ra->typecodes[i]);
+        }
+        if ((range_start & 0xffff) != 0) {
+            card -= container_rank(ra->containers[i], ra->typecodes[i],
+                                   (range_start & 0xffff) - 1);
+        }
+        i++;
+    } else {
+        i = -i - 1;
+    }
+
+    for (; i < ra->size; i++) {
+        uint16_t key = ra->keys[i];
+        if (key < maxhb) {
+            card +=
+                container_get_cardinality(ra->containers[i], ra->typecodes[i]);
+        } else if (key == maxhb) {
+            card += container_rank(ra->containers[i], ra->typecodes[i],
+                                   range_end & 0xffff);
+            break;
+        } else {
+            break;
+        }
+    }
+
+    return card;
+}
+
+bool roaring_bitmap_is_empty(const roaring_bitmap_t *r) {
+    return r->high_low_container.size == 0;
+}
+
+void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *r, uint32_t *ans) {
+    ra_to_uint32_array(&r->high_low_container, ans);
+}
+
+bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *r, size_t offset,
+                                       size_t limit, uint32_t *ans) {
+    return ra_range_uint32_array(&r->high_low_container, offset, limit, ans);
+}
+
+/** convert array and bitmap containers to run containers when it is more
+ * efficient;
+ * also convert from run containers when more space efficient.  Returns
+ * true if the result has at least one run container.
+ */
+bool roaring_bitmap_run_optimize(roaring_bitmap_t *r) {
+    bool answer = false;
+    for (int i = 0; i < r->high_low_container.size; i++) {
+        uint8_t type_original, type_after;
+        ra_unshare_container_at_index(
+            &r->high_low_container,
+            (uint16_t)i);  // TODO: this introduces extra cloning!
+        container_t *c = ra_get_container_at_index(&r->high_low_container,
+                                                   (uint16_t)i, &type_original);
+        container_t *c1 = convert_run_optimize(c, type_original, &type_after);
+        if (type_after == RUN_CONTAINER_TYPE) {
+            answer = true;
+        }
+        ra_set_container_at_index(&r->high_low_container, i, c1, type_after);
+    }
+    return answer;
+}
+
+size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r) {
+    size_t answer = 0;
+    for (int i = 0; i < r->high_low_container.size; i++) {
+        uint8_t type_original;
+        container_t *c = ra_get_container_at_index(&r->high_low_container,
+                                                   (uint16_t)i, &type_original);
+        answer += container_shrink_to_fit(c, type_original);
+    }
+    answer += ra_shrink_to_fit(&r->high_low_container);
+    return answer;
+}
+
+/**
+ *  Remove run-length encoding even when it is more space efficient
+ *  return whether a change was applied
+ */
+bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r) {
+    bool answer = false;
+    for (int i = 0; i < r->high_low_container.size; i++) {
+        uint8_t type_original, type_after;
+        container_t *c = ra_get_container_at_index(&r->high_low_container,
+                                                   (uint16_t)i, &type_original);
+        if (get_container_type(c, type_original) == RUN_CONTAINER_TYPE) {
+            answer = true;
+            if (type_original == SHARED_CONTAINER_TYPE) {
+                run_container_t *truec = CAST_run(CAST_shared(c)->container);
+                int32_t card = run_container_cardinality(truec);
+                container_t *c1 = convert_to_bitset_or_array_container(
+                    truec, card, &type_after);
+                shared_container_free(CAST_shared(c));  // frees run as needed
+                ra_set_container_at_index(&r->high_low_container, i, c1,
+                                          type_after);
+
+            } else {
+                int32_t card = run_container_cardinality(CAST_run(c));
+                container_t *c1 = convert_to_bitset_or_array_container(
+                    CAST_run(c), card, &type_after);
+                run_container_free(CAST_run(c));
+                ra_set_container_at_index(&r->high_low_container, i, c1,
+                                          type_after);
+            }
+        }
+    }
+    return answer;
+}
+
+size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf) {
+    size_t portablesize = roaring_bitmap_portable_size_in_bytes(r);
+    uint64_t cardinality = roaring_bitmap_get_cardinality(r);
+    uint64_t sizeasarray = cardinality * sizeof(uint32_t) + sizeof(uint32_t);
+    if (portablesize < sizeasarray) {
+        buf[0] = CROARING_SERIALIZATION_CONTAINER;
+        return roaring_bitmap_portable_serialize(r, buf + 1) + 1;
+    } else {
+        buf[0] = CROARING_SERIALIZATION_ARRAY_UINT32;
+        memcpy(buf + 1, &cardinality, sizeof(uint32_t));
+        roaring_bitmap_to_uint32_array(
+            r, (uint32_t *)(buf + 1 + sizeof(uint32_t)));
+        return 1 + (size_t)sizeasarray;
+    }
+}
+
+size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r) {
+    size_t portablesize = roaring_bitmap_portable_size_in_bytes(r);
+    uint64_t sizeasarray =
+        roaring_bitmap_get_cardinality(r) * sizeof(uint32_t) + sizeof(uint32_t);
+    return portablesize < sizeasarray ? portablesize + 1
+                                      : (size_t)sizeasarray + 1;
+}
+
+size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r) {
+    return ra_portable_size_in_bytes(&r->high_low_container);
+}
+
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf,
+                                                           size_t maxbytes) {
+    roaring_bitmap_t *ans =
+        (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t));
+    if (ans == NULL) {
+        return NULL;
+    }
+    size_t bytesread;
+    bool is_ok = ra_portable_deserialize(&ans->high_low_container, buf,
+                                         maxbytes, &bytesread);
+    if (!is_ok) {
+        roaring_free(ans);
+        return NULL;
+    }
+    roaring_bitmap_set_copy_on_write(ans, false);
+    if (!is_ok) {
+        roaring_free(ans);
+        return NULL;
+    }
+    return ans;
+}
+
+roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf) {
+    return roaring_bitmap_portable_deserialize_safe(buf, SIZE_MAX);
+}
+
+size_t roaring_bitmap_portable_deserialize_size(const char *buf,
+                                                size_t maxbytes) {
+    return ra_portable_deserialize_size(buf, maxbytes);
+}
+
+size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf) {
+    return ra_portable_serialize(&r->high_low_container, buf);
+}
+
+roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) {
+    const char *bufaschar = (const char *)buf;
+    if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) {
+        /* This looks like a compressed set of uint32_t elements */
+        uint32_t card;
+
+        memcpy(&card, bufaschar + 1, sizeof(uint32_t));
+
+        const uint32_t *elems =
+            (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t));
+
+        roaring_bitmap_t *bitmap = roaring_bitmap_create();
+        if (bitmap == NULL) {
+            return NULL;
+        }
+        roaring_bulk_context_t context = {0};
+        for (uint32_t i = 0; i < card; i++) {
+            // elems may not be aligned, read with memcpy
+            uint32_t elem;
+            memcpy(&elem, elems + i, sizeof(elem));
+            roaring_bitmap_add_bulk(bitmap, &context, elem);
+        }
+        return bitmap;
+
+    } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) {
+        return roaring_bitmap_portable_deserialize(bufaschar + 1);
+    } else
+        return (NULL);
+}
+
+roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf,
+                                                  size_t maxbytes) {
+    if (maxbytes < 1) {
+        return NULL;
+    }
+
+    const char *bufaschar = (const char *)buf;
+    if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) {
+        if (maxbytes < 1 + sizeof(uint32_t)) {
+            return NULL;
+        }
+
+        /* This looks like a compressed set of uint32_t elements */
+        uint32_t card;
+        memcpy(&card, bufaschar + 1, sizeof(uint32_t));
+
+        // Check the buffer is big enough to contain card uint32_t elements
+        if (maxbytes < 1 + sizeof(uint32_t) + card * sizeof(uint32_t)) {
+            return NULL;
+        }
+
+        const uint32_t *elems =
+            (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t));
+
+        roaring_bitmap_t *bitmap = roaring_bitmap_create();
+        if (bitmap == NULL) {
+            return NULL;
+        }
+        roaring_bulk_context_t context = {0};
+        for (uint32_t i = 0; i < card; i++) {
+            // elems may not be aligned, read with memcpy
+            uint32_t elem;
+            memcpy(&elem, elems + i, sizeof(elem));
+            roaring_bitmap_add_bulk(bitmap, &context, elem);
+        }
+        return bitmap;
+
+    } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) {
+        return roaring_bitmap_portable_deserialize_safe(bufaschar + 1,
+                                                        maxbytes - 1);
+    } else
+        return (NULL);
+}
+
+bool roaring_iterate(const roaring_bitmap_t *r, roaring_iterator iterator,
+                     void *ptr) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    for (int i = 0; i < ra->size; ++i)
+        if (!container_iterate(ra->containers[i], ra->typecodes[i],
+                               ((uint32_t)ra->keys[i]) << 16, iterator, ptr)) {
+            return false;
+        }
+    return true;
+}
+
+bool roaring_iterate64(const roaring_bitmap_t *r, roaring_iterator64 iterator,
+                       uint64_t high_bits, void *ptr) {
+    const roaring_array_t *ra = &r->high_low_container;
+
+    for (int i = 0; i < ra->size; ++i)
+        if (!container_iterate64(ra->containers[i], ra->typecodes[i],
+                                 ((uint32_t)ra->keys[i]) << 16, iterator,
+                                 high_bits, ptr)) {
+            return false;
+        }
+    return true;
+}
+
+/****
+ * begin roaring_uint32_iterator_t
+ *****/
+
+/**
+ * Partially initializes the iterator. Leaves it in either state:
+ * 1. Invalid due to `has_value = false`, or
+ * 2. At a container, with the high bits set, `has_value = true`.
+ */
+CROARING_WARN_UNUSED static bool iter_new_container_partial_init(
+    roaring_uint32_iterator_t *newit) {
+    newit->current_value = 0;
+    if (newit->container_index >= newit->parent->high_low_container.size ||
+        newit->container_index < 0) {
+        newit->current_value = UINT32_MAX;
+        return (newit->has_value = false);
+    }
+    newit->has_value = true;
+    // we precompute container, typecode and highbits so that successive
+    // iterators do not have to grab them from odd memory locations
+    // and have to worry about the (easily predicted) container_unwrap_shared
+    // call.
+    newit->container =
+        newit->parent->high_low_container.containers[newit->container_index];
+    newit->typecode =
+        newit->parent->high_low_container.typecodes[newit->container_index];
+    newit->highbits =
+        ((uint32_t)
+             newit->parent->high_low_container.keys[newit->container_index])
+        << 16;
+    newit->container =
+        container_unwrap_shared(newit->container, &(newit->typecode));
+    return true;
+}
+
+/**
+ * Positions the iterator at the first value of the current container that the
+ * iterator points at, if available.
+ */
+CROARING_WARN_UNUSED static bool loadfirstvalue(
+    roaring_uint32_iterator_t *newit) {
+    if (iter_new_container_partial_init(newit)) {
+        uint16_t value = 0;
+        newit->container_it =
+            container_init_iterator(newit->container, newit->typecode, &value);
+        newit->current_value = newit->highbits | value;
+    }
+    return newit->has_value;
+}
+
+/**
+ * Positions the iterator at the last value of the current container that the
+ * iterator points at, if available.
+ */
+CROARING_WARN_UNUSED static bool loadlastvalue(
+    roaring_uint32_iterator_t *newit) {
+    if (iter_new_container_partial_init(newit)) {
+        uint16_t value = 0;
+        newit->container_it = container_init_iterator_last(
+            newit->container, newit->typecode, &value);
+        newit->current_value = newit->highbits | value;
+    }
+    return newit->has_value;
+}
+
+/**
+ * Positions the iterator at the smallest value that is larger than or equal to
+ * `val` within the current container that the iterator points at. Assumes such
+ * a value exists within the current container.
+ */
+CROARING_WARN_UNUSED static bool loadfirstvalue_largeorequal(
+    roaring_uint32_iterator_t *newit, uint32_t val) {
+    bool partial_init = iter_new_container_partial_init(newit);
+    assert(partial_init);
+    if (!partial_init) {
+        return false;
+    }
+    uint16_t value = 0;
+    newit->container_it =
+        container_init_iterator(newit->container, newit->typecode, &value);
+    bool found = container_iterator_lower_bound(
+        newit->container, newit->typecode, &newit->container_it, &value,
+        val & 0xFFFF);
+    assert(found);
+    if (!found) {
+        return false;
+    }
+    newit->current_value = newit->highbits | value;
+    return true;
+}
+
+void roaring_iterator_init(const roaring_bitmap_t *r,
+                           roaring_uint32_iterator_t *newit) {
+    newit->parent = r;
+    newit->container_index = 0;
+    newit->has_value = loadfirstvalue(newit);
+}
+
+void roaring_iterator_init_last(const roaring_bitmap_t *r,
+                                roaring_uint32_iterator_t *newit) {
+    newit->parent = r;
+    newit->container_index = newit->parent->high_low_container.size - 1;
+    newit->has_value = loadlastvalue(newit);
+}
+
+roaring_uint32_iterator_t *roaring_iterator_create(const roaring_bitmap_t *r) {
+    roaring_uint32_iterator_t *newit =
+        (roaring_uint32_iterator_t *)roaring_malloc(
+            sizeof(roaring_uint32_iterator_t));
+    if (newit == NULL) return NULL;
+    roaring_iterator_init(r, newit);
+    return newit;
+}
+
+roaring_uint32_iterator_t *roaring_uint32_iterator_copy(
+    const roaring_uint32_iterator_t *it) {
+    roaring_uint32_iterator_t *newit =
+        (roaring_uint32_iterator_t *)roaring_malloc(
+            sizeof(roaring_uint32_iterator_t));
+    memcpy(newit, it, sizeof(roaring_uint32_iterator_t));
+    return newit;
+}
+
+bool roaring_uint32_iterator_move_equalorlarger(roaring_uint32_iterator_t *it,
+                                                uint32_t val) {
+    uint16_t hb = val >> 16;
+    const int i = ra_get_index(&it->parent->high_low_container, hb);
+    if (i >= 0) {
+        uint32_t lowvalue =
+            container_maximum(it->parent->high_low_container.containers[i],
+                              it->parent->high_low_container.typecodes[i]);
+        uint16_t lb = val & 0xFFFF;
+        if (lowvalue < lb) {
+            // will have to load first value of next container
+            it->container_index = i + 1;
+        } else {
+            // the value is necessarily within the range of the container
+            it->container_index = i;
+            it->has_value = loadfirstvalue_largeorequal(it, val);
+            return it->has_value;
+        }
+    } else {
+        // there is no matching, so we are going for the next container
+        it->container_index = -i - 1;
+    }
+    it->has_value = loadfirstvalue(it);
+    return it->has_value;
+}
+
+bool roaring_uint32_iterator_advance(roaring_uint32_iterator_t *it) {
+    if (it->container_index >= it->parent->high_low_container.size) {
+        return (it->has_value = false);
+    }
+    if (it->container_index < 0) {
+        it->container_index = 0;
+        return (it->has_value = loadfirstvalue(it));
+    }
+    uint16_t low16 = (uint16_t)it->current_value;
+    if (container_iterator_next(it->container, it->typecode, &it->container_it,
+                                &low16)) {
+        it->current_value = it->highbits | low16;
+        return (it->has_value = true);
+    }
+    it->container_index++;
+    return (it->has_value = loadfirstvalue(it));
+}
+
+bool roaring_uint32_iterator_previous(roaring_uint32_iterator_t *it) {
+    if (it->container_index < 0) {
+        return (it->has_value = false);
+    }
+    if (it->container_index >= it->parent->high_low_container.size) {
+        it->container_index = it->parent->high_low_container.size - 1;
+        return (it->has_value = loadlastvalue(it));
+    }
+    uint16_t low16 = (uint16_t)it->current_value;
+    if (container_iterator_prev(it->container, it->typecode, &it->container_it,
+                                &low16)) {
+        it->current_value = it->highbits | low16;
+        return (it->has_value = true);
+    }
+    it->container_index--;
+    return (it->has_value = loadlastvalue(it));
+}
+
+uint32_t roaring_uint32_iterator_read(roaring_uint32_iterator_t *it,
+                                      uint32_t *buf, uint32_t count) {
+    uint32_t ret = 0;
+    while (it->has_value && ret < count) {
+        uint32_t consumed;
+        uint16_t low16 = (uint16_t)it->current_value;
+        bool has_value = container_iterator_read_into_uint32(
+            it->container, it->typecode, &it->container_it, it->highbits, buf,
+            count - ret, &consumed, &low16);
+        ret += consumed;
+        buf += consumed;
+        if (has_value) {
+            it->has_value = true;
+            it->current_value = it->highbits | low16;
+            assert(ret == count);
+            return ret;
+        }
+        it->container_index++;
+        it->has_value = loadfirstvalue(it);
+    }
+    return ret;
+}
+
+void roaring_uint32_iterator_free(roaring_uint32_iterator_t *it) {
+    roaring_free(it);
+}
+
+/****
+ * end of roaring_uint32_iterator_t
+ *****/
+
+bool roaring_bitmap_equals(const roaring_bitmap_t *r1,
+                           const roaring_bitmap_t *r2) {
+    const roaring_array_t *ra1 = &r1->high_low_container;
+    const roaring_array_t *ra2 = &r2->high_low_container;
+
+    if (ra1->size != ra2->size) {
+        return false;
+    }
+    for (int i = 0; i < ra1->size; ++i) {
+        if (ra1->keys[i] != ra2->keys[i]) {
+            return false;
+        }
+    }
+    for (int i = 0; i < ra1->size; ++i) {
+        bool areequal = container_equals(ra1->containers[i], ra1->typecodes[i],
+                                         ra2->containers[i], ra2->typecodes[i]);
+        if (!areequal) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool roaring_bitmap_is_subset(const roaring_bitmap_t *r1,
+                              const roaring_bitmap_t *r2) {
+    const roaring_array_t *ra1 = &r1->high_low_container;
+    const roaring_array_t *ra2 = &r2->high_low_container;
+
+    const int length1 = ra1->size, length2 = ra2->size;
+
+    int pos1 = 0, pos2 = 0;
+
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 = ra_get_key_at_index(ra1, (uint16_t)pos1);
+        const uint16_t s2 = ra_get_key_at_index(ra2, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2;
+            container_t *c1 =
+                ra_get_container_at_index(ra1, (uint16_t)pos1, &type1);
+            container_t *c2 =
+                ra_get_container_at_index(ra2, (uint16_t)pos2, &type2);
+            if (!container_is_subset(c1, type1, c2, type2)) return false;
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2
+            return false;
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(ra2, s1, pos2);
+        }
+    }
+    if (pos1 == length1)
+        return true;
+    else
+        return false;
+}
+
+static void insert_flipped_container(roaring_array_t *ans_arr,
+                                     const roaring_array_t *x1_arr, uint16_t hb,
+                                     uint16_t lb_start, uint16_t lb_end) {
+    const int i = ra_get_index(x1_arr, hb);
+    const int j = ra_get_index(ans_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, (uint16_t)i, &ctype_in);
+        flipped_container =
+            container_not_range(container_to_flip, ctype_in, (uint32_t)lb_start,
+                                (uint32_t)(lb_end + 1), &ctype_out);
+
+        if (container_get_cardinality(flipped_container, ctype_out))
+            ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                       ctype_out);
+        else {
+            container_free(flipped_container, ctype_out);
+        }
+    } else {
+        flipped_container = container_range_of_ones(
+            (uint32_t)lb_start, (uint32_t)(lb_end + 1), &ctype_out);
+        ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+static void inplace_flip_container(roaring_array_t *x1_arr, uint16_t hb,
+                                   uint16_t lb_start, uint16_t lb_end) {
+    const int i = ra_get_index(x1_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, (uint16_t)i, &ctype_in);
+        flipped_container = container_inot_range(
+            container_to_flip, ctype_in, (uint32_t)lb_start,
+            (uint32_t)(lb_end + 1), &ctype_out);
+        // if a new container was created, the old one was already freed
+        if (container_get_cardinality(flipped_container, ctype_out)) {
+            ra_set_container_at_index(x1_arr, i, flipped_container, ctype_out);
+        } else {
+            container_free(flipped_container, ctype_out);
+            ra_remove_at_index(x1_arr, i);
+        }
+
+    } else {
+        flipped_container = container_range_of_ones(
+            (uint32_t)lb_start, (uint32_t)(lb_end + 1), &ctype_out);
+        ra_insert_new_key_value_at(x1_arr, -i - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+static void insert_fully_flipped_container(roaring_array_t *ans_arr,
+                                           const roaring_array_t *x1_arr,
+                                           uint16_t hb) {
+    const int i = ra_get_index(x1_arr, hb);
+    const int j = ra_get_index(ans_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, (uint16_t)i, &ctype_in);
+        flipped_container =
+            container_not(container_to_flip, ctype_in, &ctype_out);
+        if (container_get_cardinality(flipped_container, ctype_out))
+            ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                       ctype_out);
+        else {
+            container_free(flipped_container, ctype_out);
+        }
+    } else {
+        flipped_container = container_range_of_ones(0U, 0x10000U, &ctype_out);
+        ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+static void inplace_fully_flip_container(roaring_array_t *x1_arr, uint16_t hb) {
+    const int i = ra_get_index(x1_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, (uint16_t)i, &ctype_in);
+        flipped_container =
+            container_inot(container_to_flip, ctype_in, &ctype_out);
+
+        if (container_get_cardinality(flipped_container, ctype_out)) {
+            ra_set_container_at_index(x1_arr, i, flipped_container, ctype_out);
+        } else {
+            container_free(flipped_container, ctype_out);
+            ra_remove_at_index(x1_arr, i);
+        }
+
+    } else {
+        flipped_container = container_range_of_ones(0U, 0x10000U, &ctype_out);
+        ra_insert_new_key_value_at(x1_arr, -i - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
+                                      uint64_t range_start,
+                                      uint64_t range_end) {
+    if (range_start >= range_end) {
+        return roaring_bitmap_copy(x1);
+    }
+    if (range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+
+    roaring_bitmap_t *ans = roaring_bitmap_create();
+    roaring_bitmap_set_copy_on_write(ans, is_cow(x1));
+
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;  // & 0xFFFF;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);  // & 0xFFFF;
+
+    ra_append_copies_until(&ans->high_low_container, &x1->high_low_container,
+                           hb_start, is_cow(x1));
+    if (hb_start == hb_end) {
+        insert_flipped_container(&ans->high_low_container,
+                                 &x1->high_low_container, hb_start, lb_start,
+                                 lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_start,
+                                     lb_start, 0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        if (lb_end != 0xFFFF) --hb_end;  // later we'll handle the partial block
+
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            insert_fully_flipped_container(&ans->high_low_container,
+                                           &x1->high_low_container,
+                                           (uint16_t)hb);
+        }
+
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_end + 1, 0,
+                                     lb_end);
+            ++hb_end;
+        }
+    }
+    ra_append_copies_after(&ans->high_low_container, &x1->high_low_container,
+                           hb_end, is_cow(x1));
+    return ans;
+}
+
+void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
+                                 uint64_t range_end) {
+    if (range_start >= range_end) {
+        return;  // empty range
+    }
+    if (range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);
+
+    if (hb_start == hb_end) {
+        inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                               lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                                   0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        if (lb_end != 0xFFFF) --hb_end;
+
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            inplace_fully_flip_container(&x1->high_low_container, (uint16_t)hb);
+        }
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            inplace_flip_container(&x1->high_low_container, hb_end + 1, 0,
+                                   lb_end);
+            ++hb_end;
+        }
+    }
+}
+
+static void offset_append_with_merge(roaring_array_t *ra, int k, container_t *c,
+                                     uint8_t t) {
+    int size = ra_get_size(ra);
+    if (size == 0 || ra_get_key_at_index(ra, (uint16_t)(size - 1)) != k) {
+        // No merge.
+        ra_append(ra, (uint16_t)k, c, t);
+        return;
+    }
+
+    uint8_t last_t, new_t;
+    container_t *last_c, *new_c;
+
+    // NOTE: we don't need to unwrap here, since we added last_c ourselves
+    // we have the certainty it's not a shared container.
+    // The same applies to c, as it's the result of calling container_offset.
+    last_c = ra_get_container_at_index(ra, (uint16_t)(size - 1), &last_t);
+    new_c = container_ior(last_c, last_t, c, t, &new_t);
+
+    ra_set_container_at_index(ra, size - 1, new_c, new_t);
+
+    // Comparison of pointers of different origin is UB (or so claim some
+    // compiler makers), so we compare their bit representation only.
+    if ((uintptr_t)last_c != (uintptr_t)new_c) {
+        container_free(last_c, last_t);
+    }
+    container_free(c, t);
+}
+
+// roaring_bitmap_add_offset adds the value 'offset' to each and every value in
+// a bitmap, generating a new bitmap in the process. If offset + element is
+// outside of the range [0,2^32), that the element will be dropped.
+// We need "offset" to be 64 bits because we want to support values
+// between -0xFFFFFFFF up to +0xFFFFFFFF.
+roaring_bitmap_t *roaring_bitmap_add_offset(const roaring_bitmap_t *bm,
+                                            int64_t offset) {
+    roaring_bitmap_t *answer;
+    roaring_array_t *ans_ra;
+    int64_t container_offset;
+    uint16_t in_offset;
+
+    const roaring_array_t *bm_ra = &bm->high_low_container;
+    int length = bm_ra->size;
+
+    if (offset == 0) {
+        return roaring_bitmap_copy(bm);
+    }
+
+    container_offset = offset >> 16;
+    in_offset = (uint16_t)(offset - container_offset * (1 << 16));
+
+    answer = roaring_bitmap_create();
+    bool cow = is_cow(bm);
+    roaring_bitmap_set_copy_on_write(answer, cow);
+
+    ans_ra = &answer->high_low_container;
+
+    if (in_offset == 0) {
+        ans_ra = &answer->high_low_container;
+
+        for (int i = 0, j = 0; i < length; ++i) {
+            int64_t key = ra_get_key_at_index(bm_ra, (uint16_t)i);
+            key += container_offset;
+
+            if (key < 0 || key >= (1 << 16)) {
+                continue;
+            }
+            ra_append_copy(ans_ra, bm_ra, (uint16_t)i, cow);
+            ans_ra->keys[j++] = (uint16_t)key;
+        }
+        return answer;
+    }
+
+    uint8_t t;
+    const container_t *c;
+    container_t *lo, *hi, **lo_ptr, **hi_ptr;
+    int64_t k;
+
+    for (int i = 0; i < length; ++i) {
+        lo = hi = NULL;
+        lo_ptr = hi_ptr = NULL;
+
+        k = ra_get_key_at_index(bm_ra, (uint16_t)i) + container_offset;
+        if (k >= 0 && k < (1 << 16)) {
+            lo_ptr = &lo;
+        }
+        if (k + 1 >= 0 && k + 1 < (1 << 16)) {
+            hi_ptr = &hi;
+        }
+        if (lo_ptr == NULL && hi_ptr == NULL) {
+            continue;
+        }
+        c = ra_get_container_at_index(bm_ra, (uint16_t)i, &t);
+        c = container_unwrap_shared(c, &t);
+
+        container_add_offset(c, t, lo_ptr, hi_ptr, in_offset);
+        if (lo != NULL) {
+            offset_append_with_merge(ans_ra, (int)k, lo, t);
+        }
+        if (hi != NULL) {
+            ra_append(ans_ra, (uint16_t)(k + 1), hi, t);
+        }
+        // the `lo` and `hi` container type always keep same as container `c`.
+        // in the case of `container_add_offset` on bitset container, `lo` and
+        // `hi` may has small cardinality, they must be repaired to array
+        // container.
+    }
+
+    roaring_bitmap_repair_after_lazy(answer);  // do required type conversions.
+    return answer;
+}
+
+roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
+                                         const roaring_bitmap_t *x2,
+                                         const bool bitsetconversion) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c;
+            if (bitsetconversion &&
+                (get_container_type(c1, type1) != BITSET_CONTAINER_TYPE) &&
+                (get_container_type(c2, type2) != BITSET_CONTAINER_TYPE)) {
+                container_t *newc1 =
+                    container_mutable_unwrap_shared(c1, &type1);
+                newc1 = container_to_bitset(newc1, type1);
+                type1 = BITSET_CONTAINER_TYPE;
+                c = container_lazy_ior(newc1, type1, c2, type2, &result_type);
+                if (c != newc1) {  // should not happen
+                    container_free(newc1, type1);
+                }
+            } else {
+                c = container_lazy_or(c1, type1, c2, type2, &result_type);
+            }
+            // since we assume that the initial containers are non-empty,
+            // the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c, result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2,
+                                    const bool bitsetconversion) {
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            if (!container_is_full(c1, type1)) {
+                if ((bitsetconversion == false) ||
+                    (get_container_type(c1, type1) == BITSET_CONTAINER_TYPE)) {
+                    c1 = get_writable_copy_if_shared(c1, &type1);
+                } else {
+                    // convert to bitset
+                    container_t *old_c1 = c1;
+                    uint8_t old_type1 = type1;
+                    c1 = container_mutable_unwrap_shared(c1, &type1);
+                    c1 = container_to_bitset(c1, type1);
+                    container_free(old_c1, old_type1);
+                    type1 = BITSET_CONTAINER_TYPE;
+                }
+
+                container_t *c2 = ra_get_container_at_index(
+                    &x2->high_low_container, (uint16_t)pos2, &type2);
+                container_t *c =
+                    container_lazy_ior(c1, type1, c2, type2, &result_type);
+
+                if (c != c1) {  // in this instance a new container was created,
+                                // and we need to free the old one
+                    container_free(c1, type1);
+                }
+
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            // container_t *c2_clone = container_clone(c2, type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
+                                          const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            container_t *c =
+                container_lazy_xor(c1, type1, c2, type2, &result_type);
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared
+            // container. Rationale: using a shared container safely with in
+            // place computation would require making a copy and then doing the
+            // computation in place which is likely less efficient than avoiding
+            // in place entirely and always generating a new container.
+
+            container_t *c;
+            if (type1 == SHARED_CONTAINER_TYPE) {
+                c = container_lazy_xor(c1, type1, c2, type2, &result_type);
+                shared_container_free(CAST_shared(c1));  // release
+            } else {
+                c = container_lazy_ixor(c1, type1, c2, type2, &result_type);
+            }
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+                ++pos1;
+            } else {
+                container_free(c, result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;
+            }
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            // container_t *c2_clone = container_clone(c2, type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *r) {
+    roaring_array_t *ra = &r->high_low_container;
+
+    for (int i = 0; i < ra->size; ++i) {
+        const uint8_t old_type = ra->typecodes[i];
+        container_t *old_c = ra->containers[i];
+        uint8_t new_type = old_type;
+        container_t *new_c = container_repair_after_lazy(old_c, &new_type);
+        ra->containers[i] = new_c;
+        ra->typecodes[i] = new_type;
+    }
+}
+
+/**
+ * roaring_bitmap_rank returns the number of integers that are smaller or equal
+ * to x.
+ */
+uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
+    uint64_t size = 0;
+    uint32_t xhigh = x >> 16;
+    for (int i = 0; i < bm->high_low_container.size; i++) {
+        uint32_t key = bm->high_low_container.keys[i];
+        if (xhigh > key) {
+            size +=
+                container_get_cardinality(bm->high_low_container.containers[i],
+                                          bm->high_low_container.typecodes[i]);
+        } else if (xhigh == key) {
+            return size + container_rank(bm->high_low_container.containers[i],
+                                         bm->high_low_container.typecodes[i],
+                                         x & 0xFFFF);
+        } else {
+            return size;
+        }
+    }
+    return size;
+}
+void roaring_bitmap_rank_many(const roaring_bitmap_t *bm, const uint32_t *begin,
+                              const uint32_t *end, uint64_t *ans) {
+    uint64_t size = 0;
+
+    int i = 0;
+    const uint32_t *iter = begin;
+    while (i < bm->high_low_container.size && iter != end) {
+        uint32_t x = *iter;
+        uint32_t xhigh = x >> 16;
+        uint32_t key = bm->high_low_container.keys[i];
+        if (xhigh > key) {
+            size +=
+                container_get_cardinality(bm->high_low_container.containers[i],
+                                          bm->high_low_container.typecodes[i]);
+            i++;
+        } else if (xhigh == key) {
+            uint32_t consumed = container_rank_many(
+                bm->high_low_container.containers[i],
+                bm->high_low_container.typecodes[i], size, iter, end, ans);
+            iter += consumed;
+            ans += consumed;
+        } else {
+            *(ans++) = size;
+            iter++;
+        }
+    }
+}
+
+/**
+ * roaring_bitmap_get_index returns the index of x, if not exsist return -1.
+ */
+int64_t roaring_bitmap_get_index(const roaring_bitmap_t *bm, uint32_t x) {
+    int64_t index = 0;
+    const uint16_t xhigh = x >> 16;
+    int32_t high_idx = ra_get_index(&bm->high_low_container, xhigh);
+    if (high_idx < 0) return -1;
+
+    for (int i = 0; i < bm->high_low_container.size; i++) {
+        uint32_t key = bm->high_low_container.keys[i];
+        if (xhigh > key) {
+            index +=
+                container_get_cardinality(bm->high_low_container.containers[i],
+                                          bm->high_low_container.typecodes[i]);
+        } else if (xhigh == key) {
+            int32_t low_idx = container_get_index(
+                bm->high_low_container.containers[high_idx],
+                bm->high_low_container.typecodes[high_idx], x & 0xFFFF);
+            if (low_idx < 0) return -1;
+            return index + low_idx;
+        } else {
+            return -1;
+        }
+    }
+    return index;
+}
+
+/**
+ * roaring_bitmap_smallest returns the smallest value in the set.
+ * Returns UINT32_MAX if the set is empty.
+ */
+uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm) {
+    if (bm->high_low_container.size > 0) {
+        container_t *c = bm->high_low_container.containers[0];
+        uint8_t type = bm->high_low_container.typecodes[0];
+        uint32_t key = bm->high_low_container.keys[0];
+        uint32_t lowvalue = container_minimum(c, type);
+        return lowvalue | (key << 16);
+    }
+    return UINT32_MAX;
+}
+
+/**
+ * roaring_bitmap_smallest returns the greatest value in the set.
+ * Returns 0 if the set is empty.
+ */
+uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm) {
+    if (bm->high_low_container.size > 0) {
+        container_t *container =
+            bm->high_low_container.containers[bm->high_low_container.size - 1];
+        uint8_t typecode =
+            bm->high_low_container.typecodes[bm->high_low_container.size - 1];
+        uint32_t key =
+            bm->high_low_container.keys[bm->high_low_container.size - 1];
+        uint32_t lowvalue = container_maximum(container, typecode);
+        return lowvalue | (key << 16);
+    }
+    return 0;
+}
+
+bool roaring_bitmap_select(const roaring_bitmap_t *bm, uint32_t rank,
+                           uint32_t *element) {
+    container_t *container;
+    uint8_t typecode;
+    uint16_t key;
+    uint32_t start_rank = 0;
+    int i = 0;
+    bool valid = false;
+    while (!valid && i < bm->high_low_container.size) {
+        container = bm->high_low_container.containers[i];
+        typecode = bm->high_low_container.typecodes[i];
+        valid =
+            container_select(container, typecode, &start_rank, rank, element);
+        i++;
+    }
+
+    if (valid) {
+        key = bm->high_low_container.keys[i - 1];
+        *element |= (((uint32_t)key) << 16);  // w/o cast, key promotes signed
+        return true;
+    } else
+        return false;
+}
+
+bool roaring_bitmap_intersect(const roaring_bitmap_t *x1,
+                              const roaring_bitmap_t *x2) {
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    uint64_t answer = 0;
+    int pos1 = 0, pos2 = 0;
+
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 =
+            ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+        const uint16_t s2 =
+            ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2;
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            if (container_intersect(c1, type1, c2, type2)) return true;
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2
+            pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+    return answer != 0;
+}
+
+bool roaring_bitmap_intersect_with_range(const roaring_bitmap_t *bm, uint64_t x,
+                                         uint64_t y) {
+    if (x >= y) {
+        // Empty range.
+        return false;
+    }
+    roaring_uint32_iterator_t it;
+    roaring_iterator_init(bm, &it);
+    if (!roaring_uint32_iterator_move_equalorlarger(&it, (uint32_t)x)) {
+        // No values above x.
+        return false;
+    }
+    if (it.current_value >= y) {
+        // No values below y.
+        return false;
+    }
+    return true;
+}
+
+uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    uint64_t answer = 0;
+    int pos1 = 0, pos2 = 0;
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 =
+            ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+        const uint16_t s2 =
+            ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2;
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            answer += container_and_cardinality(c1, type1, c2, type2);
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2
+            pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+    return answer;
+}
+
+double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2) {
+    const uint64_t c1 = roaring_bitmap_get_cardinality(x1);
+    const uint64_t c2 = roaring_bitmap_get_cardinality(x2);
+    const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2);
+    return (double)inter / (double)(c1 + c2 - inter);
+}
+
+uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1,
+                                       const roaring_bitmap_t *x2) {
+    const uint64_t c1 = roaring_bitmap_get_cardinality(x1);
+    const uint64_t c2 = roaring_bitmap_get_cardinality(x2);
+    const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2);
+    return c1 + c2 - inter;
+}
+
+uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1,
+                                           const roaring_bitmap_t *x2) {
+    const uint64_t c1 = roaring_bitmap_get_cardinality(x1);
+    const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2);
+    return c1 - inter;
+}
+
+uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    const uint64_t c1 = roaring_bitmap_get_cardinality(x1);
+    const uint64_t c2 = roaring_bitmap_get_cardinality(x2);
+    const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2);
+    return c1 + c2 - 2 * inter;
+}
+
+bool roaring_bitmap_contains(const roaring_bitmap_t *r, uint32_t val) {
+    const uint16_t hb = val >> 16;
+    /*
+     * the next function call involves a binary search and lots of branching.
+     */
+    int32_t i = ra_get_index(&r->high_low_container, hb);
+    if (i < 0) return false;
+
+    uint8_t typecode;
+    // next call ought to be cheap
+    container_t *container = ra_get_container_at_index(&r->high_low_container,
+                                                       (uint16_t)i, &typecode);
+    // rest might be a tad expensive, possibly involving another round of binary
+    // search
+    return container_contains(container, val & 0xFFFF, typecode);
+}
+
+/**
+ * Check whether a range of values from range_start (included) to range_end
+ * (excluded) is present
+ */
+bool roaring_bitmap_contains_range(const roaring_bitmap_t *r,
+                                   uint64_t range_start, uint64_t range_end) {
+    if (range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+    if (range_start >= range_end)
+        return true;  // empty range are always contained!
+    if (range_end - range_start == 1)
+        return roaring_bitmap_contains(r, (uint32_t)range_start);
+    uint16_t hb_rs = (uint16_t)(range_start >> 16);
+    uint16_t hb_re = (uint16_t)((range_end - 1) >> 16);
+    const int32_t span = hb_re - hb_rs;
+    const int32_t hlc_sz = ra_get_size(&r->high_low_container);
+    if (hlc_sz < span + 1) {
+        return false;
+    }
+    int32_t is = ra_get_index(&r->high_low_container, hb_rs);
+    int32_t ie = ra_get_index(&r->high_low_container, hb_re);
+    if ((ie < 0) || (is < 0) || ((ie - is) != span) || ie >= hlc_sz) {
+        return false;
+    }
+    const uint32_t lb_rs = range_start & 0xFFFF;
+    const uint32_t lb_re = ((range_end - 1) & 0xFFFF) + 1;
+    uint8_t type;
+    container_t *c =
+        ra_get_container_at_index(&r->high_low_container, (uint16_t)is, &type);
+    if (hb_rs == hb_re) {
+        return container_contains_range(c, lb_rs, lb_re, type);
+    }
+    if (!container_contains_range(c, lb_rs, 1 << 16, type)) {
+        return false;
+    }
+    c = ra_get_container_at_index(&r->high_low_container, (uint16_t)ie, &type);
+    if (!container_contains_range(c, 0, lb_re, type)) {
+        return false;
+    }
+    for (int32_t i = is + 1; i < ie; ++i) {
+        c = ra_get_container_at_index(&r->high_low_container, (uint16_t)i,
+                                      &type);
+        if (!container_is_full(c, type)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *r1,
+                                     const roaring_bitmap_t *r2) {
+    return (roaring_bitmap_get_cardinality(r2) >
+                roaring_bitmap_get_cardinality(r1) &&
+            roaring_bitmap_is_subset(r1, r2));
+}
+
+/*
+ * FROZEN SERIALIZATION FORMAT DESCRIPTION
+ *
+ * -- (beginning must be aligned by 32 bytes) --
+ * <bitset_data> uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS *
+ * num_bitset_containers] <run_data>    rle16_t[total number of rle elements in
+ * all run containers] <array_data>  uint16_t[total number of array elements in
+ * all array containers] <keys>        uint16_t[num_containers] <counts>
+ * uint16_t[num_containers] <typecodes>   uint8_t[num_containers] <header>
+ * uint32_t
+ *
+ * <header> is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits)
+ * and the number of containers (17 bits).
+ *
+ * <counts> stores number of elements for every container.
+ * Its meaning depends on container type.
+ * For array and bitset containers, this value is the container cardinality
+ * minus one. For run container, it is the number of rle_t elements (n_runs).
+ *
+ * <bitset_data>,<array_data>,<run_data> are flat arrays of elements of
+ * all containers of respective type.
+ *
+ * <*_data> and <keys> are kept close together because they are not accessed
+ * during deserilization. This may reduce IO in case of large mmaped bitmaps.
+ * All members have their native alignments during deserilization except
+ * <header>, which is not guaranteed to be aligned by 4 bytes.
+ */
+
+size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *rb) {
+    const roaring_array_t *ra = &rb->high_low_container;
+    size_t num_bytes = 0;
+    for (int32_t i = 0; i < ra->size; i++) {
+        switch (ra->typecodes[i]) {
+            case BITSET_CONTAINER_TYPE: {
+                num_bytes += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+                break;
+            }
+            case RUN_CONTAINER_TYPE: {
+                const run_container_t *rc = const_CAST_run(ra->containers[i]);
+                num_bytes += rc->n_runs * sizeof(rle16_t);
+                break;
+            }
+            case ARRAY_CONTAINER_TYPE: {
+                const array_container_t *ac =
+                    const_CAST_array(ra->containers[i]);
+                num_bytes += ac->cardinality * sizeof(uint16_t);
+                break;
+            }
+            default:
+                roaring_unreachable;
+        }
+    }
+    num_bytes += (2 + 2 + 1) * ra->size;  // keys, counts, typecodes
+    num_bytes += 4;                       // header
+    return num_bytes;
+}
+
+inline static void *arena_alloc(char **arena, size_t num_bytes) {
+    char *res = *arena;
+    *arena += num_bytes;
+    return res;
+}
+
+void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *rb, char *buf) {
+    /*
+     * Note: we do not require user to supply a specifically aligned buffer.
+     * Thus we have to use memcpy() everywhere.
+     */
+
+    const roaring_array_t *ra = &rb->high_low_container;
+
+    size_t bitset_zone_size = 0;
+    size_t run_zone_size = 0;
+    size_t array_zone_size = 0;
+    for (int32_t i = 0; i < ra->size; i++) {
+        switch (ra->typecodes[i]) {
+            case BITSET_CONTAINER_TYPE: {
+                bitset_zone_size +=
+                    BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+                break;
+            }
+            case RUN_CONTAINER_TYPE: {
+                const run_container_t *rc = const_CAST_run(ra->containers[i]);
+                run_zone_size += rc->n_runs * sizeof(rle16_t);
+                break;
+            }
+            case ARRAY_CONTAINER_TYPE: {
+                const array_container_t *ac =
+                    const_CAST_array(ra->containers[i]);
+                array_zone_size += ac->cardinality * sizeof(uint16_t);
+                break;
+            }
+            default:
+                roaring_unreachable;
+        }
+    }
+
+    uint64_t *bitset_zone = (uint64_t *)arena_alloc(&buf, bitset_zone_size);
+    rle16_t *run_zone = (rle16_t *)arena_alloc(&buf, run_zone_size);
+    uint16_t *array_zone = (uint16_t *)arena_alloc(&buf, array_zone_size);
+    uint16_t *key_zone = (uint16_t *)arena_alloc(&buf, 2 * ra->size);
+    uint16_t *count_zone = (uint16_t *)arena_alloc(&buf, 2 * ra->size);
+    uint8_t *typecode_zone = (uint8_t *)arena_alloc(&buf, ra->size);
+    uint32_t *header_zone = (uint32_t *)arena_alloc(&buf, 4);
+
+    for (int32_t i = 0; i < ra->size; i++) {
+        uint16_t count;
+        switch (ra->typecodes[i]) {
+            case BITSET_CONTAINER_TYPE: {
+                const bitset_container_t *bc =
+                    const_CAST_bitset(ra->containers[i]);
+                memcpy(bitset_zone, bc->words,
+                       BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
+                bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
+                if (bc->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+                    count = (uint16_t)(bc->cardinality - 1);
+                } else {
+                    count =
+                        (uint16_t)(bitset_container_compute_cardinality(bc) -
+                                   1);
+                }
+                break;
+            }
+            case RUN_CONTAINER_TYPE: {
+                const run_container_t *rc = const_CAST_run(ra->containers[i]);
+                size_t num_bytes = rc->n_runs * sizeof(rle16_t);
+                memcpy(run_zone, rc->runs, num_bytes);
+                run_zone += rc->n_runs;
+                count = (uint16_t)rc->n_runs;
+                break;
+            }
+            case ARRAY_CONTAINER_TYPE: {
+                const array_container_t *ac =
+                    const_CAST_array(ra->containers[i]);
+                size_t num_bytes = ac->cardinality * sizeof(uint16_t);
+                memcpy(array_zone, ac->array, num_bytes);
+                array_zone += ac->cardinality;
+                count = (uint16_t)(ac->cardinality - 1);
+                break;
+            }
+            default:
+                roaring_unreachable;
+        }
+        memcpy(&count_zone[i], &count, 2);
+    }
+    memcpy(key_zone, ra->keys, ra->size * sizeof(uint16_t));
+    memcpy(typecode_zone, ra->typecodes, ra->size * sizeof(uint8_t));
+    uint32_t header = ((uint32_t)ra->size << 15) | FROZEN_COOKIE;
+    memcpy(header_zone, &header, 4);
+}
+
+const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf,
+                                                   size_t length) {
+    if ((uintptr_t)buf % 32 != 0) {
+        return NULL;
+    }
+
+    // cookie and num_containers
+    if (length < 4) {
+        return NULL;
+    }
+    uint32_t header;
+    memcpy(&header, buf + length - 4, 4);  // header may be misaligned
+    if ((header & 0x7FFF) != FROZEN_COOKIE) {
+        return NULL;
+    }
+    int32_t num_containers = (header >> 15);
+
+    // typecodes, counts and keys
+    if (length < 4 + (size_t)num_containers * (1 + 2 + 2)) {
+        return NULL;
+    }
+    uint16_t *keys = (uint16_t *)(buf + length - 4 - num_containers * 5);
+    uint16_t *counts = (uint16_t *)(buf + length - 4 - num_containers * 3);
+    uint8_t *typecodes = (uint8_t *)(buf + length - 4 - num_containers * 1);
+
+    // {bitset,array,run}_zone
+    int32_t num_bitset_containers = 0;
+    int32_t num_run_containers = 0;
+    int32_t num_array_containers = 0;
+    size_t bitset_zone_size = 0;
+    size_t run_zone_size = 0;
+    size_t array_zone_size = 0;
+    for (int32_t i = 0; i < num_containers; i++) {
+        switch (typecodes[i]) {
+            case BITSET_CONTAINER_TYPE:
+                num_bitset_containers++;
+                bitset_zone_size +=
+                    BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+                break;
+            case RUN_CONTAINER_TYPE:
+                num_run_containers++;
+                run_zone_size += counts[i] * sizeof(rle16_t);
+                break;
+            case ARRAY_CONTAINER_TYPE:
+                num_array_containers++;
+                array_zone_size += (counts[i] + UINT32_C(1)) * sizeof(uint16_t);
+                break;
+            default:
+                return NULL;
+        }
+    }
+    if (length != bitset_zone_size + run_zone_size + array_zone_size +
+                      5 * num_containers + 4) {
+        return NULL;
+    }
+    uint64_t *bitset_zone = (uint64_t *)(buf);
+    rle16_t *run_zone = (rle16_t *)(buf + bitset_zone_size);
+    uint16_t *array_zone = (uint16_t *)(buf + bitset_zone_size + run_zone_size);
+
+    size_t alloc_size = 0;
+    alloc_size += sizeof(roaring_bitmap_t);
+    alloc_size += num_containers * sizeof(container_t *);
+    alloc_size += num_bitset_containers * sizeof(bitset_container_t);
+    alloc_size += num_run_containers * sizeof(run_container_t);
+    alloc_size += num_array_containers * sizeof(array_container_t);
+
+    char *arena = (char *)roaring_malloc(alloc_size);
+    if (arena == NULL) {
+        return NULL;
+    }
+
+    roaring_bitmap_t *rb =
+        (roaring_bitmap_t *)arena_alloc(&arena, sizeof(roaring_bitmap_t));
+    rb->high_low_container.flags = ROARING_FLAG_FROZEN;
+    rb->high_low_container.allocation_size = num_containers;
+    rb->high_low_container.size = num_containers;
+    rb->high_low_container.keys = (uint16_t *)keys;
+    rb->high_low_container.typecodes = (uint8_t *)typecodes;
+    rb->high_low_container.containers = (container_t **)arena_alloc(
+        &arena, sizeof(container_t *) * num_containers);
+    // Ensure offset of high_low_container.containers is known distance used in
+    // C++ wrapper. sizeof(roaring_bitmap_t) is used as it is the size of the
+    // only allocation that precedes high_low_container.containers. If this is
+    // changed (new allocation or changed order), this offset will also need to
+    // be changed in the C++ wrapper.
+    assert(rb ==
+           (roaring_bitmap_t *)((char *)rb->high_low_container.containers -
+                                sizeof(roaring_bitmap_t)));
+    for (int32_t i = 0; i < num_containers; i++) {
+        switch (typecodes[i]) {
+            case BITSET_CONTAINER_TYPE: {
+                bitset_container_t *bitset = (bitset_container_t *)arena_alloc(
+                    &arena, sizeof(bitset_container_t));
+                bitset->words = bitset_zone;
+                bitset->cardinality = counts[i] + UINT32_C(1);
+                rb->high_low_container.containers[i] = bitset;
+                bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
+                break;
+            }
+            case RUN_CONTAINER_TYPE: {
+                run_container_t *run = (run_container_t *)arena_alloc(
+                    &arena, sizeof(run_container_t));
+                run->capacity = counts[i];
+                run->n_runs = counts[i];
+                run->runs = run_zone;
+                rb->high_low_container.containers[i] = run;
+                run_zone += run->n_runs;
+                break;
+            }
+            case ARRAY_CONTAINER_TYPE: {
+                array_container_t *array = (array_container_t *)arena_alloc(
+                    &arena, sizeof(array_container_t));
+                array->capacity = counts[i] + UINT32_C(1);
+                array->cardinality = counts[i] + UINT32_C(1);
+                array->array = array_zone;
+                rb->high_low_container.containers[i] = array;
+                array_zone += counts[i] + UINT32_C(1);
+                break;
+            }
+            default:
+                roaring_free(arena);
+                return NULL;
+        }
+    }
+
+    return rb;
+}
+
+ALLOW_UNALIGNED
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf) {
+    char *start_of_buf = (char *)buf;
+    uint32_t cookie;
+    int32_t num_containers;
+    uint16_t *descriptive_headers;
+    uint32_t *offset_headers = NULL;
+    const char *run_flag_bitset = NULL;
+    bool hasrun = false;
+
+    // deserialize cookie
+    memcpy(&cookie, buf, sizeof(uint32_t));
+    buf += sizeof(uint32_t);
+    if (cookie == SERIAL_COOKIE_NO_RUNCONTAINER) {
+        memcpy(&num_containers, buf, sizeof(int32_t));
+        buf += sizeof(int32_t);
+        descriptive_headers = (uint16_t *)buf;
+        buf += num_containers * 2 * sizeof(uint16_t);
+        offset_headers = (uint32_t *)buf;
+        buf += num_containers * sizeof(uint32_t);
+    } else if ((cookie & 0xFFFF) == SERIAL_COOKIE) {
+        num_containers = (cookie >> 16) + 1;
+        hasrun = true;
+        int32_t run_flag_bitset_size = (num_containers + 7) / 8;
+        run_flag_bitset = buf;
+        buf += run_flag_bitset_size;
+        descriptive_headers = (uint16_t *)buf;
+        buf += num_containers * 2 * sizeof(uint16_t);
+        if (num_containers >= NO_OFFSET_THRESHOLD) {
+            offset_headers = (uint32_t *)buf;
+            buf += num_containers * sizeof(uint32_t);
+        }
+    } else {
+        return NULL;
+    }
+
+    // calculate total size for allocation
+    int32_t num_bitset_containers = 0;
+    int32_t num_run_containers = 0;
+    int32_t num_array_containers = 0;
+
+    for (int32_t i = 0; i < num_containers; i++) {
+        uint16_t tmp;
+        memcpy(&tmp, descriptive_headers + 2 * i + 1, sizeof(tmp));
+        uint32_t cardinality = tmp + 1;
+        bool isbitmap = (cardinality > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if (hasrun) {
+            if ((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) {
+                isbitmap = false;
+                isrun = true;
+            }
+        }
+
+        if (isbitmap) {
+            num_bitset_containers++;
+        } else if (isrun) {
+            num_run_containers++;
+        } else {
+            num_array_containers++;
+        }
+    }
+
+    size_t alloc_size = 0;
+    alloc_size += sizeof(roaring_bitmap_t);
+    alloc_size += num_containers * sizeof(container_t *);
+    alloc_size += num_bitset_containers * sizeof(bitset_container_t);
+    alloc_size += num_run_containers * sizeof(run_container_t);
+    alloc_size += num_array_containers * sizeof(array_container_t);
+    alloc_size += num_containers * sizeof(uint16_t);  // keys
+    alloc_size += num_containers * sizeof(uint8_t);   // typecodes
+
+    // allocate bitmap and construct containers
+    char *arena = (char *)roaring_malloc(alloc_size);
+    if (arena == NULL) {
+        return NULL;
+    }
+
+    roaring_bitmap_t *rb =
+        (roaring_bitmap_t *)arena_alloc(&arena, sizeof(roaring_bitmap_t));
+    rb->high_low_container.flags = ROARING_FLAG_FROZEN;
+    rb->high_low_container.allocation_size = num_containers;
+    rb->high_low_container.size = num_containers;
+    rb->high_low_container.containers = (container_t **)arena_alloc(
+        &arena, sizeof(container_t *) * num_containers);
+
+    uint16_t *keys =
+        (uint16_t *)arena_alloc(&arena, num_containers * sizeof(uint16_t));
+    uint8_t *typecodes =
+        (uint8_t *)arena_alloc(&arena, num_containers * sizeof(uint8_t));
+
+    rb->high_low_container.keys = keys;
+    rb->high_low_container.typecodes = typecodes;
+
+    for (int32_t i = 0; i < num_containers; i++) {
+        uint16_t tmp;
+        memcpy(&tmp, descriptive_headers + 2 * i + 1, sizeof(tmp));
+        int32_t cardinality = tmp + 1;
+        bool isbitmap = (cardinality > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if (hasrun) {
+            if ((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) {
+                isbitmap = false;
+                isrun = true;
+            }
+        }
+
+        keys[i] = descriptive_headers[2 * i];
+
+        if (isbitmap) {
+            typecodes[i] = BITSET_CONTAINER_TYPE;
+            bitset_container_t *c = (bitset_container_t *)arena_alloc(
+                &arena, sizeof(bitset_container_t));
+            c->cardinality = cardinality;
+            if (offset_headers != NULL) {
+                c->words = (uint64_t *)(start_of_buf + offset_headers[i]);
+            } else {
+                c->words = (uint64_t *)buf;
+                buf += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        } else if (isrun) {
+            typecodes[i] = RUN_CONTAINER_TYPE;
+            run_container_t *c =
+                (run_container_t *)arena_alloc(&arena, sizeof(run_container_t));
+            c->capacity = cardinality;
+            uint16_t n_runs;
+            if (offset_headers != NULL) {
+                memcpy(&n_runs, start_of_buf + offset_headers[i],
+                       sizeof(uint16_t));
+                c->n_runs = n_runs;
+                c->runs = (rle16_t *)(start_of_buf + offset_headers[i] +
+                                      sizeof(uint16_t));
+            } else {
+                memcpy(&n_runs, buf, sizeof(uint16_t));
+                c->n_runs = n_runs;
+                buf += sizeof(uint16_t);
+                c->runs = (rle16_t *)buf;
+                buf += c->n_runs * sizeof(rle16_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        } else {
+            typecodes[i] = ARRAY_CONTAINER_TYPE;
+            array_container_t *c = (array_container_t *)arena_alloc(
+                &arena, sizeof(array_container_t));
+            c->cardinality = cardinality;
+            c->capacity = cardinality;
+            if (offset_headers != NULL) {
+                c->array = (uint16_t *)(start_of_buf + offset_headers[i]);
+            } else {
+                c->array = (uint16_t *)buf;
+                buf += cardinality * sizeof(uint16_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        }
+    }
+
+    return rb;
+}
+
+bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t *bitset) {
+    uint32_t max_value = roaring_bitmap_maximum(r);
+    size_t new_array_size = (size_t)(((uint64_t)max_value + 63) / 64);
+    bool resize_ok = bitset_resize(bitset, new_array_size, true);
+    if (!resize_ok) {
+        return false;
+    }
+    const roaring_array_t *ra = &r->high_low_container;
+    for (int i = 0; i < ra->size; ++i) {
+        uint64_t *words = bitset->array + (ra->keys[i] << 10);
+        uint8_t type = ra->typecodes[i];
+        const container_t *c = ra->containers[i];
+        if (type == SHARED_CONTAINER_TYPE) {
+            c = container_unwrap_shared(c, &type);
+        }
+        switch (type) {
+            case BITSET_CONTAINER_TYPE: {
+                size_t max_word_index = new_array_size - (ra->keys[i] << 10);
+                if (max_word_index > 1024) {
+                    max_word_index = 1024;
+                }
+                const bitset_container_t *src = const_CAST_bitset(c);
+                memcpy(words, src->words, max_word_index * sizeof(uint64_t));
+            } break;
+            case ARRAY_CONTAINER_TYPE: {
+                const array_container_t *src = const_CAST_array(c);
+                bitset_set_list(words, src->array, src->cardinality);
+            } break;
+            case RUN_CONTAINER_TYPE: {
+                const run_container_t *src = const_CAST_run(c);
+                for (int32_t rlepos = 0; rlepos < src->n_runs; ++rlepos) {
+                    rle16_t rle = src->runs[rlepos];
+                    bitset_set_lenrange(words, rle.value, rle.length);
+                }
+            } break;
+            default:
+                roaring_unreachable;
+        }
+    }
+    return true;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring {
+#endif
+/* end file src/roaring.c */
+/* begin file src/roaring_priority_queue.c */
+
+#ifdef __cplusplus
+using namespace ::roaring::internal;
+
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+struct roaring_pq_element_s {
+    uint64_t size;
+    bool is_temporary;
+    roaring_bitmap_t *bitmap;
+};
+
+typedef struct roaring_pq_element_s roaring_pq_element_t;
+
+struct roaring_pq_s {
+    roaring_pq_element_t *elements;
+    uint64_t size;
+};
+
+typedef struct roaring_pq_s roaring_pq_t;
+
+static inline bool compare(roaring_pq_element_t *t1, roaring_pq_element_t *t2) {
+    return t1->size < t2->size;
+}
+
+static void pq_add(roaring_pq_t *pq, roaring_pq_element_t *t) {
+    uint64_t i = pq->size;
+    pq->elements[pq->size++] = *t;
+    while (i > 0) {
+        uint64_t p = (i - 1) >> 1;
+        roaring_pq_element_t ap = pq->elements[p];
+        if (!compare(t, &ap)) break;
+        pq->elements[i] = ap;
+        i = p;
+    }
+    pq->elements[i] = *t;
+}
+
+static void pq_free(roaring_pq_t *pq) { roaring_free(pq); }
+
+static void percolate_down(roaring_pq_t *pq, uint32_t i) {
+    uint32_t size = (uint32_t)pq->size;
+    uint32_t hsize = size >> 1;
+    roaring_pq_element_t ai = pq->elements[i];
+    while (i < hsize) {
+        uint32_t l = (i << 1) + 1;
+        uint32_t r = l + 1;
+        roaring_pq_element_t bestc = pq->elements[l];
+        if (r < size) {
+            if (compare(pq->elements + r, &bestc)) {
+                l = r;
+                bestc = pq->elements[r];
+            }
+        }
+        if (!compare(&bestc, &ai)) {
+            break;
+        }
+        pq->elements[i] = bestc;
+        i = l;
+    }
+    pq->elements[i] = ai;
+}
+
+static roaring_pq_t *create_pq(const roaring_bitmap_t **arr, uint32_t length) {
+    size_t alloc_size =
+        sizeof(roaring_pq_t) + sizeof(roaring_pq_element_t) * length;
+    roaring_pq_t *answer = (roaring_pq_t *)roaring_malloc(alloc_size);
+    answer->elements = (roaring_pq_element_t *)(answer + 1);
+    answer->size = length;
+    for (uint32_t i = 0; i < length; i++) {
+        answer->elements[i].bitmap = (roaring_bitmap_t *)arr[i];
+        answer->elements[i].is_temporary = false;
+        answer->elements[i].size =
+            roaring_bitmap_portable_size_in_bytes(arr[i]);
+    }
+    for (int32_t i = (length >> 1); i >= 0; i--) {
+        percolate_down(answer, i);
+    }
+    return answer;
+}
+
+static roaring_pq_element_t pq_poll(roaring_pq_t *pq) {
+    roaring_pq_element_t ans = *pq->elements;
+    if (pq->size > 1) {
+        pq->elements[0] = pq->elements[--pq->size];
+        percolate_down(pq, 0);
+    } else
+        --pq->size;
+    // memmove(pq->elements,pq->elements+1,(pq->size-1)*sizeof(roaring_pq_element_t));--pq->size;
+    return ans;
+}
+
+// this function consumes and frees the inputs
+static roaring_bitmap_t *lazy_or_from_lazy_inputs(roaring_bitmap_t *x1,
+                                                  roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = ra_get_size(&x1->high_low_container),
+              length2 = ra_get_size(&x2->high_low_container);
+    if (0 == length1) {
+        roaring_bitmap_free(x1);
+        return x2;
+    }
+    if (0 == length2) {
+        roaring_bitmap_free(x2);
+        return x1;
+    }
+    uint32_t neededcap = length1 > length2 ? length2 : length1;
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+    while (true) {
+        if (s1 == s2) {
+            // todo: unsharing can be inefficient as it may create a clone where
+            // none
+            // is needed, but it has the benefit of being easy to reason about.
+
+            ra_unshare_container_at_index(&x1->high_low_container,
+                                          (uint16_t)pos1);
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            assert(type1 != SHARED_CONTAINER_TYPE);
+
+            ra_unshare_container_at_index(&x2->high_low_container,
+                                          (uint16_t)pos2);
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            assert(type2 != SHARED_CONTAINER_TYPE);
+
+            container_t *c;
+
+            if ((type2 == BITSET_CONTAINER_TYPE) &&
+                (type1 != BITSET_CONTAINER_TYPE)) {
+                c = container_lazy_ior(c2, type2, c1, type1, &result_type);
+                container_free(c1, type1);
+                if (c != c2) {
+                    container_free(c2, type2);
+                }
+            } else {
+                c = container_lazy_ior(c1, type1, c2, type2, &result_type);
+                container_free(c2, type2);
+                if (c != c1) {
+                    container_free(c1, type1);
+                }
+            }
+            // since we assume that the initial containers are non-empty, the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c, result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                        (uint16_t)pos1, &type1);
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, (uint16_t)pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        (uint16_t)pos2, &type2);
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, (uint16_t)pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_move_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2);
+    } else if (pos2 == length2) {
+        ra_append_move_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1);
+    }
+    ra_clear_without_containers(&x1->high_low_container);
+    ra_clear_without_containers(&x2->high_low_container);
+    roaring_free(x1);
+    roaring_free(x2);
+    return answer;
+}
+
+/**
+ * Compute the union of 'number' bitmaps using a heap. This can
+ * sometimes be faster than roaring_bitmap_or_many which uses
+ * a naive algorithm. Caller is responsible for freeing the
+ * result.
+ */
+roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
+                                              const roaring_bitmap_t **x) {
+    if (number == 0) {
+        return roaring_bitmap_create();
+    }
+    if (number == 1) {
+        return roaring_bitmap_copy(x[0]);
+    }
+    roaring_pq_t *pq = create_pq(x, number);
+    while (pq->size > 1) {
+        roaring_pq_element_t x1 = pq_poll(pq);
+        roaring_pq_element_t x2 = pq_poll(pq);
+
+        if (x1.is_temporary && x2.is_temporary) {
+            roaring_bitmap_t *newb =
+                lazy_or_from_lazy_inputs(x1.bitmap, x2.bitmap);
+            // should normally return a fresh new bitmap *except* that
+            // it can return x1.bitmap or x2.bitmap in degenerate cases
+            bool temporary = !((newb == x1.bitmap) && (newb == x2.bitmap));
+            uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
+            roaring_pq_element_t newelement = {
+                .size = bsize, .is_temporary = temporary, .bitmap = newb};
+            pq_add(pq, &newelement);
+        } else if (x2.is_temporary) {
+            roaring_bitmap_lazy_or_inplace(x2.bitmap, x1.bitmap, false);
+            x2.size = roaring_bitmap_portable_size_in_bytes(x2.bitmap);
+            pq_add(pq, &x2);
+        } else if (x1.is_temporary) {
+            roaring_bitmap_lazy_or_inplace(x1.bitmap, x2.bitmap, false);
+            x1.size = roaring_bitmap_portable_size_in_bytes(x1.bitmap);
+
+            pq_add(pq, &x1);
+        } else {
+            roaring_bitmap_t *newb =
+                roaring_bitmap_lazy_or(x1.bitmap, x2.bitmap, false);
+            uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
+            roaring_pq_element_t newelement = {
+                .size = bsize, .is_temporary = true, .bitmap = newb};
+
+            pq_add(pq, &newelement);
+        }
+    }
+    roaring_pq_element_t X = pq_poll(pq);
+    roaring_bitmap_t *answer = X.bitmap;
+    roaring_bitmap_repair_after_lazy(answer);
+    pq_free(pq);
+    return answer;
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace api {
+#endif
+/* end file src/roaring_priority_queue.c */
diff --git a/roaring/roaring.h b/roaring/roaring.h
new file mode 100644
index 000000000..c5bb46b08
--- /dev/null
+++ b/roaring/roaring.h
@@ -0,0 +1,2899 @@
+// !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!!
+// Created by amalgamation.sh on 2024-04-02T07:40:50Z
+
+/*
+ * The CRoaring project is under a dual license (Apache/MIT).
+ * Users of the library may choose one or the other license.
+ */
+/*
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * MIT License
+ *
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Permission is hereby granted, free of charge, to any
+ * person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the
+ * Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software
+ * is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice
+ * shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ * TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ * SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/* begin file include/roaring/roaring_version.h */
+// /include/roaring/roaring_version.h automatically generated by release.py, do
+// not change by hand
+#ifndef ROARING_INCLUDE_ROARING_VERSION
+#define ROARING_INCLUDE_ROARING_VERSION
+#define ROARING_VERSION "3.0.0"
+enum {
+    ROARING_VERSION_MAJOR = 3,
+    ROARING_VERSION_MINOR = 0,
+    ROARING_VERSION_REVISION = 0
+};
+#endif  // ROARING_INCLUDE_ROARING_VERSION
+/* end file include/roaring/roaring_version.h */
+/* begin file include/roaring/roaring_types.h */
+/*
+  Typedefs used by various components
+*/
+
+#ifndef ROARING_TYPES_H
+#define ROARING_TYPES_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+/**
+ * When building .c files as C++, there's added compile-time checking if the
+ * container types are derived from a `container_t` base class.  So long as
+ * such a base class is empty, the struct will behave compatibly with C structs
+ * despite the derivation.  This is due to the Empty Base Class Optimization:
+ *
+ * https://en.cppreference.com/w/cpp/language/ebo
+ *
+ * But since C isn't namespaced, taking `container_t` globally might collide
+ * with other projects.  So roaring.h uses ROARING_CONTAINER_T, while internal
+ * code #undefs that after declaring `typedef ROARING_CONTAINER_T container_t;`
+ */
+#if defined(__cplusplus)
+extern "C++" {
+struct container_s {};
+}
+#define ROARING_CONTAINER_T ::roaring::api::container_s
+#else
+#define ROARING_CONTAINER_T void  // no compile-time checking
+#endif
+
+#define ROARING_FLAG_COW UINT8_C(0x1)
+#define ROARING_FLAG_FROZEN UINT8_C(0x2)
+
+/**
+ * Roaring arrays are array-based key-value pairs having containers as values
+ * and 16-bit integer keys. A roaring bitmap  might be implemented as such.
+ */
+
+// parallel arrays.  Element sizes quite different.
+// Alternative is array
+// of structs.  Which would have better
+// cache performance through binary searches?
+
+typedef struct roaring_array_s {
+    int32_t size;
+    int32_t allocation_size;
+    ROARING_CONTAINER_T **containers;  // Use container_t in non-API files!
+    uint16_t *keys;
+    uint8_t *typecodes;
+    uint8_t flags;
+} roaring_array_t;
+
+typedef bool (*roaring_iterator)(uint32_t value, void *param);
+typedef bool (*roaring_iterator64)(uint64_t value, void *param);
+
+/**
+ *  (For advanced users.)
+ * The roaring_statistics_t can be used to collect detailed statistics about
+ * the composition of a roaring bitmap.
+ */
+typedef struct roaring_statistics_s {
+    uint32_t n_containers; /* number of containers */
+
+    uint32_t n_array_containers;  /* number of array containers */
+    uint32_t n_run_containers;    /* number of run containers */
+    uint32_t n_bitset_containers; /* number of bitmap containers */
+
+    uint32_t
+        n_values_array_containers;    /* number of values in array containers */
+    uint32_t n_values_run_containers; /* number of values in run containers */
+    uint32_t
+        n_values_bitset_containers; /* number of values in  bitmap containers */
+
+    uint32_t n_bytes_array_containers;  /* number of allocated bytes in array
+                                           containers */
+    uint32_t n_bytes_run_containers;    /* number of allocated bytes in run
+                                           containers */
+    uint32_t n_bytes_bitset_containers; /* number of allocated bytes in  bitmap
+                                           containers */
+
+    uint32_t
+        max_value; /* the maximal value, undefined if cardinality is zero */
+    uint32_t
+        min_value; /* the minimal value, undefined if cardinality is zero */
+    uint64_t sum_value; /* the sum of all values (could be used to compute
+                           average) */
+
+    uint64_t cardinality; /* total number of values stored in the bitmap */
+
+    // and n_values_arrays, n_values_rle, n_values_bitmap
+} roaring_statistics_t;
+
+/**
+ * Roaring-internal type used to iterate within a roaring container.
+ */
+typedef struct roaring_container_iterator_s {
+    // For bitset and array containers this is the index of the bit / entry.
+    // For run containers this points at the run.
+    int32_t index;
+} roaring_container_iterator_t;
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace api {
+#endif
+
+#endif /* ROARING_TYPES_H */
+/* end file include/roaring/roaring_types.h */
+/* begin file include/roaring/portability.h */
+/*
+ * portability.h
+ *
+ */
+
+/**
+ * All macros should be prefixed with either CROARING or ROARING.
+ * The library uses both ROARING_...
+ * as well as CROAIRING_ as prefixes. The ROARING_ prefix is for
+ * macros that are provided by the build system or that are closely
+ * related to the format. The header macros may also use ROARING_.
+ * The CROARING_ prefix is for internal macros that a user is unlikely
+ * to ever interact with.
+ */
+
+#ifndef INCLUDE_PORTABILITY_H_
+#define INCLUDE_PORTABILITY_H_
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif  // _GNU_SOURCE
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS 1
+#endif  // __STDC_FORMAT_MACROS
+
+#ifdef _MSC_VER
+#define CROARING_VISUAL_STUDIO 1
+/**
+ * We want to differentiate carefully between
+ * clang under visual studio and regular visual
+ * studio.
+ */
+#ifdef __clang__
+// clang under visual studio
+#define CROARING_CLANG_VISUAL_STUDIO 1
+#else
+// just regular visual studio (best guess)
+#define CROARING_REGULAR_VISUAL_STUDIO 1
+#endif  // __clang__
+#endif  // _MSC_VER
+#ifndef CROARING_VISUAL_STUDIO
+#define CROARING_VISUAL_STUDIO 0
+#endif
+#ifndef CROARING_CLANG_VISUAL_STUDIO
+#define CROARING_CLANG_VISUAL_STUDIO 0
+#endif
+#ifndef CROARING_REGULAR_VISUAL_STUDIO
+#define CROARING_REGULAR_VISUAL_STUDIO 0
+#endif
+
+#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L)
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif  // !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L)
+#if !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700)
+#define _XOPEN_SOURCE 700
+#endif  // !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700)
+
+#ifdef __illumos__
+#define __EXTENSIONS__
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>  // will provide posix_memalign with _POSIX_C_SOURCE as defined above
+#ifdef __GLIBC__
+#include <malloc.h>  // this should never be needed but there are some reports that it is needed.
+#endif
+
+#ifdef __cplusplus
+extern "C" {  // portability definitions are in global scope, not a namespace
+#endif
+
+#if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8
+#error This code assumes  64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported.
+#endif
+
+#if CROARING_REGULAR_VISUAL_STUDIO
+#ifndef __restrict__
+#define __restrict__ __restrict
+#endif  // __restrict__
+#endif  // CROARING_REGULAR_VISUAL_STUDIO
+
+#if defined(__x86_64__) || defined(_M_X64)
+// we have an x64 processor
+#define CROARING_IS_X64 1
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+// Old visual studio systems won't support AVX2 well.
+#undef CROARING_IS_X64
+#endif
+
+#if defined(__clang_major__) && (__clang_major__ <= 8) && !defined(__AVX2__)
+// Older versions of clang have a bug affecting us
+// https://stackoverflow.com/questions/57228537/how-does-one-use-pragma-clang-attribute-push-with-c-namespaces
+#undef CROARING_IS_X64
+#endif
+
+#ifdef ROARING_DISABLE_X64
+#undef CROARING_IS_X64
+#endif
+// we include the intrinsic header
+#if !CROARING_REGULAR_VISUAL_STUDIO
+/* Non-Microsoft C/C++-compatible compiler */
+#include <x86intrin.h>  // on some recent GCC, this will declare posix_memalign
+
+#if CROARING_CLANG_VISUAL_STUDIO
+
+/**
+ * You are not supposed, normally, to include these
+ * headers directly. Instead you should either include intrin.h
+ * or x86intrin.h. However, when compiling with clang
+ * under Windows (i.e., when _MSC_VER is set), these headers
+ * only get included *if* the corresponding features are detected
+ * from macros:
+ * e.g., if __AVX2__ is set... in turn,  we normally set these
+ * macros by compiling against the corresponding architecture
+ * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
+ * software with these advanced instructions. These headers would
+ * normally guard against such usage, but we carefully included
+ * <x86intrin.h>  (or <intrin.h>) before, so the headers
+ * are fooled.
+ */
+// To avoid reordering imports:
+// clang-format off
+#include <bmiintrin.h>   // for _blsr_u64
+#include <lzcntintrin.h> // for  __lzcnt64
+#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
+#include <smmintrin.h>
+#include <tmmintrin.h>
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <wmmintrin.h>
+#if _MSC_VER >= 1920
+// Important: we need the AVX-512 headers:
+#include <avx512fintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512cdintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512vbmiintrin.h>
+#include <avx512vbmi2intrin.h>
+#include <avx512vpopcntdqintrin.h>
+// clang-format on
+#endif  // _MSC_VER >= 1920
+// unfortunately, we may not get _blsr_u64, but, thankfully, clang
+// has it as a macro.
+#ifndef _blsr_u64
+// we roll our own
+#define _blsr_u64(n) ((n - 1) & n)
+#endif  //  _blsr_u64
+#endif  // SIMDJSON_CLANG_VISUAL_STUDIO
+
+#endif  // CROARING_REGULAR_VISUAL_STUDIO
+#endif  // defined(__x86_64__) || defined(_M_X64)
+
+#if !defined(CROARING_USENEON) && !defined(DISABLENEON) && defined(__ARM_NEON)
+#define CROARING_USENEON
+#endif
+#if defined(CROARING_USENEON)
+#include <arm_neon.h>
+#endif
+
+#if !CROARING_REGULAR_VISUAL_STUDIO
+/* Non-Microsoft C/C++-compatible compiler, assumes that it supports inline
+ * assembly */
+#define CROARING_INLINE_ASM 1
+#endif  // _MSC_VER
+
+#if CROARING_REGULAR_VISUAL_STUDIO
+/* Microsoft C/C++-compatible compiler */
+#include <intrin.h>
+
+#ifndef __clang__  // if one compiles with MSVC *with* clang, then these
+                   // intrinsics are defined!!!
+#define CROARING_INTRINSICS 1
+// sadly there is no way to check whether we are missing these intrinsics
+// specifically.
+
+/* wrappers for Visual Studio built-ins that look like gcc built-ins
+ * __builtin_ctzll */
+/** result might be undefined when input_num is zero */
+inline int roaring_trailing_zeroes(unsigned long long input_num) {
+    unsigned long index;
+#ifdef _WIN64  // highly recommended!!!
+    _BitScanForward64(&index, input_num);
+#else   // if we must support 32-bit Windows
+    if ((uint32_t)input_num != 0) {
+        _BitScanForward(&index, (uint32_t)input_num);
+    } else {
+        _BitScanForward(&index, (uint32_t)(input_num >> 32));
+        index += 32;
+    }
+#endif  // _WIN64
+    return index;
+}
+
+/* wrappers for Visual Studio built-ins that look like gcc built-ins
+ * __builtin_clzll */
+/** result might be undefined when input_num is zero */
+inline int roaring_leading_zeroes(unsigned long long input_num) {
+    unsigned long index;
+#ifdef _WIN64  // highly recommended!!!
+    _BitScanReverse64(&index, input_num);
+#else   // if we must support 32-bit Windows
+    if (input_num > 0xFFFFFFFF) {
+        _BitScanReverse(&index, (uint32_t)(input_num >> 32));
+        index += 32;
+    } else {
+        _BitScanReverse(&index, (uint32_t)(input_num));
+    }
+#endif  // _WIN64
+    return 63 - index;
+}
+
+/* Use #define so this is effective even under /Ob0 (no inline) */
+#define roaring_unreachable __assume(0)
+#endif  // __clang__
+
+#endif  // CROARING_REGULAR_VISUAL_STUDIO
+
+#ifndef CROARING_INTRINSICS
+#define CROARING_INTRINSICS 1
+#define roaring_unreachable __builtin_unreachable()
+/** result might be undefined when input_num is zero */
+inline int roaring_trailing_zeroes(unsigned long long input_num) {
+    return __builtin_ctzll(input_num);
+}
+/** result might be undefined when input_num is zero */
+inline int roaring_leading_zeroes(unsigned long long input_num) {
+    return __builtin_clzll(input_num);
+}
+#endif
+
+#if CROARING_REGULAR_VISUAL_STUDIO
+#define ALIGNED(x) __declspec(align(x))
+#elif defined(__GNUC__) || defined(__clang__)
+#define ALIGNED(x) __attribute__((aligned(x)))
+#else
+#warning "Warning. Unrecognized compiler."
+#define ALIGNED(x)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define CROARING_WARN_UNUSED __attribute__((warn_unused_result))
+#else
+#define CROARING_WARN_UNUSED
+#endif
+
+#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
+
+#ifdef CROARING_USENEON
+// we can always compute the popcount fast.
+#elif (defined(_M_ARM) || defined(_M_ARM64)) && \
+    ((defined(_WIN64) || defined(_WIN32)) &&    \
+     defined(CROARING_REGULAR_VISUAL_STUDIO) && \
+     CROARING_REGULAR_VISUAL_STUDIO)
+// we will need this function:
+static inline int roaring_hamming_backup(uint64_t x) {
+    uint64_t c1 = UINT64_C(0x5555555555555555);
+    uint64_t c2 = UINT64_C(0x3333333333333333);
+    uint64_t c4 = UINT64_C(0x0F0F0F0F0F0F0F0F);
+    x -= (x >> 1) & c1;
+    x = ((x >> 2) & c2) + (x & c2);
+    x = (x + (x >> 4)) & c4;
+    x *= UINT64_C(0x0101010101010101);
+    return x >> 56;
+}
+#endif
+
+static inline int roaring_hamming(uint64_t x) {
+#if defined(_WIN64) && defined(CROARING_REGULAR_VISUAL_STUDIO) && \
+    CROARING_REGULAR_VISUAL_STUDIO
+#ifdef CROARING_USENEON
+    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
+#elif defined(_M_ARM64)
+    return roaring_hamming_backup(x);
+    // (int) _CountOneBits64(x); is unavailable
+#else   // _M_ARM64
+    return (int)__popcnt64(x);
+#endif  // _M_ARM64
+#elif defined(_WIN32) && defined(CROARING_REGULAR_VISUAL_STUDIO) && \
+    CROARING_REGULAR_VISUAL_STUDIO
+#ifdef _M_ARM
+    return roaring_hamming_backup(x);
+    // _CountOneBits is unavailable
+#else   // _M_ARM
+    return (int)__popcnt((unsigned int)x) +
+           (int)__popcnt((unsigned int)(x >> 32));
+#endif  // _M_ARM
+#else
+    return __builtin_popcountll(x);
+#endif
+}
+
+#ifndef UINT64_C
+#define UINT64_C(c) (c##ULL)
+#endif  // UINT64_C
+
+#ifndef UINT32_C
+#define UINT32_C(c) (c##UL)
+#endif  // UINT32_C
+
+#ifdef __cplusplus
+}  // extern "C" {
+#endif  // __cplusplus
+
+// this is almost standard?
+#undef STRINGIFY_IMPLEMENTATION_
+#undef STRINGIFY
+#define STRINGIFY_IMPLEMENTATION_(a) #a
+#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)
+
+// Our fast kernels require 64-bit systems.
+//
+// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
+// Furthermore, the number of SIMD registers is reduced.
+//
+// On 32-bit ARM, we would have smaller registers.
+//
+// The library should still have the fallback kernel. It is
+// slower, but it should run everywhere.
+
+//
+// Enable valid runtime implementations, and select
+// CROARING_BUILTIN_IMPLEMENTATION
+//
+
+// We are going to use runtime dispatch.
+#if CROARING_IS_X64
+#ifdef __clang__
+// clang does not have GCC push pop
+// warning: clang attribute push can't be used within a namespace in clang up
+// til 8.0 so CROARING_TARGET_REGION and CROARING_UNTARGET_REGION must be
+// *outside* of a namespace.
+#define CROARING_TARGET_REGION(T)                                      \
+    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
+                                           apply_to = function)))
+#define CROARING_UNTARGET_REGION _Pragma("clang attribute pop")
+#elif defined(__GNUC__)
+// GCC is easier
+#define CROARING_TARGET_REGION(T) \
+    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
+#define CROARING_UNTARGET_REGION _Pragma("GCC pop_options")
+#endif  // clang then gcc
+
+#endif  // CROARING_IS_X64
+
+// Default target region macros don't do anything.
+#ifndef CROARING_TARGET_REGION
+#define CROARING_TARGET_REGION(T)
+#define CROARING_UNTARGET_REGION
+#endif
+
+#define CROARING_TARGET_AVX2 \
+    CROARING_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt")
+#define CROARING_TARGET_AVX512                                         \
+    CROARING_TARGET_REGION(                                            \
+        "avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512f,avx512dq,avx512bw," \
+        "avx512vbmi2,avx512bitalg,avx512vpopcntdq")
+#define CROARING_UNTARGET_AVX2 CROARING_UNTARGET_REGION
+#define CROARING_UNTARGET_AVX512 CROARING_UNTARGET_REGION
+
+#ifdef __AVX2__
+// No need for runtime dispatching.
+// It is unnecessary and harmful to old clang to tag regions.
+#undef CROARING_TARGET_AVX2
+#define CROARING_TARGET_AVX2
+#undef CROARING_UNTARGET_AVX2
+#define CROARING_UNTARGET_AVX2
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && \
+    defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) &&                  \
+    defined(__AVX512VPOPCNTDQ__)
+// No need for runtime dispatching.
+// It is unnecessary and harmful to old clang to tag regions.
+#undef CROARING_TARGET_AVX512
+#define CROARING_TARGET_AVX512
+#undef CROARING_UNTARGET_AVX512
+#define CROARING_UNTARGET_AVX512
+#endif
+
+// Allow unaligned memory access
+#if defined(__GNUC__) || defined(__clang__)
+#define ALLOW_UNALIGNED __attribute__((no_sanitize("alignment")))
+#else
+#define ALLOW_UNALIGNED
+#endif
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+#define CROARING_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#elif defined(_WIN32)
+#define CROARING_IS_BIG_ENDIAN 0
+#else
+#if defined(__APPLE__) || \
+    defined(__FreeBSD__)  // defined __BYTE_ORDER__ && defined
+                          // __ORDER_BIG_ENDIAN__
+#include <machine/endian.h>
+#elif defined(sun) || \
+    defined(__sun)  // defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/byteorder.h>
+#else  // defined(__APPLE__) || defined(__FreeBSD__)
+
+#ifdef __has_include
+#if __has_include(<endian.h>)
+#include <endian.h>
+#endif  //__has_include(<endian.h>)
+#endif  //__has_include
+
+#endif  // defined(__APPLE__) || defined(__FreeBSD__)
+
+#ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
+#define CROARING_IS_BIG_ENDIAN 0
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CROARING_IS_BIG_ENDIAN 0
+#else  // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CROARING_IS_BIG_ENDIAN 1
+#endif  // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#endif
+
+// Host <-> big endian conversion.
+#if CROARING_IS_BIG_ENDIAN
+#define croaring_htobe64(x) (x)
+
+#elif defined(_WIN32) || defined(_WIN64)  // CROARING_IS_BIG_ENDIAN
+#include <stdlib.h>
+#define croaring_htobe64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)  // CROARING_IS_BIG_ENDIAN
+#include <libkern/OSByteOrder.h>
+#define croaring_htobe64(x) OSSwapInt64(x)
+
+#elif defined(__has_include) && \
+    __has_include(<byteswap.h>)  // CROARING_IS_BIG_ENDIAN
+#include <byteswap.h>
+#define croaring_htobe64(x) __bswap_64(x)
+
+#else  // CROARING_IS_BIG_ENDIAN
+// Gets compiled to bswap or equivalent on most compilers.
+#define croaring_htobe64(x)                                                    \
+    (((x & 0x00000000000000FFULL) << 56) |                                     \
+     ((x & 0x000000000000FF00ULL) << 40) |                                     \
+     ((x & 0x0000000000FF0000ULL) << 24) |                                     \
+     ((x & 0x00000000FF000000ULL) << 8) | ((x & 0x000000FF00000000ULL) >> 8) | \
+     ((x & 0x0000FF0000000000ULL) >> 24) |                                     \
+     ((x & 0x00FF000000000000ULL) >> 40) |                                     \
+     ((x & 0xFF00000000000000ULL) >> 56))
+#endif  // CROARING_IS_BIG_ENDIAN
+#define croaring_be64toh(x) croaring_htobe64(x)
+// End of host <-> big endian conversion.
+
+// Defines for the possible CROARING atomic implementations
+#define CROARING_ATOMIC_IMPL_NONE 1
+#define CROARING_ATOMIC_IMPL_CPP 2
+#define CROARING_ATOMIC_IMPL_C 3
+#define CROARING_ATOMIC_IMPL_C_WINDOWS 4
+
+// If the use has forced a specific implementation, use that, otherwise,
+// figure out the best implementation we can use.
+#if !defined(CROARING_ATOMIC_IMPL)
+#if defined(__cplusplus) && __cplusplus >= 201103L
+#ifdef __has_include
+#if __has_include(<atomic>)
+#define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP
+#endif  //__has_include(<atomic>)
+#else
+   // We lack __has_include to check:
+#define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP
+#endif  //__has_include
+#elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+#define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C
+#elif CROARING_REGULAR_VISUAL_STUDIO
+   // https://www.technetworkhub.com/c11-atomics-in-visual-studio-2022-version-17/
+#define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C_WINDOWS
+#endif
+#endif  // !defined(CROARING_ATOMIC_IMPL)
+
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+#include <stdatomic.h>
+typedef _Atomic(uint32_t) croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    // Increasing the reference counter can always be done with
+    // memory_order_relaxed: New references to an object can only be formed from
+    // an existing reference, and passing an existing reference from one thread
+    // to another must already provide any required synchronization.
+    atomic_fetch_add_explicit(val, 1, memory_order_relaxed);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    // It is important to enforce any possible access to the object in one
+    // thread (through an existing reference) to happen before deleting the
+    // object in a different thread. This is achieved by a "release" operation
+    // after dropping a reference (any access to the object through this
+    // reference must obviously happened before), and an "acquire" operation
+    // before deleting the object.
+    bool is_zero = atomic_fetch_sub_explicit(val, 1, memory_order_release) == 1;
+    if (is_zero) {
+        atomic_thread_fence(memory_order_acquire);
+    }
+    return is_zero;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return atomic_load_explicit(val, memory_order_relaxed);
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP
+#include <atomic>
+typedef std::atomic<uint32_t> croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    val->fetch_add(1, std::memory_order_relaxed);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    // See above comments on the c11 atomic implementation for memory ordering
+    bool is_zero = val->fetch_sub(1, std::memory_order_release) == 1;
+    if (is_zero) {
+        std::atomic_thread_fence(std::memory_order_acquire);
+    }
+    return is_zero;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return val->load(std::memory_order_relaxed);
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C_WINDOWS
+#include <intrin.h>
+#pragma intrinsic(_InterlockedIncrement)
+#pragma intrinsic(_InterlockedDecrement)
+
+// _InterlockedIncrement and _InterlockedDecrement take a (signed) long, and
+// overflow is defined to wrap, so we can pretend it is a uint32_t for our case
+typedef volatile long croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    _InterlockedIncrement(val);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    return _InterlockedDecrement(val) == 0;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    // Per
+    // https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access
+    // > Simple reads and writes to properly-aligned 32-bit variables are atomic
+    // > operations. In other words, you will not end up with only one portion
+    // > of the variable updated; all bits are updated in an atomic fashion.
+    return *val;
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_NONE
+#include <assert.h>
+typedef uint32_t croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    *val += 1;
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    assert(*val > 0);
+    *val -= 1;
+    return val == 0;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return *val;
+}
+#else
+#error "Unknown atomic implementation"
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define CROARING_DEPRECATED __attribute__((deprecated))
+#else
+#define CROARING_DEPRECATED
+#endif  // defined(__GNUC__) || defined(__clang__)
+
+// We need portability.h to be included first,
+// but we also always want isadetection.h to be
+// included (right after).
+// See https://github.com/RoaringBitmap/CRoaring/issues/394
+// There is no scenario where we want portability.h to
+// be included, but not isadetection.h: the latter is a
+// strict requirement.
+#endif                             /* INCLUDE_PORTABILITY_H_ */
+/* end file include/roaring/portability.h */
+/* begin file include/roaring/bitset/bitset.h */
+#ifndef CBITSET_BITSET_H
+#define CBITSET_BITSET_H
+
+// For compatibility with MSVC with the use of `restrict`
+#if (__STDC_VERSION__ >= 199901L) || \
+    (defined(__GNUC__) && defined(__STDC_VERSION__))
+#define CBITSET_RESTRICT restrict
+#else
+#define CBITSET_RESTRICT
+#endif  // (__STDC_VERSION__ >= 199901L) || (defined(__GNUC__) &&
+        // defined(__STDC_VERSION__ ))
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+struct bitset_s {
+    uint64_t *CBITSET_RESTRICT array;
+    /* For simplicity and performance, we prefer to have a size and a capacity
+     * that is a multiple of 64 bits. Thus we only track the size and the
+     * capacity in terms of 64-bit words allocated */
+    size_t arraysize;
+    size_t capacity;
+};
+
+typedef struct bitset_s bitset_t;
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_t *bitset_create(void);
+
+/* Create a new bitset able to contain size bits. Return NULL in case of
+ * failure. */
+bitset_t *bitset_create_with_capacity(size_t size);
+
+/* Free memory. */
+void bitset_free(bitset_t *bitset);
+
+/* Set all bits to zero. */
+void bitset_clear(bitset_t *bitset);
+
+/* Set all bits to one. */
+void bitset_fill(bitset_t *bitset);
+
+/* Create a copy */
+bitset_t *bitset_copy(const bitset_t *bitset);
+
+/* For advanced users: Resize the bitset so that it can support newarraysize *
+ * 64 bits. Return true in case of success, false for failure. Pad with zeroes
+ * new buffer areas if requested. */
+bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes);
+
+/* returns how many bytes of memory the backend buffer uses */
+inline size_t bitset_size_in_bytes(const bitset_t *bitset) {
+    return bitset->arraysize * sizeof(uint64_t);
+}
+
+/* returns how many bits can be accessed */
+inline size_t bitset_size_in_bits(const bitset_t *bitset) {
+    return bitset->arraysize * 64;
+}
+
+/* returns how many words (64-bit) of memory the backend buffer uses */
+inline size_t bitset_size_in_words(const bitset_t *bitset) {
+    return bitset->arraysize;
+}
+
+/* For advanced users: Grow the bitset so that it can support newarraysize * 64
+ * bits with padding. Return true in case of success, false for failure. */
+bool bitset_grow(bitset_t *bitset, size_t newarraysize);
+
+/* attempts to recover unused memory, return false in case of
+ * roaring_reallocation failure */
+bool bitset_trim(bitset_t *bitset);
+
+/* shifts all bits by 's' positions so that the bitset representing values
+ * 1,2,10 would represent values 1+s, 2+s, 10+s */
+void bitset_shift_left(bitset_t *bitset, size_t s);
+
+/* shifts all bits by 's' positions so that the bitset representing values
+ * 1,2,10 would represent values 1-s, 2-s, 10-s, negative values are deleted */
+void bitset_shift_right(bitset_t *bitset, size_t s);
+
+/* Set the ith bit. Attempts to resize the bitset if needed (may silently fail)
+ */
+inline void bitset_set(bitset_t *bitset, size_t i) {
+    size_t shiftedi = i / 64;
+    if (shiftedi >= bitset->arraysize) {
+        if (!bitset_grow(bitset, shiftedi + 1)) {
+            return;
+        }
+    }
+    bitset->array[shiftedi] |= ((uint64_t)1) << (i % 64);
+}
+
+/* Set the ith bit to the specified value. Attempts to resize the bitset if
+ * needed (may silently fail) */
+inline void bitset_set_to_value(bitset_t *bitset, size_t i, bool flag) {
+    size_t shiftedi = i / 64;
+    uint64_t mask = ((uint64_t)1) << (i % 64);
+    uint64_t dynmask = ((uint64_t)flag) << (i % 64);
+    if (shiftedi >= bitset->arraysize) {
+        if (!bitset_grow(bitset, shiftedi + 1)) {
+            return;
+        }
+    }
+    uint64_t w = bitset->array[shiftedi];
+    w &= ~mask;
+    w |= dynmask;
+    bitset->array[shiftedi] = w;
+}
+
+/* Get the value of the ith bit.  */
+inline bool bitset_get(const bitset_t *bitset, size_t i) {
+    size_t shiftedi = i / 64;
+    if (shiftedi >= bitset->arraysize) {
+        return false;
+    }
+    return (bitset->array[shiftedi] & (((uint64_t)1) << (i % 64))) != 0;
+}
+
+/* Count number of bits set.  */
+size_t bitset_count(const bitset_t *bitset);
+
+/* Find the index of the first bit set. Or zero if the bitset is empty.  */
+size_t bitset_minimum(const bitset_t *bitset);
+
+/* Find the index of the last bit set. Or zero if the bitset is empty.  */
+size_t bitset_maximum(const bitset_t *bitset);
+
+/* compute the union in-place (to b1), returns true if successful, to generate a
+ * new bitset first call bitset_copy */
+bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2);
+
+/* report the size of the union (without materializing it) */
+size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the intersection in-place (to b1), to generate a new bitset first
+ * call bitset_copy */
+void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2);
+
+/* report the size of the intersection (without materializing it) */
+size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if the bitsets contain no common elements */
+bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1,
+                      const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if the bitsets contain any common elements */
+bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1,
+                       const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if b1 contains all of the set bits of b2 */
+bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1,
+                         const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the difference in-place (to b1), to generate a new bitset first call
+ * bitset_copy */
+void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the size of the difference */
+size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the symmetric difference in-place (to b1), return true if successful,
+ * to generate a new bitset first call bitset_copy */
+bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the size of the symmetric difference  */
+size_t bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2);
+
+/* iterate over the set bits
+ like so :
+  for(size_t i = 0; bitset_next_set_bit(b,&i) ; i++) {
+    //.....
+  }
+  */
+inline bool bitset_next_set_bit(const bitset_t *bitset, size_t *i) {
+    size_t x = *i / 64;
+    if (x >= bitset->arraysize) {
+        return false;
+    }
+    uint64_t w = bitset->array[x];
+    w >>= (*i & 63);
+    if (w != 0) {
+        *i += roaring_trailing_zeroes(w);
+        return true;
+    }
+    x++;
+    while (x < bitset->arraysize) {
+        w = bitset->array[x];
+        if (w != 0) {
+            *i = x * 64 + roaring_trailing_zeroes(w);
+            return true;
+        }
+        x++;
+    }
+    return false;
+}
+
+/* iterate over the set bits
+ like so :
+   size_t buffer[256];
+   size_t howmany = 0;
+  for(size_t startfrom = 0; (howmany = bitset_next_set_bits(b,buffer,256,
+ &startfrom)) > 0 ; startfrom++) {
+    //.....
+  }
+  */
+inline size_t bitset_next_set_bits(const bitset_t *bitset, size_t *buffer,
+                                   size_t capacity, size_t *startfrom) {
+    if (capacity == 0) return 0;  // sanity check
+    size_t x = *startfrom / 64;
+    if (x >= bitset->arraysize) {
+        return 0;  // nothing more to iterate over
+    }
+    uint64_t w = bitset->array[x];
+    w >>= (*startfrom & 63);
+    size_t howmany = 0;
+    size_t base = x << 6;
+    while (howmany < capacity) {
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);
+            int r = roaring_trailing_zeroes(w);
+            buffer[howmany++] = r + base;
+            if (howmany == capacity) goto end;
+            w ^= t;
+        }
+        x += 1;
+        if (x == bitset->arraysize) {
+            break;
+        }
+        base += 64;
+        w = bitset->array[x];
+    }
+end:
+    if (howmany > 0) {
+        *startfrom = buffer[howmany - 1];
+    }
+    return howmany;
+}
+
+typedef bool (*bitset_iterator)(size_t value, void *param);
+
+// return true if uninterrupted
+inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator,
+                            void *ptr) {
+    size_t base = 0;
+    for (size_t i = 0; i < b->arraysize; ++i) {
+        uint64_t w = b->array[i];
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);
+            int r = roaring_trailing_zeroes(w);
+            if (!iterator(r + base, ptr)) return false;
+            w ^= t;
+        }
+        base += 64;
+    }
+    return true;
+}
+
+inline void bitset_print(const bitset_t *b) {
+    printf("{");
+    for (size_t i = 0; bitset_next_set_bit(b, &i); i++) {
+        printf("%zu, ", i);
+    }
+    printf("}");
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace api {
+#endif
+
+#endif
+/* end file include/roaring/bitset/bitset.h */
+/* begin file include/roaring/roaring.h */
+/*
+ * An implementation of Roaring Bitmaps in C.
+ */
+
+#ifndef ROARING_H
+#define ROARING_H
+
+#include <stdbool.h>
+#include <stddef.h>  // for `size_t`
+#include <stdint.h>
+
+
+// Include other headers after roaring_types.h
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+typedef struct roaring_bitmap_s {
+    roaring_array_t high_low_container;
+} roaring_bitmap_t;
+
+/**
+ * Dynamically allocates a new bitmap (initially empty).
+ * Returns NULL if the allocation fails.
+ * Capacity is a performance hint for how many "containers" the data will need.
+ * Client is responsible for calling `roaring_bitmap_free()`.
+ */
+roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap);
+
+/**
+ * Dynamically allocates a new bitmap (initially empty).
+ * Returns NULL if the allocation fails.
+ * Client is responsible for calling `roaring_bitmap_free()`.
+ */
+inline roaring_bitmap_t *roaring_bitmap_create(void) {
+    return roaring_bitmap_create_with_capacity(0);
+}
+
+/**
+ * Initialize a roaring bitmap structure in memory controlled by client.
+ * Capacity is a performance hint for how many "containers" the data will need.
+ * Can return false if auxiliary allocations fail when capacity greater than 0.
+ */
+bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap);
+
+/**
+ * Initialize a roaring bitmap structure in memory controlled by client.
+ * The bitmap will be in a "clear" state, with no auxiliary allocations.
+ * Since this performs no allocations, the function will not fail.
+ */
+inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r) {
+    roaring_bitmap_init_with_capacity(r, 0);
+}
+
+/**
+ * Add all the values between min (included) and max (excluded) that are at a
+ * distance k*step from min.
+ */
+roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
+                                            uint32_t step);
+
+/**
+ * Creates a new bitmap from a pointer of uint32_t integers
+ */
+roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals);
+
+/*
+ * Whether you want to use copy-on-write.
+ * Saves memory and avoids copies, but needs more care in a threaded context.
+ * Most users should ignore this flag.
+ *
+ * Note: If you do turn this flag to 'true', enabling COW, then ensure that you
+ * do so for all of your bitmaps, since interactions between bitmaps with and
+ * without COW is unsafe.
+ */
+inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t *r) {
+    return r->high_low_container.flags & ROARING_FLAG_COW;
+}
+inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t *r, bool cow) {
+    if (cow) {
+        r->high_low_container.flags |= ROARING_FLAG_COW;
+    } else {
+        r->high_low_container.flags &= ~ROARING_FLAG_COW;
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_add_offset(const roaring_bitmap_t *bm,
+                                            int64_t offset);
+/**
+ * Describe the inner structure of the bitmap.
+ */
+void roaring_bitmap_printf_describe(const roaring_bitmap_t *r);
+
+/**
+ * Creates a new bitmap from a list of uint32_t integers
+ *
+ * This function is deprecated, use `roaring_bitmap_from` instead, which
+ * doesn't require the number of elements to be passed in.
+ *
+ * @see roaring_bitmap_from
+ */
+CROARING_DEPRECATED roaring_bitmap_t *roaring_bitmap_of(size_t n, ...);
+
+#ifdef __cplusplus
+/**
+ * Creates a new bitmap which contains all values passed in as arguments.
+ *
+ * To create a bitmap from a variable number of arguments, use the
+ * `roaring_bitmap_of_ptr` function instead.
+ */
+// Use an immediately invoked closure, capturing by reference
+// (in case __VA_ARGS__ refers to context outside the closure)
+// Include a 0 at the beginning of the array to make the array length > 0
+// (zero sized arrays are not valid in standard c/c++)
+#define roaring_bitmap_from(...)                                              \
+    [&]() {                                                                   \
+        const uint32_t roaring_bitmap_from_array[] = {0, __VA_ARGS__};        \
+        return roaring_bitmap_of_ptr((sizeof(roaring_bitmap_from_array) /     \
+                                      sizeof(roaring_bitmap_from_array[0])) - \
+                                         1,                                   \
+                                     &roaring_bitmap_from_array[1]);          \
+    }()
+#else
+/**
+ * Creates a new bitmap which contains all values passed in as arguments.
+ *
+ * To create a bitmap from a variable number of arguments, use the
+ * `roaring_bitmap_of_ptr` function instead.
+ */
+// While __VA_ARGS__ occurs twice in expansion, one of the times is in a sizeof
+// expression, which is an unevaluated context, so it's even safe in the case
+// where expressions passed have side effects (roaring64_bitmap_from(my_func(),
+// ++i))
+// Include a 0 at the beginning of the array to make the array length > 0
+// (zero sized arrays are not valid in standard c/c++)
+#define roaring_bitmap_from(...)                                             \
+    roaring_bitmap_of_ptr(                                                   \
+        (sizeof((const uint32_t[]){0, __VA_ARGS__}) / sizeof(uint32_t)) - 1, \
+        &((const uint32_t[]){0, __VA_ARGS__})[1])
+#endif
+
+/**
+ * Copies a bitmap (this does memory allocation).
+ * The caller is responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r);
+
+/**
+ * Copies a bitmap from src to dest. It is assumed that the pointer dest
+ * is to an already allocated bitmap. The content of the dest bitmap is
+ * freed/deleted.
+ *
+ * It might be preferable and simpler to call roaring_bitmap_copy except
+ * that roaring_bitmap_overwrite can save on memory allocations.
+ *
+ * Returns true if successful, or false if there was an error. On failure,
+ * the dest bitmap is left in a valid, empty state (even if it was not empty
+ * before).
+ */
+bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
+                              const roaring_bitmap_t *src);
+
+/**
+ * Print the content of the bitmap.
+ */
+void roaring_bitmap_printf(const roaring_bitmap_t *r);
+
+/**
+ * Computes the intersection between two bitmaps and returns new bitmap. The
+ * caller is responsible for memory management.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+ * You may also rely on roaring_bitmap_and_inplace to avoid creating
+ * many temporary bitmaps.
+ */
+roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *r1,
+                                     const roaring_bitmap_t *r2);
+
+/**
+ * Computes the size of the intersection between two bitmaps.
+ */
+uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *r1,
+                                        const roaring_bitmap_t *r2);
+
+/**
+ * Check whether two bitmaps intersect.
+ */
+bool roaring_bitmap_intersect(const roaring_bitmap_t *r1,
+                              const roaring_bitmap_t *r2);
+
+/**
+ * Check whether a bitmap and an open range intersect.
+ */
+bool roaring_bitmap_intersect_with_range(const roaring_bitmap_t *bm, uint64_t x,
+                                         uint64_t y);
+
+/**
+ * Computes the Jaccard index between two bitmaps. (Also known as the Tanimoto
+ * distance, or the Jaccard similarity coefficient)
+ *
+ * The Jaccard index is undefined if both bitmaps are empty.
+ */
+double roaring_bitmap_jaccard_index(const roaring_bitmap_t *r1,
+                                    const roaring_bitmap_t *r2);
+
+/**
+ * Computes the size of the union between two bitmaps.
+ */
+uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *r1,
+                                       const roaring_bitmap_t *r2);
+
+/**
+ * Computes the size of the difference (andnot) between two bitmaps.
+ */
+uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *r1,
+                                           const roaring_bitmap_t *r2);
+
+/**
+ * Computes the size of the symmetric difference (xor) between two bitmaps.
+ */
+uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *r1,
+                                        const roaring_bitmap_t *r2);
+
+/**
+ * Inplace version of `roaring_bitmap_and()`, modifies r1
+ * r1 == r2 is allowed.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+ */
+void roaring_bitmap_and_inplace(roaring_bitmap_t *r1,
+                                const roaring_bitmap_t *r2);
+
+/**
+ * Computes the union between two bitmaps and returns new bitmap. The caller is
+ * responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *r1,
+                                    const roaring_bitmap_t *r2);
+
+/**
+ * Inplace version of `roaring_bitmap_or(), modifies r1.
+ * TODO: decide whether r1 == r2 ok
+ */
+void roaring_bitmap_or_inplace(roaring_bitmap_t *r1,
+                               const roaring_bitmap_t *r2);
+
+/**
+ * Compute the union of 'number' bitmaps.
+ * Caller is responsible for freeing the result.
+ * See also `roaring_bitmap_or_many_heap()`
+ */
+roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
+                                         const roaring_bitmap_t **rs);
+
+/**
+ * Compute the union of 'number' bitmaps using a heap. This can sometimes be
+ * faster than `roaring_bitmap_or_many() which uses a naive algorithm.
+ * Caller is responsible for freeing the result.
+ */
+roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
+                                              const roaring_bitmap_t **rs);
+
+/**
+ * Computes the symmetric difference (xor) between two bitmaps
+ * and returns new bitmap. The caller is responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *r1,
+                                     const roaring_bitmap_t *r2);
+
+/**
+ * Inplace version of roaring_bitmap_xor, modifies r1, r1 != r2.
+ */
+void roaring_bitmap_xor_inplace(roaring_bitmap_t *r1,
+                                const roaring_bitmap_t *r2);
+
+/**
+ * Compute the xor of 'number' bitmaps.
+ * Caller is responsible for freeing the result.
+ */
+roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
+                                          const roaring_bitmap_t **rs);
+
+/**
+ * Computes the difference (andnot) between two bitmaps and returns new bitmap.
+ * Caller is responsible for freeing the result.
+ */
+roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *r1,
+                                        const roaring_bitmap_t *r2);
+
+/**
+ * Inplace version of roaring_bitmap_andnot, modifies r1, r1 != r2.
+ */
+void roaring_bitmap_andnot_inplace(roaring_bitmap_t *r1,
+                                   const roaring_bitmap_t *r2);
+
+/**
+ * TODO: consider implementing:
+ *
+ * "Compute the xor of 'number' bitmaps using a heap. This can sometimes be
+ *  faster than roaring_bitmap_xor_many which uses a naive algorithm. Caller is
+ *  responsible for freeing the result.""
+ *
+ * roaring_bitmap_t *roaring_bitmap_xor_many_heap(uint32_t number,
+ *                                                const roaring_bitmap_t **rs);
+ */
+
+/**
+ * Frees the memory.
+ */
+void roaring_bitmap_free(const roaring_bitmap_t *r);
+
+/**
+ * A bit of context usable with `roaring_bitmap_*_bulk()` functions
+ *
+ * Should be initialized with `{0}` (or `memset()` to all zeros).
+ * Callers should treat it as an opaque type.
+ *
+ * A context may only be used with a single bitmap
+ * (unless re-initialized to zero), and any modification to a bitmap
+ * (other than modifications performed with `_bulk()` functions with the context
+ * passed) will invalidate any contexts associated with that bitmap.
+ */
+typedef struct roaring_bulk_context_s {
+    ROARING_CONTAINER_T *container;
+    int idx;
+    uint16_t key;
+    uint8_t typecode;
+} roaring_bulk_context_t;
+
+/**
+ * Add an item, using context from a previous insert for speed optimization.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than `-bulk` suffixed functions)
+ * will invalidate the stored context, calling this function with a non-zero
+ * context after doing any modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same "key" (high 16 bits of the value) consecutively.
+ */
+void roaring_bitmap_add_bulk(roaring_bitmap_t *r,
+                             roaring_bulk_context_t *context, uint32_t val);
+
+/**
+ * Add value n_args from pointer vals, faster than repeatedly calling
+ * `roaring_bitmap_add()`
+ *
+ * In order to exploit this optimization, the caller should attempt to keep
+ * values with the same "key" (high 16 bits of the value) as consecutive
+ * elements in `vals`
+ */
+void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
+                             const uint32_t *vals);
+
+/**
+ * Add value x
+ */
+void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Add value x
+ * Returns true if a new value was added, false if the value already existed.
+ */
+bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Add all values in range [min, max]
+ */
+void roaring_bitmap_add_range_closed(roaring_bitmap_t *r, uint32_t min,
+                                     uint32_t max);
+
+/**
+ * Add all values in range [min, max)
+ */
+inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min,
+                                     uint64_t max) {
+    if (max <= min) return;
+    roaring_bitmap_add_range_closed(r, (uint32_t)min, (uint32_t)(max - 1));
+}
+
+/**
+ * Remove value x
+ */
+void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Remove all values in range [min, max]
+ */
+void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r, uint32_t min,
+                                        uint32_t max);
+
+/**
+ * Remove all values in range [min, max)
+ */
+inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, uint64_t min,
+                                        uint64_t max) {
+    if (max <= min) return;
+    roaring_bitmap_remove_range_closed(r, (uint32_t)min, (uint32_t)(max - 1));
+}
+
+/**
+ * Remove multiple values
+ */
+void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
+                                const uint32_t *vals);
+
+/**
+ * Remove value x
+ * Returns true if a new value was removed, false if the value was not existing.
+ */
+bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Check if value is present
+ */
+bool roaring_bitmap_contains(const roaring_bitmap_t *r, uint32_t val);
+
+/**
+ * Check whether a range of values from range_start (included)
+ * to range_end (excluded) is present
+ */
+bool roaring_bitmap_contains_range(const roaring_bitmap_t *r,
+                                   uint64_t range_start, uint64_t range_end);
+
+/**
+ * Check if an items is present, using context from a previous insert or search
+ * for speed optimization.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than `-bulk` suffixed functions)
+ * will invalidate the stored context, calling this function with a non-zero
+ * context after doing any modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same "key" (high 16 bits of the value) consecutively.
+ */
+bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r,
+                                  roaring_bulk_context_t *context,
+                                  uint32_t val);
+
+/**
+ * Get the cardinality of the bitmap (number of elements).
+ */
+uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *r);
+
+/**
+ * Returns the number of elements in the range [range_start, range_end).
+ */
+uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *r,
+                                          uint64_t range_start,
+                                          uint64_t range_end);
+
+/**
+ * Returns true if the bitmap is empty (cardinality is zero).
+ */
+bool roaring_bitmap_is_empty(const roaring_bitmap_t *r);
+
+/**
+ * Empties the bitmap.  It will have no auxiliary allocations (so if the bitmap
+ * was initialized in client memory via roaring_bitmap_init(), then a call to
+ * roaring_bitmap_clear() would be enough to "free" it)
+ */
+void roaring_bitmap_clear(roaring_bitmap_t *r);
+
+/**
+ * Convert the bitmap to a sorted array, output in `ans`.
+ *
+ * Caller is responsible to ensure that there is enough memory allocated, e.g.
+ *
+ *     ans = malloc(roaring_bitmap_get_cardinality(bitmap) * sizeof(uint32_t));
+ */
+void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *r, uint32_t *ans);
+
+/**
+ * Store the bitmap to a bitset. This can be useful for people
+ * who need the performance and simplicity of a standard bitset.
+ * We assume that the input bitset is originally empty (does not
+ * have any set bit).
+ *
+ *   bitset_t * out = bitset_create();
+ *   // if the bitset has content in it, call "bitset_clear(out)"
+ *   bool success = roaring_bitmap_to_bitset(mybitmap, out);
+ *   // on failure, success will be false.
+ *   // You can then query the bitset:
+ *   bool is_present = bitset_get(out,  10011 );
+ *   // you must free the memory:
+ *   bitset_free(out);
+ *
+ */
+bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t *bitset);
+
+/**
+ * Convert the bitmap to a sorted array from `offset` by `limit`, output in
+ * `ans`.
+ *
+ * Caller is responsible to ensure that there is enough memory allocated, e.g.
+ *
+ *     ans = malloc(roaring_bitmap_get_cardinality(limit) * sizeof(uint32_t));
+ *
+ * Return false in case of failure (e.g., insufficient memory)
+ */
+bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *r, size_t offset,
+                                       size_t limit, uint32_t *ans);
+
+/**
+ * Remove run-length encoding even when it is more space efficient.
+ * Return whether a change was applied.
+ */
+bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r);
+
+/**
+ * Convert array and bitmap containers to run containers when it is more
+ * efficient; also convert from run containers when more space efficient.
+ *
+ * Returns true if the result has at least one run container.
+ * Additional savings might be possible by calling `shrinkToFit()`.
+ */
+bool roaring_bitmap_run_optimize(roaring_bitmap_t *r);
+
+/**
+ * If needed, reallocate memory to shrink the memory usage.
+ * Returns the number of bytes saved.
+ */
+size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r);
+
+/**
+ * Write the bitmap to an output pointer, this output buffer should refer to
+ * at least `roaring_bitmap_size_in_bytes(r)` allocated bytes.
+ *
+ * See `roaring_bitmap_portable_serialize()` if you want a format that's
+ * compatible with Java and Go implementations.  This format can sometimes be
+ * more space efficient than the portable form, e.g. when the data is sparse.
+ *
+ * Returns how many bytes written, should be `roaring_bitmap_size_in_bytes(r)`.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf);
+
+/**
+ * Use with `roaring_bitmap_serialize()`.
+ *
+ * (See `roaring_bitmap_portable_deserialize()` if you want a format that's
+ * compatible with Java and Go implementations).
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf);
+
+/**
+ * Use with `roaring_bitmap_serialize()`.
+ *
+ * (See `roaring_bitmap_portable_deserialize_safe()` if you want a format that's
+ * compatible with Java and Go implementations).
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ *
+ * The difference with `roaring_bitmap_deserialize()` is that this function
+ * checks that the input buffer is a valid bitmap.  If the buffer is too small,
+ * NULL is returned.
+ */
+roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf,
+                                                  size_t maxbytes);
+
+/**
+ * How many bytes are required to serialize this bitmap (NOT compatible
+ * with Java and Go versions)
+ */
+size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r);
+
+/**
+ * Read bitmap from a serialized buffer.
+ * In case of failure, NULL is returned.
+ *
+ * This function is unsafe in the sense that if there is no valid serialized
+ * bitmap at the pointer, then many bytes could be read, possibly causing a
+ * buffer overflow.  See also roaring_bitmap_portable_deserialize_safe().
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf);
+
+/**
+ * Read bitmap from a serialized buffer safely (reading up to maxbytes).
+ * In case of failure, NULL is returned.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * The function itself is safe in the sense that it will not cause buffer
+ * overflows. However, for correct operations, it is assumed that the bitmap
+ * read was once serialized from a valid bitmap (i.e., it follows the format
+ * specification). If you provided an incorrect input (garbage), then the bitmap
+ * read may not be in a valid state and following operations may not lead to
+ * sensible results. In particular, the serialized array containers need to be
+ * in sorted order, and the run containers should be in sorted non-overlapping
+ * order. This is is guaranteed to happen when serializing an existing bitmap,
+ * but not for random inputs.
+ *
+ * You may use roaring_bitmap_internal_validate to check the validity of the
+ * bitmap prior to using it. You may also use other strategies to check for
+ * corrupted inputs (e.g., checksums).
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf,
+                                                           size_t maxbytes);
+
+/**
+ * Read bitmap from a serialized buffer.
+ * In case of failure, NULL is returned.
+ *
+ * Bitmap returned by this function can be used in all readonly contexts.
+ * Bitmap must be freed as usual, by calling roaring_bitmap_free().
+ * Underlying buffer must not be freed or modified while it backs any bitmaps.
+ *
+ * The function is unsafe in the following ways:
+ * 1) It may execute unaligned memory accesses.
+ * 2) A buffer overflow may occur if buf does not point to a valid serialized
+ *    bitmap.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf);
+
+/**
+ * Check how many bytes would be read (up to maxbytes) at this pointer if there
+ * is a bitmap, returns zero if there is no valid bitmap.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ */
+size_t roaring_bitmap_portable_deserialize_size(const char *buf,
+                                                size_t maxbytes);
+
+/**
+ * How many bytes are required to serialize this bitmap.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ */
+size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r);
+
+/**
+ * Write a bitmap to a char buffer.  The output buffer should refer to at least
+ * `roaring_bitmap_portable_size_in_bytes(r)` bytes of allocated memory.
+ *
+ * Returns how many bytes were written which should match
+ * `roaring_bitmap_portable_size_in_bytes(r)`.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf);
+
+/*
+ * "Frozen" serialization format imitates memory layout of roaring_bitmap_t.
+ * Deserialized bitmap is a constant view of the underlying buffer.
+ * This significantly reduces amount of allocations and copying required during
+ * deserialization.
+ * It can be used with memory mapped files.
+ * Example can be found in benchmarks/frozen_benchmark.c
+ *
+ *         [#####] const roaring_bitmap_t *
+ *          | | |
+ *     +----+ | +-+
+ *     |      |   |
+ * [#####################################] underlying buffer
+ *
+ * Note that because frozen serialization format imitates C memory layout
+ * of roaring_bitmap_t, it is not fixed. It is different on big/little endian
+ * platforms and can be changed in future.
+ */
+
+/**
+ * Returns number of bytes required to serialize bitmap using frozen format.
+ */
+size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r);
+
+/**
+ * Serializes bitmap using frozen format.
+ * Buffer size must be at least roaring_bitmap_frozen_size_in_bytes().
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf);
+
+/**
+ * Creates constant bitmap that is a view of a given buffer.
+ * Buffer data should have been written by `roaring_bitmap_frozen_serialize()`
+ * Its beginning must also be aligned by 32 bytes.
+ * Length must be equal exactly to `roaring_bitmap_frozen_size_in_bytes()`.
+ * In case of failure, NULL is returned.
+ *
+ * Bitmap returned by this function can be used in all readonly contexts.
+ * Bitmap must be freed as usual, by calling roaring_bitmap_free().
+ * Underlying buffer must not be freed or modified while it backs any bitmaps.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf,
+                                                   size_t length);
+
+/**
+ * Iterate over the bitmap elements. The function iterator is called once for
+ * all the values with ptr (can be NULL) as the second parameter of each call.
+ *
+ * `roaring_iterator` is simply a pointer to a function that returns bool
+ * (true means that the iteration should continue while false means that it
+ * should stop), and takes (uint32_t,void*) as inputs.
+ *
+ * Returns true if the roaring_iterator returned true throughout (so that all
+ * data points were necessarily visited).
+ *
+ * Iteration is ordered: from the smallest to the largest elements.
+ */
+bool roaring_iterate(const roaring_bitmap_t *r, roaring_iterator iterator,
+                     void *ptr);
+
+bool roaring_iterate64(const roaring_bitmap_t *r, roaring_iterator64 iterator,
+                       uint64_t high_bits, void *ptr);
+
+/**
+ * Return true if the two bitmaps contain the same elements.
+ */
+bool roaring_bitmap_equals(const roaring_bitmap_t *r1,
+                           const roaring_bitmap_t *r2);
+
+/**
+ * Return true if all the elements of r1 are also in r2.
+ */
+bool roaring_bitmap_is_subset(const roaring_bitmap_t *r1,
+                              const roaring_bitmap_t *r2);
+
+/**
+ * Return true if all the elements of r1 are also in r2, and r2 is strictly
+ * greater than r1.
+ */
+bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *r1,
+                                     const roaring_bitmap_t *r2);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Computes the union between two bitmaps and returns new bitmap. The caller is
+ * responsible for memory management.
+ *
+ * The lazy version defers some computations such as the maintenance of the
+ * cardinality counts. Thus you must call `roaring_bitmap_repair_after_lazy()`
+ * after executing "lazy" computations.
+ *
+ * It is safe to repeatedly call roaring_bitmap_lazy_or_inplace on the result.
+ *
+ * `bitsetconversion` is a flag which determines whether container-container
+ * operations force a bitset conversion.
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *r1,
+                                         const roaring_bitmap_t *r2,
+                                         const bool bitsetconversion);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Inplace version of roaring_bitmap_lazy_or, modifies r1.
+ *
+ * `bitsetconversion` is a flag which determines whether container-container
+ * operations force a bitset conversion.
+ */
+void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *r1,
+                                    const roaring_bitmap_t *r2,
+                                    const bool bitsetconversion);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Execute maintenance on a bitmap created from `roaring_bitmap_lazy_or()`
+ * or modified with `roaring_bitmap_lazy_or_inplace()`.
+ */
+void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *r1);
+
+/**
+ * Computes the symmetric difference between two bitmaps and returns new bitmap.
+ * The caller is responsible for memory management.
+ *
+ * The lazy version defers some computations such as the maintenance of the
+ * cardinality counts. Thus you must call `roaring_bitmap_repair_after_lazy()`
+ * after executing "lazy" computations.
+ *
+ * It is safe to repeatedly call `roaring_bitmap_lazy_xor_inplace()` on
+ * the result.
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *r1,
+                                          const roaring_bitmap_t *r2);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Inplace version of roaring_bitmap_lazy_xor, modifies r1. r1 != r2
+ */
+void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *r1,
+                                     const roaring_bitmap_t *r2);
+
+/**
+ * Compute the negation of the bitmap in the interval [range_start, range_end).
+ * The number of negated values is range_end - range_start.
+ * Areas outside the range are passed through unchanged.
+ */
+roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *r1,
+                                      uint64_t range_start, uint64_t range_end);
+
+/**
+ * compute (in place) the negation of the roaring bitmap within a specified
+ * interval: [range_start, range_end). The number of negated values is
+ * range_end - range_start.
+ * Areas outside the range are passed through unchanged.
+ */
+void roaring_bitmap_flip_inplace(roaring_bitmap_t *r1, uint64_t range_start,
+                                 uint64_t range_end);
+
+/**
+ * Selects the element at index 'rank' where the smallest element is at index 0.
+ * If the size of the roaring bitmap is strictly greater than rank, then this
+ * function returns true and sets element to the element of given rank.
+ * Otherwise, it returns false.
+ */
+bool roaring_bitmap_select(const roaring_bitmap_t *r, uint32_t rank,
+                           uint32_t *element);
+
+/**
+ * roaring_bitmap_rank returns the number of integers that are smaller or equal
+ * to x. Thus if x is the first element, this function will return 1. If
+ * x is smaller than the smallest element, this function will return 0.
+ *
+ * The indexing convention differs between roaring_bitmap_select and
+ * roaring_bitmap_rank: roaring_bitmap_select refers to the smallest value
+ * as having index 0, whereas roaring_bitmap_rank returns 1 when ranking
+ * the smallest value.
+ */
+uint64_t roaring_bitmap_rank(const roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * roaring_bitmap_rank_many is an `Bulk` version of `roaring_bitmap_rank`
+ * it puts rank value of each element in `[begin .. end)` to `ans[]`
+ *
+ * the values in `[begin .. end)` must be sorted in Ascending order;
+ * Caller is responsible to ensure that there is enough memory allocated, e.g.
+ *
+ *     ans = malloc((end-begin) * sizeof(uint64_t));
+ */
+void roaring_bitmap_rank_many(const roaring_bitmap_t *r, const uint32_t *begin,
+                              const uint32_t *end, uint64_t *ans);
+
+/**
+ * Returns the index of x in the given roaring bitmap.
+ * If the roaring bitmap doesn't contain x , this function will return -1.
+ * The difference with rank function is that this function will return -1 when x
+ * is not the element of roaring bitmap, but the rank function will return a
+ * non-negative number.
+ */
+int64_t roaring_bitmap_get_index(const roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Returns the smallest value in the set, or UINT32_MAX if the set is empty.
+ */
+uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *r);
+
+/**
+ * Returns the greatest value in the set, or 0 if the set is empty.
+ */
+uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *r);
+
+/**
+ * (For advanced users.)
+ *
+ * Collect statistics about the bitmap, see roaring_types.h for
+ * a description of roaring_statistics_t
+ */
+void roaring_bitmap_statistics(const roaring_bitmap_t *r,
+                               roaring_statistics_t *stat);
+
+/**
+ * Perform internal consistency checks. Returns true if the bitmap is
+ * consistent. It may be useful to call this after deserializing bitmaps from
+ * untrusted sources. If roaring_bitmap_internal_validate returns true, then the
+ * bitmap should be consistent and can be trusted not to cause crashes or memory
+ * corruption.
+ *
+ * Note that some operations intentionally leave bitmaps in an inconsistent
+ * state temporarily, for example, `roaring_bitmap_lazy_*` functions, until
+ * `roaring_bitmap_repair_after_lazy` is called.
+ *
+ * If reason is non-null, it will be set to a string describing the first
+ * inconsistency found if any.
+ */
+bool roaring_bitmap_internal_validate(const roaring_bitmap_t *r,
+                                      const char **reason);
+
+/*********************
+* What follows is code use to iterate through values in a roaring bitmap
+
+roaring_bitmap_t *r =...
+roaring_uint32_iterator_t i;
+roaring_iterator_create(r, &i);
+while(i.has_value) {
+  printf("value = %d\n", i.current_value);
+  roaring_uint32_iterator_advance(&i);
+}
+
+Obviously, if you modify the underlying bitmap, the iterator
+becomes invalid. So don't.
+*/
+
+/**
+ * A struct used to keep iterator state. Users should only access
+ * `current_value` and `has_value`, the rest of the type should be treated as
+ * opaque.
+ */
+typedef struct roaring_uint32_iterator_s {
+    const roaring_bitmap_t *parent;        // Owner
+    const ROARING_CONTAINER_T *container;  // Current container
+    uint8_t typecode;                      // Typecode of current container
+    int32_t container_index;               // Current container index
+    uint32_t highbits;                     // High 16 bits of the current value
+    roaring_container_iterator_t container_it;
+
+    uint32_t current_value;
+    bool has_value;
+} roaring_uint32_iterator_t;
+
+/**
+ * Initialize an iterator object that can be used to iterate through the values.
+ * If there is a  value, then this iterator points to the first value and
+ * `it->has_value` is true. The value is in `it->current_value`.
+ */
+void roaring_iterator_init(const roaring_bitmap_t *r,
+                           roaring_uint32_iterator_t *newit);
+
+/** DEPRECATED, use `roaring_iterator_init`. */
+CROARING_DEPRECATED static inline void roaring_init_iterator(
+    const roaring_bitmap_t *r, roaring_uint32_iterator_t *newit) {
+    roaring_iterator_init(r, newit);
+}
+
+/**
+ * Initialize an iterator object that can be used to iterate through the values.
+ * If there is a value, then this iterator points to the last value and
+ * `it->has_value` is true. The value is in `it->current_value`.
+ */
+void roaring_iterator_init_last(const roaring_bitmap_t *r,
+                                roaring_uint32_iterator_t *newit);
+
+/** DEPRECATED, use `roaring_iterator_init_last`. */
+CROARING_DEPRECATED static inline void roaring_init_iterator_last(
+    const roaring_bitmap_t *r, roaring_uint32_iterator_t *newit) {
+    roaring_iterator_init_last(r, newit);
+}
+
+/**
+ * Create an iterator object that can be used to iterate through the values.
+ * Caller is responsible for calling `roaring_free_iterator()`.
+ *
+ * The iterator is initialized (this function calls `roaring_iterator_init()`)
+ * If there is a value, then this iterator points to the first value and
+ * `it->has_value` is true.  The value is in `it->current_value`.
+ */
+roaring_uint32_iterator_t *roaring_iterator_create(const roaring_bitmap_t *r);
+
+/** DEPRECATED, use `roaring_iterator_create`. */
+CROARING_DEPRECATED static inline roaring_uint32_iterator_t *
+roaring_create_iterator(const roaring_bitmap_t *r) {
+    return roaring_iterator_create(r);
+}
+
+/**
+ * Advance the iterator. If there is a new value, then `it->has_value` is true.
+ * The new value is in `it->current_value`. Values are traversed in increasing
+ * orders. For convenience, returns `it->has_value`.
+ *
+ * Once `it->has_value` is false, `roaring_uint32_iterator_advance` should not
+ * be called on the iterator again. Calling `roaring_uint32_iterator_previous`
+ * is allowed.
+ */
+bool roaring_uint32_iterator_advance(roaring_uint32_iterator_t *it);
+
+/** DEPRECATED, use `roaring_uint32_iterator_advance`. */
+CROARING_DEPRECATED static inline bool roaring_advance_uint32_iterator(
+    roaring_uint32_iterator_t *it) {
+    return roaring_uint32_iterator_advance(it);
+}
+
+/**
+ * Decrement the iterator. If there's a new value, then `it->has_value` is true.
+ * The new value is in `it->current_value`. Values are traversed in decreasing
+ * order. For convenience, returns `it->has_value`.
+ *
+ * Once `it->has_value` is false, `roaring_uint32_iterator_previous` should not
+ * be called on the iterator again. Calling `roaring_uint32_iterator_advance` is
+ * allowed.
+ */
+bool roaring_uint32_iterator_previous(roaring_uint32_iterator_t *it);
+
+/** DEPRECATED, use `roaring_uint32_iterator_previous`. */
+CROARING_DEPRECATED static inline bool roaring_previous_uint32_iterator(
+    roaring_uint32_iterator_t *it) {
+    return roaring_uint32_iterator_previous(it);
+}
+
+/**
+ * Move the iterator to the first value >= `val`. If there is a such a value,
+ * then `it->has_value` is true. The new value is in `it->current_value`.
+ * For convenience, returns `it->has_value`.
+ */
+bool roaring_uint32_iterator_move_equalorlarger(roaring_uint32_iterator_t *it,
+                                                uint32_t val);
+
+/** DEPRECATED, use `roaring_uint32_iterator_move_equalorlarger`. */
+CROARING_DEPRECATED static inline bool
+roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it,
+                                           uint32_t val) {
+    return roaring_uint32_iterator_move_equalorlarger(it, val);
+}
+
+/**
+ * Creates a copy of an iterator.
+ * Caller must free it.
+ */
+roaring_uint32_iterator_t *roaring_uint32_iterator_copy(
+    const roaring_uint32_iterator_t *it);
+
+/** DEPRECATED, use `roaring_uint32_iterator_copy`. */
+CROARING_DEPRECATED static inline roaring_uint32_iterator_t *
+roaring_copy_uint32_iterator(const roaring_uint32_iterator_t *it) {
+    return roaring_uint32_iterator_copy(it);
+}
+
+/**
+ * Free memory following `roaring_iterator_create()`
+ */
+void roaring_uint32_iterator_free(roaring_uint32_iterator_t *it);
+
+/** DEPRECATED, use `roaring_uint32_iterator_free`. */
+CROARING_DEPRECATED static inline void roaring_free_uint32_iterator(
+    roaring_uint32_iterator_t *it) {
+    roaring_uint32_iterator_free(it);
+}
+
+/*
+ * Reads next ${count} values from iterator into user-supplied ${buf}.
+ * Returns the number of read elements.
+ * This number can be smaller than ${count}, which means that iterator is
+ * drained.
+ *
+ * This function satisfies semantics of iteration and can be used together with
+ * other iterator functions.
+ *  - first value is copied from ${it}->current_value
+ *  - after function returns, iterator is positioned at the next element
+ */
+uint32_t roaring_uint32_iterator_read(roaring_uint32_iterator_t *it,
+                                      uint32_t *buf, uint32_t count);
+
+/** DEPRECATED, use `roaring_uint32_iterator_read`. */
+CROARING_DEPRECATED static inline uint32_t roaring_read_uint32_iterator(
+    roaring_uint32_iterator_t *it, uint32_t *buf, uint32_t count) {
+    return roaring_uint32_iterator_read(it, buf, count);
+}
+
+#ifdef __cplusplus
+}
+}
+}  // extern "C" { namespace roaring { namespace api {
+#endif
+
+#endif /* ROARING_H */
+
+#ifdef __cplusplus
+/**
+ * Best practices for C++ headers is to avoid polluting global scope.
+ * But for C compatibility when just `roaring.h` is included building as
+ * C++, default to global access for the C public API.
+ *
+ * BUT when `roaring.hh` is included instead, it sets this flag.  That way
+ * explicit namespacing must be used to get the C functions.
+ *
+ * This is outside the include guard so that if you include BOTH headers,
+ * the order won't matter; you still get the global definitions.
+ */
+#if !defined(ROARING_API_NOT_IN_GLOBAL_NAMESPACE)
+using namespace ::roaring::api;
+#endif
+#endif
+/* end file include/roaring/roaring.h */
+/* begin file include/roaring/memory.h */
+#ifndef INCLUDE_ROARING_MEMORY_H_
+#define INCLUDE_ROARING_MEMORY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>  // for size_t
+
+typedef void* (*roaring_malloc_p)(size_t);
+typedef void* (*roaring_realloc_p)(void*, size_t);
+typedef void* (*roaring_calloc_p)(size_t, size_t);
+typedef void (*roaring_free_p)(void*);
+typedef void* (*roaring_aligned_malloc_p)(size_t, size_t);
+typedef void (*roaring_aligned_free_p)(void*);
+
+typedef struct roaring_memory_s {
+    roaring_malloc_p malloc;
+    roaring_realloc_p realloc;
+    roaring_calloc_p calloc;
+    roaring_free_p free;
+    roaring_aligned_malloc_p aligned_malloc;
+    roaring_aligned_free_p aligned_free;
+} roaring_memory_t;
+
+void roaring_init_memory_hook(roaring_memory_t memory_hook);
+
+void* roaring_malloc(size_t);
+void* roaring_realloc(void*, size_t);
+void* roaring_calloc(size_t, size_t);
+void roaring_free(void*);
+void* roaring_aligned_malloc(size_t, size_t);
+void roaring_aligned_free(void*);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // INCLUDE_ROARING_MEMORY_H_
+/* end file include/roaring/memory.h */
+/* begin file include/roaring/roaring64.h */
+#ifndef ROARING64_H
+#define ROARING64_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+namespace roaring {
+namespace api {
+#endif
+
+typedef struct roaring64_bitmap_s roaring64_bitmap_t;
+typedef struct roaring64_leaf_s roaring64_leaf_t;
+typedef struct roaring64_iterator_s roaring64_iterator_t;
+
+/**
+ * A bit of context usable with `roaring64_bitmap_*_bulk()` functions.
+ *
+ * Should be initialized with `{0}` (or `memset()` to all zeros).
+ * Callers should treat it as an opaque type.
+ *
+ * A context may only be used with a single bitmap (unless re-initialized to
+ * zero), and any modification to a bitmap (other than modifications performed
+ * with `_bulk()` functions with the context passed) will invalidate any
+ * contexts associated with that bitmap.
+ */
+typedef struct roaring64_bulk_context_s {
+    uint8_t high_bytes[6];
+    roaring64_leaf_t *leaf;
+} roaring64_bulk_context_t;
+
+/**
+ * Dynamically allocates a new bitmap (initially empty).
+ * Client is responsible for calling `roaring64_bitmap_free()`.
+ */
+roaring64_bitmap_t *roaring64_bitmap_create(void);
+void roaring64_bitmap_free(roaring64_bitmap_t *r);
+
+/**
+ * Returns a copy of a bitmap.
+ */
+roaring64_bitmap_t *roaring64_bitmap_copy(const roaring64_bitmap_t *r);
+
+/**
+ * Creates a new bitmap of a pointer to N 64-bit integers.
+ */
+roaring64_bitmap_t *roaring64_bitmap_of_ptr(size_t n_args,
+                                            const uint64_t *vals);
+
+#ifdef __cplusplus
+/**
+ * Creates a new bitmap which contains all values passed in as arguments.
+ *
+ * To create a bitmap from a variable number of arguments, use the
+ * `roaring64_bitmap_of_ptr` function instead.
+ */
+// Use an immediately invoked closure, capturing by reference
+// (in case __VA_ARGS__ refers to context outside the closure)
+// Include a 0 at the beginning of the array to make the array length > 0
+// (zero sized arrays are not valid in standard c/c++)
+#define roaring64_bitmap_from(...)                                       \
+    [&]() {                                                              \
+        const uint64_t roaring64_bitmap_from_array[] = {0, __VA_ARGS__}; \
+        return roaring64_bitmap_of_ptr(                                  \
+            (sizeof(roaring64_bitmap_from_array) /                       \
+             sizeof(roaring64_bitmap_from_array[0])) -                   \
+                1,                                                       \
+            &roaring64_bitmap_from_array[1]);                            \
+    }()
+#else
+/**
+ * Creates a new bitmap which contains all values passed in as arguments.
+ *
+ * To create a bitmap from a variable number of arguments, use the
+ * `roaring64_bitmap_of_ptr` function instead.
+ */
+// While __VA_ARGS__ occurs twice in expansion, one of the times is in a sizeof
+// expression, which is an unevaluated context, so it's even safe in the case
+// where expressions passed have side effects (roaring64_bitmap_from(my_func(),
+// ++i))
+// Include a 0 at the beginning of the array to make the array length > 0
+// (zero sized arrays are not valid in standard c/c++)
+#define roaring64_bitmap_from(...)                                           \
+    roaring64_bitmap_of_ptr(                                                 \
+        (sizeof((const uint64_t[]){0, __VA_ARGS__}) / sizeof(uint64_t)) - 1, \
+        &((const uint64_t[]){0, __VA_ARGS__})[1])
+#endif
+
+/**
+ * Create a new bitmap containing all the values in [min, max) that are at a
+ * distance k*step from min.
+ */
+roaring64_bitmap_t *roaring64_bitmap_from_range(uint64_t min, uint64_t max,
+                                                uint64_t step);
+
+/**
+ * Adds the provided value to the bitmap.
+ */
+void roaring64_bitmap_add(roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Adds the provided value to the bitmap.
+ * Returns true if a new value was added, false if the value already existed.
+ */
+bool roaring64_bitmap_add_checked(roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Add an item, using context from a previous insert for faster insertion.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than `-bulk` suffixed functions)
+ * will invalidate the stored context, calling this function with a non-zero
+ * context after doing any modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same high 48 bits of the value consecutively.
+ */
+void roaring64_bitmap_add_bulk(roaring64_bitmap_t *r,
+                               roaring64_bulk_context_t *context, uint64_t val);
+
+/**
+ * Add `n_args` values from `vals`, faster than repeatedly calling
+ * `roaring64_bitmap_add()`
+ *
+ * In order to exploit this optimization, the caller should attempt to keep
+ * values with the same high 48 bits of the value as consecutive elements in
+ * `vals`.
+ */
+void roaring64_bitmap_add_many(roaring64_bitmap_t *r, size_t n_args,
+                               const uint64_t *vals);
+
+/**
+ * Add all values in range [min, max).
+ */
+void roaring64_bitmap_add_range(roaring64_bitmap_t *r, uint64_t min,
+                                uint64_t max);
+
+/**
+ * Add all values in range [min, max].
+ */
+void roaring64_bitmap_add_range_closed(roaring64_bitmap_t *r, uint64_t min,
+                                       uint64_t max);
+
+/**
+ * Removes a value from the bitmap if present.
+ */
+void roaring64_bitmap_remove(roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Removes a value from the bitmap if present, returns true if the value was
+ * removed and false if the value was not present.
+ */
+bool roaring64_bitmap_remove_checked(roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Remove an item, using context from a previous insert for faster removal.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than `-bulk` suffixed functions)
+ * will invalidate the stored context, calling this function with a non-zero
+ * context after doing any modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same high 48 bits of the value consecutively.
+ */
+void roaring64_bitmap_remove_bulk(roaring64_bitmap_t *r,
+                                  roaring64_bulk_context_t *context,
+                                  uint64_t val);
+
+/**
+ * Remove `n_args` values from `vals`, faster than repeatedly calling
+ * `roaring64_bitmap_remove()`
+ *
+ * In order to exploit this optimization, the caller should attempt to keep
+ * values with the same high 48 bits of the value as consecutive elements in
+ * `vals`.
+ */
+void roaring64_bitmap_remove_many(roaring64_bitmap_t *r, size_t n_args,
+                                  const uint64_t *vals);
+
+/**
+ * Remove all values in range [min, max).
+ */
+void roaring64_bitmap_remove_range(roaring64_bitmap_t *r, uint64_t min,
+                                   uint64_t max);
+
+/**
+ * Remove all values in range [min, max].
+ */
+void roaring64_bitmap_remove_range_closed(roaring64_bitmap_t *r, uint64_t min,
+                                          uint64_t max);
+
+/**
+ * Returns true if the provided value is present.
+ */
+bool roaring64_bitmap_contains(const roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Returns true if all values in the range [min, max) are present.
+ */
+bool roaring64_bitmap_contains_range(const roaring64_bitmap_t *r, uint64_t min,
+                                     uint64_t max);
+
+/**
+ * Check if an item is present using context from a previous insert or search
+ * for faster search.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than `-bulk` suffixed functions)
+ * will invalidate the stored context, calling this function with a non-zero
+ * context after doing any modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same high 48 bits of the value consecutively.
+ */
+bool roaring64_bitmap_contains_bulk(const roaring64_bitmap_t *r,
+                                    roaring64_bulk_context_t *context,
+                                    uint64_t val);
+
+/**
+ * Selects the element at index 'rank' where the smallest element is at index 0.
+ * If the size of the bitmap is strictly greater than rank, then this function
+ * returns true and sets element to the element of given rank. Otherwise, it
+ * returns false.
+ */
+bool roaring64_bitmap_select(const roaring64_bitmap_t *r, uint64_t rank,
+                             uint64_t *element);
+
+/**
+ * Returns the number of integers that are smaller or equal to x. Thus if x is
+ * the first element, this function will return 1. If x is smaller than the
+ * smallest element, this function will return 0.
+ *
+ * The indexing convention differs between roaring64_bitmap_select and
+ * roaring64_bitmap_rank: roaring_bitmap64_select refers to the smallest value
+ * as having index 0, whereas roaring64_bitmap_rank returns 1 when ranking
+ * the smallest value.
+ */
+uint64_t roaring64_bitmap_rank(const roaring64_bitmap_t *r, uint64_t val);
+
+/**
+ * Returns true if the given value is in the bitmap, and sets `out_index` to the
+ * (0-based) index of the value in the bitmap. Returns false if the value is not
+ * in the bitmap.
+ */
+bool roaring64_bitmap_get_index(const roaring64_bitmap_t *r, uint64_t val,
+                                uint64_t *out_index);
+
+/**
+ * Returns the number of values in the bitmap.
+ */
+uint64_t roaring64_bitmap_get_cardinality(const roaring64_bitmap_t *r);
+
+/**
+ * Returns the number of elements in the range [min, max).
+ */
+uint64_t roaring64_bitmap_range_cardinality(const roaring64_bitmap_t *r,
+                                            uint64_t min, uint64_t max);
+
+/**
+ * Returns true if the bitmap is empty (cardinality is zero).
+ */
+bool roaring64_bitmap_is_empty(const roaring64_bitmap_t *r);
+
+/**
+ * Returns the smallest value in the set, or UINT64_MAX if the set is empty.
+ */
+uint64_t roaring64_bitmap_minimum(const roaring64_bitmap_t *r);
+
+/**
+ * Returns the largest value in the set, or 0 if empty.
+ */
+uint64_t roaring64_bitmap_maximum(const roaring64_bitmap_t *r);
+
+/**
+ * Returns true if the result has at least one run container.
+ */
+bool roaring64_bitmap_run_optimize(roaring64_bitmap_t *r);
+
+/**
+ * Perform internal consistency checks.
+ *
+ * Returns true if the bitmap is consistent. It may be useful to call this
+ * after deserializing bitmaps from untrusted sources. If
+ * roaring64_bitmap_internal_validate returns true, then the bitmap is
+ * consistent and can be trusted not to cause crashes or memory corruption.
+ *
+ * If reason is non-null, it will be set to a string describing the first
+ * inconsistency found if any.
+ */
+bool roaring64_bitmap_internal_validate(const roaring64_bitmap_t *r,
+                                        const char **reason);
+
+/**
+ * Return true if the two bitmaps contain the same elements.
+ */
+bool roaring64_bitmap_equals(const roaring64_bitmap_t *r1,
+                             const roaring64_bitmap_t *r2);
+
+/**
+ * Return true if all the elements of r1 are also in r2.
+ */
+bool roaring64_bitmap_is_subset(const roaring64_bitmap_t *r1,
+                                const roaring64_bitmap_t *r2);
+
+/**
+ * Return true if all the elements of r1 are also in r2, and r2 is strictly
+ * greater than r1.
+ */
+bool roaring64_bitmap_is_strict_subset(const roaring64_bitmap_t *r1,
+                                       const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the intersection between two bitmaps and returns new bitmap. The
+ * caller is responsible for free-ing the result.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmaps. You may
+ * also rely on roaring64_bitmap_and_inplace to avoid creating many temporary
+ * bitmaps.
+ */
+roaring64_bitmap_t *roaring64_bitmap_and(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the size of the intersection between two bitmaps.
+ */
+uint64_t roaring64_bitmap_and_cardinality(const roaring64_bitmap_t *r1,
+                                          const roaring64_bitmap_t *r2);
+
+/**
+ * In-place version of `roaring64_bitmap_and()`, modifies `r1`. `r1` and `r2`
+ * are allowed to be equal.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmaps.
+ */
+void roaring64_bitmap_and_inplace(roaring64_bitmap_t *r1,
+                                  const roaring64_bitmap_t *r2);
+
+/**
+ * Check whether two bitmaps intersect.
+ */
+bool roaring64_bitmap_intersect(const roaring64_bitmap_t *r1,
+                                const roaring64_bitmap_t *r2);
+
+/**
+ * Check whether a bitmap intersects the range [min, max).
+ */
+bool roaring64_bitmap_intersect_with_range(const roaring64_bitmap_t *r,
+                                           uint64_t min, uint64_t max);
+
+/**
+ * Computes the Jaccard index between two bitmaps. (Also known as the Tanimoto
+ * distance, or the Jaccard similarity coefficient)
+ *
+ * The Jaccard index is undefined if both bitmaps are empty.
+ */
+double roaring64_bitmap_jaccard_index(const roaring64_bitmap_t *r1,
+                                      const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the union between two bitmaps and returns new bitmap. The caller is
+ * responsible for free-ing the result.
+ */
+roaring64_bitmap_t *roaring64_bitmap_or(const roaring64_bitmap_t *r1,
+                                        const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the size of the union between two bitmaps.
+ */
+uint64_t roaring64_bitmap_or_cardinality(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2);
+
+/**
+ * In-place version of `roaring64_bitmap_or(), modifies `r1`.
+ */
+void roaring64_bitmap_or_inplace(roaring64_bitmap_t *r1,
+                                 const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the symmetric difference (xor) between two bitmaps and returns a new
+ * bitmap. The caller is responsible for free-ing the result.
+ */
+roaring64_bitmap_t *roaring64_bitmap_xor(const roaring64_bitmap_t *r1,
+                                         const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the size of the symmetric difference (xor) between two bitmaps.
+ */
+uint64_t roaring64_bitmap_xor_cardinality(const roaring64_bitmap_t *r1,
+                                          const roaring64_bitmap_t *r2);
+
+/**
+ * In-place version of `roaring64_bitmap_xor()`, modifies `r1`. `r1` and `r2`
+ * are not allowed to be equal (that would result in an empty bitmap).
+ */
+void roaring64_bitmap_xor_inplace(roaring64_bitmap_t *r1,
+                                  const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the difference (andnot) between two bitmaps and returns a new
+ * bitmap. The caller is responsible for free-ing the result.
+ */
+roaring64_bitmap_t *roaring64_bitmap_andnot(const roaring64_bitmap_t *r1,
+                                            const roaring64_bitmap_t *r2);
+
+/**
+ * Computes the size of the difference (andnot) between two bitmaps.
+ */
+uint64_t roaring64_bitmap_andnot_cardinality(const roaring64_bitmap_t *r1,
+                                             const roaring64_bitmap_t *r2);
+
+/**
+ * In-place version of `roaring64_bitmap_andnot()`, modifies `r1`. `r1` and `r2`
+ * are not allowed to be equal (that would result in an empty bitmap).
+ */
+void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1,
+                                     const roaring64_bitmap_t *r2);
+
+/**
+ * Compute the negation of the bitmap in the interval [min, max).
+ * The number of negated values is `max - min`. Areas outside the range are
+ * passed through unchanged.
+ */
+roaring64_bitmap_t *roaring64_bitmap_flip(const roaring64_bitmap_t *r,
+                                          uint64_t min, uint64_t max);
+
+/**
+ * Compute the negation of the bitmap in the interval [min, max].
+ * The number of negated values is `max - min + 1`. Areas outside the range are
+ * passed through unchanged.
+ */
+roaring64_bitmap_t *roaring64_bitmap_flip_closed(const roaring64_bitmap_t *r,
+                                                 uint64_t min, uint64_t max);
+
+/**
+ * In-place version of `roaring64_bitmap_flip`. Compute the negation of the
+ * bitmap in the interval [min, max). The number of negated values is `max -
+ * min`. Areas outside the range are passed through unchanged.
+ */
+void roaring64_bitmap_flip_inplace(roaring64_bitmap_t *r, uint64_t min,
+                                   uint64_t max);
+/**
+ * In-place version of `roaring64_bitmap_flip_closed`. Compute the negation of
+ * the bitmap in the interval [min, max]. The number of negated values is `max -
+ * min + 1`. Areas outside the range are passed through unchanged.
+ */
+void roaring64_bitmap_flip_closed_inplace(roaring64_bitmap_t *r, uint64_t min,
+                                          uint64_t max);
+/**
+ * How many bytes are required to serialize this bitmap.
+ *
+ * This is meant to be compatible with other languages:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+ */
+size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r);
+
+/**
+ * Write a bitmap to a buffer. The output buffer should refer to at least
+ * `roaring64_bitmap_portable_size_in_bytes(r)` bytes of allocated memory.
+ *
+ * Returns how many bytes were written, which should match
+ * `roaring64_bitmap_portable_size_in_bytes(r)`.
+ *
+ * This is meant to be compatible with other languages:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r,
+                                           char *buf);
+/**
+ * Check how many bytes would be read (up to maxbytes) at this pointer if there
+ * is a valid bitmap, returns zero if there is no valid bitmap.
+ *
+ * This is meant to be compatible with other languages
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+ */
+size_t roaring64_bitmap_portable_deserialize_size(const char *buf,
+                                                  size_t maxbytes);
+
+/**
+ * Read a bitmap from a serialized buffer safely (reading up to maxbytes).
+ * In case of failure, NULL is returned.
+ *
+ * This is meant to be compatible with other languages
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations
+ *
+ * The function itself is safe in the sense that it will not cause buffer
+ * overflows. However, for correct operations, it is assumed that the bitmap
+ * read was once serialized from a valid bitmap (i.e., it follows the format
+ * specification). If you provided an incorrect input (garbage), then the bitmap
+ * read may not be in a valid state and following operations may not lead to
+ * sensible results. In particular, the serialized array containers need to be
+ * in sorted order, and the run containers should be in sorted non-overlapping
+ * order. This is is guaranteed to happen when serializing an existing bitmap,
+ * but not for random inputs.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a
+ * mainframe IBM s390x), the data format is going to be big-endian and not
+ * compatible with little-endian systems.
+ */
+roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe(const char *buf,
+                                                               size_t maxbytes);
+
+/**
+ * Iterate over the bitmap elements. The function `iterator` is called once for
+ * all the values with `ptr` (can be NULL) as the second parameter of each call.
+ *
+ * `roaring_iterator64` is simply a pointer to a function that returns a bool
+ * and takes `(uint64_t, void*)` as inputs. True means that the iteration should
+ * continue, while false means that it should stop.
+ *
+ * Returns true if the `roaring64_iterator` returned true throughout (so that
+ * all data points were necessarily visited).
+ *
+ * Iteration is ordered from the smallest to the largest elements.
+ */
+bool roaring64_bitmap_iterate(const roaring64_bitmap_t *r,
+                              roaring_iterator64 iterator, void *ptr);
+
+/**
+ * Convert the bitmap to a sorted array `out`.
+ *
+ * Caller is responsible to ensure that there is enough memory allocated, e.g.
+ * ```
+ * out = malloc(roaring64_bitmap_get_cardinality(bitmap) * sizeof(uint64_t));
+ * ```
+ */
+void roaring64_bitmap_to_uint64_array(const roaring64_bitmap_t *r,
+                                      uint64_t *out);
+
+/**
+ * Create an iterator object that can be used to iterate through the values.
+ * Caller is responsible for calling `roaring64_iterator_free()`.
+ *
+ * The iterator is initialized. If there is a value, then this iterator points
+ * to the first value and `roaring64_iterator_has_value()` returns true. The
+ * value can be retrieved with `roaring64_iterator_value()`.
+ */
+roaring64_iterator_t *roaring64_iterator_create(const roaring64_bitmap_t *r);
+
+/**
+ * Create an iterator object that can be used to iterate through the values.
+ * Caller is responsible for calling `roaring64_iterator_free()`.
+ *
+ * The iterator is initialized. If there is a value, then this iterator points
+ * to the last value and `roaring64_iterator_has_value()` returns true. The
+ * value can be retrieved with `roaring64_iterator_value()`.
+ */
+roaring64_iterator_t *roaring64_iterator_create_last(
+    const roaring64_bitmap_t *r);
+
+/**
+ * Re-initializes an existing iterator. Functionally the same as
+ * `roaring64_iterator_create` without a allocation.
+ */
+void roaring64_iterator_reinit(const roaring64_bitmap_t *r,
+                               roaring64_iterator_t *it);
+
+/**
+ * Re-initializes an existing iterator. Functionally the same as
+ * `roaring64_iterator_create_last` without a allocation.
+ */
+void roaring64_iterator_reinit_last(const roaring64_bitmap_t *r,
+                                    roaring64_iterator_t *it);
+
+/**
+ * Creates a copy of the iterator. Caller is responsible for calling
+ * `roaring64_iterator_free()` on the resulting iterator.
+ */
+roaring64_iterator_t *roaring64_iterator_copy(const roaring64_iterator_t *it);
+
+/**
+ * Free the iterator.
+ */
+void roaring64_iterator_free(roaring64_iterator_t *it);
+
+/**
+ * Returns true if the iterator currently points to a value. If so, calling
+ * `roaring64_iterator_value()` returns the value.
+ */
+bool roaring64_iterator_has_value(const roaring64_iterator_t *it);
+
+/**
+ * Returns the value the iterator currently points to. Should only be called if
+ * `roaring64_iterator_has_value()` returns true.
+ */
+uint64_t roaring64_iterator_value(const roaring64_iterator_t *it);
+
+/**
+ * Advance the iterator. If there is a new value, then
+ * `roaring64_iterator_has_value()` returns true. Values are traversed in
+ * increasing order. For convenience, returns the result of
+ * `roaring64_iterator_has_value()`.
+ *
+ * Once this returns false, `roaring64_iterator_advance` should not be called on
+ * the iterator again. Calling `roaring64_iterator_previous` is allowed.
+ */
+bool roaring64_iterator_advance(roaring64_iterator_t *it);
+
+/**
+ * Decrement the iterator. If there is a new value, then
+ * `roaring64_iterator_has_value()` returns true. Values are traversed in
+ * decreasing order. For convenience, returns the result of
+ * `roaring64_iterator_has_value()`.
+ *
+ * Once this returns false, `roaring64_iterator_previous` should not be called
+ * on the iterator again. Calling `roaring64_iterator_advance` is allowed.
+ */
+bool roaring64_iterator_previous(roaring64_iterator_t *it);
+
+/**
+ * Move the iterator to the first value greater than or equal to `val`, if it
+ * exists at or after the current position of the iterator. If there is a new
+ * value, then `roaring64_iterator_has_value()` returns true. Values are
+ * traversed in increasing order. For convenience, returns the result of
+ * `roaring64_iterator_has_value()`.
+ */
+bool roaring64_iterator_move_equalorlarger(roaring64_iterator_t *it,
+                                           uint64_t val);
+
+/**
+ * Reads up to `count` values from the iterator into the given `buf`. Returns
+ * the number of elements read. The number of elements read can be smaller than
+ * `count`, which means that there are no more elements in the bitmap.
+ *
+ * This function can be used together with other iterator functions.
+ */
+uint64_t roaring64_iterator_read(roaring64_iterator_t *it, uint64_t *buf,
+                                 uint64_t count);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace roaring
+}  // namespace api
+#endif
+
+#endif /* ROARING64_H */
+/* end file include/roaring/roaring64.h */
diff --git a/roaring/roaring.hh b/roaring/roaring.hh
new file mode 100644
index 000000000..b2d6f3164
--- /dev/null
+++ b/roaring/roaring.hh
@@ -0,0 +1,2934 @@
+// !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!!
+// Created by amalgamation.sh on 2024-04-02T07:40:50Z
+
+/*
+ * The CRoaring project is under a dual license (Apache/MIT).
+ * Users of the library may choose one or the other license.
+ */
+/*
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * MIT License
+ *
+ * Copyright 2016-2022 The CRoaring authors
+ *
+ * Permission is hereby granted, free of charge, to any
+ * person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the
+ * Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software
+ * is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice
+ * shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ * TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ * SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#define ROARING_API_NOT_IN_GLOBAL_NAMESPACE  // see remarks in roaring.h
+#include "roaring.h"
+#undef ROARING_API_NOT_IN_GLOBAL_NAMESPACE
+/* begin file cpp/roaring.hh */
+/*
+A C++ header for Roaring Bitmaps.
+*/
+#ifndef INCLUDE_ROARING_HH_
+#define INCLUDE_ROARING_HH_
+
+#include <algorithm>
+#include <cstdarg>
+#include <initializer_list>
+#include <new>
+#include <stdexcept>
+#include <string>
+
+#if !defined(ROARING_EXCEPTIONS)
+// __cpp_exceptions is required by C++98 and we require C++11 or better.
+#ifndef __cpp_exceptions
+#error "__cpp_exceptions should be defined"
+#endif
+#if __cpp_exceptions
+#define ROARING_EXCEPTIONS 1
+#else
+#define ROARING_EXCEPTIONS 0
+#endif
+#endif
+
+#ifndef ROARING_TERMINATE
+#if ROARING_EXCEPTIONS
+#define ROARING_TERMINATE(_s) throw std::runtime_error(_s)
+#else
+#define ROARING_TERMINATE(_s) std::terminate()
+#endif
+#endif
+
+#define ROARING_API_NOT_IN_GLOBAL_NAMESPACE  // see remarks in roaring.h
+#undef ROARING_API_NOT_IN_GLOBAL_NAMESPACE
+
+
+namespace roaring {
+
+class RoaringSetBitForwardIterator;
+
+/**
+ * A bit of context usable with `*Bulk()` functions.
+ *
+ * A context may only be used with a single bitmap, and any modification to a
+ * bitmap (other than modifications performed with `Bulk()` functions with the
+ * context passed) will invalidate any contexts associated with that bitmap.
+ */
+class BulkContext {
+   public:
+    friend class Roaring;
+    using roaring_bitmap_bulk_context_t = api::roaring_bulk_context_t;
+    BulkContext() : context_{nullptr, 0, 0, 0} {}
+
+    BulkContext(const BulkContext &) = delete;
+    BulkContext &operator=(const BulkContext &) = delete;
+    BulkContext(BulkContext &&) noexcept = default;
+    BulkContext &operator=(BulkContext &&) noexcept = default;
+
+   private:
+    roaring_bitmap_bulk_context_t context_;
+};
+
+class Roaring {
+    typedef api::roaring_bitmap_t roaring_bitmap_t;  // class-local name alias
+
+   public:
+    /**
+     * Create an empty bitmap in the existing memory for the class.
+     * The bitmap will be in the "clear" state with no auxiliary allocations.
+     */
+    Roaring() : roaring{} {
+        // The empty constructor roaring{} silences warnings from pedantic
+        // static analyzers.
+        api::roaring_bitmap_init_cleared(&roaring);
+    }
+
+    /**
+     * Construct a bitmap from a list of 32-bit integer values.
+     */
+    Roaring(size_t n, const uint32_t *data) : Roaring() {
+        api::roaring_bitmap_add_many(&roaring, n, data);
+    }
+
+    /**
+     * Construct a bitmap from an initializer list.
+     */
+    Roaring(std::initializer_list<uint32_t> l) : Roaring() {
+        addMany(l.size(), l.begin());
+    }
+
+    /**
+     * Copy constructor.
+     * It may throw std::runtime_error if there is insufficient memory.
+     */
+    Roaring(const Roaring &r) : Roaring() {
+        if (!api::roaring_bitmap_overwrite(&roaring, &r.roaring)) {
+            ROARING_TERMINATE("failed roaring_bitmap_overwrite in constructor");
+        }
+        api::roaring_bitmap_set_copy_on_write(
+            &roaring, api::roaring_bitmap_get_copy_on_write(&r.roaring));
+    }
+
+    /**
+     * Move constructor. The moved-from object remains valid but empty, i.e.
+     * it behaves as though it was just freshly constructed.
+     */
+    Roaring(Roaring &&r) noexcept : roaring(r.roaring) {
+        //
+        // !!! This clones the bits of the roaring structure to a new location
+        // and then overwrites the old bits...assuming that this will still
+        // work.  There are scenarios where this could break; e.g. if some of
+        // those bits were pointers into the structure memory itself.  If such
+        // things were possible, a roaring_bitmap_move() API would be needed.
+        //
+        api::roaring_bitmap_init_cleared(&r.roaring);
+    }
+
+    /**
+     * Construct a roaring object by taking control of a malloc()'d C struct.
+     *
+     * Passing a NULL pointer is unsafe.
+     * The pointer to the C struct will be invalid after the call.
+     */
+    explicit Roaring(roaring_bitmap_t *s) noexcept : roaring(*s) {
+        roaring_free(s);  // deallocate the passed-in pointer
+    }
+
+    /**
+     * Construct a bitmap from a list of uint32_t values.
+     */
+    static Roaring bitmapOf(size_t n, ...) {
+        Roaring ans;
+        va_list vl;
+        va_start(vl, n);
+        for (size_t i = 0; i < n; i++) {
+            ans.add(va_arg(vl, uint32_t));
+        }
+        va_end(vl);
+        return ans;
+    }
+
+    /**
+     * Construct a bitmap from a list of uint32_t values.
+     * E.g., bitmapOfList({1,2,3}).
+     */
+    static Roaring bitmapOfList(std::initializer_list<uint32_t> l) {
+        Roaring ans;
+        ans.addMany(l.size(), l.begin());
+        return ans;
+    }
+
+    /**
+     * Add value x
+     */
+    void add(uint32_t x) noexcept { api::roaring_bitmap_add(&roaring, x); }
+
+    /**
+     * Add value x
+     * Returns true if a new value was added, false if the value was already
+     * existing.
+     */
+    bool addChecked(uint32_t x) noexcept {
+        return api::roaring_bitmap_add_checked(&roaring, x);
+    }
+
+    /**
+     * Add all values in range [min, max)
+     */
+    void addRange(const uint64_t min, const uint64_t max) noexcept {
+        return api::roaring_bitmap_add_range(&roaring, min, max);
+    }
+
+    /**
+     * Add all values in range [min, max]
+     */
+    void addRangeClosed(const uint32_t min, const uint32_t max) noexcept {
+        return api::roaring_bitmap_add_range_closed(&roaring, min, max);
+    }
+
+    /**
+     * Add value n_args from pointer vals
+     */
+    void addMany(size_t n_args, const uint32_t *vals) noexcept {
+        api::roaring_bitmap_add_many(&roaring, n_args, vals);
+    }
+
+    /**
+     * Add value val, using context from a previous insert for speed
+     * optimization.
+     *
+     * `context` will be used to store information between calls to make bulk
+     * operations faster. `context` should be default-initialized before the
+     * first call to this function.
+     */
+    void addBulk(BulkContext &context, uint32_t x) noexcept {
+        api::roaring_bitmap_add_bulk(&roaring, &context.context_, x);
+    }
+
+    /**
+     * Check if item x is present, using context from a previous insert or
+     * search for speed optimization.
+     *
+     * `context` will be used to store information between calls to make bulk
+     * operations faster. `context` should be default-initialized before the
+     * first call to this function.
+     */
+    bool containsBulk(BulkContext &context, uint32_t x) const noexcept {
+        return api::roaring_bitmap_contains_bulk(&roaring, &context.context_,
+                                                 x);
+    }
+
+    /**
+     * Remove value x
+     */
+    void remove(uint32_t x) noexcept {
+        api::roaring_bitmap_remove(&roaring, x);
+    }
+
+    /**
+     * Remove value x
+     * Returns true if a new value was removed, false if the value was not
+     * existing.
+     */
+    bool removeChecked(uint32_t x) noexcept {
+        return api::roaring_bitmap_remove_checked(&roaring, x);
+    }
+
+    /**
+     * Remove all values in range [min, max)
+     */
+    void removeRange(uint64_t min, uint64_t max) noexcept {
+        return api::roaring_bitmap_remove_range(&roaring, min, max);
+    }
+
+    /**
+     * Remove all values in range [min, max]
+     */
+    void removeRangeClosed(uint32_t min, uint32_t max) noexcept {
+        return api::roaring_bitmap_remove_range_closed(&roaring, min, max);
+    }
+
+    /**
+     * Return the largest value (if not empty)
+     */
+    uint32_t maximum() const noexcept {
+        return api::roaring_bitmap_maximum(&roaring);
+    }
+
+    /**
+     * Return the smallest value (if not empty)
+     */
+    uint32_t minimum() const noexcept {
+        return api::roaring_bitmap_minimum(&roaring);
+    }
+
+    /**
+     * Check if value x is present
+     */
+    bool contains(uint32_t x) const noexcept {
+        return api::roaring_bitmap_contains(&roaring, x);
+    }
+
+    /**
+     * Check if all values from x (included) to y (excluded) are present
+     */
+    bool containsRange(const uint64_t x, const uint64_t y) const noexcept {
+        return api::roaring_bitmap_contains_range(&roaring, x, y);
+    }
+
+    /**
+     * Destructor.  By contract, calling roaring_bitmap_clear() is enough to
+     * release all auxiliary memory used by the structure.
+     */
+    ~Roaring() {
+        if (!(roaring.high_low_container.flags & ROARING_FLAG_FROZEN)) {
+            api::roaring_bitmap_clear(&roaring);
+        } else {
+            // The roaring member variable copies the `roaring_bitmap_t` and
+            // nested `roaring_array_t` structures by value and is freed in the
+            // constructor, however the underlying memory arena used for the
+            // container data is not freed with it. Here we derive the arena
+            // pointer from the second arena allocation in
+            // `roaring_bitmap_frozen_view` and free it as well.
+            roaring_bitmap_free(
+                (roaring_bitmap_t *)((char *)
+                                         roaring.high_low_container.containers -
+                                     sizeof(roaring_bitmap_t)));
+        }
+    }
+
+    /**
+     * Copies the content of the provided bitmap, and
+     * discard the current content.
+     * It may throw std::runtime_error if there is insufficient memory.
+     */
+    Roaring &operator=(const Roaring &r) {
+        if (!api::roaring_bitmap_overwrite(&roaring, &r.roaring)) {
+            ROARING_TERMINATE("failed memory alloc in assignment");
+        }
+        api::roaring_bitmap_set_copy_on_write(
+            &roaring, api::roaring_bitmap_get_copy_on_write(&r.roaring));
+        return *this;
+    }
+
+    /**
+     * Moves the content of the provided bitmap, and
+     * discard the current content.
+     */
+    Roaring &operator=(Roaring &&r) noexcept {
+        api::roaring_bitmap_clear(&roaring);  // free this class's allocations
+
+        // !!! See notes in the Move Constructor regarding roaring_bitmap_move()
+        //
+        roaring = r.roaring;
+        api::roaring_bitmap_init_cleared(&r.roaring);
+
+        return *this;
+    }
+
+    /**
+     * Assignment from an initializer list.
+     */
+    Roaring &operator=(std::initializer_list<uint32_t> l) {
+        // Delegate to move assignment operator
+        *this = Roaring(l);
+        return *this;
+    }
+
+    /**
+     * Compute the intersection between the current bitmap and the provided
+     * bitmap, writing the result in the current bitmap. The provided bitmap
+     * is not modified.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     */
+    Roaring &operator&=(const Roaring &r) noexcept {
+        api::roaring_bitmap_and_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the difference between the current bitmap and the provided
+     * bitmap, writing the result in the current bitmap. The provided bitmap
+     * is not modified.
+     */
+    Roaring &operator-=(const Roaring &r) noexcept {
+        api::roaring_bitmap_andnot_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the union between the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * See also the fastunion function to aggregate many bitmaps more quickly.
+     */
+    Roaring &operator|=(const Roaring &r) noexcept {
+        api::roaring_bitmap_or_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the symmetric union between the current bitmap and the provided
+     * bitmap, writing the result in the current bitmap. The provided bitmap
+     * is not modified.
+     */
+    Roaring &operator^=(const Roaring &r) noexcept {
+        api::roaring_bitmap_xor_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Exchange the content of this bitmap with another.
+     */
+    void swap(Roaring &r) noexcept { std::swap(r.roaring, roaring); }
+
+    /**
+     * Get the cardinality of the bitmap (number of elements).
+     */
+    uint64_t cardinality() const noexcept {
+        return api::roaring_bitmap_get_cardinality(&roaring);
+    }
+
+    /**
+     * Returns true if the bitmap is empty (cardinality is zero).
+     */
+    bool isEmpty() const noexcept {
+        return api::roaring_bitmap_is_empty(&roaring);
+    }
+
+    /**
+     * Returns true if the bitmap is subset of the other.
+     */
+    bool isSubset(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_is_subset(&roaring, &r.roaring);
+    }
+
+    /**
+     * Returns true if the bitmap is strict subset of the other.
+     */
+    bool isStrictSubset(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_is_strict_subset(&roaring, &r.roaring);
+    }
+
+    /**
+     * Convert the bitmap to an array. Write the output to "ans", caller is
+     * responsible to ensure that there is enough memory allocated
+     * (e.g., ans = new uint32[mybitmap.cardinality()];)
+     */
+    void toUint32Array(uint32_t *ans) const noexcept {
+        api::roaring_bitmap_to_uint32_array(&roaring, ans);
+    }
+    /**
+     * To int array with pagination
+     */
+    void rangeUint32Array(uint32_t *ans, size_t offset,
+                          size_t limit) const noexcept {
+        api::roaring_bitmap_range_uint32_array(&roaring, offset, limit, ans);
+    }
+
+    /**
+     * Return true if the two bitmaps contain the same elements.
+     */
+    bool operator==(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_equals(&roaring, &r.roaring);
+    }
+
+    /**
+     * Compute the negation of the roaring bitmap within the half-open interval
+     * [range_start, range_end). Areas outside the interval are unchanged.
+     */
+    void flip(uint64_t range_start, uint64_t range_end) noexcept {
+        api::roaring_bitmap_flip_inplace(&roaring, range_start, range_end);
+    }
+
+    /**
+     * Compute the negation of the roaring bitmap within the closed interval
+     * [range_start, range_end]. Areas outside the interval are unchanged.
+     */
+    void flipClosed(uint32_t range_start, uint32_t range_end) noexcept {
+        api::roaring_bitmap_flip_inplace(&roaring, range_start,
+                                         uint64_t(range_end) + 1);
+    }
+
+    /**
+     * Remove run-length encoding even when it is more space efficient.
+     * Return whether a change was applied.
+     */
+    bool removeRunCompression() noexcept {
+        return api::roaring_bitmap_remove_run_compression(&roaring);
+    }
+
+    /**
+     * Convert array and bitmap containers to run containers when it is more
+     * efficient; also convert from run containers when more space efficient.
+     * Returns true if the result has at least one run container.  Additional
+     * savings might be possible by calling shrinkToFit().
+     */
+    bool runOptimize() noexcept {
+        return api::roaring_bitmap_run_optimize(&roaring);
+    }
+
+    /**
+     * If needed, reallocate memory to shrink the memory usage. Returns
+     * the number of bytes saved.
+     */
+    size_t shrinkToFit() noexcept {
+        return api::roaring_bitmap_shrink_to_fit(&roaring);
+    }
+
+    /**
+     * Iterate over the bitmap elements. The function iterator is called once
+     * for all the values with ptr (can be NULL) as the second parameter of
+     * each call.
+     *
+     * roaring_iterator is simply a pointer to a function that returns bool
+     * (true means that the iteration should continue while false means that it
+     * should stop), and takes (uint32_t,void*) as inputs.
+     */
+    void iterate(api::roaring_iterator iterator, void *ptr) const {
+        api::roaring_iterate(&roaring, iterator, ptr);
+    }
+
+    /**
+     * Selects the value at index rnk in the bitmap, where the smallest value
+     * is at index 0.
+     *
+     * If the size of the roaring bitmap is strictly greater than rank, then
+     * this function returns true and sets element to the element of given rank.
+     * Otherwise, it returns false.
+     */
+    bool select(uint32_t rnk, uint32_t *element) const noexcept {
+        return api::roaring_bitmap_select(&roaring, rnk, element);
+    }
+
+    /**
+     * Computes the size of the intersection between two bitmaps.
+     */
+    uint64_t and_cardinality(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_and_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Check whether the two bitmaps intersect.
+     */
+    bool intersect(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_intersect(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the Jaccard index between two bitmaps. (Also known as the
+     * Tanimoto distance,
+     * or the Jaccard similarity coefficient)
+     *
+     * The Jaccard index is undefined if both bitmaps are empty.
+     */
+    double jaccard_index(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_jaccard_index(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the union between two bitmaps.
+     */
+    uint64_t or_cardinality(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_or_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the difference (andnot) between two bitmaps.
+     */
+    uint64_t andnot_cardinality(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_andnot_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the symmetric difference (andnot) between two
+     * bitmaps.
+     */
+    uint64_t xor_cardinality(const Roaring &r) const noexcept {
+        return api::roaring_bitmap_xor_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Returns the number of integers that are smaller or equal to x.
+     * Thus the rank of the smallest element is one.  If
+     * x is smaller than the smallest element, this function will return 0.
+     * The rank and select functions differ in convention: this function returns
+     * 1 when ranking the smallest value, but the select function returns the
+     * smallest value when using index 0.
+     */
+    uint64_t rank(uint32_t x) const noexcept {
+        return api::roaring_bitmap_rank(&roaring, x);
+    }
+
+    /**
+     * Get `rank()` values in bulk. The values in `[begin .. end)` must be in
+     * Ascending order. possible implementation: for(auto* iter = begin; iter !=
+     * end; ++iter) *(ans++) = rank(*iter);
+     */
+    void rank_many(const uint32_t *begin, const uint32_t *end,
+                   uint64_t *ans) const noexcept {
+        return api::roaring_bitmap_rank_many(&roaring, begin, end, ans);
+    }
+
+    /**
+     * Returns the index of x in the set, index start from 0.
+     * If the set doesn't contain x , this function will return -1.
+     * The difference with rank function is that this function will return -1
+     * when x isn't in the set, but the rank function will return a
+     * non-negative number.
+     */
+    int64_t getIndex(uint32_t x) const noexcept {
+        return api::roaring_bitmap_get_index(&roaring, x);
+    }
+
+    /**
+     * Write a bitmap to a char buffer. This is meant to be compatible with
+     * the Java and Go versions. Returns how many bytes were written which
+     * should be getSizeInBytes().
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     *
+     * Boost users can serialize bitmaps in this manner:
+     *
+     *       BOOST_SERIALIZATION_SPLIT_FREE(Roaring)
+     *       namespace boost {
+     *       namespace serialization {
+     *
+     *       template <class Archive>
+     *       void save(Archive& ar, const Roaring& bitmask,
+     *          const unsigned int version) {
+     *         std::size_t expected_size_in_bytes = bitmask.getSizeInBytes();
+     *         std::vector<char> buffer(expected_size_in_bytes);
+     *         std::size_t       size_in_bytes = bitmask.write(buffer.data());
+     *
+     *         ar& size_in_bytes;
+     *         ar& boost::serialization::make_binary_object(buffer.data(),
+     *             size_in_bytes);
+     *      }
+     *      template <class Archive>
+     *      void load(Archive& ar, Roaring& bitmask,
+     *          const unsigned int version) {
+     *         std::size_t size_in_bytes = 0;
+     *         ar& size_in_bytes;
+     *         std::vector<char> buffer(size_in_bytes);
+     *         ar&  boost::serialization::make_binary_object(buffer.data(),
+     *            size_in_bytes);
+     *         bitmask = Roaring::readSafe(buffer.data(), size_in_bytes);
+     *      }
+     *      }  // namespace serialization
+     *      }  // namespace boost
+     */
+    size_t write(char *buf, bool portable = true) const noexcept {
+        if (portable) {
+            return api::roaring_bitmap_portable_serialize(&roaring, buf);
+        } else {
+            return api::roaring_bitmap_serialize(&roaring, buf);
+        }
+    }
+
+    /**
+     * Read a bitmap from a serialized version. This is meant to be compatible
+     * with the Java and Go versions.
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     *
+     * This function is unsafe in the sense that if you provide bad data,
+     * many, many bytes could be read. See also readSafe.
+     *
+     * The function may throw std::runtime_error if a bitmap could not be read.
+     * Not that even if it does not throw, the bitmap could still be unusable if
+     * the loaded data does not match the portable Roaring specification: you
+     * should ensure that the data you load come from a serialized bitmap.
+     */
+    static Roaring read(const char *buf, bool portable = true) {
+        roaring_bitmap_t *r =
+            portable ? api::roaring_bitmap_portable_deserialize(buf)
+                     : api::roaring_bitmap_deserialize(buf);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed alloc while reading");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Read a bitmap from a serialized version, reading no more than maxbytes
+     * bytes.  This is meant to be compatible with the Java and Go versions.
+     * The function itself is safe in the sense that it will not cause buffer
+     * overflows. However, for correct operations, it is assumed that the bitmap
+     * read was once serialized from a valid bitmap. If you provided an
+     * incorrect input (garbage), then the bitmap read may not be in a valid
+     * state and following operations may not lead to sensible results. It is
+     * your responsability to ensure that the input bytes follow the format
+     * specification if you want a usable bitmap:
+     * https://github.com/RoaringBitmap/RoaringFormatSpec
+     * In particular, the serialized array containers need to be in sorted
+     * order, and the run containers should be in sorted non-overlapping order.
+     * This is is guaranteed to happen when serializing an existing bitmap, but
+     * not for random inputs. Note that this function assumes that your bitmap
+     * was serialized in *portable* mode (which is the default with the 'write'
+     * method).
+     *
+     * The function may throw std::runtime_error if a bitmap could not be read.
+     * Not that even if it does not throw, the bitmap could still be unusable if
+     * the loaded data does not match the portable Roaring specification: you
+     * should ensure that the data you load come from a serialized bitmap.
+     */
+    static Roaring readSafe(const char *buf, size_t maxbytes) {
+        roaring_bitmap_t *r =
+            api::roaring_bitmap_portable_deserialize_safe(buf, maxbytes);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed alloc while reading");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * How many bytes are required to serialize this bitmap (meant to be
+     * compatible with Java and Go versions)
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     */
+    size_t getSizeInBytes(bool portable = true) const noexcept {
+        if (portable) {
+            return api::roaring_bitmap_portable_size_in_bytes(&roaring);
+        } else {
+            return api::roaring_bitmap_size_in_bytes(&roaring);
+        }
+    }
+
+    /**
+     * For advanced users.
+     * This function may throw std::runtime_error.
+     */
+    static const Roaring frozenView(const char *buf, size_t length) {
+        const roaring_bitmap_t *s =
+            api::roaring_bitmap_frozen_view(buf, length);
+        if (s == NULL) {
+            ROARING_TERMINATE("failed to read frozen bitmap");
+        }
+        Roaring r;
+        r.roaring = *s;
+        return r;
+    }
+
+    /**
+     * For advanced users; see roaring_bitmap_portable_deserialize_frozen.
+     * This function may throw std::runtime_error.
+     */
+    static const Roaring portableDeserializeFrozen(const char *buf) {
+        const roaring_bitmap_t *s =
+            api::roaring_bitmap_portable_deserialize_frozen(buf);
+        if (s == NULL) {
+            ROARING_TERMINATE("failed to read portable frozen bitmap");
+        }
+        Roaring r;
+        r.roaring = *s;
+        return r;
+    }
+
+    /**
+     * For advanced users.
+     */
+    void writeFrozen(char *buf) const noexcept {
+        roaring_bitmap_frozen_serialize(&roaring, buf);
+    }
+
+    /**
+     * For advanced users.
+     */
+    size_t getFrozenSizeInBytes() const noexcept {
+        return roaring_bitmap_frozen_size_in_bytes(&roaring);
+    }
+
+    /**
+     * Computes the intersection between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     * Consider also using the operator &= to avoid needlessly creating
+     * many temporary bitmaps.
+     * This function may throw std::runtime_error.
+     */
+    Roaring operator&(const Roaring &o) const {
+        roaring_bitmap_t *r = api::roaring_bitmap_and(&roaring, &o.roaring);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed materalization in and");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the difference between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
+     */
+    Roaring operator-(const Roaring &o) const {
+        roaring_bitmap_t *r = api::roaring_bitmap_andnot(&roaring, &o.roaring);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed materalization in andnot");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
+     */
+    Roaring operator|(const Roaring &o) const {
+        roaring_bitmap_t *r = api::roaring_bitmap_or(&roaring, &o.roaring);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed materalization in or");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the symmetric union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
+     */
+    Roaring operator^(const Roaring &o) const {
+        roaring_bitmap_t *r = api::roaring_bitmap_xor(&roaring, &o.roaring);
+        if (r == NULL) {
+            ROARING_TERMINATE("failed materalization in xor");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Whether or not we apply copy and write.
+     */
+    void setCopyOnWrite(bool val) noexcept {
+        api::roaring_bitmap_set_copy_on_write(&roaring, val);
+    }
+
+    /**
+     * Print the content of the bitmap
+     */
+    void printf() const noexcept { api::roaring_bitmap_printf(&roaring); }
+
+    /**
+     * Print the content of the bitmap into a string
+     */
+    std::string toString() const noexcept {
+        struct iter_data {
+            std::string str{};  // The empty constructor silences warnings from
+                                // pedantic static analyzers.
+            char first_char = '{';
+        } outer_iter_data;
+        if (!isEmpty()) {
+            iterate(
+                [](uint32_t value, void *inner_iter_data) -> bool {
+                    ((iter_data *)inner_iter_data)->str +=
+                        ((iter_data *)inner_iter_data)->first_char;
+                    ((iter_data *)inner_iter_data)->str +=
+                        std::to_string(value);
+                    ((iter_data *)inner_iter_data)->first_char = ',';
+                    return true;
+                },
+                (void *)&outer_iter_data);
+        } else
+            outer_iter_data.str = '{';
+        outer_iter_data.str += '}';
+        return outer_iter_data.str;
+    }
+
+    /**
+     * Whether or not copy and write is active.
+     */
+    bool getCopyOnWrite() const noexcept {
+        return api::roaring_bitmap_get_copy_on_write(&roaring);
+    }
+
+    /**
+     * Computes the logical or (union) between "n" bitmaps (referenced by a
+     * pointer).
+     * This function may throw std::runtime_error.
+     */
+    static Roaring fastunion(size_t n, const Roaring **inputs) {
+        const roaring_bitmap_t **x = (const roaring_bitmap_t **)roaring_malloc(
+            n * sizeof(roaring_bitmap_t *));
+        if (x == NULL) {
+            ROARING_TERMINATE("failed memory alloc in fastunion");
+        }
+        for (size_t k = 0; k < n; ++k) x[k] = &inputs[k]->roaring;
+
+        roaring_bitmap_t *c_ans = api::roaring_bitmap_or_many(n, x);
+        if (c_ans == NULL) {
+            roaring_free(x);
+            ROARING_TERMINATE("failed memory alloc in fastunion");
+        }
+        Roaring ans(c_ans);
+        roaring_free(x);
+        return ans;
+    }
+
+    typedef RoaringSetBitForwardIterator const_iterator;
+
+    /**
+     * Returns an iterator that can be used to access the position of the set
+     * bits. The running time complexity of a full scan is proportional to the
+     * number of set bits: be aware that if you have long strings of 1s, this
+     * can be very inefficient.
+     *
+     * It can be much faster to use the toArray method if you want to retrieve
+     * the set bits.
+     */
+    const_iterator begin() const;
+
+    /**
+     * A bogus iterator that can be used together with begin()
+     * for constructions such as for (auto i = b.begin(); * i!=b.end(); ++i) {}
+     */
+    const_iterator &end() const;
+
+    roaring_bitmap_t roaring;
+};
+
+/**
+ * Used to go through the set bits. Not optimally fast, but convenient.
+ */
+class RoaringSetBitForwardIterator final {
+   public:
+    typedef std::forward_iterator_tag iterator_category;
+    typedef uint32_t *pointer;
+    typedef uint32_t &reference_type;
+    typedef uint32_t value_type;
+    typedef int32_t difference_type;
+    typedef RoaringSetBitForwardIterator type_of_iterator;
+
+    /**
+     * Provides the location of the set bit.
+     */
+    value_type operator*() const { return i.current_value; }
+
+    bool operator<(const type_of_iterator &o) const {
+        if (!i.has_value) return false;
+        if (!o.i.has_value) return true;
+        return i.current_value < *o;
+    }
+
+    bool operator<=(const type_of_iterator &o) const {
+        if (!o.i.has_value) return true;
+        if (!i.has_value) return false;
+        return i.current_value <= *o;
+    }
+
+    bool operator>(const type_of_iterator &o) const {
+        if (!o.i.has_value) return false;
+        if (!i.has_value) return true;
+        return i.current_value > *o;
+    }
+
+    bool operator>=(const type_of_iterator &o) const {
+        if (!i.has_value) return true;
+        if (!o.i.has_value) return false;
+        return i.current_value >= *o;
+    }
+
+    /**
+     * Move the iterator to the first value >= val.
+     */
+    void equalorlarger(uint32_t val) {
+        api::roaring_uint32_iterator_move_equalorlarger(&i, val);
+    }
+
+    type_of_iterator &operator++() {  // ++i, must returned inc. value
+        api::roaring_uint32_iterator_advance(&i);
+        return *this;
+    }
+
+    type_of_iterator operator++(int) {  // i++, must return orig. value
+        RoaringSetBitForwardIterator orig(*this);
+        api::roaring_uint32_iterator_advance(&i);
+        return orig;
+    }
+
+    type_of_iterator &operator--() {  // prefix --
+        api::roaring_uint32_iterator_previous(&i);
+        return *this;
+    }
+
+    type_of_iterator operator--(int) {  // postfix --
+        RoaringSetBitForwardIterator orig(*this);
+        api::roaring_uint32_iterator_previous(&i);
+        return orig;
+    }
+
+    bool operator==(const RoaringSetBitForwardIterator &o) const {
+        return i.current_value == *o && i.has_value == o.i.has_value;
+    }
+
+    bool operator!=(const RoaringSetBitForwardIterator &o) const {
+        return i.current_value != *o || i.has_value != o.i.has_value;
+    }
+
+    explicit RoaringSetBitForwardIterator(const Roaring &parent,
+                                          bool exhausted = false) {
+        if (exhausted) {
+            i.parent = &parent.roaring;
+            i.container_index = INT32_MAX;
+            i.has_value = false;
+            i.current_value = UINT32_MAX;
+        } else {
+            api::roaring_iterator_init(&parent.roaring, &i);
+        }
+    }
+
+    api::roaring_uint32_iterator_t
+        i{};  // The empty constructor silences warnings from pedantic static
+              // analyzers.
+};
+
+inline RoaringSetBitForwardIterator Roaring::begin() const {
+    return RoaringSetBitForwardIterator(*this);
+}
+
+inline RoaringSetBitForwardIterator &Roaring::end() const {
+    static RoaringSetBitForwardIterator e(*this, true);
+    return e;
+}
+
+}  // namespace roaring
+
+#endif /* INCLUDE_ROARING_HH_ */
+/* end file cpp/roaring.hh */
+/* begin file cpp/roaring64map.hh */
+/**
+ * A C++ header for 64-bit Roaring Bitmaps,
+ * implemented by way of a map of many
+ * 32-bit Roaring Bitmaps.
+ *
+ * Reference (format specification) :
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations
+ */
+#ifndef INCLUDE_ROARING_64_MAP_HH_
+#define INCLUDE_ROARING_64_MAP_HH_
+
+#include <algorithm>
+#include <cinttypes>  // PRIu64 macro
+#include <cstdarg>    // for va_list handling in bitmapOf()
+#include <cstdio>     // for std::printf() in the printf() method
+#include <cstring>    // for std::memcpy()
+#include <functional>
+#include <initializer_list>
+#include <limits>
+#include <map>
+#include <new>
+#include <numeric>
+#include <queue>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+
+namespace roaring {
+
+using roaring::Roaring;
+
+class Roaring64MapSetBitBiDirectionalIterator;
+
+// For backwards compatibility; there used to be two kinds of iterators
+// (forward and bidirectional) and now there's only one.
+typedef Roaring64MapSetBitBiDirectionalIterator
+    Roaring64MapSetBitForwardIterator;
+
+class Roaring64Map {
+    typedef api::roaring_bitmap_t roaring_bitmap_t;
+
+   public:
+    /**
+     * Create an empty bitmap
+     */
+    Roaring64Map() = default;
+
+    /**
+     * Construct a bitmap from a list of 32-bit integer values.
+     */
+    Roaring64Map(size_t n, const uint32_t *data) { addMany(n, data); }
+
+    /**
+     * Construct a bitmap from a list of 64-bit integer values.
+     */
+    Roaring64Map(size_t n, const uint64_t *data) { addMany(n, data); }
+
+    /**
+     * Construct a bitmap from an initializer list.
+     */
+    Roaring64Map(std::initializer_list<uint64_t> l) {
+        addMany(l.size(), l.begin());
+    }
+
+    /**
+     * Construct a 64-bit map from a 32-bit one
+     */
+    explicit Roaring64Map(const Roaring &r) { emplaceOrInsert(0, r); }
+
+    /**
+     * Construct a 64-bit map from a 32-bit rvalue
+     */
+    explicit Roaring64Map(Roaring &&r) { emplaceOrInsert(0, std::move(r)); }
+
+    /**
+     * Construct a roaring object from the C struct.
+     *
+     * Passing a NULL point is unsafe.
+     */
+    explicit Roaring64Map(roaring_bitmap_t *s) {
+        emplaceOrInsert(0, Roaring(s));
+    }
+
+    Roaring64Map(const Roaring64Map &r) = default;
+
+    Roaring64Map(Roaring64Map &&r) noexcept = default;
+
+    /**
+     * Copy assignment operator.
+     */
+    Roaring64Map &operator=(const Roaring64Map &r) = default;
+
+    /**
+     * Move assignment operator.
+     */
+    Roaring64Map &operator=(Roaring64Map &&r) noexcept = default;
+
+    /**
+     * Assignment from an initializer list.
+     */
+    Roaring64Map &operator=(std::initializer_list<uint64_t> l) {
+        // Delegate to move assignment operator
+        *this = Roaring64Map(l);
+        return *this;
+    }
+
+    /**
+     * Construct a bitmap from a list of uint64_t values.
+     */
+    static Roaring64Map bitmapOf(size_t n...) {
+        Roaring64Map ans;
+        va_list vl;
+        va_start(vl, n);
+        for (size_t i = 0; i < n; i++) {
+            ans.add(va_arg(vl, uint64_t));
+        }
+        va_end(vl);
+        return ans;
+    }
+
+    /**
+     * Construct a bitmap from a list of uint64_t values.
+     * E.g., bitmapOfList({1,2,3}).
+     */
+    static Roaring64Map bitmapOfList(std::initializer_list<uint64_t> l) {
+        Roaring64Map ans;
+        ans.addMany(l.size(), l.begin());
+        return ans;
+    }
+
+    /**
+     * Adds value x.
+     */
+    void add(uint32_t x) { lookupOrCreateInner(0).add(x); }
+
+    /**
+     * Adds value x.
+     */
+    void add(uint64_t x) { lookupOrCreateInner(highBytes(x)).add(lowBytes(x)); }
+
+    /**
+     * Adds value x.
+     * Returns true if a new value was added, false if the value was already
+     * present.
+     */
+    bool addChecked(uint32_t x) { return lookupOrCreateInner(0).addChecked(x); }
+
+    /**
+     * Adds value x.
+     * Returns true if a new value was added, false if the value was already
+     * present.
+     */
+    bool addChecked(uint64_t x) {
+        return lookupOrCreateInner(highBytes(x)).addChecked(lowBytes(x));
+    }
+
+    /**
+     * Adds all values in the half-open interval [min, max).
+     */
+    void addRange(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        addRangeClosed(min, max - 1);
+    }
+
+    /**
+     * Adds all values in the closed interval [min, max].
+     */
+    void addRangeClosed(uint32_t min, uint32_t max) {
+        lookupOrCreateInner(0).addRangeClosed(min, max);
+    }
+
+    /**
+     * Adds all values in the closed interval [min, max]
+     */
+    void addRangeClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+            return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // Fill in any nonexistent slots with empty Roarings. This simplifies
+        // the logic below, allowing it to simply iterate over the map between
+        // 'start_high' and 'end_high' in a linear fashion.
+        auto current_iter = ensureRangePopulated(start_high, end_high);
+
+        // If start and end land on the same inner bitmap, then we can do the
+        // whole operation in one call.
+        if (start_high == end_high) {
+            auto &bitmap = current_iter->second;
+            bitmap.addRangeClosed(start_low, end_low);
+            return;
+        }
+
+        // Because start and end don't land on the same inner bitmap,
+        // we need to do this in multiple steps:
+        // 1. Partially fill the first bitmap with values from the closed
+        //    interval [start_low, uint32_max]
+        // 2. Fill intermediate bitmaps completely: [0, uint32_max]
+        // 3. Partially fill the last bitmap with values from the closed
+        //    interval [0, end_low]
+        auto num_intermediate_bitmaps = end_high - start_high - 1;
+
+        // Step 1: Partially fill the first bitmap.
+        {
+            auto &bitmap = current_iter->second;
+            bitmap.addRangeClosed(start_low, uint32_max);
+            ++current_iter;
+        }
+
+        // Step 2. Fill intermediate bitmaps completely.
+        if (num_intermediate_bitmaps != 0) {
+            auto &first_intermediate = current_iter->second;
+            first_intermediate.addRangeClosed(0, uint32_max);
+            ++current_iter;
+
+            // Now make (num_intermediate_bitmaps - 1) copies of this.
+            for (uint32_t i = 1; i != num_intermediate_bitmaps; ++i) {
+                auto &next_intermediate = current_iter->second;
+                next_intermediate = first_intermediate;
+                ++current_iter;
+            }
+        }
+
+        // Step 3: Partially fill the last bitmap.
+        auto &bitmap = current_iter->second;
+        bitmap.addRangeClosed(0, end_low);
+    }
+
+    /**
+     * Adds 'n_args' values from the contiguous memory range starting at 'vals'.
+     */
+    void addMany(size_t n_args, const uint32_t *vals) {
+        lookupOrCreateInner(0).addMany(n_args, vals);
+    }
+
+    /**
+     * Adds 'n_args' values from the contiguous memory range starting at 'vals'.
+     */
+    void addMany(size_t n_args, const uint64_t *vals) {
+        // Potentially reduce outer map lookups by optimistically
+        // assuming that adjacent values will belong to the same inner bitmap.
+        Roaring *last_inner_bitmap = nullptr;
+        uint32_t last_value_high = 0;
+        BulkContext last_bulk_context;
+        for (size_t lcv = 0; lcv < n_args; lcv++) {
+            auto value = vals[lcv];
+            auto value_high = highBytes(value);
+            auto value_low = lowBytes(value);
+            if (last_inner_bitmap == nullptr || value_high != last_value_high) {
+                last_inner_bitmap = &lookupOrCreateInner(value_high);
+                last_value_high = value_high;
+                last_bulk_context = BulkContext{};
+            }
+            last_inner_bitmap->addBulk(last_bulk_context, value_low);
+        }
+    }
+
+    /**
+     * Removes value x.
+     */
+    void remove(uint32_t x) {
+        auto iter = roarings.begin();
+        // Since x is a uint32_t, highbytes(x) == 0. The inner bitmap we are
+        // looking for, if it exists, will be at the first slot of 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.remove(x);
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Removes value x.
+     */
+    void remove(uint64_t x) {
+        auto iter = roarings.find(highBytes(x));
+        if (iter == roarings.end()) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.remove(lowBytes(x));
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Removes value x
+     * Returns true if a new value was removed, false if the value was not
+     * present.
+     */
+    bool removeChecked(uint32_t x) {
+        auto iter = roarings.begin();
+        // Since x is a uint32_t, highbytes(x) == 0. The inner bitmap we are
+        // looking for, if it exists, will be at the first slot of 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return false;
+        }
+        auto &bitmap = iter->second;
+        if (!bitmap.removeChecked(x)) {
+            return false;
+        }
+        eraseIfEmpty(iter);
+        return true;
+    }
+
+    /**
+     * Remove value x
+     * Returns true if a new value was removed, false if the value was not
+     * present.
+     */
+    bool removeChecked(uint64_t x) {
+        auto iter = roarings.find(highBytes(x));
+        if (iter == roarings.end()) {
+            return false;
+        }
+        auto &bitmap = iter->second;
+        if (!bitmap.removeChecked(lowBytes(x))) {
+            return false;
+        }
+        eraseIfEmpty(iter);
+        return true;
+    }
+
+    /**
+     * Removes all values in the half-open interval [min, max).
+     */
+    void removeRange(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        return removeRangeClosed(min, max - 1);
+    }
+
+    /**
+     * Removes all values in the closed interval [min, max].
+     */
+    void removeRangeClosed(uint32_t min, uint32_t max) {
+        auto iter = roarings.begin();
+        // Since min and max are uint32_t, highbytes(min or max) == 0. The inner
+        // bitmap we are looking for, if it exists, will be at the first slot of
+        // 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.removeRangeClosed(min, max);
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Removes all values in the closed interval [min, max].
+     */
+    void removeRangeClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+            return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // If the outer map is empty, end_high is less than the first key,
+        // or start_high is greater than the last key, then exit now because
+        // there is no work to do.
+        if (roarings.empty() || end_high < roarings.cbegin()->first ||
+            start_high > (roarings.crbegin())->first) {
+            return;
+        }
+
+        // If we get here, start_iter points to the first entry in the outer map
+        // with key >= start_high. Such an entry is known to exist (i.e. the
+        // iterator will not be equal to end()) because start_high <= the last
+        // key in the map (thanks to the above if statement).
+        auto start_iter = roarings.lower_bound(start_high);
+        // end_iter points to the first entry in the outer map with
+        // key >= end_high, if such a key exists. Otherwise, it equals end().
+        auto end_iter = roarings.lower_bound(end_high);
+
+        // Note that the 'lower_bound' method will find the start and end slots,
+        // if they exist; otherwise it will find the next-higher slots.
+        // In the case where 'start' landed on an existing slot, we need to do a
+        // partial erase of that slot, and likewise for 'end'. But all the slots
+        // in between can be fully erased. More precisely:
+        //
+        // 1. If the start point falls on an existing entry, there are two
+        //    subcases:
+        //    a. if the end point falls on that same entry, remove the closed
+        //       interval [start_low, end_low] from that entry and we are done.
+        //    b. Otherwise, remove the closed interval [start_low, uint32_max]
+        //       from that entry, advance start_iter, and fall through to
+        //       step 2.
+        // 2. Completely erase all slots in the half-open interval
+        //    [start_iter, end_iter)
+        // 3. If the end point falls on an existing entry, remove the closed
+        //    interval [0, end_high] from it.
+
+        // Step 1. If the start point falls on an existing entry...
+        if (start_iter->first == start_high) {
+            auto &start_inner = start_iter->second;
+            // 1a. if the end point falls on that same entry...
+            if (start_iter == end_iter) {
+                start_inner.removeRangeClosed(start_low, end_low);
+                eraseIfEmpty(start_iter);
+                return;
+            }
+
+            // 1b. Otherwise, remove the closed range [start_low, uint32_max]...
+            start_inner.removeRangeClosed(start_low, uint32_max);
+            // Advance start_iter, but keep the old value so we can check the
+            // bitmap we just modified for emptiness and erase if it necessary.
+            auto temp = start_iter++;
+            eraseIfEmpty(temp);
+        }
+
+        // 2. Completely erase all slots in the half-open interval...
+        roarings.erase(start_iter, end_iter);
+
+        // 3. If the end point falls on an existing entry...
+        if (end_iter != roarings.end() && end_iter->first == end_high) {
+            auto &end_inner = end_iter->second;
+            end_inner.removeRangeClosed(0, end_low);
+            eraseIfEmpty(end_iter);
+        }
+    }
+
+    /**
+     * Clears the bitmap.
+     */
+    void clear() { roarings.clear(); }
+
+    /**
+     * Return the largest value (if not empty)
+     */
+    uint64_t maximum() const {
+        for (auto roaring_iter = roarings.crbegin();
+             roaring_iter != roarings.crend(); ++roaring_iter) {
+            if (!roaring_iter->second.isEmpty()) {
+                return uniteBytes(roaring_iter->first,
+                                  roaring_iter->second.maximum());
+            }
+        }
+        // we put std::numeric_limits<>::max/min in parentheses
+        // to avoid a clash with the Windows.h header under Windows
+        return (std::numeric_limits<uint64_t>::min)();
+    }
+
+    /**
+     * Return the smallest value (if not empty)
+     */
+    uint64_t minimum() const {
+        for (auto roaring_iter = roarings.cbegin();
+             roaring_iter != roarings.cend(); ++roaring_iter) {
+            if (!roaring_iter->second.isEmpty()) {
+                return uniteBytes(roaring_iter->first,
+                                  roaring_iter->second.minimum());
+            }
+        }
+        // we put std::numeric_limits<>::max/min in parentheses
+        // to avoid a clash with the Windows.h header under Windows
+        return (std::numeric_limits<uint64_t>::max)();
+    }
+
+    /**
+     * Check if value x is present
+     */
+    bool contains(uint32_t x) const {
+        auto iter = roarings.find(0);
+        if (iter == roarings.end()) {
+            return false;
+        }
+        return iter->second.contains(x);
+    }
+    bool contains(uint64_t x) const {
+        auto iter = roarings.find(highBytes(x));
+        if (iter == roarings.end()) {
+            return false;
+        }
+        return iter->second.contains(lowBytes(x));
+    }
+
+    /**
+     * Compute the intersection of the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     */
+    Roaring64Map &operator&=(const Roaring64Map &other) {
+        if (this == &other) {
+            // ANDing *this with itself is a no-op.
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self & other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  empty           None
+        // present  absent   empty           Erase self
+        // present  present  empty or not    Intersect self with other, but
+        //                                   erase self if result is empty.
+        //
+        // Because there is only work to do when a key is present in 'self', the
+        // main for loop iterates over entries in 'self'.
+
+        decltype(roarings.begin()) self_next;
+        for (auto self_iter = roarings.begin(); self_iter != roarings.end();
+             self_iter = self_next) {
+            // Do the 'next' operation now, so we don't have to worry about
+            // invalidation of self_iter down below with the 'erase' operation.
+            self_next = std::next(self_iter);
+
+            auto self_key = self_iter->first;
+            auto &self_bitmap = self_iter->second;
+
+            auto other_iter = other.roarings.find(self_key);
+            if (other_iter == other.roarings.end()) {
+                // 'other' doesn't have self_key. In the logic table above,
+                // this reflects the case (self.present & other.absent).
+                // So, erase self.
+                roarings.erase(self_iter);
+                continue;
+            }
+
+            // Both sides have self_key. In the logic table above, this reflects
+            // the case (self.present & other.present). So, intersect self with
+            // other.
+            const auto &other_bitmap = other_iter->second;
+            self_bitmap &= other_bitmap;
+            if (self_bitmap.isEmpty()) {
+                // ...but if intersection is empty, remove it altogether.
+                roarings.erase(self_iter);
+            }
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the difference between the current bitmap and the provided
+     * bitmap, writing the result in the current bitmap. The provided bitmap
+     * is not modified.
+     */
+    Roaring64Map &operator-=(const Roaring64Map &other) {
+        if (this == &other) {
+            // Subtracting *this from itself results in the empty map.
+            roarings.clear();
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self - other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  empty           None
+        // present  absent   unchanged       None
+        // present  present  empty or not    Subtract other from self, but
+        //                                   erase self if result is empty
+        //
+        // Because there is only work to do when a key is present in both 'self'
+        // and 'other', the main while loop ping-pongs back and forth until it
+        // finds the next key that is the same on both sides.
+
+        auto self_iter = roarings.begin();
+        auto other_iter = other.roarings.cbegin();
+
+        while (self_iter != roarings.end() &&
+               other_iter != other.roarings.cend()) {
+            auto self_key = self_iter->first;
+            auto other_key = other_iter->first;
+            if (self_key < other_key) {
+                // Because self_key is < other_key, advance self_iter to the
+                // first point where self_key >= other_key (or end).
+                self_iter = roarings.lower_bound(other_key);
+                continue;
+            }
+
+            if (self_key > other_key) {
+                // Because self_key is > other_key, advance other_iter to the
+                // first point where other_key >= self_key (or end).
+                other_iter = other.roarings.lower_bound(self_key);
+                continue;
+            }
+
+            // Both sides have self_key. In the logic table above, this reflects
+            // the case (self.present & other.present). So subtract other from
+            // self.
+            auto &self_bitmap = self_iter->second;
+            const auto &other_bitmap = other_iter->second;
+            self_bitmap -= other_bitmap;
+
+            if (self_bitmap.isEmpty()) {
+                // ...but if subtraction is empty, remove it altogether.
+                self_iter = roarings.erase(self_iter);
+            } else {
+                ++self_iter;
+            }
+            ++other_iter;
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the union of the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * See also the fastunion function to aggregate many bitmaps more quickly.
+     */
+    Roaring64Map &operator|=(const Roaring64Map &other) {
+        if (this == &other) {
+            // ORing *this with itself is a no-op.
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self | other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  not empty       Copy other to self and set flags
+        // present  absent   unchanged       None
+        // present  present  not empty       self |= other
+        //
+        // Because there is only work to do when a key is present in 'other',
+        // the main for loop iterates over entries in 'other'.
+
+        for (const auto &other_entry : other.roarings) {
+            const auto &other_bitmap = other_entry.second;
+
+            // Try to insert other_bitmap into self at other_key. We take
+            // advantage of the fact that std::map::insert will not overwrite an
+            // existing entry.
+            auto insert_result = roarings.insert(other_entry);
+            auto self_iter = insert_result.first;
+            auto insert_happened = insert_result.second;
+            auto &self_bitmap = self_iter->second;
+
+            if (insert_happened) {
+                // Key was not present in self, so insert was performed above.
+                // In the logic table above, this reflects the case
+                // (self.absent | other.present). Because the copy has already
+                // happened, thanks to the 'insert' operation above, we just
+                // need to set the copyOnWrite flag.
+                self_bitmap.setCopyOnWrite(copyOnWrite);
+                continue;
+            }
+
+            // Both sides have self_key, and the insert was not performed. In
+            // the logic table above, this reflects the case
+            // (self.present & other.present). So OR other into self.
+            self_bitmap |= other_bitmap;
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the XOR of the current bitmap and the provided bitmap, writing
+     * the result in the current bitmap. The provided bitmap is not modified.
+     */
+    Roaring64Map &operator^=(const Roaring64Map &other) {
+        if (this == &other) {
+            // XORing *this with itself results in the empty map.
+            roarings.clear();
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self ^ other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  non-empty       Copy other to self and set flags
+        // present  absent   unchanged       None
+        // present  present  empty or not    XOR other into self, but erase self
+        //                                   if result is empty.
+        //
+        // Because there is only work to do when a key is present in 'other',
+        // the main for loop iterates over entries in 'other'.
+
+        for (const auto &other_entry : other.roarings) {
+            const auto &other_bitmap = other_entry.second;
+
+            // Try to insert other_bitmap into self at other_key. We take
+            // advantage of the fact that std::map::insert will not overwrite an
+            // existing entry.
+            auto insert_result = roarings.insert(other_entry);
+            auto self_iter = insert_result.first;
+            auto insert_happened = insert_result.second;
+            auto &self_bitmap = self_iter->second;
+
+            if (insert_happened) {
+                // Key was not present in self, so insert was performed above.
+                // In the logic table above, this reflects the case
+                // (self.absent ^ other.present). Because the copy has already
+                // happened, thanks to the 'insert' operation above, we just
+                // need to set the copyOnWrite flag.
+                self_bitmap.setCopyOnWrite(copyOnWrite);
+                continue;
+            }
+
+            // Both sides have self_key, and the insert was not performed. In
+            // the logic table above, this reflects the case
+            // (self.present ^ other.present). So XOR other into self.
+            self_bitmap ^= other_bitmap;
+
+            if (self_bitmap.isEmpty()) {
+                // ...but if intersection is empty, remove it altogether.
+                roarings.erase(self_iter);
+            }
+        }
+        return *this;
+    }
+
+    /**
+     * Exchange the content of this bitmap with another.
+     */
+    void swap(Roaring64Map &r) { roarings.swap(r.roarings); }
+
+    /**
+     * Get the cardinality of the bitmap (number of elements).
+     * Throws std::length_error in the special case where the bitmap is full
+     * (cardinality() == 2^64). Check isFull() before calling to avoid
+     * exception.
+     */
+    uint64_t cardinality() const {
+        if (isFull()) {
+#if ROARING_EXCEPTIONS
+            throw std::length_error(
+                "bitmap is full, cardinality is 2^64, "
+                "unable to represent in a 64-bit integer");
+#else
+            ROARING_TERMINATE(
+                "bitmap is full, cardinality is 2^64, "
+                "unable to represent in a 64-bit integer");
+#endif
+        }
+        return std::accumulate(
+            roarings.cbegin(), roarings.cend(), (uint64_t)0,
+            [](uint64_t previous,
+               const std::pair<const uint32_t, Roaring> &map_entry) {
+                return previous + map_entry.second.cardinality();
+            });
+    }
+
+    /**
+     * Returns true if the bitmap is empty (cardinality is zero).
+     */
+    bool isEmpty() const {
+        return std::all_of(
+            roarings.cbegin(), roarings.cend(),
+            [](const std::pair<const uint32_t, Roaring> &map_entry) {
+                return map_entry.second.isEmpty();
+            });
+    }
+
+    /**
+     * Returns true if the bitmap is full (cardinality is max uint64_t + 1).
+     */
+    bool isFull() const {
+        // only bother to check if map is fully saturated
+        //
+        // we put std::numeric_limits<>::max/min in parentheses
+        // to avoid a clash with the Windows.h header under Windows
+        return roarings.size() ==
+                       ((uint64_t)(std::numeric_limits<uint32_t>::max)()) + 1
+                   ? std::all_of(
+                         roarings.cbegin(), roarings.cend(),
+                         [](const std::pair<const uint32_t, Roaring>
+                                &roaring_map_entry) {
+                             // roarings within map are saturated if cardinality
+                             // is uint32_t max + 1
+                             return roaring_map_entry.second.cardinality() ==
+                                    ((uint64_t)(std::numeric_limits<
+                                                uint32_t>::max)()) +
+                                        1;
+                         })
+                   : false;
+    }
+
+    /**
+     * Returns true if the bitmap is subset of the other.
+     */
+    bool isSubset(const Roaring64Map &r) const {
+        for (const auto &map_entry : roarings) {
+            if (map_entry.second.isEmpty()) {
+                continue;
+            }
+            auto roaring_iter = r.roarings.find(map_entry.first);
+            if (roaring_iter == r.roarings.cend())
+                return false;
+            else if (!map_entry.second.isSubset(roaring_iter->second))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Returns true if the bitmap is strict subset of the other.
+     * Throws std::length_error in the special case where the bitmap is full
+     * (cardinality() == 2^64). Check isFull() before calling to avoid
+     * exception.
+     */
+    bool isStrictSubset(const Roaring64Map &r) const {
+        return isSubset(r) && cardinality() != r.cardinality();
+    }
+
+    /**
+     * Convert the bitmap to an array. Write the output to "ans",
+     * caller is responsible to ensure that there is enough memory
+     * allocated
+     * (e.g., ans = new uint32[mybitmap.cardinality()];)
+     */
+    void toUint64Array(uint64_t *ans) const {
+        // Annoyingly, VS 2017 marks std::accumulate() as [[nodiscard]]
+        (void)std::accumulate(
+            roarings.cbegin(), roarings.cend(), ans,
+            [](uint64_t *previous,
+               const std::pair<const uint32_t, Roaring> &map_entry) {
+                for (uint32_t low_bits : map_entry.second)
+                    *previous++ = uniteBytes(map_entry.first, low_bits);
+                return previous;
+            });
+    }
+
+    /**
+     * Return true if the two bitmaps contain the same elements.
+     */
+    bool operator==(const Roaring64Map &r) const {
+        // we cannot use operator == on the map because either side may contain
+        // empty Roaring Bitmaps
+        auto lhs_iter = roarings.cbegin();
+        auto lhs_cend = roarings.cend();
+        auto rhs_iter = r.roarings.cbegin();
+        auto rhs_cend = r.roarings.cend();
+        while (lhs_iter != lhs_cend && rhs_iter != rhs_cend) {
+            auto lhs_key = lhs_iter->first, rhs_key = rhs_iter->first;
+            const auto &lhs_map = lhs_iter->second, &rhs_map = rhs_iter->second;
+            if (lhs_map.isEmpty()) {
+                ++lhs_iter;
+                continue;
+            }
+            if (rhs_map.isEmpty()) {
+                ++rhs_iter;
+                continue;
+            }
+            if (!(lhs_key == rhs_key)) {
+                return false;
+            }
+            if (!(lhs_map == rhs_map)) {
+                return false;
+            }
+            ++lhs_iter;
+            ++rhs_iter;
+        }
+        while (lhs_iter != lhs_cend) {
+            if (!lhs_iter->second.isEmpty()) {
+                return false;
+            }
+            ++lhs_iter;
+        }
+        while (rhs_iter != rhs_cend) {
+            if (!rhs_iter->second.isEmpty()) {
+                return false;
+            }
+            ++rhs_iter;
+        }
+        return true;
+    }
+
+    /**
+     * Computes the negation of the roaring bitmap within the half-open interval
+     * [min, max). Areas outside the interval are unchanged.
+     */
+    void flip(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        flipClosed(min, max - 1);
+    }
+
+    /**
+     * Computes the negation of the roaring bitmap within the closed interval
+     * [min, max]. Areas outside the interval are unchanged.
+     */
+    void flipClosed(uint32_t min, uint32_t max) {
+        auto iter = roarings.begin();
+        // Since min and max are uint32_t, highbytes(min or max) == 0. The inner
+        // bitmap we are looking for, if it exists, will be at the first slot of
+        // 'roarings'. If it does not exist, we have to create it.
+        if (iter == roarings.end() || iter->first != 0) {
+            iter = roarings.emplace_hint(iter, std::piecewise_construct,
+                                         std::forward_as_tuple(0),
+                                         std::forward_as_tuple());
+            auto &bitmap = iter->second;
+            bitmap.setCopyOnWrite(copyOnWrite);
+        }
+        auto &bitmap = iter->second;
+        bitmap.flipClosed(min, max);
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Computes the negation of the roaring bitmap within the closed interval
+     * [min, max]. Areas outside the interval are unchanged.
+     */
+    void flipClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+            return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // Fill in any nonexistent slots with empty Roarings. This simplifies
+        // the logic below, allowing it to simply iterate over the map between
+        // 'start_high' and 'end_high' in a linear fashion.
+        auto current_iter = ensureRangePopulated(start_high, end_high);
+
+        // If start and end land on the same inner bitmap, then we can do the
+        // whole operation in one call.
+        if (start_high == end_high) {
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(start_low, end_low);
+            eraseIfEmpty(current_iter);
+            return;
+        }
+
+        // Because start and end don't land on the same inner bitmap,
+        // we need to do this in multiple steps:
+        // 1. Partially flip the first bitmap in the closed interval
+        //    [start_low, uint32_max]
+        // 2. Flip intermediate bitmaps completely: [0, uint32_max]
+        // 3. Partially flip the last bitmap in the closed interval
+        //    [0, end_low]
+
+        auto num_intermediate_bitmaps = end_high - start_high - 1;
+
+        // 1. Partially flip the first bitmap.
+        {
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(start_low, uint32_max);
+            auto temp = current_iter++;
+            eraseIfEmpty(temp);
+        }
+
+        // 2. Flip intermediate bitmaps completely.
+        for (uint32_t i = 0; i != num_intermediate_bitmaps; ++i) {
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(0, uint32_max);
+            auto temp = current_iter++;
+            eraseIfEmpty(temp);
+        }
+
+        // 3. Partially flip the last bitmap.
+        auto &bitmap = current_iter->second;
+        bitmap.flipClosed(0, end_low);
+        eraseIfEmpty(current_iter);
+    }
+
+    /**
+     * Remove run-length encoding even when it is more space efficient
+     * return whether a change was applied
+     */
+    bool removeRunCompression() {
+        return std::accumulate(
+            roarings.begin(), roarings.end(), true,
+            [](bool previous, std::pair<const uint32_t, Roaring> &map_entry) {
+                return map_entry.second.removeRunCompression() && previous;
+            });
+    }
+
+    /**
+     * Convert array and bitmap containers to run containers when it is more
+     * efficient; also convert from run containers when more space efficient.
+     * Returns true if the result has at least one run container.
+     * Additional savings might be possible by calling shrinkToFit().
+     */
+    bool runOptimize() {
+        return std::accumulate(
+            roarings.begin(), roarings.end(), true,
+            [](bool previous, std::pair<const uint32_t, Roaring> &map_entry) {
+                return map_entry.second.runOptimize() && previous;
+            });
+    }
+
+    /**
+     * If needed, reallocate memory to shrink the memory usage.
+     * Returns the number of bytes saved.
+     */
+    size_t shrinkToFit() {
+        size_t savedBytes = 0;
+        auto iter = roarings.begin();
+        while (iter != roarings.cend()) {
+            if (iter->second.isEmpty()) {
+                // empty Roarings are 84 bytes
+                savedBytes += 88;
+                roarings.erase(iter++);
+            } else {
+                savedBytes += iter->second.shrinkToFit();
+                iter++;
+            }
+        }
+        return savedBytes;
+    }
+
+    /**
+     * Iterate over the bitmap elements in order(start from the smallest one)
+     * and call iterator once for every element until the iterator function
+     * returns false. To iterate over all values, the iterator function should
+     * always return true.
+     *
+     * The roaring_iterator64 parameter is a pointer to a function that
+     * returns bool (true means that the iteration should continue while false
+     * means that it should stop), and takes (uint64_t element, void* ptr) as
+     * inputs.
+     */
+    void iterate(api::roaring_iterator64 iterator, void *ptr) const {
+        for (const auto &map_entry : roarings) {
+            bool should_continue =
+                roaring_iterate64(&map_entry.second.roaring, iterator,
+                                  uint64_t(map_entry.first) << 32, ptr);
+            if (!should_continue) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Selects the value at index 'rank' in the bitmap, where the smallest value
+     * is at index 0. If 'rank' < cardinality(), returns true with *element set
+     * to the element of the specified rank. Otherwise, returns false and the
+     * contents of *element are unspecified.
+     */
+    bool select(uint64_t rank, uint64_t *element) const {
+        for (const auto &map_entry : roarings) {
+            auto key = map_entry.first;
+            const auto &bitmap = map_entry.second;
+
+            uint64_t sub_cardinality = bitmap.cardinality();
+            if (rank < sub_cardinality) {
+                uint32_t low_bytes;
+                // Casting rank to uint32_t is safe because
+                // rank < sub_cardinality and sub_cardinality <= 2^32.
+                if (!bitmap.select((uint32_t)rank, &low_bytes)) {
+                    ROARING_TERMINATE(
+                        "Logic error: bitmap.select() "
+                        "returned false despite rank < cardinality()");
+                }
+                *element = uniteBytes(key, low_bytes);
+                return true;
+            }
+            rank -= sub_cardinality;
+        }
+        return false;
+    }
+
+    /**
+     * Returns the number of integers that are smaller or equal to x.
+     */
+    uint64_t rank(uint64_t x) const {
+        uint64_t result = 0;
+        // Find the first bitmap >= x's bucket. If that is the bucket x would be
+        // in, find it's rank in that bucket. Either way, we're left with a
+        // range of all buckets strictly smaller than x's bucket, add all their
+        // cardinalities together.
+        auto end = roarings.lower_bound(highBytes(x));
+        if (end != roarings.cend() && end->first == highBytes(x)) {
+            result += end->second.rank(lowBytes(x));
+        }
+        for (auto iter = roarings.cbegin(); iter != end; ++iter) {
+            result += iter->second.cardinality();
+        }
+        return result;
+    }
+
+    /**
+     * Returns the index of x in the set, index start from 0.
+     * If the set doesn't contain x , this function will return -1.
+     * The difference with rank function is that this function will return -1
+     * when x isn't in the set, but the rank function will return a
+     * non-negative number.
+     */
+    int64_t getIndex(uint64_t x) const {
+        int64_t index = 0;
+        auto roaring_destination = roarings.find(highBytes(x));
+        if (roaring_destination != roarings.cend()) {
+            for (auto roaring_iter = roarings.cbegin();
+                 roaring_iter != roaring_destination; ++roaring_iter) {
+                index += roaring_iter->second.cardinality();
+            }
+            auto low_idx = roaring_destination->second.getIndex(lowBytes(x));
+            if (low_idx < 0) return -1;
+            index += low_idx;
+            return index;
+        }
+        return -1;
+    }
+
+    /**
+     * Write a bitmap to a char buffer. This is meant to be compatible with
+     * the Java and Go versions. Returns how many bytes were written which
+     * should be getSizeInBytes().
+     *
+     * Setting the portable flag to false enables a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     */
+    size_t write(char *buf, bool portable = true) const {
+        const char *orig = buf;
+        // push map size
+        uint64_t map_size = roarings.size();
+        std::memcpy(buf, &map_size, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        std::for_each(roarings.cbegin(), roarings.cend(),
+                      [&buf, portable](
+                          const std::pair<const uint32_t, Roaring> &map_entry) {
+                          // push map key
+                          std::memcpy(buf, &map_entry.first, sizeof(uint32_t));
+                          // ^-- Note: `*((uint32_t*)buf) = map_entry.first;` is
+                          // undefined
+
+                          buf += sizeof(uint32_t);
+                          // push map value Roaring
+                          buf += map_entry.second.write(buf, portable);
+                      });
+        return buf - orig;
+    }
+
+    /**
+     * Read a bitmap from a serialized version. This is meant to be compatible
+     * with the Java and Go versions.
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     *
+     * This function is unsafe in the sense that if you provide bad data, many
+     * bytes could be read, possibly causing a buffer overflow. See also
+     * readSafe.
+     */
+    static Roaring64Map read(const char *buf, bool portable = true) {
+        Roaring64Map result;
+        // get map size
+        uint64_t map_size;
+        std::memcpy(&map_size, buf, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            // get map key
+            uint32_t key;
+            std::memcpy(&key, buf, sizeof(uint32_t));
+            // ^-- Note: `uint32_t key = *((uint32_t*)buf);` is undefined
+
+            buf += sizeof(uint32_t);
+            // read map value Roaring
+            Roaring read_var = Roaring::read(buf, portable);
+            // forward buffer past the last Roaring Bitmap
+            buf += read_var.getSizeInBytes(portable);
+            result.emplaceOrInsert(key, std::move(read_var));
+        }
+        return result;
+    }
+
+    /**
+     * Read a bitmap from a serialized version, reading no more than maxbytes
+     * bytes.  This is meant to be compatible with the Java and Go versions.
+     *
+     * Setting the portable flag to false enable a custom format that can save
+     * space compared to the portable format (e.g., for very sparse bitmaps).
+     */
+    static Roaring64Map readSafe(const char *buf, size_t maxbytes) {
+        if (maxbytes < sizeof(uint64_t)) {
+            ROARING_TERMINATE("ran out of bytes");
+        }
+        Roaring64Map result;
+        uint64_t map_size;
+        std::memcpy(&map_size, buf, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        maxbytes -= sizeof(uint64_t);
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            if (maxbytes < sizeof(uint32_t)) {
+                ROARING_TERMINATE("ran out of bytes");
+            }
+            uint32_t key;
+            std::memcpy(&key, buf, sizeof(uint32_t));
+            // ^-- Note: `uint32_t key = *((uint32_t*)buf);` is undefined
+
+            buf += sizeof(uint32_t);
+            maxbytes -= sizeof(uint32_t);
+            // read map value Roaring
+            Roaring read_var = Roaring::readSafe(buf, maxbytes);
+            // forward buffer past the last Roaring Bitmap
+            size_t tz = read_var.getSizeInBytes(true);
+            buf += tz;
+            maxbytes -= tz;
+            result.emplaceOrInsert(key, std::move(read_var));
+        }
+        return result;
+    }
+
+    /**
+     * Return the number of bytes required to serialize this bitmap (meant to
+     * be compatible with Java and Go versions)
+     *
+     * Setting the portable flag to false enable a custom format that can save
+     * space compared to the portable format (e.g., for very sparse bitmaps).
+     */
+    size_t getSizeInBytes(bool portable = true) const {
+        // start with, respectively, map size and size of keys for each map
+        // entry
+        return std::accumulate(
+            roarings.cbegin(), roarings.cend(),
+            sizeof(uint64_t) + roarings.size() * sizeof(uint32_t),
+            [=](size_t previous,
+                const std::pair<const uint32_t, Roaring> &map_entry) {
+                // add in bytes used by each Roaring
+                return previous + map_entry.second.getSizeInBytes(portable);
+            });
+    }
+
+    static const Roaring64Map frozenView(const char *buf) {
+        // size of bitmap buffer and key
+        const size_t metadata_size = sizeof(size_t) + sizeof(uint32_t);
+
+        Roaring64Map result;
+
+        // get map size
+        uint64_t map_size;
+        memcpy(&map_size, buf, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            // pad to 32 bytes minus the metadata size
+            while (((uintptr_t)buf + metadata_size) % 32 != 0) buf++;
+
+            // get bitmap size
+            size_t len;
+            memcpy(&len, buf, sizeof(size_t));
+            buf += sizeof(size_t);
+
+            // get map key
+            uint32_t key;
+            memcpy(&key, buf, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+
+            // read map value Roaring
+            const Roaring read = Roaring::frozenView(buf, len);
+            result.emplaceOrInsert(key, read);
+
+            // forward buffer past the last Roaring Bitmap
+            buf += len;
+        }
+        return result;
+    }
+
+    static const Roaring64Map portableDeserializeFrozen(const char *buf) {
+        Roaring64Map result;
+        // get map size
+        uint64_t map_size;
+        std::memcpy(&map_size, buf, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            // get map key
+            uint32_t key;
+            std::memcpy(&key, buf, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+            // read map value Roaring
+            Roaring read_var = Roaring::portableDeserializeFrozen(buf);
+            // forward buffer past the last Roaring bitmap
+            buf += read_var.getSizeInBytes(true);
+            result.emplaceOrInsert(key, std::move(read_var));
+        }
+        return result;
+    }
+
+    // As with serialized 64-bit bitmaps, 64-bit frozen bitmaps are serialized
+    // by concatenating one or more Roaring::write output buffers with the
+    // preceeding map key. Unlike standard bitmap serialization, frozen bitmaps
+    // must be 32-byte aligned and requires a buffer length to parse. As a
+    // result, each concatenated output of Roaring::writeFrozen is preceeded by
+    // padding, the buffer size (size_t), and the map key (uint32_t). The
+    // padding is used to ensure 32-byte alignment, but since it is followed by
+    // the buffer size and map key, it actually pads to `(x - sizeof(size_t) +
+    // sizeof(uint32_t)) mod 32` to leave room for the metadata.
+    void writeFrozen(char *buf) const {
+        // size of bitmap buffer and key
+        const size_t metadata_size = sizeof(size_t) + sizeof(uint32_t);
+
+        // push map size
+        uint64_t map_size = roarings.size();
+        memcpy(buf, &map_size, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+
+        for (auto &map_entry : roarings) {
+            size_t frozenSizeInBytes = map_entry.second.getFrozenSizeInBytes();
+
+            // pad to 32 bytes minus the metadata size
+            while (((uintptr_t)buf + metadata_size) % 32 != 0) buf++;
+
+            // push bitmap size
+            memcpy(buf, &frozenSizeInBytes, sizeof(size_t));
+            buf += sizeof(size_t);
+
+            // push map key
+            memcpy(buf, &map_entry.first, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+
+            // push map value Roaring
+            map_entry.second.writeFrozen(buf);
+            buf += map_entry.second.getFrozenSizeInBytes();
+        }
+    }
+
+    size_t getFrozenSizeInBytes() const {
+        // size of bitmap size and map key
+        const size_t metadata_size = sizeof(size_t) + sizeof(uint32_t);
+        size_t ret = 0;
+
+        // map size
+        ret += sizeof(uint64_t);
+
+        for (auto &map_entry : roarings) {
+            // pad to 32 bytes minus the metadata size
+            while ((ret + metadata_size) % 32 != 0) ret++;
+            ret += metadata_size;
+
+            // frozen bitmaps must be 32-byte aligned
+            ret += map_entry.second.getFrozenSizeInBytes();
+        }
+        return ret;
+    }
+
+    /**
+     * Computes the intersection between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     * Consider also using the operator &= to avoid needlessly creating
+     * many temporary bitmaps.
+     */
+    Roaring64Map operator&(const Roaring64Map &o) const {
+        return Roaring64Map(*this) &= o;
+    }
+
+    /**
+     * Computes the difference between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator-(const Roaring64Map &o) const {
+        return Roaring64Map(*this) -= o;
+    }
+
+    /**
+     * Computes the union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator|(const Roaring64Map &o) const {
+        return Roaring64Map(*this) |= o;
+    }
+
+    /**
+     * Computes the symmetric union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator^(const Roaring64Map &o) const {
+        return Roaring64Map(*this) ^= o;
+    }
+
+    /**
+     * Whether or not we apply copy and write.
+     */
+    void setCopyOnWrite(bool val) {
+        if (copyOnWrite == val) return;
+        copyOnWrite = val;
+        std::for_each(roarings.begin(), roarings.end(),
+                      [=](std::pair<const uint32_t, Roaring> &map_entry) {
+                          map_entry.second.setCopyOnWrite(val);
+                      });
+    }
+
+    /**
+     * Print the contents of the bitmap to stdout.
+     * Note: this method adds a final newline, but toString() does not.
+     */
+    void printf() const {
+        auto sink = [](const std::string &s) { fputs(s.c_str(), stdout); };
+        printToSink(sink);
+        sink("\n");
+    }
+
+    /**
+     * Print the contents of the bitmap into a string.
+     */
+    std::string toString() const {
+        std::string result;
+        auto sink = [&result](const std::string &s) { result += s; };
+        printToSink(sink);
+        return result;
+    }
+
+    /**
+     * Whether or not copy and write is active.
+     */
+    bool getCopyOnWrite() const { return copyOnWrite; }
+
+    /**
+     * Computes the logical or (union) between "n" bitmaps (referenced by a
+     * pointer).
+     */
+    static Roaring64Map fastunion(size_t n, const Roaring64Map **inputs) {
+        // The strategy here is to basically do a "group by" operation.
+        // We group the input roarings by key, do a 32-bit
+        // roaring_bitmap_or_many on each group, and collect the results.
+        // We accomplish the "group by" operation using a priority queue, which
+        // tracks the next key for each of our input maps. At each step, our
+        // algorithm takes the next subset of maps that share the same next key,
+        // runs roaring_bitmap_or_many on those bitmaps, and then advances the
+        // current_iter on all the affected entries and then repeats.
+
+        // There is an entry in our priority queue for each of the 'n' inputs.
+        // For a given Roaring64Map, we look at its underlying 'roarings'
+        // std::map, and take its begin() and end(). This forms our half-open
+        // interval [current_iter, end_iter), which we keep in the priority
+        // queue as a pq_entry. These entries are updated (removed and then
+        // reinserted with the pq_entry.iterator field advanced by one step) as
+        // our algorithm progresses. But when a given interval becomes empty
+        // (i.e. pq_entry.iterator == pq_entry.end) it is not returned to the
+        // priority queue.
+        struct pq_entry {
+            roarings_t::const_iterator iterator;
+            roarings_t::const_iterator end;
+        };
+
+        // Custom comparator for the priority queue.
+        auto pq_comp = [](const pq_entry &lhs, const pq_entry &rhs) {
+            auto left_key = lhs.iterator->first;
+            auto right_key = rhs.iterator->first;
+
+            // We compare in the opposite direction than normal because priority
+            // queues normally order from largest to smallest, but we want
+            // smallest to largest.
+            return left_key > right_key;
+        };
+
+        // Create and populate the priority queue.
+        std::priority_queue<pq_entry, std::vector<pq_entry>, decltype(pq_comp)>
+            pq(pq_comp);
+        for (size_t i = 0; i < n; ++i) {
+            const auto &roarings = inputs[i]->roarings;
+            if (roarings.begin() != roarings.end()) {
+                pq.push({roarings.begin(), roarings.end()});
+            }
+        }
+
+        // A reusable vector that holds the pointers to the inner bitmaps that
+        // we pass to the underlying 32-bit fastunion operation.
+        std::vector<const roaring_bitmap_t *> group_bitmaps;
+
+        // Summary of the algorithm:
+        // 1. While the priority queue is not empty:
+        //    A. Get its lowest key. Call this group_key
+        //    B. While the lowest entry in the priority queue has a key equal to
+        //       group_key:
+        //       1. Remove this entry (the pair {current_iter, end_iter}) from
+        //          the priority queue.
+        //       2. Add the bitmap pointed to by current_iter to a list of
+        //          32-bit bitmaps to process.
+        //       3. Advance current_iter. Now it will point to a bitmap entry
+        //          with some key greater than group_key (or it will point to
+        //          end()).
+        //       4. If current_iter != end_iter, reinsert the pair into the
+        //          priority queue.
+        //    C. Invoke the 32-bit roaring_bitmap_or_many() and add to result
+        Roaring64Map result;
+        while (!pq.empty()) {
+            // Find the next key (the lowest key) in the priority queue.
+            auto group_key = pq.top().iterator->first;
+
+            // The purpose of the inner loop is to gather all the inner bitmaps
+            // that share "group_key" into "group_bitmaps" so that they can be
+            // fed to roaring_bitmap_or_many(). While we are doing this, we
+            // advance those iterators to their next value and reinsert them
+            // into the priority queue (unless they reach their end).
+            group_bitmaps.clear();
+            while (!pq.empty()) {
+                auto candidate_current_iter = pq.top().iterator;
+                auto candidate_end_iter = pq.top().end;
+
+                auto candidate_key = candidate_current_iter->first;
+                const auto &candidate_bitmap = candidate_current_iter->second;
+
+                // This element will either be in the group (having
+                // key == group_key) or it will not be in the group (having
+                // key > group_key). (Note it cannot have key < group_key
+                // because of the ordered nature of the priority queue itself
+                // and the ordered nature of all the underlying roaring maps).
+                if (candidate_key != group_key) {
+                    // This entry, and (thanks to the nature of the priority
+                    // queue) all other entries as well, are all greater than
+                    // group_key, so we're done collecting elements for the
+                    // current group. Because of the way this loop was written,
+                    // the group will will always contain at least one element.
+                    break;
+                }
+
+                group_bitmaps.push_back(&candidate_bitmap.roaring);
+                // Remove this entry from the priority queue. Note this
+                // invalidates pq.top() so make sure you don't have any dangling
+                // references to it.
+                pq.pop();
+
+                // Advance 'candidate_current_iter' and insert a new entry
+                // {candidate_current_iter, candidate_end_iter} into the
+                // priority queue (unless it has reached its end).
+                ++candidate_current_iter;
+                if (candidate_current_iter != candidate_end_iter) {
+                    pq.push({candidate_current_iter, candidate_end_iter});
+                }
+            }
+
+            // Use the fast inner union to combine these.
+            auto *inner_result = roaring_bitmap_or_many(group_bitmaps.size(),
+                                                        group_bitmaps.data());
+            // Insert the 32-bit result at end of the 'roarings' map of the
+            // result we are building.
+            result.roarings.insert(
+                result.roarings.end(),
+                std::make_pair(group_key, Roaring(inner_result)));
+        }
+        return result;
+    }
+
+    friend class Roaring64MapSetBitBiDirectionalIterator;
+    typedef Roaring64MapSetBitBiDirectionalIterator const_iterator;
+    typedef Roaring64MapSetBitBiDirectionalIterator
+        const_bidirectional_iterator;
+
+    /**
+     * Returns an iterator that can be used to access the position of the set
+     * bits. The running time complexity of a full scan is proportional to the
+     * number of set bits: be aware that if you have long strings of 1s, this
+     * can be very inefficient.
+     *
+     * It can be much faster to use the toArray method if you want to
+     * retrieve the set bits.
+     */
+    const_iterator begin() const;
+
+    /**
+     * A bogus iterator that can be used together with begin()
+     * for constructions such as: for (auto i = b.begin(); * i!=b.end(); ++i) {}
+     */
+    const_iterator end() const;
+
+   private:
+    typedef std::map<uint32_t, Roaring> roarings_t;
+    roarings_t roarings{};  // The empty constructor silences warnings from
+                            // pedantic static analyzers.
+    bool copyOnWrite{false};
+    static constexpr uint32_t highBytes(const uint64_t in) {
+        return uint32_t(in >> 32);
+    }
+    static constexpr uint32_t lowBytes(const uint64_t in) {
+        return uint32_t(in);
+    }
+    static constexpr uint64_t uniteBytes(const uint32_t highBytes,
+                                         const uint32_t lowBytes) {
+        return (uint64_t(highBytes) << 32) | uint64_t(lowBytes);
+    }
+    // this is needed to tolerate gcc's C++11 libstdc++ lacking emplace
+    // prior to version 4.8
+    void emplaceOrInsert(const uint32_t key, const Roaring &value) {
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20130322
+        roarings.insert(std::make_pair(key, value));
+#else
+        roarings.emplace(std::make_pair(key, value));
+#endif
+    }
+
+    void emplaceOrInsert(const uint32_t key, Roaring &&value) {
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20130322
+        roarings.insert(std::make_pair(key, std::move(value)));
+#else
+        roarings.emplace(key, std::move(value));
+#endif
+    }
+
+    /*
+     * Look up 'key' in the 'roarings' map. If it does not exist, create it.
+     * Also, set its copyOnWrite flag to 'copyOnWrite'. Then return a reference
+     * to the (already existing or newly created) inner bitmap.
+     */
+    Roaring &lookupOrCreateInner(uint32_t key) {
+        auto &bitmap = roarings[key];
+        bitmap.setCopyOnWrite(copyOnWrite);
+        return bitmap;
+    }
+
+    /**
+     * Prints the contents of the bitmap to a caller-provided sink function.
+     */
+    void printToSink(
+        const std::function<void(const std::string &)> &sink) const {
+        sink("{");
+
+        // Storage for snprintf. Big enough to store the decimal representation
+        // of the largest uint64_t value and trailing \0.
+        char buffer[32];
+        const char *separator = "";
+        // Reusable, and therefore avoids many repeated heap allocations.
+        std::string callback_string;
+        for (const auto &entry : roarings) {
+            auto high_bits = entry.first;
+            const auto &bitmap = entry.second;
+            for (const auto low_bits : bitmap) {
+                auto value = uniteBytes(high_bits, low_bits);
+                snprintf(buffer, sizeof(buffer), "%" PRIu64, value);
+                callback_string = separator;
+                callback_string.append(buffer);
+                sink(callback_string);
+                separator = ",";
+            }
+        }
+        sink("}");
+    }
+
+    /**
+     * Ensures that every key in the closed interval [start_high, end_high]
+     * refers to a Roaring bitmap rather being an empty slot. Inserts empty
+     * Roaring bitmaps if necessary. The interval must be valid and non-empty.
+     * Returns an iterator to the bitmap at start_high.
+     */
+    roarings_t::iterator ensureRangePopulated(uint32_t start_high,
+                                              uint32_t end_high) {
+        if (start_high > end_high) {
+            ROARING_TERMINATE("Logic error: start_high > end_high");
+        }
+        // next_populated_iter points to the first entry in the outer map with
+        // key >= start_high, or end().
+        auto next_populated_iter = roarings.lower_bound(start_high);
+
+        // Use uint64_t to avoid an infinite loop when end_high == uint32_max.
+        roarings_t::iterator start_iter{};  // Definitely assigned in loop.
+        for (uint64_t slot = start_high; slot <= end_high; ++slot) {
+            roarings_t::iterator slot_iter;
+            if (next_populated_iter != roarings.end() &&
+                next_populated_iter->first == slot) {
+                // 'slot' index has caught up to next_populated_iter.
+                // Note it here and advance next_populated_iter.
+                slot_iter = next_populated_iter++;
+            } else {
+                // 'slot' index has not yet caught up to next_populated_iter.
+                // Make a fresh entry {key = 'slot', value = Roaring()}, insert
+                // it just prior to next_populated_iter, and set its copy
+                // on write flag. We take pains to use emplace_hint and
+                // piecewise_construct to minimize effort.
+                slot_iter = roarings.emplace_hint(
+                    next_populated_iter, std::piecewise_construct,
+                    std::forward_as_tuple(uint32_t(slot)),
+                    std::forward_as_tuple());
+                auto &bitmap = slot_iter->second;
+                bitmap.setCopyOnWrite(copyOnWrite);
+            }
+
+            // Make a note of the iterator of the starting slot. It will be
+            // needed for the return value.
+            if (slot == start_high) {
+                start_iter = slot_iter;
+            }
+        }
+        return start_iter;
+    }
+
+    /**
+     * Erases the entry pointed to by 'iter' from the 'roarings' map. Warning:
+     * this invalidates 'iter'.
+     */
+    void eraseIfEmpty(roarings_t::iterator iter) {
+        const auto &bitmap = iter->second;
+        if (bitmap.isEmpty()) {
+            roarings.erase(iter);
+        }
+    }
+};
+
+/**
+ * Used to go through the set bits. Not optimally fast, but convenient.
+ */
+class Roaring64MapSetBitBiDirectionalIterator {
+   public:
+    typedef std::bidirectional_iterator_tag iterator_category;
+    typedef uint64_t *pointer;
+    typedef uint64_t &reference;
+    typedef uint64_t value_type;
+    typedef int64_t difference_type;
+    typedef Roaring64MapSetBitBiDirectionalIterator type_of_iterator;
+
+    Roaring64MapSetBitBiDirectionalIterator(const Roaring64Map &parent,
+                                            bool exhausted = false)
+        : p(&parent.roarings) {
+        if (exhausted || parent.roarings.empty()) {
+            map_iter = p->cend();
+        } else {
+            map_iter = parent.roarings.cbegin();
+            roaring_iterator_init(&map_iter->second.roaring, &i);
+            while (!i.has_value) {
+                map_iter++;
+                if (map_iter == p->cend()) return;
+                roaring_iterator_init(&map_iter->second.roaring, &i);
+            }
+        }
+    }
+
+    /**
+     * Provides the location of the set bit.
+     */
+    value_type operator*() const {
+        return Roaring64Map::uniteBytes(map_iter->first, i.current_value);
+    }
+
+    bool operator<(const type_of_iterator &o) const {
+        if (map_iter == p->cend()) return false;
+        if (o.map_iter == o.p->cend()) return true;
+        return **this < *o;
+    }
+
+    bool operator<=(const type_of_iterator &o) const {
+        if (o.map_iter == o.p->cend()) return true;
+        if (map_iter == p->cend()) return false;
+        return **this <= *o;
+    }
+
+    bool operator>(const type_of_iterator &o) const {
+        if (o.map_iter == o.p->cend()) return false;
+        if (map_iter == p->cend()) return true;
+        return **this > *o;
+    }
+
+    bool operator>=(const type_of_iterator &o) const {
+        if (map_iter == p->cend()) return true;
+        if (o.map_iter == o.p->cend()) return false;
+        return **this >= *o;
+    }
+
+    type_of_iterator &operator++() {  // ++i, must returned inc. value
+        if (i.has_value == true) roaring_uint32_iterator_advance(&i);
+        while (!i.has_value) {
+            ++map_iter;
+            if (map_iter == p->cend()) return *this;
+            roaring_iterator_init(&map_iter->second.roaring, &i);
+        }
+        return *this;
+    }
+
+    type_of_iterator operator++(int) {  // i++, must return orig. value
+        Roaring64MapSetBitBiDirectionalIterator orig(*this);
+        roaring_uint32_iterator_advance(&i);
+        while (!i.has_value) {
+            ++map_iter;
+            if (map_iter == p->cend()) return orig;
+            roaring_iterator_init(&map_iter->second.roaring, &i);
+        }
+        return orig;
+    }
+
+    bool move(const value_type &x) {
+        map_iter = p->lower_bound(Roaring64Map::highBytes(x));
+        if (map_iter != p->cend()) {
+            roaring_iterator_init(&map_iter->second.roaring, &i);
+            if (map_iter->first == Roaring64Map::highBytes(x)) {
+                if (roaring_uint32_iterator_move_equalorlarger(
+                        &i, Roaring64Map::lowBytes(x)))
+                    return true;
+                ++map_iter;
+                if (map_iter == p->cend()) return false;
+                roaring_iterator_init(&map_iter->second.roaring, &i);
+            }
+            return true;
+        }
+        return false;
+    }
+
+    type_of_iterator &operator--() {  //  --i, must return dec.value
+        if (map_iter == p->cend()) {
+            --map_iter;
+            roaring_iterator_init_last(&map_iter->second.roaring, &i);
+            if (i.has_value) return *this;
+        }
+
+        roaring_uint32_iterator_previous(&i);
+        while (!i.has_value) {
+            if (map_iter == p->cbegin()) return *this;
+            map_iter--;
+            roaring_iterator_init_last(&map_iter->second.roaring, &i);
+        }
+        return *this;
+    }
+
+    type_of_iterator operator--(int) {  // i--, must return orig. value
+        Roaring64MapSetBitBiDirectionalIterator orig(*this);
+        if (map_iter == p->cend()) {
+            --map_iter;
+            roaring_iterator_init_last(&map_iter->second.roaring, &i);
+            return orig;
+        }
+
+        roaring_uint32_iterator_previous(&i);
+        while (!i.has_value) {
+            if (map_iter == p->cbegin()) return orig;
+            map_iter--;
+            roaring_iterator_init_last(&map_iter->second.roaring, &i);
+        }
+        return orig;
+    }
+
+    bool operator==(const Roaring64MapSetBitBiDirectionalIterator &o) const {
+        if (map_iter == p->cend() && o.map_iter == o.p->cend()) return true;
+        if (o.map_iter == o.p->cend()) return false;
+        return **this == *o;
+    }
+
+    bool operator!=(const Roaring64MapSetBitBiDirectionalIterator &o) const {
+        if (map_iter == p->cend() && o.map_iter == o.p->cend()) return false;
+        if (o.map_iter == o.p->cend()) return true;
+        return **this != *o;
+    }
+
+   private:
+    const std::map<uint32_t, Roaring> *p{nullptr};
+    std::map<uint32_t, Roaring>::const_iterator
+        map_iter{};  // The empty constructor silences warnings from pedantic
+                     // static analyzers.
+    api::roaring_uint32_iterator_t
+        i{};  // The empty constructor silences warnings from pedantic static
+              // analyzers.
+};
+
+inline Roaring64MapSetBitBiDirectionalIterator Roaring64Map::begin() const {
+    return Roaring64MapSetBitBiDirectionalIterator(*this);
+}
+
+inline Roaring64MapSetBitBiDirectionalIterator Roaring64Map::end() const {
+    return Roaring64MapSetBitBiDirectionalIterator(*this, true);
+}
+
+}  // namespace roaring
+
+#endif /* INCLUDE_ROARING_64_MAP_HH_ */
+/* end file cpp/roaring64map.hh */
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cbca26440..af55528d9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,12 +8,12 @@ if(MSVC)
     add_subdirectory(dll)
 else()
     #file(GLOB CPP_SOURCES *.cpp)
-    set(CPP_SOURCES abstract_data_store.cpp ann_exception.cpp disk_utils.cpp 
+    set(CPP_SOURCES abstract_data_store.cpp ann_exception.cpp cluster_store.cpp disk_utils.cpp 
         distance.cpp index.cpp in_mem_graph_store.cpp in_mem_data_store.cpp
         linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp
         in_mem_data_store.cpp in_mem_graph_store.cpp
         natural_number_set.cpp memory_mapper.cpp partition.cpp pq.cpp
-        pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp)
+        pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp ../roaring/roaring.c ../include/id_list.h)
     if (RESTAPI)
         list(APPEND CPP_SOURCES restapi/search_wrapper.cpp restapi/server.cpp)
     endif()
diff --git a/src/LSH.cpp b/src/LSH.cpp
new file mode 100644
index 000000000..138abb56b
--- /dev/null
+++ b/src/LSH.cpp
@@ -0,0 +1,85 @@
+#include <memory>
+#include <cstdint>
+#include <random>
+#include <mkl.h>
+#include <float.h>
+
+#include "utils.h"
+#include "LSH.h"
+
+namespace diskann
+{
+
+void generate_random_vectors(uint32_t dim, uint32_t num, float *vecs, float min_value = FLT_TRUE_MIN,
+                             float max_value = FLT_MAX, bool normalized = false)
+{
+    std::mt19937 rng(std::chrono::steady_clock::now().time_since_epoch().count());
+    std::uniform_real_distribution<float> dis(min_value, max_value);
+
+    for (uint32_t i = 0; i < num; i++)
+    {
+        for (uint32_t j = 0; j < dim; j++)
+        {
+            vecs[i * dim + j] = dis(rng);
+        }
+        if (normalized)
+        {
+            for (uint32_t i = 0; i < num; i++)
+                normalize(vecs + (dim * i), dim);
+        }
+    }
+}
+
+LSH::LSH(uint32_t num_dims, uint32_t num_axes) : _num_axes(num_axes), _num_dims(num_dims)
+{
+    _axes = std::make_unique<float[]>(num_axes * num_dims);
+    generate_random_vectors(num_dims, num_axes, const_cast<float *>(_axes.get()), -1, 1, false);
+}
+
+LSH::LSH(uint32_t num_dims, uint32_t num_axes, float *axes) : _num_axes(num_axes), _num_dims(num_dims)
+{
+    _axes.reset(axes);
+}
+
+uint8_t LSH::get_hash(const float *vec, float *dot_product)
+{
+    memset(dot_product, 0, _num_axes * sizeof(float));
+
+    // computing Y = alpha * A (mat) * X (vec) + beta * Y
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, _num_axes /* M */, _num_dims /* N */, 1.0f, /* alpha */
+                const_cast<float *>(_axes.get()),                                        /* A */
+                _num_dims,                                                               /* lda */
+                vec,                                                                     /* X */
+                1,                                                                       /* inc x */
+                0,                                                                       /* Beta */
+                dot_product,                                                             /* Y */
+                1                                                                        /* inc Y */
+    );
+    uint8_t lsh_value = 0;
+    for (uint32_t i = 0; i < _num_axes; i++)
+    {
+        lsh_value = dot_product[i] < 0 ? lsh_value : lsh_value | (1 << i);
+    }
+    return lsh_value;
+}
+
+void LSH::dump_axes_to_text_file(std::ostream &out_stream)
+{
+    for (uint32_t i = 0; i < _num_axes; i++)
+    {
+        for (uint32_t j = 0; j < _num_dims; j++)
+        {
+            out_stream << _axes[i * _num_dims + j] << " ";
+        }
+        out_stream << std::endl;
+    }
+}
+
+// Assuming ostream is opened in binary mode.
+void LSH::dump_axes_to_bin_format(std::ostream &out_stream)
+{
+    out_stream.write((const char *)&_num_axes, sizeof(uint32_t));
+    out_stream.write((const char *)&_num_dims, sizeof(uint32_t));
+    out_stream.write((const char *)_axes.get(), sizeof(float) * _num_axes * _num_dims);
+}
+} // namespace diskann
diff --git a/src/abstract_index.cpp b/src/abstract_index.cpp
index 92665825f..e65a4d89d 100644
--- a/src/abstract_index.cpp
+++ b/src/abstract_index.cpp
@@ -37,6 +37,20 @@ template <typename IndexType>
 std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters(const DataType &query, const std::string &raw_label,
                                                                  const size_t K, const uint32_t L, IndexType *indices,
                                                                  float *distances)
+{
+    auto any_indices = std::any(indices);
+    std::vector<std::string> tmp_lbls;
+    tmp_lbls.push_back(raw_label);
+    std::vector<std::vector<std::string>> tmp_lbls_vec;
+    tmp_lbls_vec.push_back(tmp_lbls);
+    return _search_with_filters(query, tmp_lbls_vec, K, L, any_indices, distances);
+}
+
+template <typename IndexType>
+std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters(const DataType &query,
+                                                                 const std::vector<std::vector<std::string>> &raw_label,
+                                                                 const size_t K, const uint32_t L, IndexType *indices,
+                                                                 float *distances)
 {
     auto any_indices = std::any(indices);
     return _search_with_filters(query, raw_label, K, L, any_indices, distances);
@@ -162,6 +176,13 @@ template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_w
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters<uint64_t>(
     const DataType &query, const std::string &raw_label, const size_t K, const uint32_t L, uint64_t *indices,
     float *distances);
+template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters<uint32_t>(
+    const DataType &query, const std::vector<std::vector<std::string>> &raw_label, const size_t K, const uint32_t L,
+    uint32_t *indices, float *distances);
+
+template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters<uint64_t>(
+    const DataType &query, const std::vector<std::vector<std::string>> &raw_label, const size_t K, const uint32_t L,
+    uint64_t *indices, float *distances);
 
 template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags<float, int32_t>(
     const float *query, const uint64_t K, const uint32_t L, int32_t *tags, float *distances,
diff --git a/src/cluster_store.cpp b/src/cluster_store.cpp
new file mode 100644
index 000000000..d25721982
--- /dev/null
+++ b/src/cluster_store.cpp
@@ -0,0 +1,214 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <memory>
+#include "abstract_scratch.h"
+#include "cluster_store.h"
+#include "math_utils.h"
+#include "utils.h"
+#include <mkl.h>
+
+namespace diskann
+{
+
+template <typename data_t>
+InMemClusterStore<data_t>::InMemClusterStore(const size_t dim) : AbstractClusterStore<data_t>(dim)
+{
+}
+
+template <typename data_t> InMemClusterStore<data_t>::~InMemClusterStore()
+{
+    delete[] this->_cluster_centroids;
+}
+
+template <typename data_t> uint32_t InMemClusterStore<data_t>::load(const std::string &filename)
+{
+    std::string centers_file(filename);
+    std::string posting_file(filename);
+    centers_file += "_centers.bin";
+    posting_file += "_posting.bin";
+
+    if (!file_exists(posting_file))
+        return 0;
+
+    diskann::load_bin<float>(centers_file, this->_cluster_centroids, this->_num_clusters, this->_dim);
+
+    this->_cluster_norms = new float[this->_num_clusters];
+    math_utils::compute_vecs_l2sq(this->_cluster_norms, this->_cluster_centroids, this->_num_clusters, this->_dim);
+
+    std::vector<uint32_t> non_empty_clusters;
+
+    std::ifstream in(posting_file, std::ios::binary | std::ios::in);
+    uint64_t total_count = 0;
+
+    _posting_lists.resize(this->_num_clusters);
+    for (unsigned i = 0; i < this->_num_clusters; i++)
+    {
+        unsigned cur_count = 0;
+
+        in.read((char *)&cur_count, sizeof(unsigned));
+
+        if (cur_count > 0)
+        {
+            non_empty_clusters.emplace_back(i);
+            uint32_t *vals = new uint32_t[cur_count];
+            in.read((char *)vals, (uint64_t)cur_count * sizeof(unsigned));
+            _posting_lists[i] = RoaringIdList(cur_count, vals);
+            delete[] vals;
+        }
+
+        //      roaring_bitmap_add_many((roaring_bitmap_t*)_posting_lists[i].get_bitmap(), cur_count, vals);
+        total_count += cur_count;
+    }
+    in.close();
+
+    for (uint32_t i = 0; i < non_empty_clusters.size(); i++)
+    {
+        if (i != non_empty_clusters[i])
+        {
+            std::memcpy(this->_cluster_centroids + i * this->_dim,
+                        this->_cluster_centroids + ((uint64_t)non_empty_clusters[i]) * this->_dim,
+                        this->_dim * sizeof(float));
+            _posting_lists[i] = _posting_lists[non_empty_clusters[i]];
+        }
+    }
+
+    this->_num_clusters = non_empty_clusters.size();
+
+    std::cout << "Read a total of " << total_count << " points from inverted index file." << std::endl;
+    std::cout << " Resized to " << this->_num_clusters << " clusters after removing empty clusters." << std::endl;
+    return this->_num_clusters;
+}
+
+template <typename data_t> size_t InMemClusterStore<data_t>::save(const std::string &filename)
+{
+    std::string centers_file(filename);
+    std::string posting_file(filename);
+    centers_file += "_centers.bin";
+    posting_file += "_posting.bin";
+    diskann::save_bin<float>(centers_file, this->_cluster_centroids, this->_num_clusters, this->_dim);
+
+    std::ofstream out(posting_file, std::ios::binary | std::ios::out);
+    uint64_t total_count = 0;
+
+    for (unsigned i = 0; i < this->_num_clusters; i++)
+    {
+        uint32_t count = (uint32_t)_posting_lists[i].size();
+        out.write((char *)&count, sizeof(unsigned));
+
+        uint32_t *arr = new uint32_t[count];
+
+        roaring::Roaring x = _posting_lists[i].list;
+        x.toUint32Array(arr);
+
+        out.write((char *)arr, sizeof(unsigned) * (uint64_t)count);
+        total_count += count;
+        delete[] arr;
+    }
+    out.close();
+
+    std::cout << "Written a total of " << total_count << " points to file." << std::endl;
+    return total_count;
+}
+
+template <typename data_t> void InMemClusterStore<data_t>::add_cetroids(float *clusters, uint32_t num_clusters)
+{
+    this->_num_clusters = num_clusters;
+
+    diskann::cout << "Set num clusters to " << num_clusters << ", and dim to " << this->_dim << std::endl;
+    this->_cluster_centroids = new float[(uint64_t)num_clusters * this->_dim];
+    std::memcpy(this->_cluster_centroids, clusters, (uint64_t)num_clusters * this->_dim * sizeof(float));
+
+    _posting_lists.clear();
+    _posting_lists.resize(num_clusters);
+}
+
+template <typename data_t>
+void InMemClusterStore<data_t>::assign_data_to_clusters(data_t *vectors, std::vector<uint32_t> &ids)
+{
+    uint64_t num_pts = ids.size();
+    float *vectors_float;
+    if (sizeof(data_t) != sizeof(float))
+    {
+        vectors_float = new float[num_pts * this->_dim];
+        diskann::convert_types<data_t, float>(vectors, vectors_float, num_pts, this->_dim);
+    }
+    else
+    {
+        vectors_float = (float *)vectors;
+    }
+
+    uint32_t *closest_centers = new uint32_t[num_pts];
+    math_utils::compute_closest_centers(vectors_float, num_pts, this->_dim, this->_cluster_centroids,
+                                        this->_num_clusters, 1, closest_centers);
+
+    for (uint64_t pos = 0; pos < num_pts; pos++)
+    {
+        _posting_lists[closest_centers[pos]].add(ids[pos]);
+    }
+    delete[] closest_centers;
+    delete[] vectors_float;
+}
+
+template <typename data_t>
+void InMemClusterStore<data_t>::get_closest_clusters(data_t *query, const uint32_t num_closest,
+                                                     InMemQueryScratch<data_t> *scratch)
+{
+    auto &closest_clusters = scratch->closest_clusters();
+    float *query_float; // = new float[this->_dim];
+    if (sizeof(data_t) == sizeof(float))
+    {
+        query_float = (float *)query;
+    }
+    else
+    {
+        query_float = scratch->get_query_float();
+        diskann::convert_types<data_t, float>(query, query_float, 1, this->_dim);
+    }
+
+    auto &x = scratch->closest_clusters();
+    closest_clusters.resize(num_closest);
+    auto &cluster_distances = scratch->get_cluster_distance_vector();
+
+    std::memcpy(cluster_distances.data(), this->_cluster_norms, this->_num_clusters * sizeof(float));
+
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, (MKL_INT)this->_num_clusters, (MKL_INT)this->_dim, -2.0f,
+                this->_cluster_centroids, (MKL_INT)this->_dim, query_float, (MKL_INT)1, 1, cluster_distances.data(),
+                (MKL_INT)1);
+
+    std::priority_queue<PivotContainer> top_k_queue;
+    for (size_t j = 0; j < this->_num_clusters; j++)
+    {
+        PivotContainer this_piv(j, cluster_distances[j]);
+        top_k_queue.push(this_piv);
+    }
+    for (size_t j = 0; j < num_closest; j++)
+    {
+        PivotContainer this_piv = top_k_queue.top();
+        closest_clusters[j] = (uint32_t)this_piv.piv_id;
+        top_k_queue.pop();
+    }
+
+    //    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, (MKL_INT)num_points, (MKL_INT)num_centers, (MKL_INT)dim,
+    //    -2.0f,
+    //                data, (MKL_INT)dim, centers, (MKL_INT)dim, 1.0f, dist_matrix, (MKL_INT)num_centers);
+
+    //    math_utils::compute_closest_centers(query_float, 1, this->_dim, this->_cluster_centroids, this->_num_clusters,
+    //    num_closest,
+    //                                            closest_clusters.data());
+
+    // delete[] query_float;
+}
+
+// todo: do we copy the roaring bitmap inside the roaring list?
+template <typename data_t>
+void InMemClusterStore<data_t>::get_cluster_members(const uint32_t cluster_id, AbstractIdList &output_list)
+{
+    //    std::cout<<"*" << _posting_lists[cluster_id].size() <<"*";
+    output_list.copy_from(_posting_lists[cluster_id]);
+}
+
+template DISKANN_DLLEXPORT class InMemClusterStore<float>;
+template DISKANN_DLLEXPORT class InMemClusterStore<int8_t>;
+template DISKANN_DLLEXPORT class InMemClusterStore<uint8_t>;
+} // namespace diskann
\ No newline at end of file
diff --git a/src/index.cpp b/src/index.cpp
index 486d41e76..49c32a609 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -4,10 +4,12 @@
 #include <omp.h>
 
 #include <type_traits>
-
+#include <filesystem>
 #include "boost/dynamic_bitset.hpp"
+#include "id_list.h"
 #include "index_factory.h"
 #include "memory_mapper.h"
+#include "roaring.h"
 #include "timer.h"
 #include "tsl/robin_map.h"
 #include "tsl/robin_set.h"
@@ -23,7 +25,7 @@
 
 #include "index.h"
 
-#define MAX_POINTS_FOR_USING_BITSET 10000000
+#define MAX_POINTS_FOR_USING_BITSET 40000000
 
 namespace diskann
 {
@@ -109,6 +111,15 @@ Index<T, TagT, LabelT>::Index(const IndexConfig &index_config, std::shared_ptr<A
                                      _indexingQueueSize, _indexingRange, _indexingMaxC, _data_store->get_dims());
         }
     }
+    if (index_config.index_search_params != nullptr)
+    {
+        _filter_penalty_threshold = index_config.index_search_params->filter_penalty_threshold;
+        _bruteforce_threshold = index_config.index_search_params->bruteforce_threshold;
+        diskann::cout << "Inside Index, filter_penalty_threshold is " << _filter_penalty_threshold << std::endl;
+        diskann::cout << "Inside Index, bruteforce_threshold is " << _bruteforce_threshold << std::endl;
+    }
+    //    if (_filtered_index) {
+    //    }
 }
 
 template <typename T, typename TagT, typename LabelT>
@@ -177,6 +188,8 @@ template <typename T, typename TagT, typename LabelT> Index<T, TagT, LabelT>::~I
         ScratchStoreManager<InMemQueryScratch<T>> manager(_query_scratch);
         manager.destroy();
     }
+    if (_sample_map != nullptr)
+        delete[] _sample_map;
 }
 
 template <typename T, typename TagT, typename LabelT>
@@ -546,6 +559,7 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
     std::string labels_file = mem_index_file + "_labels.txt";
     std::string labels_to_medoids = mem_index_file + "_labels_to_medoids.txt";
     std::string labels_map_file = mem_index_file + "_labels_map.txt";
+    std::string sample_label_file = mem_index_file + "_sample_labels.txt";
 
     if (!_save_as_one_file)
     {
@@ -588,8 +602,21 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
 
     if (file_exists(labels_file))
     {
+
         _label_map = load_label_map(labels_map_file);
         parse_label_file(labels_file, label_num_pts);
+
+        if (file_exists(sample_label_file))
+        {
+            uint64_t num_samples;
+            parse_sample_label_file(sample_label_file, num_samples);
+            std::string rev_map = sample_label_file + "_map.bin";
+            uint64_t n1, n2;
+            diskann::load_bin<uint32_t>(rev_map, _sample_map, n1, n2);
+            _sample_prob = (1.0 * num_samples) / (1.0 * label_num_pts);
+            std::cout << "Loaded sample file with rate " << _sample_prob << std::endl;
+        }
+
         assert(label_num_pts == data_file_num_pts - _num_frozen_pts);
         if (file_exists(labels_to_medoids))
         {
@@ -756,19 +783,19 @@ template <typename T, typename TagT, typename LabelT> std::vector<uint32_t> Inde
 // Find common filter between a node's labels and a given set of labels, while
 // taking into account universal label
 template <typename T, typename TagT, typename LabelT>
-bool Index<T, TagT, LabelT>::detect_common_filters(uint32_t point_id, bool search_invocation,
-                                                   const std::vector<LabelT> &incoming_labels)
+uint32_t Index<T, TagT, LabelT>::detect_common_filters(uint32_t point_id, bool search_invocation,
+                                                       const std::vector<LabelT> &incoming_labels)
 {
     auto &curr_node_labels = _location_to_labels[point_id];
     std::vector<LabelT> common_filters;
     std::set_intersection(incoming_labels.begin(), incoming_labels.end(), curr_node_labels.begin(),
                           curr_node_labels.end(), std::back_inserter(common_filters));
-    if (common_filters.size() > 0)
-    {
-        // This is to reduce the repetitive calls. If common_filters size is > 0 ,
-        // we dont need to check further for universal label
-        return true;
-    }
+    /*    if (common_filters.size() > 0)
+        {
+            // This is to reduce the repetitive calls. If common_filters size is > 0 ,
+            // we dont need to check further for universal label
+            return true;
+        } */
     if (_use_universal_label)
     {
         if (!search_invocation)
@@ -783,19 +810,156 @@ bool Index<T, TagT, LabelT>::detect_common_filters(uint32_t point_id, bool searc
                 common_filters.push_back(_universal_label);
         }
     }
-    return (common_filters.size() > 0);
+    return common_filters.size();
+}
+
+// Find common filter between a node's labels and a given set of labels, while
+// taking into account universal label
+// TODO: modify for handling universal label
+template <typename T, typename TagT, typename LabelT>
+inline uint32_t Index<T, TagT, LabelT>::detect_filter_penalty(uint32_t point_id, bool search_invocation,
+                                                              const std::vector<std::vector<LabelT>> &incoming_labels)
+{
+
+    //    auto s = std::chrono::high_resolution_clock::now();
+
+    // not implemented for build-time use case, since we need to understand universal labels for multiple filters
+    //    if (!search_invocation)
+    //        return true;
+
+
+    //    if (!search_invocation) {
+    //    auto &curr_node_labels = _location_to_labels[point_id];
+
+    /*    } else {
+        auto &curr_node_labels = _location_to_labels_bitmap[point_id];
+        for (auto &lbl : incoming_labels)
+        {
+            if (curr_node_labels.contains(lbl))
+            {
+                overlap++;
+            }
+        }
+        } */
+
+    // std::string tmp = "here, penalty=" + std::to_string(_filter_penalty_threshold) + ", overlap=" +
+    // std::to_string(overlap);
+    //    std::cout << tmp << std::endl;
+
+    //    if (overlap < _filter_penalty_threshold)
+    //        return true;
+
+    //    return false;
+    //    std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+    //    time_to_detect_penalty += diff.count();
+
+/*
+    uint32_t overlap = 0;
+    for (auto &lbl : incoming_labels)
+    {
+        //        if (std::find(curr_node_labels.begin(), curr_node_labels.end(), lbl) != curr_node_labels.end())
+        //        if (!(_location_to_labels_robin[point_id].find(lbl) == _location_to_labels_robin[point_id].end()))
+
+        if (_labels_to_points_set[lbl].count(point_id))
+        {
+            overlap++;
+        }
+    }
+    return incoming_labels.size() - overlap;
+*/
+
+#ifdef INSTRUMENT
+auto s = std::chrono::high_resolution_clock::now();
+#endif
+
+    uint32_t cur_penalty = incoming_labels.size();
+    for (uint32_t i = 0; i < incoming_labels.size(); i++) {
+        bool or_pass = false;
+        for (uint32_t j = 0; j < incoming_labels[i].size(); j++) {
+            if (_labels_to_points_set[incoming_labels[i][j]].count(point_id)) {
+                or_pass = true;
+                break;
+            }
+        }
+        cur_penalty -= or_pass;
+    }
+
+#ifdef INSTRUMENT
+    std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+    time_to_detect_penalty += diff.count();
+#endif
+
+    return cur_penalty;
+
+}
+
+
+template <typename T, typename TagT, typename LabelT>
+std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::brute_force_filters(const T *query, const uint32_t Lsize,
+                                                                          const roaring::Roaring &init_ids,
+                                                                          InMemQueryScratch<T> *scratch)
+{
+    T *aligned_query = scratch->aligned_query();
+    NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes();
+    best_L_nodes.reserve(Lsize);
+
+    uint32_t cmps = 0;
+    uint32_t hops = 0;
+#ifdef INSTRUMENT
+    auto s = std::chrono::high_resolution_clock::now();
+#endif
+    roaring::Roaring::const_iterator x = init_ids.begin();
+    x++;
+    for (roaring::Roaring::const_iterator i = init_ids.begin(); i != init_ids.end(); i++)
+    {
+        float distance = _data_store->get_distance(aligned_query, *i);
+        if (x != init_ids.end())
+        {
+            _data_store->prefetch_vector(*x);
+            x++;
+        }
+        Neighbor nn = Neighbor(*i, distance);
+        best_L_nodes.insert(nn);
+        cmps++;
+    }
+#ifdef INSTRUMENT
+    std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+    time_to_filter_check_and_compare += diff.count();
+#endif
+    return std::make_pair(hops, cmps);
 }
 
+// used for index build, single vector of labels are sent to iterate to fixed point. Need to be FIXED and made consistent
 template <typename T, typename TagT, typename LabelT>
 std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
     InMemQueryScratch<T> *scratch, const uint32_t Lsize, const std::vector<uint32_t> &init_ids, bool use_filter,
     const std::vector<LabelT> &filter_labels, bool search_invocation)
 {
+    return iterate_to_fixed_point(scratch, Lsize, init_ids, use_filter, std::vector<std::vector<LabelT>>(1, filter_labels),
+                                  search_invocation);
+}
+
+
+
+template <typename T, typename TagT, typename LabelT>
+std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
+    InMemQueryScratch<T> *scratch, const uint32_t Lsize, const std::vector<uint32_t> &init_ids, bool use_filter,
+    const std::vector<std::vector<LabelT>> &filter_labels, bool search_invocation)
+{
+
+/*    for (auto &x : filter_labels) {
+        std::cout<<"(";
+        for (auto &y : x) {
+            std::cout<<y<<"|";
+        }
+        std::cout<<")&";
+    }*/
+
     std::vector<Neighbor> &expanded_nodes = scratch->pool();
     NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes();
     best_L_nodes.reserve(Lsize);
     tsl::robin_set<uint32_t> &inserted_into_pool_rs = scratch->inserted_into_pool_rs();
-    boost::dynamic_bitset<> &inserted_into_pool_bs = scratch->inserted_into_pool_bs();
+    roaring::Roaring &inserted_into_pool_bs = scratch->get_valid_bitmap();
     std::vector<uint32_t> &id_scratch = scratch->id_scratch();
     std::vector<float> &dist_scratch = scratch->dist_scratch();
     assert(id_scratch.size() == 0);
@@ -804,6 +968,13 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
 
     float *pq_dists = nullptr;
 
+    if (print_qstats)
+    {
+        std::ofstream out("query_stats.txt", std::ios_base::app);
+        out << "starting search\n" << std::endl;
+        out.close();
+    }
+
     _pq_data_store->preprocess_query(aligned_query, scratch);
 
     if (expanded_nodes.size() > 0 || id_scratch.size() > 0)
@@ -815,20 +986,9 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
     auto total_num_points = _max_points + _num_frozen_pts;
     bool fast_iterate = total_num_points <= MAX_POINTS_FOR_USING_BITSET;
 
-    if (fast_iterate)
-    {
-        if (inserted_into_pool_bs.size() < total_num_points)
-        {
-            // hopefully using 2X will reduce the number of allocations.
-            auto resize_size =
-                2 * total_num_points > MAX_POINTS_FOR_USING_BITSET ? MAX_POINTS_FOR_USING_BITSET : 2 * total_num_points;
-            inserted_into_pool_bs.resize(resize_size);
-        }
-    }
-
     // Lambda to determine if a node has been visited
     auto is_not_visited = [this, fast_iterate, &inserted_into_pool_bs, &inserted_into_pool_rs](const uint32_t id) {
-        return fast_iterate ? inserted_into_pool_bs[id] == 0
+        return fast_iterate ? !inserted_into_pool_bs.contains(id)
                             : inserted_into_pool_rs.find(id) == inserted_into_pool_rs.end();
     };
 
@@ -847,17 +1007,48 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
                                         __LINE__);
         }
 
+        float penalty = 0;
         if (use_filter)
         {
-            if (!detect_common_filters(id, search_invocation, filter_labels))
-                continue;
+            if (search_invocation)
+            {
+                #ifdef INSTRUMENT
+                auto s = std::chrono::high_resolution_clock::now();
+#endif
+                uint32_t res = detect_filter_penalty(id, search_invocation, filter_labels);
+                #ifdef INSTRUMENT
+                std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+                time_to_get_valid += diff.count();
+            #endif
+            
+//                std::cout<<id <<" has penalty " << res << std::endl;
+                if ((res) > _filter_penalty_threshold)
+                    continue;
+                penalty = res * penalty_scale;
+                if (print_qstats)
+                {
+                    std::ofstream out("query_stats.txt", std::ios_base::app);
+                    out << "starting id " << id << " has (filter labels - overlap) size " << res << " and filters ";
+                    for (auto const &filter : _location_to_labels[id])
+                    {
+                        out << filter << " ";
+                    }
+                    out << std::endl;
+                    out.close();
+                }
+            }
+            else
+            {
+                if (detect_common_filters(id, search_invocation, filter_labels[0]) < min_inter_size)
+                    continue;
+            }
         }
 
         if (is_not_visited(id))
         {
             if (fast_iterate)
             {
-                inserted_into_pool_bs[id] = 1;
+                inserted_into_pool_bs.add(id);
             }
             else
             {
@@ -868,7 +1059,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
             uint32_t ids[] = {id};
             float distances[] = {std::numeric_limits<float>::max()};
             _pq_data_store->get_distance(aligned_query, ids, 1, distances, scratch);
-            distance = distances[0];
+            distance = distances[0] + penalty;
 
             Neighbor nn = Neighbor(id, distance);
             best_L_nodes.insert(nn);
@@ -878,11 +1069,47 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
     uint32_t hops = 0;
     uint32_t cmps = 0;
 
+    if (print_qstats)
+    {
+        std::ofstream out("query_stats.txt", std::ios_base::app);
+        out << std::endl;
+        out.close();
+    }
+
     while (best_L_nodes.has_unexpanded_node())
     {
+/*        for (uint32_t rnr = 0; rnr < best_L_nodes.size(); rnr++) {
+            auto &x = best_L_nodes[rnr];
+            std::cout<<std::setw(10)<<x.id;
+        }
+        std::cout<<std::endl;
+        for (uint32_t rnr = 0; rnr < best_L_nodes.size(); rnr++) {
+            auto &x = best_L_nodes[rnr];
+            std::cout<<std::setw(10)<<detect_filter_penalty(x.id, search_invocation, filter_labels);
+        }
+        std::cout<<std::endl;
+        for (uint32_t rnr = 0; rnr < best_L_nodes.size(); rnr++) {
+            auto &x = best_L_nodes[rnr];
+            std::cout<<std::setw(10)<<x.distance;
+        }
+        std::cout<<std::endl << std::endl;
+*/
+
         auto nbr = best_L_nodes.closest_unexpanded();
         auto n = nbr.id;
 
+        if (print_qstats)
+        {
+            std::ofstream out("query_stats.txt", std::ios_base::app);
+            out << "looking at id " << n << " with filters ";
+            for (auto const &filter : _location_to_labels[n])
+            {
+                out << filter << " ";
+            }
+            out << std::endl;
+            out.close();
+        }
+
         // Add node to expanded nodes to create pool for prune later
         if (!search_invocation)
         {
@@ -904,6 +1131,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
         // Find which of the nodes in des have not been visited before
         id_scratch.clear();
         dist_scratch.clear();
+        std::vector<float> dist_pens;
         if (_dynamic_index)
         {
             LockGuard guard(_locks[n]);
@@ -911,63 +1139,141 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
             {
                 assert(id < _max_points + _num_frozen_pts);
 
-                if (use_filter)
+                if (!is_not_visited(id))
+                    continue;
+
+                if (fast_iterate)
                 {
-                    // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS.
-                    if (!detect_common_filters(id, search_invocation, filter_labels))
-                        continue;
+                    inserted_into_pool_bs.add(id);
+                }
+                else
+                {
+                    inserted_into_pool_rs.insert(id);
                 }
 
-                if (is_not_visited(id))
+                if (use_filter)
                 {
-                    id_scratch.push_back(id);
+                    // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS.
+                    // TODO: CHECK ABOUT UNIVERSAL LABELS
+                    if (search_invocation)
+                    {
+                        if (detect_filter_penalty(id, search_invocation, filter_labels) > _filter_penalty_threshold)
+                            continue;
+                    }
+                    else
+                    {
+                        if (detect_filter_penalty(id, search_invocation, filter_labels) == filter_labels.size())
+                            continue;
+                    }
                 }
+
+                id_scratch.push_back(id);
             }
         }
         else
         {
-            _locks[n].lock();
-            auto nbrs = _graph_store->get_neighbours(n);
-            _locks[n].unlock();
+            std::vector<location_t> nbrs;
+            size_t id_iter = 0;
+            if (search_invocation)
+            {
+                nbrs = _graph_store->get_neighbours(n);
+                if (expand_two_hops) {
+                for (auto &x : nbrs)
+                    // add two hop neighbors
+                    nbrs.insert(nbrs.end(), _graph_store->get_neighbours(x).begin(), _graph_store->get_neighbours(x).end());
+                }
+            }
+            else
+            {
+                _locks[n].lock();
+                nbrs = _graph_store->get_neighbours(n);
+                _locks[n].unlock();
+            }
             for (auto id : nbrs)
             {
                 assert(id < _max_points + _num_frozen_pts);
 
-                if (use_filter)
+                if (!is_not_visited(id))
+                    continue;
+
+                if (fast_iterate)
                 {
-                    // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS.
-                    if (!detect_common_filters(id, search_invocation, filter_labels))
-                        continue;
+                    inserted_into_pool_bs.add(id);
+                }
+                else
+                {
+                    inserted_into_pool_rs.insert(id);
                 }
 
-                if (is_not_visited(id))
+                float penalty = 0;
+                if (use_filter)
                 {
-                    id_scratch.push_back(id);
+                    // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS.
+                    uint32_t res;
+                    if (search_invocation)
+                    {
+                        #ifdef INSTRUMENT
+                        auto s = std::chrono::high_resolution_clock::now();
+        #endif
+        
+                        res = detect_filter_penalty(id, search_invocation, filter_labels);
+                        #ifdef INSTRUMENT
+                        std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+                        time_to_get_valid += diff.count();
+                    #endif
+        
+                        if (res > _filter_penalty_threshold)
+                        {
+                            id_iter++;
+                            continue;
+                        }
+                        penalty = res * penalty_scale;
+
+                        if (print_qstats)
+                        {
+                            std::ofstream out("query_stats.txt", std::ios_base::app);
+                            out << id_iter << ". looking at " << res << " nbr " << id << " with filters ";
+                            for (auto const &filter : _location_to_labels[id])
+                            {
+                                out << filter << " ";
+                            }
+                            out << std::endl;
+                            out.close();
+                        }
+                        id_iter++;
+                    }
+                    else
+                    {
+                        if (detect_common_filters(id, search_invocation, filter_labels[0]) < min_inter_size)
+                            continue;
+                    }
                 }
+                dist_pens.push_back(penalty);
+                id_scratch.push_back(id);
             }
         }
 
-        // Mark nodes visited
-        for (auto id : id_scratch)
+        if (print_qstats)
         {
-            if (fast_iterate)
-            {
-                inserted_into_pool_bs[id] = 1;
-            }
-            else
-            {
-                inserted_into_pool_rs.insert(id);
-            }
+            std::ofstream out("query_stats.txt", std::ios_base::app);
+            out << std::endl;
+            out.close();
         }
 
+        // Mark nodes visited
+        /*        for (auto id : id_scratch)
+                {
+                } */
+
         assert(dist_scratch.capacity() >= id_scratch.size());
         compute_dists(id_scratch, dist_scratch);
         cmps += (uint32_t)id_scratch.size();
+        assert(dist_pens.size() == id_scratch.size());
 
         // Insert <id, dist> pairs into the pool of candidates
         for (size_t m = 0; m < id_scratch.size(); ++m)
         {
-            best_L_nodes.insert(Neighbor(id_scratch[m], dist_scratch[m]));
+            best_L_nodes.insert(Neighbor(id_scratch[m], dist_scratch[m] + dist_pens[m]));
         }
     }
     return std::make_pair(hops, cmps);
@@ -980,7 +1286,7 @@ void Index<T, TagT, LabelT>::search_for_point_and_prune(int location, uint32_t L
                                                         uint32_t filteredLindex)
 {
     const std::vector<uint32_t> init_ids = get_init_ids();
-    const std::vector<LabelT> unused_filter_label;
+    const std::vector<std::vector<LabelT>> unused_filter_label;
 
     if (!use_filter)
     {
@@ -1000,6 +1306,15 @@ void Index<T, TagT, LabelT>::search_for_point_and_prune(int location, uint32_t L
             tl.unlock();
 
         _data_store->get_vector(location, scratch->aligned_query());
+        // get all combinations of common filters
+        for (auto const &lbl1 : _location_to_labels[location])
+        {
+            for (auto const &lbl2 : _location_to_labels[location])
+            {
+                if (lbl1 == lbl2)
+                    continue;
+            }
+        }
         iterate_to_fixed_point(scratch, filteredLindex, filter_specific_start_nodes, true,
                                _location_to_labels[location], false);
 
@@ -1296,6 +1611,8 @@ template <typename T, typename TagT, typename LabelT> void Index<T, TagT, LabelT
 
     diskann::Timer link_timer;
 
+    diskann::cout << "Filtered L index: " << _filtered_index << ", " << _filterIndexingQueueSize << std::endl;
+
 #pragma omp parallel for schedule(dynamic, 2048)
     for (int64_t node_ctr = 0; node_ctr < (int64_t)(visit_order.size()); node_ctr++)
     {
@@ -1751,7 +2068,14 @@ void Index<T, TagT, LabelT>::build(const std::string &data_file, const size_t nu
             this->set_universal_label(unv_label_as_num);
         }
         this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load);
+        std::string sample_label_file = filter_params.save_path_prefix + "_sample";
+        float p_val = 1000000.0/num_points_to_load;
+        p_val = (p_val > 1) ? 1 : p_val;
+        gen_random_slice<T>(data_file, sample_label_file, p_val,
+                      labels_file_to_use);
+        std::filesystem::copy(sample_label_file+"_ids.bin", sample_label_file+"_labels.txt_map.bin");
     }
+
     std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
     std::cout << "Indexing time: " << diff.count() << "\n";
 }
@@ -1788,9 +2112,10 @@ LabelT Index<T, TagT, LabelT>::get_converted_label(const std::string &raw_label)
         return _universal_label;
     }
     std::stringstream stream;
-    stream << "Unable to find label in the Label Map";
-    diskann::cerr << stream.str() << std::endl;
-    throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
+    stream << "Unable to find label " << raw_label << " in the Label Map";
+//    diskann::cerr << stream.str() << std::endl;
+    return std::numeric_limits<LabelT>::max();
+//    throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
 }
 
 template <typename T, typename TagT, typename LabelT>
@@ -1812,11 +2137,12 @@ void Index<T, TagT, LabelT>::parse_label_file(const std::string &label_file, siz
         line_cnt++;
     }
     _location_to_labels.resize(line_cnt, std::vector<LabelT>());
+    /* _location_to_labels_robin.resize(line_cnt, tsl::robin_set<LabelT>()); */
+    /* _location_to_labels_bitmap.resize(line_cnt, roaring::Roaring()); */
 
     infile.clear();
     infile.seekg(0, std::ios::beg);
     line_cnt = 0;
-
     while (std::getline(infile, line))
     {
         std::istringstream iss(line);
@@ -1829,17 +2155,97 @@ void Index<T, TagT, LabelT>::parse_label_file(const std::string &label_file, siz
             token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
             LabelT token_as_num = (LabelT)std::stoul(token);
             lbls.push_back(token_as_num);
+            //_location_to_labels_bitmap[line_cnt].add(token_as_num);
+            //_location_to_labels_robin[line_cnt].insert(token_as_num);
+            _labels_to_points_set[token_as_num].insert(line_cnt);
             _labels.insert(token_as_num);
+            try
+            {
+                _labels_to_points.at(token_as_num).add(line_cnt);
+            }
+            catch (const std::out_of_range &oor)
+            {
+                _labels_to_points.resize(token_as_num + 1);
+                _labels_to_points.at(token_as_num).add(line_cnt);
+            }
         }
 
         std::sort(lbls.begin(), lbls.end());
         _location_to_labels[line_cnt] = lbls;
         line_cnt++;
     }
+
+    /* _location_to_label_combos.resize(_location_to_labels.size()); */
+    /* for (size_t i = 0; i < _location_to_labels.size(); i++) */
+    /* { */
+    /*     std::vector<LabelT> curr_lbls = _location_to_labels[i]; */
+    /*     for (size_t j = 0; j < curr_lbls.size(); j++) */
+    /*     { */
+    /*         for (size_t k = j + 1; k < curr_lbls.size(); k++) */
+    /*         { */
+    /*             LabelT a = curr_lbls[j], b = curr_lbls[k]; */
+    /*             _location_to_label_combos[i].push_back(std::make_pair(a, b)); */
+    /*         } */
+    /*     } */
+    /* } */
     num_points = (size_t)line_cnt;
     diskann::cout << "Identified " << _labels.size() << " distinct label(s)" << std::endl;
 }
 
+template <typename T, typename TagT, typename LabelT>
+void Index<T, TagT, LabelT>::parse_sample_label_file(const std::string &label_file, size_t &num_samples)
+{
+    // Format of Label txt file: filters with comma separators
+
+    std::ifstream infile(label_file);
+    if (infile.fail())
+    {
+        throw diskann::ANNException(std::string("Failed to open file ") + label_file, -1);
+    }
+
+    std::string line, token;
+    uint32_t line_cnt = 0;
+    std::set<LabelT> sample_labels;
+
+    while (std::getline(infile, line))
+    {
+        line_cnt++;
+    }
+
+    infile.clear();
+    infile.seekg(0, std::ios::beg);
+    line_cnt = 0;
+    while (std::getline(infile, line))
+    {
+        std::istringstream iss(line);
+        std::vector<LabelT> lbls(0);
+        getline(iss, token, '\t');
+        std::istringstream new_iss(token);
+        while (getline(new_iss, token, ','))
+        {
+            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
+            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
+                        LabelT token_as_num = (LabelT)std::stoul(token);
+            //LabelT token_as_num = get_converted_label(token);
+            lbls.push_back(token_as_num);
+            sample_labels.insert(token_as_num);
+            try
+            {
+                _labels_to_points_sample.at(token_as_num).add(line_cnt);
+            }
+            catch (const std::out_of_range &oor)
+            {
+                _labels_to_points_sample.resize(token_as_num + 1);
+                _labels_to_points_sample.at(token_as_num).add(line_cnt);
+            }
+        }
+        line_cnt++;
+    }
+    num_samples = (size_t)line_cnt;
+    diskann::cout << "Identified " << num_samples << " samples for estimation purposes, and it covers "
+                  << sample_labels.size() << " distinct labels." << std::endl;
+}
+
 template <typename T, typename TagT, typename LabelT>
 void Index<T, TagT, LabelT>::_set_universal_label(const LabelType universal_label)
 {
@@ -1858,6 +2264,7 @@ void Index<T, TagT, LabelT>::build_filtered_index(const char *filename, const st
                                                   const size_t num_points_to_load, const std::vector<TagT> &tags)
 {
     _filtered_index = true;
+
     _label_to_start_id.clear();
     size_t num_points_labels = 0;
 
@@ -1958,10 +2365,10 @@ template <typename IdType>
 std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search(const T *query, const size_t K, const uint32_t L,
                                                              IdType *indices, float *distances)
 {
-    if (K > (uint64_t)L)
-    {
-        throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
+    /*    if (K > (uint64_t)L)
+        {
+            throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__);
+        }*/
 
     ScratchStoreManager<InMemQueryScratch<T>> manager(_query_scratch);
     auto scratch = manager.scratch_space();
@@ -2018,20 +2425,48 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search(const T *query, con
 
 template <typename T, typename TagT, typename LabelT>
 std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::_search_with_filters(const DataType &query,
-                                                                           const std::string &raw_label, const size_t K,
-                                                                           const uint32_t L, std::any &indices,
-                                                                           float *distances)
+                                                                           const std::vector<std::vector<std::string>> &raw_label,
+                                                                           const size_t K, const uint32_t L,
+                                                                           std::any &indices, float *distances)
 {
-    auto converted_label = this->get_converted_label(raw_label);
+    std::vector<std::vector<LabelT>> converted_labels;
+    converted_labels.reserve(raw_label.size());
+    for (auto &x : raw_label)
+    {
+        std::vector<LabelT> cur_labels;
+        for (auto &y : x)
+        {
+            auto converted_label = this->get_converted_label(y);
+            if (converted_label != std::numeric_limits<LabelT>::max())
+                cur_labels.push_back(converted_label);
+        }
+        if (cur_labels.size() > 0)
+            converted_labels.push_back(cur_labels);
+        else {
+            for (uint32_t i = 0; i < K; i++) {
+                distances[i] = std::numeric_limits<float>::max();
+                if (typeid(uint64_t *) == indices.type()) {
+                    auto ptr = std::any_cast<uint64_t *>(indices);
+                    ptr[i] = std::numeric_limits<uint64_t>::max();
+                } else if (typeid(uint32_t *) == indices.type()) {
+                    auto ptr = std::any_cast<uint32_t *>(indices);
+                    ptr[i] = std::numeric_limits<uint32_t>::max();
+                }
+            }
+//            diskann::cerr << "No valid labels found for query" << std::endl;
+            return std::make_pair(0, 0);
+        }
+    }
+    
     if (typeid(uint64_t *) == indices.type())
     {
         auto ptr = std::any_cast<uint64_t *>(indices);
-        return this->search_with_filters(std::any_cast<T *>(query), converted_label, K, L, ptr, distances);
+        return this->search_with_filters(std::any_cast<T *>(query), converted_labels, K, L, ptr, distances);
     }
     else if (typeid(uint32_t *) == indices.type())
     {
         auto ptr = std::any_cast<uint32_t *>(indices);
-        return this->search_with_filters(std::any_cast<T *>(query), converted_label, K, L, ptr, distances);
+        return this->search_with_filters(std::any_cast<T *>(query), converted_labels, K, L, ptr, distances);
     }
     else
     {
@@ -2039,19 +2474,89 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::_search_with_filters(const
     }
 }
 
+template <typename T, typename TagT, typename LabelT>
+std::vector<std::pair<LabelT, uint32_t>> Index<T, TagT, LabelT>::sort_filter_counts(
+    const std::vector<LabelT> &filter_label)
+{
+    std::vector<std::pair<LabelT, uint32_t>> label_counts;
+    for (auto const &lbl : filter_label)
+    {
+        std::pair<LabelT, uint32_t> curr_pair(lbl, _labels_to_points[lbl].cardinality());
+        label_counts.push_back(curr_pair);
+    }
+    std::sort(label_counts.begin(), label_counts.end(),
+              [](auto &left, auto &right) { return left.second < right.second; });
+
+    return label_counts;
+}
+
+template <typename T, typename TagT, typename LabelT>
+std::pair<uint32_t, std::vector<uint32_t>> Index<T, TagT, LabelT>::sample_intersection(roaring::Roaring &intersection_bitmap, roaring::Roaring &tmp_bitmap,
+                                                                          const std::vector<std::vector<LabelT>> &filter_labels)
+{
+
+    #ifdef INSTRUMENT
+    auto s = std::chrono::high_resolution_clock::now();
+#endif
+
+
+    for (uint32_t or_itr = 0; or_itr < filter_labels[0].size(); or_itr++) {
+        intersection_bitmap |= _labels_to_points_sample[filter_labels[0][or_itr]];
+    }
+    for (size_t i = 1; i < filter_labels.size(); i++)
+    {
+        tmp_bitmap.removeRangeClosed(tmp_bitmap.minimum(), tmp_bitmap.maximum());
+        for (uint32_t or_itr = 0; or_itr < filter_labels[i].size(); or_itr++) {
+            tmp_bitmap |= _labels_to_points_sample[filter_labels[i][or_itr]];
+        }
+        intersection_bitmap &= tmp_bitmap;
+    }            
+
+
+
+/*    intersection_bitmap = _labels_to_points_sample[filter_label[0]];
+    for (size_t i = 1; i < filter_label.size(); i++)
+    {
+        intersection_bitmap &= _labels_to_points_sample[filter_label[i]];
+    }*/
+    uint32_t val = std::numeric_limits<uint32_t>::max();
+    auto x = intersection_bitmap.begin();
+    std::vector<uint32_t> results;
+    results.reserve(num_start_points);
+    while (x != intersection_bitmap.end() && results.size() < num_start_points)
+    {
+        val = _sample_map[*x];
+        results.emplace_back(val);
+        x++;
+    }
+
+    #ifdef INSTRUMENT
+    std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+//    time_to_get_valid += diff.count();
+#endif
+
+    
+    //    std::cout<<intersection_bitmap.cardinality() << " " << val << std::endl;
+    return std::make_pair((uint32_t)(intersection_bitmap.cardinality() * (1.0 / (_sample_prob))), results);
+}
+
 template <typename T, typename TagT, typename LabelT>
 template <typename IdType>
-std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search_with_filters(const T *query, const LabelT &filter_label,
+std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search_with_filters(const T *query,
+                                                                          const std::vector<std::vector<LabelT>> &filter_label,
                                                                           const size_t K, const uint32_t L,
                                                                           IdType *indices, float *distances)
 {
-    if (K > (uint64_t)L)
-    {
-        throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
+    /*    if (K > (uint64_t)L)
+        {
+            throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__);
+        }*/
 
     ScratchStoreManager<InMemQueryScratch<T>> manager(_query_scratch);
     auto scratch = manager.scratch_space();
+    std::shared_lock<std::shared_timed_mutex> lock(_update_lock);
+    std::shared_lock<std::shared_timed_mutex> tl(_tag_lock, std::defer_lock);
+    bool local_print = false;
 
     if (L > scratch->get_L())
     {
@@ -2062,59 +2567,266 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search_with_filters(const
     }
 
     std::vector<LabelT> filter_vec;
-    std::vector<uint32_t> init_ids = get_init_ids();
+    /* std::vector<uint32_t> init_ids = get_init_ids(); */
+    std::vector<uint32_t> init_ids;
 
-    std::shared_lock<std::shared_timed_mutex> lock(_update_lock);
-    std::shared_lock<std::shared_timed_mutex> tl(_tag_lock, std::defer_lock);
-    if (_dynamic_index)
-        tl.lock();
+    /* if (_dynamic_index) */
+    /*     tl.lock(); */
+    /**/
+    /* if (_label_to_start_id.find(filter_label[0]) != _label_to_start_id.end()) */
+    /* { */
+    /*     init_ids.emplace_back(_label_to_start_id[filter_label[0]]); */
+    /* } */
+    /* else */
+    /* { */
+    /*     diskann::cout << "No filtered medoid found. exitting " */
+    /*                   << std::endl; // RKNOTE: If universal label found start there */
+    /*     throw diskann::ANNException("No filtered medoid found. exitting ", -1); */
+    /* } */
+    /* if (_dynamic_index) */
+    /*     tl.unlock(); */
+
+
+
+    /* std::ofstream out("query_sizes.txt", std::ios_base::app); */
+    /* auto [inter_estim, cand] = sample_intersection(scratch->get_valid_bitmap(), filter_label); */
+    /* for (auto const &filt : filter_label) */
+    /* { */
+    /*     out << filt << "/" << _labels_to_points[filt].cardinality() << " "; */
+    /* } */
+    /* out << "and intersection estimate " << inter_estim << std::endl; */
+    /* out << std::endl; */
+    /* out.close(); */
+    /* return std::make_pair(0, 0); */
+    /*
+        if (curr_query == 1)
+        {
+            std::ofstream out("query_stats1.txt", std::ios_base::app);
+            for (auto const &filt : filter_label)
+            {
+                out << filt << "/" << _labels_to_points[filt].cardinality() << " ";
+            }
+            out << std::endl;
+            out.close();
+        }*/
+
+    for (auto &lbl : filter_label)
+        filter_vec.push_back(lbl[0]);
+
+    std::vector<std::pair<LabelT, uint32_t>> sorted_filters = sort_filter_counts(filter_vec);
 
-    if (_label_to_start_id.find(filter_label) != _label_to_start_id.end())
+    _data_store->preprocess_query(query, scratch);
+    std::pair<uint32_t, uint32_t> retval;
+    if (_bruteforce_threshold < 3)
     {
-        init_ids.emplace_back(_label_to_start_id[filter_label]);
+        switch (_bruteforce_threshold)
+        {
+        case 0: {
+            num_brutes++;
+            // last_intersection has the common elements across all filters in sorted_filters
+#ifdef INSTRUMENT
+            auto s = std::chrono::high_resolution_clock::now();
+#endif
+            auto &last_intersection = scratch->get_valid_bitmap();
+            for (uint32_t or_itr = 0; or_itr < filter_label[0].size(); or_itr++) {
+                last_intersection |= _labels_to_points[filter_label[0][or_itr]];
+            }
+            for (size_t i = 1; i < filter_label.size(); i++)
+            {
+                auto &tmp_intersection = scratch->get_tmp_bitmap();                
+                for (uint32_t or_itr = 0; or_itr < filter_label[i].size(); or_itr++) {
+                    tmp_intersection |= _labels_to_points[filter_label[i][or_itr]];
+                }
+                last_intersection &= tmp_intersection;
+            }            
+            /*last_intersection = _labels_to_points[sorted_filters[0].first];
+            for (size_t i = 1; i < sorted_filters.size(); i++)
+            {
+                last_intersection &= _labels_to_points[sorted_filters[i].first];
+            }*/
+#ifdef INSTRUMENT
+            std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+            time_to_get_valid += diff.count();
+#endif
+
+            retval = brute_force_filters(scratch->aligned_query(), L, last_intersection, scratch);
+        }
+        break;
+        case 1: {
+            num_graphs++;
+            auto [inter_estim, cand] = sample_intersection(scratch->get_valid_bitmap(), scratch->get_tmp_bitmap(), filter_label);
+
+            if (cand.size() > 0)
+            {
+                init_ids.insert(init_ids.end(), cand.begin(), cand.end());
+//                init_ids.emplace_back(cand);
+            } /*else {
+                if (_label_to_start_id.find(filter_label[0]) != _label_to_start_id.end()) 
+                { 
+                    init_ids.emplace_back(_label_to_start_id[filter_label[0]]); 
+                } 
+            } */
+             if (use_global_start) {
+                init_ids.emplace_back(_start);
+             }
+
+            local_print = true;
+            if (print_qstats)
+            {
+                std::ofstream out("query_stats.txt", std::ios_base::app);
+                out << "estimated intersection size is " << inter_estim << std::endl;
+                //out << "setting up init ids with id " << cand << std::endl;
+                out.close();
+            }
+            retval = iterate_to_fixed_point(scratch, L, init_ids, true, filter_label, true);
+            }
+            break;
+            case 2: {
+                uint32_t old_penalty_scale = penalty_scale;
+                penalty_scale = 0;
+                num_graphs++;
+                auto [inter_estim, cand] = sample_intersection(scratch->get_valid_bitmap(), scratch->get_tmp_bitmap(), filter_label);
+    
+                if (cand.size() > 0)
+                {
+                    init_ids.insert(init_ids.end(), cand.begin(), cand.end());
+    //                init_ids.emplace_back(cand);
+                } /*else {
+                    if (_label_to_start_id.find(filter_label[0]) != _label_to_start_id.end()) 
+                    { 
+                        init_ids.emplace_back(_label_to_start_id[filter_label[0]]); 
+                    } 
+                } */
+                 if (use_global_start) {
+                    init_ids.emplace_back(_start);
+                 }
+    
+                local_print = true;
+                if (print_qstats)
+                {
+                    std::ofstream out("query_stats.txt", std::ios_base::app);
+                    out << "estimated intersection size is " << inter_estim << std::endl;
+                    //out << "setting up init ids with id " << cand << std::endl;
+                    out.close();
+                }
+                retval = iterate_to_fixed_point(scratch, L, init_ids, true, filter_label, true);
+                penalty_scale = old_penalty_scale;
+           }
+            break;
+    
+        }
     }
     else
     {
-        diskann::cout << "No filtered medoid found. exitting "
-                      << std::endl; // RKNOTE: If universal label found start there
-        throw diskann::ANNException("No filtered medoid found. exitting ", -1);
+#ifdef INSTRUMENT
+        auto s = std::chrono::high_resolution_clock::now();
+#endif
+        auto [estimated_match, cand] = sample_intersection(scratch->get_valid_bitmap(),scratch->get_tmp_bitmap(), filter_label);
+#ifdef INSTRUMENT
+        std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
+        time_to_estimate += diff.count();
+#endif
+        if (estimated_match < _bruteforce_threshold)
+        {
+            num_brutes++;
+            auto &last_intersection = scratch->get_valid_bitmap();
+            last_intersection = _labels_to_points[sorted_filters[0].first];
+            for (size_t i = 1; i < sorted_filters.size(); i++)
+            {
+                last_intersection &= _labels_to_points[sorted_filters[i].first];
+            }
+            retval = brute_force_filters(scratch->aligned_query(), L, last_intersection, scratch);
+        }
+        else
+        {
+            num_graphs++;
+            /* if (_dynamic_index) */
+            /*     tl.lock(); */
+            /**/
+            /* if (_label_to_start_id.find(filter_label[0]) != _label_to_start_id.end()) */
+            /* { */
+            /*     init_ids.emplace_back(_label_to_start_id[filter_label[0]]); */
+            /* } */
+            /* else */
+            /* { */
+            /*     diskann::cout << "No filtered medoid found. exitting " */
+            /*                   << std::endl; // RKNOTE: If universal label found start there */
+            /*     throw diskann::ANNException("No filtered medoid found. exitting ", -1); */
+            /* } */
+            /* if (_dynamic_index) */
+            /*     tl.unlock(); */
+
+            if (cand.size() > 0)
+            {
+                init_ids.insert(init_ids.end(), cand.begin(), cand.end());
+//                init_ids.emplace_back(cand);
+            } /*else {
+                if (_label_to_start_id.find(filter_label[0]) != _label_to_start_id.end()) 
+                { 
+                    init_ids.emplace_back(_label_to_start_id[filter_label[0]]); 
+                } 
+            }*/
+             if (use_global_start) {
+                init_ids.emplace_back(_start);
+             }
+
+            local_print = true;
+            if (print_qstats)
+            {
+                std::ofstream out("query_stats.txt", std::ios_base::app);
+                out << "Search path for query " << curr_query << " with filters/specificities ";
+                for (auto const &filt : filter_vec)
+                    out << filt << "/" << _labels_to_points[filt].cardinality() << " ";
+                out << std::endl;
+                out << "estimated intersection size is " << estimated_match << std::endl;
+                //out << "setting up init ids with id " << cand << std::endl;
+                out << std::endl;
+                out.close();
+            }
+            retval = iterate_to_fixed_point(scratch, L, init_ids, true, filter_label, true);
+        }
     }
-    if (_dynamic_index)
-        tl.unlock();
-
-    filter_vec.emplace_back(filter_label);
-
-    _data_store->preprocess_query(query, scratch);
-    auto retval = iterate_to_fixed_point(scratch, L, init_ids, true, filter_vec, true);
 
     auto best_L_nodes = scratch->best_l_nodes();
 
     size_t pos = 0;
-    for (size_t i = 0; i < best_L_nodes.size(); ++i)
+    if (print_qstats && local_print)
     {
-        if (best_L_nodes[i].id < _max_points)
+        std::ofstream out("query_stats.txt", std::ios_base::app);
+        out << "final results for L size " << best_L_nodes.size() << ": ";
+        for (size_t i = 0; i < best_L_nodes.size(); ++i)
         {
-            indices[pos] = (IdType)best_L_nodes[i].id;
+            out << best_L_nodes[i].id << " ";
+        }
+        out << std::endl << std::endl;
+        out.close();
+    }
+    for (size_t i = 0; i < best_L_nodes.size(); ++i)
+    {
+        if (best_L_nodes[i].id >= _max_points || (detect_filter_penalty(best_L_nodes[i].id, true, filter_label) != 0))
+            continue;
 
-            if (distances != nullptr)
-            {
+        indices[pos] = (IdType)best_L_nodes[i].id;
+
+        if (distances != nullptr)
+        {
 #ifdef EXEC_ENV_OLS
-                // DLVS expects negative distances
-                distances[pos] = best_L_nodes[i].distance;
+            // DLVS expects negative distances
+            distances[pos] = best_L_nodes[i].distance;
 #else
-                distances[pos] = _dist_metric == diskann::Metric::INNER_PRODUCT ? -1 * best_L_nodes[i].distance
-                                                                                : best_L_nodes[i].distance;
+            distances[pos] = _dist_metric == diskann::Metric::INNER_PRODUCT ? -1 * best_L_nodes[i].distance
+                                                                            : best_L_nodes[i].distance;
 #endif
-            }
-            pos++;
         }
+        pos++;
+
         if (pos == K)
             break;
     }
-    if (pos < K)
-    {
-        diskann::cerr << "Found fewer than K elements for query" << std::endl;
-    }
+    /*    if (pos < K)
+        {
+            diskann::cerr << "Found fewer than K elements for query" << std::endl;
+        } */
 
     return retval;
 }
@@ -2173,8 +2885,8 @@ size_t Index<T, TagT, LabelT>::search_with_tags(const T *query, const uint64_t K
     }
     else
     {
-        std::vector<LabelT> filter_vec;
         auto converted_label = this->get_converted_label(filter_label);
+        std::vector<LabelT> filter_vec;
         filter_vec.push_back(converted_label);
         iterate_to_fixed_point(scratch, L, init_ids, true, filter_vec, true);
     }
@@ -3379,42 +4091,42 @@ template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t,
     const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances);
 
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint64_t, uint32_t>::search_with_filters<
-    uint64_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const float *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint64_t, uint32_t>::search_with_filters<
-    uint32_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const float *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint64_t, uint32_t>::search_with_filters<
-    uint64_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const uint8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint64_t, uint32_t>::search_with_filters<
-    uint32_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const uint8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint64_t, uint32_t>::search_with_filters<
-    uint64_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const int8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint64_t, uint32_t>::search_with_filters<
-    uint32_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const int8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 // TagT==uint32_t
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint32_t, uint32_t>::search_with_filters<
-    uint64_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const float *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint32_t, uint32_t>::search_with_filters<
-    uint32_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const float *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint32_t, uint32_t>::search_with_filters<
-    uint64_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const uint8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint32_t, uint32_t>::search_with_filters<
-    uint32_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const uint8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t, uint32_t>::search_with_filters<
-    uint64_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const int8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t, uint32_t>::search_with_filters<
-    uint32_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const int8_t *query, const std::vector<std::vector<uint32_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint64_t, uint16_t>::search<uint64_t>(
     const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances);
@@ -3443,41 +4155,41 @@ template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t,
     const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances);
 
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint64_t, uint16_t>::search_with_filters<
-    uint64_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const float *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint64_t, uint16_t>::search_with_filters<
-    uint32_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const float *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint64_t, uint16_t>::search_with_filters<
-    uint64_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const uint8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint64_t, uint16_t>::search_with_filters<
-    uint32_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const uint8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint64_t, uint16_t>::search_with_filters<
-    uint64_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const int8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint64_t, uint16_t>::search_with_filters<
-    uint32_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const int8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 // TagT==uint32_t
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint32_t, uint16_t>::search_with_filters<
-    uint64_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const float *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<float, uint32_t, uint16_t>::search_with_filters<
-    uint32_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const float *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint32_t, uint16_t>::search_with_filters<
-    uint64_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const uint8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<uint8_t, uint32_t, uint16_t>::search_with_filters<
-    uint32_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const uint8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t, uint16_t>::search_with_filters<
-    uint64_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices,
-              float *distances);
+    uint64_t>(const int8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint64_t *indices, float *distances);
 template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> Index<int8_t, uint32_t, uint16_t>::search_with_filters<
-    uint32_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices,
-              float *distances);
+    uint32_t>(const int8_t *query, const std::vector<std::vector<uint16_t>> &filter_label, const size_t K, const uint32_t L,
+              uint32_t *indices, float *distances);
 
 } // namespace diskann
diff --git a/src/math_utils.cpp b/src/math_utils.cpp
index 7481da848..424467bcd 100644
--- a/src/math_utils.cpp
+++ b/src/math_utils.cpp
@@ -160,7 +160,10 @@ void compute_closest_centers(float *data, size_t num_points, size_t dim, float *
     if (!is_norm_given_for_pts)
         pts_norms_squared = new float[num_points];
 
-    size_t PAR_BLOCK_SIZE = num_points;
+    //    size_t PAR_BLOCK_SIZE = num_points;
+
+    size_t PAR_BLOCK_SIZE = 1000000;
+
     size_t N_BLOCKS =
         (num_points % PAR_BLOCK_SIZE) == 0 ? (num_points / PAR_BLOCK_SIZE) : (num_points / PAR_BLOCK_SIZE) + 1;
 
diff --git a/src/partition.cpp b/src/partition.cpp
index 570d45c7d..d9f5169af 100644
--- a/src/partition.cpp
+++ b/src/partition.cpp
@@ -31,12 +31,23 @@
 // #define SAVE_INFLATED_PQ true
 
 template <typename T>
-void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate)
+void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate,
+                      std::string label_file)
 {
     size_t read_blk_size = 64 * 1024 * 1024;
+    std::ifstream label_reader;
+    bool use_labels;
+    if (label_file != "")
+    {
+        label_reader = std::ifstream(label_file);
+        use_labels = true;
+    }
     cached_ifstream base_reader(base_file.c_str(), read_blk_size);
     std::ofstream sample_writer(std::string(output_prefix + "_data.bin").c_str(), std::ios::binary);
     std::ofstream sample_id_writer(std::string(output_prefix + "_ids.bin").c_str(), std::ios::binary);
+    std::ofstream sample_label_writer;
+    if (use_labels)
+        sample_label_writer = std::ofstream(std::string(output_prefix + "_labels.txt").c_str());
 
     std::random_device rd; // Will be used to obtain a seed for the random number engine
     auto x = rd();
@@ -61,12 +72,19 @@ void gen_random_slice(const std::string base_file, const std::string output_pref
     nd = nd_u32;
     std::unique_ptr<T[]> cur_row = std::make_unique<T[]>(nd);
 
+    std::string line;
     for (size_t i = 0; i < npts; i++)
     {
+        if (use_labels)
+        {
+            label_reader >> line;
+        }
         base_reader.read((char *)cur_row.get(), sizeof(T) * nd);
         float sample = distribution(generator);
         if (sample < sampling_rate)
         {
+            if (use_labels)
+                sample_label_writer << line << std::endl;
             sample_writer.write((char *)cur_row.get(), sizeof(T) * nd);
             uint32_t cur_i_u32 = (uint32_t)i;
             sample_id_writer.write((char *)&cur_i_u32, sizeof(uint32_t));
@@ -79,6 +97,8 @@ void gen_random_slice(const std::string base_file, const std::string output_pref
     sample_id_writer.write((char *)&num_sampled_pts_u32, sizeof(uint32_t));
     sample_writer.close();
     sample_id_writer.close();
+    if (use_labels)
+        sample_label_writer.close();
     diskann::cout << "Wrote " << num_sampled_pts_u32 << " points to sample file: " << output_prefix + "_data.bin"
                   << std::endl;
 }
@@ -604,11 +624,11 @@ int partition_with_ram_budget(const std::string data_file, const double sampling
 // Instantations of supported templates
 
 template void DISKANN_DLLEXPORT gen_random_slice<int8_t>(const std::string base_file, const std::string output_prefix,
-                                                         double sampling_rate);
+                                                         double sampling_rate, std::string label_file);
 template void DISKANN_DLLEXPORT gen_random_slice<uint8_t>(const std::string base_file, const std::string output_prefix,
-                                                          double sampling_rate);
+                                                          double sampling_rate, std::string label_file);
 template void DISKANN_DLLEXPORT gen_random_slice<float>(const std::string base_file, const std::string output_prefix,
-                                                        double sampling_rate);
+                                                        double sampling_rate, std::string label_file);
 
 template void DISKANN_DLLEXPORT gen_random_slice<float>(const float *inputdata, size_t npts, size_t ndims, double p_val,
                                                         float *&sampled_data, size_t &slice_size);
diff --git a/src/pq.cpp b/src/pq.cpp
index a86bf39bf..2a97fafb2 100644
--- a/src/pq.cpp
+++ b/src/pq.cpp
@@ -12,6 +12,7 @@
 
 // block size for reading/processing large files and matrices in blocks
 #define BLOCK_SIZE 5000000
+#define SAVE_INFLATED_PQ true
 
 namespace diskann
 {
@@ -735,6 +736,7 @@ int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_cent
     std::unique_ptr<float[]> rotmat_tr;
     std::unique_ptr<float[]> centroid;
     std::unique_ptr<uint32_t[]> chunk_offsets;
+    double total_error = 0;
 
     std::string inflated_pq_file = pq_compressed_vectors_path + "_inflated.bin";
 
@@ -900,8 +902,14 @@ int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_cent
                 block_compressed_base[j * num_pq_chunks + i] = closest_center[j];
 #ifdef SAVE_INFLATED_PQ
                 for (size_t k = 0; k < cur_chunk_size; k++)
+                {
+                    total_error += (block_inflated_base[j * dim + chunk_offsets[i] + k] -
+                                    block_data_float[j * dim + chunk_offsets[i] + k]) *
+                                   (block_inflated_base[j * dim + chunk_offsets[i] + k] -
+                                    block_data_float[j * dim + chunk_offsets[i] + k]);
                     block_inflated_base[j * dim + chunk_offsets[i] + k] =
                         cur_pivot_data[closest_center[j] * cur_chunk_size + k] + centroid[chunk_offsets[i] + k];
+                }
 #endif
             }
         }
@@ -932,6 +940,7 @@ int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_cent
 #ifdef SAVE_INFLATED_PQ
     inflated_file_writer.close();
 #endif
+    std::cout << "Average error = " << total_error / num_points << "." << std::endl;
     return 0;
 }
 
diff --git a/src/scratch.cpp b/src/scratch.cpp
index c3836ccf1..e43e81a09 100644
--- a/src/scratch.cpp
+++ b/src/scratch.cpp
@@ -28,15 +28,20 @@ InMemQueryScratch<T>::InMemQueryScratch(uint32_t search_l, uint32_t indexing_l,
     alloc_aligned(((void **)&this->_aligned_query_T), aligned_dim * sizeof(T), alignment_factor * sizeof(T));
     memset(this->_aligned_query_T, 0, aligned_dim * sizeof(T));
 
+    alloc_aligned(((void **)&this->_aligned_query_float), aligned_dim * sizeof(float),
+                  alignment_factor * sizeof(float));
+    memset(this->_aligned_query_float, 0, aligned_dim * sizeof(float));
+
     if (init_pq_scratch)
         this->_pq_scratch = new PQScratch<T>(defaults::MAX_GRAPH_DEGREE, aligned_dim);
     else
         this->_pq_scratch = nullptr;
 
     _occlude_factor.reserve(maxc);
+    _cluster_distances.reserve(MAX_NUM_CLUSTERS);
     _inserted_into_pool_bs = new boost::dynamic_bitset<>();
-    _id_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R));
-    _dist_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R));
+    _id_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R * _R));
+    _dist_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R * _R));
 
     resize_for_new_L(std::max(search_l, indexing_l));
 }
@@ -46,9 +51,10 @@ template <typename T> void InMemQueryScratch<T>::clear()
     _pool.clear();
     _best_l_nodes.clear();
     _occlude_factor.clear();
-
+    _closest_clusters.clear();
     _inserted_into_pool_rs.clear();
     _inserted_into_pool_bs->reset();
+    _cluster_distances.clear();
 
     _id_scratch.clear();
     _dist_scratch.clear();
@@ -65,7 +71,7 @@ template <typename T> void InMemQueryScratch<T>::resize_for_new_L(uint32_t new_l
         _L = new_l;
         _pool.reserve(3 * _L + _R);
         _best_l_nodes.reserve(_L);
-
+        _closest_clusters.reserve(2 * _L);
         _inserted_into_pool_rs.reserve(20 * _L);
     }
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index 3773cda22..ba3aa6ff5 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -127,8 +127,10 @@ void normalize_data_file(const std::string &inFileName, const std::string &outFi
 }
 
 double calculate_recall(uint32_t num_queries, uint32_t *gold_std, float *gs_dist, uint32_t dim_gs,
-                        uint32_t *our_results, uint32_t dim_or, uint32_t recall_at)
+                        uint32_t *our_results, uint32_t dim_or, uint32_t recall_at, unsigned r2)
 {
+    if (r2 == 0)
+        r2 = recall_at;
     double total_recall = 0;
     std::set<uint32_t> gt, res;
 
@@ -149,8 +151,8 @@ double calculate_recall(uint32_t num_queries, uint32_t *gold_std, float *gs_dist
 
         gt.insert(gt_vec, gt_vec + tie_breaker);
         res.insert(res_vec,
-                   res_vec + recall_at); // change to recall_at for recall k@k
-                                         // or dim_or for k@dim_or
+                   res_vec + r2); // change to recall_at for recall k@k
+                                  // or dim_or for k@dim_or
         uint32_t cur_recall = 0;
         for (auto &v : gt)
         {