diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6151054..78e91a2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -38,10 +38,12 @@ macro(ConfigureTarget Target) ) endmacro() +add_library(mapped_file STATIC "libraries/mapped_file.cpp") +ConfigureTarget(mapped_file) + # Source files add_library(CubeObjs OBJECT "src/cubes.cpp" - "src/cache.cpp" "src/rotations.cpp" "src/newCache.cpp" ) @@ -50,6 +52,7 @@ ConfigureTarget(CubeObjs) # Build main program add_executable(${PROJECT_NAME} "program.cpp" $) target_link_libraries(${PROJECT_NAME} pthread) +target_link_libraries(${PROJECT_NAME} mapped_file) ConfigureTarget(${PROJECT_NAME}) # Optionally build tests diff --git a/cpp/include/cache.hpp b/cpp/include/cache.hpp deleted file mode 100644 index 6c3480d..0000000 --- a/cpp/include/cache.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#ifndef OPENCUBES_CACHE_HPP -#define OPENCUBES_CACHE_HPP -#include - -#include "hashes.hpp" -#include "utils.hpp" - -struct Cache { - static constexpr uint32_t MAGIC = 0x42554350; - static constexpr uint32_t XYZ_SIZE = 3; - static constexpr uint32_t ALL_SHAPES = -1; - struct Header { - uint32_t magic = MAGIC; // shoud be "PCUB" = 0x42554350 - uint32_t n; // we will never need 32bit but it is nicely aligned - uint32_t numShapes; // defines length of the shapeTable - uint64_t numPolycubes; // total number of polycubes - }; - struct ShapeEntry { - uint8_t dim0; // offset by -1 - uint8_t dim1; // offset by -1 - uint8_t dim2; // offset by -1 - uint8_t reserved; // for alignment - uint64_t offset; // from beginning of file - uint64_t size; // in bytes should be multiple of XYZ_SIZE - }; - - static void save(std::string path, Hashy& hashes, uint8_t n); - static Hashy load(std::string path, uint32_t extractShape = ALL_SHAPES); - - int filedesc; - void* mmap_ptr; -}; - -#endif diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp index 83feaa7..e92e570 100644 --- a/cpp/include/cube.hpp +++ b/cpp/include/cube.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "utils.hpp" @@ -45,20 +46,47 @@ using XYZSet = std::unordered_set>; struct Cube { private: - struct { - uint8_t is_shared : 1; - uint8_t size : 7; // MAX 127 - } bits; - XYZ *array = nullptr; - - static_assert(sizeof(bits) == sizeof(uint8_t)); + // cube memory is stored two ways: + // normal, new'd buffer: is_shared == false + // shared, external memory: is_shared == true + + struct bits_t { + uint64_t is_shared : 1; + uint64_t size : 7; // MAX 127 + uint64_t addr : 56; // low 56-bits of memory address. + }; + // fields + bits_t fields; + + static_assert(sizeof(bits_t) == sizeof(void*)); + // extract the pointer from bits_t + static XYZ *get(bits_t key) { + // pointer bit-hacking: + uint64_t addr = key.addr; + return reinterpret_cast(addr); + } + static bits_t put(bool is_shared, int size, XYZ *addr) { + // mask off top byte from the memory address to fit it into bits_t::addr + // on x86-64 it is not used by the hardware (yet). + // This hack actually saves 8 bytes because previously + // the uint8_t caused padding to 16 bytes. + // @note if we get segfaults dereferencing get(fields) + // then this is the problem and this hack must be undone. 
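+        // Illustration (editor's sketch, hypothetical address): with 4-level paging,
+        // x86-64 user-space pointers occupy only the low 48 bits, e.g.
+        //   0x00007f1234567890 & 0x00ffffffffffffff == 0x00007f1234567890
+        // so the address round-trips through bits_t::addr unchanged.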
+        uint64_t tmp = reinterpret_cast<uint64_t>((void *)addr);
+        tmp &= 0xffffffffffffff;
+        bits_t bits;
+        bits.addr = tmp;
+        bits.is_shared = is_shared;
+        bits.size = size;
+        return bits;
+    }

    public:
     // Empty cube
-    Cube() : bits{0, 0} {}
+    Cube() : fields{put(0, 0, nullptr)} {}

     // Cube with N capacity
-    explicit Cube(uint8_t N) : bits{0, N}, array(new XYZ[bits.size]) {}
+    explicit Cube(uint8_t N) : fields{put(0, N, new XYZ[N])} {}

     // Construct from pieces
     Cube(std::initializer_list<XYZ> il) : Cube(il.size()) { std::copy(il.begin(), il.end(), begin()); }
@@ -69,20 +97,23 @@
     // Construct from external source.
     // Cube shares the memory until modified.
     // Caller guarantees the memory given will live longer than *this
-    Cube(XYZ *start, uint8_t n) : bits{1, n}, array(start) {}
+    Cube(const XYZ *start, uint8_t n) : fields{put(1, n, const_cast<XYZ *>(start))} {}

     // Copy ctor.
     Cube(const Cube &copy) : Cube(copy.size()) { std::copy(copy.begin(), copy.end(), begin()); }

     ~Cube() {
+        bits_t bits = fields;
         if (!bits.is_shared) {
-            delete[] array;
+            delete[] get(bits);
         }
     }

     friend void swap(Cube &a, Cube &b) {
         using std::swap;
-        swap(a.array, b.array);
-        swap(a.bits, b.bits);
+        bits_t abits = a.fields;
+        bits_t bbits = b.fields;
+        a.fields = bbits;
+        b.fields = abits;
     }

     Cube(Cube &&mv) : Cube() { swap(*this, mv); }
@@ -98,19 +129,15 @@
         return *this;
     }

-    size_t size() const { return bits.size; }
+    size_t size() const { return fields.size; }

     XYZ *data() {
-        if (bits.is_shared) {
-            // lift to RAM: this should never happen really.
-            Cube tmp(array, bits.size);
-            swap(*this, tmp);
-            std::printf("Bad use of Cube\n");
-        }
-        return array;
-    }
+        return get(fields);
+    }

-    const XYZ *data() const { return array; }
+    const XYZ *data() const {
+        return get(fields);
+    }

     XYZ *begin() { return data(); }
@@ -138,8 +165,16 @@
     void print() const {
         for (auto &p : *this) std::printf(" (%2d %2d %2d)\n\r", p.x(), p.y(), p.z());
     }
+
+    /**
+     * Copy cube data into destination buffer.
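+     * Example (illustrative only; dest must have room for num entries):
+     *   XYZ buf[32];
+     *   c.copyout(c.size(), buf);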
+     */
+    void copyout(int num, XYZ* dest) const {
+        std::copy_n(begin(), num, dest);
+    }
 };

+static_assert(sizeof(Cube) == 8, "Unexpected sizeof(Cube) for Cube");
 static_assert(std::is_move_assignable_v<Cube>, "Cube must be moveable");
 static_assert(std::is_swappable_v<Cube>, "Cube must be swappable");
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 7999d5c..49462d2 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -28,7 +28,7 @@ using CubeSet = std::unordered_set>;
 struct Hashy {
     struct Subsubhashy {
         CubeSet set;
-        std::shared_mutex set_mutex;
+        mutable std::shared_mutex set_mutex;

         template <typename CubeT>
         void insert(CubeT &&c) {
@@ -36,12 +36,16 @@
             set.emplace(std::forward<CubeT>(c));
         }

-        bool contains(const Cube &c) {
+        bool contains(const Cube &c) const {
             std::shared_lock lock(set_mutex);
-            return set.count(c);
+            auto itr = set.find(c);
+            if (itr != set.end()) {
+                return true;
+            }
+            return false;
         }

-        auto size() {
+        auto size() const {
             std::shared_lock lock(set_mutex);
             return set.size();
         }
@@ -59,7 +63,7 @@
             // printf("new size %ld\n\r", byshape[shape].size());
         }

-        auto size() {
+        auto size() const {
             size_t sum = 0;
             for (auto &set : byhash) {
                 auto part = set.size();
@@ -95,12 +99,12 @@
         set.insert(std::forward<CubeT>(c));
     }

-    auto size() {
+    auto size() const {
         size_t sum = 0;
-        DEBUG_PRINTF("%ld maps by shape\n\r", byshape.size());
+        DEBUG1_PRINTF("%ld maps by shape\n\r", byshape.size());
         for (auto &set : byshape) {
             auto part = set.second.size();
-            DEBUG_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part);
+            DEBUG1_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part);
             sum += part;
         }
         return sum;
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 29e622e..c24a06b 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -1,13 +1,38 @@
 #pragma once
 #ifndef OPENCUBES_NEWCACHE_HPP
 #define OPENCUBES_NEWCACHE_HPP
+#include
 #include
+#include
+#include
+#include
 #include
+#include

 #include "cube.hpp"
 #include "hashes.hpp"
-
-class Workset;
+#include "mapped_file.hpp"
+
+namespace cacheformat {
+static constexpr uint32_t MAGIC = 0x42554350;
+static constexpr uint32_t XYZ_SIZE = 3;
+static constexpr uint32_t ALL_SHAPES = -1;
+
+struct Header {
+    uint32_t magic = MAGIC;  // should be "PCUB" = 0x42554350
+    uint32_t n;              // we will never need 32 bits, but it is nicely aligned
+    uint32_t numShapes;      // defines length of the shapeTable
+    uint64_t numPolycubes;   // total number of polycubes
+};
+struct ShapeEntry {
+    uint8_t dim0;      // offset by -1
+    uint8_t dim1;      // offset by -1
+    uint8_t dim2;      // offset by -1
+    uint8_t reserved;  // for alignment
+    uint64_t offset;   // from beginning of file
+    uint64_t size;     // in bytes, should be a multiple of XYZ_SIZE
+};
+};  // namespace cacheformat

 class CubeIterator {
    public:
    using value_type = Cube;
    using reference = Cube&;  // or also value_type&

    // constructor
-    CubeIterator(uint32_t _n, XYZ* ptr) : n(_n), m_ptr(ptr) {}
+    CubeIterator(uint32_t _n, const XYZ* ptr) : n(_n), m_ptr(ptr) {}

    // invalid iterator (can't dereference)
    explicit CubeIterator() : n(0), m_ptr(nullptr) {}
@@ -27,6 +52,8 @@
    const value_type operator*() const { return Cube(m_ptr, n); }
    // pointer operator->() { return (pointer)m_ptr; }

+    const XYZ* data() const { return m_ptr; }
+
    // Prefix increment
    CubeIterator& operator++() {
        m_ptr += n;
@@ -49,17 +76,16 @@
    friend bool operator<(const CubeIterator& a, const
CubeIterator& b) { return a.m_ptr < b.m_ptr; }; friend bool operator>(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr > b.m_ptr; }; friend bool operator!=(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr != b.m_ptr; }; - friend class Workset; private: uint32_t n; - XYZ* m_ptr; + const XYZ* m_ptr; }; class ShapeRange { public: - ShapeRange(XYZ* start, XYZ* stop, uint64_t _cubeLen, XYZ _shape) - : b(_cubeLen, start), e(_cubeLen, stop), size_(((uint64_t)stop - (uint64_t)start) / (_cubeLen * sizeof(XYZ))), shape_(_shape) {} + ShapeRange(const XYZ* start, const XYZ* stop, uint64_t _cubeLen, XYZ _shape) + : b(_cubeLen, start), e(_cubeLen, stop), size_(std::distance(start, stop) / _cubeLen), shape_(_shape) {} CubeIterator begin() { return b; } CubeIterator end() { return e; } @@ -98,46 +124,33 @@ class CacheReader : public ICache { uint32_t numShapes() override { return header->numShapes; }; operator bool() { return fileLoaded_; } - static constexpr uint32_t MAGIC = 0x42554350; - static constexpr uint32_t XYZ_SIZE = 3; - static constexpr uint32_t ALL_SHAPES = -1; - - struct Header { - uint32_t magic = MAGIC; // shoud be "PCUB" = 0x42554350 - uint32_t n; // we will never need 32bit but it is nicely aligned - uint32_t numShapes; // defines length of the shapeTable - uint64_t numPolycubes; // total number of polycubes - }; - struct ShapeEntry { - uint8_t dim0; // offset by -1 - uint8_t dim1; // offset by -1 - uint8_t dim2; // offset by -1 - uint8_t reserved; // for alignment - uint64_t offset; // from beginning of file - uint64_t size; // in bytes should be multiple of XYZ_SIZE - }; - - CubeIterator begin() { - uint8_t* start = filePointer + shapes[0].offset; - return CubeIterator(header->n, (XYZ*)start); + // Do begin() and end() make sense for CacheReader + // If the cache file provides data for more than single shape? + // The data might not even be mapped contiguously to save memory. + /*CubeIterator begin() { + const uint8_t* start = filePointer + shapes[0].offset; + return CubeIterator(header->n, (const XYZ*)start); } CubeIterator end() { - uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE; - return CubeIterator(header->n, (XYZ*)stop); - } + const uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE; + return CubeIterator(header->n, (const XYZ*)stop); + }*/ + // get shapes at index [0, numShapes()[ ShapeRange getCubesByShape(uint32_t i) override; private: - uint8_t* filePointer; + std::shared_ptr file_; + std::unique_ptr> header_; + std::unique_ptr> shapes_; + std::unique_ptr> xyz_; + std::string path_; - int fileDescriptor_; - uint64_t fileSize_; bool fileLoaded_; - Header dummyHeader; - Header* header; - ShapeEntry* shapes; + const cacheformat::Header dummyHeader; + const cacheformat::Header* header; + const cacheformat::ShapeEntry* shapes; }; class FlatCache : public ICache { @@ -171,4 +184,40 @@ class FlatCache : public ICache { size_t size() override { return allXYZs.size() / n / sizeof(XYZ); } }; +class CacheWriter { + protected: + std::mutex m_mtx; + std::condition_variable m_run; + std::condition_variable m_wait; + bool m_active = true; + + // Jobs that flush and finalize the written file. + size_t m_num_flushes = 0; + std::deque> m_flushes; + + // Temporary copy jobs into the memory mapped file. + size_t m_num_copys = 0; + std::deque> m_copy; + + // thread pool executing the jobs. 
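+    // Each thread runs CacheWriter::run(), which drains m_copy before m_flushes;
+    // m_run wakes idle threads and m_wait signals completion back to save()/flush().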
+ std::deque m_flushers; + + void run(); + + public: + CacheWriter(int num_threads = 8); + ~CacheWriter(); + + /** + * Capture snapshot of the Hashy and write cache file. + * The data may not be entirely flushed before save() returns. + */ + void save(std::string path, Hashy& hashes, uint8_t n); + + /** + * Complete all flushes immediately. + */ + void flush(); +}; + #endif diff --git a/cpp/include/utils.hpp b/cpp/include/utils.hpp index 4cd23e3..f895877 100644 --- a/cpp/include/utils.hpp +++ b/cpp/include/utils.hpp @@ -3,12 +3,39 @@ #define OPENCUBES_UTILS_HPP #include + +// Debug print level: all prints enabled +// below DEBUG_LEVEL. +// DEBUG_LEVEL -> 0 all prints disabled. +// DEBUG_LEVEL -> 1 enable DEBUG_PRINTF() statements +// DEBUG_LEVEL -> 2 enable DEBUG1_PRINTF() statements and earlier +// DEBUG_LEVEL -> 3 all prints enabled. +#define DEBUG_LEVEL 1 + #ifdef DEBUG + +#if DEBUG_LEVEL >= 1 #define DEBUG_PRINTF(...) std::printf(__VA_ARGS__) -#else -#define DEBUG_PRINTF(...) \ - do { \ - } while (0) #endif +#if DEBUG_LEVEL >= 2 +#define DEBUG1_PRINTF(...) std::printf(__VA_ARGS__) +#endif + +#if DEBUG_LEVEL >= 3 +#define DEBUG2_PRINTF(...) std::printf(__VA_ARGS__) +#endif + +#endif + +#ifndef DEBUG_PRINTF +#define DEBUG_PRINTF(...) do {} while (0) #endif +#ifndef DEBUG1_PRINTF +#define DEBUG1_PRINTF(...) do {} while (0) +#endif +#ifndef DEBUG2_PRINTF +#define DEBUG2_PRINTF(...) do {} while (0) +#endif + +#endif \ No newline at end of file diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp new file mode 100644 index 0000000..e7261dc --- /dev/null +++ b/cpp/libraries/mapped_file.cpp @@ -0,0 +1,449 @@ +/** + * Copyright 2023 Jarmo A Tiitto + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the “Software”), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "mapped_file.hpp" + +#include +#include +#include +#include +#include + +// POSIX/Linux APIs +#include +#include +#include +#include + +#include +#include + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#endif + +namespace mapped { + +/** + * Mapped file POSIX/Linux compatible implementation + */ +file::file() : fd(-1), fd_size(0) {} + +file::~file() { close(); } + +void file::close() { + if (fd >= 0) { + ::fsync(fd); + ::close(fd); + fd = -1; + fd_size = 0; + } +} + +int file::open(const char* fname) { + close(); + + fd = ::open64(fname, O_RDONLY); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file for reading\n"); + return -1; + } + + struct stat64 finfo; + if (fstat64(fd, &finfo)) { + std::fprintf(stderr, "Error getting file size: %s\n", std::strerror(errno)); + return -1; + } + fd_size = finfo.st_size; + fd_rw = false; + return 0; +} + +int file::openrw(const char* fname, size_t maxsize, int flags) { + // create new files with "normal" permissions: "-rw-r--r--" + const mode_t fperms = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH; + + close(); + + maxsize = roundUp(maxsize); + + if (!flags) { + fd = ::open64(fname, O_RDWR | O_CLOEXEC); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + + fd_rw = true; + + struct stat64 finfo; + if (fstat64(fd, &finfo)) { + std::fprintf(stderr, "Error getting file size:%s\n", std::strerror(errno)); + return -1; + } + return truncate(finfo.st_size); + + } else if ((flags & (CREATE | RESIZE)) == (CREATE | RESIZE)) { + fd = ::open64(fname, O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC, fperms); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + fd_rw = true; + + if(flags & FSTUNE) { + int flags = 0; + ioctl(fd, FS_IOC_GETFLAGS, &flags); + flags |= FS_NOATIME_FL | FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &flags); + } + return truncate(maxsize); + + } else if ((flags & RESIZE) != 0) { + fd = ::open64(fname, O_RDWR | O_CLOEXEC, fperms); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + fd_rw = true; + return truncate(maxsize); + } else { + std::fprintf(stderr, "Invalid open flags:%s\n", std::strerror(errno)); + return -1; + } +} + +bool file::is_rw() const { return fd_rw; } + +seekoff_t file::size() const { return fd_size; } + +int file::truncate(seekoff_t newsize) { + // resize the backing file + if (newsize != fd_size && ftruncate64(fd, newsize)) { + std::fprintf(stderr, "Error resizing backing file:%s\n", std::strerror(errno)); + return -1; + } + fd_size = newsize; + return 0; +} + +/** + * Mapped region POSIX/Linux compatible implementation. + */ + +region::region(std::shared_ptr src, seekoff_t fpos, len_t size, len_t window) : mfile(src) { + std::lock_guard lock(mtx); + remap(fpos, size, window); +} + +region::region(std::shared_ptr src) : mfile(src) { + std::lock_guard lock(mtx); + auto sz = mfile->size(); + remap(0, sz, sz); +} + +region::~region() { + // destructor is not thread-safe. + std::lock_guard lock(mtx); + map_fseek = 0; + remap(0, 0, 0); +} + +/** + * This is the core implementation of mapped_file: + * remap(0,0) releases the mapping. 
+ * remap(0, n) mmap roundUp(n) bytes at offset 0 + * remap(0, k) mremap roundUp(n) bytes at offset 0 (grows the existing mapping) + * remap(n, j) munmap old region, mmap new at offset roundDown(n) + * + * In read-write mode the backing file is grown to fit the mapping. + * + * @warn this->mtx must be held when this function is called. + */ +void region::remap(const seekoff_t fpos, const len_t size, const len_t window) { + if (fpos == usr_fseek && size == usr_size) return; // No-op + // check if [fpos, fpos+size] fits into the existing + // mmap() window and only adjust the user region. + if (size && map_ptr && (map_fseek <= fpos && fpos + size <= map_fseek + map_size)) { + usr_fseek = fpos; + usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek); + usr_size = size; + return; + } + + // if size == 0 or the usr_fseek != fpos, + // we have to unmap the old region first, if any. + if (!!map_ptr && (size == 0 || usr_fseek != fpos)) { + if (::munmap(map_ptr, map_size) == -1) { + std::fprintf(stderr, "Error mapping file memory\n"); + return; + } + map_ptr = nullptr; + map_size = 0; + usr_ptr = nullptr; + usr_size = 0; + if (size == 0) return; + } + // keep what user tried to ask: + usr_fseek = fpos; + usr_size = size; + + if (map_ptr && map_fseek == fpos) { + // this mapping exists already at same map_fseek + // remap it to grow the region. + auto newsize = roundUp(std::max(size, window)); + void* newptr = mremap(map_ptr, map_size, newsize, MREMAP_MAYMOVE); + if (newptr == MAP_FAILED) { + std::fprintf(stderr, "Error resizing memory-map of file:%s\n", std::strerror(errno)); + std::abort(); + return; + } + map_ptr = newptr; + map_size = newsize; + return; + } + + // create new mapping + if (mfile->is_rw()) { + // RW mapping + auto newsize = roundUp(std::max(size, window)); + + // take file lock so size() check --> truncate is atomic. + std::unique_lock trunclock(mfile->mut); + if (mfile->size() < fpos + newsize && mfile->truncate(fpos + newsize)) { + // failed. Disk full? + std::abort(); + return; + } + trunclock.unlock(); + + // mmap requires fpos && size to be multiple of PAGE_SIZE + map_fseek = roundDown(fpos); + if (map_fseek < fpos) { + // adjust size to cover. + newsize += PAGE_SIZE; + } + map_size = newsize; + map_ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, mfile->fd, map_fseek); + if (map_ptr == MAP_FAILED) { + // If this gets triggered we are in deep trouble + std::fprintf(stderr, "Error memory-mapping file:%s %lu %d %lu\n", std::strerror(errno), size, mfile->fd, fpos); + std::fprintf(stderr, "Dumping /proc/self/maps:\n"); + // for debugging information try print /proc/self/mmaps contents + // as this explains why we hit some limit of the system. + std::ifstream fmaps("/proc/self/maps"); + std::string buf; + int count = 0; + while(std::getline(fmaps, buf)) { + std::fprintf(stderr, "%s\n", buf.c_str()); + ++count; + } + std::fprintf(stderr, "counted %d memory-maps in process.\n", count); + return; + } + } else { + // RO mapping + if (mfile->size() <= fpos) { + // can't: the backing file is too small. + std::fprintf(stderr, "Error seeking past end of file.\n"); + std::abort(); + return; + } + map_size = roundUp(std::max(size, window)); + map_fseek = roundDown(fpos); + // Map the region. 
(use huge pages, don't reserve backing store) + map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE, mfile->fd, map_fseek); + + if (!map_ptr || map_ptr == MAP_FAILED) { + std::fprintf(stderr, "Error mapping file\n"); + std::abort(); + return; + } + } + + // hint that this memory is accessed in random order. + if(madvise(map_ptr, map_size, MADV_RANDOM)) { + std::fprintf(stderr, "warn: madvice(MADV_RANDOM) failed: %s\n", std::strerror(errno)); + } + // adjust the usr_ptr to fix + // any page misalignment. + usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek); +} + +void region::window(len_t window) { + std::lock_guard lock(mtx); + auto usize = usr_size; + // note: remap() does nothing if window == usr_size + remap(usr_fseek, window, window); + usr_size = usize; +} + +void region::jump(seekoff_t fpos) { + std::lock_guard lock(mtx); + remap(fpos, usr_size, map_size); + is_dirty = false; +} + +void region::flushJump(seekoff_t fpos) { + flush(); + std::lock_guard lock(mtx); + remap(fpos, usr_size, map_size); +} + +void region::flush() { + // only flush if dirty and RW mapped. + std::lock_guard lock(mtx); + if (is_dirty && mfile->is_rw()) { + is_dirty = false; + auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr); + auto flush_len = roundUp(usr_size); + if (flush_begin < usr_ptr) flush_len += PAGE_SIZE; + if (msync(flush_begin, flush_len, MS_ASYNC)) { + std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno)); + } + } +} + +void region::sync() { + // only flush if dirty and RW mapped. + std::lock_guard lock(mtx); + if (is_dirty && mfile->is_rw()) { + is_dirty = false; + auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr); + auto flush_len = roundUp(usr_size); + if (flush_begin < usr_ptr) flush_len += PAGE_SIZE; + if (msync(flush_begin, flush_len, MS_SYNC)) { + std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno)); + } + } +} + +void region::writeAt(seekoff_t fpos, len_t datasize, const void* data) { + auto srcmem = (const char*)data; + + // take file lock so that file size check --> truncate is atomic. + std::unique_lock trunclock(mfile->mut); + if(mfile->size() < fpos+datasize && mfile->truncate(fpos+datasize)) { + return; + } + trunclock.unlock(); + + // does write fall out the mapped area begin? + if (fpos < map_fseek) { + // max size that can be written before map_fseek + ssize_t wr = std::min(map_fseek - fpos, datasize); + if (pwrite(mfile->fd, srcmem, wr, fpos) != wr) { + std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno)); + } + srcmem += wr; + fpos += wr; + datasize -= wr; + } + + if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) { + // max size that can be copied into this mapping: + ssize_t wr = std::min(map_size - (fpos - map_fseek), datasize); + std::memcpy((char*)map_ptr + (fpos - map_fseek), srcmem, wr); + srcmem += wr; + fpos += wr; + datasize -= wr; + } + + // does write fall out the mapped area end? + if (datasize) { + // write into backing file after the mapped area: + if (pwrite(mfile->fd, srcmem, datasize, fpos) != ssize_t(datasize)) { + std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno)); + } + } +} + +void region::readAt(seekoff_t fpos, len_t datasize, void* data) const { + auto dstmem = (char*)data; + + // does read fall out the mapped area begin? 
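+    // (Mirrors writeAt(): up to three spans are handled, pread() for bytes
+    // before the mapping, memcpy for bytes inside it, pread() for bytes after it.)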
+ if (fpos < map_fseek) { + // max size that can be written before map_fseek + ssize_t rd = std::min(map_fseek - fpos, datasize); + if (pread(mfile->fd, dstmem, rd, fpos) != rd) { + std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno)); + } + dstmem += rd; + fpos += rd; + datasize -= rd; + } + + if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) { + // max size that can be copied from this mapping: + ssize_t rd = std::min(map_size - (fpos - map_fseek), datasize); + std::memcpy(dstmem, (char*)map_ptr + (fpos - map_fseek), rd); + dstmem += rd; + fpos += rd; + datasize -= rd; + } + + // does read fall out the mapped area end? + if (datasize) { + // read from backing file after the mapped area: + if (pread(mfile->fd, dstmem, datasize, fpos) != ssize_t(datasize)) { + std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno)); + } + } +} + + +void region::resident(bool resident) { + std::lock_guard lock(mtx); + if(madvise(map_ptr, map_size, resident ? MADV_WILLNEED : MADV_DONTNEED)) { + std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno)); + } +} + + +void region::discard(seekoff_t fpos, len_t datasize) { + + auto cur = usr_fseek + fpos; + + if (cur < map_fseek + map_size) { + // max size that can discarded from this mapping: + ssize_t dm = std::min(map_size - (cur - map_fseek), datasize); + + // Have to be careful here: if we delete too much + // caller will not have an good time. + // align size down to page size. + dm = roundDown(dm); + // align file offset up + auto _first = roundUp(cur - map_fseek); + if(_first > cur - map_fseek) + dm -= PAGE_SIZE; + + if(dm >= (signed)PAGE_SIZE) { + if(madvise((char*)map_ptr + _first, dm, MADV_REMOVE)) { + std::fprintf(stderr,"Error discarding memory-map region:%s\n",std::strerror(errno)); + } + } + } +} + +}; // namespace mapped diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp new file mode 100644 index 0000000..b86657a --- /dev/null +++ b/cpp/libraries/mapped_file.hpp @@ -0,0 +1,552 @@ +/** + * Copyright 2023 Jarmo A Tiitto + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the “Software”), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef MAPPEDFILE_HPP_INCLUDED +#define MAPPEDFILE_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +/** + * Memory mapped file I/O utilities + * - mapped::file class for opening an file + * - mapped::region class for RW/RO memory mapping part the file instance. 
+ * - mapped::struct_region template for RW/RO accessing part the file as specified type. + * - mapped::array_region template for RW/RO accessing part of the file as array of T elements. + * + * @note + * When doing read-only mapping the region instance + * should be const qualified as this restricts + * the region class API to read-only operations and prevents + * accidental modification of the file. + * Use std::make_unique() in this case. + * + * @note + * When using the read-write features the backing file is resized + * in multiple PAGE_SIZE blocks even if the actually mapped size is + * something else. + * openrw(...,size,RESIZE) always truncates the file to roundUp(size). + * You should do file->truncate(< sizeInBytes>) to make the file + * size exactly what you want before the file is closed. + * + * Modified regions should flush() or sync() before they are destroyed + * or the modified data may not end up in the file. + * + * TODO: + * - Two region instances should not overlap, + * i.e same portion of the file should not be mapped twice. + * (Not sure if this is actually broken now, but you have been warned) + * - Multi-threading support not tested/written. + * Currently the same mapped region can be used by multiple threads, + * but cannot it be modified. + * - Better error handling. (exceptions?, error codes?) + * Currently critical errors are printed and std::abort() is called. + * How do we handle system errors that happen in constructors? + */ +namespace mapped { + +const size_t PAGE_SIZE = 4096; + +static inline size_t roundToPage(ptrdiff_t x) { return (std::max(0, x - 1) & ~(PAGE_SIZE - 1)) + PAGE_SIZE; } + +constexpr inline size_t roundUp(uintptr_t x) { return (x + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); } + +constexpr inline size_t roundDown(uintptr_t x) { return (x & ~(PAGE_SIZE - 1)); } + +/** + * seekoff_t: Position of the file cursor + */ +using seekoff_t = uint64_t; +/** + * len_t: length of file data + */ +using len_t = size_t; + +class file; + +/** + * Memory-mapped region + * @brief + * the base class memory-maps an raw range of bytes from the backing file. + */ +class region { + protected: + std::mutex mtx; + // actually mapped region: + void* map_ptr = nullptr; + size_t map_size = 0; + seekoff_t map_fseek = 0; + // what constructor asked: + void* usr_ptr = nullptr; + size_t usr_size = 0; + seekoff_t usr_fseek = 0; + // todo: maybe use std::weak_ptr? + // that would allow file to be released and + // any any existing region(s) would still work. + // (but only if remap() is not called) + std::shared_ptr mfile; + // non-const data access sets is_dirty. + bool is_dirty = false; + + void remap(const seekoff_t fpos, const len_t size, const len_t window); + + region() {} + + public: + /** + * Open memory mapped region into a file. + * @brief + * Seeks at fpos in file and map size bytes + * starting from that position in file. + * @param window + * over-extend mapping up to max(size,window) bytes. + * Setting window bigger than size allows more efficient operation: + * [fpos, fpos + window] area is memory mapped + * but region will only operate on the + * [roundDown(fpos), roundup(fpos+size)] + * sub-portion of the memory. + * @note + * - Seeking past the EOF in file that is read-only will fail. + * The mapped size may extend past EOF but accessing past EOF + * either returns undefined data or program is terminated by OS. + * (EOF is at file->size()) + * - Seeking past the EOF that is read-write + * grows the backing file to fit the mapping. 
+ * The backing file is always extended in multiple of PAGE_SIZE bytes. + * @note + * If size and/or fpos are not aligned to multiple of PAGE_SIZE + * they are forcibly aligned internally. This results in + * regionSize() and regionSeek() that may differ compared to + * size() and getSeek(). + * Side-effect is that backing file may grow more than expected. + */ + region(std::shared_ptr src, seekoff_t fpos, len_t size, len_t window = 0); + + /** + * Open memory mapped region into the file + * @brief + * same as region(myfile, 0, myfile.size()) + * and memory maps the entire file. + */ + explicit region(std::shared_ptr src); + + /** + * Note: even if region was modified, + * destructor will not flush()/sync() before tearing down the mapping. + */ + virtual ~region(); + + // region is not copyable + region(const region&) =delete; + region& operator=(const region&) =delete; + + // region is moveable + friend void swap(region& a, region& b) { + using std::swap; + //std::lock(a.mtx,b.mtx); + //std::lock_guard l0(a.mtx, std::adopt_lock); + //std::lock_guard l1(b.mtx, std::adopt_lock); + swap(a.map_ptr,b.map_ptr); + swap(a.map_size,b.map_size); + swap(a.map_fseek,b.map_fseek); + swap(a.usr_ptr,b.usr_ptr); + swap(a.usr_size,b.usr_size); + swap(a.usr_fseek,b.usr_fseek); + swap(a.mfile,b.mfile); + swap(a.is_dirty,b.is_dirty); + } + region(region&& mv) : region() { + swap(*this, mv); + } + region& operator=(region&& mv) { + swap(*this, mv); + return *this; + } + + /** + * Get data pointer. + */ + const void* data() const { return usr_ptr; } + void* data() { + is_dirty = true; + return usr_ptr; + } + + std::shared_ptr getFile() { return mfile; } + + /** + * Get the seek used to init this region. + */ + seekoff_t getSeek() const { return usr_fseek; } + /** + * Get the size used to init this region. + */ + len_t size() const { return usr_size; } + + /** + * Get page aligned seek <= getSeek() + */ + seekoff_t regionSeek() const { return map_fseek; } + /** + * Get page aligned size >= size() + */ + len_t regionSize() const { return map_size; } + + /** + * Resize the mapped region. + * @note the mapped memory address may move, + * but current contents are preserved. + * @warn all pointers or references into + * the mapping are invalidated. + */ + void resize(len_t newsize); + + /** + * @brief over-extend mapping up to max(size(),window) bytes. + * Setting window bigger than size() allows more efficient operation: + * [regionSeek(), regionSeek() + window] area is memory mapped + * but region will only operate on the + * [roundDown(getSeek()), roundUp(getSeek()+size())] + * sub-portion of the memory. + */ + void window(len_t window = 0); + + /** + * Flush mapped memory region into the file. + * @brief this is an hint to operating system that + * memory region shall be synchronized to disk. + * It may not wait for this to have completed before returning. + * @note only the page aligned region + * [roundDown(data()), roundUp(data()+size())] + * is flushed. + * @note Use sync() instead if you must guarantee the data has + * reached persistent storage. + */ + void flush(); + + /** + * Synchronize modified memory region onto disk. + */ + void sync(); + + /** + * Write data into the backing file. + * @brief + * writeAt() stores range of bytes into the backing file. + * @note + * The region doesn't need to have this area to be memory-mapped: + * The data that falls into the memory-mapped + * [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed. 
+ * Any data that falls out this window is written directly + * into the backing file. + * The backing file is grown to fit the data when needed. + */ + void writeAt(seekoff_t fpos, len_t datasize, const void* data); + + /** + * Read data from the backing file. + * @brief + * readAt() reads [fpos, fpos+datasize] range of bytes from the backing file + * @note + * The region doesn't need to have this area memory-mapped + * The read out area that falls into the memory-mapped + * [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed. + * Any data that falls out this window is read directly + * from the backing file. + */ + void readAt(seekoff_t fpos, len_t datasize, void* data) const; + + /** + * Set memory region to resident/or released. + * @brief setting memory range to non-resident state + * causes system to drop the data from system memory. + * Reading non-resident memory region again causes system to + * fetch data from the disk again. + * @warn if memory region is not flushed before setting + * resident(false) any writes may be discarded to backing file. + */ + void resident(bool state); + + /** + * Discard memory region. + * @brief discarding memory range causes system + * to reclaim the memory *and* the on-disk area. + * This means the data is lost in the mapped memory region, + * and any data within will not be written onto disk by sync() + * Subsequent reads after discard() return zero filled data. + * @note + * The discarded area shall be within the mapped area. + * @param fpos + * file offset from begin of this mapping. (getSeek() + fpos) + * @param datasize + * length of the data area to discard. + */ + void discard(seekoff_t fpos, len_t datasize); + + /** + * Seek in the file to fpos position and + * remap the memory region there. + * @warn all pointers or references into + * the mapping are invalidated. + */ + void jump(seekoff_t fpos); + + /** + * Flush the current region and + * Seek in the file to fpos position and + * remap the memory region there. + * @warn all pointers or references into + * the mapping are invalidated. + */ + void flushJump(seekoff_t fpos); +}; + +static_assert(std::is_move_constructible_v); +static_assert(std::is_move_assignable_v); +static_assert(std::is_swappable_v); + +/** + * Typed region. + * struct_region allows directly accessing an on-disk structure. + * The region size is implicit from the type. + */ +template +class struct_region : protected region { + public: + using type = typename std::decay::type; + static_assert(std::is_standard_layout_v, "T must be plain-old-data type"); + + /** + * Memory map struct_region at fpos in file. + */ + struct_region(std::shared_ptr f, seekoff_t fpos, len_t window = 0) : region(f, fpos, sizeof(type), window) {} + + type* get() { return static_cast(data()); } + const type* get() const { return static_cast(data()); } + + type* operator->() { return get(); } + const type* operator->() const { return get(); } + + type& operator*() { return *get(); } + const type& operator*() const { return *get(); } + + using region::flush; + using region::getFile; + using region::getSeek; + using region::readAt; + using region::sync; + using region::writeAt; + using region::resident; + using region::window; + using region::discard; + + // note: size means the sizeof(T) + using region::size; + + /** + * Get the file seek position just after *this. + */ + seekoff_t getEndSeek() const { return getSeek() + sizeof(T); } + + /** + * Seek to fpos in file and remap the region. 
+ * @return the pointer into the new position + */ + type* jump(seekoff_t fpos) { + region::jump(fpos); + return get(); + } + + type* flushJump(seekoff_t fpos) { + region::flushJump(fpos); + return get(); + } +}; + +static_assert(std::is_move_constructible_v>); +static_assert(std::is_move_assignable_v>); +static_assert(std::is_swappable_v>); + +/** + * Typed array region. + * @brief + * array_region allows directly accessing an on-disk array of structures + * The element size is implicit from the type and length of the array + * is provided by the constructor. + * @provides resize(), operator[], begin(), end() + */ +template +class array_region : protected region { + protected: + size_t num_elements = 0; + + public: + using type = typename std::decay::type; + static_assert(std::is_standard_layout_v, "T must be plain-old-data type"); + + /** + * Memory map array_region at fpos in file and map array_size elements. + */ + array_region(std::shared_ptr f, seekoff_t fpos, size_t array_size) : region(f, fpos, sizeof(type) * array_size), num_elements(array_size) {} + + /** + * Get pointer to first mapped element. + */ + type* get() { return static_cast(data()); } + const type* get() const { return static_cast(data()); } + + using region::flush; + using region::getFile; + using region::getSeek; + using region::readAt; + using region::sync; + using region::writeAt; + + /** + * Resize the mapped array region. + */ + void resize(size_t elements) { + region::resize(sizeof(T) * elements); + num_elements = elements; + } + + /** + * Get number of mapped *elements* + */ + size_t size() const { return num_elements; } + + /** + * Access the array elements + */ + T& operator[](size_t index) { + assert(index < num_elements); + return get()[index]; + } + const T& operator[](size_t index) const { + assert(index < num_elements); + return get()[index]; + } + /** + * Iterators + */ + T* begin() { return get(); } + T* end() { return get() + num_elements; } + const T* begin() const { return get(); } + const T* end() const { return get() + num_elements; } + + /** + * Get the file seek position just after *this. + */ + seekoff_t getEndSeek() const { return getSeek() + sizeof(T) * num_elements; } + + /** + * Seek to fpos in file and remap the region. + * @return the pointer into the first element in the array + */ + type* jump(seekoff_t fpos) { + region::jump(fpos); + return get(); + } + + type* flushJump(seekoff_t fpos) { + region::flushJump(fpos); + return get(); + } +}; + +/** + * Memory-mapped file I/O class. + * @note + * file should be created with std::make_shared() + * as mapped region(s) take shared ownership of the file. + */ +class file : public std::enable_shared_from_this { + private: + std::mutex mut; + int fd; + seekoff_t fd_size; + bool fd_rw; + // the file and region classes are inherently coupled, + // and we don't want to expose the internals. + friend class region; + + public: + enum : int { + CREATE = 0x1, //!< Create new file, if doesn't exist. + RESIZE = 0x2, //!< Resize file. + FSTUNE = 0x4 //!< When creating new file attempt to set + //!< file system attributes to improve performance. + }; + + file(); + ~file(); + + /** + * Open file in read-only mode. + * @return non-zero if error occurred. + */ + int open(const char* file); + + /** + * Create/Open file in read-write mode. + * @param flags + * - CREATE|RESIZE creates or replaces existing file + * that is truncated to maxsize. + * - RESIZE opens existing file and truncates it to + * maxsize. The file must exist already. 
+ * - flags == 0 ignores the maxsize argument and opens + * existing file. + * @warn default open mode discards any previous file contents! + * @return non-zero if error occurred. + */ + int openrw(const char* file, len_t maxsize, int flags = CREATE | RESIZE); + + /** + * Check if file open R/W or RO + */ + bool is_rw() const; + + /** + * Resize the open file to newsize bytes. + * (file must be open in R/W mode) + * @return non-zero if error occurred. + */ + int truncate(seekoff_t newsize); + + /** + * Current length of the file + * The file EOF (end-of-file) is at this position. + */ + seekoff_t size() const; + + // Close the file. + void close(); +}; + +}; // namespace mapped +#endif diff --git a/cpp/src/cache.cpp b/cpp/src/cache.cpp deleted file mode 100644 index 071ff0f..0000000 --- a/cpp/src/cache.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "cache.hpp" - -#include -#include -#include -#include -#include - -#include "utils.hpp" - -/* -==================== -cache file header -==================== - -uint32_t magic = "PCUB" -uint32_t n = cache file for n cubes in a polycube -uint32_t numShapes = number of different shapes in cachefile -------- - -==================== -shapetable: -==================== -shapeEntry { - uint8_t dim0 // offset by -1 - uint8_t dim1 // offset by -1 - uint8_t dim2 // offset by -1 - uint8_t reserved - uint64_t offset in file -} -shapeEntry[numShapes] - - -==================== -XYZ data -==================== - -*/ - -void Cache::save(std::string path, Hashy &hashes, uint8_t n) { - if (hashes.size() == 0) return; - std::ofstream ofs(path, std::ios::binary); - Header header; - header.magic = MAGIC; - header.n = n; - header.numShapes = hashes.byshape.size(); - header.numPolycubes = hashes.size(); - ofs.write((const char *)&header, sizeof(header)); - - std::vector keys; - keys.reserve(header.numShapes); - for (auto &pair : hashes.byshape) keys.push_back(pair.first); - std::sort(keys.begin(), keys.end()); - uint64_t offset = sizeof(Header) + header.numShapes * sizeof(ShapeEntry); - for (auto &key : keys) { - ShapeEntry se; - se.dim0 = key.x(); - se.dim1 = key.y(); - se.dim2 = key.z(); - se.reserved = 0; - se.offset = offset; - se.size = hashes.byshape[key].size() * XYZ_SIZE * n; - offset += se.size; - ofs.write((const char *)&se, sizeof(ShapeEntry)); - } - // put XYZs - for (auto &key : keys) { - for (auto &subset : hashes.byshape[key].byhash) - for (const auto &c : subset.set) { - if constexpr (sizeof(XYZ) == XYZ_SIZE) { - ofs.write((const char *)c.data(), sizeof(XYZ) * c.size()); - } else { - for (const auto &p : c) { - ofs.write((const char *)p.data, XYZ_SIZE); - } - } - } - } - - std::printf("saved %s\n\r", path.c_str()); -} - -Hashy Cache::load(std::string path, uint32_t extractShape) { - Hashy cubes; - auto ifs = std::ifstream(path, std::ios::binary); - if (!ifs.is_open()) return cubes; - Header header; - if (!ifs.read((char *)&header, sizeof(header))) { - return cubes; - } - // check magic - if (header.magic != MAGIC) { - return cubes; - } -#ifdef CACHE_LOAD_HEADER_ONLY - std::printf("loading cache file \"%s\" for N = %u", path.c_str(), header.n); - std::printf(", %u shapes, %lu XYZs\n\r", header.numShapes, header.numPolycubes); -#endif - auto cubeSize = XYZ_SIZE * header.n; - DEBUG_PRINTF("cubeSize: %u\n\r", cubeSize); - - for (uint32_t i = 0; i < header.numShapes; ++i) { - ShapeEntry shapeEntry; - if (!ifs.read((char *)&shapeEntry, sizeof(shapeEntry))) { - std::printf("ERROR reading ShapeEntry %u\n\r", i); - exit(-1); - } - if (ALL_SHAPES != extractShape && 
i != extractShape) continue; -#ifdef CACHE_PRINT_SHAPEENTRIES - std::printf("ShapeEntry %3u: [%2d %2d %2d] offset: 0x%08lx size: 0x%08lx (%ld polycubes)\n\r", i, shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2, - shapeEntry.offset, shapeEntry.size, shapeEntry.size / cubeSize); -#endif - if (shapeEntry.size % cubeSize != 0) { - std::printf("ERROR shape block is not divisible by cubeSize!\n\r"); - exit(-1); - } -#ifndef CACHE_LOAD_HEADER_ONLY - // remember pos in file - auto pos = ifs.tellg(); - - // read XYZ contents - ifs.seekg(shapeEntry.offset); - const uint32_t CHUNK_SIZE = 512 * XYZ_SIZE; - uint8_t buf[CHUNK_SIZE] = {0}; - uint64_t buf_offset = 0; - uint32_t numCubes = shapeEntry.size / cubeSize; - XYZ shape(shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2); - uint64_t readsize = shapeEntry.size - buf_offset; - if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE; - if (!ifs.read((char *)&buf, readsize)) { - std::printf("ERROR reading XYZs for Shape %u\n\r", i); - exit(-1); - } - for (uint32_t j = 0; j < numCubes; ++j) { - Cube next(header.n); - for (uint32_t k = 0; k < header.n; ++k) { - // check if buf contains next XYZ - uint64_t curr_offset = j * cubeSize + k * XYZ_SIZE; - if (curr_offset >= buf_offset + CHUNK_SIZE) { - // std::printf("reload buffer\n\r"); - buf_offset += CHUNK_SIZE; - readsize = shapeEntry.size - buf_offset; - if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE; - if (!ifs.read((char *)&buf, readsize)) { - std::printf("ERROR reading XYZs for Shape %u\n\r", i); - exit(-1); - } - } - - next.data()[k].data[0] = buf[curr_offset - buf_offset + 0]; - next.data()[k].data[1] = buf[curr_offset - buf_offset + 1]; - next.data()[k].data[2] = buf[curr_offset - buf_offset + 2]; - } - cubes.insert(next, shape); - } - - // restore pos - ifs.seekg(pos); -#endif - } - std::printf(" loaded %lu cubes\n\r", cubes.size()); - return cubes; -} diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp index cdcc5b4..1630bb6 100644 --- a/cpp/src/cubes.cpp +++ b/cpp/src/cubes.cpp @@ -7,8 +7,9 @@ #include #include #include +#include +#include -#include "cache.hpp" #include "cube.hpp" #include "hashes.hpp" #include "newCache.hpp" @@ -19,22 +20,27 @@ const int PERF_STEP = 500; struct Workset { std::mutex mu; + + CacheReader cr; CubeIterator _begin_total; CubeIterator _begin; CubeIterator _end; Hashy &hashes; XYZ targetShape, shape, expandDim; bool notSameShape; - Workset(ShapeRange &data, Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape) - : _begin_total(data.begin()) - , _begin(data.begin()) - , _end(data.end()) - , hashes(hashes) + Workset(Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape) + : hashes(hashes) , targetShape(targetShape) , shape(shape) , expandDim(expandDim) , notSameShape(notSameShape) {} + void setRange(ShapeRange &data) { + _begin_total = data.begin(); + _begin = data.begin(); + _end = data.end(); + } + struct Subset { CubeIterator _begin, _end; bool valid; @@ -48,7 +54,7 @@ struct Workset { auto a = _begin; _begin += 500; if (_begin > _end) _begin = _end; - return {a, _begin, a < _end, 100 * (float)((uint64_t)a.m_ptr - (uint64_t)_begin_total.m_ptr) / ((uint64_t)_end.m_ptr - (uint64_t)_begin_total.m_ptr)}; + return {a, _begin, a < _end, 100 * float(std::distance(_begin_total.data(), a.data())) / std::distance(_begin_total.data(), _end.data())}; } void expand(const Cube &c) { @@ -87,14 +93,14 @@ struct Workset { std::set_difference(candidates.begin(), end, c.begin(), c.end(), std::back_inserter(tmp)); candidates = std::move(tmp); - 
DEBUG_PRINTF("candidates: %lu\n\r", candidates.size()); + DEBUG1_PRINTF("candidates: %lu\n\r", candidates.size()); Cube newCube(c.size() + 1); Cube lowestHashCube(newCube.size()); Cube rotatedCube(newCube.size()); for (const auto &p : candidates) { - DEBUG_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z()); + DEBUG2_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z()); int ax = (p.x() < 0) ? 1 : 0; int ay = (p.y() < 0) ? 1 : 0; int az = (p.z() < 0) ? 1 : 0; @@ -131,26 +137,69 @@ struct Workset { }; struct Worker { - Workset &ws; + std::shared_ptr ws; int id; - Worker(Workset &ws_, int id_) : ws(ws_), id(id_) {} + int state = 3; // 1 == completed/waiting for job, 2 == processing, 3 == job assigned. + std::mutex mtx; + std::condition_variable cond; + std::condition_variable cond2; + std::thread thr; + + Worker(int id_) : id(id_), thr(&Worker::run, this) {} + ~Worker() { + std::unique_lock lock(mtx); + state = 0; + cond.notify_one(); + lock.unlock(); + thr.join(); + } + + void launch(std::shared_ptr ws_) { + std::unique_lock lock(mtx); + while(state > 1) { + cond2.wait(lock); + } + ws = ws_; + state = 3; + cond.notify_one(); + } + + void sync() { + std::unique_lock lock(mtx); + while(state > 1) { + cond2.wait(lock); + } + ws.reset(); + } + void run() { - // std::printf("start %d\n", id); - auto subset = ws.getPart(); - while (subset.valid) { - if (id == 0) { - std::printf(" %5.2f%%\r", subset.percent); - std::flush(std::cout); - } - // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n"; - for (auto &c : subset) { - // std::printf("%p\n", (void *)&c); - // c.print(); - ws.expand(c); + std::unique_lock lock(mtx); + std::printf("thread nro. %d started.\n", id); + while(state) { + state = 1; + cond2.notify_one(); + while(state == 1) + cond.wait(lock); + if(!state) + return; + state = 2; + // std::printf("start %d\n", id); + auto subset = ws->getPart(); + while (subset.valid) { + if (id == 0) { + std::printf(" %5.2f%%\r", subset.percent); + std::flush(std::cout); + } + // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n"; + for (auto &c : subset) { + // std::printf("%p\n", (void *)&c); + // c.print(); + ws->expand(c); + } + subset = ws->getPart(); } - subset = ws.getPart(); + // std::printf("finished %d\n", id); } - // std::printf("finished %d\n", id); } }; @@ -166,7 +215,8 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c hashes.insert(Cube{{XYZ(0, 0, 0)}}, XYZ(0, 0, 0)); std::printf("%ld elements for %d\n\r", hashes.size(), n); if (write_cache) { - Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); + CacheWriter cw(1); + cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); } return FlatCache(hashes, n); } @@ -185,10 +235,20 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c } std::printf("N = %d || generating new cubes from %lu base cubes.\n\r", n, base->size()); hashes.init(n); + + // Start worker threads. 
+ std::deque workers; + for (int i = 0; i < threads; ++i) { + workers.emplace_back(i); + } + + CacheWriter cw(threads); + uint64_t totalSum = 0; auto start = std::chrono::steady_clock::now(); uint32_t totalOutputShapes = hashes.byshape.size(); uint32_t outShapeCount = 0; + auto prevShapes = Hashy::generateShapes(n - 1); for (auto &tup : hashes.byshape) { outShapeCount++; @@ -210,13 +270,14 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c if (diffy == 1) if (shape.y() == shape.x()) diffx = 1; - std::printf(" shape %d %d %d\n\r", shape.x(), shape.y(), shape.z()); + auto ws = std::make_shared(hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum); if (use_split_cache) { // load cache file only for this shape std::string cachefile = base_path + "cubes_" + std::to_string(n - 1) + "_" + std::to_string(prevShapes[sid].x()) + "-" + std::to_string(prevShapes[sid].y()) + "-" + std::to_string(prevShapes[sid].z()) + ".bin"; - cr.loadFile(cachefile); + ws->cr.loadFile(cachefile); + base = &ws->cr; // cr.printHeader(); } auto s = base->getCubesByShape(sid); @@ -224,24 +285,30 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c std::printf("ERROR caches shape does not match expected shape!\n"); exit(-1); } - // std::printf("starting %d threads\n\r", threads); - std::vector ts; - Workset ws(s, hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum); - std::vector workers; - ts.reserve(threads); - workers.reserve(threads); - for (int i = 0; i < threads; ++i) { - workers.emplace_back(ws, i); - ts.emplace_back(&Worker::run, std::ref(workers[i])); + + ws->setRange(s); + + // Wait for jobs to complete. + for (auto& thr : workers) { + thr.sync(); } - for (int i = 0; i < threads; ++i) { - ts[i].join(); + std::printf(" shape %d %d %d\n\r", shape.x(), shape.y(), shape.z()); + // launch the new jobs. + // Because the workset is held by shared_ptr + // main thread can do above preparation work in parallel + // while the jobs are running. + for (auto& thr : workers) { + thr.launch(ws); } } + // Wait for jobs to complete. + for (auto& thr : workers) { + thr.sync(); + } std::printf(" num: %lu\n\r", hashes.byshape[targetShape].size()); totalSum += hashes.byshape[targetShape].size(); if (write_cache && split_cache) { - Cache::save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" + + cw.save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" + std::to_string(targetShape.z()) + ".bin", hashes, n); } @@ -252,9 +319,16 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c } } } + + // Stop the workers. 
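+    // (~Worker sets state = 0 and joins its thread.)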
+    workers.clear();
+
     if (write_cache && !split_cache) {
-        Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
+        cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
     }
+
+    cw.flush();
+
     auto end = std::chrono::steady_clock::now();
     auto dt_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     std::printf("took %.2f s\033[0K\n\r", dt_ms / 1000.f);
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index ef925dc..93f15b3 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -1,13 +1,8 @@
-#include "../include/newCache.hpp"
-
-#include
-#include
-#include
+#include "newCache.hpp"

 #include

-CacheReader::CacheReader()
-    : filePointer(nullptr), path_(""), fileDescriptor_(-1), fileSize_(0), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {}
+CacheReader::CacheReader() : path_(""), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {}

 void CacheReader::printHeader() {
     if (fileLoaded_) {
@@ -33,28 +28,37 @@ int CacheReader::printShapes(void) {
 int CacheReader::loadFile(const std::string path) {
     unload();
     path_ = path;

-    fileDescriptor_ = open(path.c_str(), O_RDONLY);
-    if (fileDescriptor_ == -1) {
+    // open read-only backing file:
+    file_ = std::make_shared<mapped::file>();
+    if (file_->open(path.c_str())) {
         std::printf("error opening file\n");
         return 1;
     }

-    // get filesize
-    fileSize_ = lseek(fileDescriptor_, 0, SEEK_END);
-    lseek(fileDescriptor_, 0, SEEK_SET);
+    // map the header struct
+    header_ = std::make_unique<mapped::struct_region<cacheformat::Header>>(file_, 0);
+    header = header_->get();

-    // memory map file
-    filePointer = (uint8_t*)mmap(NULL, fileSize_, PROT_READ, MAP_SHARED, fileDescriptor_, 0);
-    if (filePointer == MAP_FAILED) {
-        // error handling
-        std::printf("errorm mapping file memory");
-        close(fileDescriptor_);
-        return 2;
+    if (header->magic != cacheformat::MAGIC) {
+        std::printf("error opening file: file not recognized\n");
+        return 1;
     }

-    header = (Header*)(filePointer);
-    shapes = (ShapeEntry*)(filePointer + sizeof(Header));
+    // map the ShapeEntry array:
+    shapes_ = std::make_unique<mapped::array_region<cacheformat::ShapeEntry>>(file_, header_->getEndSeek(), (*header_)->numShapes);
+    shapes = shapes_->get();
+
+    size_t datasize = 0;
+    for (unsigned int i = 0; i < header->numShapes; ++i) {
+        datasize += shapes[i].size;
+    }
+
+    // map rest of the file as XYZ data:
+    if (file_->size() != shapes_->getEndSeek() + datasize) {
+        std::printf("warn: file size does not match expected value\n");
+    }
+    xyz_ = std::make_unique<mapped::array_region<XYZ>>(file_, shapes_->getEndSeek(), datasize);

     fileLoaded_ = true;
@@ -65,27 +69,226 @@
 ShapeRange CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
         return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
     }
-    XYZ* start = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset);
-    XYZ* end = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset + shapes[i].size);
-    return ShapeRange(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+    if (shapes[i].size <= 0) {
+        return ShapeRange{nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
+    }
+    // get section start
+    // note: shapes[i].offset may hold a bogus value if any earlier
+    // shape table entry was empty, so we ignore the offset here.
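+    // Instead, recompute the real offset by accumulating the byte sizes of all
+    // preceding shape entries (empty entries contribute nothing).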
@@ -65,27 +69,226 @@ ShapeRange CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
         return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
     }
-    XYZ* start = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset);
-    XYZ* end = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset + shapes[i].size);
-    return ShapeRange(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+    if (shapes[i].size <= 0) {
+        return ShapeRange{nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
+    }
+    // get section start
+    // note: shapes[i].offset may hold a bogus offset
+    // if any earlier shape table entry before i was empty,
+    // so we ignore the offset here.
+    size_t offset = 0;
+    for (unsigned int k = 0; k < i; ++k) {
+        offset += shapes[k].size;
+    }
+    auto index = offset / cacheformat::XYZ_SIZE;
+    auto num_xyz = shapes[i].size / cacheformat::XYZ_SIZE;
+    // pointers to Cube data:
+    auto start = xyz_->get() + index;
+    auto end = xyz_->get() + index + num_xyz;
+    return ShapeRange{start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
 }
 
 void CacheReader::unload() {
-    // unmap file from memory
+    // unload file from memory
     if (fileLoaded_) {
-        if (munmap(filePointer, fileSize_) == -1) {
-            // error handling
-            std::printf("error unmapping file\n");
-        }
-
-        // close file descriptor
-        close(fileDescriptor_);
+        xyz_.reset();
+        shapes_.reset();
+        header_.reset();
+        file_.reset();
         fileLoaded_ = false;
     }
-    fileDescriptor_ = -1;
-    filePointer = nullptr;
     header = &dummyHeader;
     shapes = nullptr;
 }
 
 CacheReader::~CacheReader() { unload(); }
+
+CacheWriter::CacheWriter(int num_threads) {
+    for (int i = 0; i < num_threads; ++i) {
+        m_flushers.emplace_back(&CacheWriter::run, this);
+    }
+}
+
+CacheWriter::~CacheWriter() {
+    flush();
+    // stop the threads.
+    std::unique_lock lock(m_mtx);
+    m_active = false;
+    m_run.notify_all();
+    lock.unlock();
+    for (auto &thr : m_flushers) thr.join();
+}
+
+void CacheWriter::run() {
+    std::unique_lock lock(m_mtx);
+    while (m_active) {
+        // do copy jobs:
+        if (!m_copy.empty()) {
+            auto task = std::move(m_copy.front());
+            m_copy.pop_front();
+            lock.unlock();
+
+            task();
+
+            lock.lock();
+            --m_num_copys;
+            continue;
+        }
+        // file flushes:
+        if (!m_flushes.empty()) {
+            auto task = std::move(m_flushes.front());
+            m_flushes.pop_front();
+            lock.unlock();
+
+            task();
+
+            lock.lock();
+            --m_num_flushes;
+            continue;
+        }
+        // notify that we are done here.
+        m_wait.notify_one();
+        // wait for jobs.
+        m_run.wait(lock);
+    }
+    m_wait.notify_one();
+}
+
+void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
+    if (hashes.size() == 0) return;
+
+    using namespace mapped;
+    using namespace cacheformat;
+
+    auto file_ = std::make_shared<mapped::file>();
+    if (file_->openrw(path.c_str(), 0)) {
+        std::printf("error opening file\n");
+        return;
+    }
+
+    auto header = std::make_shared<mapped::struct_<Header>>(file_, 0);
+    (*header)->magic = cacheformat::MAGIC;
+    (*header)->n = n;
+    (*header)->numShapes = hashes.byshape.size();
+    (*header)->numPolycubes = hashes.size();
+
+    std::vector<XYZ> keys;
+    keys.reserve((*header)->numShapes);
+    for (auto &pair : hashes.byshape) keys.push_back(pair.first);
+    std::sort(keys.begin(), keys.end());
+
+    auto shapeEntry = std::make_shared<mapped::array<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
+
+    uint64_t offset = shapeEntry->getEndSeek();
+    size_t num_cubes = 0;
+    int i = 0;
+    for (auto &key : keys) {
+        auto &se = (*shapeEntry)[i++];
+        se.dim0 = key.x();
+        se.dim1 = key.y();
+        se.dim2 = key.z();
+        se.reserved = 0;
+        se.offset = offset;
+        auto count = hashes.byshape[key].size();
+        num_cubes += count;
+        se.size = count * XYZ_SIZE * n;
+        offset += se.size;
+    }
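The table-filling loop above fixes the on-disk layout as [Header][ShapeEntry x numShapes][packed XYZ data], with ShapeEntry::size a byte count (count * XYZ_SIZE * n); the file-size warning in loadFile() checks exactly this invariant. A sketch of the same computation (assuming the mapped:: helpers add no alignment padding between sections):

#include <cstddef>
#include <cstdint>

#include "newCache.hpp"

// Sketch: the total size a cache file should have, per the layout above.
static size_t expectedFileSize(const cacheformat::Header &hdr, const cacheformat::ShapeEntry *entries) {
    size_t total = sizeof(cacheformat::Header) + size_t(hdr.numShapes) * sizeof(cacheformat::ShapeEntry);
    for (uint32_t i = 0; i < hdr.numShapes; ++i) {
        total += entries[i].size;  // bytes of packed XYZ data for shape i
    }
    return total;
}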
+    // put XYZs
+    // Serialize large CubeSet(s) in parallel.
+    auto xyz = std::make_shared<mapped::array<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
+    auto put = xyz->get();
+
+    auto copyrange = [n](CubeSet::iterator itr, CubeSet::iterator end, XYZ *dest) -> void {
+        while (itr != end) {
+            static_assert(sizeof(XYZ) == XYZ_SIZE);
+            assert(itr->size() == n);
+            itr->copyout(n, dest);
+            dest += n;
+            ++itr;
+        }
+    };
+
+    auto time_start = std::chrono::steady_clock::now();
+    for (auto &key : keys) {
+        for (auto &subset : hashes.byshape[key].byhash) {
+            auto itr = subset.set.begin();
+
+            ptrdiff_t dist = subset.set.size();
+            // distribute if the range is large enough.
+            auto skip = std::max(4096L, std::max(1L, dist / (signed)m_flushers.size()));
+            while (dist > skip) {
+                auto start = itr;
+                auto dest = put;
+
+                auto inc = std::min(dist, skip);
+                std::advance(itr, inc);
+                put += n * inc;
+                dist = std::distance(itr, subset.set.end());
+
+                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
+                std::printf("writing data %5.2f%% ... \r", done);
+                std::flush(std::cout);
+
+                std::lock_guard lock(m_mtx);
+                m_copy.emplace_back(std::bind(copyrange, start, itr, dest));
+                ++m_num_copys;
+                m_run.notify_all();
+            }
+            // copy the remainder, if any.
+            if (dist) {
+                std::lock_guard lock(m_mtx);
+                m_copy.emplace_back(std::bind(copyrange, itr, subset.set.end(), put));
+                ++m_num_copys;
+                m_run.notify_all();
+                put += n * dist;
+
+                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
+                std::printf("writing data %5.2f%% ... \r", done);
+                std::flush(std::cout);
+            }
+        }
+    }
+
+    // sanity check:
+    assert(put == (*xyz).get() + num_cubes * n);
+
+    // sync up.
+    std::unique_lock lock(m_mtx);
+    while (m_num_copys) {
+        m_wait.wait(lock);
+    }
+
+    // move the resources into the flush job.
+    m_flushes.emplace_back(std::bind(
+        [](auto &&file, auto &&header, auto &&shapeEntry, auto &&xyz) -> void {
+            // flush.
+            header->flush();
+            shapeEntry->flush();
+            xyz->flush();
+            // Truncate the file to its proper size.
+            file->truncate(xyz->getEndSeek());
+            file->close();
+            file.reset();
+            xyz.reset();
+            shapeEntry.reset();
+            header.reset();
+        },
+        std::move(file_), std::move(header), std::move(shapeEntry), std::move(xyz)));
+    ++m_num_flushes;
+    m_run.notify_all();
+
+    auto time_end = std::chrono::steady_clock::now();
+    auto dt_ms = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+
+    std::printf("saved %s, took %.2f s\n\r", path.c_str(), dt_ms / 1000.f);
+}
+
+void CacheWriter::flush() {
+    std::unique_lock lock(m_mtx);
+    while (m_num_flushes) {
+        m_wait.wait(lock);
+    }
+}
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b30d160..42e0014 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -19,4 +19,5 @@ add_executable(${PROJECT_NAME} $<TARGET_OBJECTS:CubeObjs> ${TESTS})
 target_link_libraries(GTest::GTest INTERFACE gtest_main)
 target_link_libraries(${PROJECT_NAME} pthread GTest::GTest)
+target_link_libraries(${PROJECT_NAME} mapped_file)
 ConfigureTarget(${PROJECT_NAME})
diff --git a/cpp/tests/src/test_cache.cpp b/cpp/tests/src/test_cache.cpp
deleted file mode 100644
index ae10cbf..0000000
--- a/cpp/tests/src/test_cache.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "cache.hpp"
-
-TEST(CacheTests, TestCacheLoadDoesNotThrow) { EXPECT_NO_THROW(Cache::load("./test_data.bin")); }
-
-TEST(CacheTests, TestCacheSaveDoesNotThrow) {
-    auto data = Cache::load("./test_data.bin");
-    EXPECT_NO_THROW(Cache::save("./temp.bin", data, 255));
-}
\ No newline at end of file
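The old round-trip tests are removed together with cache.hpp and not replaced in this diff. A smoke test against the new reader could look like the sketch below (only the error path is exercised; a full round trip through CacheWriter::save would first need a populated Hashy):

#include <gtest/gtest.h>

#include "newCache.hpp"

TEST(CacheReaderTests, LoadMissingFileFails) {
    CacheReader cr;
    // loadFile() returns non-zero when the backing file cannot be opened.
    EXPECT_NE(0, cr.loadFile("./does_not_exist.bin"));
}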