diff --git a/tests/Makefile b/tests/Makefile index 3a2380dc..72894eb2 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,7 +15,9 @@ BIN := src/createmany \ src/o_tmpfile_umask \ src/o_tmpfile_linkat \ src/mmap_stress \ - src/mmap_validate + src/mmap_validate \ + src/parallel_restore \ + src/restore_copy DEPS := $(wildcard src/*.d) @@ -27,8 +29,13 @@ endif src/mmap_stress: LIBS+=-lpthread +src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/parallel_restore: ../utils/src/scoutfs_parallel_restore.a +src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/restore_copy : ../utils/src/scoutfs_parallel_restore.a + $(BIN): %: %.c Makefile - gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) + gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) $($(@)_cflags) .PHONY: clean clean: diff --git a/tests/golden/parallel_restore b/tests/golden/parallel_restore new file mode 100644 index 00000000..28889357 --- /dev/null +++ b/tests/golden/parallel_restore @@ -0,0 +1,26 @@ +== simple mkfs/restore/mount +committed_seq 1120 +total_meta_blocks 163840 +total_data_blocks 15728640 + 1440 1440 57120 + 80 80 400 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +Type Used +MetaData 34721 +Data 64 +== under ENOSPC +Type Used +MetaData 117073 +Data 64 +== ENOSPC +== attempt to restore data device +== attempt format_v1 restore +== test if previously mounted +== cleanup diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy new file mode 100644 index 00000000..36e14321 --- /dev/null +++ b/tests/golden/restore_copy @@ -0,0 +1,83 @@ +== restore_copy content verification +d /mnt/test/data/d +f /mnt/test/data/f +l /mnt/test/data/l -> broken +f /mnt/test/data/h +l /mnt/test/data/F -> f +b /mnt/test/data/b +c /mnt/test/data/c +c /mnt/test/data/u +p /mnt/test/data/p +f /mnt/test/data/f4096 +f 
/mnt/test/data/falloc +f /mnt/test/data/truncate +s /mnt/test/data/s +f /mnt/test/data/mode_t +f /mnt/test/data/uidgid +f /mnt/test/data/retention +f /mnt/test/data/proj +f /mnt/test/data/proj_d/f +d /mnt/test/data/proj_d +d /mnt/test/data +Quota rule: 7 13,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 11,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 12,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 10,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 15,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 14,L,- 0,L,- 0,L,- I 33 - +Wrote 1 directories, 0 files, 458752 bytes total +== verify metadata bits on restored fs +total 16516 +-rw-r--r--. 1 33333 33333 0 uidgid +crw-r--r--. 1 0 0 2, 2 u +-rw-r--r--. 1 0 0 16777216 truncate +srwxr-xr-x. 1 0 0 0 s +-rw-r--r--. 1 0 0 0 retention +drwxr-xr-x. 2 0 0 1 proj_d +-rw-r--r--. 1 0 0 0 proj +prw-r--r--. 1 0 0 0 p +-rwsrwsrwx. 1 0 0 0 mode_t +lrwxrwxrwx. 1 0 0 7 l -> broken +-rw-r--r--. 2 0 0 0 h +-rw-r--r--. 1 0 0 131072 falloc +-rw-r--r--. 1 0 0 4096 f4096 +-rw-r--r--. 2 0 0 0 f +drwxr-xr-x. 2 0 0 0 d +crw-r--r--. 1 0 0 0, 0 c +brw-r--r--. 1 0 0 1, 1 b +lrwxrwxrwx. 
1 0 0 2 F -> f +1 +12345 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 32 flags: O.L +extents: 1 +0: offset: 0 length: 4096 flags: O.L +extents: 1 + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - +12345 +54321 +crtime 55555.666666666 +crtime 55556.666666666 +== verify quota rules on restored fs + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - +Type Used +MetaData 34698 +Data 64 +== umount restored fs and check +== cleanup diff --git a/tests/sequence b/tests/sequence index 18eff7cf..35a194d8 100644 --- a/tests/sequence +++ b/tests/sequence @@ -57,4 +57,6 @@ archive-light-cycle.sh block-stale-reads.sh inode-deletion.sh renameat2-noreplace.sh +parallel_restore.sh +restore_copy.sh xfstests.sh diff --git a/tests/src/parallel_restore.c b/tests/src/parallel_restore.c new file mode 100644 index 00000000..b6c82657 --- /dev/null +++ b/tests/src/parallel_restore.c @@ -0,0 +1,805 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../utils/src/parallel_restore.h" + +/* + * XXX: + * - add a nice description of what's going on + * - mention allocator contention + * - test child process dying handling + * - root dir entry name length is wrong + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define 
error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define dprintf(fmt, args...) \ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) + +struct opts { + unsigned long long buf_size; + + unsigned long long write_batch; + unsigned long long low_dirs; + unsigned long long high_dirs; + unsigned long long low_files; + unsigned long long high_files; + char *meta_path; + unsigned long long total_files; + bool read_only; + unsigned long long seed; + unsigned long long nr_writers; +}; + +static void usage(void) +{ + printf("usage:\n" + " -b NR | threads write blocks in batches files (100000)\n" + " -d LOW:HIGH | range of subdirs per directory (5:10)\n" + " -f LOW:HIGH | range of files per directory (10:20)\n" + " -m PATH | path to metadata device\n" + " -n NR | total number of files to create (100)\n" + " -r | read-only, all work except writing, measure cpu cost\n" + " -s NR | randomization seed (random)\n" + " -w NR | number of writing processes to fork (online cpus)\n" + ); +} + +static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + if (!opts->read_only) + ret = pwrite(dev_fd, buf, count, off); + else + ret = count; + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct gen_inode { + struct scoutfs_parallel_restore_inode inode; + struct scoutfs_parallel_restore_xattr **xattrs; + u64 nr_xattrs; + struct scoutfs_parallel_restore_entry **entries; + u64 nr_files; + u64 nr_entries; +}; + +static void free_gino(struct gen_inode *gino) +{ + u64 i; + + if 
(gino) { + if (gino->entries) { + for (i = 0; i < gino->nr_entries; i++) + free(gino->entries[i]); + free(gino->entries); + } + if (gino->xattrs) { + for (i = 0; i < gino->nr_xattrs; i++) + free(gino->xattrs[i]); + free(gino->xattrs); + } + free(gino); + } +} + +static struct scoutfs_parallel_restore_xattr * +generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value, + int value_len) +{ + struct scoutfs_parallel_restore_xattr *xattr; + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos, + .name_len = name_len, + .value_len = value_len, + }; + + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + name_len); + + memcpy(xattr->name, name, name_len); + if (value_len) + memcpy(xattr->value, value, value_len); + + return xattr; +} + +static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode) +{ + struct gen_inode *gino; + struct timespec now; + + clock_gettime(CLOCK_REALTIME, &now); + + gino = calloc(1, sizeof(struct gen_inode)); + error_exit(!gino, "failure allocating generated inode"); + + gino->inode = (struct scoutfs_parallel_restore_inode) { + .ino = ino, + .meta_seq = ino, + .data_seq = 0, + .mode = mode, + .atime = now, + .ctime = now, + .mtime = now, + .crtime = now, + }; + + /* + * hacky creation of a bunch of xattrs for now. 
+ */ + if ((mode & S_IFMT) == S_IFREG) { + #define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, } + struct name_val { + char *name; + int len; + char *value; + int value_len; + } nv[] = { + NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"), + NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""), + NV("scoutfs.hide.sam_reqcopies", ""), + NV("scoutfs.hide.sam_copy_2", ""), + NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"), + NV("scoutfs.hide.sam_copy_1", ""), + NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""), + NV("scoutfs.hide.srch.sam_release_cand", ""), + NV("scoutfs.hide.sam_restime", ""), + NV("scoutfs.hide.sam_uuid", ""), + NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"), + NV("scoutfs.hide.srch.sam_vol_F01030L6", ""), + NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""), + NV("scoutfs.hide.srch.sam_vol_E01001L6", ""), + NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"), + }; + unsigned int nr = array_size(nv); + int i; + + gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *)); + + for (i = 0; i < nr; i++) + gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len, + nv[i].value, nv[i].value_len); + + gino->nr_xattrs = nr; + gino->inode.nr_xattrs = nr; + + gino->inode.size = 4096; + gino->inode.offline = true; + } + + return gino; +} + +static struct scoutfs_parallel_restore_entry * +generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode) +{ + struct scoutfs_parallel_restore_entry *entry; + char buf[PATH_MAX]; + int bytes; + + bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes); + error_exit(!entry, "error allocating generated entry"); + + *entry = 
(struct scoutfs_parallel_restore_entry) { + .dir_ino = dir_ino, + .pos = pos, + .ino = ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = bytes, + }; + + memcpy(entry->name, buf, bytes); + + return entry; +} + +static u64 random64(void) +{ + return ((u64)lrand48() << 32) | lrand48(); +} + +static u64 random_range(u64 low, u64 high) +{ + return low + (random64() % (high - low + 1)); +} + +static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len, + bool no_dirs) +{ + struct scoutfs_parallel_restore_entry *entry; + struct gen_inode *gino; + u64 nr_entries; + u64 nr_files; + u64 nr_dirs; + u64 ino; + char *prefix; + mode_t mode; + u64 i; + + nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs); + nr_files = random_range(opts->low_files, opts->high_files); + + if (1 + nr_dirs + nr_files > ino_len) { + nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2; + nr_files = (ino_len - 1) - nr_dirs; + } + + nr_entries = nr_dirs + nr_files; + + gino = generate_inode(opts, dir_ino, DIR_MODE); + error_exit(!gino, "error allocating generated inode"); + + gino->inode.nr_subdirs = nr_dirs; + gino->nr_files = nr_files; + + if (nr_entries) { + gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *)); + error_exit(!gino->entries, "error allocating generated inode entries"); + + gino->nr_entries = nr_entries; + } + + mode = DIR_MODE; + prefix = "dir"; + for (i = 0; i < nr_entries; i++) { + if (i == nr_dirs) { + mode = REG_MODE; + prefix = "file"; + } + + ino = ino_start + i; + entry = generate_entry(opts, prefix, ino, gino->inode.ino, + SCOUTFS_DIRENT_FIRST_POS + i, ino, mode); + + gino->entries[i] = entry; + gino->inode.total_entry_name_bytes += entry->name_len; + } + + return gino; +} + +/* + * Restore a generated inode. If it's a directory then we also restore + * all its entries. The caller is going to descend into subdir entries and generate + * those dir inodes. 
We have to generate and restore all non-dir inodes referenced + * by this inode's entries. + */ +static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + struct gen_inode *gino) +{ + struct gen_inode *nondir; + int ret; + u64 i; + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + + for (i = 0; i < gino->nr_entries; i++) { + ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]); + error_exit(ret, "thread add entry %d", ret); + + /* caller only needs subdir entries, generate and free others */ + if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) { + + nondir = generate_inode(opts, gino->entries[i]->ino, + gino->entries[i]->mode); + restore_inode(opts, wri, nondir); + free_gino(nondir); + + free(gino->entries[i]); + if (i != gino->nr_entries - 1) + gino->entries[i] = gino->entries[gino->nr_entries - 1]; + gino->nr_entries--; + gino->nr_files--; + i--; + } + } + + for (i = 0; i < gino->nr_xattrs; i++) { + ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]); + error_exit(ret, "thread add xattr %d", ret); + } +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; + u64 writer_nr; + u64 dir_height; + u64 ino_start; + u64 ino_len; +}; + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 bytes_written; +}; + +static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd, + struct write_result *res, bool get_slice, int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(opts, wri, buf, buf_size, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = 
scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Calculate the number of bytes in toplevel "dir-%llu" entry names for the given + * number of writers. + */ +static u64 topdir_entry_bytes(u64 nr_writers) +{ + u64 bytes = (3 + 1) * nr_writers; + u64 limit; + u64 done; + u64 wid; + u64 nr; + + for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) { + nr = min(limit - done, nr_writers - done); + bytes += nr * wid; + } + + return bytes; +} + +struct dir_pos { + struct gen_inode *gino; + u64 pos; +}; + +static void writer_proc(struct opts *opts, struct writer_args *args) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_entry *entry; + struct dir_pos *dirs = NULL; + struct write_result res; + struct gen_inode *gino; + void *buf = NULL; + u64 level; + u64 ino; + int ret; + + memset(&res, 0, sizeof(res)); + + dirs = calloc(args->dir_height, sizeof(struct dir_pos)); + error_exit(errno, "error allocating parent dirs "ERRF, ERRA); + + errno = posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block buf "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_add_slice(wri, &args->slice); + error_exit(ret, "add slice %d", ret); + + /* writer 0 creates the root dir */ + if (args->writer_nr == 0) { + gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE); + gino->inode.nr_subdirs = opts->nr_writers; + gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers); + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + free_gino(gino); + } + + /* create root 
entry for our top level dir */ + ino = args->ino_start++; + args->ino_len--; + + entry = generate_entry(opts, "top", args->writer_nr, + SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr, + ino, DIR_MODE); + + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "thread top entry %d", ret); + free(entry); + + level = args->dir_height - 1; + + while (args->ino_len > 0 && level < args->dir_height) { + gino = dirs[level].gino; + + /* generate and restore if we follow entries */ + if (!gino) { + gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0); + args->ino_start += gino->nr_entries; + args->ino_len -= gino->nr_entries; + le64_add_cpu(&res.files_created, gino->nr_files); + + restore_inode(opts, wri, gino); + dirs[level].gino = gino; + } + + if (dirs[level].pos == gino->nr_entries) { + /* ascend if we're done with this dir */ + dirs[level].gino = NULL; + dirs[level].pos = 0; + free_gino(gino); + level++; + + } else { + /* otherwise descend into subdir entry */ + ino = gino->entries[dirs[level].pos]->ino; + dirs[level].pos++; + level--; + } + + /* do a partial write at batch intervals when there's still more to do */ + if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0) + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, false, args->pair_fd); + } + + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&wri); + + free(dirs); + free(buf); +} + +/* + * If any of our children exited with an error code, we hard exit. + * The child processes should themselves report out any errors + * encountered. Any remaining children will receive SIGHUP and + * terminate. 
+ */ +static void sigchld_handler(int signo, siginfo_t *info, void *context) +{ + if (info->si_status) + exit(EXIT_FAILURE); +} + +static void fork_writer(struct opts *opts, struct writer_args *args) +{ + pid_t parent = getpid(); + pid_t pid; + int ret; + + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid != 0) + return; + + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + printf("pid %u getpid() %u parent %u getppid() %u\n", + pid, getpid(), parent, getppid()); + error_exit(getppid() != parent, "child parent already changed"); + + writer_proc(opts, args); + exit(0); +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct write_result res; + struct writer_args *args; + struct timespec begin; + struct timespec end; + LIST_HEAD(writers); + u64 next_ino; + u64 ino_per; + u64 avg_dirs; + u64 avg_files; + u64 dir_height; + u64 tot_files; + u64 tot_bytes; + int pair[2] = {-1, -1}; + float secs; + void *buf = NULL; + int dev_fd = -1; + int ret; + int i; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? 
O_RDONLY : (O_RDWR|O_EXCL))); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1; + ino_per = opts->total_files / opts->nr_writers; + avg_dirs = (opts->low_dirs + opts->high_dirs) / 2; + avg_files = (opts->low_files + opts->high_files) / 2; + + dir_height = 1; + tot_files = avg_files * opts->nr_writers; + + while (tot_files < opts->total_files) { + dir_height++; + tot_files *= avg_dirs; + } + + dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files); + + clock_gettime(CLOCK_MONOTONIC_RAW, &begin); + + /* start each writing process */ + for (i = 0; i < opts->nr_writers; i++) { + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->pair_fd = pair[1]; + args->slice = slices[1 + i]; + args->writer_nr = i; + args->dir_height = dir_height; + args->ino_start = next_ino; + args->ino_len = ino_per; + + list_add_tail(&args->head, &writers); + next_ino += ino_per; + + fork_writer(opts, args); + } 
+ + /* read results and watch for writers to finish */ + tot_files = 0; + tot_bytes = 0; + i = 0; + while (i < opts->nr_writers) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + i++; + } + + tot_files += le64_to_cpu(res.files_created); + tot_bytes += le64_to_cpu(res.bytes_written); + } + + tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd); + + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + if (!opts->read_only) { + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + } + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + + scoutfs_parallel_restore_destroy_writer(&wri); + + secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) - + ((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC)); + printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n", + tot_files, tot_bytes, secs, + (float)tot_bytes / tot_files, (float)tot_files / secs); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] >= 0) + close(pair[0]); + if (pair[1] >= 0) + close(pair[1]); + free(super); + free(slices); + free(buf); + + return 0; +} + +static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret) +{ + char *sep; + int ret = 0; + + sep = index(str, ':'); + if (sep) { + *sep = '\0'; + ret = parse_u64(sep + 1, high_ret); + } + + if (ret == 0) + ret = parse_u64(str, low_ret); + + if (sep) + *sep = ':'; + + return ret; +} + +int main(int argc, char **argv) +{ + struct opts opts = { + .buf_size = (32 * 1024 * 1024), + + 
.write_batch = 1000000, + .low_dirs = 5, + .high_dirs = 10, + .low_files = 10, + .high_files = 20, + .total_files = 100, + }; + struct sigaction act = { 0 }; + int ret; + int c; + + opts.seed = random64(); + opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN); + + while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) { + switch(c) { + case 'b': + ret = parse_u64(optarg, &opts.write_batch); + error_exit(ret, "error parsing -b '%s'\n", optarg); + error_exit(opts.write_batch == 0, "-b can't be 0"); + break; + case 'd': + ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs); + error_exit(ret, "error parsing -d '%s'\n", optarg); + break; + case 'f': + ret = parse_low_high(optarg, &opts.low_files, &opts.high_files); + error_exit(ret, "error parsing -f '%s'\n", optarg); + break; + case 'm': + opts.meta_path = strdup(optarg); + break; + case 'n': + ret = parse_u64(optarg, &opts.total_files); + error_exit(ret, "error parsing -n '%s'\n", optarg); + break; + case 'r': + opts.read_only = true; + break; + case 's': + ret = parse_u64(optarg, &opts.seed); + error_exit(ret, "error parsing -s '%s'\n", optarg); + break; + case 'w': + ret = parse_u64(optarg, &opts.nr_writers); + error_exit(ret, "error parsing -w '%s'\n", optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu", + opts.low_dirs, opts.high_dirs); + error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu", + opts.low_files, opts.high_files); + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + + printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n", + opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files, + opts.total_files, opts.seed, opts.nr_writers); + + act.sa_flags = SA_SIGINFO | SA_RESTART; + act.sa_sigaction = &sigchld_handler; + if (sigaction(SIGCHLD, &act, NULL) == -1) + error_exit(ret, "error setting up signal 
handler\n"); + + ret = do_restore(&opts); + + free(opts.meta_path); + + return ret == 0 ? 0 : 1; +} diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c new file mode 100644 index 00000000..94fc702c --- /dev/null +++ b/tests/src/restore_copy.c @@ -0,0 +1,963 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../kmod/src/ioctl.h" +#include "../../utils/src/parallel_restore.h" + +/* + * XXX: + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) +#define LNK_MODE (S_IFLNK | 0777) + +/* + * At about 1k files we seem to be writing about 1MB of data, so + * set buffer sizes adequately above that. + */ +#define BATCH_FILES 1024 +#define BUF_SIZ 2 * 1024 * 1024 + +/* + * We can't make duplicate inodes for hardlinked files, so we + * will need to track these as we generate them. Not too costly + * to do, since it's just an integer, and sorting shouldn't matter + * until we get into the millions of entries, hopefully. 
+ */ +static struct list_head hardlinks; +struct hardlink_head { + struct list_head head; + u64 ino; +}; + +struct opts { + char *meta_path; + char *source_dir; +}; + +static bool warn_scoutfs = false; + +static void usage(void) +{ + printf("usage:\n" + " -m PATH | path to metadata device\n" + " -s PATH | path to source directory\n" + ); +} + +static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + ret = pwrite(dev_fd, buf, count, off); + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 dirs_created; + __le64 bytes_written; + bool complete; +}; + +static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd, + struct write_result *res, bool get_slice, int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(wri, buf, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Adding xattrs is supported for files and directories only. + * + * If the filesystem on which the path resides isn't scoutfs, we omit the + * scoutfs specific ioctl to fetch hidden xattrs. 
+ * + * Untested if the hidden xattr ioctl works on directories or symlinks. + */ +static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs) +{ + struct scoutfs_ioctl_listxattr_hidden lxh; + struct scoutfs_parallel_restore_xattr *xattr; + char *buf = NULL; + char *name = NULL; + int fd = -1; + int bytes; + int len; + int value_len; + int ret; + int pos = 0; + + if (!is_scoutfs) + goto normal_xattrs; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + memset(&lxh, 0, sizeof(lxh)); + lxh.id_pos = 0; + lxh.hash_pos = 0; + lxh.buf_bytes = 256 * 1024; + + buf = malloc(lxh.buf_bytes); + error_exit(!buf, "alloc xattr_hidden buf"); + lxh.buf_ptr = (unsigned long)buf; + + /* hidden */ + for (;;) { + ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh); + if (ret == 0) /* done */ + break; + error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA); + bytes = ret; + error_exit(bytes > lxh.buf_bytes, "listxattr_hidden overflow"); + error_exit(buf[bytes - 1] != '\0', "listxattr_hidden didn't term"); + + name = buf; + + do { + len = strlen(name); + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + /* get value len */ + value_len = fgetxattr(fd, name, NULL, 0); + error_exit(value_len < 0, "malloc value hidden"ERRF, ERRA); + + /* allocate everything at once */ + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + /* get value into xattr directly */ + ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len); + error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = 
scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add hidden xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + } + + free(buf); + close(fd); + +normal_xattrs: + value_len = listxattr(path, NULL, 0); + error_exit(value_len < 0, "hidden listxattr "ERRF, ERRA); + if (value_len == 0) + return; + + buf = calloc(1, value_len); + error_exit(!buf, "malloc value"ERRF, ERRA); + + ret = listxattr(path, buf, value_len); + error_exit(ret < 0, "hidden listxattr %d", ret); + + name = buf; + bytes = ret; + do { + len = strlen(name); + + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + value_len = getxattr(path, name, NULL, 0); + error_exit(value_len < 0, "value "ERRF, ERRA); + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + ret = getxattr(path, name, (void *)(xattr->name + len), value_len); + error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + + free(buf); +} + +/* + * We can't store the same inode multiple times, so we need to make + * sure to account for hardlinks. 
Maintain a LL that stores the first + * hardlink inode we encounter, and every subsequent hardlink to this + * inode will omit inserting an inode, and just adds another entry + */ +static bool is_new_inode_item(bool nlink, u64 ino) +{ + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + + if (!nlink) + return true; + + /* lineair search, pretty awful, should be a binary tree */ + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + if (hh->ino == ino) + return false; + } + + /* insert item */ + hh = malloc(sizeof(struct hardlink_head)); + error_exit(!hh, "malloc"); + hh->ino = ino; + list_add_tail(&hh->head, &hardlinks); + + /* + * XXX + * + * We can be confident that if we don't traverse filesystems + * that once we've created N entries of an N-linked inode, that + * it can be removed from the LL. This would significantly + * improve the manageability of the list. + * + * All we'd need to do is add a counter and compare it to the nr_links + * field of the inode. + */ + + return true; +} + +/* + * create the inode data for a given path as best as possible + * duplicating the exact data from the source path + */ +static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs) +{ + struct scoutfs_parallel_restore_inode *inode = NULL; + struct scoutfs_ioctl_stat_more stm; + struct scoutfs_ioctl_inode_attr_x iax; + struct stat st; + int ret; + int fd; + + inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode)); + error_exit(!inode, "failure allocating inode"); + + ret = lstat(path, &st); + error_exit(ret, "failure stat inode"); + + /* use exact inode numbers from path, except for root ino */ + if (ino != SCOUTFS_ROOT_INO) + inode->ino = st.st_ino; + else + inode->ino = SCOUTFS_ROOT_INO; + + inode->mode = st.st_mode; + inode->uid = st.st_uid; + inode->gid = st.st_gid; + inode->atime = st.st_atim; + inode->ctime = st.st_ctim; + inode->mtime = st.st_mtim; + inode->size = st.st_size; + inode->nlink = 
st.st_nlink; + + inode->rdev = st.st_rdev; + + /* scoutfs specific */ + inode->meta_seq = 0; + inode->data_seq = 0; + inode->crtime = st.st_ctim; + + /* we don't restore data */ + if (S_ISREG(inode->mode) && (inode->size > 0)) + inode->offline = true; + + if (S_ISREG(inode->mode) || S_ISDIR(inode->mode)) { + if (is_scoutfs) { + fd = open(path, O_RDONLY); + error_exit(!fd, "open failure"ERRF, ERRA); + + ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm); + error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode"); + + /* these aren't restored! */ + inode->meta_seq = stm.meta_seq; + inode->data_seq = stm.data_seq; + + inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec}; + + /* project ID, retention bit */ + memset(&iax, 0, sizeof(iax)); + + iax.x_flags = 0; + iax.x_mask = SCOUTFS_IOC_IAX_PROJECT_ID | SCOUTFS_IOC_IAX__BITS; + iax.bits = SCOUTFS_IOC_IAX_B_RETENTION; + + ret = ioctl(fd, SCOUTFS_IOC_GET_ATTR_X, &iax); + error_exit(ret, "failure SCOUTFS_IOC_GET_ATTR_X inode"); + + inode->proj = iax.project_id; + inode->flags |= (iax.bits & SCOUTFS_IOC_IAX_B_RETENTION) ? 
SCOUTFS_INO_FLAG_RETENTION : 0; + + close(fd); + } + + } + + /* pass whether item is hardlinked or not */ + *nlink = (st.st_nlink > 1); + + return inode; +} + +typedef int (*quota_ioctl_in)(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *gqr, + size_t nr, int fd); + +static int get_quota_ioctl(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *rules_in, + size_t nr, int fd) +{ + struct scoutfs_ioctl_get_quota_rules *gqr = rules_in; + int ret; + + gqr->rules_ptr = (intptr_t)irules; + gqr->rules_nr = nr; + + ret = ioctl(fd, SCOUTFS_IOC_GET_QUOTA_RULES, gqr); + error_exit(ret < 0, "quota ioctl error"); + + return ret; +} + +static char opc[] = { + [SQ_OP_DATA] = 'D', + [SQ_OP_INODE] = 'I', +}; + +static char nsc[] = { + [SQ_NS_LITERAL] = 'L', + [SQ_NS_PROJ] = 'P', + [SQ_NS_UID] = 'U', + [SQ_NS_GID] = 'G', +}; + +static int insert_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_ioctl_quota_rule *irule) +{ + struct scoutfs_parallel_restore_quota_rule *prule = NULL; + int ret; + int i; + + prule = calloc(1, sizeof(struct scoutfs_parallel_restore_quota_rule)); + error_exit(!prule, "quota rule alloc failed"); + prule->limit = irule->limit; + prule->prio = irule->prio; + prule->op = irule->op; + prule->rule_flags = irule->rule_flags; + prule->names[0].val = irule->name_val[0]; + prule->names[0].source = irule->name_source[0]; + prule->names[0].flags = irule->name_flags[0]; + prule->names[1].val = irule->name_val[1]; + prule->names[1].source = irule->name_source[1]; + prule->names[1].flags = irule->name_flags[1]; + prule->names[2].val = irule->name_val[2]; + prule->names[2].source = irule->name_source[2]; + prule->names[2].flags = irule->name_flags[2]; + + /* print out the rule */ + printf("Quota rule: %3u ", irule->prio); + for (i = 0; i < array_size(irule->name_val); i++) { + printf("%llu,%c,%c ", + irule->name_val[i], + nsc[irule->name_source[i]], + (irule->name_flags[i] & 
SQ_NF_SELECT) ? 'S' : '-'); + } + printf("%c %llu %c\n", + opc[irule->op], irule->limit, (irule->rule_flags & SQ_RF_TOTL_COUNT) ? 'C' : '-'); + + ret = scoutfs_parallel_restore_add_quota_rule(wri, prule); + error_exit(ret, "quota add rule %d", ret); + free(prule); + return ret; +} + +static int restore_quotas(struct scoutfs_parallel_restore_writer *wri, + quota_ioctl_in quota_in, char *path) +{ + struct scoutfs_ioctl_get_quota_rules gqr = {{0,}}; + struct scoutfs_ioctl_quota_rule *irules = NULL; + size_t rule_alloc = 0; + size_t rule_nr = 0; + size_t rule_count; + size_t i; + int fd = -1; + int ret; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + for (;;) { + if (rule_nr == rule_alloc) { + rule_alloc += 1024; + irules = realloc(irules, rule_alloc * sizeof(irules[0])); + error_exit(!irules, "irule realloc failed rule_nr:%zu alloced:%zu", rule_nr, rule_alloc); + if (!irules) { + ret = -errno; + fprintf(stderr, "memory allocation failed: %s (%d)\n", + strerror(errno), errno); + goto out; + } + } + + ret = quota_in(&irules[rule_nr], &gqr, rule_alloc - rule_nr, fd); + if (ret == 0) + break; + if (ret < 0) + goto out; + + rule_count = ret; + + for (i = 0; i < rule_count; i++) { + ret = insert_quota_rule(wri, &irules[i]); + if (ret < 0) + goto out; + } + } + + ret = 0; +out: + if (fd >= 0) + close(fd); + if (irules) + free(irules); + return ret; +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; +}; + +static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino) +{ + struct scoutfs_parallel_restore_inode *inode; + struct scoutfs_parallel_restore_entry *entry; + DIR *dirp = NULL; + char *subdir = NULL; + char link[PATH_MAX + 1]; + struct dirent *ent; + struct statfs stf; + int ret = 0; + int subdir_count = 0, file_count = 0; + size_t ent_len = 0; + size_t pos = 0; + bool 
nlink = false; + char ind = '?'; + u64 mode; + bool is_scoutfs = false; + + /* get fs info once per path */ + ret = statfs(path, &stf); + error_exit(ret != 0, "statfs"ERRF, ERRA); + is_scoutfs = (stf.f_type == 0x554f4353); + + if (!is_scoutfs && !warn_scoutfs) { + warn_scoutfs = true; + fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n"); + } + + + /* traverse the entire tree */ + dirp = opendir(path); + errno = 0; + while ((ent = readdir(dirp))) { + if (ent->d_type == DT_DIR) { + if ((strcmp(ent->d_name, ".") == 0) || + (strcmp(ent->d_name, "..") == 0)) { + /* position still matters */ + pos++; + continue; + } + + /* recurse into subdir */ + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + restore_path(wri, args, res, buf, subdir, ent->d_ino); + + subdir_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = DIR_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + + free(subdir); + + le64_add_cpu(&res->dirs_created, 1); + } else if (ent->d_type == DT_REG) { + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = REG_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + 
memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "f %s/%s\n", path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + + /* xattrs */ + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } else if (ent->d_type == DT_LNK) { + /* readlink */ + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + ent_len += strlen(ent->d_name); + + ret = readlink(subdir, link, PATH_MAX); + error_exit(ret < 0, "readlink %d", ret); + /* must 0-terminate if we want to print it */ + link[ret] = 0; + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = LNK_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add symlink entry %d", ret); + + /* link inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + + fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link); + + inode->mode = LNK_MODE; + inode->target = link; + inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */ + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add syml inode %d", ret); + + free(inode); + free(subdir); 
+ + le64_add_cpu(&res->files_created, 1); + } else { + /* odd stuff */ + switch(ent->d_type) { + case DT_CHR: + ind = 'c'; + mode = S_IFCHR; + break; + case DT_BLK: + ind = 'b'; + mode = S_IFBLK; + break; + case DT_FIFO: + ind = 'p'; + mode = S_IFIFO; + break; + case DT_SOCK: + ind = 's'; + mode = S_IFSOCK; + break; + default: + error_exit(true, "Unknown readdir entry type"); + ;; + } + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "%c %s/%s\n", ind, path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } + + /* batch out changes, will be about 1M */ + if (le64_to_cpu(res->files_created) > BATCH_FILES) { + write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd); + } + + } + if (ent != NULL) + error_exit(errno, "readdir"ERRF, ERRA); + closedir(dirp); + + /* create the dir itself */ + inode = read_inode_data(path, ino, &nlink, is_scoutfs); + inode->nr_subdirs = subdir_count; + inode->total_entry_name_bytes = ent_len; + fprintf(stdout, "d %s\n", path); + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add 
dir inode %d", ret); + + free(inode); + + /* No need to send, we'll send final after last directory is complete */ +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *pwri, *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct writer_args *args; + struct write_result res; + int pair[2] = {-1, -1}; + LIST_HEAD(writers); + void *buf = NULL; + void *bufp = NULL; + int dev_fd = -1; + pid_t pid; + int ret; + u64 tot_bytes; + u64 tot_dirs; + u64 tot_files; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL)); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev"); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 2); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->slice = slices[1]; + args->pair_fd = pair[1]; + list_add_tail(&args->head, &writers); + + /* fork writer 
process */ + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid == 0) { + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufp "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&pwri); + error_exit(ret, "create pwriter %d", ret); + + ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice); + error_exit(ret, "add pslice %d", ret); + + memset(&res, 0, sizeof(res)); + + restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO); + + ret = restore_quotas(pwri, get_quota_ioctl, opts->source_dir); + error_exit(ret, "quota add %d", ret); + + res.complete = true; + + write_bufs_and_send(pwri, buf, args->dev_fd, &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&pwri); + free(bufp); + + exit(0); + }; + + /* read results and wait for writer to finish */ + tot_bytes = 0; + tot_dirs = 1; + tot_files = 0; + for (;;) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + + if (res.complete) + break; + } + + tot_bytes += le64_to_cpu(res.bytes_written); + tot_files += le64_to_cpu(res.files_created); + tot_dirs += le64_to_cpu(res.dirs_created); + } + + tot_bytes += write_bufs(wri, buf, args->dev_fd); + + fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n", + tot_dirs, tot_files, tot_bytes); + + /* write super to finalize */ + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << 
SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + + scoutfs_parallel_restore_destroy_writer(&wri); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] > 0) + close(pair[0]); + if (pair[1] > 0) + close(pair[1]); + free(super); + free(args); + free(slices); + free(buf); + + return 0; +} + +int main(int argc, char **argv) +{ + struct opts opts = (struct opts){ 0 }; + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + int ret; + int c; + + INIT_LIST_HEAD(&hardlinks); + + while ((c = getopt(argc, argv, "b:m:s:")) != -1) { + switch(c) { + case 'm': + opts.meta_path = strdup(optarg); + break; + case 's': + opts.source_dir = strdup(optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + error_exit(!opts.source_dir, "must specify source directory path with -s"); + + ret = do_restore(&opts); + + free(opts.meta_path); + free(opts.source_dir); + + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + list_del_init(&hh->head); + free(hh); + } + + return ret == 0 ? 
0 : 1; +} diff --git a/tests/tests/parallel_restore.sh b/tests/tests/parallel_restore.sh new file mode 100644 index 00000000..69b594ec --- /dev/null +++ b/tests/tests/parallel_restore.sh @@ -0,0 +1,74 @@ +# +# validate parallel restore library +# + +t_require_commands scoutfs parallel_restore find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== simple mkfs/restore/mount" +# meta device just big enough for reserves and the metadata we'll fill +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount + +scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid' +find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc +scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc +find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L +scoutfs df -p "$SCR" | awk '{print $1, $4}' +scoutfs quota-list -p "$SCR" + +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== under ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" -n 2000000 > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount +scoutfs df -p "$SCR" | awk '{print $1, $4}' +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 
4000000 | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt to restore data device" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt format_v1 restore" +scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== test if previously mounted" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \ + "$T_EX_DATA_DEV" "$SCR" +umount "$SCR" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== cleanup" +rmdir "$SCR" + +t_pass diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh new file mode 100644 index 00000000..7517f05d --- /dev/null +++ b/tests/tests/restore_copy.sh @@ -0,0 +1,118 @@ +# +# validate parallel restore library - using restore_copy.c +# + +t_require_commands scoutfs restore_copy find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== restore_copy content verification" +mkdir "$T_M0/data" + +# create all supported inode types: +mkdir -p "$T_M0/data/d" +touch "$T_M0/data/f" +ln -sf "broken" "$T_M0/data/l" +ln "$T_M0/data/f" "$T_M0/data/h" +ln -sf "f" "$T_M0/data/F" +mknod "$T_M0/data/b" b 1 1 +mknod "$T_M0/data/c" c 0 0 +mknod "$T_M0/data/u" u 2 2 +mknod "$T_M0/data/p" p + +# some files with data +dd if=/dev/zero of="$T_M0/data/f4096" bs=4096 count=1 status=none +touch 
"$T_M0/data/falloc" "$T_M0/data/truncate" +xfs_io -C "falloc 65536 65536" "$T_M0/data/falloc" +xfs_io -C "truncate $((4096 * 4096))" "$T_M0/data/truncate" + +# socket (could have used python but avoids python/python2/python3 problem) +perl -e "use IO::Socket; my \$s = IO::Socket::UNIX->new(Type=>SOCK_STREAM,Local=>'$T_M0/data/s') or die 'sock';" +# set all mode_t bits +touch "$T_M0/data/mode_t" +chmod 6777 "$T_M0/data/mode_t" +# uid/gid +touch "$T_M0/data/uidgid" +chown 33333:33333 "$T_M0/data/uidgid" +# set retention bit +touch "$T_M0/data/retention" +scoutfs set-attr-x -t 1 "$T_M0/data/retention" +# set project ID +touch "$T_M0/data/proj" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj" +mkdir -p "$T_M0/data/proj_d" +touch "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 54321 "$T_M0/data/proj_d" +# quotas +for a in $(seq 10 15); do + scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -" +done +# crtime +scoutfs set-attr-x -r 55555.666666666 "$T_M0/data/proj_d" +scoutfs set-attr-x -r 55556.666666666 "$T_M0/data/proj_d/f" +# data_seq, meta_seq, data_version is not restored. + +scratch_mkfs -V 2 > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +restore_copy -m $T_EX_META_DEV -s "$T_M0/data" | t_filter_fs +scratch_check || t_fail "check before mount failed" + +scratch_mount + +echo "== verify metadata bits on restored fs" +inspect() { + ls -Alnr --time-style=+"" + scoutfs get-attr-x -t "retention" + scoutfs get-attr-x -p "proj" + scoutfs get-fiemap -L "f4096" + scoutfs get-fiemap -L "falloc" + scoutfs get-fiemap -L "truncate" + scoutfs quota-list -p "." 
+ scoutfs get-attr-x -p "proj_d/f" + scoutfs get-attr-x -p "proj_d" + + scoutfs stat proj_d | grep crtime + scoutfs stat proj_d/f | grep crtime +} + +( cd "$SCR" ; inspect ) + +echo "== verify quota rules on restored fs" +scoutfs quota-del -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-del failed" +scoutfs quota-list -p "$T_M0" +scoutfs quota-add -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-add failed" +scoutfs quota-list -p "$T_M0" + +scoutfs df -p "$SCR" | awk '{print $1, $4}' + +echo "== umount restored fs and check" +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +#scoutfs print $T_META_DEVICE +#scoutfs print $T_EX_META_DEV + +echo "== cleanup" +rmdir "$SCR" +scoutfs set-attr-x -t 0 "$T_M0/data/retention" +rm -rf "$T_M0/data" +scoutfs quota-wipe -p "$T_M0" + +t_pass diff --git a/utils/Makefile b/utils/Makefile index e0f76142..17c7fa1b 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H)) CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \ - -fno-strict-aliasing \ + -I src/ -fno-strict-aliasing \ -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),) @@ -15,10 +15,13 @@ CFLAGS += -I../kmod/src endif BIN := src/scoutfs -OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c)) -DEPS := $(wildcard */*.d) +OBJ_DIRS := src src/check +OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c))) +DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d)) -all: $(BIN) +AR := src/scoutfs_parallel_restore.a + +all: $(BIN) $(AR) ifneq ($(DEPS),) -include $(DEPS) @@ -36,6 +39,10 @@ $(BIN): $(OBJ) $(QU) [BIN $@] $(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid +$(AR): $(OBJ) + $(QU) [AR $@] + $(VE)ar rcs $@ $^ + %.o %.d: %.c Makefile sparse.sh $(QU) [CC $<] $(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index 
d105d87b..bb3f67d9 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -76,6 +76,41 @@ run when the file system will not be mounted. .RE .PD +.TP +.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]" +.sp +Performs an offline file system check. The program iterates through all the +data structures on disk directly - the filesystem must not be mounted while +this operation is running. +.RS 1.0i +.PD 0 +.sp +.TP +.B "-d, --debug FILE" +An output file where the program can output debug information about the +state of the filesystem as it performs the check. If +.B FILE +is "-", the debug output is written to the Standard Error output. +.TP +.RE +.sp +.B RETURN VALUE +The check function can return the following exit codes: +.RS +.TP +\fB 0 \fR - no filesystem issues detected +.TP +\fB 1 \fR - file system issues were detected +.TP +\fB 8 \fR - operational error +.TP +\fB 16 \fR - usage error +.TP +\fB 32 \fR - cancelled by user (SIGINT) +.TP +.RE +.PD + .TP .BI "counters [-t|--table] SYSFS-DIR" .sp diff --git a/utils/scoutfs-utils.spec.in b/utils/scoutfs-utils.spec.in index fb24b812..a7c53514 100644 --- a/utils/scoutfs-utils.spec.in +++ b/utils/scoutfs-utils.spec.in @@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/. 
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h +install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h +install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example @@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi %files -n scoutfs-devel %defattr(644,root,root,755) %{_includedir}/scoutfs +%{_libdir}/scoutfs %clean rm -rf %{buildroot} diff --git a/utils/src/check/alloc.c b/utils/src/check/alloc.c new file mode 100644 index 00000000..43d1d125 --- /dev/null +++ b/utils/src/check/alloc.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" + +/* + * We check the list blocks serially. + * + * XXX: + * - compare ref seqs + * - detect cycles? 
+ */ +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + lblk = block_buf(blk); + /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; + + /* XXX sort? maybe */ + + ref = lblk->next; + + block_put(&blk); + } + + ret = 0; +out: + return ret; +} + +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + return btree_meta_iter(&root->root, cb, cb_arg); +} + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + sns_push("alloc_list_block", blkno, 0); + + lblk = block_buf(blk); + /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; + /* XXX sort? 
maybe */ + + ret = 0; + for (i = 0; i < le32_to_cpu(lblk->nr); i++) { + blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) + break; + } + + ref = lblk->next; + + block_put(&blk); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +static bool valid_free_extent_key(struct scoutfs_key *key) +{ + return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE || + key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) && + (!key->_sk_fourth && !key->sk_type && + (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third)); +} + +static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct extent_cb_arg_t *ecba = cb_arg; + u64 start; + u64 len; + + /* XXX not sure these eios are what we want */ + + if (val_len != 0) + return -EIO; + + if (!valid_free_extent_key(key)) + return -EIO; + + if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) + return -ECHECK_ITER_DONE; + + start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1; + len = le64_to_cpu(key->skfb_len); + + return ecba->cb(start, len, ecba->cb_arg); +} + +/* + * Call the callback with each of the primary BLKNO free extents stored + * in item in the given alloc root. It doesn't visit the secondary + * ORDER extents. 
+ */ +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&root->root, free_item_cb, &ecba); +} diff --git a/utils/src/check/alloc.h b/utils/src/check/alloc.h new file mode 100644 index 00000000..f0273e4a --- /dev/null +++ b/utils/src/check/alloc.h @@ -0,0 +1,12 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H +#define _SCOUTFS_UTILS_CHECK_ALLOC_H + +#include "extent.h" + +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/block.c b/utils/src/check/block.c new file mode 100644 index 00000000..08535a5a --- /dev/null +++ b/utils/src/check/block.c @@ -0,0 +1,613 @@ +#define _ISOC11_SOURCE /* aligned_alloc */ +#define _DEFAULT_SOURCE /* syscall() */ +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "list.h" +#include "cmp.h" +#include "hash.h" + +#include "block.h" +#include "debug.h" +#include "super.h" +#include "eno.h" +#include "crc.h" +#include "sns.h" + +static struct block_data { + struct list_head *hash_lists; + size_t hash_nr; + + struct list_head active_head; + struct list_head inactive_head; + struct list_head dirty_list; + size_t nr_active; + size_t nr_inactive; + size_t nr_dirty; + + int meta_fd; + size_t max_cached; + size_t nr_events; + + aio_context_t ctx; + struct iocb *iocbs; + struct iocb **iocbps; + struct io_event *events; +} global_bdat; + +struct block { + struct list_head hash_head; + struct list_head lru_head; + struct list_head dirty_head; + struct list_head submit_head; + 
unsigned long refcount; + unsigned long uptodate:1, + active:1; + u64 blkno; + void *buf; + size_t size; +}; + +#define BLK_FMT \ + "blkno %llu rc %ld d %u a %u" +#define BLK_ARG(blk) \ + (blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active +#define debug_blk(blk, fmt, args...) \ + debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk)) + +/* + * This just allocates and initialzies the block. The caller is + * responsible for putting it on the appropriate initial lists and + * managing refcounts. + */ +static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size) +{ + struct block *blk; + + blk = calloc(1, sizeof(struct block)); + if (blk) { + blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!blk->buf) { + free(blk); + blk = NULL; + } else { + INIT_LIST_HEAD(&blk->hash_head); + INIT_LIST_HEAD(&blk->lru_head); + INIT_LIST_HEAD(&blk->dirty_head); + INIT_LIST_HEAD(&blk->submit_head); + blk->blkno = blkno; + blk->size = size; + } + } + + return blk; +} + +static void free_block(struct block_data *bdat, struct block *blk) +{ + debug_blk(blk, "free"); + + if (!list_empty(&blk->lru_head)) { + if (blk->active) + bdat->nr_active--; + else + bdat->nr_inactive--; + list_del(&blk->lru_head); + } + + if (!list_empty(&blk->dirty_head)) { + bdat->nr_dirty--; + list_del(&blk->dirty_head); + } + + if (!list_empty(&blk->hash_head)) + list_del(&blk->hash_head); + + if (!list_empty(&blk->submit_head)) + list_del(&blk->submit_head); + + free(blk->buf); + free(blk); +} + +static bool blk_is_dirty(struct block *blk) +{ + return !list_empty(&blk->dirty_head); +} + +/* + * Rebalance the cache. + * + * First we shrink the cache to limit it to max_cached blocks. + * Logically, we walk from oldest to newest in the inactive list and + * then in the active list. Since these lists are physically one + * list_head list we achieve this with a reverse walk starting from the + * active head. + * + * Then we rebalnace the size of the two lists. 
The constraint is that + * we don't let the active list grow larger than the inactive list. We + * move blocks from the oldest tail of the active list to the newest + * head of the inactive list. + * + * <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] -> + */ +static void rebalance_cache(struct block_data *bdat) +{ + struct block *blk; + struct block *blk_; + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) { + if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached) + break; + + if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 || + blk_is_dirty(blk)) + continue; + + free_block(bdat, blk); + } + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) { + if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head) + break; + + list_move(&blk->lru_head, &bdat->inactive_head); + blk->active = 0; + bdat->nr_active--; + bdat->nr_inactive++; + } +} + +static void make_active(struct block_data *bdat, struct block *blk) +{ + if (!blk->active) { + if (!list_empty(&blk->lru_head)) { + list_move(&blk->lru_head, &bdat->active_head); + bdat->nr_inactive--; + } else { + list_add(&blk->lru_head, &bdat->active_head); + } + + blk->active = 1; + bdat->nr_active++; + } +} + +static int compar_iocbp(const void *A, const void *B) +{ + struct iocb *a = *(struct iocb **)A; + struct iocb *b = *(struct iocb **)B; + + return scoutfs_cmp(a->aio_offset, b->aio_offset); +} + +static int submit_and_wait(struct block_data *bdat, struct list_head *list) +{ + struct io_event *event; + struct iocb *iocb; + struct block *blk; + int ret; + int err; + int nr; + int i; + + err = 0; + nr = 0; + list_for_each_entry(blk, list, submit_head) { + iocb = &bdat->iocbs[nr]; + bdat->iocbps[nr] = iocb; + + memset(iocb, 0, sizeof(struct iocb)); + + iocb->aio_data = (intptr_t)blk; + iocb->aio_lio_opcode = blk_is_dirty(blk) ? 
IOCB_CMD_PWRITE : IOCB_CMD_PREAD; + iocb->aio_fildes = bdat->meta_fd; + iocb->aio_buf = (intptr_t)blk->buf; + iocb->aio_nbytes = blk->size; + iocb->aio_offset = blk->blkno * blk->size; + + nr++; + + debug_blk(blk, "submit"); + + if ((nr < bdat->nr_events) && blk->submit_head.next != list) + continue; + + qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp); + + ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = 0; + for (i = 0; i < nr; i++) { + event = &bdat->events[i]; + iocb = (struct iocb *)(intptr_t)event->obj; + blk = (struct block *)(intptr_t)event->data; + + debug_blk(blk, "complete res %lld", (long long)event->res); + + if (event->res >= 0 && event->res != blk->size) + event->res = -EIO; + + /* io errors are fatal */ + if (event->res < 0) { + ret = event->res; + goto out; + } + + if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) { + blk->uptodate = 1; + } else { + list_del_init(&blk->dirty_head); + bdat->nr_dirty--; + } + } + nr = 0; + } + + ret = 0; +out: + return ret ?: err; +} + +static void inc_refcount(struct block *blk) +{ + blk->refcount++; +} + +void block_put(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + struct block *blk = *blkp; + + if (blk) { + blk->refcount--; + *blkp = NULL; + + rebalance_cache(bdat); + } +} + +static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno) +{ + u32 hash = scoutfs_hash32(&blkno, sizeof(blkno)); + + return &bdat->hash_lists[hash % bdat->hash_nr]; +} + +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic) +{ + struct 
scoutfs_block_header *hdr; + size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + int ret; + u32 crc; + + ret = block_get(&blk, blkno, bf); + if (ret < 0) { + fprintf(stderr, "error reading block %llu\n", blkno); + goto out; + } + + hdr = block_buf(blk); + + crc = crc_block(hdr, size); + + /* + * a bad CRC is easy to repair, so we pass a different error code + * back. Unless the other data is also wrong - then it's EINVAL + * to signal that this isn't a valid block hdr at all. + */ + if (le32_to_cpu(hdr->crc) != crc) + ret = -EIO; /* keep checking other fields */ + + if (le32_to_cpu(hdr->magic) != magic) + ret = -EINVAL; + + /* + * Our first caller fills in global_super. Until this completes, + * we can't do this check. + */ + if ((blkno != SCOUTFS_SUPER_BLKNO) && + (hdr->fsid != global_super->hdr.fsid)) + ret = -EINVAL; + + block_put(&blk); + + debug("%s blk_hdr_valid blkno %llu size %lu crc 0x%08x magic 0x%08x ret %d", + sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic), + ret); + +out: + return ret; +} + +static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf) +{ + struct list_head *bucket = hash_bucket(bdat, blkno); + struct block *search; + struct block *blk; + size_t size; + + size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + + blk = NULL; + list_for_each_entry(search, bucket, hash_head) { + if (search->blkno == blkno && search->size == size) { + blk = search; + break; + } + } + + if (!blk) { + blk = alloc_block(bdat, blkno, size); + if (blk) { + list_add(&blk->hash_head, bucket); + list_add(&blk->lru_head, &bdat->inactive_head); + bdat->nr_inactive++; + } + } + if (blk) + inc_refcount(blk); + + return blk; +} + +/* + * Get a block. + * + * The caller holds a refcount to the block while it's in use that + * prevents it from being removed from the cache. 
It must be dropped + * with block_put(); + */ +int block_get(struct block **blk_ret, u64 blkno, int bf) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + LIST_HEAD(list); + int ret; + + blk = get_or_alloc(bdat, blkno, bf); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + if ((bf & BF_ZERO)) { + memset(blk->buf, 0, blk->size); + blk->uptodate = 1; + } + + if (bf & BF_OVERWRITE) + blk->uptodate = 1; + + if (!blk->uptodate) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + if (ret < 0) + goto out; + } + + if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) { + list_add_tail(&bdat->dirty_list, &blk->dirty_head); + bdat->nr_dirty++; + } + + make_active(bdat, blk); + + rebalance_cache(bdat); + ret = 0; +out: + if (ret < 0) + block_put(&blk); + *blk_ret = blk; + return ret; +} + +void *block_buf(struct block *blk) +{ + return blk->buf; +} + +size_t block_size(struct block *blk) +{ + return blk->size; +} + +/* + * Drop the block from the cache, regardless of if it was free or not. + * This is used to avoid writing blocks which were dirtied but then + * later freed. + * + * The block is immediately freed and can't be referenced after this + * returns. + */ +void block_drop(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + + free_block(bdat, *blkp); + *blkp = NULL; + rebalance_cache(bdat); +} + +/* + * This doesn't quite work for mixing large and small blocks, but that's + * fine, we never do that. + */ +static int compar_u64(const void *A, const void *B) +{ + u64 a = *((u64 *)A); + u64 b = *((u64 *)B); + + return scoutfs_cmp(a, b); +} + +/* + * This read-ahead is synchronous and errors are ignored. If any of the + * blknos aren't present in the cache then we issue concurrent reads for + * them and wait. Any existing cached blocks will be left as is. 
+ * + * We might be trying to read a lot more than the number of events so we + * sort the caller's blknos before iterating over them rather than + * relying on submission sorting the blocks in each submitted set. + */ +void block_readahead(u64 *blknos, size_t nr) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + size_t i; + + if (nr == 0) + return; + + qsort(blknos, nr, sizeof(blknos[0]), compar_u64); + + for (i = 0; i < nr; i++) { + blk = get_or_alloc(bdat, blknos[i], 0); + if (blk) { + if (!blk->uptodate) + list_add_tail(&blk->submit_head, &list); + else + block_put(&blk); + } + } + + (void)submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + rebalance_cache(bdat); +} + +/* + * The caller's block changes form a consistent transaction. If the amount of dirty + * blocks is large enough we issue a write. + */ +int block_try_commit(bool force) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + int ret; + + if (!force && bdat->nr_dirty < bdat->nr_events) + return 0; + + list_for_each_entry(blk, &bdat->dirty_list, dirty_head) { + list_add_tail(&blk->submit_head, &list); + inc_refcount(blk); + } + + ret = submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + if (ret < 0) { + fprintf(stderr, "error writing dirty transaction blocks\n"); + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY); + if (ret == 0) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + block_put(&blk); + } else { + ret = -ENOMEM; + } + if (ret < 0) + fprintf(stderr, "error writing super block to commit transaction\n"); + +out: + rebalance_cache(bdat); + return ret; +} + +int 
block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes) +{ + struct block_data *bdat = &global_bdat; + size_t i; + int ret; + + bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE); + bdat->hash_nr = bdat->max_cached / 4; + bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE); + + bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0])); + bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0])); + bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0])); + bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0])); + if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&bdat->active_head); + INIT_LIST_HEAD(&bdat->inactive_head); + INIT_LIST_HEAD(&bdat->dirty_list); + bdat->meta_fd = meta_fd; + list_add(&bdat->inactive_head, &bdat->active_head); + + for (i = 0; i < bdat->hash_nr; i++) + INIT_LIST_HEAD(&bdat->hash_lists[i]); + + ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx); + +out: + if (ret < 0) { + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); + } + + return ret; +} + +void block_shutdown(void) +{ + struct block_data *bdat = &global_bdat; + + syscall(SYS_io_destroy, bdat->ctx); + + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); +} diff --git a/utils/src/check/block.h b/utils/src/check/block.h new file mode 100644 index 00000000..6c13b0cc --- /dev/null +++ b/utils/src/check/block.h @@ -0,0 +1,34 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_ +#define _SCOUTFS_UTILS_CHECK_BLOCK_H_ + +#include +#include + +struct block; + +#include "sparse.h" + +/* block flags passed to block_get() */ +enum { + BF_ZERO = (1 << 0), /* zero contents buf as block is returned */ + BF_DIRTY = (1 << 1), /* block will be written with transaction */ + BF_SM = (1 << 2), /* small 4k block instead of large 64k block */ + BF_OVERWRITE = (1 << 3), 
/* caller will overwrite contents, don't read */ +}; + +int block_get(struct block **blk_ret, u64 blkno, int bf); +void block_put(struct block **blkp); + +void *block_buf(struct block *blk); +size_t block_size(struct block *blk); +void block_drop(struct block **blkp); + +void block_readahead(u64 *blknos, size_t nr); +int block_try_commit(bool force); + +int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes); +void block_shutdown(void); + +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic); + +#endif diff --git a/utils/src/check/btree.c b/utils/src/check/btree.c new file mode 100644 index 00000000..ebf05b8c --- /dev/null +++ b/utils/src/check/btree.c @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" +#include "avl.h" + +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "meta.h" +#include "problem.h" + +static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item) +{ + return (void *)bt + le16_to_cpu(item->val_off); +} + +static void readahead_refs(struct scoutfs_btree_block *bt) +{ + struct scoutfs_btree_item *item; + struct scoutfs_avl_node *node; + struct scoutfs_block_ref *ref; + u64 *blknos; + u64 blkno; + u16 valid = 0; + u16 nr = le16_to_cpu(bt->nr_items); + int i; + + blknos = calloc(nr, sizeof(blknos[0])); + if (!blknos) + return; + + node = avl_first(&bt->item_root); + + for (i = 0; i < nr; i++) { + item = container_of(node, struct scoutfs_btree_item, node); + ref = item_val(bt, item); + blkno = le64_to_cpu(ref->blkno); + + if (valid_meta_blkno(blkno)) + blknos[valid++] = blkno; + + node = avl_next(&bt->item_root, &item->node); + } + + if (valid > 0) + block_readahead(blknos, valid); + free(blknos); +} + +/* + * Call the callback on the referenced block. 
Then if the block + * contains referneces read it and recurse into all its references. + */ +static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb, + void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + return 0; + } + + if (level == 0) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + + sns_push("btree_parent", blkno, 0); + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead last level of parents */ + if (level == 2) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct scoutfs_btree_item, node); + ref = item_val(bt, item); + + ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg); + if (ret < 0) + goto out; + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg); +} + +static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level, + btree_item_cb_t cb, void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = 
le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + if (level) + sns_push("btree_parent", blkno, 0); + else + sns_push("btree_leaf", blkno, 0); + + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead leaves that contain items */ + if (level == 1) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct scoutfs_btree_item, node); + + if (level) { + ref = item_val(bt, item); + ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg); + } else { + ret = cb(&item->key, item_val(bt, item), + le16_to_cpu(item->val_len), cb_arg); + debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret); + } + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg); +} diff --git a/utils/src/check/btree.h b/utils/src/check/btree.h new file mode 100644 index 00000000..dc0b3bf9 --- /dev/null +++ b/utils/src/check/btree.h @@ -0,0 +1,14 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_ +#define _SCOUTFS_UTILS_CHECK_BTREE_H_ + +#include "util.h" +#include "format.h" + +#include "extent.h" + +typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg); + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg); +int btree_item_iter(struct scoutfs_btree_root 
*root, btree_item_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/check.c b/utils/src/check/check.c new file mode 100644 index 00000000..0fa8a870 --- /dev/null +++ b/utils/src/check/check.c @@ -0,0 +1,184 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "ioctl.h" +#include "cmd.h" +#include "dev.h" + +#include "alloc.h" +#include "block.h" +#include "debug.h" +#include "meta.h" +#include "super.h" +#include "problem.h" + +struct check_args { + char *meta_device; + char *data_device; + char *debug_path; +}; + +static int do_check(struct check_args *args) +{ + int debug_fd = -1; + int meta_fd = -1; + int data_fd = -1; + int ret; + + if (args->debug_path) { + if (strcmp(args->debug_path, "-") == 0) + debug_fd = dup(STDERR_FILENO); + else + debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (debug_fd < 0) { + ret = -errno; + fprintf(stderr, "error opening debug output file '%s': %s (%d)\n", + args->debug_path, strerror(errno), errno); + goto out; + } + + debug_enable(debug_fd); + } + + meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL); + if (meta_fd < 0) { + ret = -errno; + fprintf(stderr, "failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL); + if (data_fd < 0) { + ret = -errno; + fprintf(stderr, "failed to open data device '%s': %s (%d)\n", + args->data_device, strerror(errno), errno); + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024); + if (ret < 0) + goto out; + + /* + * At some point we may convert this to a multi-pass system where we may + * try and repair items, and, as long as repairs are made, we will rerun + * the checks more times. 
We may need to start counting how many problems we + * fix in the process of these loops, so that we don't stall on unrepairable + * problems and are making actual repair progress. IOW - when we do a full + * check loop without any problems fixed, we stop trying. + */ + ret = check_supers(data_fd) ?: + check_super_in_use(meta_fd) ?: + check_meta_alloc() ?: + check_super_crc(); + + if (ret < 0) + goto out; + + debug("problem count %lu", problems_count()); + if (problems_count() > 0) + printf("Problems detected.\n"); + +out: + /* and tear it all down */ + block_shutdown(); + super_shutdown(); + debug_disable(); + + if (meta_fd >= 0) + close(meta_fd); + if (data_fd >= 0) + close(data_fd); + if (debug_fd >= 0) + close(debug_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct check_args *args = state->input; + + switch (key) { + case 'd': + args->debug_path = strdup_or_error(state, arg); + break; + case 'e': + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else if (!args->data_device) + args->data_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + if (!args->data_device) + argp_error(state, "no data device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"}, + { NULL } +}; + +static struct argp argp = { + options, + parse_opt, + "META-DEVICE DATA-DEVICE", + "Check filesystem consistency" +}; + +/* Exit codes used by fsck-type programs */ +#define FSCK_EX_NONDESTRUCT 1 /* File system errors corrected */ +#define FSCK_EX_UNCORRECTED 4 /* File system errors left uncorrected */ +#define FSCK_EX_ERROR 8 /* Operational error */ +#define FSCK_EX_USAGE 16 
/* Usage or syntax error */ + +static int check_cmd(int argc, char **argv) +{ + struct check_args check_args = {NULL}; + int ret; + + ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args); + if (ret) + exit(FSCK_EX_USAGE); + + ret = do_check(&check_args); + if (ret < 0) + ret = FSCK_EX_ERROR; + + if (problems_count() > 0) + ret |= FSCK_EX_UNCORRECTED; + + exit(ret); +} + +static void __attribute__((constructor)) check_ctor(void) +{ + cmd_register_argp("check", &argp, GROUP_CORE, check_cmd); +} diff --git a/utils/src/check/debug.c b/utils/src/check/debug.c new file mode 100644 index 00000000..0017c1aa --- /dev/null +++ b/utils/src/check/debug.c @@ -0,0 +1,16 @@ +#include + +#include "debug.h" + +int debug_fd = -1; + +void debug_enable(int fd) +{ + debug_fd = fd; +} + +void debug_disable(void) +{ + if (debug_fd >= 0) + debug_fd = -1; +} diff --git a/utils/src/check/debug.h b/utils/src/check/debug.h new file mode 100644 index 00000000..a5103494 --- /dev/null +++ b/utils/src/check/debug.h @@ -0,0 +1,17 @@ +#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_ +#define _SCOUTFS_UTILS_CHECK_DEBUG_H_ + +#include + +#define debug(fmt, args...) 
\ +do { \ + if (debug_fd >= 0) \ + dprintf(debug_fd, fmt"\n", ##args); \ +} while (0) + +extern int debug_fd; + +void debug_enable(int fd); +void debug_disable(void); + +#endif diff --git a/utils/src/check/eno.h b/utils/src/check/eno.h new file mode 100644 index 00000000..14579fce --- /dev/null +++ b/utils/src/check/eno.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_ +#define _SCOUTFS_UTILS_CHECK_ENO_H_ + +#include + +#define ENO_FMT "%d (%s)" +#define ENO_ARG(eno) eno, strerror(eno) + +#endif diff --git a/utils/src/check/extent.c b/utils/src/check/extent.c new file mode 100644 index 00000000..bbbcc887 --- /dev/null +++ b/utils/src/check/extent.c @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include + +#include "util.h" +#include "lk_rbtree_wrapper.h" + +#include "debug.h" +#include "extent.h" + +/* + * In-memory extent management in rbtree nodes. + */ + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len) +{ + u64 a_end = a_start + a_len; + u64 b_end = b_start + b_len; + + return !((a_end <= b_start) || (b_end <= a_start)); +} + +static int ext_contains(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start <= start && ext->start + ext->len >= start + len; +} + +/* + * True if the given extent is bisected by the given range; there's + * leftover containing extents on both the left and right sides of the + * range in the extent. + */ +static int ext_bisected(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start < start && ext->start + ext->len > start + len; +} + +static struct extent_node *ext_from_rbnode(struct rb_node *rbnode) +{ + return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL; +} + +static struct extent_node *next_ext(struct extent_node *ext) +{ + return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL; +} + +static struct extent_node *prev_ext(struct extent_node *ext) +{ + return ext ? 
ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL; +} + +struct walk_results { + unsigned bisect_to_leaf:1; + struct extent_node *found; + struct extent_node *next; + struct rb_node *parent; + struct rb_node **node; +}; + +static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk) +{ + struct rb_node **node = &root->rbroot.rb_node; + struct extent_node *ext; + u64 end = start + len; + int cmp; + + wlk->found = NULL; + wlk->next = NULL; + wlk->parent = NULL; + + while (*node) { + wlk->parent = *node; + ext = ext_from_rbnode(*node); + cmp = end <= ext->start ? -1 : + start >= ext->start + ext->len ? 1 : 0; + + if (cmp < 0) { + node = &ext->rbnode.rb_left; + wlk->next = ext; + } else if (cmp > 0) { + node = &ext->rbnode.rb_right; + } else { + wlk->found = ext; + if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len))) + break; + /* walk right so we can insert greater right from bisection */ + node = &ext->rbnode.rb_right; + } + } + + wlk->node = node; +} + +/* + * Return an extent that overlaps with the given range. + */ +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found) +{ + struct walk_results wlk = { 0, }; + int ret; + + walk_extents(root, start, len, &wlk); + if (wlk.found) { + memset(found, 0, sizeof(struct extent_node)); + found->start = wlk.found->start; + found->len = wlk.found->len; + ret = 0; + } else { + ret = -ENOENT; + } + + return ret; +} + +/* + * Callers can iterate through direct node references and are entirely + * responsible for consistency when doing so. + */ +struct extent_node *extent_first(struct extent_root *root) +{ + struct walk_results wlk = { 0, }; + + walk_extents(root, 0, 1, &wlk); + + return wlk.found ?: wlk.next; +} + +struct extent_node *extent_next(struct extent_node *ext) +{ + return next_ext(ext); +} + +struct extent_node *extent_prev(struct extent_node *ext) +{ + return prev_ext(ext); +} + +/* + * Insert a new extent into the tree. 
We can extend existing nodes, + * merge with neighbours, or remove existing extents entirely if we + * insert a range that fully spans existing nodes. + */ +static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err) +{ + struct walk_results wlk = { 0, }; + struct extent_node *ext; + struct extent_node *nei; + int ret; + + walk_extents(root, start, len, &wlk); + + ext = wlk.found; + if (ext && found_err) { + ret = found_err; + goto out; + } + + if (!ext) { + ext = malloc(sizeof(struct extent_node)); + if (!ext) { + ret = -ENOMEM; + goto out; + } + + ext->start = start; + ext->len = len; + + rb_link_node(&ext->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ext->rbnode, &root->rbroot); + } + + /* start by expanding an existing extent if our range is larger */ + if (start < ext->start) { + ext->len += ext->start - start; + ext->start = start; + } + if (ext->start + ext->len < start + len) + ext->len += (start + len) - (ext->start + ext->len); + + /* drop any fully spanned neighbors, possibly merging with a final adjacent one */ + + while ((nei = prev_ext(ext))) { + if (nei->start + nei->len < ext->start) + break; + + if (nei->start < ext->start) { + ext->len += ext->start - nei->start; + ext->start = nei->start; + } + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + while ((nei = next_ext(ext))) { + if (ext->start + ext->len < nei->start) + break; + + if (ext->start + ext->len < nei->start + nei->len) + ext->len += (nei->start + nei->len) - (ext->start + ext->len); + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + ret = 0; +out: + if (ret < 0) + debug("start %llu len %llu ret %d", start, len, ret); + return ret; +} + +/* + * Insert a new extent. The specified extent must not overlap with any + * existing extents or -EEXIST is returned. 
+ */ +int extent_insert_new(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, true); +} + +/* + * Insert an extent, extending any existing extents that may overlap. + */ +int extent_insert_extend(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, false); +} + +/* + * Remove the specified extent from an existing node. The given extent must be fully + * contained in a single node or -ENOENT is returned. + */ +int extent_remove(struct extent_root *root, u64 start, u64 len) +{ + struct extent_node *ext; + struct extent_node *ins; + struct walk_results wlk = { + .bisect_to_leaf = 1, + }; + int ret; + + walk_extents(root, start, len, &wlk); + + if (!(ext = wlk.found) || !ext_contains(ext, start, len)) { + ret = -ENOENT; + goto out; + } + + if (ext_bisected(ext, start, len)) { + debug("found bisected start %llu len %llu", ext->start, ext->len); + ins = malloc(sizeof(struct extent_node)); + if (!ins) { + ret = -ENOMEM; + goto out; + } + + ins->start = start + len; + ins->len = (ext->start + ext->len) - ins->start; + + rb_link_node(&ins->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ins->rbnode, &root->rbroot); + } + + if (start > ext->start) { + ext->len = start - ext->start; + } else if (len < ext->len) { + ext->start += len; + ext->len -= len; + } else { + rb_erase(&ext->rbnode, &root->rbroot); + } + + ret = 0; +out: + debug("start %llu len %llu ret %d", start, len, ret); + + return ret; +} + +void extent_root_init(struct extent_root *root) +{ + root->rbroot = RB_ROOT; + root->total = 0; +} + +void extent_root_free(struct extent_root *root) +{ + struct extent_node *ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + rb_erase(&ext->rbnode, &root->rbroot); + free(ext); + } +} + +void extent_root_print(struct extent_root *root) +{ + struct extent_node 
*ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + debug(" start %llu len %llu", ext->start, ext->len); + } +} diff --git a/utils/src/check/extent.h b/utils/src/check/extent.h new file mode 100644 index 00000000..2a38f765 --- /dev/null +++ b/utils/src/check/extent.h @@ -0,0 +1,38 @@ +#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_ +#define _SCOUTFS_UTILS_CHECK_EXTENT_H_ + +#include "lk_rbtree_wrapper.h" + +struct extent_root { + struct rb_root rbroot; + u64 total; +}; + +struct extent_node { + struct rb_node rbnode; + u64 start; + u64 len; +}; + +typedef int (*extent_cb_t)(u64 start, u64 len, void *arg); + +struct extent_cb_arg_t { + extent_cb_t cb; + void *cb_arg; +}; + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len); + +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found); +struct extent_node *extent_first(struct extent_root *root); +struct extent_node *extent_next(struct extent_node *ext); +struct extent_node *extent_prev(struct extent_node *ext); +int extent_insert_new(struct extent_root *root, u64 start, u64 len); +int extent_insert_extend(struct extent_root *root, u64 start, u64 len); +int extent_remove(struct extent_root *root, u64 start, u64 len); + +void extent_root_init(struct extent_root *root); +void extent_root_free(struct extent_root *root); +void extent_root_print(struct extent_root *root); + +#endif diff --git a/utils/src/check/image.c b/utils/src/check/image.c new file mode 100644 index 00000000..0932ece6 --- /dev/null +++ b/utils/src/check/image.c @@ -0,0 +1,540 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "bitmap.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "cmd.h" +#include "dev.h" + +#include 
"alloc.h" +#include "block.h" +#include "btree.h" +#include "log_trees.h" +#include "super.h" + +/* huh. */ +#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1) + +#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL +#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL + +struct scoutfs_meta_image_header { + __le64 magic; + __le64 total_bytes; + __le32 version; +} __packed; + +struct scoutfs_meta_image_block_header { + __le64 magic; + __le64 offset; + __le32 size; + __le32 crc; +} __packed; + +struct image_args { + char *meta_device; + bool is_read; + bool show_header; + u64 ra_window; +}; + +struct block_bitmaps { + unsigned long *bits; + u64 size; + u64 count; +}; + +#define errf(fmt, args...) \ + dprintf(STDERR_FILENO, fmt, ##args) + +static int set_meta_bit(u64 start, u64 len, void *arg) +{ + struct block_bitmaps *bm = arg; + int ret; + + if (len != 1) { + ret = -EINVAL; + } else { + if (!test_bit(bm->bits, start)) { + set_bit(bm->bits, start); + bm->count++; + } + ret = 0; + } + + return ret; +} + +static int get_ref_bits(struct block_bitmaps *bm) +{ + struct scoutfs_super_block *super = global_super; + int ret; + u64 i; + + /* + * There are almost no small blocks we need to read, so we read + * them as the large blocks that contain them to simplify the + * block reading process. 
+ */ + set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) + set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?: + btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?: + btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?: + btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?: + log_trees_meta_iter(set_meta_bit, bm); + + return ret; +} + +/* + * Note that this temporarily modifies the header that it's given. 
+ */ +static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size) +{ + __le32 saved = bh->crc; + u32 crc = ~0; + + bh->crc = 0; + crc = crc32c(crc, bh, sizeof(*bh)); + crc = crc32c(crc, buf, size); + bh->crc = saved; + + return cpu_to_le32(crc); +} + +static void printf_header(struct scoutfs_meta_image_header *hdr) +{ + errf("magic: 0x%016llx\n" + "total_bytes: %llu\n" + "version: %u\n", + le64_to_cpu(hdr->magic), + le64_to_cpu(hdr->total_bytes), + le32_to_cpu(hdr->version)); +} + +typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset); + +static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset) +{ + return read(fd, buf, count); +} + +static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset) +{ + return write(fd, buf, count); +} + +static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset) +{ + return pwrite(fd, buf, count, offset); +} + +static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset) +{ + ssize_t sret; + + while (count > 0) { + sret = func(fd, buf, count, offset); + if (sret <= 0 || sret > count) { + if (sret < 0) + return -errno; + else + return -EIO; + } + + if (tot) + *tot += sret; + buf += sret; + count -= sret; + } + + return 0; +} + +static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + u64 opening; + void *buf; + off_t off; + u64 bit; + u64 ra; + int ret; + + buf = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC); + hdr.total_bytes = cpu_to_le64(sizeof(hdr) + + (bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh)))); + hdr.version = cpu_to_le32(1); + + if 
(args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + opening = args->ra_window; + ra = 0; + bit = 0; + + for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) { + + /* readahead to open the full window, then a block at a time */ + do { + ra = find_next_set_bit(bm->bits, ra, bm->size); + if (ra < bm->size) { + off = ra << SCOUTFS_BLOCK_LG_SHIFT; + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED); + ra++; + if (opening) + opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE); + } + } while (opening > 0); + + off = bit << SCOUTFS_BLOCK_LG_SHIFT; + ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off); + if (ret < 0) + goto out; + + /* + * Might as well try to drop the pages we've used to + * reduce memory pressure on our read-ahead pages that + * are waiting. + */ + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED); + + bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + bh.offset = cpu_to_le64(off); + bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE); + bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE); + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?: + rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0); + if (ret < 0) + goto out; + } + +out: + free(buf); + + return ret; +} + +static int invalid_header(struct scoutfs_meta_image_header *hdr) +{ + if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) { + errf("bad image header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC); + + } else if (le32_to_cpu(hdr->version) != 1) { + errf("unknown image header version %u\n", le32_to_cpu(hdr->version)); + + } else { + return 0; + } + + return -EIO; +} + +/* + * Doesn't catch offset+size overflowing, presumes pwrite() will return + * an error. 
+ */ +static int invalid_block_header(struct scoutfs_meta_image_block_header *bh) +{ + if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) { + errf("bad block header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + + } else if (le32_to_cpu(bh->size) == 0) { + errf("invalid block header size %u\n", le32_to_cpu(bh->size)); + + } else if (le32_to_cpu(bh->size) > SIZE_MAX) { + errf("block header size %u too large for size_t (> %zu)\n", + le32_to_cpu(bh->size), (size_t)SIZE_MAX); + + } else if (le64_to_cpu(bh->offset) > OFF_MAX) { + errf("block header offset %llu too large for off_t (> %llu)\n", + le64_to_cpu(bh->offset), (u64)OFF_MAX); + + } else { + return 0; + } + + return -EIO; +} + +static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + size_t writeback_batch = (2 * 1024 * 1024); + size_t buf_size; + size_t dirty; + size_t size; + off_t first; + off_t last; + off_t off; + __le32 calc; + void *buf; + u64 tot; + int ret; + + tot = 0; + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + if (args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = invalid_header(&hdr); + if (ret < 0) + goto out; + + dirty = 0; + first = OFF_MAX; + last = 0; + buf = NULL; + buf_size = 0; + + while (tot < le64_to_cpu(hdr.total_bytes)) { + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0); + if (ret < 0) + goto out; + + ret = invalid_block_header(&bh); + if (ret < 0) + goto out; + + size = le32_to_cpu(bh.size); + if (buf_size < size) { + buf = realloc(buf, size); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + buf_size = size; + } + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0); + if (ret < 0) + goto out; + + calc = calc_crc(&bh, buf, size); + if (calc != bh.crc) { + errf("crc 
err"); + ret = -EIO; + goto out; + } + + off = le64_to_cpu(bh.offset); + + ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off); + if (ret < 0) + goto out; + + dirty += size; + first = min(first, off); + last = max(last, off); + if (dirty >= writeback_batch) { + posix_fadvise(fd, first, last, POSIX_FADV_DONTNEED); + dirty = 0; + first = OFF_MAX; + last = 0; + } + } + + ret = fsync(fd); + if (ret < 0) { + ret = -errno; + goto out; + } + +out: + return ret; +} + +static int do_image(struct image_args *args) +{ + struct block_bitmaps bm = { .bits = NULL }; + int meta_fd = -1; + u64 dev_size; + mode_t mode; + int ret; + + mode = args->is_read ? O_RDONLY : O_RDWR; + + meta_fd = open(args->meta_device, mode); + if (meta_fd < 0) { + ret = -errno; + errf("failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + if (args->is_read) { + ret = flush_device(meta_fd); + if (ret < 0) + goto out; + + ret = get_device_size(args->meta_device, meta_fd, &dev_size); + if (ret < 0) + goto out; + + bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE); + bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8); + if (!bm.bits) { + ret = -ENOMEM; + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?: + check_supers(-1) ?: + get_ref_bits(&bm) ?: + read_image(args, meta_fd, &bm); + block_shutdown(); + } else { + ret = write_image(args, meta_fd, &bm); + } +out: + free(bm.bits); + + if (meta_fd >= 0) + close(meta_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct image_args *args = state->input; + int ret; + + switch (key) { + case 'h': + args->show_header = true; + break; + case 'r': + ret = parse_u64(arg, &args->ra_window); + if (ret) + argp_error(state, "readahead winddoe parse error"); + break; + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device 
arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" }, + { "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" }, + { NULL } +}; + +static struct argp read_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Read metadata image stream from metadata device file" +}; + +#define DEFAULT_RA_WINDOW (512 * 1024) + +static int read_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = true, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static struct argp write_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Write metadata image stream to metadata device file" +}; + +static int write_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = false, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static void __attribute__((constructor)) image_ctor(void) +{ + cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd); + cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd); +} diff --git a/utils/src/check/iter.h b/utils/src/check/iter.h new file mode 100644 index 00000000..54c5d13b --- /dev/null +++ b/utils/src/check/iter.h @@ -0,0 +1,15 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_ +#define _SCOUTFS_UTILS_CHECK_ITER_H_ + +/* + * Callbacks can return a weird -errno that we'll never use to indicate + * that iteration can stop and return 0 for success. 
+ */ +#define ECHECK_ITER_DONE EL2HLT + +static inline int xlate_iter_errno(int ret) +{ + return ret == -ECHECK_ITER_DONE ? 0 : ret; +} + +#endif diff --git a/utils/src/check/log_trees.c b/utils/src/check/log_trees.c new file mode 100644 index 00000000..627052c7 --- /dev/null +++ b/utils/src/check/log_trees.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "log_trees.h" +#include "super.h" + +struct iter_args { + extent_cb_t cb; + void *cb_arg; +}; + +static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct iter_args *ia = cb_arg; + struct scoutfs_log_trees *lt; + int ret; + + if (val_len != sizeof(struct scoutfs_log_trees)) + ; /* XXX */ + + lt = val; + + sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + sns_push("meta_avail", 0, 0); + ret = alloc_list_meta_iter(<->meta_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_freed", 0, 0); + ret = alloc_list_meta_iter(<->meta_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("item_root", 0, 0); + ret = btree_meta_iter(<->item_root, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + if (lt->bloom_ref.blkno) { + sns_push("bloom_ref", 0, 0); + ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + sns_push("data_avail", 0, 0); + ret = alloc_root_meta_iter(<->data_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_freed", 0, 0); + ret = alloc_root_meta_iter(<->data_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + ret = 0; +out: + 
sns_pop(); + + return ret; +} + +/* + * Call the callers callback with the extent of all the metadata block references contained + * in log btrees. We walk the logs_root btree items and walk all the metadata structures + * they reference. + */ +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_super_block *super = global_super; + struct iter_args ia = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&super->logs_root, lt_meta_iter, &ia); +} diff --git a/utils/src/check/log_trees.h b/utils/src/check/log_trees.h new file mode 100644 index 00000000..7a7150b1 --- /dev/null +++ b/utils/src/check/log_trees.h @@ -0,0 +1,8 @@ +#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ +#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ + +#include "extent.h" + +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/meta.c b/utils/src/check/meta.c new file mode 100644 index 00000000..40a2e5a5 --- /dev/null +++ b/utils/src/check/meta.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "sns.h" +#include "log_trees.h" +#include "meta.h" +#include "problem.h" +#include "super.h" + +static struct meta_data { + struct extent_root meta_refed; + struct extent_root meta_free; + struct { + u64 ref_blocks; + u64 free_extents; + u64 free_blocks; + } stats; +} global_mdat; + +bool valid_meta_blkno(u64 blkno) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + + return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot; +} + +static bool valid_meta_extent(u64 start, u64 len) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + bool valid; + + valid = len > 0 && + start >= SCOUTFS_META_DEV_START_BLKNO && + start < tot && + len <= tot && + ((start + len) <= tot) && + ((start + len) > start); + + 
debug("start %llu len %llu valid %u", start, len, !!valid); + + if (!valid) + problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len); + + return valid; +} + +/* + * Track references to individual metadata blocks. This uses the extent + * callback type but is only ever called for single block references. + * Any reference to a block that has already been referenced is + * considered invalid and is ignored. Later repair will resolve + * duplicate references. + */ +static int insert_meta_ref(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + /* this is tracking single metadata block references */ + if (len != 1) { + ret = -EINVAL; + goto out; + } + + if (valid_meta_blkno(start)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) + mdat->stats.ref_blocks++; + else if (ret == -EEXIST) + problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start); + } + +out: + return ret; +} + +static int insert_meta_free(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + if (valid_meta_extent(start, len)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) { + mdat->stats.free_extents++; + mdat->stats.free_blocks++; + + } else if (ret == -EEXIST) { + problem(PB_META_FREE_OVERLAPS_EXISTING, + "start %llu llen %llu", start, len); + } + + } + + return ret; +} + +/* + * Walk all metadata references in the system. This walk doesn't need + * to read metadata that doesn't contain any metadata references so it + * can skip the bulk of metadata blocks. This gives us the set of + * referenced metadata blocks which we can then use to repair metadata + * allocator structures. 
+ */ +static int get_meta_refs(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_refed); + + /* XXX record reserved blocks around super as referenced */ + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("fs_root", 0, 0); + ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("logs_root", 0, 0); + ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("log_merge", 0, 0); + ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("mounted_clients", 0, 0); + ret = btree_meta_iter(&super->mounted_clients, 
insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("srch_root", 0, 0); + ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed); + if (ret < 0) + goto out; + + debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks); + ret = 0; +out: + return ret; +} + +static int get_meta_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_free); + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + debug("found %llu free metadata blocks in %llu extents", + mdat->stats.free_blocks, mdat->stats.free_extents); + ret = 0; +out: + return ret; +} + +/* + * All the space between referenced blocks must be recorded in the free + * extents. 
The free extent walk didn't check that the extents + * overlapped with references, we do that here. Remember that metadata + * block references were merged into extents here, the refed extents + * aren't necessarily all a single block. + */ +static int compare_refs_and_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct extent_node *ref; + struct extent_node *free; + struct extent_node *next; + struct extent_node *prev; + u64 expect; + u64 start; + u64 end; + + expect = 0; + ref = extent_first(&mdat->meta_refed); + free = extent_first(&mdat->meta_free); + while (ref || free) { + + debug("exp %llu ref %llu.%llu free %llu.%llu", + expect, ref ? ref->start : 0, ref ? ref->len : 0, + free ? free->start : 0, free ? free->len : 0); + + /* referenced marked free, remove ref from free and continue from same point */ + if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) { + debug("ref extent %llu.%llu overlaps free %llu %llu", + ref->start, ref->len, free->start, free->len); + + start = max(ref->start, free->start); + end = min(ref->start + ref->len, free->start + free->len); + + prev = extent_prev(free); + + extent_remove(&mdat->meta_free, start, end - start); + + if (prev) + free = extent_next(prev); + else + free = extent_first(&mdat->meta_free); + continue; + } + + /* see which extent starts earlier */ + if (!free || (ref && ref->start <= free->start)) + next = ref; + else + next = free; + + /* untracked region before next extent */ + if (expect < next->start) { + debug("missing free extent %llu.%llu", expect, next->start - expect); + expect = next->start; + continue; + } + + + /* didn't overlap, advance past next extent */ + expect = next->start + next->len; + if (next == ref) + ref = extent_next(ref); + else + free = extent_next(free); + } + + return 0; +} + +/* + * Check the metadata allocators by comparing the set of referenced + * blocks with the set of free blocks that are stored in free btree + * items and alloc list 
blocks. + */ +int check_meta_alloc(void) +{ + int ret; + + ret = get_meta_refs(); + if (ret < 0) + goto out; + + ret = get_meta_free(); + if (ret < 0) + goto out; + + ret = compare_refs_and_free(); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} diff --git a/utils/src/check/meta.h b/utils/src/check/meta.h new file mode 100644 index 00000000..80c97a03 --- /dev/null +++ b/utils/src/check/meta.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_META_H_ +#define _SCOUTFS_UTILS_CHECK_META_H_ + +bool valid_meta_blkno(u64 blkno); + +int check_meta_alloc(void); + +#endif + diff --git a/utils/src/check/padding.c b/utils/src/check/padding.c new file mode 100644 index 00000000..81e12c33 --- /dev/null +++ b/utils/src/check/padding.c @@ -0,0 +1,23 @@ +#include +#include + +#include "util.h" +#include "padding.h" + +bool padding_is_zeros(const void *data, size_t sz) +{ + static char zeros[32] = {0,}; + const size_t batch = array_size(zeros); + + while (sz >= batch) { + if (memcmp(data, zeros, batch)) + return false; + data += batch; + sz -= batch; + } + + if (sz > 0 && memcmp(data, zeros, sz)) + return false; + + return true; +} diff --git a/utils/src/check/padding.h b/utils/src/check/padding.h new file mode 100644 index 00000000..9bf03a81 --- /dev/null +++ b/utils/src/check/padding.h @@ -0,0 +1,6 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_ +#define _SCOUTFS_UTILS_CHECK_PADDING_H_ + +bool padding_is_zeros(const void *data, size_t sz); + +#endif diff --git a/utils/src/check/problem.c b/utils/src/check/problem.c new file mode 100644 index 00000000..fd8d42a9 --- /dev/null +++ b/utils/src/check/problem.c @@ -0,0 +1,44 @@ +#include +#include + +#include "problem.h" + +#define PROB_STR(pb) [pb] = #pb +char *prob_strs[] = { + PROB_STR(PB_META_EXTENT_INVALID), + PROB_STR(PB_META_REF_OVERLAPS_EXISTING), + PROB_STR(PB_META_FREE_OVERLAPS_EXISTING), + PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL), + PROB_STR(PB_SB_HDR_CRC_INVALID), + PROB_STR(PB_SB_HDR_MAGIC_INVALID), + 
PROB_STR(PB_FS_IN_USE), + PROB_STR(PB_MOUNTED_CLIENTS_REF_BLKNO), + PROB_STR(PB_SB_BAD_FLAG), + PROB_STR(PB_SB_BAD_FMT_VERS), + PROB_STR(PB_QCONF_WRONG_VERSION), + PROB_STR(PB_QSLOT_BAD_FAM), + PROB_STR(PB_QSLOT_BAD_PORT), + PROB_STR(PB_QSLOT_NO_ADDR), + PROB_STR(PB_QSLOT_BAD_ADDR), + PROB_STR(PB_DATA_DEV_SB_INVALID), +}; + +static struct problem_data { + uint64_t counts[PB__NR]; + uint64_t count; +} global_pdat; + +void problem_record(prob_t pb) +{ + struct problem_data *pdat = &global_pdat; + + pdat->counts[pb]++; + pdat->count++; +} + +uint64_t problems_count(void) +{ + struct problem_data *pdat = &global_pdat; + + return pdat->count; +} diff --git a/utils/src/check/problem.h b/utils/src/check/problem.h new file mode 100644 index 00000000..6ac49bb5 --- /dev/null +++ b/utils/src/check/problem.h @@ -0,0 +1,38 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_ +#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_ + +#include "debug.h" +#include "sns.h" + +typedef enum { + PB_META_EXTENT_INVALID, + PB_META_REF_OVERLAPS_EXISTING, + PB_META_FREE_OVERLAPS_EXISTING, + PB_BTREE_BLOCK_BAD_LEVEL, + PB_SB_HDR_CRC_INVALID, + PB_SB_HDR_MAGIC_INVALID, + PB_FS_IN_USE, + PB_MOUNTED_CLIENTS_REF_BLKNO, + PB_SB_BAD_FLAG, + PB_SB_BAD_FMT_VERS, + PB_QCONF_WRONG_VERSION, + PB_QSLOT_BAD_FAM, + PB_QSLOT_BAD_PORT, + PB_QSLOT_NO_ADDR, + PB_QSLOT_BAD_ADDR, + PB_DATA_DEV_SB_INVALID, + PB__NR, +} prob_t; + +extern char *prob_strs[]; + +#define problem(pb, fmt, ...) \ +do { \ + debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \ + problem_record(pb); \ +} while (0) + +void problem_record(prob_t pb); +uint64_t problems_count(void); + +#endif diff --git a/utils/src/check/sns.c b/utils/src/check/sns.c new file mode 100644 index 00000000..45f45453 --- /dev/null +++ b/utils/src/check/sns.c @@ -0,0 +1,118 @@ +#include +#include + +#include "sns.h" + +/* + * This "str num stack" is used to describe our location in metadata at + * any given time. 
+ * + * As we descend into structures we pop a string on decribing them, + * perhaps with associated numbers. Pushing and popping is very cheap + * and only rarely do we format the stack into a string, as an arbitrary + * example: + * super.fs_root.btree_parent:1231.btree_leaf:3231" + */ + +#define SNS_MAX_DEPTH 1000 +#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1)) + +static struct sns_data { + unsigned int depth; + + struct sns_entry { + char *str; + size_t len; + u64 a; + u64 b; + } ents[SNS_MAX_DEPTH]; + + char str[SNS_STR_SIZE]; + +} global_lsdat; + +void _sns_push(char *str, size_t len, u64 a, u64 b) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth < SNS_MAX_DEPTH) { + lsdat->ents[lsdat->depth++] = (struct sns_entry) { + .str = str, + .len = len, + .a = a, + .b = b, + }; + } +} + +void sns_pop(void) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth > 0) + lsdat->depth--; +} + +static char *append_str(char *pos, char *str, size_t len) +{ + memcpy(pos, str, len); + return pos + len; +} + +/* + * This is not called for x = 0 so we don't need to emit an initial 0. + * We could by using do {} while instead of while {}. + */ +static char *append_u64x(char *pos, u64 x) +{ + static char hex[] = "0123456789abcdef"; + + while (x) { + *pos++ = hex[x & 0xf]; + x >>= 4; + } + + return pos; +} + +static char *append_char(char *pos, char c) +{ + *(pos++) = c; + return pos; +} + +/* + * Return a pointer to a null terminated string that describes the + * current location stack. The string buffer is global. 
+ */ +char *sns_str(void) +{ + struct sns_data *lsdat = &global_lsdat; + struct sns_entry *ent; + char *pos; + int i; + + pos = lsdat->str; + for (i = 0; i < lsdat->depth; i++) { + ent = &lsdat->ents[i]; + + if (i) + pos = append_char(pos, '.'); + + pos = append_str(pos, ent->str, ent->len); + + if (ent->a) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->a); + } + + if (ent->b) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->b); + } + } + + *pos = '\0'; + + return lsdat->str; +} diff --git a/utils/src/check/sns.h b/utils/src/check/sns.h new file mode 100644 index 00000000..34c1a2be --- /dev/null +++ b/utils/src/check/sns.h @@ -0,0 +1,20 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_ +#define _SCOUTFS_UTILS_CHECK_SNS_H_ + +#include + +#include "sparse.h" + +#define SNS_MAX_STR_LEN 20 + +#define sns_push(str, a, b) \ +do { \ + build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \ + _sns_push((str), sizeof(str) - 1, a, b); \ +} while (0) + +void _sns_push(char *str, size_t len, u64 a, u64 b); +void sns_pop(void); +char *sns_str(void); + +#endif diff --git a/utils/src/check/super.c b/utils/src/check/super.c new file mode 100644 index 00000000..e3c14fae --- /dev/null +++ b/utils/src/check/super.c @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "crc.h" + +#include "block.h" +#include "super.h" +#include "problem.h" + +/* + * After we check the super blocks we provide a global buffer to track + * the current super block. It is referenced to get static information + * about the system and is also modified and written as part of + * transactions. + */ +struct scoutfs_super_block *global_super; + +/* + * Check superblock crc. We can't use global_super here since it's not the + * whole block itself, but only the struct scoutfs_super_block, so it needs + * to reload a copy here. 
+ */ +int check_super_crc(void) +{ + struct scoutfs_super_block *super = NULL; + struct scoutfs_block_header *hdr; + struct block *blk = NULL; + u32 crc; + int ret; + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_DIRTY); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + return ret; + } + + super = block_buf(blk); + crc = crc_block((struct scoutfs_block_header *)super, block_size(blk)); + hdr = &global_super->hdr; + debug("superblock crc 0x%04x calculated 0x%04x " "%s", le32_to_cpu(hdr->crc), crc, le32_to_cpu(hdr->crc) == crc ? "(match)" : "(mismatch)"); + + if (crc != le32_to_cpu(hdr->crc)) + problem(PB_SB_HDR_CRC_INVALID, "crc 0x%04x calculated 0x%04x", le32_to_cpu(hdr->crc), crc); + block_put(&blk); + + return 0; +} + +/* + * Crude check for the unlikely cases where the fs appears to still be mounted. + */ +int check_super_in_use(int meta_fd) +{ + int ret = meta_super_in_use(meta_fd, global_super); + debug("meta_super_in_use ret %d", ret); + + if (ret < 0) + problem(PB_FS_IN_USE, "File system appears in use. ret %d", ret); + + debug("global_super->mounted_clients.ref.blkno 0x%08llx", global_super->mounted_clients.ref.blkno); + if (global_super->mounted_clients.ref.blkno != 0) + problem(PB_MOUNTED_CLIENTS_REF_BLKNO, "Mounted clients ref blkno 0x%08llx", + global_super->mounted_clients.ref.blkno); + + return ret; +} + +/* + * quick glance data device superblock checks. + * + * -EIO for crc failures, all others -EINVAL + * + * caller must have run check_supers() first so that global_super is + * setup, so that we can cross-ref to it. 
+ */ +static int check_data_super(int data_fd) +{ + struct scoutfs_super_block *super = NULL; + char *buf; + int ret = 0; + u32 crc; + ssize_t size = SCOUTFS_BLOCK_SM_SIZE; + off_t off = SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT; + + buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!buf) + return -ENOMEM; + + memset(buf, 0, size); + + if (lseek(data_fd, off, SEEK_SET) != off) + return -errno; + + if (read(data_fd, buf, size) < 0) { + ret = -errno; + goto out; + } + + super = (struct scoutfs_super_block *)buf; + + crc = crc_block((struct scoutfs_block_header *)buf, size); + + debug("data fsid 0x%016llx", le64_to_cpu(super->hdr.fsid)); + debug("data super magic 0x%04x", super->hdr.magic); + debug("data crc calc 0x%08x exp 0x%08x %s", crc, le32_to_cpu(super->hdr.crc), + crc == le32_to_cpu(super->hdr.crc) ? "(match)" : "(mismatch)"); + debug("data flags %llu fmt_vers %llu", le64_to_cpu(super->flags), le64_to_cpu(super->fmt_vers)); + + if (crc != le32_to_cpu(super->hdr.crc)) + /* tis but a scratch */ + ret = -EIO; + + if (le64_to_cpu(super->hdr.fsid) != le64_to_cpu(global_super->hdr.fsid)) + /* mismatched data bdev? not good */ + ret = -EINVAL; + + if (le32_to_cpu(super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + /* fsid matched but not a superblock? yikes */ + ret = -EINVAL; + + if (le64_to_cpu(super->flags) != 0) /* !SCOUTFS_FLAG_IS_META_BDEV */ + ret = -EINVAL; + + if ((le64_to_cpu(super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + ret = -EINVAL; + + if (ret != 0) + problem(PB_DATA_DEV_SB_INVALID, "data device is invalid or corrupt (%d)", ret); +out: + free(buf); + return ret; +} + +/* + * After checking the supers we save a copy of it in a global buffer that's used by + * other modules to track the current super. It can be modified and written during commits. 
+ */ +int check_supers(int data_fd) +{ + struct scoutfs_super_block *super = NULL; + struct block *blk = NULL; + struct scoutfs_quorum_slot* slot = NULL; + struct in_addr in; + uint16_t family; + uint16_t port; + int ret; + + sns_push("supers", 0, 0); + + global_super = malloc(sizeof(struct scoutfs_super_block)); + if (!global_super) { + fprintf(stderr, "error allocating super block buffer\n"); + ret = -ENOMEM; + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + goto out; + } + + ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER); + + super = block_buf(blk); + + if (ret < 0) { + /* */ + if (ret == -EINVAL) { + /* that's really bad */ + fprintf(stderr, "superblock invalid magic\n"); + goto out; + } else if (ret == -EIO) + /* just report/count a CRC error */ + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + } + + memcpy(global_super, super, sizeof(struct scoutfs_super_block)); + + debug("Superblock flag: %llu", global_super->flags); + if (le64_to_cpu(global_super->flags) != SCOUTFS_FLAG_IS_META_BDEV) + problem(PB_SB_BAD_FLAG, "Bad flag: %llu expecting: 1 or 0", global_super->flags); + + debug("Superblock fmt_vers: %llu", le64_to_cpu(global_super->fmt_vers)); + if ((le64_to_cpu(global_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(global_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + problem(PB_SB_BAD_FMT_VERS, "Bad fmt_vers: %llu outside supported range (%d-%d)", + le64_to_cpu(global_super->fmt_vers), SCOUTFS_FORMAT_VERSION_MIN, + SCOUTFS_FORMAT_VERSION_MAX); + + debug("Quorum Config Version: %llu", global_super->qconf.version); + if (le64_to_cpu(global_super->qconf.version) != 1) + problem(PB_QCONF_WRONG_VERSION, "Wrong Version: %llu (expected 1)", global_super->qconf.version); + + for (int i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + slot = 
&global_super->qconf.slots[i]; + family = le16_to_cpu(slot->addr.v4.family); + port = le16_to_cpu(slot->addr.v4.port); + in.s_addr = le32_to_cpu(slot->addr.v4.addr); + + if (family == SCOUTFS_AF_NONE) { + debug("Quorum slot %u is empty", i); + continue; + } + + debug("Quorum slot %u family: %u, port: %u, address: %s", i, family, port, inet_ntoa(in)); + if (family != SCOUTFS_AF_IPV4) + problem(PB_QSLOT_BAD_FAM, "Quorum Slot %u doesn't have valid address", i); + + if (port == 0) + problem(PB_QSLOT_BAD_PORT, "Quorum Slot %u has bad port", i); + + if (!in.s_addr) { + problem(PB_QSLOT_NO_ADDR, "Quorum Slot %u has not been assigned ipv4 address", i); + } else if (!(in.s_addr & 0xff000000)) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } else if ((in.s_addr & 0xff) == 0xff) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } + } + + debug("super magic 0x%04x", global_super->hdr.magic); + if (le32_to_cpu(global_super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + global_super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + + /* `scoutfs image` command doesn't open data_fd */ + if (data_fd < 0) + ret = 0; + else + ret = check_data_super(data_fd); +out: + block_put(&blk); + + sns_pop(); + + return ret; +} + +void super_shutdown(void) +{ + free(global_super); +} diff --git a/utils/src/check/super.h b/utils/src/check/super.h new file mode 100644 index 00000000..f14417ba --- /dev/null +++ b/utils/src/check/super.h @@ -0,0 +1,12 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_ +#define _SCOUTFS_UTILS_CHECK_SUPER_H_ + +extern struct scoutfs_super_block *global_super; + +int check_super_crc(); +int check_supers(int data_fd); +int super_commit(void); +int check_super_in_use(int meta_fd); +void super_shutdown(void); + +#endif diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c new file mode 100644 index 00000000..4a1ab3fd --- 
/dev/null +++ b/utils/src/parallel_restore.c @@ -0,0 +1,1986 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "rand.h" +#include "key.h" +#include "bitops.h" +#include "btree.h" +#include "leaf_item_hash.h" +#include "name_hash.h" +#include "mode_types.h" +#include "srch.h" +#include "bloom.h" + +#include "parallel_restore.h" + +#include "list.h" +#include "lk_rbtree_wrapper.h" + +/* + * XXX + * - interface versioning? + * - next seq and next ino are both max ino + 1 + * - fix writer builder layout to match super, users except for build order + * - look into zeroing buffers consistently + * - init_alb looks weird? naming consistency? + * - make sure inode_count makes sense (fs root, log deltas) + * - audit file types + */ + +#define dprintf(fmt, args...) \ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +struct btree_item { + struct rb_node node; + struct scoutfs_key key; + unsigned int val_len; + void *val; +}; + +struct srch_node { + struct rb_node node; + u64 hash; + u64 ino; + u64 id; +}; + +struct block_builder; +typedef bool (*bld_empty_t)(struct block_builder *bld); +typedef void (*bld_reset_t)(struct block_builder *bld); +typedef spr_err_t (*bld_build_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno); +typedef spr_err_t (*bld_post_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld); + +struct block_builder { + struct list_head head; + bld_empty_t empty; + bld_reset_t reset; + bld_build_t build; + bld_post_t post; +}; + +struct btree_builder { + struct block_builder bld; + + /* track all items */ + u64 total_items; + /* track total length of extent items */ + u64 total_len; + + /* eventual root that references built blocks */ + struct scoutfs_btree_root btroot; + + /* blocks are 
built as levels accumulate sufficient items */ + struct { + struct rb_root root; + unsigned long nr; + } items[SCOUTFS_BTREE_MAX_HEIGHT]; +}; + +struct alloc_list_builder { + struct block_builder bld; + u64 start; + u64 len; + struct scoutfs_alloc_list_head lhead; +}; + +/* + * srch parent radix fanout is really wide, it doesn't take many to have + * 2^64 bytes in entry blocks. + */ +#define MAX_SRCH_HEIGHT 6 + +struct srch_builder { + struct block_builder bld; + + /* accumulates blocks/entries as we build */ + struct scoutfs_srch_file sfl; + + /* no parents at level 0, [0] never used */ + u64 total_parent_refs; + struct { + struct scoutfs_block_ref *refs; + unsigned long nr; + } parents[MAX_SRCH_HEIGHT]; + + struct rb_root entries; +}; + +struct bloom_builder { + struct block_builder bld; + struct scoutfs_bloom_block *bloom; +}; + +struct scoutfs_parallel_restore_writer { + u64 inode_count; + u64 max_ino; + + __le64 fsid; + u64 meta_start; + u64 meta_len; + struct list_head meta_extents; + + struct list_head builders; + struct btree_builder meta_btb[2]; + struct btree_builder data_btb; + struct alloc_list_builder meta_alb[2]; + struct btree_builder root_btb; + struct btree_builder fs_btb; + struct btree_builder srch_btb; + struct btree_builder log_btb; + + struct srch_builder srch_sbld; + struct bloom_builder bloom_bbld; + + struct scoutfs_btree_root root_items; + struct scoutfs_super_block super; +}; + +struct extent_head { + struct list_head head; + u64 start; + u64 len; +}; + +static void init_builder(struct block_builder *bld, bld_empty_t empty, bld_reset_t reset, + bld_build_t build) +{ + INIT_LIST_HEAD(&bld->head); + bld->empty = empty; + bld->reset = reset; + bld->build = build; + bld->post = NULL; +} + +static spr_err_t meta_alloc_add(struct scoutfs_parallel_restore_writer *wri, + u64 start, u64 len) +{ + struct extent_head *eh; + + if (len == 0) + return 0; + + if (wri->meta_len == 0) { + wri->meta_start = start; + wri->meta_len = len; + } else { + eh = 
malloc(sizeof(struct extent_head)); + if (!eh) + return ENOMEM; + eh->start = start; + eh->len = len; + list_add_tail(&eh->head, &wri->meta_extents); + } + + return 0; +} + +static spr_err_t meta_alloc_contig(struct scoutfs_parallel_restore_writer *wri, + u64 prev, u64 *blkno_ret) +{ + struct extent_head *eh; + + if (prev && wri->meta_len && (wri->meta_start != prev + 1)) { + *blkno_ret = 0; + return 0; + } + + if (!wri->meta_len) { + *blkno_ret = 0; + return ENOSPC; + } + + *blkno_ret = wri->meta_start++; + + if (--wri->meta_len == 0 && !list_empty(&wri->meta_extents)) { + eh = list_entry(wri->meta_extents.next, struct extent_head, head); + wri->meta_start = eh->start; + wri->meta_len = eh->len; + free(eh); + } + + return 0; +} + +static spr_err_t bti_alloc(int val_len, struct btree_item **bti_ret) +{ + struct btree_item *bti; + spr_err_t err; + + bti = malloc(sizeof(struct btree_item) + val_len); + if (bti) { + bti->val = (void *)(bti + 1); + bti->val_len = val_len; + err = 0; + } else { + err = ENOMEM; + } + + *bti_ret = bti; + return err; +} + +static struct btree_item *bti_walk(struct rb_root *root, struct scoutfs_key *key, + struct btree_item *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct btree_item *found = NULL; + struct btree_item *bti; + int cmp; + + while (*node) { + parent = *node; + bti = container_of(*node, struct btree_item, node); + + cmp = scoutfs_key_compare(key, &bti->key); + if (cmp < 0) { + node = &(*node)->rb_left; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } else { + found = bti; + break; + } + } + + if (ins && !found) { + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + } + + return found; +} + +static struct btree_item *node_bti(struct rb_node *node) +{ + return node ? 
container_of(node, struct btree_item, node) : NULL; +} + +static struct btree_item *bti_first(struct rb_root *root) +{ + return node_bti(rb_first(root)); +} + +static struct btree_item *bti_next(struct btree_item *bti) +{ + return bti ? node_bti(rb_next(&bti->node)) : NULL; +} + +#define for_each_bti_safe(root, bti, tmp) \ + for (bti = bti_first(root); bti && ((tmp = bti_next(bti)), 1); bti = tmp) + +/* + * It's always an error to try and insert a key that was already tracked + * in a btree level. + */ +static spr_err_t btb_insert(struct btree_builder *btb, struct btree_item *bti, int level) +{ + struct btree_item *found; + + found = bti_walk(&btb->items[level].root, &bti->key, bti); + if (found) { + return EEXIST; + } else { + btb->items[level].nr++; + btb->total_items++; + return 0; + } +} + +static void btb_erase(struct btree_builder *btb, struct btree_item *bti, int level) +{ + rb_erase(&bti->node, &btb->items[level].root); + btb->items[level].nr--; + btb->total_items--; +} + +static void btb_destroy(struct btree_builder *btb) +{ + struct btree_item *bti; + struct btree_item *tmp; + int i; + + for (i = 0; i < array_size(btb->items); i++) { + for_each_bti_safe(&btb->items[i].root, bti, tmp) { + btb_erase(btb, bti, i); + free(bti); + } + } +} + +static void init_key(struct scoutfs_key *key, u8 zone, u8 type, u64 first, u64 second, + u64 third, u8 fourth) +{ + key->_sk_first = cpu_to_le64(first); + key->_sk_second = cpu_to_le64(second); + key->_sk_third = cpu_to_le64(third); + key->_sk_fourth = fourth; + key->sk_zone = zone; + key->sk_type = type; + memset(&key->__pad, 0, sizeof(key->__pad)); +} + +static u64 free_extent_order(u64 len) +{ + return (fls64(len | 1) - 1) / 3; +} + +static int insert_free_items(struct btree_builder *btb, u64 start, u64 len) +{ + struct scoutfs_key keys[2]; + struct btree_item *bti; + spr_err_t err; + u64 order; + u64 end; + int i; + + end = start + len - 1; + order = U64_MAX - free_extent_order(len); + + init_key(&keys[0], 
SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, end, len, 0, 0); + init_key(&keys[1], SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, order, end, len, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(0, &bti); + if (err) + goto out; + + bti->key = keys[i]; + + err = btb_insert(btb, bti, 0); + if (err) { + free(bti); + goto out; + } + } + + btb->total_len += len; + + err = 0; +out: + return err; +} + +static void set_alloc_root(struct scoutfs_alloc_root *root, struct btree_builder *btb) +{ + root->total_len = cpu_to_le64(btb->total_len); + root->flags = 0; + root->_pad = 0; + root->root = btb->btroot; +} + +static spr_err_t map_start_key(struct scoutfs_key *start, struct scoutfs_key *key) +{ + if (key->sk_zone == SCOUTFS_FS_ZONE) { + init_key(start, SCOUTFS_FS_ZONE, 0, + le64_to_cpu(key->_sk_first) & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK, + 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_XATTR_TOTL_ZONE) { + init_key(start, SCOUTFS_XATTR_TOTL_ZONE, 0, 0, 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_INODE_INDEX_ZONE) { + init_key(start, SCOUTFS_INODE_INDEX_ZONE, 0, 0, + le64_to_cpu(key->_sk_second) & ~(u64)SCOUTFS_LOCK_SEQ_GROUP_MASK, + 0, 0); + } else if (key->sk_zone == SCOUTFS_QUOTA_ZONE) { + init_key(start, SCOUTFS_QUOTA_ZONE, 0, 0, 0, 0, 0); + } else { + return EINVAL; + } + + return 0; +} + +static spr_err_t update_bloom(struct bloom_builder *bbld, struct scoutfs_key *key) +{ + struct scoutfs_bloom_block *bb = bbld->bloom; + unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS]; + struct scoutfs_key start; + spr_err_t err; + int i; + + err = map_start_key(&start, key); + if (err) + goto out; + + calc_bloom_nrs(&start, nrs); + + for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) { + if (!test_and_set_bit_le(nrs[i], bb->bits)) + le64_add_cpu(&bb->total_set, 1); + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_fs_item(struct scoutfs_parallel_restore_writer *wri, + struct btree_item *bti) +{ + spr_err_t err; + + if (bti->key.sk_zone == SCOUTFS_FS_ZONE && 
bti->key.sk_type == SCOUTFS_INODE_TYPE && + le64_to_cpu(bti->key.ski_ino) == SCOUTFS_ROOT_INO) { + err = btb_insert(&wri->root_btb, bti, 0); + } else { + err = btb_insert(&wri->fs_btb, bti, 0) ?: + update_bloom(&wri->bloom_bbld, &bti->key); + } + + return err; +} + +static spr_err_t insert_entry_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + struct scoutfs_dirent *dent = NULL; + struct scoutfs_key keys[3]; + struct btree_item *bti; + unsigned int bytes; + spr_err_t err = 0; + u64 dir_ino; + u64 hash; + u64 ino; + u64 pos; + int i; + + bytes = offsetof(struct scoutfs_dirent, name[entry->name_len]); + dent = malloc(bytes); + if (!dent) { + err = ENOMEM; + goto out; + } + + dir_ino = entry->dir_ino; + ino = entry->ino; + hash = dirent_name_hash(entry->name, entry->name_len); + pos = entry->pos; + + dent->ino = cpu_to_le64(ino); + dent->hash = cpu_to_le64(hash); + dent->pos = cpu_to_le64(pos); + dent->type = mode_to_type(entry->mode); + memset(&dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent->name, entry->name, entry->name_len); + + init_key(&keys[0], SCOUTFS_FS_ZONE, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos, 0); + init_key(&keys[1], SCOUTFS_FS_ZONE, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0, 0); + init_key(&keys[2], SCOUTFS_FS_ZONE, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = keys[i]; + memcpy(bti->val, dent, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + free(dent); + return err; +} + +static spr_err_t insert_extent_item(struct scoutfs_parallel_restore_writer *wri, u64 ino, u64 len) +{ + struct scoutfs_data_extent_val *dv; + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_DATA_EXTENT_TYPE, ino, 0 + len - 1, len, 0); + + err = bti_alloc(sizeof(struct 
scoutfs_data_extent_val), &bti); + if (!err) { + bti->key = key; + dv = bti->val; + dv->blkno = 0; + dv->flags = SEF_OFFLINE; + + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return err; +} + +/* + * We're trusting that the caller hasn't made up garbage xattrs. + * All we have to do is check for the scoutfs prefix and then + * identify the sequence of known tags. There can be a lot more + * xattrs than files so this is a surprisingly hot path. + */ +#define HIDE_BE32 cpu_to_be32(0x68696465) +#define SRCH_BE32 cpu_to_be32(0x73726368) +#define TOTL_BE32 cpu_to_be32(0x746f746c) +#define TAG_LEN 5 +#define XTAG_SRCH (1 << 1) +#define XTAG_TOTL (1 << 2) +static int get_xattr_tags(char *name, int name_len) +{ + static const char prefix[] = "scoutfs."; + static const size_t prefix_len = array_size(prefix) - 1; + __be32 betag; + int xtags = 0; + + if (name_len < prefix_len || strncmp(name, prefix, prefix_len)) + return 0; + + name += prefix_len; + name_len -= prefix_len; + + while (name_len >= TAG_LEN && name[TAG_LEN - 1] == '.') { + memcpy(&betag, name, sizeof(betag)); + + dprintf("tag 0x%08x\n", be32_to_cpu(betag)); + + if (betag == HIDE_BE32) + ; + else if (betag == SRCH_BE32) + xtags |= XTAG_SRCH; + else if (betag == TOTL_BE32) + xtags |= XTAG_TOTL; + else + break; + + name += TAG_LEN; + name_len -= TAG_LEN; + } + + dprintf("xat name %.*s tags 0x%x\n", name_len, name, xtags); + + return xtags; +} + +static spr_err_t insert_xattr_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr, u32 hash) +{ + struct scoutfs_xattr xat; + struct iovec value[3] = { + { &xat, sizeof(xat) }, + { xattr->name, xattr->name_len, }, + { xattr->value, xattr->value_len, }, + }; + struct iovec *iov = value; + struct scoutfs_key key; + struct btree_item *bti; + unsigned int total; + unsigned int bytes; + unsigned int piece; + spr_err_t err; + char *buf; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_XATTR_TYPE, xattr->ino, hash, 
xattr->pos, 0); + total = value[0].iov_len + value[1].iov_len + value[2].iov_len; + + xat.val_len = cpu_to_le16(xattr->value_len); + xat.name_len = xattr->name_len; + memset(xat.__pad, 0, sizeof(xat.__pad)); + + while (total > 0) { + bytes = min(total, SCOUTFS_XATTR_MAX_PART_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + buf = bti->val; + + while (bytes) { + piece = min(bytes, iov->iov_len); + memcpy(buf, iov->iov_base, piece); + buf += piece; + bytes -= piece; + total -= piece; + iov->iov_base += piece; + iov->iov_len -= piece; + if (iov->iov_len == 0) + iov++; /* falls off array when done */ + } + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + key._sk_fourth++; + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_symlink_items(struct scoutfs_parallel_restore_writer *wri, + u64 ino, char *target, int target_len) +{ + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + int bytes; + int off = 0; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_SYMLINK_TYPE, ino, 0, 0, 0); + + while (off < target_len) { + bytes = min(target_len - off, SCOUTFS_MAX_VAL_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + memcpy(bti->val, target + off, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + off += bytes; + le64_add_cpu(&key._sk_second, 1); + } + + err = 0; +out: + return err; +} + +/* forbid the leading + that strtoull allows */ +static spr_err_t totl_strtoull(char *s, int len, unsigned long long *res) +{ + char str[SCOUTFS_XATTR_MAX_TOTL_U64 + 1]; + + if (len <= 0 || len >= array_size(str) || s[0] == '+') + return EINVAL; + + memcpy(str, s, len); + str[len] = '\0'; + + errno = 0; + *res = strtoull(str, NULL, 0); + return errno; +} + +/* + * .totl. xattrs turn into items with the key based on dotted u64s at the end of the + * name and a value in the .. value. 
+ */ +static spr_err_t insert_totl_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + static const char prefix[] = "scoutfs.totl."; + static const int prefix_len = sizeof(prefix) - 1; + struct scoutfs_xattr_totl_val *found_tval; + struct scoutfs_xattr_totl_val *tval; + struct btree_item *found; + struct btree_item *bti; + unsigned long long longs[3]; + unsigned long long v; + spr_err_t err; + int nr = 0; + int prev; + int i; + + prev = xattr->name_len; + for (i = xattr->name_len - 1; i > prefix_len; i--) { + if (xattr->name[i] == '.') { + err = totl_strtoull(&xattr->name[i + 1], prev - (i + 1), &longs[nr]); + if (err) + goto out; + if (++nr == array_size(longs)) + break; + prev = i; + } + } + if (nr != array_size(longs)) { + err = EINVAL; + goto out; + } + + err = totl_strtoull(xattr->value, xattr->value_len, &v); + if (err) + goto out; + + if (v == 0) { + err = 0; + goto out; + } + + err = bti_alloc(sizeof(struct scoutfs_xattr_totl_val), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_XATTR_TOTL_ZONE, 0, longs[2], longs[1], longs[0], 0); + tval = bti->val; + tval->total = cpu_to_le64(v); + tval->count = cpu_to_le64(1); + + found = bti_walk(&wri->fs_btb.items[0].root, &bti->key, NULL); + if (found) { + found_tval = found->val; + le64_add_cpu(&found_tval->total, le64_to_cpu(tval->total)); + le64_add_cpu(&found_tval->count, le64_to_cpu(tval->count)); + if (found_tval->total == 0) + btb_erase(&wri->fs_btb, found, 0); + free(bti); + } else { + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_inode_index_item(struct scoutfs_parallel_restore_writer *wri, + u8 type, u64 major, u64 ino) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(0, &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_INODE_INDEX_ZONE, type, 0, major, ino, 0); + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return 
err; +} + +static spr_err_t insert_inode_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + struct scoutfs_inode *si; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_inode), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_FS_ZONE, SCOUTFS_INODE_TYPE, inode->ino, 0, 0, 0); + + si = bti->val; + + si->size = 0; + si->meta_seq = cpu_to_le64(inode->meta_seq); + si->data_seq = cpu_to_le64(inode->data_seq); + si->data_version = 0; + si->online_blocks = 0; + si->offline_blocks = 0; + si->next_readdir_pos = 0; + si->next_xattr_id = cpu_to_le64(inode->nr_xattrs + 1); + si->version = cpu_to_le64(1); + si->nlink = cpu_to_le32(inode->nlink ? inode->nlink : 1); + si->uid = cpu_to_le32(inode->uid); + si->gid = cpu_to_le32(inode->gid); + si->mode = cpu_to_le32(inode->mode); + si->flags = 0; + si->flags = cpu_to_le32(inode->flags); + si->atime.sec = cpu_to_le64(inode->atime.tv_sec); + si->atime.nsec = cpu_to_le32(inode->atime.tv_nsec); + si->ctime.sec = cpu_to_le64(inode->ctime.tv_sec); + si->ctime.nsec = cpu_to_le32(inode->ctime.tv_nsec); + si->mtime.sec = cpu_to_le64(inode->mtime.tv_sec); + si->mtime.nsec = cpu_to_le32(inode->mtime.tv_nsec); + si->crtime.sec = cpu_to_le64(inode->crtime.tv_sec); + si->crtime.nsec = cpu_to_le32(inode->crtime.tv_nsec); + si->proj = cpu_to_le64(inode->proj); + + /* XXX make sure this works across all el7/8/9 due to glibc magic */ + si->rdev = (inode->rdev & 0xff) | ((inode->rdev & 0xffffff00) << 12); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_META_SEQ_TYPE, + le64_to_cpu(si->meta_seq), inode->ino); + if (err) + goto out; + + if (S_ISREG(inode->mode)) { + si->size = cpu_to_le64(inode->size); + si->data_version = cpu_to_le64(inode->data_version); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, + le64_to_cpu(si->data_seq), inode->ino); + if (err) + goto out; + + if (inode->offline) { + si->offline_blocks = 
cpu_to_le64(DIV_ROUND_UP(inode->size, + SCOUTFS_BLOCK_SM_SIZE)); + err = insert_extent_item(wri, inode->ino, le64_to_cpu(si->offline_blocks)); + if (err) + goto out; + } + + } else if (S_ISDIR(inode->mode)) { + si->size = cpu_to_le64(inode->total_entry_name_bytes); + si->next_readdir_pos = cpu_to_le64(SCOUTFS_DIRENT_FIRST_POS + inode->nr_subdirs); + si->nlink = cpu_to_le32(2 + inode->nr_subdirs); + + } else if (S_ISLNK(inode->mode)) { + si->size = cpu_to_le64(inode->target_len); + + err = insert_symlink_items(wri, inode->ino, inode->target, inode->target_len); + if (err) + goto out; + } + + err = insert_fs_item(wri, bti); +out: + return err; +} + +static spr_err_t insert_log_trees_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + struct scoutfs_log_trees *lt; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_log_trees), &bti); + if (err) + goto out; + + lt = bti->val; + memset(lt, 0, sizeof(struct scoutfs_log_trees)); + lt->item_root = prog->fs_items; + lt->bloom_ref = prog->bloom_ref; + /* lt srch_file is blank once finalized, moved to srch_root items */ + lt->inode_count_delta = prog->inode_count; + lt->get_trans_seq = cpu_to_le64(1); + lt->commit_trans_seq = cpu_to_le64(1); + lt->max_item_seq = cpu_to_le64(1); + lt->finalize_seq = cpu_to_le64(1); + lt->rid = prog->max_ino; + lt->nr = cpu_to_le64(1); + lt->flags = cpu_to_le64(SCOUTFS_LOG_TREES_FINALIZED); + + init_key(&bti->key, SCOUTFS_LOG_TREES_ZONE, 0, + le64_to_cpu(lt->rid), le64_to_cpu(lt->nr), 0, 0); + + err = btb_insert(&wri->log_btb, bti, 0); +out: + return err; +} + +static spr_err_t insert_srch_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_srch_file *sfl) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_srch_file), &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_SRCH_ZONE, SCOUTFS_SRCH_BLOCKS_TYPE, + 0, le64_to_cpu(sfl->blocks), 
le64_to_cpu(sfl->ref.blkno), 0); + memcpy(bti->val, sfl, sizeof(struct scoutfs_srch_file)); + err = btb_insert(&wri->srch_btb, bti, 0); + } + + return err; +} + +static spr_err_t insert_quota_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + struct scoutfs_quota_rule_val *rv; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_quota_rule_val), &bti); + if (err) + goto out; + + rv = bti->val; + memset(rv, 0, sizeof(struct scoutfs_quota_rule_val)); + rv->limit = cpu_to_le64(rule->limit); + rv->prio = rule->prio; + rv->op = rule->op; + rv->rule_flags = rule->rule_flags; + rv->name_val[0] = cpu_to_le64(rule->names[0].val); + rv->name_source[0] = rule->names[0].source; + rv->name_flags[0] = rule->names[0].flags; + rv->name_val[1] = cpu_to_le64(rule->names[1].val); + rv->name_source[1] = rule->names[1].source; + rv->name_flags[1] = rule->names[1].flags; + rv->name_val[2] = cpu_to_le64(rule->names[2].val); + rv->name_source[2] = rule->names[2].source; + rv->name_flags[2] = rule->names[2].flags; + memset(&rv->_pad, 0, sizeof(rv->_pad)); + + init_key(&bti->key, SCOUTFS_QUOTA_ZONE, SCOUTFS_QUOTA_RULE_TYPE, + 0, scoutfs_hash64(rv, sizeof(struct scoutfs_quota_rule_val)), 0, 0); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } +out: + return err; +} + +#define UNLINKED_AVL_HEIGHT 255 + +static void link_avl_nodes(struct scoutfs_btree_block *bt, __le16 *parent, __le16 parent_off, + u8 height, int first, int last) +{ + int ind = (first + last) / 2; + struct scoutfs_avl_node *node = &bt->items[ind].node; + u64 off = (long)node - (long)&bt->item_root; + + dprintf("first %d ind %d last %d height %u\n", first, ind, last, height); + + if (ind < first || ind > last || node->height != UNLINKED_AVL_HEIGHT) + return; + + *parent = cpu_to_le16(off); + node->parent = parent_off; + node->height = height; + node->left = 0; + node->right = 0; + memset(node->__pad, 0, 
sizeof(node->__pad)); + + if (height > 1) { + link_avl_nodes(bt, &node->left, cpu_to_le16(off), height - 1, first, ind - 1); + link_avl_nodes(bt, &node->right, cpu_to_le16(off), height - 1, ind + 1, last); + } +} + +#define DEFINE_BUILDER_CONTAINER(type, name, ptr) \ + type *name = container_of(ptr, type, bld) + +static bool btree_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + + return btb->total_items == 0; +} + +static void btree_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + + btb->total_items = 0; + btb->total_len = 0; + memset(&btb->btroot, 0, sizeof(btb->btroot)); +} + +/* + * Incrementally build btrees. By the time we're called the builder has + * all the sorted leaf items in an rbtree at their level. We stream + * them into blocks and store parent items at the next highest level. + * Once we're out of leaf items we stream the parent items into blocks + * and store their parent items at the next highest level. Eventually + * we drain all the items and are left with the root's reference to the + * first block in the tree. 
+ */ +static spr_err_t build_btree_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref *ref; + struct btree_item *bti; + struct btree_item *tmp; + unsigned long val_align; + unsigned long bytes; + unsigned long nr; + unsigned long min_items; + long item_bytes_after_block; + void *val_buf; + spr_err_t err; + u8 height; + int level; + int i; + + /* find next highest level to build items from */ + for (i = 0; i < SCOUTFS_BTREE_MAX_HEIGHT; i++) { + if (btb->items[i].nr == 0) + continue; + + level = i; + break; + } + + /* shouldn't be possible */ + if (i >= SCOUTFS_BTREE_MAX_HEIGHT) { + err = ENOBUFS; + goto out; + } + + dprintf("building btree blkno %llu level %u nr %lu tot %llu \n", + blkno, level, btb->items[level].nr, btb->total_items); + + /* + * XXX Be more careful about item filling.. can parents be entirely + * full? Should we let the last nodes on the right be under the + * min? We can see that there are < (nr + min) left and emit + * half the remaining in each. 
+ */ + + /* initialize the non-item parts of the block */ + bt = buf; + memset(bt, 0, sizeof(struct scoutfs_btree_block)); + hdr = &bt->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + bt->level = level; + btree_init_block(bt, level); + if (level == 0) + memset((char *)bt + SCOUTFS_BLOCK_LG_SIZE - SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES, 0, + SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES); + + /* find the items that fit in the leaf */ + item = &bt->items[0]; + nr = 0; + val_buf = (void *)item + le16_to_cpu(bt->mid_free_len); + + for_each_bti_safe(&btb->items[level].root, bti, tmp) { + val_align = round_up(bti->val_len, SCOUTFS_BTREE_VALUE_ALIGN); + bytes = sizeof(struct scoutfs_btree_item) + val_align; + /* + * total_items is a cpu-order counter (it's assigned, compared, + * and printed natively elsewhere); converting it with + * le64_to_cpu here would byte-swap it on big-endian hosts. + */ + item_bytes_after_block = (btb->total_items * bytes) - le16_to_cpu(bt->mid_free_len); + /* NOTE(review): despite the name, min_items is a byte threshold (quarter block) */ + min_items = (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4; + + if (le16_to_cpu(bt->mid_free_len) < bytes) + break; + + /* stop when there are not enough items to fill the next block */ + if (item_bytes_after_block > 0 && item_bytes_after_block < min_items) + break; + + item->node.height = UNLINKED_AVL_HEIGHT; + item->key = bti->key; + item->seq = cpu_to_le64(1); + item->val_len = cpu_to_le16(bti->val_len); + item->flags = 0; + memset(item->node.__pad, 0, sizeof(item->node.__pad)); + + if (bti->val_len) { + val_buf -= val_align; + item->val_off = cpu_to_le16((long)val_buf - (long)bt); + memcpy(val_buf, bti->val, bti->val_len); + } else { + item->val_off = 0; + } + + le16_add_cpu(&bt->nr_items, 1); + le16_add_cpu(&bt->total_item_bytes, bytes); + le16_add_cpu(&bt->mid_free_len, -bytes); + if (level == 0) + leaf_item_hash_insert(bt, &item->key, + cpu_to_le16((void *)item - (void *)bt)); + + item++; + nr++; + + btb_erase(btb, bti, level); + free(bti); + } + + /* zero the middle of the block without items */ + if (bt->mid_free_len) + memset(&bt->items[nr], 0, 
le16_to_cpu(bt->mid_free_len)); + + height = (int)ceil(log2(nr)) + 2; /* leaves are height 1 */ + link_avl_nodes(bt, &bt->item_root.node, 0, height - 1, 0, nr - 1); + + /* finish block */ + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (btb->total_items == 0) { + /* root refs highest/last block we build */ + btb->btroot.ref.blkno = hdr->blkno; + btb->btroot.ref.seq = hdr->seq; + btb->btroot.height = level + 1; + } else { + /* parent ref items will be built into parent blocks */ + /* we'll always need a parent ref for the block we're building */ + err = bti_alloc(sizeof(struct scoutfs_block_ref), &bti); + if (err) + goto out; + + /* refs to right spine blocks have all ones key */ + if (btb->items[level].nr == 0) + scoutfs_key_set_ones(&bti->key); + else + bti->key = bt->items[nr - 1].key; + ref = bti->val; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + /* don't drop an insertion failure on the floor; propagate it */ + err = btb_insert(btb, bti, level + 1); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +static void btb_init(struct btree_builder *btb) +{ + int i; + + init_builder(&btb->bld, btree_empty, btree_reset, build_btree_block); + + for (i = 0; i < array_size(btb->items); i++) + btb->items[i].root = RB_ROOT; +} + +/* + * This is how we get around the recursion of allocating blocks to write blocks that + * store the allocators. After we've written all other metadata blocks we know precisely + * how many allocation blocks we'll need. We modify the writer to only have that many + * free blocks remaining and put the rest in the alloc block builders. 
+ */ +static spr_err_t prepare_alloc_builders(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld) +{ +#define ALLOC_BLOCKS 5 /* 2 meta list, 2 meta btree, 1 data btree */ + struct extent_head *eh_tmp; + struct extent_head *eh; + spr_err_t err; + u64 start; + u64 skip; + u64 len; + int ind; + + dprintf("starting prepare with start %llu len %llu\n", wri->meta_start, wri->meta_len); + + skip = ALLOC_BLOCKS + (SCOUTFS_ALLOC_LIST_MAX_BLOCKS * 2); + if (wri->meta_len <= skip) + return ENOSPC; + + /* store remainder of meta alloc as a free extent */ + start = wri->meta_start + skip; + len = wri->meta_len - skip; + err = insert_free_items(&wri->meta_btb[0], start, len); + if (err) + goto out; + wri->meta_len -= len; + + /* the rest of the meta extents are items in the two meta trees */ + ind = 1; + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + err = insert_free_items(&wri->meta_btb[ind], eh->start, eh->len); + if (err) + goto out; + list_del_init(&eh->head); + free(eh); + ind ^= 1; + } + + /* fill the two server avail alloc list blocks */ + wri->meta_alb[0].start = wri->meta_start + ALLOC_BLOCKS; + wri->meta_alb[0].len = SCOUTFS_ALLOC_LIST_MAX_BLOCKS; + wri->meta_alb[1].start = wri->meta_alb[0].start + wri->meta_alb[0].len; + wri->meta_alb[1].len = wri->meta_alb[0].len; + + /* writer left with only meta allocation for remaining alloc blocks */ + wri->meta_len = ALLOC_BLOCKS; + + err = 0; +out: + return err; +} + +static bool alloc_list_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + + return alb->len == 0; +} + +static spr_err_t build_alloc_list_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_header *hdr; + int i; + + if (alb->len > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) + return EOVERFLOW; + + lblk = 
buf; + memset(&lblk->next, 0, sizeof(lblk->next)); + lblk->start = 0; + lblk->nr = cpu_to_le32(alb->len); + + for (i = 0; i < alb->len; i++) + lblk->blknos[i] = cpu_to_le64(alb->start + i); + + hdr = &lblk->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + alb->lhead.ref.blkno = hdr->blkno; + alb->lhead.ref.seq = hdr->seq; + alb->lhead.first_nr = cpu_to_le32(alb->len); + alb->lhead.total_nr = cpu_to_le64(alb->len); + + alb->start = 0; + alb->len = 0; + + return 0; +} + +static void init_alb(struct alloc_list_builder *alb) +{ + init_builder(&alb->bld, alloc_list_empty, NULL, build_alloc_list_block); +} + +static struct srch_node *node_srn(struct rb_node *node) +{ + return node ? container_of(node, struct srch_node, node) : NULL; +} + +static struct srch_node *srn_first(struct rb_root *root) +{ + return node_srn(rb_first(root)); +} + +static struct srch_node *srn_next(struct srch_node *srn) +{ + return srn ? 
node_srn(rb_next(&srn->node)) : NULL; +} + +static spr_err_t insert_srch_entry(struct srch_builder *sbld, u64 hash, u64 ino, u64 id) +{ + struct rb_root *root = &sbld->entries; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct srch_node *ins; + struct srch_node *srn; + int cmp; + + ins = malloc(sizeof(struct srch_node)); + if (!ins) + return ENOMEM; + + ins->hash = hash; + ins->ino = ino; + ins->id = id; + + while (*node) { + parent = *node; + srn = node_srn(*node); + + cmp = scoutfs_cmp(ins->hash, srn->hash) ?: + scoutfs_cmp(ins->ino, srn->ino) ?: + scoutfs_cmp(ins->id, srn->id); + if (cmp < 0) + node = &(*node)->rb_left; + else if (cmp > 0) + node = &(*node)->rb_right; + else + return EEXIST; + } + + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + + return 0; +} + +static bool srch_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + return RB_EMPTY_ROOT(&sbld->entries) && sbld->total_parent_refs == 0; +} + +static void srch_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + memset(&sbld->sfl, 0, sizeof(sbld->sfl)); +} + +#define for_each_sbld_parent(sbld, i) \ + for (i = 1; i < array_size(sbld->parents); i++) + +static spr_err_t build_srch_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_srch_parent *par; + struct scoutfs_srch_block *srb; + struct scoutfs_srch_entry sre; + struct scoutfs_block_ref *ref; + struct srch_node *srn_tmp; + struct srch_node *srn; + unsigned int nr; + spr_err_t err; + u32 magic; + int level; + int tail; + int ret; + + dprintf("building srch blkno %llu empty_entries %u tot refs %llu parent nrs: ", + blkno, RB_EMPTY_ROOT(&sbld->entries), sbld->total_parent_refs); + for_each_sbld_parent(sbld, level) + 
dprintf("%u:%lu ", level, sbld->parents[level].nr); + dprintf("\n"); + + /* build parents with refs that are full or when we're out of entries */ + for_each_sbld_parent(sbld, level) { + + nr = sbld->parents[level].nr; + if (nr == 0 || (nr < SCOUTFS_SRCH_PARENT_REFS && !RB_EMPTY_ROOT(&sbld->entries))) + continue; + + /* copy parent refs */ + par = buf; + memcpy(par->refs, sbld->parents[level].refs, nr * sizeof(par->refs[0])); + sbld->total_parent_refs -= nr; + sbld->parents[level].nr = 0; + + /* zero the tail of the block */ + tail = SCOUTFS_BLOCK_LG_SIZE - offsetof(struct scoutfs_srch_parent, refs[nr]); + if (tail > 0) + memset(buf + SCOUTFS_BLOCK_LG_SIZE - tail, 0, tail); + + magic = SCOUTFS_BLOCK_MAGIC_SRCH_PARENT; + hdr = &par->hdr; + goto finish_hdr; + } + + /* no built parent, must have entries to build */ + level = 0; + if (RB_EMPTY_ROOT(&sbld->entries)) { + err = EINVAL; + goto out; + } + + srn = srn_first(&sbld->entries); + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + + srb = buf; + srb->entry_nr = 0; + srb->entry_bytes = 0; + srb->first = sre; + memset(&srb->tail, 0, sizeof(srb->tail)); + + if (sbld->sfl.blocks == 0) + sbld->sfl.first = sre; + + do { + if (le32_to_cpu(srb->entry_bytes) > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) + break; + + ret = srch_encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes), + &sre, &srb->tail); + + dprintf("%llu.%llu.%llu ret %d\n", srn->hash, srn->ino, srn->id, ret); + + le32_add_cpu(&srb->entry_bytes, ret); + le32_add_cpu(&srb->entry_nr, 1); + srb->tail = sre; + + srn_tmp = srn_next(srn); + rb_erase(&srn->node, &sbld->entries); + free(srn); + + if ((srn = srn_tmp)) { + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + } + } while (srn); + + srb->last = srb->tail; + sbld->sfl.last = srb->tail; + + le64_add_cpu(&sbld->sfl.blocks, 1); + le64_add_cpu(&sbld->sfl.entries, le32_to_cpu(srb->entry_nr)); + + magic = 
SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK; + hdr = &srb->hdr; + +finish_hdr: + hdr->magic = cpu_to_le32(magic); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (srch_empty(&sbld->bld)) { + /* the last block is referenced by the root */ + sbld->sfl.ref.blkno = hdr->blkno; + sbld->sfl.ref.seq = hdr->seq; + sbld->sfl.height = level + 1; + memset(sbld->sfl.__pad, 0, sizeof(sbld->sfl.__pad)); + } else { + /* store the parent ref to our block */ + nr = sbld->parents[level + 1].nr++; + ref = &sbld->parents[level + 1].refs[nr]; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + sbld->total_parent_refs++; + } + + err = 0; +out: + return err; +} + +static spr_err_t sbld_create(struct srch_builder *sbld) +{ + spr_err_t err = 0; + int i; + + init_builder(&sbld->bld, srch_empty, srch_reset, build_srch_block); + + for_each_sbld_parent(sbld, i) { + sbld->parents[i].refs = malloc(SCOUTFS_SRCH_PARENT_REFS * + sizeof(struct scoutfs_block_ref)); + if (!sbld->parents[i].refs) { + while (--i >= 1) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } + err = ENOMEM; + break; + } + } + + return err; +} + +static void sbld_destroy(struct srch_builder *sbld) +{ + int i; + + for_each_sbld_parent(sbld, i) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } +} + +/* + * We've written the bloom block if we've filled out its header. 
+ */ +static bool bloom_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + return bbld->bloom->hdr.seq != 0; +} + +static void bloom_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + memset(bbld->bloom, 0, SCOUTFS_BLOCK_LG_SIZE); +} + +static spr_err_t build_bloom_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + struct scoutfs_block_header *hdr; + + hdr = &bbld->bloom->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BLOOM); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + memcpy(buf, bbld->bloom, SCOUTFS_BLOCK_LG_SIZE); + + return 0; +} + +static spr_err_t bbld_create(struct bloom_builder *bbld) +{ + init_builder(&bbld->bld, bloom_empty, bloom_reset, build_bloom_block); + + bbld->bloom = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!bbld->bloom) + return ENOMEM; + + memset(&bbld->bloom->hdr, 0, sizeof(bbld->bloom->hdr)); + + return 0; +} + +static void bbld_destroy(struct bloom_builder *bbld) +{ + free(bbld->bloom); +} + +static bool wri_has_super(struct scoutfs_parallel_restore_writer *wri) +{ + return wri->super.hdr.blkno != 0; +} + +static void reset_builders(struct scoutfs_parallel_restore_writer *wri) +{ + /* define block build order, different than struct layout order */ + struct block_builder *builders[] = { + /* fs items written in parallel by writers */ + &wri->fs_btb.bld, + &wri->bloom_bbld.bld, + &wri->srch_sbld.bld, + + /* global items written finally by global super writer */ + &wri->root_btb.bld, + &wri->srch_btb.bld, + /* log .post() prepares final allocators */ + &wri->log_btb.bld, + &wri->meta_alb[0].bld, + &wri->meta_alb[1].bld, + &wri->meta_btb[0].bld, + &wri->meta_btb[1].bld, + &wri->data_btb.bld, + }; + struct block_builder 
*bld; + int i; + + for (i = 0; i < array_size(builders); i++) { + bld = builders[i]; + + if (bld->reset) + bld->reset(bld); + + if (!list_empty(&bld->head)) + list_del_init(&bld->head); + list_add_tail(&bld->head, &wri->builders); + } +} + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri; + spr_err_t err; + + wri = calloc(1, sizeof(struct scoutfs_parallel_restore_writer)); + if (!wri) { + err = ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&wri->meta_extents); + INIT_LIST_HEAD(&wri->builders); + btb_init(&wri->root_btb); + btb_init(&wri->fs_btb); + btb_init(&wri->srch_btb); + btb_init(&wri->log_btb); + btb_init(&wri->meta_btb[0]); + btb_init(&wri->meta_btb[1]); + btb_init(&wri->data_btb); + init_alb(&wri->meta_alb[0]); + init_alb(&wri->meta_alb[1]); + + err = sbld_create(&wri->srch_sbld) ?: + bbld_create(&wri->bloom_bbld); + if (err) + goto out; + + reset_builders(wri); + err = 0; +out: + if (err) { + if (wri) { + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + free(wri); + } + wri = NULL; + } + *wrip = wri; + return err; +} + +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri = *wrip; + struct extent_head *eh; + struct extent_head *eh_tmp; + + if (!wri) + return; + + btb_destroy(&wri->root_btb); + btb_destroy(&wri->fs_btb); + btb_destroy(&wri->srch_btb); + btb_destroy(&wri->log_btb); + btb_destroy(&wri->meta_btb[0]); + btb_destroy(&wri->meta_btb[1]); + btb_destroy(&wri->data_btb); + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + list_del_init(&eh->head); + free(eh); + } + + free(wri); + *wrip = NULL; +} + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr) +{ + u64 total = 
le64_to_cpu(wri->super.total_meta_blocks); + u64 start = SCOUTFS_META_DEV_START_BLKNO; + u64 each = (total - start) / nr; + int i; + + if (!wri_has_super(wri)) + return EINVAL; + + for (i = 0; i < nr - 1; i++) { + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(each); + start += each; + } + + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(total - start); + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + wri->fsid = slice->fsid; + + return meta_alloc_add(wri, le64_to_cpu(slice->meta_start), le64_to_cpu(slice->meta_len)); +} + +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + slice->fsid = wri->fsid; + slice->meta_start = cpu_to_le64(wri->meta_start); + slice->meta_len = cpu_to_le64(wri->meta_len); + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + spr_err_t err; + + if (wri_has_super(wri)) + return EINVAL; + + err = insert_inode_items(wri, inode); + if (err) + goto out; + + wri->inode_count++; + wri->max_ino = max(wri->max_ino, inode->ino); + err = 0; +out: + return err; +} + +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + + if (wri_has_super(wri)) + return EINVAL; + + return insert_entry_items(wri, entry); +} + +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + spr_err_t err; + int xtags; + u32 xat_hash; + u64 srch_hash; + + xat_hash = crc32c(U32_MAX, xattr->name, xattr->name_len); + srch_hash = scoutfs_hash64(xattr->name, 
xattr->name_len); + xtags = get_xattr_tags(xattr->name, xattr->name_len); + + err = insert_xattr_items(wri, xattr, xat_hash); + if (!err) { + if (xtags & XTAG_SRCH) + err = insert_srch_entry(&wri->srch_sbld, srch_hash, xattr->ino, xattr->pos); + if (!err && (xtags & XTAG_TOTL)) + err = insert_totl_item(wri, xattr); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + if (wri_has_super(wri)) + return EINVAL; + + memset(prog, 0, sizeof(struct scoutfs_parallel_restore_progress)); + prog->fs_items = wri->fs_btb.btroot; + prog->root_items = wri->root_btb.btroot; + prog->sfl = wri->srch_sbld.sfl; + prog->bloom_ref.blkno = wri->bloom_bbld.bloom->hdr.blkno; + prog->bloom_ref.seq = wri->bloom_bbld.bloom->hdr.seq; + prog->inode_count = cpu_to_le64(wri->inode_count); + prog->max_ino = cpu_to_le64(wri->max_ino); + + reset_builders(wri); + wri->inode_count = 0; + wri->max_ino = 0; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + spr_err_t err; + + if (!wri_has_super(wri)) + return EINVAL; + + /* + * Only one writer's progress should contain the root inode. 
+ */ + if (prog->root_items.ref.blkno) { + if (wri->root_items.ref.blkno) + return EEXIST; + wri->root_items = prog->root_items; + } + + wri->max_ino = max(wri->max_ino, le64_to_cpu(prog->max_ino)); + + err = insert_log_trees_item(wri, prog); + if (!err && prog->sfl.ref.blkno) + err = insert_srch_item(wri, &prog->sfl); + + return err; +} + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + return insert_quota_item(wri, rule); +} + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret) +{ + struct block_builder *bld; + off_t count = 0; + off_t off = 0; + u64 blkno = 0; + spr_err_t err; + + if (len < SCOUTFS_BLOCK_LG_SIZE) { + err = EINVAL; + goto out; + } + + while (len >= SCOUTFS_BLOCK_LG_SIZE) { + bld = list_first_entry_or_null(&wri->builders, struct block_builder, head); + if (!bld) { + err = 0; + break; + } + + if (bld->empty(bld)) { + if (bld->post && ((err = bld->post(wri, bld)))) + break; + list_del_init(&bld->head); + continue; + } + + err = meta_alloc_contig(wri, blkno, &blkno); + if (err || blkno == 0) + break; + + if (off == 0) + off = blkno << SCOUTFS_BLOCK_LG_SHIFT; + + err = bld->build(wri, bld, buf, blkno); + if (err) + break; + + buf += SCOUTFS_BLOCK_LG_SIZE; + len -= SCOUTFS_BLOCK_LG_SIZE; + count += SCOUTFS_BLOCK_LG_SIZE; + + dprintf("built blkno %llu off %llu count %llu\n", blkno, (u64)off, (u64)count); + } + +out: + *off_ret = off; + *count_ret = count; + return count > 0 ? 
0 : err; +} + +/* + * Here we take in a dev's fd and read its quorum blocks to see if the dev has + * been mounted before + */ +static spr_err_t scoutfs_check_if_previous_mount(int fd) +{ + struct scoutfs_quorum_block *blk = NULL; + struct scoutfs_quorum_block_event *ev; + u64 blkno; + int i, j; + spr_err_t err; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + blkno = SCOUTFS_QUORUM_BLKNO + i; + err = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk); + if (!blk) + return EINVAL; + + dprintf("quorum block read; quorum bklno: %llu, err_val: %d\n", blkno, err); + if (err) { + free(blk); + return err; + } + + /* any recorded quorum event timestamp means the dev was mounted */ + for (j = 0; j < SCOUTFS_QUORUM_EVENT_NR; j++) { + ev = &blk->events[j]; + if (ev->ts.sec || ev->ts.nsec) { + free(blk); + return EINVAL; + } + } + + free(blk); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd) +{ + spr_err_t err; + u64 start; + u64 len; + + /* + * check the device we are restoring into to make sure + * that it has never been mounted + */ + if (scoutfs_check_if_previous_mount(fd)) + return EINVAL; + + if (le64_to_cpu(super->fmt_vers) < 2) + return EINVAL; + + if ((le64_to_cpu(super->flags) & SCOUTFS_FLAG_IS_META_BDEV) == 0) + return EINVAL; + + if (wri_has_super(wri)) + return EINVAL; + + start = SCOUTFS_DATA_DEV_START_BLKNO; + len = le64_to_cpu(super->total_data_blocks) - start; + + /* make sure all data extents are free */ + if (le64_to_cpu(super->data_alloc.total_len) != len) + return ENOTEMPTY; + + /* we write new allocator blocks so that we don't have to read existing ones */ + err = insert_free_items(&wri->data_btb, start, len); + if (err) + return err; + + wri->super = *super; + + /* prepare alloc block builders only after other metadata blocks are built */ + wri->log_btb.bld.post = prepare_alloc_builders; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer 
*wri, + struct scoutfs_super_block *super) +{ + if (!wri_has_super(wri)) + return EINVAL; + + *super = wri->super; + + super->seq = cpu_to_le64(wri->max_ino + 1); + super->next_ino = cpu_to_le64(wri->max_ino + 1); + super->inode_count = cpu_to_le64(wri->inode_count); + set_alloc_root(&super->meta_alloc[0], &wri->meta_btb[0]); + set_alloc_root(&super->meta_alloc[1], &wri->meta_btb[1]); + set_alloc_root(&super->data_alloc, &wri->data_btb); + super->server_meta_avail[0] = wri->meta_alb[0].lhead; + super->server_meta_avail[1] = wri->meta_alb[1].lhead; + memset(super->server_meta_freed, 0, sizeof(super->server_meta_freed)); + super->fs_root = wri->root_items; + super->logs_root = wri->log_btb.btroot; + memset(&super->log_merge, 0, sizeof(super->log_merge)); + memset(&super->mounted_clients, 0, sizeof(super->mounted_clients)); + super->srch_root = wri->srch_btb.btroot; + /* test volopt? */ + + super->hdr.crc = cpu_to_le32(crc_block(&super->hdr, SCOUTFS_BLOCK_SM_SIZE)); + + return 0; +} diff --git a/utils/src/parallel_restore.h b/utils/src/parallel_restore.h new file mode 100644 index 00000000..8865e842 --- /dev/null +++ b/utils/src/parallel_restore.h @@ -0,0 +1,126 @@ +#ifndef _SCOUTFS_PARALLEL_RESTORE_H_ +#define _SCOUTFS_PARALLEL_RESTORE_H_ + +#include + +struct scoutfs_parallel_restore_progress { + struct scoutfs_btree_root fs_items; + struct scoutfs_btree_root root_items; + struct scoutfs_srch_file sfl; + struct scoutfs_block_ref bloom_ref; + __le64 inode_count; + __le64 max_ino; +}; + +struct scoutfs_parallel_restore_slice { + __le64 fsid; + __le64 meta_start; + __le64 meta_len; +}; + +struct scoutfs_parallel_restore_entry { + u64 dir_ino; + u64 pos; + u64 ino; + mode_t mode; + char *name; + unsigned int name_len; +}; + +struct scoutfs_parallel_restore_xattr { + u64 ino; + u64 pos; + char *name; + unsigned int name_len; + void *value; + unsigned int value_len; +}; + +struct scoutfs_parallel_restore_inode { + /* all inodes */ + u64 ino; + u64 meta_seq; + u64 
data_seq; + u64 nr_xattrs; + u32 uid; + u32 gid; + u32 mode; + u32 rdev; + u32 flags; + u8 pad[4]; + struct timespec atime; + struct timespec ctime; + struct timespec mtime; + struct timespec crtime; + u64 proj; + + /* regular files */ + u64 data_version; + u64 size; + bool offline; + u32 nlink; + + /* only used for directories */ + u64 nr_subdirs; + u64 total_entry_name_bytes; + + /* only used for symlinks */ + char *target; + unsigned int target_len; /* not including null terminator */ +}; + +struct scoutfs_parallel_restore_quota_rule { + u64 limit; + u8 prio; + u8 op; + u8 rule_flags; + struct quota_rule_name { + u64 val; + u8 source; + u8 flags; + } names [3]; + char *value; + unsigned int value_len; +}; + +typedef __typeof__(EINVAL) spr_err_t; + +struct scoutfs_parallel_restore_writer; + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip); +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip); + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr); +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode); +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry); +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr); + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); 
+spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule); + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret); + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd); +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super); + + +#endif