From 47f302568038682359046f8bb35c2af30da0c83b Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 23 Feb 2024 13:28:32 -0800 Subject: [PATCH 01/15] Add check command Signed-off-by: Zach Brown Signed-off-by: Auke Kok --- utils/Makefile | 7 +- utils/src/check/alloc.c | 159 ++++++++++ utils/src/check/alloc.h | 12 + utils/src/check/block.c | 564 ++++++++++++++++++++++++++++++++++++ utils/src/check/block.h | 32 ++ utils/src/check/btree.c | 209 +++++++++++++ utils/src/check/btree.h | 14 + utils/src/check/check.c | 152 ++++++++++ utils/src/check/debug.c | 16 + utils/src/check/debug.h | 17 ++ utils/src/check/eno.h | 9 + utils/src/check/extent.c | 313 ++++++++++++++++++++ utils/src/check/extent.h | 38 +++ utils/src/check/iter.h | 15 + utils/src/check/log_trees.c | 98 +++++++ utils/src/check/log_trees.h | 8 + utils/src/check/meta.c | 367 +++++++++++++++++++++++ utils/src/check/meta.h | 9 + utils/src/check/padding.c | 23 ++ utils/src/check/padding.h | 6 + utils/src/check/problem.c | 23 ++ utils/src/check/problem.h | 23 ++ utils/src/check/sns.c | 118 ++++++++ utils/src/check/sns.h | 20 ++ utils/src/check/super.c | 57 ++++ utils/src/check/super.h | 9 + 26 files changed, 2315 insertions(+), 3 deletions(-) create mode 100644 utils/src/check/alloc.c create mode 100644 utils/src/check/alloc.h create mode 100644 utils/src/check/block.c create mode 100644 utils/src/check/block.h create mode 100644 utils/src/check/btree.c create mode 100644 utils/src/check/btree.h create mode 100644 utils/src/check/check.c create mode 100644 utils/src/check/debug.c create mode 100644 utils/src/check/debug.h create mode 100644 utils/src/check/eno.h create mode 100644 utils/src/check/extent.c create mode 100644 utils/src/check/extent.h create mode 100644 utils/src/check/iter.h create mode 100644 utils/src/check/log_trees.c create mode 100644 utils/src/check/log_trees.h create mode 100644 utils/src/check/meta.c create mode 100644 utils/src/check/meta.h create mode 100644 utils/src/check/padding.c create 
mode 100644 utils/src/check/padding.h create mode 100644 utils/src/check/problem.c create mode 100644 utils/src/check/problem.h create mode 100644 utils/src/check/sns.c create mode 100644 utils/src/check/sns.h create mode 100644 utils/src/check/super.c create mode 100644 utils/src/check/super.h diff --git a/utils/Makefile b/utils/Makefile index e0f761424..7f819d405 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H)) CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \ - -fno-strict-aliasing \ + -I src/ -fno-strict-aliasing \ -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),) @@ -15,8 +15,9 @@ CFLAGS += -I../kmod/src endif BIN := src/scoutfs -OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c)) -DEPS := $(wildcard */*.d) +OBJ_DIRS := src src/check +OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c))) +DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d)) all: $(BIN) diff --git a/utils/src/check/alloc.c b/utils/src/check/alloc.c new file mode 100644 index 000000000..f67b66603 --- /dev/null +++ b/utils/src/check/alloc.c @@ -0,0 +1,159 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" + +/* + * We check the list blocks serially. + * + * XXX: + * - compare ref seqs + * - detect cycles? 
+ */ +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + lblk = block_buf(blk); + /* XXX verify block */ + /* XXX sort? maybe */ + + ref = lblk->next; + + block_put(&blk); + } + + ret = 0; +out: + return ret; +} + +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + return btree_meta_iter(&root->root, cb, cb_arg); +} + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + sns_push("alloc_list_block", blkno, 0); + + lblk = block_buf(blk); + /* XXX verify block */ + /* XXX sort? 
maybe */ + + ret = 0; + for (i = 0; i < le32_to_cpu(lblk->nr); i++) { + blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) + break; + } + + ref = lblk->next; + + block_put(&blk); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +static bool valid_free_extent_key(struct scoutfs_key *key) +{ + return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE || + key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) && + (!key->_sk_fourth && !key->sk_type && + (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third)); +} + +static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct extent_cb_arg_t *ecba = cb_arg; + u64 start; + u64 len; + + /* XXX not sure these eios are what we want */ + + if (val_len != 0) + return -EIO; + + if (!valid_free_extent_key(key)) + return -EIO; + + if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) + return -ECHECK_ITER_DONE; + + start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1; + len = le64_to_cpu(key->skfb_len); + + return ecba->cb(start, len, ecba->cb_arg); +} + +/* + * Call the callback with each of the primary BLKNO free extents stored + * in item in the given alloc root. It doesn't visit the secondary + * ORDER extents. 
+ */ +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&root->root, free_item_cb, &ecba); +} diff --git a/utils/src/check/alloc.h b/utils/src/check/alloc.h new file mode 100644 index 000000000..f0273e4a7 --- /dev/null +++ b/utils/src/check/alloc.h @@ -0,0 +1,12 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H +#define _SCOUTFS_UTILS_CHECK_ALLOC_H + +#include "extent.h" + +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/block.c b/utils/src/check/block.c new file mode 100644 index 000000000..53b6eed0b --- /dev/null +++ b/utils/src/check/block.c @@ -0,0 +1,564 @@ +#define _ISOC11_SOURCE /* aligned_alloc */ +#define _DEFAULT_SOURCE /* syscall() */ +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "list.h" +#include "cmp.h" +#include "hash.h" + +#include "block.h" +#include "debug.h" +#include "eno.h" + +static struct block_data { + struct list_head *hash_lists; + size_t hash_nr; + + struct list_head active_head; + struct list_head inactive_head; + struct list_head dirty_list; + size_t nr_active; + size_t nr_inactive; + size_t nr_dirty; + + int meta_fd; + size_t max_cached; + size_t nr_events; + + aio_context_t ctx; + struct iocb *iocbs; + struct iocb **iocbps; + struct io_event *events; +} global_bdat; + +struct block { + struct list_head hash_head; + struct list_head lru_head; + struct list_head dirty_head; + struct list_head submit_head; + unsigned long refcount; + unsigned long uptodate:1, 
+ active:1; + u64 blkno; + void *buf; + size_t size; +}; + +#define BLK_FMT \ + "blkno %llu rc %ld d %u a %u" +#define BLK_ARG(blk) \ + (blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active +#define debug_blk(blk, fmt, args...) \ + debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk)) + +/* + * This just allocates and initialzies the block. The caller is + * responsible for putting it on the appropriate initial lists and + * managing refcounts. + */ +static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size) +{ + struct block *blk; + + blk = calloc(1, sizeof(struct block)); + if (blk) { + blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!blk->buf) { + free(blk); + blk = NULL; + } else { + INIT_LIST_HEAD(&blk->hash_head); + INIT_LIST_HEAD(&blk->lru_head); + INIT_LIST_HEAD(&blk->dirty_head); + INIT_LIST_HEAD(&blk->submit_head); + blk->blkno = blkno; + blk->size = size; + } + } + + return blk; +} + +static void free_block(struct block_data *bdat, struct block *blk) +{ + debug_blk(blk, "free"); + + if (!list_empty(&blk->lru_head)) { + if (blk->active) + bdat->nr_active--; + else + bdat->nr_inactive--; + list_del(&blk->lru_head); + } + + if (!list_empty(&blk->dirty_head)) { + bdat->nr_dirty--; + list_del(&blk->dirty_head); + } + + if (!list_empty(&blk->hash_head)) + list_del(&blk->hash_head); + + if (!list_empty(&blk->submit_head)) + list_del(&blk->submit_head); + + free(blk->buf); + free(blk); +} + +static bool blk_is_dirty(struct block *blk) +{ + return !list_empty(&blk->dirty_head); +} + +/* + * Rebalance the cache. + * + * First we shrink the cache to limit it to max_cached blocks. + * Logically, we walk from oldest to newest in the inactive list and + * then in the active list. Since these lists are physically one + * list_head list we achieve this with a reverse walk starting from the + * active head. + * + * Then we rebalnace the size of the two lists. 
The constraint is that + * we don't let the active list grow larger than the inactive list. We + * move blocks from the oldest tail of the active list to the newest + * head of the inactive list. + * + * <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] -> + */ +static void rebalance_cache(struct block_data *bdat) +{ + struct block *blk; + struct block *blk_; + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) { + if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached) + break; + + if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 || + blk_is_dirty(blk)) + continue; + + free_block(bdat, blk); + } + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) { + if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head) + break; + + list_move(&blk->lru_head, &bdat->inactive_head); + blk->active = 0; + bdat->nr_active--; + bdat->nr_inactive++; + } +} + +static void make_active(struct block_data *bdat, struct block *blk) +{ + if (!blk->active) { + if (!list_empty(&blk->lru_head)) { + list_move(&blk->lru_head, &bdat->active_head); + bdat->nr_inactive--; + } else { + list_add(&blk->lru_head, &bdat->active_head); + } + + blk->active = 1; + bdat->nr_active++; + } +} + +static int compar_iocbp(const void *A, const void *B) +{ + struct iocb *a = *(struct iocb **)A; + struct iocb *b = *(struct iocb **)B; + + return scoutfs_cmp(a->aio_offset, b->aio_offset); +} + +static int submit_and_wait(struct block_data *bdat, struct list_head *list) +{ + struct io_event *event; + struct iocb *iocb; + struct block *blk; + int ret; + int err; + int nr; + int i; + + err = 0; + nr = 0; + list_for_each_entry(blk, list, submit_head) { + iocb = &bdat->iocbs[nr]; + bdat->iocbps[nr] = iocb; + + memset(iocb, 0, sizeof(struct iocb)); + + iocb->aio_data = (intptr_t)blk; + iocb->aio_lio_opcode = blk_is_dirty(blk) ? 
IOCB_CMD_PWRITE : IOCB_CMD_PREAD; + iocb->aio_fildes = bdat->meta_fd; + iocb->aio_buf = (intptr_t)blk->buf; + iocb->aio_nbytes = blk->size; + iocb->aio_offset = blk->blkno * blk->size; + + nr++; + + debug_blk(blk, "submit"); + + if ((nr < bdat->nr_events) && blk->submit_head.next != list) + continue; + + qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp); + + ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = 0; + for (i = 0; i < nr; i++) { + event = &bdat->events[i]; + iocb = (struct iocb *)(intptr_t)event->obj; + blk = (struct block *)(intptr_t)event->data; + + debug_blk(blk, "complete res %lld", (long long)event->res); + + if (event->res >= 0 && event->res != blk->size) + event->res = -EIO; + + /* io errors are fatal */ + if (event->res < 0) { + ret = event->res; + goto out; + } + + if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) { + blk->uptodate = 1; + } else { + list_del_init(&blk->dirty_head); + bdat->nr_dirty--; + } + } + nr = 0; + } + + ret = 0; +out: + return ret ?: err; +} + +static void inc_refcount(struct block *blk) +{ + blk->refcount++; +} + +void block_put(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + struct block *blk = *blkp; + + if (blk) { + blk->refcount--; + *blkp = NULL; + + rebalance_cache(bdat); + } +} + +static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno) +{ + u32 hash = scoutfs_hash32(&blkno, sizeof(blkno)); + + return &bdat->hash_lists[hash % bdat->hash_nr]; +} + +static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf) +{ + struct 
list_head *bucket = hash_bucket(bdat, blkno); + struct block *search; + struct block *blk; + size_t size; + + size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + + blk = NULL; + list_for_each_entry(search, bucket, hash_head) { + if (search->blkno == blkno && search->size == size) { + blk = search; + break; + } + } + + if (!blk) { + blk = alloc_block(bdat, blkno, size); + if (blk) { + list_add(&blk->hash_head, bucket); + list_add(&blk->lru_head, &bdat->inactive_head); + bdat->nr_inactive++; + } + } + if (blk) + inc_refcount(blk); + + return blk; +} + +/* + * Get a block. + * + * The caller holds a refcount to the block while it's in use that + * prevents it from being removed from the cache. It must be dropped + * with block_put(); + */ +int block_get(struct block **blk_ret, u64 blkno, int bf) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + LIST_HEAD(list); + int ret; + + blk = get_or_alloc(bdat, blkno, bf); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + if ((bf & BF_ZERO)) { + memset(blk->buf, 0, blk->size); + blk->uptodate = 1; + } + + if (bf & BF_OVERWRITE) + blk->uptodate = 1; + + if (!blk->uptodate) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + if (ret < 0) + goto out; + } + + if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) { + list_add_tail(&bdat->dirty_list, &blk->dirty_head); + bdat->nr_dirty++; + } + + make_active(bdat, blk); + + rebalance_cache(bdat); + ret = 0; +out: + if (ret < 0) + block_put(&blk); + *blk_ret = blk; + return ret; +} + +void *block_buf(struct block *blk) +{ + return blk->buf; +} + +size_t block_size(struct block *blk) +{ + return blk->size; +} + +/* + * Drop the block from the cache, regardless of if it was free or not. + * This is used to avoid writing blocks which were dirtied but then + * later freed. + * + * The block is immediately freed and can't be referenced after this + * returns. 
+ */ +void block_drop(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + + free_block(bdat, *blkp); + *blkp = NULL; + rebalance_cache(bdat); +} + +/* + * This doesn't quite work for mixing large and small blocks, but that's + * fine, we never do that. + */ +static int compar_u64(const void *A, const void *B) +{ + u64 a = *((u64 *)A); + u64 b = *((u64 *)B); + + return scoutfs_cmp(a, b); +} + +/* + * This read-ahead is synchronous and errors are ignored. If any of the + * blknos aren't present in the cache then we issue concurrent reads for + * them and wait. Any existing cached blocks will be left as is. + * + * We might be trying to read a lot more than the number of events so we + * sort the caller's blknos before iterating over them rather than + * relying on submission sorting the blocks in each submitted set. + */ +void block_readahead(u64 *blknos, size_t nr) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + size_t i; + + if (nr == 0) + return; + + qsort(blknos, nr, sizeof(blknos[0]), compar_u64); + + for (i = 0; i < nr; i++) { + blk = get_or_alloc(bdat, blknos[i], 0); + if (blk) { + if (!blk->uptodate) + list_add_tail(&blk->submit_head, &list); + else + block_put(&blk); + } + } + + (void)submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + rebalance_cache(bdat); +} + +/* + * The caller's block changes form a consistent transaction. If the amount of dirty + * blocks is large enough we issue a write. 
+ */ +int block_try_commit(bool force) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + int ret; + + if (!force && bdat->nr_dirty < bdat->nr_events) + return 0; + + list_for_each_entry(blk, &bdat->dirty_list, dirty_head) { + list_add_tail(&blk->submit_head, &list); + inc_refcount(blk); + } + + ret = submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + if (ret < 0) { + fprintf(stderr, "error writing dirty transaction blocks\n"); + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY); + if (ret == 0) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + block_put(&blk); + } else { + ret = -ENOMEM; + } + if (ret < 0) + fprintf(stderr, "error writing super block to commit transaction\n"); + +out: + rebalance_cache(bdat); + return ret; +} + +int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes) +{ + struct block_data *bdat = &global_bdat; + size_t i; + int ret; + + bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE); + bdat->hash_nr = bdat->max_cached / 4; + bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE); + + bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0])); + bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0])); + bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0])); + bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0])); + if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&bdat->active_head); + INIT_LIST_HEAD(&bdat->inactive_head); + INIT_LIST_HEAD(&bdat->dirty_list); + bdat->meta_fd = meta_fd; + list_add(&bdat->inactive_head, &bdat->active_head); + + for (i = 0; i < bdat->hash_nr; i++) + 
INIT_LIST_HEAD(&bdat->hash_lists[i]); + + ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx); + +out: + if (ret < 0) { + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); + } + + return ret; +} + +void block_shutdown(void) +{ + struct block_data *bdat = &global_bdat; + + syscall(SYS_io_destroy, bdat->ctx); + + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); +} diff --git a/utils/src/check/block.h b/utils/src/check/block.h new file mode 100644 index 000000000..ad7195ce8 --- /dev/null +++ b/utils/src/check/block.h @@ -0,0 +1,32 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_ +#define _SCOUTFS_UTILS_CHECK_BLOCK_H_ + +#include +#include + +struct block; + +#include "sparse.h" + +/* block flags passed to block_get() */ +enum { + BF_ZERO = (1 << 0), /* zero contents buf as block is returned */ + BF_DIRTY = (1 << 1), /* block will be written with transaction */ + BF_SM = (1 << 2), /* small 4k block instead of large 64k block */ + BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */ +}; + +int block_get(struct block **blk_ret, u64 blkno, int bf); +void block_put(struct block **blkp); + +void *block_buf(struct block *blk); +size_t block_size(struct block *blk); +void block_drop(struct block **blkp); + +void block_readahead(u64 *blknos, size_t nr); +int block_try_commit(bool force); + +int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes); +void block_shutdown(void); + +#endif diff --git a/utils/src/check/btree.c b/utils/src/check/btree.c new file mode 100644 index 000000000..50bd1fa2b --- /dev/null +++ b/utils/src/check/btree.c @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" +#include "avl.h" + +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "meta.h" +#include "problem.h" + +static inline 
void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item) +{ + return (void *)bt + le16_to_cpu(item->val_off); +} + +static void readahead_refs(struct scoutfs_btree_block *bt) +{ + struct scoutfs_btree_item *item; + struct scoutfs_avl_node *node; + struct scoutfs_block_ref *ref; + u64 *blknos; + u64 blkno; + u16 valid = 0; + u16 nr = le16_to_cpu(bt->nr_items); + int i; + + blknos = calloc(nr, sizeof(blknos[0])); + if (!blknos) + return; + + node = avl_first(&bt->item_root); + + for (i = 0; i < nr; i++) { + item = container_of(node, struct scoutfs_btree_item, node); + ref = item_val(bt, item); + blkno = le64_to_cpu(ref->blkno); + + if (valid_meta_blkno(blkno)) + blknos[valid++] = blkno; + + node = avl_next(&bt->item_root, &item->node); + } + + if (valid > 0) + block_readahead(blknos, valid); + free(blknos); +} + +/* + * Call the callback on the referenced block. Then if the block + * contains referneces read it and recurse into all its references. + */ +static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb, + void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + return 0; + } + + if (level == 0) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + sns_push("btree_parent", blkno, 0); + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead last level of parents */ + if (level == 2) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct 
scoutfs_btree_item, node); + ref = item_val(bt, item); + + ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg); + if (ret < 0) + goto out; + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg); +} + +static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level, + btree_item_cb_t cb, void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + if (level) + sns_push("btree_parent", blkno, 0); + else + sns_push("btree_leaf", blkno, 0); + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead leaves that contain items */ + if (level == 1) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct scoutfs_btree_item, node); + + if (level) { + ref = item_val(bt, item); + ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg); + } else { + ret = cb(&item->key, item_val(bt, item), + le16_to_cpu(item->val_len), cb_arg); + debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret); + } + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void 
*cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg); +} diff --git a/utils/src/check/btree.h b/utils/src/check/btree.h new file mode 100644 index 000000000..dc0b3bf97 --- /dev/null +++ b/utils/src/check/btree.h @@ -0,0 +1,14 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_ +#define _SCOUTFS_UTILS_CHECK_BTREE_H_ + +#include "util.h" +#include "format.h" + +#include "extent.h" + +typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg); + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg); +int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/check.c b/utils/src/check/check.c new file mode 100644 index 000000000..b74b5c52b --- /dev/null +++ b/utils/src/check/check.c @@ -0,0 +1,152 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "ioctl.h" +#include "cmd.h" +#include "dev.h" + +#include "alloc.h" +#include "block.h" +#include "debug.h" +#include "meta.h" +#include "super.h" + +struct check_args { + char *meta_device; + char *data_device; + char *debug_path; +}; + +static int do_check(struct check_args *args) +{ + int debug_fd = -1; + int meta_fd = -1; + int data_fd = -1; + int ret; + + if (args->debug_path) { + if (strcmp(args->debug_path, "-") == 0) + debug_fd = dup(STDERR_FILENO); + else + debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (debug_fd < 0) { + ret = -errno; + fprintf(stderr, "error opening debug output file '%s': %s (%d)\n", + args->debug_path, strerror(errno), errno); + goto out; + } + + debug_enable(debug_fd); + } + + meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL); + if (meta_fd < 0) { 
+ ret = -errno; + fprintf(stderr, "failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL); + if (data_fd < 0) { + ret = -errno; + fprintf(stderr, "failed to open data device '%s': %s (%d)\n", + args->data_device, strerror(errno), errno); + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024); + if (ret < 0) + goto out; + + ret = check_supers() ?: + check_meta_refs(); +out: + /* and tear it all down */ + block_shutdown(); + super_shutdown(); + debug_disable(); + + if (meta_fd >= 0) + close(meta_fd); + if (data_fd >= 0) + close(data_fd); + if (debug_fd >= 0) + close(debug_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct check_args *args = state->input; + + switch (key) { + case 'd': + args->debug_path = strdup_or_error(state, arg); + break; + case 'e': + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else if (!args->data_device) + args->data_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + if (!args->data_device) + argp_error(state, "no data device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"}, + { NULL } +}; + +static struct argp argp = { + options, + parse_opt, + "META-DEVICE DATA-DEVICE", + "Check filesystem consistency" +}; + +static int check_cmd(int argc, char **argv) +{ + struct check_args check_args = {NULL}; + int ret; + + ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args); + if (ret) + return ret; + + return do_check(&check_args); +} + +static void 
__attribute__((constructor)) check_ctor(void) +{ + cmd_register_argp("check", &argp, GROUP_CORE, check_cmd); +} diff --git a/utils/src/check/debug.c b/utils/src/check/debug.c new file mode 100644 index 000000000..0017c1aa8 --- /dev/null +++ b/utils/src/check/debug.c @@ -0,0 +1,16 @@ +#include + +#include "debug.h" + +int debug_fd = -1; + +void debug_enable(int fd) +{ + debug_fd = fd; +} + +void debug_disable(void) +{ + if (debug_fd >= 0) + debug_fd = -1; +} diff --git a/utils/src/check/debug.h b/utils/src/check/debug.h new file mode 100644 index 000000000..a51034942 --- /dev/null +++ b/utils/src/check/debug.h @@ -0,0 +1,17 @@ +#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_ +#define _SCOUTFS_UTILS_CHECK_DEBUG_H_ + +#include + +#define debug(fmt, args...) \ +do { \ + if (debug_fd >= 0) \ + dprintf(debug_fd, fmt"\n", ##args); \ +} while (0) + +extern int debug_fd; + +void debug_enable(int fd); +void debug_disable(void); + +#endif diff --git a/utils/src/check/eno.h b/utils/src/check/eno.h new file mode 100644 index 000000000..14579fcee --- /dev/null +++ b/utils/src/check/eno.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_ +#define _SCOUTFS_UTILS_CHECK_ENO_H_ + +#include + +#define ENO_FMT "%d (%s)" +#define ENO_ARG(eno) eno, strerror(eno) + +#endif diff --git a/utils/src/check/extent.c b/utils/src/check/extent.c new file mode 100644 index 000000000..bbbcc8877 --- /dev/null +++ b/utils/src/check/extent.c @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include + +#include "util.h" +#include "lk_rbtree_wrapper.h" + +#include "debug.h" +#include "extent.h" + +/* + * In-memory extent management in rbtree nodes. 
+ */ + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len) +{ + u64 a_end = a_start + a_len; + u64 b_end = b_start + b_len; + + return !((a_end <= b_start) || (b_end <= a_start)); +} + +static int ext_contains(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start <= start && ext->start + ext->len >= start + len; +} + +/* + * True if the given extent is bisected by the given range; there's + * leftover containing extents on both the left and right sides of the + * range in the extent. + */ +static int ext_bisected(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start < start && ext->start + ext->len > start + len; +} + +static struct extent_node *ext_from_rbnode(struct rb_node *rbnode) +{ + return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL; +} + +static struct extent_node *next_ext(struct extent_node *ext) +{ + return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL; +} + +static struct extent_node *prev_ext(struct extent_node *ext) +{ + return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL; +} + +struct walk_results { + unsigned bisect_to_leaf:1; + struct extent_node *found; + struct extent_node *next; + struct rb_node *parent; + struct rb_node **node; +}; + +static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk) +{ + struct rb_node **node = &root->rbroot.rb_node; + struct extent_node *ext; + u64 end = start + len; + int cmp; + + wlk->found = NULL; + wlk->next = NULL; + wlk->parent = NULL; + + while (*node) { + wlk->parent = *node; + ext = ext_from_rbnode(*node); + cmp = end <= ext->start ? -1 : + start >= ext->start + ext->len ? 
1 : 0; + + if (cmp < 0) { + node = &ext->rbnode.rb_left; + wlk->next = ext; + } else if (cmp > 0) { + node = &ext->rbnode.rb_right; + } else { + wlk->found = ext; + if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len))) + break; + /* walk right so we can insert greater right from bisection */ + node = &ext->rbnode.rb_right; + } + } + + wlk->node = node; +} + +/* + * Return an extent that overlaps with the given range. + */ +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found) +{ + struct walk_results wlk = { 0, }; + int ret; + + walk_extents(root, start, len, &wlk); + if (wlk.found) { + memset(found, 0, sizeof(struct extent_node)); + found->start = wlk.found->start; + found->len = wlk.found->len; + ret = 0; + } else { + ret = -ENOENT; + } + + return ret; +} + +/* + * Callers can iterate through direct node references and are entirely + * responsible for consistency when doing so. + */ +struct extent_node *extent_first(struct extent_root *root) +{ + struct walk_results wlk = { 0, }; + + walk_extents(root, 0, 1, &wlk); + + return wlk.found ?: wlk.next; +} + +struct extent_node *extent_next(struct extent_node *ext) +{ + return next_ext(ext); +} + +struct extent_node *extent_prev(struct extent_node *ext) +{ + return prev_ext(ext); +} + +/* + * Insert a new extent into the tree. We can extend existing nodes, + * merge with neighbours, or remove existing extents entirely if we + * insert a range that fully spans existing nodes. 
+ */ +static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err) +{ + struct walk_results wlk = { 0, }; + struct extent_node *ext; + struct extent_node *nei; + int ret; + + walk_extents(root, start, len, &wlk); + + ext = wlk.found; + if (ext && found_err) { + ret = found_err; + goto out; + } + + if (!ext) { + ext = malloc(sizeof(struct extent_node)); + if (!ext) { + ret = -ENOMEM; + goto out; + } + + ext->start = start; + ext->len = len; + + rb_link_node(&ext->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ext->rbnode, &root->rbroot); + } + + /* start by expanding an existing extent if our range is larger */ + if (start < ext->start) { + ext->len += ext->start - start; + ext->start = start; + } + if (ext->start + ext->len < start + len) + ext->len += (start + len) - (ext->start + ext->len); + + /* drop any fully spanned neighbors, possibly merging with a final adjacent one */ + + while ((nei = prev_ext(ext))) { + if (nei->start + nei->len < ext->start) + break; + + if (nei->start < ext->start) { + ext->len += ext->start - nei->start; + ext->start = nei->start; + } + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + while ((nei = next_ext(ext))) { + if (ext->start + ext->len < nei->start) + break; + + if (ext->start + ext->len < nei->start + nei->len) + ext->len += (nei->start + nei->len) - (ext->start + ext->len); + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + ret = 0; +out: + if (ret < 0) + debug("start %llu len %llu ret %d", start, len, ret); + return ret; +} + +/* + * Insert a new extent. The specified extent must not overlap with any + * existing extents or -EEXIST is returned. + */ +int extent_insert_new(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, true); +} + +/* + * Insert an extent, extending any existing extents that may overlap. 
+ */ +int extent_insert_extend(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, false); +} + +/* + * Remove the specified extent from an existing node. The given extent must be fully + * contained in a single node or -ENOENT is returned. + */ +int extent_remove(struct extent_root *root, u64 start, u64 len) +{ + struct extent_node *ext; + struct extent_node *ins; + struct walk_results wlk = { + .bisect_to_leaf = 1, + }; + int ret; + + walk_extents(root, start, len, &wlk); + + if (!(ext = wlk.found) || !ext_contains(ext, start, len)) { + ret = -ENOENT; + goto out; + } + + if (ext_bisected(ext, start, len)) { + debug("found bisected start %llu len %llu", ext->start, ext->len); + ins = malloc(sizeof(struct extent_node)); + if (!ins) { + ret = -ENOMEM; + goto out; + } + + ins->start = start + len; + ins->len = (ext->start + ext->len) - ins->start; + + rb_link_node(&ins->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ins->rbnode, &root->rbroot); + } + + if (start > ext->start) { + ext->len = start - ext->start; + } else if (len < ext->len) { + ext->start += len; + ext->len -= len; + } else { + rb_erase(&ext->rbnode, &root->rbroot); + } + + ret = 0; +out: + debug("start %llu len %llu ret %d", start, len, ret); + + return ret; +} + +void extent_root_init(struct extent_root *root) +{ + root->rbroot = RB_ROOT; + root->total = 0; +} + +void extent_root_free(struct extent_root *root) +{ + struct extent_node *ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + rb_erase(&ext->rbnode, &root->rbroot); + free(ext); + } +} + +void extent_root_print(struct extent_root *root) +{ + struct extent_node *ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + debug(" 
start %llu len %llu", ext->start, ext->len); + } +} diff --git a/utils/src/check/extent.h b/utils/src/check/extent.h new file mode 100644 index 000000000..2a38f7655 --- /dev/null +++ b/utils/src/check/extent.h @@ -0,0 +1,38 @@ +#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_ +#define _SCOUTFS_UTILS_CHECK_EXTENT_H_ + +#include "lk_rbtree_wrapper.h" + +struct extent_root { + struct rb_root rbroot; + u64 total; +}; + +struct extent_node { + struct rb_node rbnode; + u64 start; + u64 len; +}; + +typedef int (*extent_cb_t)(u64 start, u64 len, void *arg); + +struct extent_cb_arg_t { + extent_cb_t cb; + void *cb_arg; +}; + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len); + +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found); +struct extent_node *extent_first(struct extent_root *root); +struct extent_node *extent_next(struct extent_node *ext); +struct extent_node *extent_prev(struct extent_node *ext); +int extent_insert_new(struct extent_root *root, u64 start, u64 len); +int extent_insert_extend(struct extent_root *root, u64 start, u64 len); +int extent_remove(struct extent_root *root, u64 start, u64 len); + +void extent_root_init(struct extent_root *root); +void extent_root_free(struct extent_root *root); +void extent_root_print(struct extent_root *root); + +#endif diff --git a/utils/src/check/iter.h b/utils/src/check/iter.h new file mode 100644 index 000000000..54c5d13b7 --- /dev/null +++ b/utils/src/check/iter.h @@ -0,0 +1,15 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_ +#define _SCOUTFS_UTILS_CHECK_ITER_H_ + +/* + * Callbacks can return a weird -errno that we'll never use to indicate + * that iteration can stop and return 0 for success. + */ +#define ECHECK_ITER_DONE EL2HLT + +static inline int xlate_iter_errno(int ret) +{ + return ret == -ECHECK_ITER_DONE ? 
0 : ret; +} + +#endif diff --git a/utils/src/check/log_trees.c b/utils/src/check/log_trees.c new file mode 100644 index 000000000..627052c7e --- /dev/null +++ b/utils/src/check/log_trees.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "log_trees.h" +#include "super.h" + +struct iter_args { + extent_cb_t cb; + void *cb_arg; +}; + +static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct iter_args *ia = cb_arg; + struct scoutfs_log_trees *lt; + int ret; + + if (val_len != sizeof(struct scoutfs_log_trees)) + ; /* XXX */ + + lt = val; + + sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + sns_push("meta_avail", 0, 0); + ret = alloc_list_meta_iter(<->meta_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_freed", 0, 0); + ret = alloc_list_meta_iter(<->meta_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("item_root", 0, 0); + ret = btree_meta_iter(<->item_root, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + if (lt->bloom_ref.blkno) { + sns_push("bloom_ref", 0, 0); + ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + sns_push("data_avail", 0, 0); + ret = alloc_root_meta_iter(<->data_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_freed", 0, 0); + ret = alloc_root_meta_iter(<->data_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + ret = 0; +out: + sns_pop(); + + return ret; +} + +/* + * Call the callers callback with the extent of all the metadata block references contained + 
* in log btrees. We walk the logs_root btree items and walk all the metadata structures + * they reference. + */ +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_super_block *super = global_super; + struct iter_args ia = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&super->logs_root, lt_meta_iter, &ia); +} diff --git a/utils/src/check/log_trees.h b/utils/src/check/log_trees.h new file mode 100644 index 000000000..7a7150b12 --- /dev/null +++ b/utils/src/check/log_trees.h @@ -0,0 +1,8 @@ +#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ +#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ + +#include "extent.h" + +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/meta.c b/utils/src/check/meta.c new file mode 100644 index 000000000..40a2e5a58 --- /dev/null +++ b/utils/src/check/meta.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "sns.h" +#include "log_trees.h" +#include "meta.h" +#include "problem.h" +#include "super.h" + +static struct meta_data { + struct extent_root meta_refed; + struct extent_root meta_free; + struct { + u64 ref_blocks; + u64 free_extents; + u64 free_blocks; + } stats; +} global_mdat; + +bool valid_meta_blkno(u64 blkno) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + + return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot; +} + +static bool valid_meta_extent(u64 start, u64 len) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + bool valid; + + valid = len > 0 && + start >= SCOUTFS_META_DEV_START_BLKNO && + start < tot && + len <= tot && + ((start + len) <= tot) && + ((start + len) > start); + + debug("start %llu len %llu valid %u", start, len, !!valid); + + if (!valid) + problem(PB_META_EXTENT_INVALID, "start %llu len %llu", 
start, len); + + return valid; +} + +/* + * Track references to individual metadata blocks. This uses the extent + * callback type but is only ever called for single block references. + * Any reference to a block that has already been referenced is + * considered invalid and is ignored. Later repair will resolve + * duplicate references. + */ +static int insert_meta_ref(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + /* this is tracking single metadata block references */ + if (len != 1) { + ret = -EINVAL; + goto out; + } + + if (valid_meta_blkno(start)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) + mdat->stats.ref_blocks++; + else if (ret == -EEXIST) + problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start); + } + +out: + return ret; +} + +static int insert_meta_free(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + if (valid_meta_extent(start, len)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) { + mdat->stats.free_extents++; + mdat->stats.free_blocks++; + + } else if (ret == -EEXIST) { + problem(PB_META_FREE_OVERLAPS_EXISTING, + "start %llu llen %llu", start, len); + } + + } + + return ret; +} + +/* + * Walk all metadata references in the system. This walk doesn't need + * to read metadata that doesn't contain any metadata references so it + * can skip the bulk of metadata blocks. This gives us the set of + * referenced metadata blocks which we can then use to repair metadata + * allocator structures. 
+ */ +static int get_meta_refs(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_refed); + + /* XXX record reserved blocks around super as referenced */ + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("fs_root", 0, 0); + ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("logs_root", 0, 0); + ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("log_merge", 0, 0); + ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("mounted_clients", 0, 0); + ret = btree_meta_iter(&super->mounted_clients, 
insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("srch_root", 0, 0); + ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed); + if (ret < 0) + goto out; + + debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks); + ret = 0; +out: + return ret; +} + +static int get_meta_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_free); + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + debug("found %llu free metadata blocks in %llu extents", + mdat->stats.free_blocks, mdat->stats.free_extents); + ret = 0; +out: + return ret; +} + +/* + * All the space between referenced blocks must be recorded in the free + * extents. 
The free extent walk didn't check that the extents + * overlapped with references, we do that here. Remember that metadata + * block references were merged into extents here, the refed extents + * aren't necessarily all a single block. + */ +static int compare_refs_and_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct extent_node *ref; + struct extent_node *free; + struct extent_node *next; + struct extent_node *prev; + u64 expect; + u64 start; + u64 end; + + expect = 0; + ref = extent_first(&mdat->meta_refed); + free = extent_first(&mdat->meta_free); + while (ref || free) { + + debug("exp %llu ref %llu.%llu free %llu.%llu", + expect, ref ? ref->start : 0, ref ? ref->len : 0, + free ? free->start : 0, free ? free->len : 0); + + /* referenced marked free, remove ref from free and continue from same point */ + if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) { + debug("ref extent %llu.%llu overlaps free %llu %llu", + ref->start, ref->len, free->start, free->len); + + start = max(ref->start, free->start); + end = min(ref->start + ref->len, free->start + free->len); + + prev = extent_prev(free); + + extent_remove(&mdat->meta_free, start, end - start); + + if (prev) + free = extent_next(prev); + else + free = extent_first(&mdat->meta_free); + continue; + } + + /* see which extent starts earlier */ + if (!free || (ref && ref->start <= free->start)) + next = ref; + else + next = free; + + /* untracked region before next extent */ + if (expect < next->start) { + debug("missing free extent %llu.%llu", expect, next->start - expect); + expect = next->start; + continue; + } + + + /* didn't overlap, advance past next extent */ + expect = next->start + next->len; + if (next == ref) + ref = extent_next(ref); + else + free = extent_next(free); + } + + return 0; +} + +/* + * Check the metadata allocators by comparing the set of referenced + * blocks with the set of free blocks that are stored in free btree + * items and alloc list 
blocks. + */ +int check_meta_alloc(void) +{ + int ret; + + ret = get_meta_refs(); + if (ret < 0) + goto out; + + ret = get_meta_free(); + if (ret < 0) + goto out; + + ret = compare_refs_and_free(); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} diff --git a/utils/src/check/meta.h b/utils/src/check/meta.h new file mode 100644 index 000000000..80c97a03a --- /dev/null +++ b/utils/src/check/meta.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_META_H_ +#define _SCOUTFS_UTILS_CHECK_META_H_ + +bool valid_meta_blkno(u64 blkno); + +int check_meta_alloc(void); + +#endif + diff --git a/utils/src/check/padding.c b/utils/src/check/padding.c new file mode 100644 index 000000000..81e12c333 --- /dev/null +++ b/utils/src/check/padding.c @@ -0,0 +1,23 @@ +#include +#include + +#include "util.h" +#include "padding.h" + +bool padding_is_zeros(const void *data, size_t sz) +{ + static char zeros[32] = {0,}; + const size_t batch = array_size(zeros); + + while (sz >= batch) { + if (memcmp(data, zeros, batch)) + return false; + data += batch; + sz -= batch; + } + + if (sz > 0 && memcmp(data, zeros, sz)) + return false; + + return true; +} diff --git a/utils/src/check/padding.h b/utils/src/check/padding.h new file mode 100644 index 000000000..9bf03a81b --- /dev/null +++ b/utils/src/check/padding.h @@ -0,0 +1,6 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_ +#define _SCOUTFS_UTILS_CHECK_PADDING_H_ + +bool padding_is_zeros(const void *data, size_t sz); + +#endif diff --git a/utils/src/check/problem.c b/utils/src/check/problem.c new file mode 100644 index 000000000..2191726f2 --- /dev/null +++ b/utils/src/check/problem.c @@ -0,0 +1,23 @@ +#include +#include + +#include "problem.h" + +#if 0 +#define PROB_STR(pb) [pb] = #pb +static char *prob_strs[] = { + PROB_STR(PB_META_EXTENT_INVALID), + PROB_STR(PB_META_EXTENT_OVERLAPS_EXISTING), +}; +#endif + +static struct problem_data { + uint64_t counts[PB__NR]; +} global_pdat; + +void problem_record(prob_t pb) +{ + struct problem_data *pdat 
= &global_pdat; + + pdat->counts[pb]++; +} diff --git a/utils/src/check/problem.h b/utils/src/check/problem.h new file mode 100644 index 000000000..ce7b7fde2 --- /dev/null +++ b/utils/src/check/problem.h @@ -0,0 +1,23 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_ +#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_ + +#include "debug.h" +#include "sns.h" + +typedef enum { + PB_META_EXTENT_INVALID, + PB_META_REF_OVERLAPS_EXISTING, + PB_META_FREE_OVERLAPS_EXISTING, + PB_BTREE_BLOCK_BAD_LEVEL, + PB__NR, +} prob_t; + +#define problem(pb, fmt, ...) \ +do { \ + debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \ + problem_record(pb); \ +} while (0) + +void problem_record(prob_t pb); + +#endif diff --git a/utils/src/check/sns.c b/utils/src/check/sns.c new file mode 100644 index 000000000..45f454531 --- /dev/null +++ b/utils/src/check/sns.c @@ -0,0 +1,118 @@ +#include +#include + +#include "sns.h" + +/* + * This "str num stack" is used to describe our location in metadata at + * any given time. + * + * As we descend into structures we pop a string on decribing them, + * perhaps with associated numbers. 
Pushing and popping is very cheap + * and only rarely do we format the stack into a string, as an arbitrary + * example: + * super.fs_root.btree_parent:1231.btree_leaf:3231" + */ + +#define SNS_MAX_DEPTH 1000 +#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1)) + +static struct sns_data { + unsigned int depth; + + struct sns_entry { + char *str; + size_t len; + u64 a; + u64 b; + } ents[SNS_MAX_DEPTH]; + + char str[SNS_STR_SIZE]; + +} global_lsdat; + +void _sns_push(char *str, size_t len, u64 a, u64 b) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth < SNS_MAX_DEPTH) { + lsdat->ents[lsdat->depth++] = (struct sns_entry) { + .str = str, + .len = len, + .a = a, + .b = b, + }; + } +} + +void sns_pop(void) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth > 0) + lsdat->depth--; +} + +static char *append_str(char *pos, char *str, size_t len) +{ + memcpy(pos, str, len); + return pos + len; +} + +/* + * This is not called for x = 0 so we don't need to emit an initial 0. + * We could by using do {} while instead of while {}. + */ +static char *append_u64x(char *pos, u64 x) +{ + static char hex[] = "0123456789abcdef"; + + while (x) { + *pos++ = hex[x & 0xf]; + x >>= 4; + } + + return pos; +} + +static char *append_char(char *pos, char c) +{ + *(pos++) = c; + return pos; +} + +/* + * Return a pointer to a null terminated string that describes the + * current location stack. The string buffer is global. 
+ */ +char *sns_str(void) +{ + struct sns_data *lsdat = &global_lsdat; + struct sns_entry *ent; + char *pos; + int i; + + pos = lsdat->str; + for (i = 0; i < lsdat->depth; i++) { + ent = &lsdat->ents[i]; + + if (i) + pos = append_char(pos, '.'); + + pos = append_str(pos, ent->str, ent->len); + + if (ent->a) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->a); + } + + if (ent->b) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->b); + } + } + + *pos = '\0'; + + return lsdat->str; +} diff --git a/utils/src/check/sns.h b/utils/src/check/sns.h new file mode 100644 index 000000000..34c1a2be9 --- /dev/null +++ b/utils/src/check/sns.h @@ -0,0 +1,20 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_ +#define _SCOUTFS_UTILS_CHECK_SNS_H_ + +#include + +#include "sparse.h" + +#define SNS_MAX_STR_LEN 20 + +#define sns_push(str, a, b) \ +do { \ + build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \ + _sns_push((str), sizeof(str) - 1, a, b); \ +} while (0) + +void _sns_push(char *str, size_t len, u64 a, u64 b); +void sns_pop(void); +char *sns_str(void); + +#endif diff --git a/utils/src/check/super.c b/utils/src/check/super.c new file mode 100644 index 000000000..9c2f078db --- /dev/null +++ b/utils/src/check/super.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" + +#include "block.h" +#include "super.h" + +/* + * After we check the super blocks we provide a global buffer to track + * the current super block. It is referenced to get static information + * about the system and is also modified and written as part of + * transactions. + */ +struct scoutfs_super_block *global_super; + +/* + * After checking the supers we save a copy of it in a global buffer that's used by + * other modules to track the current super. It can be modified and written during commits. 
+ */ +int check_supers(void) +{ + struct scoutfs_super_block *super = NULL; + struct block *blk = NULL; + int ret; + + global_super = malloc(sizeof(struct scoutfs_super_block)); + if (!global_super) { + fprintf(stderr, "error allocating super block buffer\n"); + ret = -ENOMEM; + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + goto out; + } + + super = block_buf(blk); + + memcpy(global_super, super, sizeof(struct scoutfs_super_block)); + ret = 0; +out: + block_put(&blk); + + return ret; +} + +void super_shutdown(void) +{ + free(global_super); +} diff --git a/utils/src/check/super.h b/utils/src/check/super.h new file mode 100644 index 000000000..7c75ad2df --- /dev/null +++ b/utils/src/check/super.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_ +#define _SCOUTFS_UTILS_CHECK_SUPER_H_ + +extern struct scoutfs_super_block *global_super; + +int check_supers(void); +void super_shutdown(void); + +#endif From 7c7b7e6eb91d03c288b06beffa907031cf2038c4 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 6 Mar 2024 14:33:24 -0800 Subject: [PATCH 02/15] Fix partial rename to check_meta_alloc As I was committing the initial check command I had only partially completed a rename of the function that checks the metadata allocators. 
Signed-off-by: Zach Brown --- utils/src/check/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/src/check/check.c b/utils/src/check/check.c index b74b5c52b..df07f4037 100644 --- a/utils/src/check/check.c +++ b/utils/src/check/check.c @@ -75,7 +75,7 @@ static int do_check(struct check_args *args) goto out; ret = check_supers() ?: - check_meta_refs(); + check_meta_alloc(); out: /* and tear it all down */ block_shutdown(); From cfab835242aee4acfd77c5ffbbf1457602bed744 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 6 Mar 2024 14:33:26 -0800 Subject: [PATCH 03/15] Add {read,write}-metadata-image scoutfs commands Signed-off-by: Zach Brown --- utils/src/check/image.c | 540 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 540 insertions(+) create mode 100644 utils/src/check/image.c diff --git a/utils/src/check/image.c b/utils/src/check/image.c new file mode 100644 index 000000000..061076c6e --- /dev/null +++ b/utils/src/check/image.c @@ -0,0 +1,540 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "bitmap.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "cmd.h" +#include "dev.h" + +#include "alloc.h" +#include "block.h" +#include "btree.h" +#include "log_trees.h" +#include "super.h" + +/* huh. 
*/ +#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1) + +#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL +#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL + +struct scoutfs_meta_image_header { + __le64 magic; + __le64 total_bytes; + __le32 version; +} __packed; + +struct scoutfs_meta_image_block_header { + __le64 magic; + __le64 offset; + __le32 size; + __le32 crc; +} __packed; + +struct image_args { + char *meta_device; + bool is_read; + bool show_header; + u64 ra_window; +}; + +struct block_bitmaps { + unsigned long *bits; + u64 size; + u64 count; +}; + +#define errf(fmt, args...) \ + dprintf(STDERR_FILENO, fmt, ##args) + +static int set_meta_bit(u64 start, u64 len, void *arg) +{ + struct block_bitmaps *bm = arg; + int ret; + + if (len != 1) { + ret = -EINVAL; + } else { + if (!test_bit(bm->bits, start)) { + set_bit(bm->bits, start); + bm->count++; + } + ret = 0; + } + + return ret; +} + +static int get_ref_bits(struct block_bitmaps *bm) +{ + struct scoutfs_super_block *super = global_super; + int ret; + u64 i; + + /* + * There are almost no small blocks we need to read, so we read + * them as the large blocks that contain them to simplify the + * block reading process. 
+ */ + set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) + set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?: + btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?: + btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?: + btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?: + log_trees_meta_iter(set_meta_bit, bm); + + return ret; +} + +/* + * Note that this temporarily modifies the header that it's given. 
+ */ +static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size) +{ + __le32 saved = bh->crc; + u32 crc = ~0; + + bh->crc = 0; + crc = crc32c(crc, bh, sizeof(*bh)); + crc = crc32c(crc, buf, size); + bh->crc = saved; + + return cpu_to_le32(crc); +} + +static void printf_header(struct scoutfs_meta_image_header *hdr) +{ + errf("magic: 0x%016llx\n" + "total_bytes: %llu\n" + "version: %u\n", + le64_to_cpu(hdr->magic), + le64_to_cpu(hdr->total_bytes), + le32_to_cpu(hdr->version)); +} + +typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset); + +static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset) +{ + return read(fd, buf, count); +} + +static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset) +{ + return write(fd, buf, count); +} + +static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset) +{ + return pwrite(fd, buf, count, offset); +} + +static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset) +{ + ssize_t sret; + + while (count > 0) { + sret = func(fd, buf, count, offset); + if (sret <= 0 || sret > count) { + if (sret < 0) + return -errno; + else + return -EIO; + } + + if (tot) + *tot += sret; + buf += sret; + count -= sret; + } + + return 0; +} + +static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + u64 opening; + void *buf; + off_t off; + u64 bit; + u64 ra; + int ret; + + buf = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC); + hdr.total_bytes = cpu_to_le64(sizeof(hdr) + + (bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh)))); + hdr.version = cpu_to_le32(1); + + if 
(args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + opening = args->ra_window; + ra = 0; + bit = 0; + + for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) { + + /* readahead to open the full window, then a block at a time */ + do { + ra = find_next_set_bit(bm->bits, ra, bm->size); + if (ra < bm->size) { + off = ra << SCOUTFS_BLOCK_LG_SHIFT; + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED); + ra++; + if (opening) + opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE); + } + } while (opening > 0); + + off = bit << SCOUTFS_BLOCK_LG_SHIFT; + ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off); + if (ret < 0) + goto out; + + /* + * Might as well try to drop the pages we've used to + * reduce memory pressure on our read-ahead pages that + * are waiting. + */ + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED); + + bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + bh.offset = cpu_to_le64(off); + bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE); + bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE); + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?: + rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0); + if (ret < 0) + goto out; + } + +out: + free(buf); + + return ret; +} + +static int invalid_header(struct scoutfs_meta_image_header *hdr) +{ + if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) { + errf("bad image header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC); + + } else if (le32_to_cpu(hdr->version) != 1) { + errf("unknown image header version %u\n", le32_to_cpu(hdr->version)); + + } else { + return 0; + } + + return -EIO; +} + +/* + * Doesn't catch offset+size overflowing, presumes pwrite() will return + * an error. 
+ */ +static int invalid_block_header(struct scoutfs_meta_image_block_header *bh) +{ + if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) { + errf("bad block header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + + } else if (le32_to_cpu(bh->size) == 0) { + errf("invalid block header size %u\n", le32_to_cpu(bh->size)); + + } else if (le32_to_cpu(bh->size) > SIZE_MAX) { + errf("block header size %u too large for size_t (> %zu)\n", + le32_to_cpu(bh->size), (size_t)SIZE_MAX); + + } else if (le64_to_cpu(bh->offset) > OFF_MAX) { + errf("block header offset %llu too large for off_t (> %llu)\n", + le64_to_cpu(bh->offset), (u64)OFF_MAX); + + } else { + return 0; + } + + return -EIO; +} + +static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + size_t writeback_batch = (2 * 1024 * 1024); + size_t buf_size; + size_t dirty; + size_t size; + off_t first; + off_t last; + off_t off; + __le32 calc; + void *buf; + u64 tot; + int ret; + + tot = 0; + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + if (args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = invalid_header(&hdr); + if (ret < 0) + goto out; + + dirty = 0; + first = OFF_MAX; + last = 0; + buf = NULL; + buf_size = 0; + + while (tot < le64_to_cpu(hdr.total_bytes)) { + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0); + if (ret < 0) + goto out; + + ret = invalid_block_header(&bh); + if (ret < 0) + goto out; + + size = le32_to_cpu(bh.size); + if (buf_size < size) { + buf = realloc(buf, size); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + buf_size = size; + } + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0); + if (ret < 0) + goto out; + + calc = calc_crc(&bh, buf, size); + if (calc != bh.crc) { + errf("crc 
err"); + ret = -EIO; + goto out; + } + + off = le64_to_cpu(bh.offset); + + ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off); + if (ret < 0) + goto out; + + dirty += size; + first = min(first, off); + last = max(last, off); + if (dirty >= writeback_batch) { + posix_fadvise(fd, first, last, POSIX_FADV_DONTNEED); + dirty = 0; + first = OFF_MAX; + last = 0; + } + } + + ret = fsync(fd); + if (ret < 0) { + ret = -errno; + goto out; + } + +out: + return ret; +} + +static int do_image(struct image_args *args) +{ + struct block_bitmaps bm = { .bits = NULL }; + int meta_fd = -1; + u64 dev_size; + mode_t mode; + int ret; + + mode = args->is_read ? O_RDONLY : O_RDWR; + + meta_fd = open(args->meta_device, mode); + if (meta_fd < 0) { + ret = -errno; + errf("failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + if (args->is_read) { + ret = flush_device(meta_fd); + if (ret < 0) + goto out; + + ret = get_device_size(args->meta_device, meta_fd, &dev_size); + if (ret < 0) + goto out; + + bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE); + bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8); + if (!bm.bits) { + ret = -ENOMEM; + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?: + check_supers() ?: + get_ref_bits(&bm) ?: + read_image(args, meta_fd, &bm); + block_shutdown(); + } else { + ret = write_image(args, meta_fd, &bm); + } +out: + free(bm.bits); + + if (meta_fd >= 0) + close(meta_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct image_args *args = state->input; + int ret; + + switch (key) { + case 'h': + args->show_header = true; + break; + case 'r': + ret = parse_u64(arg, &args->ra_window); + if (ret) + argp_error(state, "readahead winddoe parse error"); + break; + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device 
arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" }, + { "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" }, + { NULL } +}; + +static struct argp read_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Read metadata image stream from metadata device file" +}; + +#define DEFAULT_RA_WINDOW (512 * 1024) + +static int read_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = true, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static struct argp write_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Write metadata image stream to metadata device file" +}; + +static int write_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = false, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static void __attribute__((constructor)) image_ctor(void) +{ + cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd); + cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd); +} From 9d68c8bba7a26684e8b323c96610f56b61bb0113 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 26 Mar 2024 18:26:33 -0400 Subject: [PATCH 04/15] Generic block header checks: crc, magic. Generally as we call block_get() we should validate that if the block has a hdr, at a minimum the crc is correct and the magic value is the expected value passed, and the fsid matches the superblock. 
This function implements just that. Returns -EINVAL, up to the caller to report a problem() and handle the outcome. For now the code just hard fails, which incidentally makes it fail the clobber-repair.sh tests I wrote. Signed-off-by: Auke Kok --- utils/src/check/alloc.c | 7 +++++++ utils/src/check/block.c | 42 +++++++++++++++++++++++++++++++++++++++++ utils/src/check/block.h | 2 ++ utils/src/check/btree.c | 8 ++++++++ utils/src/check/super.c | 8 ++++++++ 5 files changed, 67 insertions(+) diff --git a/utils/src/check/alloc.c b/utils/src/check/alloc.c index f67b66603..43d1d125c 100644 --- a/utils/src/check/alloc.c +++ b/utils/src/check/alloc.c @@ -50,6 +50,10 @@ int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, lblk = block_buf(blk); /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; + /* XXX sort? maybe */ ref = lblk->next; @@ -89,6 +93,9 @@ int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb lblk = block_buf(blk); /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; /* XXX sort? maybe */ ret = 0; diff --git a/utils/src/check/block.c b/utils/src/check/block.c index 53b6eed0b..13eeb8679 100644 --- a/utils/src/check/block.c +++ b/utils/src/check/block.c @@ -17,7 +17,10 @@ #include "block.h" #include "debug.h" +#include "super.h" #include "eno.h" +#include "crc.h" +#include "sns.h" static struct block_data { struct list_head *hash_lists; @@ -297,6 +300,45 @@ static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno) return &bdat->hash_lists[hash % bdat->hash_nr]; } +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic) +{ + struct scoutfs_block_header *hdr; + size_t size = (bf & BF_SM) ?
SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + int ret; + u32 crc; + + ret = block_get(&blk, blkno, bf); + if (ret < 0) { + fprintf(stderr, "error reading block %llu\n", blkno); + goto out; + } + + hdr = block_buf(blk); + + crc = crc_block(hdr, size); + + if ((le32_to_cpu(hdr->crc) != crc) || + (le32_to_cpu(hdr->magic) != magic)) + ret = -EINVAL; + + /* + * Our first caller fills in global_super. Until this completes, + * we can't do this check. + */ + if ((blkno != SCOUTFS_SUPER_BLKNO) && + (hdr->fsid != global_super->hdr.fsid)) + ret = -EINVAL; + + block_put(&blk); + + debug("%s blk_hdr_valid blkno %llu size %lu crc 0x%08x magic 0x%08x ret %d", + sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic), + ret); + +out: + return ret; +} + static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf) { struct list_head *bucket = hash_bucket(bdat, blkno); diff --git a/utils/src/check/block.h b/utils/src/check/block.h index ad7195ce8..6c13b0cc1 100644 --- a/utils/src/check/block.h +++ b/utils/src/check/block.h @@ -29,4 +29,6 @@ int block_try_commit(bool force); int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes); void block_shutdown(void); +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic); + #endif diff --git a/utils/src/check/btree.c b/utils/src/check/btree.c index 50bd1fa2b..ebf05b8c2 100644 --- a/utils/src/check/btree.c +++ b/utils/src/check/btree.c @@ -88,6 +88,10 @@ static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, ex if (ret < 0) return ret; + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + sns_push("btree_parent", blkno, 0); bt = block_buf(blk); @@ -157,6 +161,10 @@ static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level, else sns_push("btree_leaf", blkno, 0); + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + bt = block_buf(blk); /* XXX 
integrate verification with block cache */ diff --git a/utils/src/check/super.c b/utils/src/check/super.c index 9c2f078db..40f815eaf 100644 --- a/utils/src/check/super.c +++ b/utils/src/check/super.c @@ -28,6 +28,8 @@ int check_supers(void) struct block *blk = NULL; int ret; + sns_push("supers", 0, 0); + global_super = malloc(sizeof(struct scoutfs_super_block)); if (!global_super) { fprintf(stderr, "error allocating super block buffer\n"); @@ -41,6 +43,10 @@ int check_supers(void) goto out; } + ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER); + if (ret < 0) + return ret; + super = block_buf(blk); memcpy(global_super, super, sizeof(struct scoutfs_super_block)); @@ -48,6 +54,8 @@ int check_supers(void) out: block_put(&blk); + sns_pop(); + return ret; } From 17d99bdd0d13a97b15e9007ac421f04a0dd52426 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 21 Mar 2024 12:54:22 -0400 Subject: [PATCH 05/15] Add man page content for check. Adds basic man page content for the `check` subcommand. Signed-off-by: Auke Kok --- utils/man/scoutfs.8 | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index d105d87b8..bb3f67d99 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -76,6 +76,41 @@ run when the file system will not be mounted. .RE .PD +.TP +.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]" +.sp +Performs an offline file system check. The program iterates through all the +data structures on disk directly - the filesystem must not be mounted while +this operation is running. +.RS 1.0i +.PD 0 +.sp +.TP +.B "-d, --debug FILE" +An output file where the program can output debug information about the +state of the filesystem as it performs the check. If +.B FILE +is "-", the debug output is written to the Standard Error output. 
+.TP +.RE +.sp +.B RETURN VALUE +The check function can return the following exit codes: +.RS +.TP +\fB 0 \fR - no filesystem issues detected +.TP +\fB 4 \fR - file system issues were detected but left uncorrected +.TP +\fB 8 \fR - operational error +.TP +\fB 16 \fR - usage error +.TP +\fB 32 \fR - cancelled by user (SIGINT) +.TP +.RE +.PD + .TP .BI "counters [-t|--table] SYSFS-DIR" .sp From b53288ffdc47485aed85b4d2efa2fca2540c3900 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 11 Mar 2024 10:35:20 -0700 Subject: [PATCH 06/15] Superblock checks for meta and data dev. We check superblock magic, crc, flags. The data device superblock is checked as well, but a little less thoroughly. We check whether the device is still mounted, since that would make checking invalid to begin with. Quorum blocks are validated to have sane contents. We add a global problem counter so we can trivially measure and report whether any problem was found at all, instead of iterating over all the problems and checking each individual count. We pick the standard exit code values from `fsck` and mirror their intended behavior. As a result, `fsck.scoutfs` can now be trivially created by making it a wrapper around `scoutfs check`.
Signed-off-by: Auke Kok Signed-off-by: Hunter Shaffer --- utils/src/check/block.c | 11 ++- utils/src/check/check.c | 40 +++++++- utils/src/check/image.c | 2 +- utils/src/check/problem.c | 29 +++++- utils/src/check/problem.h | 15 +++ utils/src/check/super.c | 195 +++++++++++++++++++++++++++++++++++++- utils/src/check/super.h | 5 +- 7 files changed, 281 insertions(+), 16 deletions(-) diff --git a/utils/src/check/block.c b/utils/src/check/block.c index 13eeb8679..08535a5a7 100644 --- a/utils/src/check/block.c +++ b/utils/src/check/block.c @@ -317,8 +317,15 @@ int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic) crc = crc_block(hdr, size); - if ((le32_to_cpu(hdr->crc) != crc) || - (le32_to_cpu(hdr->magic) != magic)) + /* + * a bad CRC is easy to repair, so we pass a different error code + * back. Unless the other data is also wrong - then it's EINVAL + * to signal that this isn't a valid block hdr at all. + */ + if (le32_to_cpu(hdr->crc) != crc) + ret = -EIO; /* keep checking other fields */ + + if (le32_to_cpu(hdr->magic) != magic) ret = -EINVAL; /* diff --git a/utils/src/check/check.c b/utils/src/check/check.c index df07f4037..0fa8a8702 100644 --- a/utils/src/check/check.c +++ b/utils/src/check/check.c @@ -25,6 +25,7 @@ #include "debug.h" #include "meta.h" #include "super.h" +#include "problem.h" struct check_args { char *meta_device; @@ -74,8 +75,26 @@ static int do_check(struct check_args *args) if (ret < 0) goto out; - ret = check_supers() ?: - check_meta_alloc(); + /* + * At some point we may convert this to a multi-pass system where we may + * try and repair items, and, as long as repairs are made, we will rerun + * the checks more times. We may need to start counting how many problems we + * fix in the process of these loops, so that we don't stall on unrepairable + * problems and are making actual repair progress. IOW - when we do a full + * check loop without any problems fixed, we stop trying. 
+ */ + ret = check_supers(data_fd) ?: + check_super_in_use(meta_fd) ?: + check_meta_alloc() ?: + check_super_crc(); + + if (ret < 0) + goto out; + + debug("problem count %lu", problems_count()); + if (problems_count() > 0) + printf("Problems detected.\n"); + out: /* and tear it all down */ block_shutdown(); @@ -134,6 +153,12 @@ static struct argp argp = { "Check filesystem consistency" }; +/* Exit codes used by fsck-type programs */ +#define FSCK_EX_NONDESTRUCT 1 /* File system errors corrected */ +#define FSCK_EX_UNCORRECTED 4 /* File system errors left uncorrected */ +#define FSCK_EX_ERROR 8 /* Operational error */ +#define FSCK_EX_USAGE 16 /* Usage or syntax error */ + static int check_cmd(int argc, char **argv) { struct check_args check_args = {NULL}; @@ -141,9 +166,16 @@ static int check_cmd(int argc, char **argv) ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args); if (ret) - return ret; + exit(FSCK_EX_USAGE); + + ret = do_check(&check_args); + if (ret < 0) + ret = FSCK_EX_ERROR; + + if (problems_count() > 0) + ret |= FSCK_EX_UNCORRECTED; - return do_check(&check_args); + exit(ret); } static void __attribute__((constructor)) check_ctor(void) diff --git a/utils/src/check/image.c b/utils/src/check/image.c index 061076c6e..0932ece63 100644 --- a/utils/src/check/image.c +++ b/utils/src/check/image.c @@ -434,7 +434,7 @@ static int do_image(struct image_args *args) } ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?: - check_supers() ?: + check_supers(-1) ?: get_ref_bits(&bm) ?: read_image(args, meta_fd, &bm); block_shutdown(); diff --git a/utils/src/check/problem.c b/utils/src/check/problem.c index 2191726f2..fd8d42a99 100644 --- a/utils/src/check/problem.c +++ b/utils/src/check/problem.c @@ -3,16 +3,29 @@ #include "problem.h" -#if 0 #define PROB_STR(pb) [pb] = #pb -static char *prob_strs[] = { +char *prob_strs[] = { PROB_STR(PB_META_EXTENT_INVALID), - PROB_STR(PB_META_EXTENT_OVERLAPS_EXISTING), + PROB_STR(PB_META_REF_OVERLAPS_EXISTING), + 
PROB_STR(PB_META_FREE_OVERLAPS_EXISTING), + PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL), + PROB_STR(PB_SB_HDR_CRC_INVALID), + PROB_STR(PB_SB_HDR_MAGIC_INVALID), + PROB_STR(PB_FS_IN_USE), + PROB_STR(PB_MOUNTED_CLIENTS_REF_BLKNO), + PROB_STR(PB_SB_BAD_FLAG), + PROB_STR(PB_SB_BAD_FMT_VERS), + PROB_STR(PB_QCONF_WRONG_VERSION), + PROB_STR(PB_QSLOT_BAD_FAM), + PROB_STR(PB_QSLOT_BAD_PORT), + PROB_STR(PB_QSLOT_NO_ADDR), + PROB_STR(PB_QSLOT_BAD_ADDR), + PROB_STR(PB_DATA_DEV_SB_INVALID), }; -#endif static struct problem_data { uint64_t counts[PB__NR]; + uint64_t count; } global_pdat; void problem_record(prob_t pb) @@ -20,4 +33,12 @@ void problem_record(prob_t pb) struct problem_data *pdat = &global_pdat; pdat->counts[pb]++; + pdat->count++; +} + +uint64_t problems_count(void) +{ + struct problem_data *pdat = &global_pdat; + + return pdat->count; } diff --git a/utils/src/check/problem.h b/utils/src/check/problem.h index ce7b7fde2..6ac49bb54 100644 --- a/utils/src/check/problem.h +++ b/utils/src/check/problem.h @@ -9,9 +9,23 @@ typedef enum { PB_META_REF_OVERLAPS_EXISTING, PB_META_FREE_OVERLAPS_EXISTING, PB_BTREE_BLOCK_BAD_LEVEL, + PB_SB_HDR_CRC_INVALID, + PB_SB_HDR_MAGIC_INVALID, + PB_FS_IN_USE, + PB_MOUNTED_CLIENTS_REF_BLKNO, + PB_SB_BAD_FLAG, + PB_SB_BAD_FMT_VERS, + PB_QCONF_WRONG_VERSION, + PB_QSLOT_BAD_FAM, + PB_QSLOT_BAD_PORT, + PB_QSLOT_NO_ADDR, + PB_QSLOT_BAD_ADDR, + PB_DATA_DEV_SB_INVALID, PB__NR, } prob_t; +extern char *prob_strs[]; + #define problem(pb, fmt, ...) 
\ do { \ debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \ @@ -19,5 +33,6 @@ do { \ } while (0) void problem_record(prob_t pb); +uint64_t problems_count(void); #endif diff --git a/utils/src/check/super.c b/utils/src/check/super.c index 40f815eaf..e3c14fae3 100644 --- a/utils/src/check/super.c +++ b/utils/src/check/super.c @@ -2,13 +2,17 @@ #include #include #include +#include +#include #include "sparse.h" #include "util.h" #include "format.h" +#include "crc.h" #include "block.h" #include "super.h" +#include "problem.h" /* * After we check the super blocks we provide a global buffer to track @@ -18,14 +22,135 @@ */ struct scoutfs_super_block *global_super; +/* + * Check superblock crc. We can't use global_super here since it's not the + * whole block itself, but only the struct scoutfs_super_block, so it needs + * to reload a copy here. + */ +int check_super_crc(void) +{ + struct scoutfs_super_block *super = NULL; + struct scoutfs_block_header *hdr; + struct block *blk = NULL; + u32 crc; + int ret; + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_DIRTY); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + return ret; + } + + super = block_buf(blk); + crc = crc_block((struct scoutfs_block_header *)super, block_size(blk)); + hdr = &global_super->hdr; + debug("superblock crc 0x%04x calculated 0x%04x " "%s", le32_to_cpu(hdr->crc), crc, le32_to_cpu(hdr->crc) == crc ? "(match)" : "(mismatch)"); + + if (crc != le32_to_cpu(hdr->crc)) + problem(PB_SB_HDR_CRC_INVALID, "crc 0x%04x calculated 0x%04x", le32_to_cpu(hdr->crc), crc); + block_put(&blk); + + return 0; +} + +/* + * Crude check for the unlikely cases where the fs appears to still be mounted. + */ +int check_super_in_use(int meta_fd) +{ + int ret = meta_super_in_use(meta_fd, global_super); + debug("meta_super_in_use ret %d", ret); + + if (ret < 0) + problem(PB_FS_IN_USE, "File system appears in use. 
ret %d", ret); + + debug("global_super->mounted_clients.ref.blkno 0x%08llx", global_super->mounted_clients.ref.blkno); + if (global_super->mounted_clients.ref.blkno != 0) + problem(PB_MOUNTED_CLIENTS_REF_BLKNO, "Mounted clients ref blkno 0x%08llx", + global_super->mounted_clients.ref.blkno); + + return ret; +} + +/* + * quick glance data device superblock checks. + * + * -EIO for crc failures, all others -EINVAL + * + * caller must have run check_supers() first so that global_super is + * setup, so that we can cross-ref to it. + */ +static int check_data_super(int data_fd) +{ + struct scoutfs_super_block *super = NULL; + char *buf; + int ret = 0; + u32 crc; + ssize_t size = SCOUTFS_BLOCK_SM_SIZE; + off_t off = SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT; + + buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!buf) + return -ENOMEM; + + memset(buf, 0, size); + + if (lseek(data_fd, off, SEEK_SET) != off) + return -errno; + + if (read(data_fd, buf, size) < 0) { + ret = -errno; + goto out; + } + + super = (struct scoutfs_super_block *)buf; + + crc = crc_block((struct scoutfs_block_header *)buf, size); + + debug("data fsid 0x%016llx", le64_to_cpu(super->hdr.fsid)); + debug("data super magic 0x%04x", super->hdr.magic); + debug("data crc calc 0x%08x exp 0x%08x %s", crc, le32_to_cpu(super->hdr.crc), + crc == le32_to_cpu(super->hdr.crc) ? "(match)" : "(mismatch)"); + debug("data flags %llu fmt_vers %llu", le64_to_cpu(super->flags), le64_to_cpu(super->fmt_vers)); + + if (crc != le32_to_cpu(super->hdr.crc)) + /* tis but a scratch */ + ret = -EIO; + + if (le64_to_cpu(super->hdr.fsid) != le64_to_cpu(global_super->hdr.fsid)) + /* mismatched data bdev? not good */ + ret = -EINVAL; + + if (le32_to_cpu(super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + /* fsid matched but not a superblock? 
yikes */ + ret = -EINVAL; + + if (le64_to_cpu(super->flags) != 0) /* !SCOUTFS_FLAG_IS_META_BDEV */ + ret = -EINVAL; + + if ((le64_to_cpu(super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + ret = -EINVAL; + + if (ret != 0) + problem(PB_DATA_DEV_SB_INVALID, "data device is invalid or corrupt (%d)", ret); +out: + free(buf); + return ret; +} + /* * After checking the supers we save a copy of it in a global buffer that's used by * other modules to track the current super. It can be modified and written during commits. */ -int check_supers(void) +int check_supers(int data_fd) { struct scoutfs_super_block *super = NULL; struct block *blk = NULL; + struct scoutfs_quorum_slot* slot = NULL; + struct in_addr in; + uint16_t family; + uint16_t port; int ret; sns_push("supers", 0, 0); @@ -44,13 +169,75 @@ int check_supers(void) } ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER); - if (ret < 0) - return ret; super = block_buf(blk); + if (ret < 0) { + /* */ + if (ret == -EINVAL) { + /* that's really bad */ + fprintf(stderr, "superblock invalid magic\n"); + goto out; + } else if (ret == -EIO) + /* just report/count a CRC error */ + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + } + memcpy(global_super, super, sizeof(struct scoutfs_super_block)); - ret = 0; + + debug("Superblock flag: %llu", global_super->flags); + if (le64_to_cpu(global_super->flags) != SCOUTFS_FLAG_IS_META_BDEV) + problem(PB_SB_BAD_FLAG, "Bad flag: %llu expecting: 1 or 0", global_super->flags); + + debug("Superblock fmt_vers: %llu", le64_to_cpu(global_super->fmt_vers)); + if ((le64_to_cpu(global_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(global_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + problem(PB_SB_BAD_FMT_VERS, "Bad fmt_vers: %llu outside supported range (%d-%d)", + le64_to_cpu(global_super->fmt_vers), 
SCOUTFS_FORMAT_VERSION_MIN, + SCOUTFS_FORMAT_VERSION_MAX); + + debug("Quorum Config Version: %llu", global_super->qconf.version); + if (le64_to_cpu(global_super->qconf.version) != 1) + problem(PB_QCONF_WRONG_VERSION, "Wrong Version: %llu (expected 1)", global_super->qconf.version); + + for (int i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + slot = &global_super->qconf.slots[i]; + family = le16_to_cpu(slot->addr.v4.family); + port = le16_to_cpu(slot->addr.v4.port); + in.s_addr = le32_to_cpu(slot->addr.v4.addr); + + if (family == SCOUTFS_AF_NONE) { + debug("Quorum slot %u is empty", i); + continue; + } + + debug("Quorum slot %u family: %u, port: %u, address: %s", i, family, port, inet_ntoa(in)); + if (family != SCOUTFS_AF_IPV4) + problem(PB_QSLOT_BAD_FAM, "Quorum Slot %u doesn't have valid address", i); + + if (port == 0) + problem(PB_QSLOT_BAD_PORT, "Quorum Slot %u has bad port", i); + + if (!in.s_addr) { + problem(PB_QSLOT_NO_ADDR, "Quorum Slot %u has not been assigned ipv4 address", i); + } else if (!(in.s_addr & 0xff000000)) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } else if ((in.s_addr & 0xff) == 0xff) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } + } + + debug("super magic 0x%04x", global_super->hdr.magic); + if (le32_to_cpu(global_super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + global_super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + + /* `scoutfs image` command doesn't open data_fd */ + if (data_fd < 0) + ret = 0; + else + ret = check_data_super(data_fd); out: block_put(&blk); diff --git a/utils/src/check/super.h b/utils/src/check/super.h index 7c75ad2df..f14417ba0 100644 --- a/utils/src/check/super.h +++ b/utils/src/check/super.h @@ -3,7 +3,10 @@ extern struct scoutfs_super_block *global_super; -int check_supers(void); +int check_super_crc(); +int check_supers(int data_fd); +int super_commit(void); 
+int check_super_in_use(int meta_fd); void super_shutdown(void); #endif From 51dbb7248f1d79cf93e28141f0113bd418567032 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 28 Mar 2023 12:50:09 -0700 Subject: [PATCH 07/15] Add parallel restore Signed-off-by: Zach Brown Signed-off-by: Hunter Shaffer Signed-off-by: Auke Kok --- utils/Makefile | 8 +- utils/scoutfs-utils.spec.in | 3 + utils/src/parallel_restore.c | 1978 ++++++++++++++++++++++++++++++++++ utils/src/parallel_restore.h | 125 +++ 4 files changed, 2113 insertions(+), 1 deletion(-) create mode 100644 utils/src/parallel_restore.c create mode 100644 utils/src/parallel_restore.h diff --git a/utils/Makefile b/utils/Makefile index 7f819d405..17c7fa1b8 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -19,7 +19,9 @@ OBJ_DIRS := src src/check OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c))) DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d)) -all: $(BIN) +AR := src/scoutfs_parallel_restore.a + +all: $(BIN) $(AR) ifneq ($(DEPS),) -include $(DEPS) @@ -37,6 +39,10 @@ $(BIN): $(OBJ) $(QU) [BIN $@] $(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid +$(AR): $(OBJ) + $(QU) [AR $@] + $(VE)ar rcs $@ $^ + %.o %.d: %.c Makefile sparse.sh $(QU) [CC $<] $(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o diff --git a/utils/scoutfs-utils.spec.in b/utils/scoutfs-utils.spec.in index fb24b812e..a7c535149 100644 --- a/utils/scoutfs-utils.spec.in +++ b/utils/scoutfs-utils.spec.in @@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/. 
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h +install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h +install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example @@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi %files -n scoutfs-devel %defattr(644,root,root,755) %{_includedir}/scoutfs +%{_libdir}/scoutfs %clean rm -rf %{buildroot} diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c new file mode 100644 index 000000000..e575011ca --- /dev/null +++ b/utils/src/parallel_restore.c @@ -0,0 +1,1978 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "rand.h" +#include "key.h" +#include "bitops.h" +#include "btree.h" +#include "leaf_item_hash.h" +#include "name_hash.h" +#include "mode_types.h" +#include "srch.h" +#include "bloom.h" + +#include "parallel_restore.h" + +#include "list.h" +#include "lk_rbtree_wrapper.h" + +/* + * XXX + * - interface versioning? + * - next seq and next ino are both max ino + 1 + * - fix writer builder layout to match super, users except for build order + * - look into zeroing buffers consistently + * - init_alb looks weird? naming consistency? 
+ * - make sure inode_count makes sense (fs root, log deltas) + * - audit file types + */ + +#define dprintf(fmt, args...) \ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +struct btree_item { + struct rb_node node; + struct scoutfs_key key; + unsigned int val_len; + void *val; +}; + +struct srch_node { + struct rb_node node; + u64 hash; + u64 ino; + u64 id; +}; + +struct block_builder; +typedef bool (*bld_empty_t)(struct block_builder *bld); +typedef void (*bld_reset_t)(struct block_builder *bld); +typedef spr_err_t (*bld_build_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno); +typedef spr_err_t (*bld_post_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld); + +struct block_builder { + struct list_head head; + bld_empty_t empty; + bld_reset_t reset; + bld_build_t build; + bld_post_t post; +}; + +struct btree_builder { + struct block_builder bld; + + /* track all items */ + u64 total_items; + /* track total length of extent items */ + u64 total_len; + + /* eventual root that references built blocks */ + struct scoutfs_btree_root btroot; + + /* blocks are built as levels accumulate sufficient items */ + struct { + struct rb_root root; + unsigned long nr; + } items[SCOUTFS_BTREE_MAX_HEIGHT]; +}; + +struct alloc_list_builder { + struct block_builder bld; + u64 start; + u64 len; + struct scoutfs_alloc_list_head lhead; +}; + +/* + * srch parent radix fanout is really wide, it doesn't take many to have + * 2^64 bytes in entry blocks. 
+ */ +#define MAX_SRCH_HEIGHT 6 + +struct srch_builder { + struct block_builder bld; + + /* accumulates blocks/entries as we build */ + struct scoutfs_srch_file sfl; + + /* no parents at level 0, [0] never used */ + u64 total_parent_refs; + struct { + struct scoutfs_block_ref *refs; + unsigned long nr; + } parents[MAX_SRCH_HEIGHT]; + + struct rb_root entries; +}; + +struct bloom_builder { + struct block_builder bld; + struct scoutfs_bloom_block *bloom; +}; + +struct scoutfs_parallel_restore_writer { + u64 inode_count; + u64 max_ino; + + __le64 fsid; + u64 meta_start; + u64 meta_len; + struct list_head meta_extents; + + struct list_head builders; + struct btree_builder meta_btb[2]; + struct btree_builder data_btb; + struct alloc_list_builder meta_alb[2]; + struct btree_builder root_btb; + struct btree_builder fs_btb; + struct btree_builder srch_btb; + struct btree_builder log_btb; + + struct srch_builder srch_sbld; + struct bloom_builder bloom_bbld; + + struct scoutfs_btree_root root_items; + struct scoutfs_super_block super; +}; + +struct extent_head { + struct list_head head; + u64 start; + u64 len; +}; + +static void init_builder(struct block_builder *bld, bld_empty_t empty, bld_reset_t reset, + bld_build_t build) +{ + INIT_LIST_HEAD(&bld->head); + bld->empty = empty; + bld->reset = reset; + bld->build = build; + bld->post = NULL; +} + +static spr_err_t meta_alloc_add(struct scoutfs_parallel_restore_writer *wri, + u64 start, u64 len) +{ + struct extent_head *eh; + + if (len == 0) + return 0; + + if (wri->meta_len == 0) { + wri->meta_start = start; + wri->meta_len = len; + } else { + eh = malloc(sizeof(struct extent_head)); + if (!eh) + return ENOMEM; + eh->start = start; + eh->len = len; + list_add_tail(&eh->head, &wri->meta_extents); + } + + return 0; +} + +static spr_err_t meta_alloc_contig(struct scoutfs_parallel_restore_writer *wri, + u64 prev, u64 *blkno_ret) +{ + struct extent_head *eh; + + if (prev && wri->meta_len && (wri->meta_start != prev + 1)) { + 
*blkno_ret = 0; + return 0; + } + + if (!wri->meta_len) { + *blkno_ret = 0; + return ENOSPC; + } + + *blkno_ret = wri->meta_start++; + + if (--wri->meta_len == 0 && !list_empty(&wri->meta_extents)) { + eh = list_entry(wri->meta_extents.next, struct extent_head, head); + wri->meta_start = eh->start; + wri->meta_len = eh->len; + free(eh); + } + + return 0; +} + +static spr_err_t bti_alloc(int val_len, struct btree_item **bti_ret) +{ + struct btree_item *bti; + spr_err_t err; + + bti = malloc(sizeof(struct btree_item) + val_len); + if (bti) { + bti->val = (void *)(bti + 1); + bti->val_len = val_len; + err = 0; + } else { + err = ENOMEM; + } + + *bti_ret = bti; + return err; +} + +static struct btree_item *bti_walk(struct rb_root *root, struct scoutfs_key *key, + struct btree_item *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct btree_item *found = NULL; + struct btree_item *bti; + int cmp; + + while (*node) { + parent = *node; + bti = container_of(*node, struct btree_item, node); + + cmp = scoutfs_key_compare(key, &bti->key); + if (cmp < 0) { + node = &(*node)->rb_left; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } else { + found = bti; + break; + } + } + + if (ins && !found) { + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + } + + return found; +} + +static struct btree_item *node_bti(struct rb_node *node) +{ + return node ? container_of(node, struct btree_item, node) : NULL; +} + +static struct btree_item *bti_first(struct rb_root *root) +{ + return node_bti(rb_first(root)); +} + +static struct btree_item *bti_next(struct btree_item *bti) +{ + return bti ? node_bti(rb_next(&bti->node)) : NULL; +} + +#define for_each_bti_safe(root, bti, tmp) \ + for (bti = bti_first(root); bti && ((tmp = bti_next(bti)), 1); bti = tmp) + +/* + * It's always an error to try and insert a key that was already tracked + * in a btree level. 
+ */ +static spr_err_t btb_insert(struct btree_builder *btb, struct btree_item *bti, int level) +{ + struct btree_item *found; + + found = bti_walk(&btb->items[level].root, &bti->key, bti); + if (found) { + return EEXIST; + } else { + btb->items[level].nr++; + btb->total_items++; + return 0; + } +} + +static void btb_erase(struct btree_builder *btb, struct btree_item *bti, int level) +{ + rb_erase(&bti->node, &btb->items[level].root); + btb->items[level].nr--; + btb->total_items--; +} + +static void btb_destroy(struct btree_builder *btb) +{ + struct btree_item *bti; + struct btree_item *tmp; + int i; + + for (i = 0; i < array_size(btb->items); i++) { + for_each_bti_safe(&btb->items[i].root, bti, tmp) { + btb_erase(btb, bti, i); + free(bti); + } + } +} + +static void init_key(struct scoutfs_key *key, u8 zone, u8 type, u64 first, u64 second, + u64 third, u8 fourth) +{ + key->_sk_first = cpu_to_le64(first); + key->_sk_second = cpu_to_le64(second); + key->_sk_third = cpu_to_le64(third); + key->_sk_fourth = fourth; + key->sk_zone = zone; + key->sk_type = type; + memset(&key->__pad, 0, sizeof(key->__pad)); +} + +static u64 free_extent_order(u64 len) +{ + return (fls64(len | 1) - 1) / 3; +} + +static int insert_free_items(struct btree_builder *btb, u64 start, u64 len) +{ + struct scoutfs_key keys[2]; + struct btree_item *bti; + spr_err_t err; + u64 order; + u64 end; + int i; + + end = start + len - 1; + order = U64_MAX - free_extent_order(len); + + init_key(&keys[0], SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, end, len, 0, 0); + init_key(&keys[1], SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, order, end, len, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(0, &bti); + if (err) + goto out; + + bti->key = keys[i]; + + err = btb_insert(btb, bti, 0); + if (err) { + free(bti); + goto out; + } + } + + btb->total_len += len; + + err = 0; +out: + return err; +} + +static void set_alloc_root(struct scoutfs_alloc_root *root, struct btree_builder *btb) +{ + root->total_len = 
cpu_to_le64(btb->total_len); + root->flags = 0; + root->_pad = 0; + root->root = btb->btroot; +} + +static spr_err_t map_start_key(struct scoutfs_key *start, struct scoutfs_key *key) +{ + if (key->sk_zone == SCOUTFS_FS_ZONE) { + init_key(start, SCOUTFS_FS_ZONE, 0, + le64_to_cpu(key->_sk_first) & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK, + 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_XATTR_TOTL_ZONE) { + init_key(start, SCOUTFS_XATTR_TOTL_ZONE, 0, 0, 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_INODE_INDEX_ZONE) { + init_key(start, SCOUTFS_INODE_INDEX_ZONE, 0, 0, + le64_to_cpu(key->_sk_second) & ~(u64)SCOUTFS_LOCK_SEQ_GROUP_MASK, + 0, 0); + } else if (key->sk_zone == SCOUTFS_QUOTA_ZONE) { + init_key(start, SCOUTFS_QUOTA_ZONE, 0, 0, 0, 0, 0); + } else { + return EINVAL; + } + + return 0; +} + +static spr_err_t update_bloom(struct bloom_builder *bbld, struct scoutfs_key *key) +{ + struct scoutfs_bloom_block *bb = bbld->bloom; + unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS]; + struct scoutfs_key start; + spr_err_t err; + int i; + + err = map_start_key(&start, key); + if (err) + goto out; + + calc_bloom_nrs(&start, nrs); + + for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) { + if (!test_and_set_bit_le(nrs[i], bb->bits)) + le64_add_cpu(&bb->total_set, 1); + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_fs_item(struct scoutfs_parallel_restore_writer *wri, + struct btree_item *bti) +{ + spr_err_t err; + + if (bti->key.sk_zone == SCOUTFS_FS_ZONE && bti->key.sk_type == SCOUTFS_INODE_TYPE && + le64_to_cpu(bti->key.ski_ino) == SCOUTFS_ROOT_INO) { + err = btb_insert(&wri->root_btb, bti, 0); + } else { + err = btb_insert(&wri->fs_btb, bti, 0) ?: + update_bloom(&wri->bloom_bbld, &bti->key); + } + + return err; +} + +static spr_err_t insert_entry_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + struct scoutfs_dirent *dent = NULL; + struct scoutfs_key keys[3]; + struct btree_item *bti; + unsigned int bytes; + 
spr_err_t err = 0; + u64 dir_ino; + u64 hash; + u64 ino; + u64 pos; + int i; + + bytes = offsetof(struct scoutfs_dirent, name[entry->name_len]); + dent = malloc(bytes); + if (!dent) { + err = ENOMEM; + goto out; + } + + dir_ino = entry->dir_ino; + ino = entry->ino; + hash = dirent_name_hash(entry->name, entry->name_len); + pos = entry->pos; + + dent->ino = cpu_to_le64(ino); + dent->hash = cpu_to_le64(hash); + dent->pos = cpu_to_le64(pos); + dent->type = mode_to_type(entry->mode); + memset(&dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent->name, entry->name, entry->name_len); + + init_key(&keys[0], SCOUTFS_FS_ZONE, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos, 0); + init_key(&keys[1], SCOUTFS_FS_ZONE, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0, 0); + init_key(&keys[2], SCOUTFS_FS_ZONE, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = keys[i]; + memcpy(bti->val, dent, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + free(dent); + return err; +} + +static spr_err_t insert_extent_item(struct scoutfs_parallel_restore_writer *wri, u64 ino, u64 len) +{ + struct scoutfs_data_extent_val *dv; + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_DATA_EXTENT_TYPE, ino, 0 + len - 1, len, 0); + + err = bti_alloc(sizeof(struct scoutfs_data_extent_val), &bti); + if (!err) { + bti->key = key; + dv = bti->val; + dv->blkno = 0; + dv->flags = SEF_OFFLINE; + + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return err; +} + +/* + * We're trusting that the caller hasn't made up garbage xattrs. + * All we have to do is check for the scoutfs prefix and then + * identify the sequence of known tags. There can be a lot more + * xattrs than files so this is a surprisingly hot path. 
+ */ +#define HIDE_BE32 cpu_to_be32(0x68696465) +#define SRCH_BE32 cpu_to_be32(0x73726368) +#define TOTL_BE32 cpu_to_be32(0x746f746c) +#define TAG_LEN 5 +#define XTAG_SRCH (1 << 1) +#define XTAG_TOTL (1 << 2) +static int get_xattr_tags(char *name, int name_len) +{ + static const char prefix[] = "scoutfs."; + static const size_t prefix_len = array_size(prefix) - 1; + __be32 betag; + int xtags = 0; + + if (name_len < prefix_len || strncmp(name, prefix, prefix_len)) + return 0; + + name += prefix_len; + name_len -= prefix_len; + + while (name_len >= TAG_LEN && name[TAG_LEN - 1] == '.') { + memcpy(&betag, name, sizeof(betag)); + + dprintf("tag 0x%08x\n", be32_to_cpu(betag)); + + if (betag == HIDE_BE32) + ; + else if (betag == SRCH_BE32) + xtags |= XTAG_SRCH; + else if (betag == TOTL_BE32) + xtags |= XTAG_TOTL; + else + break; + + name += TAG_LEN; + name_len -= TAG_LEN; + } + + dprintf("xat name %.*s tags 0x%x\n", name_len, name, xtags); + + return xtags; +} + +static spr_err_t insert_xattr_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr, u32 hash) +{ + struct scoutfs_xattr xat; + struct iovec value[3] = { + { &xat, sizeof(xat) }, + { xattr->name, xattr->name_len, }, + { xattr->value, xattr->value_len, }, + }; + struct iovec *iov = value; + struct scoutfs_key key; + struct btree_item *bti; + unsigned int total; + unsigned int bytes; + unsigned int piece; + spr_err_t err; + char *buf; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_XATTR_TYPE, xattr->ino, hash, xattr->pos, 0); + total = value[0].iov_len + value[1].iov_len + value[2].iov_len; + + xat.val_len = cpu_to_le16(xattr->value_len); + xat.name_len = xattr->name_len; + memset(xat.__pad, 0, sizeof(xat.__pad)); + + while (total > 0) { + bytes = min(total, SCOUTFS_XATTR_MAX_PART_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + buf = bti->val; + + while (bytes) { + piece = min(bytes, iov->iov_len); + memcpy(buf, iov->iov_base, piece); + 
buf += piece; + bytes -= piece; + total -= piece; + iov->iov_base += piece; + iov->iov_len -= piece; + if (iov->iov_len == 0) + iov++; /* falls off array when done */ + } + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + key._sk_fourth++; + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_symlink_items(struct scoutfs_parallel_restore_writer *wri, + u64 ino, char *target, int target_len) +{ + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + int bytes; + int off = 0; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_SYMLINK_TYPE, ino, 0, 0, 0); + + while (off < target_len) { + bytes = min(target_len - off, SCOUTFS_MAX_VAL_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + memcpy(bti->val, target + off, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + off += bytes; + le64_add_cpu(&key._sk_second, 1); + } + + err = 0; +out: + return err; +} + +/* forbid the leading + that strtoull allows */ +static spr_err_t totl_strtoull(char *s, int len, unsigned long long *res) +{ + char str[SCOUTFS_XATTR_MAX_TOTL_U64 + 1]; + + if (len <= 0 || len >= array_size(str) || s[0] == '+') + return EINVAL; + + memcpy(str, s, len); + str[len] = '\0'; + + errno = 0; + *res = strtoull(str, NULL, 0); + return errno; +} + +/* + * .totl. xattrs turn into items with the key based on dotted u64s at the end of the + * name and a value in the .. value. 
+ */ +static spr_err_t insert_totl_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + static const char prefix[] = "scoutfs.totl."; + static const int prefix_len = sizeof(prefix) - 1; + struct scoutfs_xattr_totl_val *found_tval; + struct scoutfs_xattr_totl_val *tval; + struct btree_item *found; + struct btree_item *bti; + unsigned long long longs[3]; + unsigned long long v; + spr_err_t err; + int nr = 0; + int prev; + int i; + + prev = xattr->name_len; + for (i = xattr->name_len - 1; i > prefix_len; i--) { + if (xattr->name[i] == '.') { + err = totl_strtoull(&xattr->name[i + 1], prev - (i + 1), &longs[nr]); + if (err) + goto out; + if (++nr == array_size(longs)) + break; + prev = i; + } + } + if (nr != array_size(longs)) { + err = EINVAL; + goto out; + } + + err = totl_strtoull(xattr->value, xattr->value_len, &v); + if (err) + goto out; + + if (v == 0) { + err = 0; + goto out; + } + + err = bti_alloc(sizeof(struct scoutfs_xattr_totl_val), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_XATTR_TOTL_ZONE, 0, longs[2], longs[1], longs[0], 0); + tval = bti->val; + tval->total = cpu_to_le64(v); + tval->count = cpu_to_le64(1); + + found = bti_walk(&wri->fs_btb.items[0].root, &bti->key, NULL); + if (found) { + found_tval = found->val; + le64_add_cpu(&found_tval->total, le64_to_cpu(tval->total)); + le64_add_cpu(&found_tval->count, le64_to_cpu(tval->count)); + if (found_tval->total == 0) + btb_erase(&wri->fs_btb, found, 0); + free(bti); + } else { + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_inode_index_item(struct scoutfs_parallel_restore_writer *wri, + u8 type, u64 major, u64 ino) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(0, &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_INODE_INDEX_ZONE, type, 0, major, ino, 0); + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return 
err; +} + +static spr_err_t insert_inode_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + struct scoutfs_inode *si; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_inode), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_FS_ZONE, SCOUTFS_INODE_TYPE, inode->ino, 0, 0, 0); + + si = bti->val; + + si->size = 0; + si->meta_seq = cpu_to_le64(inode->meta_seq); + si->data_seq = cpu_to_le64(inode->data_seq); + si->data_version = 0; + si->online_blocks = 0; + si->offline_blocks = 0; + si->next_readdir_pos = 0; + si->next_xattr_id = cpu_to_le64(inode->nr_xattrs + 1); + si->version = cpu_to_le64(1); + si->nlink = cpu_to_le32(1); + si->uid = cpu_to_le32(inode->uid); + si->gid = cpu_to_le32(inode->gid); + si->mode = cpu_to_le32(inode->mode); + si->flags = 0; + si->flags = cpu_to_le32(inode->flags); + si->atime.sec = cpu_to_le64(inode->atime.tv_sec); + si->atime.nsec = cpu_to_le32(inode->atime.tv_nsec); + si->ctime.sec = cpu_to_le64(inode->ctime.tv_sec); + si->ctime.nsec = cpu_to_le32(inode->ctime.tv_nsec); + si->mtime.sec = cpu_to_le64(inode->mtime.tv_sec); + si->mtime.nsec = cpu_to_le32(inode->mtime.tv_nsec); + si->crtime.sec = cpu_to_le64(inode->crtime.tv_sec); + si->crtime.nsec = cpu_to_le32(inode->crtime.tv_nsec); + si->proj = cpu_to_le64(inode->proj); + + /* XXX make sure this works across all el7/8/9 due to glibc magic */ + si->rdev = (inode->rdev & 0xff) | ((inode->rdev & 0xffffff00) << 12); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_META_SEQ_TYPE, + le64_to_cpu(si->meta_seq), inode->ino); + if (err) + goto out; + + if (S_ISREG(inode->mode)) { + si->size = cpu_to_le64(inode->size); + si->data_version = cpu_to_le64(inode->data_version); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, + le64_to_cpu(si->data_seq), inode->ino); + if (err) + goto out; + + if (inode->offline) { + si->offline_blocks = 
cpu_to_le64(DIV_ROUND_UP(inode->size, + SCOUTFS_BLOCK_SM_SIZE)); + err = insert_extent_item(wri, inode->ino, le64_to_cpu(si->offline_blocks)); + if (err) + goto out; + } + + } else if (S_ISDIR(inode->mode)) { + si->size = cpu_to_le64(inode->total_entry_name_bytes); + si->next_readdir_pos = cpu_to_le64(SCOUTFS_DIRENT_FIRST_POS + inode->nr_subdirs); + si->nlink = cpu_to_le32(2 + inode->nr_subdirs); + + } else if (S_ISLNK(inode->mode)) { + si->size = cpu_to_le64(inode->target_len); + + err = insert_symlink_items(wri, inode->ino, inode->target, inode->target_len); + if (err) + goto out; + } + + err = insert_fs_item(wri, bti); +out: + return err; +} + +static spr_err_t insert_log_trees_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + struct scoutfs_log_trees *lt; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_log_trees), &bti); + if (err) + goto out; + + lt = bti->val; + memset(lt, 0, sizeof(struct scoutfs_log_trees)); + lt->item_root = prog->fs_items; + lt->bloom_ref = prog->bloom_ref; + /* lt srch_file is blank once finalized, moved to srch_root items */ + lt->inode_count_delta = prog->inode_count; + lt->get_trans_seq = cpu_to_le64(1); + lt->commit_trans_seq = cpu_to_le64(1); + lt->max_item_seq = cpu_to_le64(1); + lt->finalize_seq = cpu_to_le64(1); + lt->rid = prog->max_ino; + lt->nr = cpu_to_le64(1); + lt->flags = cpu_to_le64(SCOUTFS_LOG_TREES_FINALIZED); + + init_key(&bti->key, SCOUTFS_LOG_TREES_ZONE, 0, + le64_to_cpu(lt->rid), le64_to_cpu(lt->nr), 0, 0); + + err = btb_insert(&wri->log_btb, bti, 0); +out: + return err; +} + +static spr_err_t insert_srch_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_srch_file *sfl) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_srch_file), &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_SRCH_ZONE, SCOUTFS_SRCH_BLOCKS_TYPE, + 0, le64_to_cpu(sfl->blocks), 
le64_to_cpu(sfl->ref.blkno), 0); + memcpy(bti->val, sfl, sizeof(struct scoutfs_srch_file)); + err = btb_insert(&wri->srch_btb, bti, 0); + } + + return err; +} + +static spr_err_t insert_quota_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + struct scoutfs_quota_rule_val *rv; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_quota_rule_val), &bti); + if (err) + goto out; + + rv = bti->val; + memset(rv, 0, sizeof(struct scoutfs_quota_rule_val)); + rv->limit = cpu_to_le64(rule->limit); + rv->prio = rule->prio; + rv->op = rule->op; + rv->rule_flags = rule->rule_flags; + rv->name_val[0] = cpu_to_le64(rule->names[0].val); + rv->name_source[0] = rule->names[0].source; + rv->name_flags[0] = rule->names[0].flags; + rv->name_val[1] = cpu_to_le64(rule->names[1].val); + rv->name_source[1] = rule->names[1].source; + rv->name_flags[1] = rule->names[1].flags; + rv->name_val[2] = cpu_to_le64(rule->names[2].val); + rv->name_source[2] = rule->names[2].source; + rv->name_flags[2] = rule->names[2].flags; + memset(&rv->_pad, 0, sizeof(rv->_pad)); + + init_key(&bti->key, SCOUTFS_QUOTA_ZONE, SCOUTFS_QUOTA_RULE_TYPE, + 0, scoutfs_hash64(&rv, sizeof(rv)), 0, 0); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } +out: + return err; +} + +#define UNLINKED_AVL_HEIGHT 255 + +static void link_avl_nodes(struct scoutfs_btree_block *bt, __le16 *parent, __le16 parent_off, + u8 height, int first, int last) +{ + int ind = (first + last) / 2; + struct scoutfs_avl_node *node = &bt->items[ind].node; + u64 off = (long)node - (long)&bt->item_root; + + dprintf("first %d ind %d last %d height %u\n", first, ind, last, height); + + if (ind < first || ind > last || node->height != UNLINKED_AVL_HEIGHT) + return; + + *parent = cpu_to_le16(off); + node->parent = parent_off; + node->height = height; + node->left = 0; + node->right = 0; + memset(node->__pad, 0, sizeof(node->__pad)); + + if 
(height > 1) {
+ link_avl_nodes(bt, &node->left, cpu_to_le16(off), height - 1, first, ind - 1);
+ link_avl_nodes(bt, &node->right, cpu_to_le16(off), height - 1, ind + 1, last);
+ }
+}
+
+#define DEFINE_BUILDER_CONTAINER(type, name, ptr) \
+ type *name = container_of(ptr, type, bld)
+
+static bool btree_empty(struct block_builder *bld)
+{
+ DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld);
+
+ return btb->total_items == 0;
+}
+
+static void btree_reset(struct block_builder *bld)
+{
+ DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld);
+
+ btb->total_items = 0;
+ btb->total_len = 0;
+ memset(&btb->btroot, 0, sizeof(btb->btroot));
+}
+
+/*
+ * Incrementally build btrees. By the time we're called the builder has
+ * all the sorted leaf items in an rbtree at their level. We stream
+ * them into blocks and store parent items at the next highest level.
+ * Once we're out of leaf items we stream the parent items into blocks
+ * and store their parent items at the next highest level. Eventually
+ * we drain all the items and are left with the root's reference to the
+ * first block in the tree.
+ */ +static spr_err_t build_btree_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref *ref; + struct btree_item *bti; + struct btree_item *tmp; + unsigned long val_align; + unsigned long bytes; + unsigned long nr; + void *val_buf; + spr_err_t err; + u8 height; + int level; + int i; + + /* find next highest level to build items from */ + for (i = 0; i < SCOUTFS_BTREE_MAX_HEIGHT; i++) { + if (btb->items[i].nr == 0) + continue; + + level = i; + break; + } + + /* shouldn't be possible */ + if (i >= SCOUTFS_BTREE_MAX_HEIGHT) { + err = ENOBUFS; + goto out; + } + + dprintf("building btree blkno %llu level %u nr %lu tot %llu \n", + blkno, level, btb->items[level].nr, btb->total_items); + + /* + * XXX Be more careful about item filling.. can parents be entirely + * full? Should we let the last nodes on the right be under the + * min? We can see that there are < (nr + min) left and emit + * half the remaining in each. 
+ */ + + /* initialize the non-item parts of the block */ + bt = buf; + memset(bt, 0, sizeof(struct scoutfs_btree_block)); + hdr = &bt->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + bt->level = level; + btree_init_block(bt, level); + if (level == 0) + memset((char *)bt + SCOUTFS_BLOCK_LG_SIZE - SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES, 0, + SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES); + + /* find the items that fit in the leaf */ + item = &bt->items[0]; + nr = 0; + val_buf = (void *)item + le16_to_cpu(bt->mid_free_len); + + for_each_bti_safe(&btb->items[level].root, bti, tmp) { + val_align = round_up(bti->val_len, SCOUTFS_BTREE_VALUE_ALIGN); + bytes = sizeof(struct scoutfs_btree_item) + val_align; + + if (le16_to_cpu(bt->mid_free_len) < bytes) + break; + + item->node.height = UNLINKED_AVL_HEIGHT; + item->key = bti->key; + item->seq = cpu_to_le64(1); + item->val_len = cpu_to_le16(bti->val_len); + item->flags = 0; + memset(item->node.__pad, 0, sizeof(item->node.__pad)); + + if (bti->val_len) { + val_buf -= val_align; + item->val_off = cpu_to_le16((long)val_buf - (long)bt); + memcpy(val_buf, bti->val, bti->val_len); + } else { + item->val_off = 0; + } + + le16_add_cpu(&bt->nr_items, 1); + le16_add_cpu(&bt->total_item_bytes, bytes); + le16_add_cpu(&bt->mid_free_len, -bytes); + if (level == 0) + leaf_item_hash_insert(bt, &item->key, + cpu_to_le16((void *)item - (void *)bt)); + + item++; + nr++; + + btb_erase(btb, bti, level); + free(bti); + } + + /* zero the middle of the block without items */ + if (bt->mid_free_len) + memset(&bt->items[nr], 0, le16_to_cpu(bt->mid_free_len)); + + height = (int)ceil(log2(nr)) + 2; /* leaves are height 1 */ + link_avl_nodes(bt, &bt->item_root.node, 0, height - 1, 0, nr - 1); + + /* finish block */ + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (btb->total_items == 0) { + /* root refs hightest/last block we build */ + 
btb->btroot.ref.blkno = hdr->blkno; + btb->btroot.ref.seq = hdr->seq; + btb->btroot.height = level +1; + } else { + /* parent ref items will be built into parent blocks */ + /* we'll always need a parent ref for the block we're building */ + err = bti_alloc(sizeof(struct scoutfs_block_ref), &bti); + if (err) + goto out; + + /* refs to right spine blocks has all ones key */ + if (btb->items[level].nr == 0) + scoutfs_key_set_ones(&bti->key); + else + bti->key = bt->items[nr - 1].key; + ref = bti->val; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + btb_insert(btb, bti, level + 1); + } + + err = 0; +out: + return err; +} + +static void btb_init(struct btree_builder *btb) +{ + int i; + + init_builder(&btb->bld, btree_empty, btree_reset, build_btree_block); + + for (i = 0; i < array_size(btb->items); i++) + btb->items[i].root = RB_ROOT; +} + +/* + * This is how we get around the recursion of allocating blocks to write blocks that + * store the allocators. After we've written all other metadata blocks we know precisely + * how many allocation blocks we'll need. We modify the writer to only have that many + * free blocks remaining and put the rest in the alloc block builders. 
+ */ +static spr_err_t prepare_alloc_builders(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld) +{ +#define ALLOC_BLOCKS 5 /* 2 meta list, 2 meta btree, 1 data btree */ + struct extent_head *eh_tmp; + struct extent_head *eh; + spr_err_t err; + u64 start; + u64 skip; + u64 len; + int ind; + + dprintf("starting prepare with start %llu len %llu\n", wri->meta_start, wri->meta_len); + + skip = ALLOC_BLOCKS + (SCOUTFS_ALLOC_LIST_MAX_BLOCKS * 2); + if (wri->meta_len <= skip) + return ENOSPC; + + /* store remainder of meta alloc as a free extent */ + start = wri->meta_start + skip; + len = wri->meta_len - skip; + err = insert_free_items(&wri->meta_btb[0], start, len); + if (err) + goto out; + wri->meta_len -= len; + + /* the rest of the meta extents are items in the two meta trees */ + ind = 1; + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + err = insert_free_items(&wri->meta_btb[ind], eh->start, eh->len); + if (err) + goto out; + list_del_init(&eh->head); + free(eh); + ind ^= 1; + } + + /* fill the two server avail alloc list blocks */ + wri->meta_alb[0].start = wri->meta_start + ALLOC_BLOCKS; + wri->meta_alb[0].len = SCOUTFS_ALLOC_LIST_MAX_BLOCKS; + wri->meta_alb[1].start = wri->meta_alb[0].start + wri->meta_alb[0].len; + wri->meta_alb[1].len = wri->meta_alb[0].len; + + /* writer left with only meta allocation for remaining alloc blocks */ + wri->meta_len = ALLOC_BLOCKS; + + err = 0; +out: + return err; +} + +static bool alloc_list_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + + return alb->len == 0; +} + +static spr_err_t build_alloc_list_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_header *hdr; + int i; + + if (alb->len > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) + return EOVERFLOW; + + lblk = 
buf; + memset(&lblk->next, 0, sizeof(lblk->next)); + lblk->start = 0; + lblk->nr = cpu_to_le32(alb->len); + + for (i = 0; i < alb->len; i++) + lblk->blknos[i] = cpu_to_le64(alb->start + i); + + hdr = &lblk->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + alb->lhead.ref.blkno = hdr->blkno; + alb->lhead.ref.seq = hdr->seq; + alb->lhead.first_nr = cpu_to_le32(alb->len); + alb->lhead.total_nr = cpu_to_le64(alb->len); + + alb->start = 0; + alb->len = 0; + + return 0; +} + +static void init_alb(struct alloc_list_builder *alb) +{ + init_builder(&alb->bld, alloc_list_empty, NULL, build_alloc_list_block); +} + +static struct srch_node *node_srn(struct rb_node *node) +{ + return node ? container_of(node, struct srch_node, node) : NULL; +} + +static struct srch_node *srn_first(struct rb_root *root) +{ + return node_srn(rb_first(root)); +} + +static struct srch_node *srn_next(struct srch_node *srn) +{ + return srn ? 
node_srn(rb_next(&srn->node)) : NULL; +} + +static spr_err_t insert_srch_entry(struct srch_builder *sbld, u64 hash, u64 ino, u64 id) +{ + struct rb_root *root = &sbld->entries; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct srch_node *ins; + struct srch_node *srn; + int cmp; + + ins = malloc(sizeof(struct srch_node)); + if (!ins) + return ENOMEM; + + ins->hash = hash; + ins->ino = ino; + ins->id = id; + + while (*node) { + parent = *node; + srn = node_srn(*node); + + cmp = scoutfs_cmp(ins->hash, srn->hash) ?: + scoutfs_cmp(ins->ino, srn->ino) ?: + scoutfs_cmp(ins->id, srn->id); + if (cmp < 0) + node = &(*node)->rb_left; + else if (cmp > 0) + node = &(*node)->rb_right; + else + return EEXIST; + } + + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + + return 0; +} + +static bool srch_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + return RB_EMPTY_ROOT(&sbld->entries) && sbld->total_parent_refs == 0; +} + +static void srch_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + memset(&sbld->sfl, 0, sizeof(sbld->sfl)); +} + +#define for_each_sbld_parent(sbld, i) \ + for (i = 1; i < array_size(sbld->parents); i++) + +static spr_err_t build_srch_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_srch_parent *par; + struct scoutfs_srch_block *srb; + struct scoutfs_srch_entry sre; + struct scoutfs_block_ref *ref; + struct srch_node *srn_tmp; + struct srch_node *srn; + unsigned int nr; + spr_err_t err; + u32 magic; + int level; + int tail; + int ret; + + dprintf("building srch blkno %llu empty_entries %u tot refs %llu parent nrs: ", + blkno, RB_EMPTY_ROOT(&sbld->entries), sbld->total_parent_refs); + for_each_sbld_parent(sbld, level) + 
dprintf("%u:%lu ", level, sbld->parents[level].nr); + dprintf("\n"); + + /* build parents with refs that are full or when we're out of entries */ + for_each_sbld_parent(sbld, level) { + + nr = sbld->parents[level].nr; + if (nr == 0 || (nr < SCOUTFS_SRCH_PARENT_REFS && !RB_EMPTY_ROOT(&sbld->entries))) + continue; + + /* copy parent refs */ + par = buf; + memcpy(par->refs, sbld->parents[level].refs, nr * sizeof(par->refs[0])); + sbld->total_parent_refs -= nr; + sbld->parents[level].nr = 0; + + /* zero the tail of the block */ + tail = SCOUTFS_BLOCK_LG_SIZE - offsetof(struct scoutfs_srch_parent, refs[nr]); + if (tail > 0) + memset(buf + SCOUTFS_BLOCK_LG_SIZE - tail, 0, tail); + + magic = SCOUTFS_BLOCK_MAGIC_SRCH_PARENT; + hdr = &par->hdr; + goto finish_hdr; + } + + /* no built parent, must have entries to build */ + level = 0; + if (RB_EMPTY_ROOT(&sbld->entries)) { + err = EINVAL; + goto out; + } + + srn = srn_first(&sbld->entries); + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + + srb = buf; + srb->entry_nr = 0; + srb->entry_bytes = 0; + srb->first = sre; + memset(&srb->tail, 0, sizeof(srb->tail)); + + if (sbld->sfl.blocks == 0) + sbld->sfl.first = sre; + + do { + if (le32_to_cpu(srb->entry_bytes) > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) + break; + + ret = srch_encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes), + &sre, &srb->tail); + + dprintf("%llu.%llu.%llu ret %d\n", srn->hash, srn->ino, srn->id, ret); + + le32_add_cpu(&srb->entry_bytes, ret); + le32_add_cpu(&srb->entry_nr, 1); + srb->tail = sre; + + srn_tmp = srn_next(srn); + rb_erase(&srn->node, &sbld->entries); + free(srn); + + if ((srn = srn_tmp)) { + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + } + } while (srn); + + srb->last = srb->tail; + sbld->sfl.last = srb->tail; + + le64_add_cpu(&sbld->sfl.blocks, 1); + le64_add_cpu(&sbld->sfl.entries, le32_to_cpu(srb->entry_nr)); + + magic = 
SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK; + hdr = &srb->hdr; + +finish_hdr: + hdr->magic = cpu_to_le32(magic); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (srch_empty(&sbld->bld)) { + /* the last block is referenced by the root */ + sbld->sfl.ref.blkno = hdr->blkno; + sbld->sfl.ref.seq = hdr->seq; + sbld->sfl.height = level + 1; + memset(sbld->sfl.__pad, 0, sizeof(sbld->sfl.__pad)); + } else { + /* store the parent ref to our block */ + nr = sbld->parents[level + 1].nr++; + ref = &sbld->parents[level + 1].refs[nr]; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + sbld->total_parent_refs++; + } + + err = 0; +out: + return err; +} + +static spr_err_t sbld_create(struct srch_builder *sbld) +{ + spr_err_t err = 0; + int i; + + init_builder(&sbld->bld, srch_empty, srch_reset, build_srch_block); + + for_each_sbld_parent(sbld, i) { + sbld->parents[i].refs = malloc(SCOUTFS_SRCH_PARENT_REFS * + sizeof(struct scoutfs_block_ref)); + if (!sbld->parents[i].refs) { + while (--i >= 1) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } + err = ENOMEM; + break; + } + } + + return err; +} + +static void sbld_destroy(struct srch_builder *sbld) +{ + int i; + + for_each_sbld_parent(sbld, i) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } +} + +/* + * We've written the bloom block if we've filled out its header. 
+ */ +static bool bloom_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + return bbld->bloom->hdr.seq != 0; +} + +static void bloom_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + memset(bbld->bloom, 0, SCOUTFS_BLOCK_LG_SIZE); +} + +static spr_err_t build_bloom_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + struct scoutfs_block_header *hdr; + + hdr = &bbld->bloom->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BLOOM); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + memcpy(buf, bbld->bloom, SCOUTFS_BLOCK_LG_SIZE); + + return 0; +} + +static spr_err_t bbld_create(struct bloom_builder *bbld) +{ + init_builder(&bbld->bld, bloom_empty, bloom_reset, build_bloom_block); + + bbld->bloom = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!bbld->bloom) + return ENOMEM; + + memset(&bbld->bloom->hdr, 0, sizeof(bbld->bloom->hdr)); + + return 0; +} + +static void bbld_destroy(struct bloom_builder *bbld) +{ + free(bbld->bloom); +} + +static bool wri_has_super(struct scoutfs_parallel_restore_writer *wri) +{ + return wri->super.hdr.blkno != 0; +} + +static void reset_builders(struct scoutfs_parallel_restore_writer *wri) +{ + /* define block build order, different than struct layout order */ + struct block_builder *builders[] = { + /* fs items written in parallel by writers */ + &wri->fs_btb.bld, + &wri->bloom_bbld.bld, + &wri->srch_sbld.bld, + + /* global items written finally by global super writer */ + &wri->root_btb.bld, + &wri->srch_btb.bld, + /* log .post() prepares final allocators */ + &wri->log_btb.bld, + &wri->meta_alb[0].bld, + &wri->meta_alb[1].bld, + &wri->meta_btb[0].bld, + &wri->meta_btb[1].bld, + &wri->data_btb.bld, + }; + struct block_builder 
*bld; + int i; + + for (i = 0; i < array_size(builders); i++) { + bld = builders[i]; + + if (bld->reset) + bld->reset(bld); + + if (!list_empty(&bld->head)) + list_del_init(&bld->head); + list_add_tail(&bld->head, &wri->builders); + } +} + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri; + spr_err_t err; + + wri = calloc(1, sizeof(struct scoutfs_parallel_restore_writer)); + if (!wri) { + err = ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&wri->meta_extents); + INIT_LIST_HEAD(&wri->builders); + btb_init(&wri->root_btb); + btb_init(&wri->fs_btb); + btb_init(&wri->srch_btb); + btb_init(&wri->log_btb); + btb_init(&wri->meta_btb[0]); + btb_init(&wri->meta_btb[1]); + btb_init(&wri->data_btb); + init_alb(&wri->meta_alb[0]); + init_alb(&wri->meta_alb[1]); + + err = sbld_create(&wri->srch_sbld) ?: + bbld_create(&wri->bloom_bbld); + if (err) + goto out; + + reset_builders(wri); + err = 0; +out: + if (err) { + if (wri) { + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + free(wri); + } + wri = NULL; + } + *wrip = wri; + return err; +} + +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri = *wrip; + struct extent_head *eh; + struct extent_head *eh_tmp; + + if (!wri) + return; + + btb_destroy(&wri->root_btb); + btb_destroy(&wri->fs_btb); + btb_destroy(&wri->srch_btb); + btb_destroy(&wri->log_btb); + btb_destroy(&wri->meta_btb[0]); + btb_destroy(&wri->meta_btb[1]); + btb_destroy(&wri->data_btb); + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + list_del_init(&eh->head); + free(eh); + } + + free(wri); + *wrip = NULL; +} + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr) +{ + u64 total = 
le64_to_cpu(wri->super.total_meta_blocks); + u64 start = SCOUTFS_META_DEV_START_BLKNO; + u64 each = (total - start) / nr; + int i; + + if (!wri_has_super(wri)) + return EINVAL; + + for (i = 0; i < nr - 1; i++) { + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(each); + start += each; + } + + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(total - start); + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + wri->fsid = slice->fsid; + + return meta_alloc_add(wri, le64_to_cpu(slice->meta_start), le64_to_cpu(slice->meta_len)); +} + +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + slice->fsid = wri->fsid; + slice->meta_start = cpu_to_le64(wri->meta_start); + slice->meta_len = cpu_to_le64(wri->meta_len); + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + spr_err_t err; + + if (wri_has_super(wri)) + return EINVAL; + + err = insert_inode_items(wri, inode); + if (err) + goto out; + + wri->inode_count++; + wri->max_ino = max(wri->max_ino, inode->ino); + err = 0; +out: + return err; +} + +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + + if (wri_has_super(wri)) + return EINVAL; + + return insert_entry_items(wri, entry); +} + +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + spr_err_t err; + int xtags; + u32 xat_hash; + u64 srch_hash; + + xat_hash = crc32c(U32_MAX, xattr->name, xattr->name_len); + srch_hash = scoutfs_hash64(xattr->name, 
xattr->name_len); + xtags = get_xattr_tags(xattr->name, xattr->name_len); + + err = insert_xattr_items(wri, xattr, xat_hash); + if (!err) { + if (xtags & XTAG_SRCH) + err = insert_srch_entry(&wri->srch_sbld, srch_hash, xattr->ino, xattr->pos); + if (!err && (xtags & XTAG_TOTL)) + err = insert_totl_item(wri, xattr); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + if (wri_has_super(wri)) + return EINVAL; + + memset(prog, 0, sizeof(struct scoutfs_parallel_restore_progress)); + prog->fs_items = wri->fs_btb.btroot; + prog->root_items = wri->root_btb.btroot; + prog->sfl = wri->srch_sbld.sfl; + prog->bloom_ref.blkno = wri->bloom_bbld.bloom->hdr.blkno; + prog->bloom_ref.seq = wri->bloom_bbld.bloom->hdr.seq; + prog->inode_count = cpu_to_le64(wri->inode_count); + prog->max_ino = cpu_to_le64(wri->max_ino); + + reset_builders(wri); + wri->inode_count = 0; + wri->max_ino = 0; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + spr_err_t err; + + if (!wri_has_super(wri)) + return EINVAL; + + /* + * Only one writer's progress should contain the root inode. 
+ */ + if (prog->root_items.ref.blkno) { + if (wri->root_items.ref.blkno) + return EEXIST; + wri->root_items = prog->root_items; + } + + wri->max_ino = max(wri->max_ino, le64_to_cpu(prog->max_ino)); + + err = insert_log_trees_item(wri, prog); + if (!err && prog->sfl.ref.blkno) + err = insert_srch_item(wri, &prog->sfl); + + return err; +} + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + return insert_quota_item(wri, rule); +} + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret) +{ + struct block_builder *bld; + off_t count = 0; + off_t off = 0; + u64 blkno = 0; + spr_err_t err; + + if (len < SCOUTFS_BLOCK_LG_SIZE) { + err = EINVAL; + goto out; + } + + while (len >= SCOUTFS_BLOCK_LG_SIZE) { + bld = list_first_entry_or_null(&wri->builders, struct block_builder, head); + if (!bld) { + err = 0; + break; + } + + if (bld->empty(bld)) { + if (bld->post && ((err = bld->post(wri, bld)))) + break; + list_del_init(&bld->head); + continue; + } + + err = meta_alloc_contig(wri, blkno, &blkno); + if (err || blkno == 0) + break; + + if (off == 0) + off = blkno << SCOUTFS_BLOCK_LG_SHIFT; + + err = bld->build(wri, bld, buf, blkno); + if (err) + break; + + buf += SCOUTFS_BLOCK_LG_SIZE; + len -= SCOUTFS_BLOCK_LG_SIZE; + count += SCOUTFS_BLOCK_LG_SIZE; + + dprintf("built blkno %llu off %llu count %llu\n", blkno, (u64)off, (u64)count); + } + +out: + *off_ret = off; + *count_ret = count; + return count > 0 ? 
0 : err; +} + +/* + * Here we take in a dev's fd an read its quorum blocks to see if the dev has + * been mounted before + */ +static spr_err_t scoutfs_check_if_previous_mount(int fd) +{ + struct scoutfs_quorum_block *blk = NULL; + struct scoutfs_quorum_block_event *ev; + u64 blkno; + int i, j; + spr_err_t err; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + blkno = SCOUTFS_QUORUM_BLKNO + i; + err = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk); + if (!blk) + return EINVAL; + + dprintf("quorum block read; quorum bklno: %llu, err_val: %d\n", blkno, err); + if (err) { + free(blk); + return err; + } + + for (j = 0; j < SCOUTFS_QUORUM_EVENT_NR; j++) { + ev = &blk->events[j]; + if (ev->ts.sec || ev->ts.nsec) { + free(blk); + return EINVAL; + } + } + + free(blk); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd) +{ + spr_err_t err; + u64 start; + u64 len; + + /* + * check the device we are restoring into to make sure + * that it has has never been mounted + */ + if (scoutfs_check_if_previous_mount(fd)) + return EINVAL; + + if (le64_to_cpu(super->fmt_vers) < 2) + return EINVAL; + + if ((le64_to_cpu(super->flags) & SCOUTFS_FLAG_IS_META_BDEV) == 0) + return EINVAL; + + if (wri_has_super(wri)) + return EINVAL; + + start = SCOUTFS_DATA_DEV_START_BLKNO; + len = le64_to_cpu(super->total_data_blocks) - start; + + /* make sure all data extents are free */ + if (le64_to_cpu(super->data_alloc.total_len) != len) + return ENOTEMPTY; + + /* we write new allocator blocks so that we don't have to read exiting */ + err = insert_free_items(&wri->data_btb, start, len); + if (err) + return err; + + wri->super = *super; + + /* prepare alloc block builders only after other metadata blocks are built */ + wri->log_btb.bld.post = prepare_alloc_builders; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer 
*wri, + struct scoutfs_super_block *super) +{ + if (!wri_has_super(wri)) + return EINVAL; + + *super = wri->super; + + super->seq = cpu_to_le64(wri->max_ino + 1); + super->next_ino = cpu_to_le64(wri->max_ino + 1); + super->inode_count = cpu_to_le64(wri->inode_count); + set_alloc_root(&super->meta_alloc[0], &wri->meta_btb[0]); + set_alloc_root(&super->meta_alloc[1], &wri->meta_btb[1]); + set_alloc_root(&super->data_alloc, &wri->data_btb); + super->server_meta_avail[0] = wri->meta_alb[0].lhead; + super->server_meta_avail[1] = wri->meta_alb[1].lhead; + memset(super->server_meta_freed, 0, sizeof(super->server_meta_freed)); + super->fs_root = wri->root_items; + super->logs_root = wri->log_btb.btroot; + memset(&super->log_merge, 0, sizeof(super->log_merge)); + memset(&super->mounted_clients, 0, sizeof(super->mounted_clients)); + super->srch_root = wri->srch_btb.btroot; + /* test volopt? */ + + super->hdr.crc = cpu_to_le32(crc_block(&super->hdr, SCOUTFS_BLOCK_SM_SIZE)); + + return 0; +} diff --git a/utils/src/parallel_restore.h b/utils/src/parallel_restore.h new file mode 100644 index 000000000..51fcf4b21 --- /dev/null +++ b/utils/src/parallel_restore.h @@ -0,0 +1,125 @@ +#ifndef _SCOUTFS_PARALLEL_RESTORE_H_ +#define _SCOUTFS_PARALLEL_RESTORE_H_ + +#include + +struct scoutfs_parallel_restore_progress { + struct scoutfs_btree_root fs_items; + struct scoutfs_btree_root root_items; + struct scoutfs_srch_file sfl; + struct scoutfs_block_ref bloom_ref; + __le64 inode_count; + __le64 max_ino; +}; + +struct scoutfs_parallel_restore_slice { + __le64 fsid; + __le64 meta_start; + __le64 meta_len; +}; + +struct scoutfs_parallel_restore_entry { + u64 dir_ino; + u64 pos; + u64 ino; + mode_t mode; + char *name; + unsigned int name_len; +}; + +struct scoutfs_parallel_restore_xattr { + u64 ino; + u64 pos; + char *name; + unsigned int name_len; + void *value; + unsigned int value_len; +}; + +struct scoutfs_parallel_restore_inode { + /* all inodes */ + u64 ino; + u64 meta_seq; + u64 
data_seq; + u64 nr_xattrs; + u32 uid; + u32 gid; + u32 mode; + u32 rdev; + u32 flags; + u8 pad[4]; + struct timespec atime; + struct timespec ctime; + struct timespec mtime; + struct timespec crtime; + u64 proj; + + /* regular files */ + u64 data_version; + u64 size; + bool offline; + + /* only used for directories */ + u64 nr_subdirs; + u64 total_entry_name_bytes; + + /* only used for symlnks */ + char *target; + unsigned int target_len; /* not including null terminator */ +}; + +struct scoutfs_parallel_restore_quota_rule { + u64 limit; + u8 prio; + u8 op; + u8 rule_flags; + struct quota_rule_name { + u64 val; + u8 source; + u8 flags; + } names [3]; + char *value; + unsigned int value_len; +}; + +typedef __typeof__(EINVAL) spr_err_t; + +struct scoutfs_parallel_restore_writer; + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip); +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip); + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr); +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode); +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry); +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr); + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); +spr_err_t 
scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule); + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret); + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd); +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super); + + +#endif From 758d5d64e75270c2ca32d34a1ddb3f9523517509 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 28 Mar 2023 12:49:50 -0700 Subject: [PATCH 08/15] Add test for parallel restore This is the benchmark binary that bulk creates filesystem items, xattrs and is heavily threaded to scope the performance of the library. The test script invokes it to validate some basic constraints. 
Signed-off-by: Zach Brown Signed-off-by: Hunter Shaffer Signed-off-by: Auke Kok --- tests/Makefile | 8 +- tests/golden/parallel_restore | 26 ++ tests/sequence | 1 + tests/src/parallel_restore.c | 805 ++++++++++++++++++++++++++++++++ tests/tests/parallel_restore.sh | 74 +++ 5 files changed, 912 insertions(+), 2 deletions(-) create mode 100644 tests/golden/parallel_restore create mode 100644 tests/src/parallel_restore.c create mode 100644 tests/tests/parallel_restore.sh diff --git a/tests/Makefile b/tests/Makefile index 3a2380dc2..28dedb38e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,7 +15,8 @@ BIN := src/createmany \ src/o_tmpfile_umask \ src/o_tmpfile_linkat \ src/mmap_stress \ - src/mmap_validate + src/mmap_validate \ + src/parallel_restore DEPS := $(wildcard src/*.d) @@ -27,8 +28,11 @@ endif src/mmap_stress: LIBS+=-lpthread +src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/parallel_restore: ../utils/src/scoutfs_parallel_restore.a + $(BIN): %: %.c Makefile - gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) + gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) $($(@)_cflags) .PHONY: clean clean: diff --git a/tests/golden/parallel_restore b/tests/golden/parallel_restore new file mode 100644 index 000000000..a9b9d42ed --- /dev/null +++ b/tests/golden/parallel_restore @@ -0,0 +1,26 @@ +== simple mkfs/restore/mount +committed_seq 1120 +total_meta_blocks 163840 +total_data_blocks 15728640 + 1440 1440 57120 + 80 80 400 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 + Type Size Total Used Free Use% +MetaData 64KB 163840 34721 129119 21 + Data 4KB 15728640 64 15728576 0 +== under ENOSPC + Type Size Total Used Free Use% +MetaData 64KB 163840 115361 48479 70 + Data 4KB 15728640 64 15728576 0 +== ENOSPC +== attempt to restore data device +== attempt format_v1 restore +== test if previously mounted 
+== cleanup diff --git a/tests/sequence b/tests/sequence index 18eff7cff..c706e5299 100644 --- a/tests/sequence +++ b/tests/sequence @@ -57,4 +57,5 @@ archive-light-cycle.sh block-stale-reads.sh inode-deletion.sh renameat2-noreplace.sh +parallel_restore.sh xfstests.sh diff --git a/tests/src/parallel_restore.c b/tests/src/parallel_restore.c new file mode 100644 index 000000000..b6c82657a --- /dev/null +++ b/tests/src/parallel_restore.c @@ -0,0 +1,805 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../utils/src/parallel_restore.h" + +/* + * XXX: + * - add a nice description of what's going on + * - mention allocator contention + * - test child process dying handling + * - root dir entry name length is wrong + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define dprintf(fmt, args...) 
\ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) + +struct opts { + unsigned long long buf_size; + + unsigned long long write_batch; + unsigned long long low_dirs; + unsigned long long high_dirs; + unsigned long long low_files; + unsigned long long high_files; + char *meta_path; + unsigned long long total_files; + bool read_only; + unsigned long long seed; + unsigned long long nr_writers; +}; + +static void usage(void) +{ + printf("usage:\n" + " -b NR | threads write blocks in batches files (100000)\n" + " -d LOW:HIGH | range of subdirs per directory (5:10)\n" + " -f LOW:HIGH | range of files per directory (10:20)\n" + " -m PATH | path to metadata device\n" + " -n NR | total number of files to create (100)\n" + " -r | read-only, all work except writing, measure cpu cost\n" + " -s NR | randomization seed (random)\n" + " -w NR | number of writing processes to fork (online cpus)\n" + ); +} + +static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + if (!opts->read_only) + ret = pwrite(dev_fd, buf, count, off); + else + ret = count; + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct gen_inode { + struct scoutfs_parallel_restore_inode inode; + struct scoutfs_parallel_restore_xattr **xattrs; + u64 nr_xattrs; + struct scoutfs_parallel_restore_entry **entries; + u64 nr_files; + u64 nr_entries; +}; + +static void free_gino(struct gen_inode *gino) +{ + u64 i; + + if (gino) { + if (gino->entries) { + for (i = 0; i < gino->nr_entries; i++) + free(gino->entries[i]); + free(gino->entries); + } + if (gino->xattrs) { + for (i = 0; 
i < gino->nr_xattrs; i++) + free(gino->xattrs[i]); + free(gino->xattrs); + } + free(gino); + } +} + +static struct scoutfs_parallel_restore_xattr * +generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value, + int value_len) +{ + struct scoutfs_parallel_restore_xattr *xattr; + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos, + .name_len = name_len, + .value_len = value_len, + }; + + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + name_len); + + memcpy(xattr->name, name, name_len); + if (value_len) + memcpy(xattr->value, value, value_len); + + return xattr; +} + +static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode) +{ + struct gen_inode *gino; + struct timespec now; + + clock_gettime(CLOCK_REALTIME, &now); + + gino = calloc(1, sizeof(struct gen_inode)); + error_exit(!gino, "failure allocating generated inode"); + + gino->inode = (struct scoutfs_parallel_restore_inode) { + .ino = ino, + .meta_seq = ino, + .data_seq = 0, + .mode = mode, + .atime = now, + .ctime = now, + .mtime = now, + .crtime = now, + }; + + /* + * hacky creation of a bunch of xattrs for now. 
+ */ + if ((mode & S_IFMT) == S_IFREG) { + #define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, } + struct name_val { + char *name; + int len; + char *value; + int value_len; + } nv[] = { + NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"), + NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""), + NV("scoutfs.hide.sam_reqcopies", ""), + NV("scoutfs.hide.sam_copy_2", ""), + NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"), + NV("scoutfs.hide.sam_copy_1", ""), + NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""), + NV("scoutfs.hide.srch.sam_release_cand", ""), + NV("scoutfs.hide.sam_restime", ""), + NV("scoutfs.hide.sam_uuid", ""), + NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"), + NV("scoutfs.hide.srch.sam_vol_F01030L6", ""), + NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""), + NV("scoutfs.hide.srch.sam_vol_E01001L6", ""), + NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"), + }; + unsigned int nr = array_size(nv); + int i; + + gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *)); + + for (i = 0; i < nr; i++) + gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len, + nv[i].value, nv[i].value_len); + + gino->nr_xattrs = nr; + gino->inode.nr_xattrs = nr; + + gino->inode.size = 4096; + gino->inode.offline = true; + } + + return gino; +} + +static struct scoutfs_parallel_restore_entry * +generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode) +{ + struct scoutfs_parallel_restore_entry *entry; + char buf[PATH_MAX]; + int bytes; + + bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes); + error_exit(!entry, "error allocating generated entry"); + + *entry = 
(struct scoutfs_parallel_restore_entry) { + .dir_ino = dir_ino, + .pos = pos, + .ino = ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = bytes, + }; + + memcpy(entry->name, buf, bytes); + + return entry; +} + +static u64 random64(void) +{ + return ((u64)lrand48() << 32) | lrand48(); +} + +static u64 random_range(u64 low, u64 high) +{ + return low + (random64() % (high - low + 1)); +} + +static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len, + bool no_dirs) +{ + struct scoutfs_parallel_restore_entry *entry; + struct gen_inode *gino; + u64 nr_entries; + u64 nr_files; + u64 nr_dirs; + u64 ino; + char *prefix; + mode_t mode; + u64 i; + + nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs); + nr_files = random_range(opts->low_files, opts->high_files); + + if (1 + nr_dirs + nr_files > ino_len) { + nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2; + nr_files = (ino_len - 1) - nr_dirs; + } + + nr_entries = nr_dirs + nr_files; + + gino = generate_inode(opts, dir_ino, DIR_MODE); + error_exit(!gino, "error allocating generated inode"); + + gino->inode.nr_subdirs = nr_dirs; + gino->nr_files = nr_files; + + if (nr_entries) { + gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *)); + error_exit(!gino->entries, "error allocating generated inode entries"); + + gino->nr_entries = nr_entries; + } + + mode = DIR_MODE; + prefix = "dir"; + for (i = 0; i < nr_entries; i++) { + if (i == nr_dirs) { + mode = REG_MODE; + prefix = "file"; + } + + ino = ino_start + i; + entry = generate_entry(opts, prefix, ino, gino->inode.ino, + SCOUTFS_DIRENT_FIRST_POS + i, ino, mode); + + gino->entries[i] = entry; + gino->inode.total_entry_name_bytes += entry->name_len; + } + + return gino; +} + +/* + * Restore a generated inode. If it's a directory then we also restore + * all its entries. The caller is going to descend into subdir entries and generate + * those dir inodes. 
We have to generate and restore all non-dir inodes referenced + * by this inode's entries. + */ +static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + struct gen_inode *gino) +{ + struct gen_inode *nondir; + int ret; + u64 i; + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + + for (i = 0; i < gino->nr_entries; i++) { + ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]); + error_exit(ret, "thread add entry %d", ret); + + /* caller only needs subdir entries, generate and free others */ + if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) { + + nondir = generate_inode(opts, gino->entries[i]->ino, + gino->entries[i]->mode); + restore_inode(opts, wri, nondir); + free_gino(nondir); + + free(gino->entries[i]); + if (i != gino->nr_entries - 1) + gino->entries[i] = gino->entries[gino->nr_entries - 1]; + gino->nr_entries--; + gino->nr_files--; + i--; + } + } + + for (i = 0; i < gino->nr_xattrs; i++) { + ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]); + error_exit(ret, "thread add xattr %d", ret); + } +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; + u64 writer_nr; + u64 dir_height; + u64 ino_start; + u64 ino_len; +}; + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 bytes_written; +}; + +static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd, + struct write_result *res, bool get_slice, int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(opts, wri, buf, buf_size, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = 
scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Calculate the number of bytes in toplevel "dir-%llu" entry names for the given + * number of writers. + */ +static u64 topdir_entry_bytes(u64 nr_writers) +{ + u64 bytes = (3 + 1) * nr_writers; + u64 limit; + u64 done; + u64 wid; + u64 nr; + + for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) { + nr = min(limit - done, nr_writers - done); + bytes += nr * wid; + } + + return bytes; +} + +struct dir_pos { + struct gen_inode *gino; + u64 pos; +}; + +static void writer_proc(struct opts *opts, struct writer_args *args) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_entry *entry; + struct dir_pos *dirs = NULL; + struct write_result res; + struct gen_inode *gino; + void *buf = NULL; + u64 level; + u64 ino; + int ret; + + memset(&res, 0, sizeof(res)); + + dirs = calloc(args->dir_height, sizeof(struct dir_pos)); + error_exit(errno, "error allocating parent dirs "ERRF, ERRA); + + errno = posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block buf "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_add_slice(wri, &args->slice); + error_exit(ret, "add slice %d", ret); + + /* writer 0 creates the root dir */ + if (args->writer_nr == 0) { + gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE); + gino->inode.nr_subdirs = opts->nr_writers; + gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers); + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + free_gino(gino); + } + + /* create root 
entry for our top level dir */ + ino = args->ino_start++; + args->ino_len--; + + entry = generate_entry(opts, "top", args->writer_nr, + SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr, + ino, DIR_MODE); + + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "thread top entry %d", ret); + free(entry); + + level = args->dir_height - 1; + + while (args->ino_len > 0 && level < args->dir_height) { + gino = dirs[level].gino; + + /* generate and restore if we follow entries */ + if (!gino) { + gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0); + args->ino_start += gino->nr_entries; + args->ino_len -= gino->nr_entries; + le64_add_cpu(&res.files_created, gino->nr_files); + + restore_inode(opts, wri, gino); + dirs[level].gino = gino; + } + + if (dirs[level].pos == gino->nr_entries) { + /* ascend if we're done with this dir */ + dirs[level].gino = NULL; + dirs[level].pos = 0; + free_gino(gino); + level++; + + } else { + /* otherwise descend into subdir entry */ + ino = gino->entries[dirs[level].pos]->ino; + dirs[level].pos++; + level--; + } + + /* do a partial write at batch intervals when there's still more to do */ + if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0) + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, false, args->pair_fd); + } + + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&wri); + + free(dirs); + free(buf); +} + +/* + * If any of our children exited with an error code, we hard exit. + * The child processes should themselves report out any errors + * encountered. Any remaining children will receive SIGHUP and + * terminate. 
+ */ +static void sigchld_handler(int signo, siginfo_t *info, void *context) +{ + if (info->si_status) + exit(EXIT_FAILURE); +} + +static void fork_writer(struct opts *opts, struct writer_args *args) +{ + pid_t parent = getpid(); + pid_t pid; + int ret; + + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid != 0) + return; + + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + printf("pid %u getpid() %u parent %u getppid() %u\n", + pid, getpid(), parent, getppid()); + error_exit(getppid() != parent, "child parent already changed"); + + writer_proc(opts, args); + exit(0); +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct write_result res; + struct writer_args *args; + struct timespec begin; + struct timespec end; + LIST_HEAD(writers); + u64 next_ino; + u64 ino_per; + u64 avg_dirs; + u64 avg_files; + u64 dir_height; + u64 tot_files; + u64 tot_bytes; + int pair[2] = {-1, -1}; + float secs; + void *buf = NULL; + int dev_fd = -1; + int ret; + int i; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? 
O_RDONLY : (O_RDWR|O_EXCL))); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1; + ino_per = opts->total_files / opts->nr_writers; + avg_dirs = (opts->low_dirs + opts->high_dirs) / 2; + avg_files = (opts->low_files + opts->high_files) / 2; + + dir_height = 1; + tot_files = avg_files * opts->nr_writers; + + while (tot_files < opts->total_files) { + dir_height++; + tot_files *= avg_dirs; + } + + dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files); + + clock_gettime(CLOCK_MONOTONIC_RAW, &begin); + + /* start each writing process */ + for (i = 0; i < opts->nr_writers; i++) { + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->pair_fd = pair[1]; + args->slice = slices[1 + i]; + args->writer_nr = i; + args->dir_height = dir_height; + args->ino_start = next_ino; + args->ino_len = ino_per; + + list_add_tail(&args->head, &writers); + next_ino += ino_per; + + fork_writer(opts, args); + } 
+ + /* read results and watch for writers to finish */ + tot_files = 0; + tot_bytes = 0; + i = 0; + while (i < opts->nr_writers) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + i++; + } + + tot_files += le64_to_cpu(res.files_created); + tot_bytes += le64_to_cpu(res.bytes_written); + } + + tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd); + + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + if (!opts->read_only) { + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + } + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + + scoutfs_parallel_restore_destroy_writer(&wri); + + secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) - + ((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC)); + printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n", + tot_files, tot_bytes, secs, + (float)tot_bytes / tot_files, (float)tot_files / secs); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] >= 0) + close(pair[0]); + if (pair[1] >= 0) + close(pair[1]); + free(super); + free(slices); + free(buf); + + return 0; +} + +static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret) +{ + char *sep; + int ret = 0; + + sep = index(str, ':'); + if (sep) { + *sep = '\0'; + ret = parse_u64(sep + 1, high_ret); + } + + if (ret == 0) + ret = parse_u64(str, low_ret); + + if (sep) + *sep = ':'; + + return ret; +} + +int main(int argc, char **argv) +{ + struct opts opts = { + .buf_size = (32 * 1024 * 1024), + + 
.write_batch = 1000000, + .low_dirs = 5, + .high_dirs = 10, + .low_files = 10, + .high_files = 20, + .total_files = 100, + }; + struct sigaction act = { 0 }; + int ret; + int c; + + opts.seed = random64(); + opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN); + + while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) { + switch(c) { + case 'b': + ret = parse_u64(optarg, &opts.write_batch); + error_exit(ret, "error parsing -b '%s'\n", optarg); + error_exit(opts.write_batch == 0, "-b can't be 0"); + break; + case 'd': + ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs); + error_exit(ret, "error parsing -d '%s'\n", optarg); + break; + case 'f': + ret = parse_low_high(optarg, &opts.low_files, &opts.high_files); + error_exit(ret, "error parsing -f '%s'\n", optarg); + break; + case 'm': + opts.meta_path = strdup(optarg); + break; + case 'n': + ret = parse_u64(optarg, &opts.total_files); + error_exit(ret, "error parsing -n '%s'\n", optarg); + break; + case 'r': + opts.read_only = true; + break; + case 's': + ret = parse_u64(optarg, &opts.seed); + error_exit(ret, "error parsing -s '%s'\n", optarg); + break; + case 'w': + ret = parse_u64(optarg, &opts.nr_writers); + error_exit(ret, "error parsing -w '%s'\n", optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu", + opts.low_dirs, opts.high_dirs); + error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu", + opts.low_files, opts.high_files); + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + + printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n", + opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files, + opts.total_files, opts.seed, opts.nr_writers); + + act.sa_flags = SA_SIGINFO | SA_RESTART; + act.sa_sigaction = &sigchld_handler; + if (sigaction(SIGCHLD, &act, NULL) == -1) + error_exit(ret, "error setting up signal 
handler\n"); + + ret = do_restore(&opts); + + free(opts.meta_path); + + return ret == 0 ? 0 : 1; +} diff --git a/tests/tests/parallel_restore.sh b/tests/tests/parallel_restore.sh new file mode 100644 index 000000000..a43d2d2cb --- /dev/null +++ b/tests/tests/parallel_restore.sh @@ -0,0 +1,74 @@ +# +# validate parallel restore library +# + +t_require_commands scoutfs parallel_restore find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== simple mkfs/restore/mount" +# meta device just big enough for reserves and the metadata we'll fill +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount + +scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid' +find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc +scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc +find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L +scoutfs df -p "$SCR" +scoutfs quota-list -p "$SCR" + +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== under ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" -n 2000000 > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount +scoutfs df -p "$SCR" +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m 
"$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 4000000 | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt to restore data device" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt format_v1 restore" +scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== test if previously mounted" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \ + "$T_EX_DATA_DEV" "$SCR" +umount "$SCR" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== cleanup" +rmdir "$SCR" + +t_pass From a2eb157a1f0ee2838744213c12e44cf07220ded4 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Fri, 3 May 2024 13:38:55 -0400 Subject: [PATCH 09/15] Copy a tree using parallel restore library. This tool compies a source tree (whether it's scoutfs or not) into an offline scoutfs meta device. It has only those 2 parameters and does a single-process walk of the tree to restore all items while preservice as much of the metadata as possible. 
Signed-off-by: Hunter Shaffer Signed-off-by: Auke Kok --- tests/Makefile | 5 +- tests/golden/restore_copy | 64 +++ tests/sequence | 1 + tests/src/restore_copy.c | 959 ++++++++++++++++++++++++++++++++++++ tests/tests/restore_copy.sh | 99 ++++ 5 files changed, 1127 insertions(+), 1 deletion(-) create mode 100644 tests/golden/restore_copy create mode 100644 tests/src/restore_copy.c create mode 100644 tests/tests/restore_copy.sh diff --git a/tests/Makefile b/tests/Makefile index 28dedb38e..72894eb28 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -16,7 +16,8 @@ BIN := src/createmany \ src/o_tmpfile_linkat \ src/mmap_stress \ src/mmap_validate \ - src/parallel_restore + src/parallel_restore \ + src/restore_copy DEPS := $(wildcard src/*.d) @@ -30,6 +31,8 @@ src/mmap_stress: LIBS+=-lpthread src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm src/parallel_restore: ../utils/src/scoutfs_parallel_restore.a +src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/restore_copy : ../utils/src/scoutfs_parallel_restore.a $(BIN): %: %.c Makefile gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) $($(@)_cflags) diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy new file mode 100644 index 000000000..186b7e0d4 --- /dev/null +++ b/tests/golden/restore_copy @@ -0,0 +1,64 @@ +== restore_copy content verification +d /mnt/test/data/d +f /mnt/test/data/f +l /mnt/test/data/l -> broken +f /mnt/test/data/h +l /mnt/test/data/F -> f +b /mnt/test/data/b +c /mnt/test/data/c +c /mnt/test/data/u +p /mnt/test/data/p +f /mnt/test/data/f4096 +f /mnt/test/data/falloc +f /mnt/test/data/truncate +s /mnt/test/data/s +f /mnt/test/data/mode_t +f /mnt/test/data/uidgid +f /mnt/test/data/retention +f /mnt/test/data/proj +d /mnt/test/data +Quota rule: 7 13,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 11,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 12,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 10,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 15,L,- 0,L,- 0,L,- I 33 - +Quota rule: 
7 14,L,- 0,L,- 0,L,- I 33 - +Wrote 1 directories, 0 files, 458752 bytes total +== verify metadata bits on restored fs +total 16516 +-rw-r--r--. 1 33333 33333 0 uidgid +crw-r--r--. 1 0 0 2, 2 u +-rw-r--r--. 1 0 0 16777216 truncate +srwxr-xr-x. 1 0 0 0 s +-rw-r--r--. 1 0 0 0 retention +-rw-r--r--. 1 0 0 0 proj +prw-r--r--. 1 0 0 0 p +-rwsrwsrwx. 1 0 0 0 mode_t +lrwxrwxrwx. 1 0 0 7 l -> broken +-rw-r--r--. 1 0 0 0 h +-rw-r--r--. 1 0 0 131072 falloc +-rw-r--r--. 1 0 0 4096 f4096 +-rw-r--r--. 1 0 0 0 f +drwxr-xr-x. 2 0 0 0 d +crw-r--r--. 1 0 0 0, 0 c +brw-r--r--. 1 0 0 1, 1 b +lrwxrwxrwx. 1 0 0 2 F -> f +1 +12345 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 32 flags: O.L +extents: 1 +0: offset: 0 length: 4096 flags: O.L +extents: 1 + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - + Type Size Total Used Free Use% +MetaData 64KB 4194304 34698 4159606 0 + Data 4KB 67108864 64 67108800 0 +== umount restored fs and check +== cleanup diff --git a/tests/sequence b/tests/sequence index c706e5299..35a194d84 100644 --- a/tests/sequence +++ b/tests/sequence @@ -58,4 +58,5 @@ block-stale-reads.sh inode-deletion.sh renameat2-noreplace.sh parallel_restore.sh +restore_copy.sh xfstests.sh diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c new file mode 100644 index 000000000..f474aa2d8 --- /dev/null +++ b/tests/src/restore_copy.c @@ -0,0 +1,959 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../kmod/src/ioctl.h" +#include 
"../../utils/src/parallel_restore.h" + +/* + * XXX: + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) +#define LNK_MODE (S_IFLNK | 0777) + +/* + * At about 1k files we seem to be writing about 1MB of data, so + * set buffer sizes adequately above that. + */ +#define BATCH_FILES 1024 +#define BUF_SIZ 2 * 1024 * 1024 + +/* + * We can't make duplicate inodes for hardlinked files, so we + * will need to track these as we generate them. Not too costly + * to do, since it's just an integer, and sorting shouldn't matter + * until we get into the millions of entries, hopefully. + */ +static struct list_head hardlinks; +struct hardlink_head { + struct list_head head; + u64 ino; +}; + +struct opts { + char *meta_path; + char *source_dir; +}; + +static bool warn_scoutfs = false; + +static void usage(void) +{ + printf("usage:\n" + " -m PATH | path to metadata device\n" + " -s PATH | path to source directory\n" + ); +} + +static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + ret = pwrite(dev_fd, buf, count, off); + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 dirs_created; + __le64 bytes_written; + bool complete; +}; + +static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd, + struct write_result *res, bool get_slice, 
int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(wri, buf, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Adding xattrs is supported for files and directories only. + * + * If the filesystem on which the path resides isn't scoutfs, we omit the + * scoutfs specific ioctl to fetch hidden xattrs. + * + * Untested if the hidden xattr ioctl works on directories or symlinks. + */ +static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs) +{ + struct scoutfs_ioctl_listxattr_hidden lxh; + struct scoutfs_parallel_restore_xattr *xattr; + char *buf = NULL; + char *name = NULL; + int fd = -1; + int bytes; + int len; + int value_len; + int ret; + int pos = 0; + + if (!is_scoutfs) + goto normal_xattrs; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + memset(&lxh, 0, sizeof(lxh)); + lxh.id_pos = 0; + lxh.hash_pos = 0; + lxh.buf_bytes = 256 * 1024; + + buf = malloc(lxh.buf_bytes); + error_exit(!buf, "alloc xattr_hidden buf"); + lxh.buf_ptr = (unsigned long)buf; + + /* hidden */ + for (;;) { + ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh); + if (ret == 0) /* done */ + break; + error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA); + bytes = ret; + error_exit(bytes > lxh.buf_bytes, "listxattr_hidden overflow"); + error_exit(buf[bytes - 1] != '\0', "listxattr_hidden didn't term"); + + name = buf; + + do { + len = strlen(name); + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + 
/* get value len */ + value_len = fgetxattr(fd, name, NULL, 0); + error_exit(value_len < 0, "malloc value hidden"ERRF, ERRA); + + /* allocate everything at once */ + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + /* get value into xattr directly */ + ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len); + error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add hidden xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + } + + free(buf); + close(fd); + +normal_xattrs: + value_len = listxattr(path, NULL, 0); + error_exit(value_len < 0, "hidden listxattr "ERRF, ERRA); + if (value_len == 0) + return; + + buf = calloc(1, value_len); + error_exit(!buf, "malloc value"ERRF, ERRA); + + ret = listxattr(path, buf, value_len); + error_exit(ret < 0, "hidden listxattr %d", ret); + + name = buf; + bytes = ret; + do { + len = strlen(name); + + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + value_len = getxattr(path, name, NULL, 0); + error_exit(value_len < 0, "value "ERRF, ERRA); + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + ret = getxattr(path, name, (void *)(xattr->name + len), value_len); + 
error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + + free(buf); +} + +/* + * We can't store the same inode multiple times, so we need to make + * sure to account for hardlinks. Maintain a LL that stores the first + * hardlink inode we encounter, and every subsequent hardlink to this + * inode will omit inserting an inode, and just adds another entry + */ +static bool is_new_inode_item(bool nlink, u64 ino) +{ + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + + if (!nlink) + return true; + + /* lineair search, pretty awful, should be a binary tree */ + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + if (hh->ino == ino) + return false; + } + + /* insert item */ + hh = malloc(sizeof(struct hardlink_head)); + error_exit(!hh, "malloc"); + hh->ino = ino; + list_add_tail(&hh->head, &hardlinks); + + /* + * XXX + * + * We can be confident that if we don't traverse filesystems + * that once we've created N entries of an N-linked inode, that + * it can be removed from the LL. This would significantly + * improve the manageability of the list. + * + * All we'd need to do is add a counter and compare it to the nr_links + * field of the inode. 
+ */ + + return true; +} + +/* + * create the inode data for a given path as best as possible + * duplicating the exact data from the source path + */ +static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs) +{ + struct scoutfs_parallel_restore_inode *inode = NULL; + struct scoutfs_ioctl_stat_more stm; + struct scoutfs_ioctl_inode_attr_x iax; + struct stat st; + int ret; + int fd; + + inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode)); + error_exit(!inode, "failure allocating inode"); + + ret = lstat(path, &st); + error_exit(ret, "failure stat inode"); + + /* use exact inode numbers from path, except for root ino */ + if (ino != SCOUTFS_ROOT_INO) + inode->ino = st.st_ino; + else + inode->ino = SCOUTFS_ROOT_INO; + + inode->mode = st.st_mode; + inode->uid = st.st_uid; + inode->gid = st.st_gid; + inode->atime = st.st_atim; + inode->ctime = st.st_ctim; + inode->mtime = st.st_mtim; + inode->size = st.st_size; + + inode->rdev = st.st_rdev; + + /* scoutfs specific */ + inode->meta_seq = 0; + inode->data_seq = 0; + inode->crtime = st.st_ctim; + + if (S_ISREG(inode->mode)) { + if (inode->size > 0) + inode->offline = true; + + if (is_scoutfs) { + fd = open(path, O_RDONLY); + error_exit(!fd, "open failure"ERRF, ERRA); + + ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm); + error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode"); + + inode->meta_seq = stm.meta_seq; + inode->data_seq = stm.data_seq; + inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec}; + + /* project ID, retention bit */ + memset(&iax, 0, sizeof(iax)); + + iax.x_flags = 0; + iax.x_mask = SCOUTFS_IOC_IAX_PROJECT_ID | SCOUTFS_IOC_IAX__BITS; + iax.bits = SCOUTFS_IOC_IAX_B_RETENTION; + + ret = ioctl(fd, SCOUTFS_IOC_GET_ATTR_X, &iax); + error_exit(ret, "failure SCOUTFS_IOC_GET_ATTR_X inode"); + + inode->proj = iax.project_id; + inode->flags |= (iax.bits & SCOUTFS_IOC_IAX_B_RETENTION) ? 
SCOUTFS_INO_FLAG_RETENTION : 0; + + close(fd); + } + + } + + /* pass whether item is hardlinked or not */ + *nlink = (st.st_nlink > 1); + + return inode; +} + +typedef int (*quota_ioctl_in)(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *gqr, + size_t nr, int fd); + +static int get_quota_ioctl(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *rules_in, + size_t nr, int fd) +{ + struct scoutfs_ioctl_get_quota_rules *gqr = rules_in; + int ret; + + gqr->rules_ptr = (intptr_t)irules; + gqr->rules_nr = nr; + + ret = ioctl(fd, SCOUTFS_IOC_GET_QUOTA_RULES, gqr); + error_exit(ret < 0, "quota ioctl error"); + + return ret; +} + +static char opc[] = { + [SQ_OP_DATA] = 'D', + [SQ_OP_INODE] = 'I', +}; + +static char nsc[] = { + [SQ_NS_LITERAL] = 'L', + [SQ_NS_PROJ] = 'P', + [SQ_NS_UID] = 'U', + [SQ_NS_GID] = 'G', +}; + +static int insert_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_ioctl_quota_rule *irule) +{ + struct scoutfs_parallel_restore_quota_rule *prule = NULL; + int ret; + int i; + + prule = calloc(1, sizeof(struct scoutfs_parallel_restore_quota_rule)); + error_exit(!prule, "quota rule alloc failed"); + prule->limit = irule->limit; + prule->prio = irule->prio; + prule->op = irule->op; + prule->rule_flags = irule->rule_flags; + prule->names[0].val = irule->name_val[0]; + prule->names[0].source = irule->name_source[0]; + prule->names[0].flags = irule->name_flags[0]; + prule->names[1].val = irule->name_val[1]; + prule->names[1].source = irule->name_source[1]; + prule->names[1].flags = irule->name_flags[1]; + prule->names[2].val = irule->name_val[2]; + prule->names[2].source = irule->name_source[2]; + prule->names[2].flags = irule->name_flags[2]; + + /* print out the rule */ + printf("Quota rule: %3u ", irule->prio); + for (i = 0; i < array_size(irule->name_val); i++) { + printf("%llu,%c,%c ", + irule->name_val[i], + nsc[irule->name_source[i]], + (irule->name_flags[i] & 
SQ_NF_SELECT) ? 'S' : '-'); + } + printf("%c %llu %c\n", + opc[irule->op], irule->limit, (irule->rule_flags & SQ_RF_TOTL_COUNT) ? 'C' : '-'); + + ret = scoutfs_parallel_restore_add_quota_rule(wri, prule); + error_exit(ret, "quota add rule %d", ret); + free(prule); + return ret; +} + +static int restore_quotas(struct scoutfs_parallel_restore_writer *wri, + quota_ioctl_in quota_in, char *path) +{ + struct scoutfs_ioctl_get_quota_rules gqr = {{0,}}; + struct scoutfs_ioctl_quota_rule *irules = NULL; + size_t rule_alloc = 0; + size_t rule_nr = 0; + size_t rule_count; + size_t i; + int fd = -1; + int ret; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + for (;;) { + if (rule_nr == rule_alloc) { + rule_alloc += 1024; + irules = realloc(irules, rule_alloc * sizeof(irules[0])); + error_exit(!irules, "irule realloc failed rule_nr:%zu alloced:%zu", rule_nr, rule_alloc); + if (!irules) { + ret = -errno; + fprintf(stderr, "memory allocation failed: %s (%d)\n", + strerror(errno), errno); + goto out; + } + } + + ret = quota_in(&irules[rule_nr], &gqr, rule_alloc - rule_nr, fd); + if (ret == 0) + break; + if (ret < 0) + goto out; + + rule_count = ret; + + for (i = 0; i < rule_count; i++) { + ret = insert_quota_rule(wri, &irules[i]); + if (ret < 0) + goto out; + } + } + + ret = 0; +out: + if (fd >= 0) + close(fd); + if (irules) + free(irules); + return ret; +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; +}; + +static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino) +{ + struct scoutfs_parallel_restore_inode *inode; + struct scoutfs_parallel_restore_entry *entry; + DIR *dirp = NULL; + char *subdir = NULL; + char link[PATH_MAX + 1]; + struct dirent *ent; + struct statfs stf; + int ret = 0; + int subdir_count = 0, file_count = 0; + size_t ent_len = 0; + size_t pos = 0; + bool 
nlink = false; + char ind = '?'; + u64 mode; + bool is_scoutfs = false; + + /* get fs info once per path */ + ret = statfs(path, &stf); + error_exit(ret != 0, "statfs"ERRF, ERRA); + is_scoutfs = (stf.f_type == 0x554f4353); + + if (!is_scoutfs && !warn_scoutfs) { + warn_scoutfs = true; + fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n"); + } + + + /* traverse the entire tree */ + dirp = opendir(path); + errno = 0; + while ((ent = readdir(dirp))) { + if (ent->d_type == DT_DIR) { + if ((strcmp(ent->d_name, ".") == 0) || + (strcmp(ent->d_name, "..") == 0)) { + /* position still matters */ + pos++; + continue; + } + + /* recurse into subdir */ + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + restore_path(wri, args, res, buf, subdir, ent->d_ino); + + subdir_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = DIR_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + + free(subdir); + + le64_add_cpu(&res->dirs_created, 1); + } else if (ent->d_type == DT_REG) { + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = REG_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + 
memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "f %s/%s\n", path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + + /* xattrs */ + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } else if (ent->d_type == DT_LNK) { + /* readlink */ + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + ent_len += strlen(ent->d_name); + + ret = readlink(subdir, link, PATH_MAX); + error_exit(ret < 0, "readlink %d", ret); + /* must 0-terminate if we want to print it */ + link[ret] = 0; + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = LNK_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add symlink entry %d", ret); + + /* link inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + + fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link); + + inode->mode = LNK_MODE; + inode->target = link; + inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */ + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add syml inode %d", ret); + + free(inode); + free(subdir); 
+ + le64_add_cpu(&res->files_created, 1); + } else { + /* odd stuff */ + switch(ent->d_type) { + case DT_CHR: + ind = 'c'; + mode = S_IFCHR; + break; + case DT_BLK: + ind = 'b'; + mode = S_IFBLK; + break; + case DT_FIFO: + ind = 'p'; + mode = S_IFIFO; + break; + case DT_SOCK: + ind = 's'; + mode = S_IFSOCK; + break; + default: + error_exit(true, "Unknown readdir entry type"); + ;; + } + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "%c %s/%s\n", ind, path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } + + /* batch out changes, will be about 1M */ + if (le64_to_cpu(res->files_created) > BATCH_FILES) { + write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd); + } + + } + if (ent != NULL) + error_exit(errno, "readdir"ERRF, ERRA); + closedir(dirp); + + /* create the dir itself */ + inode = read_inode_data(path, ino, &nlink, is_scoutfs); + inode->nr_subdirs = subdir_count; + inode->total_entry_name_bytes = ent_len; + fprintf(stdout, "d %s\n", path); + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add 
dir inode %d", ret); + + free(inode); + + /* No need to send, we'll send final after last directory is complete */ +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *pwri, *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct writer_args *args; + struct write_result res; + int pair[2] = {-1, -1}; + LIST_HEAD(writers); + void *buf = NULL; + void *bufp = NULL; + int dev_fd = -1; + pid_t pid; + int ret; + u64 tot_bytes; + u64 tot_dirs; + u64 tot_files; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL)); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev"); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 2); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->slice = slices[1]; + args->pair_fd = pair[1]; + list_add_tail(&args->head, &writers); + + /* fork writer 
process */ + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid == 0) { + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufp "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&pwri); + error_exit(ret, "create pwriter %d", ret); + + ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice); + error_exit(ret, "add pslice %d", ret); + + memset(&res, 0, sizeof(res)); + + restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO); + + ret = restore_quotas(pwri, get_quota_ioctl, opts->source_dir); + error_exit(ret, "quota add %d", ret); + + res.complete = true; + + write_bufs_and_send(pwri, buf, args->dev_fd, &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&pwri); + free(bufp); + + exit(0); + }; + + /* read results and wait for writer to finish */ + tot_bytes = 0; + tot_dirs = 1; + tot_files = 0; + for (;;) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + + if (res.complete) + break; + } + + tot_bytes += le64_to_cpu(res.bytes_written); + tot_files += le64_to_cpu(res.files_created); + tot_dirs += le64_to_cpu(res.dirs_created); + } + + tot_bytes += write_bufs(wri, buf, args->dev_fd); + + fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n", + tot_dirs, tot_files, tot_bytes); + + /* write super to finalize */ + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << 
SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + + scoutfs_parallel_restore_destroy_writer(&wri); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] > 0) + close(pair[0]); + if (pair[1] > 0) + close(pair[1]); + free(super); + free(args); + free(slices); + free(buf); + + return 0; +} + +int main(int argc, char **argv) +{ + struct opts opts = (struct opts){ 0 }; + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + int ret; + int c; + + INIT_LIST_HEAD(&hardlinks); + + while ((c = getopt(argc, argv, "b:m:s:")) != -1) { + switch(c) { + case 'm': + opts.meta_path = strdup(optarg); + break; + case 's': + opts.source_dir = strdup(optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + error_exit(!opts.source_dir, "must specify source directory path with -s"); + + ret = do_restore(&opts); + + free(opts.meta_path); + free(opts.source_dir); + + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + list_del_init(&hh->head); + free(hh); + } + + return ret == 0 ? 
0 : 1; +} diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh new file mode 100644 index 000000000..2fe4be247 --- /dev/null +++ b/tests/tests/restore_copy.sh @@ -0,0 +1,99 @@ +# +# validate parallel restore library - using restore_copy.c +# + +t_require_commands scoutfs restore_copy find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== restore_copy content verification" +mkdir "$T_M0/data" + +# create all supported inode types: +mkdir -p "$T_M0/data/d" +touch "$T_M0/data/f" +ln -sf "broken" "$T_M0/data/l" +ln "$T_M0/data/f" "$T_M0/data/h" +ln -sf "f" "$T_M0/data/F" +mknod "$T_M0/data/b" b 1 1 +mknod "$T_M0/data/c" c 0 0 +mknod "$T_M0/data/u" u 2 2 +mknod "$T_M0/data/p" p + +# some files with data +dd if=/dev/zero of="$T_M0/data/f4096" bs=4096 count=1 status=none +touch "$T_M0/data/falloc" "$T_M0/data/truncate" +xfs_io -C "falloc 65536 65536" "$T_M0/data/falloc" +xfs_io -C "truncate $((4096 * 4096))" "$T_M0/data/truncate" + +# socket (could have used python but avoids python/python2/python3 problem) +perl -e "use IO::Socket; my \$s = IO::Socket::UNIX->new(Type=>SOCK_STREAM,Local=>'$T_M0/data/s') or die 'sock';" +# set all mode_t bits +touch "$T_M0/data/mode_t" +chmod 6777 "$T_M0/data/mode_t" +# uid/gid +touch "$T_M0/data/uidgid" +chown 33333:33333 "$T_M0/data/uidgid" +# set retention bit +touch "$T_M0/data/retention" +scoutfs set-attr-x -t 1 "$T_M0/data/retention" +# set project ID +touch "$T_M0/data/proj" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj" +# quotas +for a in $(seq 10 15); do + scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -" +done + 
+scratch_mkfs -V 2 > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +restore_copy -m $T_EX_META_DEV -s "$T_M0/data" | t_filter_fs +scratch_check || t_fail "check before mount failed" + +scratch_mount + +echo "== verify metadata bits on restored fs" +inspect() { + ls -Alnr --time-style=+"" + scoutfs get-attr-x -t "retention" + scoutfs get-attr-x -p "proj" + scoutfs get-fiemap -L "f4096" + scoutfs get-fiemap -L "falloc" + scoutfs get-fiemap -L "truncate" + scoutfs quota-list -p "." +} + +( cd "$SCR" ; inspect ) + +scoutfs df -p "$SCR" + +echo "== umount restored fs and check" +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +#scoutfs print $T_META_DEVICE +#scoutfs print $T_EX_META_DEV + +echo "== cleanup" +rmdir "$SCR" +scoutfs set-attr-x -t 0 "$T_M0/data/retention" +rm -rf "$T_M0/data" +scoutfs quota-wipe -p "$T_M0" + +t_pass From e4f5fc4682fa5e91b0798ec02c45d580259abb9d Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 16 Jan 2025 11:34:26 -0800 Subject: [PATCH 10/15] Restore hardlink count. The hardlink count of files was previously hard coded to 1. We want to properly restore hard linked files because it saves space and time. The test binary restore_copy exposed this missed case before and is updated to make use of it. Signed-off-by: Auke Kok --- tests/golden/restore_copy | 4 ++-- tests/src/restore_copy.c | 1 + utils/src/parallel_restore.c | 2 +- utils/src/parallel_restore.h | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy index 186b7e0d4..a3144fec9 100644 --- a/tests/golden/restore_copy +++ b/tests/golden/restore_copy @@ -35,10 +35,10 @@ srwxr-xr-x. 1 0 0 0 s prw-r--r--. 1 0 0 0 p -rwsrwsrwx. 1 0 0 0 mode_t lrwxrwxrwx. 1 0 0 7 l -> broken --rw-r--r--. 1 0 0 0 h +-rw-r--r--. 2 0 0 0 h -rw-r--r--. 1 0 0 131072 falloc -rw-r--r--. 1 0 0 4096 f4096 --rw-r--r--. 1 0 0 0 f +-rw-r--r--. 2 0 0 0 f drwxr-xr-x. 2 0 0 0 d crw-r--r--. 1 0 0 0, 0 c brw-r--r--. 
1 0 0 1, 1 b diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c index f474aa2d8..f894bcfd0 100644 --- a/tests/src/restore_copy.c +++ b/tests/src/restore_copy.c @@ -349,6 +349,7 @@ static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 in inode->ctime = st.st_ctim; inode->mtime = st.st_mtim; inode->size = st.st_size; + inode->nlink = st.st_nlink; inode->rdev = st.st_rdev; diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c index e575011ca..8ca7fde43 100644 --- a/utils/src/parallel_restore.c +++ b/utils/src/parallel_restore.c @@ -813,7 +813,7 @@ static spr_err_t insert_inode_items(struct scoutfs_parallel_restore_writer *wri, si->next_readdir_pos = 0; si->next_xattr_id = cpu_to_le64(inode->nr_xattrs + 1); si->version = cpu_to_le64(1); - si->nlink = cpu_to_le32(1); + si->nlink = cpu_to_le32(inode->nlink ? inode->nlink : 1); si->uid = cpu_to_le32(inode->uid); si->gid = cpu_to_le32(inode->gid); si->mode = cpu_to_le32(inode->mode); diff --git a/utils/src/parallel_restore.h b/utils/src/parallel_restore.h index 51fcf4b21..8865e8423 100644 --- a/utils/src/parallel_restore.h +++ b/utils/src/parallel_restore.h @@ -58,6 +58,7 @@ struct scoutfs_parallel_restore_inode { u64 data_version; u64 size; bool offline; + u32 nlink; /* only used for directories */ u64 nr_subdirs; From aba4eb12ac90ca9c26d2f775695a0409c34c0b6c Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Fri, 24 Jan 2025 12:57:35 -0500 Subject: [PATCH 11/15] Fix Quota hashing When initializing the key for the quota we were originally given the address of the pointer to the rule, that is fixed here. There is also a test case verifying that we are able to perform operations such as rule deletion and adding a rule to the restored filesystem. 
Signed-off-by: Hunter Shaffer --- tests/golden/restore_copy | 12 ++++++++++++ tests/tests/restore_copy.sh | 6 ++++++ utils/src/parallel_restore.c | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy index a3144fec9..a3a009cf3 100644 --- a/tests/golden/restore_copy +++ b/tests/golden/restore_copy @@ -51,6 +51,18 @@ extents: 1 extents: 1 0: offset: 0 length: 4096 flags: O.L extents: 1 + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - +== verify quota rules on restored fs + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - 7 15,L,- 0,L,- 0,L,- I 33 - 7 14,L,- 0,L,- 0,L,- I 33 - 7 13,L,- 0,L,- 0,L,- I 33 - diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh index 2fe4be247..d74955a10 100644 --- a/tests/tests/restore_copy.sh +++ b/tests/tests/restore_copy.sh @@ -81,6 +81,12 @@ inspect() { ( cd "$SCR" ; inspect ) +echo "== verify quota rules on restored fs" +scoutfs quota-del -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-del failed" +scoutfs quota-list -p "$T_M0" +scoutfs quota-add -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-add failed" +scoutfs quota-list -p "$T_M0" + scoutfs df -p "$SCR" echo "== umount restored fs and check" diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c index 8ca7fde43..ecd84ca42 100644 --- a/utils/src/parallel_restore.c +++ b/utils/src/parallel_restore.c @@ -951,7 +951,7 @@ static spr_err_t insert_quota_item(struct scoutfs_parallel_restore_writer *wri, memset(&rv->_pad, 0, sizeof(rv->_pad)); init_key(&bti->key, SCOUTFS_QUOTA_ZONE, SCOUTFS_QUOTA_RULE_TYPE, - 0, scoutfs_hash64(&rv, sizeof(rv)), 0, 0); + 0, scoutfs_hash64(rv, sizeof(struct scoutfs_quota_rule_val)), 0, 0); err = 
insert_fs_item(wri, bti); if (err) { From a1e8f38ef3b1d54cd1474db3924b3aa07b955253 Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Wed, 5 Mar 2025 12:39:06 -0500 Subject: [PATCH 12/15] Limit scoutfs df output in testing The tests scripts for restore_copy and parallel_restore were diffing because of the scoutfs df output. This happened because the fields other than Used would be dependent on the disk size used. This patch fixes this by limiting the output to only the type and space used. Signed-off-by: Hunter Shaffer --- tests/golden/parallel_restore | 12 ++++++------ tests/golden/restore_copy | 6 +++--- tests/tests/parallel_restore.sh | 4 ++-- tests/tests/restore_copy.sh | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/golden/parallel_restore b/tests/golden/parallel_restore index a9b9d42ed..288893574 100644 --- a/tests/golden/parallel_restore +++ b/tests/golden/parallel_restore @@ -12,13 +12,13 @@ extents: 1 extents: 1 0: offset: 0 length: 1 flags: O.L extents: 1 - Type Size Total Used Free Use% -MetaData 64KB 163840 34721 129119 21 - Data 4KB 15728640 64 15728576 0 +Type Used +MetaData 34721 +Data 64 == under ENOSPC - Type Size Total Used Free Use% -MetaData 64KB 163840 115361 48479 70 - Data 4KB 15728640 64 15728576 0 +Type Used +MetaData 117073 +Data 64 == ENOSPC == attempt to restore data device == attempt format_v1 restore diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy index a3a009cf3..718216c33 100644 --- a/tests/golden/restore_copy +++ b/tests/golden/restore_copy @@ -69,8 +69,8 @@ extents: 1 7 12,L,- 0,L,- 0,L,- I 33 - 7 11,L,- 0,L,- 0,L,- I 33 - 7 10,L,- 0,L,- 0,L,- I 33 - - Type Size Total Used Free Use% -MetaData 64KB 4194304 34698 4159606 0 - Data 4KB 67108864 64 67108800 0 +Type Used +MetaData 34698 +Data 64 == umount restored fs and check == cleanup diff --git a/tests/tests/parallel_restore.sh b/tests/tests/parallel_restore.sh index a43d2d2cb..69b594ec4 100644 --- a/tests/tests/parallel_restore.sh +++ 
b/tests/tests/parallel_restore.sh @@ -34,7 +34,7 @@ scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid' find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L -scoutfs df -p "$SCR" +scoutfs df -p "$SCR" | awk '{print $1, $4}' scoutfs quota-list -p "$SCR" umount "$SCR" @@ -45,7 +45,7 @@ scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" parallel_restore -m "$T_EX_META_DEV" -n 2000000 > /dev/null || t_fail "parallel_restore" scratch_check || t_fail "check failed" scratch_mount -scoutfs df -p "$SCR" +scoutfs df -p "$SCR" | awk '{print $1, $4}' umount "$SCR" scratch_check || t_fail "check after mount failed" diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh index d74955a10..694c95324 100644 --- a/tests/tests/restore_copy.sh +++ b/tests/tests/restore_copy.sh @@ -87,7 +87,7 @@ scoutfs quota-list -p "$T_M0" scoutfs quota-add -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-add failed" scoutfs quota-list -p "$T_M0" -scoutfs df -p "$SCR" +scoutfs df -p "$SCR" | awk '{print $1, $4}' echo "== umount restored fs and check" umount "$SCR" From c7599286022252e36cefc70d358db740d6a27157 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 25 Mar 2025 15:12:32 -0400 Subject: [PATCH 13/15] Restore project ID, retention et al in restore_copy We didn't migrate the extra data from inodes on folders before, which is a gap in testing. Make sure to test with a nested restored folder to test that inheritance isn't in the way. 
Signed-off-by: Auke Kok --- tests/golden/restore_copy | 5 +++++ tests/src/restore_copy.c | 7 ++++--- tests/tests/restore_copy.sh | 6 ++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy index 718216c33..e223c8f15 100644 --- a/tests/golden/restore_copy +++ b/tests/golden/restore_copy @@ -16,6 +16,8 @@ f /mnt/test/data/mode_t f /mnt/test/data/uidgid f /mnt/test/data/retention f /mnt/test/data/proj +f /mnt/test/data/proj_d/f +d /mnt/test/data/proj_d d /mnt/test/data Quota rule: 7 13,L,- 0,L,- 0,L,- I 33 - Quota rule: 7 11,L,- 0,L,- 0,L,- I 33 - @@ -31,6 +33,7 @@ crw-r--r--. 1 0 0 2, 2 u -rw-r--r--. 1 0 0 16777216 truncate srwxr-xr-x. 1 0 0 0 s -rw-r--r--. 1 0 0 0 retention +drwxr-xr-x. 2 0 0 1 proj_d -rw-r--r--. 1 0 0 0 proj prw-r--r--. 1 0 0 0 p -rwsrwsrwx. 1 0 0 0 mode_t @@ -57,6 +60,8 @@ extents: 1 7 12,L,- 0,L,- 0,L,- I 33 - 7 11,L,- 0,L,- 0,L,- I 33 - 7 10,L,- 0,L,- 0,L,- I 33 - +12345 +54321 == verify quota rules on restored fs 7 14,L,- 0,L,- 0,L,- I 33 - 7 13,L,- 0,L,- 0,L,- I 33 - diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c index f894bcfd0..5dd2a76df 100644 --- a/tests/src/restore_copy.c +++ b/tests/src/restore_copy.c @@ -358,10 +358,11 @@ static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 in inode->data_seq = 0; inode->crtime = st.st_ctim; - if (S_ISREG(inode->mode)) { - if (inode->size > 0) - inode->offline = true; + /* we don't restore data */ + if (S_ISREG(inode->mode) && (inode->size > 0)) + inode->offline = true; + if (S_ISREG(inode->mode) || S_ISDIR(inode->mode)) { if (is_scoutfs) { fd = open(path, O_RDONLY); error_exit(!fd, "open failure"ERRF, ERRA); diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh index 694c95324..7805d3ca3 100644 --- a/tests/tests/restore_copy.sh +++ b/tests/tests/restore_copy.sh @@ -57,6 +57,10 @@ scoutfs set-attr-x -t 1 "$T_M0/data/retention" # set project ID touch "$T_M0/data/proj" scoutfs 
set-attr-x -p 12345 "$T_M0/data/proj" +mkdir -p "$T_M0/data/proj_d" +touch "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 54321 "$T_M0/data/proj_d" # quotas for a in $(seq 10 15); do scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -" @@ -77,6 +81,8 @@ inspect() { scoutfs get-fiemap -L "falloc" scoutfs get-fiemap -L "truncate" scoutfs quota-list -p "." + scoutfs get-attr-x -p "proj_d/f" + scoutfs get-attr-x -p "proj_d" } ( cd "$SCR" ; inspect ) From 200b286b7bce865cdda775ed1d0b3a8ce2c31583 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 25 Mar 2025 15:44:29 -0400 Subject: [PATCH 14/15] Restore crtime. While doing this I noticed we attempt to restore data/meta_seq but that goes nowhere, it's just ignored. Signed-off-by: Auke Kok --- tests/golden/restore_copy | 2 ++ tests/src/restore_copy.c | 2 ++ tests/tests/restore_copy.sh | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy index e223c8f15..36e14321b 100644 --- a/tests/golden/restore_copy +++ b/tests/golden/restore_copy @@ -62,6 +62,8 @@ extents: 1 7 10,L,- 0,L,- 0,L,- I 33 - 12345 54321 +crtime 55555.666666666 +crtime 55556.666666666 == verify quota rules on restored fs 7 14,L,- 0,L,- 0,L,- I 33 - 7 13,L,- 0,L,- 0,L,- I 33 - diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c index 5dd2a76df..94fc702c1 100644 --- a/tests/src/restore_copy.c +++ b/tests/src/restore_copy.c @@ -370,8 +370,10 @@ static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 in ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm); error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode"); + /* these aren't restored! 
*/ inode->meta_seq = stm.meta_seq; inode->data_seq = stm.data_seq; + inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec}; /* project ID, retention bit */ diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh index 7805d3ca3..7517f05d2 100644 --- a/tests/tests/restore_copy.sh +++ b/tests/tests/restore_copy.sh @@ -65,6 +65,10 @@ scoutfs set-attr-x -p 54321 "$T_M0/data/proj_d" for a in $(seq 10 15); do scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -" done +# crtime +scoutfs set-attr-x -r 55555.666666666 "$T_M0/data/proj_d" +scoutfs set-attr-x -r 55556.666666666 "$T_M0/data/proj_d/f" +# data_seq, meta_seq, data_version is not restored. scratch_mkfs -V 2 > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" restore_copy -m $T_EX_META_DEV -s "$T_M0/data" | t_filter_fs @@ -83,6 +87,9 @@ inspect() { scoutfs quota-list -p "." scoutfs get-attr-x -p "proj_d/f" scoutfs get-attr-x -p "proj_d" + + scoutfs stat proj_d | grep crtime + scoutfs stat proj_d/f | grep crtime } ( cd "$SCR" ; inspect ) From 7b121d9860286e3372e9419405c5b6fd8700a01b Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Tue, 25 Feb 2025 16:27:07 -0500 Subject: [PATCH 15/15] small btree balancing While we are filling blocks the final block may not have enough items to properly fill that block. Here we add a check that stops filling the block if we have less than the minimum amount of items. 
Signed-off-by: Hunter Shaffer --- utils/src/parallel_restore.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c index ecd84ca42..4a1ab3fd6 100644 --- a/utils/src/parallel_restore.c +++ b/utils/src/parallel_restore.c @@ -1030,6 +1030,8 @@ static spr_err_t build_btree_block(struct scoutfs_parallel_restore_writer *wri, unsigned long val_align; unsigned long bytes; unsigned long nr; + unsigned long min_items; + long item_bytes_after_block; void *val_buf; spr_err_t err; u8 height; @@ -1083,10 +1085,16 @@ static spr_err_t build_btree_block(struct scoutfs_parallel_restore_writer *wri, for_each_bti_safe(&btb->items[level].root, bti, tmp) { val_align = round_up(bti->val_len, SCOUTFS_BTREE_VALUE_ALIGN); bytes = sizeof(struct scoutfs_btree_item) + val_align; + item_bytes_after_block = (le64_to_cpu(btb->total_items) * bytes) - le16_to_cpu(bt->mid_free_len); + min_items = (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4; if (le16_to_cpu(bt->mid_free_len) < bytes) break; + /* stop when there are not enough items to fill the next block */ + if (item_bytes_after_block > 0 && item_bytes_after_block < min_items) + break; + item->node.height = UNLINKED_AVL_HEIGHT; item->key = bti->key; item->seq = cpu_to_le64(1);