diff --git a/tests/Makefile b/tests/Makefile index 3a2380dc..72894eb2 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,7 +15,9 @@ BIN := src/createmany \ src/o_tmpfile_umask \ src/o_tmpfile_linkat \ src/mmap_stress \ - src/mmap_validate + src/mmap_validate \ + src/parallel_restore \ + src/restore_copy DEPS := $(wildcard src/*.d) @@ -27,8 +29,13 @@ endif src/mmap_stress: LIBS+=-lpthread +src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/parallel_restore: ../utils/src/scoutfs_parallel_restore.a +src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm +src/restore_copy : ../utils/src/scoutfs_parallel_restore.a + $(BIN): %: %.c Makefile - gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) + gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) $($(@)_cflags) .PHONY: clean clean: diff --git a/tests/golden/parallel_restore b/tests/golden/parallel_restore new file mode 100644 index 00000000..28889357 --- /dev/null +++ b/tests/golden/parallel_restore @@ -0,0 +1,26 @@ +== simple mkfs/restore/mount +committed_seq 1120 +total_meta_blocks 163840 +total_data_blocks 15728640 + 1440 1440 57120 + 80 80 400 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 1 flags: O.L +extents: 1 +Type Used +MetaData 34721 +Data 64 +== under ENOSPC +Type Used +MetaData 117073 +Data 64 +== ENOSPC +== attempt to restore data device +== attempt format_v1 restore +== test if previously mounted +== cleanup diff --git a/tests/golden/restore_copy b/tests/golden/restore_copy new file mode 100644 index 00000000..36e14321 --- /dev/null +++ b/tests/golden/restore_copy @@ -0,0 +1,83 @@ +== restore_copy content verification +d /mnt/test/data/d +f /mnt/test/data/f +l /mnt/test/data/l -> broken +f /mnt/test/data/h +l /mnt/test/data/F -> f +b /mnt/test/data/b +c /mnt/test/data/c +c /mnt/test/data/u +p /mnt/test/data/p +f /mnt/test/data/f4096 +f 
/mnt/test/data/falloc +f /mnt/test/data/truncate +s /mnt/test/data/s +f /mnt/test/data/mode_t +f /mnt/test/data/uidgid +f /mnt/test/data/retention +f /mnt/test/data/proj +f /mnt/test/data/proj_d/f +d /mnt/test/data/proj_d +d /mnt/test/data +Quota rule: 7 13,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 11,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 12,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 10,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 15,L,- 0,L,- 0,L,- I 33 - +Quota rule: 7 14,L,- 0,L,- 0,L,- I 33 - +Wrote 1 directories, 0 files, 458752 bytes total +== verify metadata bits on restored fs +total 16516 +-rw-r--r--. 1 33333 33333 0 uidgid +crw-r--r--. 1 0 0 2, 2 u +-rw-r--r--. 1 0 0 16777216 truncate +srwxr-xr-x. 1 0 0 0 s +-rw-r--r--. 1 0 0 0 retention +drwxr-xr-x. 2 0 0 1 proj_d +-rw-r--r--. 1 0 0 0 proj +prw-r--r--. 1 0 0 0 p +-rwsrwsrwx. 1 0 0 0 mode_t +lrwxrwxrwx. 1 0 0 7 l -> broken +-rw-r--r--. 2 0 0 0 h +-rw-r--r--. 1 0 0 131072 falloc +-rw-r--r--. 1 0 0 4096 f4096 +-rw-r--r--. 2 0 0 0 f +drwxr-xr-x. 2 0 0 0 d +crw-r--r--. 1 0 0 0, 0 c +brw-r--r--. 1 0 0 1, 1 b +lrwxrwxrwx. 
1 0 0 2 F -> f +1 +12345 +0: offset: 0 length: 1 flags: O.L +extents: 1 +0: offset: 0 length: 32 flags: O.L +extents: 1 +0: offset: 0 length: 4096 flags: O.L +extents: 1 + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - +12345 +54321 +crtime 55555.666666666 +crtime 55556.666666666 +== verify quota rules on restored fs + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - + 7 15,L,- 0,L,- 0,L,- I 33 - + 7 14,L,- 0,L,- 0,L,- I 33 - + 7 13,L,- 0,L,- 0,L,- I 33 - + 7 12,L,- 0,L,- 0,L,- I 33 - + 7 11,L,- 0,L,- 0,L,- I 33 - + 7 10,L,- 0,L,- 0,L,- I 33 - +Type Used +MetaData 34698 +Data 64 +== umount restored fs and check +== cleanup diff --git a/tests/sequence b/tests/sequence index 18eff7cf..35a194d8 100644 --- a/tests/sequence +++ b/tests/sequence @@ -57,4 +57,6 @@ archive-light-cycle.sh block-stale-reads.sh inode-deletion.sh renameat2-noreplace.sh +parallel_restore.sh +restore_copy.sh xfstests.sh diff --git a/tests/src/parallel_restore.c b/tests/src/parallel_restore.c new file mode 100644 index 00000000..b6c82657 --- /dev/null +++ b/tests/src/parallel_restore.c @@ -0,0 +1,805 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../utils/src/parallel_restore.h" + +/* + * XXX: + * - add a nice description of what's going on + * - mention allocator contention + * - test child process dying handling + * - root dir entry name length is wrong + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define 
error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define dprintf(fmt, args...) \ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) + +struct opts { + unsigned long long buf_size; + + unsigned long long write_batch; + unsigned long long low_dirs; + unsigned long long high_dirs; + unsigned long long low_files; + unsigned long long high_files; + char *meta_path; + unsigned long long total_files; + bool read_only; + unsigned long long seed; + unsigned long long nr_writers; +}; + +static void usage(void) +{ + printf("usage:\n" + " -b NR | threads write blocks in batches files (100000)\n" + " -d LOW:HIGH | range of subdirs per directory (5:10)\n" + " -f LOW:HIGH | range of files per directory (10:20)\n" + " -m PATH | path to metadata device\n" + " -n NR | total number of files to create (100)\n" + " -r | read-only, all work except writing, measure cpu cost\n" + " -s NR | randomization seed (random)\n" + " -w NR | number of writing processes to fork (online cpus)\n" + ); +} + +static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + if (!opts->read_only) + ret = pwrite(dev_fd, buf, count, off); + else + ret = count; + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct gen_inode { + struct scoutfs_parallel_restore_inode inode; + struct scoutfs_parallel_restore_xattr **xattrs; + u64 nr_xattrs; + struct scoutfs_parallel_restore_entry **entries; + u64 nr_files; + u64 nr_entries; +}; + +static void free_gino(struct gen_inode *gino) +{ + u64 i; + + if 
(gino) { + if (gino->entries) { + for (i = 0; i < gino->nr_entries; i++) + free(gino->entries[i]); + free(gino->entries); + } + if (gino->xattrs) { + for (i = 0; i < gino->nr_xattrs; i++) + free(gino->xattrs[i]); + free(gino->xattrs); + } + free(gino); + } +} + +static struct scoutfs_parallel_restore_xattr * +generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value, + int value_len) +{ + struct scoutfs_parallel_restore_xattr *xattr; + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos, + .name_len = name_len, + .value_len = value_len, + }; + + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + name_len); + + memcpy(xattr->name, name, name_len); + if (value_len) + memcpy(xattr->value, value, value_len); + + return xattr; +} + +static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode) +{ + struct gen_inode *gino; + struct timespec now; + + clock_gettime(CLOCK_REALTIME, &now); + + gino = calloc(1, sizeof(struct gen_inode)); + error_exit(!gino, "failure allocating generated inode"); + + gino->inode = (struct scoutfs_parallel_restore_inode) { + .ino = ino, + .meta_seq = ino, + .data_seq = 0, + .mode = mode, + .atime = now, + .ctime = now, + .mtime = now, + .crtime = now, + }; + + /* + * hacky creation of a bunch of xattrs for now. 
+ */ + if ((mode & S_IFMT) == S_IFREG) { + #define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, } + struct name_val { + char *name; + int len; + char *value; + int value_len; + } nv[] = { + NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"), + NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""), + NV("scoutfs.hide.sam_reqcopies", ""), + NV("scoutfs.hide.sam_copy_2", ""), + NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"), + NV("scoutfs.hide.sam_copy_1", ""), + NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""), + NV("scoutfs.hide.srch.sam_release_cand", ""), + NV("scoutfs.hide.sam_restime", ""), + NV("scoutfs.hide.sam_uuid", ""), + NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"), + NV("scoutfs.hide.srch.sam_vol_F01030L6", ""), + NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""), + NV("scoutfs.hide.srch.sam_vol_E01001L6", ""), + NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"), + NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"), + }; + unsigned int nr = array_size(nv); + int i; + + gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *)); + + for (i = 0; i < nr; i++) + gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len, + nv[i].value, nv[i].value_len); + + gino->nr_xattrs = nr; + gino->inode.nr_xattrs = nr; + + gino->inode.size = 4096; + gino->inode.offline = true; + } + + return gino; +} + +static struct scoutfs_parallel_restore_entry * +generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode) +{ + struct scoutfs_parallel_restore_entry *entry; + char buf[PATH_MAX]; + int bytes; + + bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes); + error_exit(!entry, "error allocating generated entry"); + + *entry = 
(struct scoutfs_parallel_restore_entry) { + .dir_ino = dir_ino, + .pos = pos, + .ino = ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = bytes, + }; + + memcpy(entry->name, buf, bytes); + + return entry; +} + +static u64 random64(void) +{ + return ((u64)lrand48() << 32) | lrand48(); +} + +static u64 random_range(u64 low, u64 high) +{ + return low + (random64() % (high - low + 1)); +} + +static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len, + bool no_dirs) +{ + struct scoutfs_parallel_restore_entry *entry; + struct gen_inode *gino; + u64 nr_entries; + u64 nr_files; + u64 nr_dirs; + u64 ino; + char *prefix; + mode_t mode; + u64 i; + + nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs); + nr_files = random_range(opts->low_files, opts->high_files); + + if (1 + nr_dirs + nr_files > ino_len) { + nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2; + nr_files = (ino_len - 1) - nr_dirs; + } + + nr_entries = nr_dirs + nr_files; + + gino = generate_inode(opts, dir_ino, DIR_MODE); + error_exit(!gino, "error allocating generated inode"); + + gino->inode.nr_subdirs = nr_dirs; + gino->nr_files = nr_files; + + if (nr_entries) { + gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *)); + error_exit(!gino->entries, "error allocating generated inode entries"); + + gino->nr_entries = nr_entries; + } + + mode = DIR_MODE; + prefix = "dir"; + for (i = 0; i < nr_entries; i++) { + if (i == nr_dirs) { + mode = REG_MODE; + prefix = "file"; + } + + ino = ino_start + i; + entry = generate_entry(opts, prefix, ino, gino->inode.ino, + SCOUTFS_DIRENT_FIRST_POS + i, ino, mode); + + gino->entries[i] = entry; + gino->inode.total_entry_name_bytes += entry->name_len; + } + + return gino; +} + +/* + * Restore a generated inode. If it's a directory then we also restore + * all its entries. The caller is going to descend into subdir entries and generate + * those dir inodes. 
We have to generate and restore all non-dir inodes referenced + * by this inode's entries. + */ +static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + struct gen_inode *gino) +{ + struct gen_inode *nondir; + int ret; + u64 i; + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + + for (i = 0; i < gino->nr_entries; i++) { + ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]); + error_exit(ret, "thread add entry %d", ret); + + /* caller only needs subdir entries, generate and free others */ + if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) { + + nondir = generate_inode(opts, gino->entries[i]->ino, + gino->entries[i]->mode); + restore_inode(opts, wri, nondir); + free_gino(nondir); + + free(gino->entries[i]); + if (i != gino->nr_entries - 1) + gino->entries[i] = gino->entries[gino->nr_entries - 1]; + gino->nr_entries--; + gino->nr_files--; + i--; + } + } + + for (i = 0; i < gino->nr_xattrs; i++) { + ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]); + error_exit(ret, "thread add xattr %d", ret); + } +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; + u64 writer_nr; + u64 dir_height; + u64 ino_start; + u64 ino_len; +}; + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 bytes_written; +}; + +static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t buf_size, int dev_fd, + struct write_result *res, bool get_slice, int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(opts, wri, buf, buf_size, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = 
scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Calculate the number of bytes in toplevel "dir-%llu" entry names for the given + * number of writers. + */ +static u64 topdir_entry_bytes(u64 nr_writers) +{ + u64 bytes = (3 + 1) * nr_writers; + u64 limit; + u64 done; + u64 wid; + u64 nr; + + for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) { + nr = min(limit - done, nr_writers - done); + bytes += nr * wid; + } + + return bytes; +} + +struct dir_pos { + struct gen_inode *gino; + u64 pos; +}; + +static void writer_proc(struct opts *opts, struct writer_args *args) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_entry *entry; + struct dir_pos *dirs = NULL; + struct write_result res; + struct gen_inode *gino; + void *buf = NULL; + u64 level; + u64 ino; + int ret; + + memset(&res, 0, sizeof(res)); + + dirs = calloc(args->dir_height, sizeof(struct dir_pos)); + error_exit(errno, "error allocating parent dirs "ERRF, ERRA); + + errno = posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block buf "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_add_slice(wri, &args->slice); + error_exit(ret, "add slice %d", ret); + + /* writer 0 creates the root dir */ + if (args->writer_nr == 0) { + gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE); + gino->inode.nr_subdirs = opts->nr_writers; + gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers); + + ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode); + error_exit(ret, "thread add root inode %d", ret); + free_gino(gino); + } + + /* create root 
entry for our top level dir */ + ino = args->ino_start++; + args->ino_len--; + + entry = generate_entry(opts, "top", args->writer_nr, + SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr, + ino, DIR_MODE); + + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "thread top entry %d", ret); + free(entry); + + level = args->dir_height - 1; + + while (args->ino_len > 0 && level < args->dir_height) { + gino = dirs[level].gino; + + /* generate and restore if we follow entries */ + if (!gino) { + gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0); + args->ino_start += gino->nr_entries; + args->ino_len -= gino->nr_entries; + le64_add_cpu(&res.files_created, gino->nr_files); + + restore_inode(opts, wri, gino); + dirs[level].gino = gino; + } + + if (dirs[level].pos == gino->nr_entries) { + /* ascend if we're done with this dir */ + dirs[level].gino = NULL; + dirs[level].pos = 0; + free_gino(gino); + level++; + + } else { + /* otherwise descend into subdir entry */ + ino = gino->entries[dirs[level].pos]->ino; + dirs[level].pos++; + level--; + } + + /* do a partial write at batch intervals when there's still more to do */ + if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0) + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, false, args->pair_fd); + } + + write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd, + &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&wri); + + free(dirs); + free(buf); +} + +/* + * If any of our children exited with an error code, we hard exit. + * The child processes should themselves report out any errors + * encountered. Any remaining children will receive SIGHUP and + * terminate. 
+ */ +static void sigchld_handler(int signo, siginfo_t *info, void *context) +{ + if (info->si_status) + exit(EXIT_FAILURE); +} + +static void fork_writer(struct opts *opts, struct writer_args *args) +{ + pid_t parent = getpid(); + pid_t pid; + int ret; + + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid != 0) + return; + + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + printf("pid %u getpid() %u parent %u getppid() %u\n", + pid, getpid(), parent, getppid()); + error_exit(getppid() != parent, "child parent already changed"); + + writer_proc(opts, args); + exit(0); +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct write_result res; + struct writer_args *args; + struct timespec begin; + struct timespec end; + LIST_HEAD(writers); + u64 next_ino; + u64 ino_per; + u64 avg_dirs; + u64 avg_files; + u64 dir_height; + u64 tot_files; + u64 tot_bytes; + int pair[2] = {-1, -1}; + float secs; + void *buf = NULL; + int dev_fd = -1; + int ret; + int i; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? 
O_RDONLY : (O_RDWR|O_EXCL))); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, opts->buf_size); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1; + ino_per = opts->total_files / opts->nr_writers; + avg_dirs = (opts->low_dirs + opts->high_dirs) / 2; + avg_files = (opts->low_files + opts->high_files) / 2; + + dir_height = 1; + tot_files = avg_files * opts->nr_writers; + + while (tot_files < opts->total_files) { + dir_height++; + tot_files *= avg_dirs; + } + + dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files); + + clock_gettime(CLOCK_MONOTONIC_RAW, &begin); + + /* start each writing process */ + for (i = 0; i < opts->nr_writers; i++) { + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->pair_fd = pair[1]; + args->slice = slices[1 + i]; + args->writer_nr = i; + args->dir_height = dir_height; + args->ino_start = next_ino; + args->ino_len = ino_per; + + list_add_tail(&args->head, &writers); + next_ino += ino_per; + + fork_writer(opts, args); + } 
+ + /* read results and watch for writers to finish */ + tot_files = 0; + tot_bytes = 0; + i = 0; + while (i < opts->nr_writers) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + i++; + } + + tot_files += le64_to_cpu(res.files_created); + tot_bytes += le64_to_cpu(res.bytes_written); + } + + tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd); + + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + if (!opts->read_only) { + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + } + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + + scoutfs_parallel_restore_destroy_writer(&wri); + + secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) - + ((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC)); + printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n", + tot_files, tot_bytes, secs, + (float)tot_bytes / tot_files, (float)tot_files / secs); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] >= 0) + close(pair[0]); + if (pair[1] >= 0) + close(pair[1]); + free(super); + free(slices); + free(buf); + + return 0; +} + +static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret) +{ + char *sep; + int ret = 0; + + sep = index(str, ':'); + if (sep) { + *sep = '\0'; + ret = parse_u64(sep + 1, high_ret); + } + + if (ret == 0) + ret = parse_u64(str, low_ret); + + if (sep) + *sep = ':'; + + return ret; +} + +int main(int argc, char **argv) +{ + struct opts opts = { + .buf_size = (32 * 1024 * 1024), + + 
.write_batch = 1000000, + .low_dirs = 5, + .high_dirs = 10, + .low_files = 10, + .high_files = 20, + .total_files = 100, + }; + struct sigaction act = { 0 }; + int ret; + int c; + + opts.seed = random64(); + opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN); + + while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) { + switch(c) { + case 'b': + ret = parse_u64(optarg, &opts.write_batch); + error_exit(ret, "error parsing -b '%s'\n", optarg); + error_exit(opts.write_batch == 0, "-b can't be 0"); + break; + case 'd': + ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs); + error_exit(ret, "error parsing -d '%s'\n", optarg); + break; + case 'f': + ret = parse_low_high(optarg, &opts.low_files, &opts.high_files); + error_exit(ret, "error parsing -f '%s'\n", optarg); + break; + case 'm': + opts.meta_path = strdup(optarg); + break; + case 'n': + ret = parse_u64(optarg, &opts.total_files); + error_exit(ret, "error parsing -n '%s'\n", optarg); + break; + case 'r': + opts.read_only = true; + break; + case 's': + ret = parse_u64(optarg, &opts.seed); + error_exit(ret, "error parsing -s '%s'\n", optarg); + break; + case 'w': + ret = parse_u64(optarg, &opts.nr_writers); + error_exit(ret, "error parsing -w '%s'\n", optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu", + opts.low_dirs, opts.high_dirs); + error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu", + opts.low_files, opts.high_files); + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + + printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n", + opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files, + opts.total_files, opts.seed, opts.nr_writers); + + act.sa_flags = SA_SIGINFO | SA_RESTART; + act.sa_sigaction = &sigchld_handler; + if (sigaction(SIGCHLD, &act, NULL) == -1) + error_exit(ret, "error setting up signal 
handler\n"); + + ret = do_restore(&opts); + + free(opts.meta_path); + + return ret == 0 ? 0 : 1; +} diff --git a/tests/src/restore_copy.c b/tests/src/restore_copy.c new file mode 100644 index 00000000..94fc702c --- /dev/null +++ b/tests/src/restore_copy.c @@ -0,0 +1,963 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../utils/src/sparse.h" +#include "../../utils/src/util.h" +#include "../../utils/src/list.h" +#include "../../utils/src/parse.h" +#include "../../kmod/src/format.h" +#include "../../kmod/src/ioctl.h" +#include "../../utils/src/parallel_restore.h" + +/* + * XXX: + */ + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +#define error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define REG_MODE (S_IFREG | 0644) +#define DIR_MODE (S_IFDIR | 0755) +#define LNK_MODE (S_IFLNK | 0777) + +/* + * At about 1k files we seem to be writing about 1MB of data, so + * set buffer sizes adequately above that. + */ +#define BATCH_FILES 1024 +#define BUF_SIZ 2 * 1024 * 1024 + +/* + * We can't make duplicate inodes for hardlinked files, so we + * will need to track these as we generate them. Not too costly + * to do, since it's just an integer, and sorting shouldn't matter + * until we get into the millions of entries, hopefully. 
+ */ +static struct list_head hardlinks; +struct hardlink_head { + struct list_head head; + u64 ino; +}; + +struct opts { + char *meta_path; + char *source_dir; +}; + +static bool warn_scoutfs = false; + +static void usage(void) +{ + printf("usage:\n" + " -m PATH | path to metadata device\n" + " -s PATH | path to source directory\n" + ); +} + +static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd) +{ + size_t total = 0; + size_t count; + off_t off; + int ret; + + do { + ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &off, &count); + error_exit(ret, "write buf %d", ret); + + if (count > 0) { + ret = pwrite(dev_fd, buf, count, off); + error_exit(ret != count, "pwrite count %zu ret %d", count, ret); + total += ret; + } + } while (count > 0); + + return total; +} + +struct write_result { + struct scoutfs_parallel_restore_progress prog; + struct scoutfs_parallel_restore_slice slice; + __le64 files_created; + __le64 dirs_created; + __le64 bytes_written; + bool complete; +}; + +static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri, + void *buf, int dev_fd, + struct write_result *res, bool get_slice, int pair_fd) +{ + size_t total; + int ret; + + total = write_bufs(wri, buf, dev_fd); + le64_add_cpu(&res->bytes_written, total); + + ret = scoutfs_parallel_restore_get_progress(wri, &res->prog); + error_exit(ret, "get prog %d", ret); + + if (get_slice) { + ret = scoutfs_parallel_restore_get_slice(wri, &res->slice); + error_exit(ret, "thread get slice %d", ret); + } + + ret = write(pair_fd, res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result send error"); + + memset(res, 0, sizeof(struct write_result)); +} + +/* + * Adding xattrs is supported for files and directories only. + * + * If the filesystem on which the path resides isn't scoutfs, we omit the + * scoutfs specific ioctl to fetch hidden xattrs. 
+ * + * Untested if the hidden xattr ioctl works on directories or symlinks. + */ +static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs) +{ + struct scoutfs_ioctl_listxattr_hidden lxh; + struct scoutfs_parallel_restore_xattr *xattr; + char *buf = NULL; + char *name = NULL; + int fd = -1; + int bytes; + int len; + int value_len; + int ret; + int pos = 0; + + if (!is_scoutfs) + goto normal_xattrs; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + memset(&lxh, 0, sizeof(lxh)); + lxh.id_pos = 0; + lxh.hash_pos = 0; + lxh.buf_bytes = 256 * 1024; + + buf = malloc(lxh.buf_bytes); + error_exit(!buf, "alloc xattr_hidden buf"); + lxh.buf_ptr = (unsigned long)buf; + + /* hidden */ + for (;;) { + ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh); + if (ret == 0) /* done */ + break; + error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA); + bytes = ret; + error_exit(bytes > lxh.buf_bytes, "listxattr_hidden overflow"); + error_exit(buf[bytes - 1] != '\0', "listxattr_hidden didn't term"); + + name = buf; + + do { + len = strlen(name); + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + /* get value len */ + value_len = fgetxattr(fd, name, NULL, 0); + error_exit(value_len < 0, "malloc value hidden"ERRF, ERRA); + + /* allocate everything at once */ + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + /* get value into xattr directly */ + ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len); + error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = 
scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add hidden xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + } + + free(buf); + close(fd); + +normal_xattrs: + value_len = listxattr(path, NULL, 0); + error_exit(value_len < 0, "hidden listxattr "ERRF, ERRA); + if (value_len == 0) + return; + + buf = calloc(1, value_len); + error_exit(!buf, "malloc value"ERRF, ERRA); + + ret = listxattr(path, buf, value_len); + error_exit(ret < 0, "hidden listxattr %d", ret); + + name = buf; + bytes = ret; + do { + len = strlen(name); + + error_exit(len == 0, "listxattr_hidden empty name"); + error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name"); + + value_len = getxattr(path, name, NULL, 0); + error_exit(value_len < 0, "value "ERRF, ERRA); + + xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len); + error_exit(!xattr, "error allocating generated xattr"); + + *xattr = (struct scoutfs_parallel_restore_xattr) { + .ino = ino, + .pos = pos++, + .name_len = len, + .value_len = value_len, + }; + xattr->name = (void *)(xattr + 1); + xattr->value = (void *)(xattr->name + len); + + ret = getxattr(path, name, (void *)(xattr->name + len), value_len); + error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA); + + memcpy(xattr->name, name, len); + + ret = scoutfs_parallel_restore_add_xattr(wri, xattr); + error_exit(ret, "add xattr %d", ret); + + free(xattr); + + name += len + 1; + bytes -= len + 1; + } while (bytes > 0); + + free(buf); +} + +/* + * We can't store the same inode multiple times, so we need to make + * sure to account for hardlinks. 
Maintain a LL that stores the first + * hardlink inode we encounter, and every subsequent hardlink to this + * inode will omit inserting an inode, and just adds another entry + */ +static bool is_new_inode_item(bool nlink, u64 ino) +{ + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + + if (!nlink) + return true; + + /* lineair search, pretty awful, should be a binary tree */ + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + if (hh->ino == ino) + return false; + } + + /* insert item */ + hh = malloc(sizeof(struct hardlink_head)); + error_exit(!hh, "malloc"); + hh->ino = ino; + list_add_tail(&hh->head, &hardlinks); + + /* + * XXX + * + * We can be confident that if we don't traverse filesystems + * that once we've created N entries of an N-linked inode, that + * it can be removed from the LL. This would significantly + * improve the manageability of the list. + * + * All we'd need to do is add a counter and compare it to the nr_links + * field of the inode. + */ + + return true; +} + +/* + * create the inode data for a given path as best as possible + * duplicating the exact data from the source path + */ +static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs) +{ + struct scoutfs_parallel_restore_inode *inode = NULL; + struct scoutfs_ioctl_stat_more stm; + struct scoutfs_ioctl_inode_attr_x iax; + struct stat st; + int ret; + int fd; + + inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode)); + error_exit(!inode, "failure allocating inode"); + + ret = lstat(path, &st); + error_exit(ret, "failure stat inode"); + + /* use exact inode numbers from path, except for root ino */ + if (ino != SCOUTFS_ROOT_INO) + inode->ino = st.st_ino; + else + inode->ino = SCOUTFS_ROOT_INO; + + inode->mode = st.st_mode; + inode->uid = st.st_uid; + inode->gid = st.st_gid; + inode->atime = st.st_atim; + inode->ctime = st.st_ctim; + inode->mtime = st.st_mtim; + inode->size = st.st_size; + inode->nlink = 
st.st_nlink; + + inode->rdev = st.st_rdev; + + /* scoutfs specific */ + inode->meta_seq = 0; + inode->data_seq = 0; + inode->crtime = st.st_ctim; + + /* we don't restore data */ + if (S_ISREG(inode->mode) && (inode->size > 0)) + inode->offline = true; + + if (S_ISREG(inode->mode) || S_ISDIR(inode->mode)) { + if (is_scoutfs) { + fd = open(path, O_RDONLY); + error_exit(!fd, "open failure"ERRF, ERRA); + + ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm); + error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode"); + + /* these aren't restored! */ + inode->meta_seq = stm.meta_seq; + inode->data_seq = stm.data_seq; + + inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec}; + + /* project ID, retention bit */ + memset(&iax, 0, sizeof(iax)); + + iax.x_flags = 0; + iax.x_mask = SCOUTFS_IOC_IAX_PROJECT_ID | SCOUTFS_IOC_IAX__BITS; + iax.bits = SCOUTFS_IOC_IAX_B_RETENTION; + + ret = ioctl(fd, SCOUTFS_IOC_GET_ATTR_X, &iax); + error_exit(ret, "failure SCOUTFS_IOC_GET_ATTR_X inode"); + + inode->proj = iax.project_id; + inode->flags |= (iax.bits & SCOUTFS_IOC_IAX_B_RETENTION) ? 
SCOUTFS_INO_FLAG_RETENTION : 0; + + close(fd); + } + + } + + /* pass whether item is hardlinked or not */ + *nlink = (st.st_nlink > 1); + + return inode; +} + +typedef int (*quota_ioctl_in)(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *gqr, + size_t nr, int fd); + +static int get_quota_ioctl(struct scoutfs_ioctl_quota_rule *irules, + struct scoutfs_ioctl_get_quota_rules *rules_in, + size_t nr, int fd) +{ + struct scoutfs_ioctl_get_quota_rules *gqr = rules_in; + int ret; + + gqr->rules_ptr = (intptr_t)irules; + gqr->rules_nr = nr; + + ret = ioctl(fd, SCOUTFS_IOC_GET_QUOTA_RULES, gqr); + error_exit(ret < 0, "quota ioctl error"); + + return ret; +} + +static char opc[] = { + [SQ_OP_DATA] = 'D', + [SQ_OP_INODE] = 'I', +}; + +static char nsc[] = { + [SQ_NS_LITERAL] = 'L', + [SQ_NS_PROJ] = 'P', + [SQ_NS_UID] = 'U', + [SQ_NS_GID] = 'G', +}; + +static int insert_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_ioctl_quota_rule *irule) +{ + struct scoutfs_parallel_restore_quota_rule *prule = NULL; + int ret; + int i; + + prule = calloc(1, sizeof(struct scoutfs_parallel_restore_quota_rule)); + error_exit(!prule, "quota rule alloc failed"); + prule->limit = irule->limit; + prule->prio = irule->prio; + prule->op = irule->op; + prule->rule_flags = irule->rule_flags; + prule->names[0].val = irule->name_val[0]; + prule->names[0].source = irule->name_source[0]; + prule->names[0].flags = irule->name_flags[0]; + prule->names[1].val = irule->name_val[1]; + prule->names[1].source = irule->name_source[1]; + prule->names[1].flags = irule->name_flags[1]; + prule->names[2].val = irule->name_val[2]; + prule->names[2].source = irule->name_source[2]; + prule->names[2].flags = irule->name_flags[2]; + + /* print out the rule */ + printf("Quota rule: %3u ", irule->prio); + for (i = 0; i < array_size(irule->name_val); i++) { + printf("%llu,%c,%c ", + irule->name_val[i], + nsc[irule->name_source[i]], + (irule->name_flags[i] & 
SQ_NF_SELECT) ? 'S' : '-'); + } + printf("%c %llu %c\n", + opc[irule->op], irule->limit, (irule->rule_flags & SQ_RF_TOTL_COUNT) ? 'C' : '-'); + + ret = scoutfs_parallel_restore_add_quota_rule(wri, prule); + error_exit(ret, "quota add rule %d", ret); + free(prule); + return ret; +} + +static int restore_quotas(struct scoutfs_parallel_restore_writer *wri, + quota_ioctl_in quota_in, char *path) +{ + struct scoutfs_ioctl_get_quota_rules gqr = {{0,}}; + struct scoutfs_ioctl_quota_rule *irules = NULL; + size_t rule_alloc = 0; + size_t rule_nr = 0; + size_t rule_count; + size_t i; + int fd = -1; + int ret; + + fd = open(path, O_RDONLY); + error_exit(fd < 0, "open"ERRF, ERRA); + + for (;;) { + if (rule_nr == rule_alloc) { + rule_alloc += 1024; + irules = realloc(irules, rule_alloc * sizeof(irules[0])); + error_exit(!irules, "irule realloc failed rule_nr:%zu alloced:%zu", rule_nr, rule_alloc); + if (!irules) { + ret = -errno; + fprintf(stderr, "memory allocation failed: %s (%d)\n", + strerror(errno), errno); + goto out; + } + } + + ret = quota_in(&irules[rule_nr], &gqr, rule_alloc - rule_nr, fd); + if (ret == 0) + break; + if (ret < 0) + goto out; + + rule_count = ret; + + for (i = 0; i < rule_count; i++) { + ret = insert_quota_rule(wri, &irules[i]); + if (ret < 0) + goto out; + } + } + + ret = 0; +out: + if (fd >= 0) + close(fd); + if (irules) + free(irules); + return ret; +} + +struct writer_args { + struct list_head head; + + int dev_fd; + int pair_fd; + + struct scoutfs_parallel_restore_slice slice; +}; + +static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino) +{ + struct scoutfs_parallel_restore_inode *inode; + struct scoutfs_parallel_restore_entry *entry; + DIR *dirp = NULL; + char *subdir = NULL; + char link[PATH_MAX + 1]; + struct dirent *ent; + struct statfs stf; + int ret = 0; + int subdir_count = 0, file_count = 0; + size_t ent_len = 0; + size_t pos = 0; + bool 
nlink = false; + char ind = '?'; + u64 mode; + bool is_scoutfs = false; + + /* get fs info once per path */ + ret = statfs(path, &stf); + error_exit(ret != 0, "statfs"ERRF, ERRA); + is_scoutfs = (stf.f_type == 0x554f4353); + + if (!is_scoutfs && !warn_scoutfs) { + warn_scoutfs = true; + fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n"); + } + + + /* traverse the entire tree */ + dirp = opendir(path); + errno = 0; + while ((ent = readdir(dirp))) { + if (ent->d_type == DT_DIR) { + if ((strcmp(ent->d_name, ".") == 0) || + (strcmp(ent->d_name, "..") == 0)) { + /* position still matters */ + pos++; + continue; + } + + /* recurse into subdir */ + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + restore_path(wri, args, res, buf, subdir, ent->d_ino); + + subdir_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = DIR_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + + free(subdir); + + le64_add_cpu(&res->dirs_created, 1); + } else if (ent->d_type == DT_REG) { + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = REG_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + 
memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "f %s/%s\n", path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + + /* xattrs */ + add_xattrs(wri, subdir, ent->d_ino, is_scoutfs); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } else if (ent->d_type == DT_LNK) { + /* readlink */ + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + ent_len += strlen(ent->d_name); + + ret = readlink(subdir, link, PATH_MAX); + error_exit(ret < 0, "readlink %d", ret); + /* must 0-terminate if we want to print it */ + link[ret] = 0; + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = LNK_MODE, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add symlink entry %d", ret); + + /* link inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + + fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link); + + inode->mode = LNK_MODE; + inode->target = link; + inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */ + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add syml inode %d", ret); + + free(inode); + free(subdir); 
+ + le64_add_cpu(&res->files_created, 1); + } else { + /* odd stuff */ + switch(ent->d_type) { + case DT_CHR: + ind = 'c'; + mode = S_IFCHR; + break; + case DT_BLK: + ind = 'b'; + mode = S_IFBLK; + break; + case DT_FIFO: + ind = 'p'; + mode = S_IFIFO; + break; + case DT_SOCK: + ind = 's'; + mode = S_IFSOCK; + break; + default: + error_exit(true, "Unknown readdir entry type"); + ;; + } + + file_count++; + + ent_len += strlen(ent->d_name); + + entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name)); + error_exit(!entry, "error allocating generated entry"); + + *entry = (struct scoutfs_parallel_restore_entry) { + .dir_ino = ino, + .pos = pos++, + .ino = ent->d_ino, + .mode = mode, + .name = (void *)(entry + 1), + .name_len = strlen(ent->d_name), + }; + + memcpy(entry->name, ent->d_name, strlen(ent->d_name)); + ret = scoutfs_parallel_restore_add_entry(wri, entry); + error_exit(ret, "add entry %d", ret); + + free(entry); + + ret = asprintf(&subdir, "%s/%s", path, ent->d_name); + error_exit(ret == -1, "asprintf subdir"ERRF, ERRA); + + /* file inode */ + inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs); + fprintf(stdout, "%c %s/%s\n", ind, path, ent->d_name); + if (is_new_inode_item(nlink, ent->d_ino)) { + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add reg file inode %d", ret); + } + free(inode); + + free(subdir); + + le64_add_cpu(&res->files_created, 1); + } + + /* batch out changes, will be about 1M */ + if (le64_to_cpu(res->files_created) > BATCH_FILES) { + write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd); + } + + } + if (ent != NULL) + error_exit(errno, "readdir"ERRF, ERRA); + closedir(dirp); + + /* create the dir itself */ + inode = read_inode_data(path, ino, &nlink, is_scoutfs); + inode->nr_subdirs = subdir_count; + inode->total_entry_name_bytes = ent_len; + fprintf(stdout, "d %s\n", path); + + ret = scoutfs_parallel_restore_add_inode(wri, inode); + error_exit(ret, "add 
dir inode %d", ret); + + free(inode); + + /* No need to send, we'll send final after last directory is complete */ +} + +static int do_restore(struct opts *opts) +{ + struct scoutfs_parallel_restore_writer *pwri, *wri = NULL; + struct scoutfs_parallel_restore_slice *slices = NULL; + struct scoutfs_super_block *super = NULL; + struct writer_args *args; + struct write_result res; + int pair[2] = {-1, -1}; + LIST_HEAD(writers); + void *buf = NULL; + void *bufp = NULL; + int dev_fd = -1; + pid_t pid; + int ret; + u64 tot_bytes; + u64 tot_dirs; + u64 tot_files; + + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair); + error_exit(ret, "socketpair error "ERRF, ERRA); + + dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL)); + error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA); + + errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?: + posix_memalign((void **)&buf, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufs "ERRF, ERRA); + + ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret); + + error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev"); + + ret = scoutfs_parallel_restore_create_writer(&wri); + error_exit(ret, "create writer %d", ret); + + ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd); + error_exit(ret, "import super %d", ret); + + slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice)); + error_exit(!slices, "alloc slices"); + + scoutfs_parallel_restore_init_slices(wri, slices, 2); + + ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]); + error_exit(ret, "add slices[0] %d", ret); + + args = calloc(1, sizeof(struct writer_args)); + error_exit(!args, "alloc writer args"); + + args->dev_fd = dev_fd; + args->slice = slices[1]; + args->pair_fd = pair[1]; + list_add_tail(&args->head, &writers); + + /* fork writer 
process */ + pid = fork(); + error_exit(pid == -1, "fork error"); + + if (pid == 0) { + ret = prctl(PR_SET_PDEATHSIG, SIGHUP); + error_exit(ret < 0, "failed to set parent death sig"); + + errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ); + error_exit(errno, "error allocating block bufp "ERRF, ERRA); + + ret = scoutfs_parallel_restore_create_writer(&pwri); + error_exit(ret, "create pwriter %d", ret); + + ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice); + error_exit(ret, "add pslice %d", ret); + + memset(&res, 0, sizeof(res)); + + restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO); + + ret = restore_quotas(pwri, get_quota_ioctl, opts->source_dir); + error_exit(ret, "quota add %d", ret); + + res.complete = true; + + write_bufs_and_send(pwri, buf, args->dev_fd, &res, true, args->pair_fd); + + scoutfs_parallel_restore_destroy_writer(&pwri); + free(bufp); + + exit(0); + }; + + /* read results and wait for writer to finish */ + tot_bytes = 0; + tot_dirs = 1; + tot_files = 0; + for (;;) { + ret = read(pair[0], &res, sizeof(struct write_result)); + error_exit(ret != sizeof(struct write_result), "result read error %d", ret); + + ret = scoutfs_parallel_restore_add_progress(wri, &res.prog); + error_exit(ret, "add thr prog %d", ret); + + if (res.slice.meta_len != 0) { + ret = scoutfs_parallel_restore_add_slice(wri, &res.slice); + error_exit(ret, "add thr slice %d", ret); + + if (res.complete) + break; + } + + tot_bytes += le64_to_cpu(res.bytes_written); + tot_files += le64_to_cpu(res.files_created); + tot_dirs += le64_to_cpu(res.dirs_created); + } + + tot_bytes += write_bufs(wri, buf, args->dev_fd); + + fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n", + tot_dirs, tot_files, tot_bytes); + + /* write super to finalize */ + ret = scoutfs_parallel_restore_export_super(wri, super); + error_exit(ret, "update super %d", ret); + + ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE, + SCOUTFS_SUPER_BLKNO << 
SCOUTFS_BLOCK_SM_SHIFT); + error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret); + + scoutfs_parallel_restore_destroy_writer(&wri); + + if (dev_fd >= 0) + close(dev_fd); + if (pair[0] > 0) + close(pair[0]); + if (pair[1] > 0) + close(pair[1]); + free(super); + free(args); + free(slices); + free(buf); + + return 0; +} + +int main(int argc, char **argv) +{ + struct opts opts = (struct opts){ 0 }; + struct hardlink_head *hh_tmp; + struct hardlink_head *hh; + int ret; + int c; + + INIT_LIST_HEAD(&hardlinks); + + while ((c = getopt(argc, argv, "b:m:s:")) != -1) { + switch(c) { + case 'm': + opts.meta_path = strdup(optarg); + break; + case 's': + opts.source_dir = strdup(optarg); + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(); + exit(1); + } + } + + error_exit(!opts.meta_path, "must specify metadata device path with -m"); + error_exit(!opts.source_dir, "must specify source directory path with -s"); + + ret = do_restore(&opts); + + free(opts.meta_path); + free(opts.source_dir); + + list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) { + list_del_init(&hh->head); + free(hh); + } + + return ret == 0 ? 
0 : 1; +} diff --git a/tests/tests/parallel_restore.sh b/tests/tests/parallel_restore.sh new file mode 100644 index 00000000..69b594ec --- /dev/null +++ b/tests/tests/parallel_restore.sh @@ -0,0 +1,74 @@ +# +# validate parallel restore library +# + +t_require_commands scoutfs parallel_restore find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== simple mkfs/restore/mount" +# meta device just big enough for reserves and the metadata we'll fill +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount + +scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid' +find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc +scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc +find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L +scoutfs df -p "$SCR" | awk '{print $1, $4}' +scoutfs quota-list -p "$SCR" + +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== under ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" -n 2000000 > /dev/null || t_fail "parallel_restore" +scratch_check || t_fail "check failed" +scratch_mount +scoutfs df -p "$SCR" | awk '{print $1, $4}' +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +echo "== ENOSPC" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 
4000000 | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt to restore data device" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== attempt format_v1 restore" +scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== test if previously mounted" +scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \ + "$T_EX_DATA_DEV" "$SCR" +umount "$SCR" +parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore" + +echo "== cleanup" +rmdir "$SCR" + +t_pass diff --git a/tests/tests/restore_copy.sh b/tests/tests/restore_copy.sh new file mode 100644 index 00000000..7517f05d --- /dev/null +++ b/tests/tests/restore_copy.sh @@ -0,0 +1,118 @@ +# +# validate parallel restore library - using restore_copy.c +# + +t_require_commands scoutfs restore_copy find xargs + +SCR="$T_TMPDIR/mnt.scratch" +mkdir -p "$SCR" + +scratch_mkfs() { + scoutfs mkfs $@ \ + -A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_check() { + # give ample time for writes to commit + sleep 1 + sync + scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV +} + +scratch_mount() { + mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR +} + +echo "== restore_copy content verification" +mkdir "$T_M0/data" + +# create all supported inode types: +mkdir -p "$T_M0/data/d" +touch "$T_M0/data/f" +ln -sf "broken" "$T_M0/data/l" +ln "$T_M0/data/f" "$T_M0/data/h" +ln -sf "f" "$T_M0/data/F" +mknod "$T_M0/data/b" b 1 1 +mknod "$T_M0/data/c" c 0 0 +mknod "$T_M0/data/u" u 2 2 +mknod "$T_M0/data/p" p + +# some files with data +dd if=/dev/zero of="$T_M0/data/f4096" bs=4096 count=1 status=none +touch 
"$T_M0/data/falloc" "$T_M0/data/truncate" +xfs_io -C "falloc 65536 65536" "$T_M0/data/falloc" +xfs_io -C "truncate $((4096 * 4096))" "$T_M0/data/truncate" + +# socket (could have used python but avoids python/python2/python3 problem) +perl -e "use IO::Socket; my \$s = IO::Socket::UNIX->new(Type=>SOCK_STREAM,Local=>'$T_M0/data/s') or die 'sock';" +# set all mode_t bits +touch "$T_M0/data/mode_t" +chmod 6777 "$T_M0/data/mode_t" +# uid/gid +touch "$T_M0/data/uidgid" +chown 33333:33333 "$T_M0/data/uidgid" +# set retention bit +touch "$T_M0/data/retention" +scoutfs set-attr-x -t 1 "$T_M0/data/retention" +# set project ID +touch "$T_M0/data/proj" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj" +mkdir -p "$T_M0/data/proj_d" +touch "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 12345 "$T_M0/data/proj_d/f" +scoutfs set-attr-x -p 54321 "$T_M0/data/proj_d" +# quotas +for a in $(seq 10 15); do + scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -" +done +# crtime +scoutfs set-attr-x -r 55555.666666666 "$T_M0/data/proj_d" +scoutfs set-attr-x -r 55556.666666666 "$T_M0/data/proj_d/f" +# data_seq, meta_seq, data_version is not restored. + +scratch_mkfs -V 2 > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed" +restore_copy -m $T_EX_META_DEV -s "$T_M0/data" | t_filter_fs +scratch_check || t_fail "check before mount failed" + +scratch_mount + +echo "== verify metadata bits on restored fs" +inspect() { + ls -Alnr --time-style=+"" + scoutfs get-attr-x -t "retention" + scoutfs get-attr-x -p "proj" + scoutfs get-fiemap -L "f4096" + scoutfs get-fiemap -L "falloc" + scoutfs get-fiemap -L "truncate" + scoutfs quota-list -p "." 
+ scoutfs get-attr-x -p "proj_d/f" + scoutfs get-attr-x -p "proj_d" + + scoutfs stat proj_d | grep crtime + scoutfs stat proj_d/f | grep crtime +} + +( cd "$SCR" ; inspect ) + +echo "== verify quota rules on restored fs" +scoutfs quota-del -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-del failed" +scoutfs quota-list -p "$T_M0" +scoutfs quota-add -p "$T_M0" -r "7 15,L,- 0,L,- 0,L,- I 33 -" || t_fail "quota-add failed" +scoutfs quota-list -p "$T_M0" + +scoutfs df -p "$SCR" | awk '{print $1, $4}' + +echo "== umount restored fs and check" +umount "$SCR" +scratch_check || t_fail "check after mount failed" + +#scoutfs print $T_META_DEVICE +#scoutfs print $T_EX_META_DEV + +echo "== cleanup" +rmdir "$SCR" +scoutfs set-attr-x -t 0 "$T_M0/data/retention" +rm -rf "$T_M0/data" +scoutfs quota-wipe -p "$T_M0" + +t_pass diff --git a/utils/Makefile b/utils/Makefile index e0f76142..17c7fa1b 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H)) CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \ - -fno-strict-aliasing \ + -I src/ -fno-strict-aliasing \ -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),) @@ -15,10 +15,13 @@ CFLAGS += -I../kmod/src endif BIN := src/scoutfs -OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c)) -DEPS := $(wildcard */*.d) +OBJ_DIRS := src src/check +OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c))) +DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d)) -all: $(BIN) +AR := src/scoutfs_parallel_restore.a + +all: $(BIN) $(AR) ifneq ($(DEPS),) -include $(DEPS) @@ -36,6 +39,10 @@ $(BIN): $(OBJ) $(QU) [BIN $@] $(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid +$(AR): $(OBJ) + $(QU) [AR $@] + $(VE)ar rcs $@ $^ + %.o %.d: %.c Makefile sparse.sh $(QU) [CC $<] $(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index 
d105d87b..bb3f67d9 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -76,6 +76,41 @@ run when the file system will not be mounted. .RE .PD +.TP +.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]" +.sp +Performs an offline file system check. The program iterates through all the +data structures on disk directly - the filesystem must not be mounted while +this operation is running. +.RS 1.0i +.PD 0 +.sp +.TP +.B "-d, --debug FILE" +An output file where the program can output debug information about the +state of the filesystem as it performs the check. If +.B FILE +is "-", the debug output is written to the Standard Error output. +.TP +.RE +.sp +.B RETURN VALUE +The check function can return the following exit codes: +.RS +.TP +\fB 0 \fR - no filesystem issues detected +.TP +\fB 1 \fR - file system issues were detected +.TP +\fB 8 \fR - operational error +.TP +\fB 16 \fR - usage error +.TP +\fB 32 \fR - cancelled by user (SIGINT) +.TP +.RE +.PD + .TP .BI "counters [-t|--table] SYSFS-DIR" .sp diff --git a/utils/scoutfs-utils.spec.in b/utils/scoutfs-utils.spec.in index fb24b812..a7c53514 100644 --- a/utils/scoutfs-utils.spec.in +++ b/utils/scoutfs-utils.spec.in @@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/. 
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h +install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h +install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example @@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi %files -n scoutfs-devel %defattr(644,root,root,755) %{_includedir}/scoutfs +%{_libdir}/scoutfs %clean rm -rf %{buildroot} diff --git a/utils/src/check/alloc.c b/utils/src/check/alloc.c new file mode 100644 index 00000000..43d1d125 --- /dev/null +++ b/utils/src/check/alloc.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" + +/* + * We check the list blocks serially. + * + * XXX: + * - compare ref seqs + * - detect cycles? 
+ */ +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + lblk = block_buf(blk); + /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; + + /* XXX sort? maybe */ + + ref = lblk->next; + + block_put(&blk); + } + + ret = 0; +out: + return ret; +} + +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + return btree_meta_iter(&root->root, cb, cb_arg); +} + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_ref ref; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + ref = lhead->ref; + + while (ref.blkno) { + blkno = le64_to_cpu(ref.blkno); + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + goto out; + + sns_push("alloc_list_block", blkno, 0); + + lblk = block_buf(blk); + /* XXX verify block */ + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + if (ret < 0) + goto out; + /* XXX sort? 
maybe */ + + ret = 0; + for (i = 0; i < le32_to_cpu(lblk->nr); i++) { + blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]); + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) + break; + } + + ref = lblk->next; + + block_put(&blk); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +static bool valid_free_extent_key(struct scoutfs_key *key) +{ + return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE || + key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) && + (!key->_sk_fourth && !key->sk_type && + (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third)); +} + +static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct extent_cb_arg_t *ecba = cb_arg; + u64 start; + u64 len; + + /* XXX not sure these eios are what we want */ + + if (val_len != 0) + return -EIO; + + if (!valid_free_extent_key(key)) + return -EIO; + + if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) + return -ECHECK_ITER_DONE; + + start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1; + len = le64_to_cpu(key->skfb_len); + + return ecba->cb(start, len, ecba->cb_arg); +} + +/* + * Call the callback with each of the primary BLKNO free extents stored + * in item in the given alloc root. It doesn't visit the secondary + * ORDER extents. 
+ */ +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg) +{ + struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&root->root, free_item_cb, &ecba); +} diff --git a/utils/src/check/alloc.h b/utils/src/check/alloc.h new file mode 100644 index 00000000..f0273e4a --- /dev/null +++ b/utils/src/check/alloc.h @@ -0,0 +1,12 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H +#define _SCOUTFS_UTILS_CHECK_ALLOC_H + +#include "extent.h" + +int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg); +int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/block.c b/utils/src/check/block.c new file mode 100644 index 00000000..08535a5a --- /dev/null +++ b/utils/src/check/block.c @@ -0,0 +1,613 @@ +#define _ISOC11_SOURCE /* aligned_alloc */ +#define _DEFAULT_SOURCE /* syscall() */ +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "list.h" +#include "cmp.h" +#include "hash.h" + +#include "block.h" +#include "debug.h" +#include "super.h" +#include "eno.h" +#include "crc.h" +#include "sns.h" + +static struct block_data { + struct list_head *hash_lists; + size_t hash_nr; + + struct list_head active_head; + struct list_head inactive_head; + struct list_head dirty_list; + size_t nr_active; + size_t nr_inactive; + size_t nr_dirty; + + int meta_fd; + size_t max_cached; + size_t nr_events; + + aio_context_t ctx; + struct iocb *iocbs; + struct iocb **iocbps; + struct io_event *events; +} global_bdat; + +struct block { + struct list_head hash_head; + struct list_head lru_head; + struct list_head dirty_head; + struct list_head submit_head; + 
unsigned long refcount; + unsigned long uptodate:1, + active:1; + u64 blkno; + void *buf; + size_t size; +}; + +#define BLK_FMT \ + "blkno %llu rc %ld d %u a %u" +#define BLK_ARG(blk) \ + (blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active +#define debug_blk(blk, fmt, args...) \ + debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk)) + +/* + * This just allocates and initialzies the block. The caller is + * responsible for putting it on the appropriate initial lists and + * managing refcounts. + */ +static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size) +{ + struct block *blk; + + blk = calloc(1, sizeof(struct block)); + if (blk) { + blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!blk->buf) { + free(blk); + blk = NULL; + } else { + INIT_LIST_HEAD(&blk->hash_head); + INIT_LIST_HEAD(&blk->lru_head); + INIT_LIST_HEAD(&blk->dirty_head); + INIT_LIST_HEAD(&blk->submit_head); + blk->blkno = blkno; + blk->size = size; + } + } + + return blk; +} + +static void free_block(struct block_data *bdat, struct block *blk) +{ + debug_blk(blk, "free"); + + if (!list_empty(&blk->lru_head)) { + if (blk->active) + bdat->nr_active--; + else + bdat->nr_inactive--; + list_del(&blk->lru_head); + } + + if (!list_empty(&blk->dirty_head)) { + bdat->nr_dirty--; + list_del(&blk->dirty_head); + } + + if (!list_empty(&blk->hash_head)) + list_del(&blk->hash_head); + + if (!list_empty(&blk->submit_head)) + list_del(&blk->submit_head); + + free(blk->buf); + free(blk); +} + +static bool blk_is_dirty(struct block *blk) +{ + return !list_empty(&blk->dirty_head); +} + +/* + * Rebalance the cache. + * + * First we shrink the cache to limit it to max_cached blocks. + * Logically, we walk from oldest to newest in the inactive list and + * then in the active list. Since these lists are physically one + * list_head list we achieve this with a reverse walk starting from the + * active head. + * + * Then we rebalnace the size of the two lists. 
The constraint is that + * we don't let the active list grow larger than the inactive list. We + * move blocks from the oldest tail of the active list to the newest + * head of the inactive list. + * + * <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] -> + */ +static void rebalance_cache(struct block_data *bdat) +{ + struct block *blk; + struct block *blk_; + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) { + if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached) + break; + + if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 || + blk_is_dirty(blk)) + continue; + + free_block(bdat, blk); + } + + list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) { + if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head) + break; + + list_move(&blk->lru_head, &bdat->inactive_head); + blk->active = 0; + bdat->nr_active--; + bdat->nr_inactive++; + } +} + +static void make_active(struct block_data *bdat, struct block *blk) +{ + if (!blk->active) { + if (!list_empty(&blk->lru_head)) { + list_move(&blk->lru_head, &bdat->active_head); + bdat->nr_inactive--; + } else { + list_add(&blk->lru_head, &bdat->active_head); + } + + blk->active = 1; + bdat->nr_active++; + } +} + +static int compar_iocbp(const void *A, const void *B) +{ + struct iocb *a = *(struct iocb **)A; + struct iocb *b = *(struct iocb **)B; + + return scoutfs_cmp(a->aio_offset, b->aio_offset); +} + +static int submit_and_wait(struct block_data *bdat, struct list_head *list) +{ + struct io_event *event; + struct iocb *iocb; + struct block *blk; + int ret; + int err; + int nr; + int i; + + err = 0; + nr = 0; + list_for_each_entry(blk, list, submit_head) { + iocb = &bdat->iocbs[nr]; + bdat->iocbps[nr] = iocb; + + memset(iocb, 0, sizeof(struct iocb)); + + iocb->aio_data = (intptr_t)blk; + iocb->aio_lio_opcode = blk_is_dirty(blk) ? 
IOCB_CMD_PWRITE : IOCB_CMD_PREAD; + iocb->aio_fildes = bdat->meta_fd; + iocb->aio_buf = (intptr_t)blk->buf; + iocb->aio_nbytes = blk->size; + iocb->aio_offset = blk->blkno * blk->size; + + nr++; + + debug_blk(blk, "submit"); + + if ((nr < bdat->nr_events) && blk->submit_head.next != list) + continue; + + qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp); + + ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL); + if (ret != nr) { + if (ret >= 0) + errno = EIO; + ret = -errno; + fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n", + ENO_ARG(-ret)); + goto out; + } + + ret = 0; + for (i = 0; i < nr; i++) { + event = &bdat->events[i]; + iocb = (struct iocb *)(intptr_t)event->obj; + blk = (struct block *)(intptr_t)event->data; + + debug_blk(blk, "complete res %lld", (long long)event->res); + + if (event->res >= 0 && event->res != blk->size) + event->res = -EIO; + + /* io errors are fatal */ + if (event->res < 0) { + ret = event->res; + goto out; + } + + if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) { + blk->uptodate = 1; + } else { + list_del_init(&blk->dirty_head); + bdat->nr_dirty--; + } + } + nr = 0; + } + + ret = 0; +out: + return ret ?: err; +} + +static void inc_refcount(struct block *blk) +{ + blk->refcount++; +} + +void block_put(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + struct block *blk = *blkp; + + if (blk) { + blk->refcount--; + *blkp = NULL; + + rebalance_cache(bdat); + } +} + +static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno) +{ + u32 hash = scoutfs_hash32(&blkno, sizeof(blkno)); + + return &bdat->hash_lists[hash % bdat->hash_nr]; +} + +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic) +{ + struct 
scoutfs_block_header *hdr; + size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + int ret; + u32 crc; + + ret = block_get(&blk, blkno, bf); + if (ret < 0) { + fprintf(stderr, "error reading block %llu\n", blkno); + goto out; + } + + hdr = block_buf(blk); + + crc = crc_block(hdr, size); + + /* + * a bad CRC is easy to repair, so we pass a different error code + * back. Unless the other data is also wrong - then it's EINVAL + * to signal that this isn't a valid block hdr at all. + */ + if (le32_to_cpu(hdr->crc) != crc) + ret = -EIO; /* keep checking other fields */ + + if (le32_to_cpu(hdr->magic) != magic) + ret = -EINVAL; + + /* + * Our first caller fills in global_super. Until this completes, + * we can't do this check. + */ + if ((blkno != SCOUTFS_SUPER_BLKNO) && + (hdr->fsid != global_super->hdr.fsid)) + ret = -EINVAL; + + block_put(&blk); + + debug("%s blk_hdr_valid blkno %llu size %lu crc 0x%08x magic 0x%08x ret %d", + sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic), + ret); + +out: + return ret; +} + +static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf) +{ + struct list_head *bucket = hash_bucket(bdat, blkno); + struct block *search; + struct block *blk; + size_t size; + + size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE; + + blk = NULL; + list_for_each_entry(search, bucket, hash_head) { + if (search->blkno == blkno && search->size == size) { + blk = search; + break; + } + } + + if (!blk) { + blk = alloc_block(bdat, blkno, size); + if (blk) { + list_add(&blk->hash_head, bucket); + list_add(&blk->lru_head, &bdat->inactive_head); + bdat->nr_inactive++; + } + } + if (blk) + inc_refcount(blk); + + return blk; +} + +/* + * Get a block. + * + * The caller holds a refcount to the block while it's in use that + * prevents it from being removed from the cache. 
It must be dropped + * with block_put(); + */ +int block_get(struct block **blk_ret, u64 blkno, int bf) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + LIST_HEAD(list); + int ret; + + blk = get_or_alloc(bdat, blkno, bf); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + if ((bf & BF_ZERO)) { + memset(blk->buf, 0, blk->size); + blk->uptodate = 1; + } + + if (bf & BF_OVERWRITE) + blk->uptodate = 1; + + if (!blk->uptodate) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + if (ret < 0) + goto out; + } + + if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) { + list_add_tail(&bdat->dirty_list, &blk->dirty_head); + bdat->nr_dirty++; + } + + make_active(bdat, blk); + + rebalance_cache(bdat); + ret = 0; +out: + if (ret < 0) + block_put(&blk); + *blk_ret = blk; + return ret; +} + +void *block_buf(struct block *blk) +{ + return blk->buf; +} + +size_t block_size(struct block *blk) +{ + return blk->size; +} + +/* + * Drop the block from the cache, regardless of if it was free or not. + * This is used to avoid writing blocks which were dirtied but then + * later freed. + * + * The block is immediately freed and can't be referenced after this + * returns. + */ +void block_drop(struct block **blkp) +{ + struct block_data *bdat = &global_bdat; + + free_block(bdat, *blkp); + *blkp = NULL; + rebalance_cache(bdat); +} + +/* + * This doesn't quite work for mixing large and small blocks, but that's + * fine, we never do that. + */ +static int compar_u64(const void *A, const void *B) +{ + u64 a = *((u64 *)A); + u64 b = *((u64 *)B); + + return scoutfs_cmp(a, b); +} + +/* + * This read-ahead is synchronous and errors are ignored. If any of the + * blknos aren't present in the cache then we issue concurrent reads for + * them and wait. Any existing cached blocks will be left as is. 
+ * + * We might be trying to read a lot more than the number of events so we + * sort the caller's blknos before iterating over them rather than + * relying on submission sorting the blocks in each submitted set. + */ +void block_readahead(u64 *blknos, size_t nr) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + size_t i; + + if (nr == 0) + return; + + qsort(blknos, nr, sizeof(blknos[0]), compar_u64); + + for (i = 0; i < nr; i++) { + blk = get_or_alloc(bdat, blknos[i], 0); + if (blk) { + if (!blk->uptodate) + list_add_tail(&blk->submit_head, &list); + else + block_put(&blk); + } + } + + (void)submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + rebalance_cache(bdat); +} + +/* + * The caller's block changes form a consistent transaction. If the amount of dirty + * blocks is large enough we issue a write. + */ +int block_try_commit(bool force) +{ + struct block_data *bdat = &global_bdat; + struct block *blk; + struct block *blk_; + LIST_HEAD(list); + int ret; + + if (!force && bdat->nr_dirty < bdat->nr_events) + return 0; + + list_for_each_entry(blk, &bdat->dirty_list, dirty_head) { + list_add_tail(&blk->submit_head, &list); + inc_refcount(blk); + } + + ret = submit_and_wait(bdat, &list); + + list_for_each_entry_safe(blk, blk_, &list, submit_head) { + list_del_init(&blk->submit_head); + block_put(&blk); + } + + if (ret < 0) { + fprintf(stderr, "error writing dirty transaction blocks\n"); + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY); + if (ret == 0) { + list_add(&blk->submit_head, &list); + ret = submit_and_wait(bdat, &list); + list_del_init(&blk->submit_head); + block_put(&blk); + } else { + ret = -ENOMEM; + } + if (ret < 0) + fprintf(stderr, "error writing super block to commit transaction\n"); + +out: + rebalance_cache(bdat); + return ret; +} + +int 
block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes) +{ + struct block_data *bdat = &global_bdat; + size_t i; + int ret; + + bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE); + bdat->hash_nr = bdat->max_cached / 4; + bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE); + + bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0])); + bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0])); + bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0])); + bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0])); + if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&bdat->active_head); + INIT_LIST_HEAD(&bdat->inactive_head); + INIT_LIST_HEAD(&bdat->dirty_list); + bdat->meta_fd = meta_fd; + list_add(&bdat->inactive_head, &bdat->active_head); + + for (i = 0; i < bdat->hash_nr; i++) + INIT_LIST_HEAD(&bdat->hash_lists[i]); + + ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx); + +out: + if (ret < 0) { + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); + } + + return ret; +} + +void block_shutdown(void) +{ + struct block_data *bdat = &global_bdat; + + syscall(SYS_io_destroy, bdat->ctx); + + free(bdat->iocbs); + free(bdat->iocbps); + free(bdat->events); + free(bdat->hash_lists); +} diff --git a/utils/src/check/block.h b/utils/src/check/block.h new file mode 100644 index 00000000..6c13b0cc --- /dev/null +++ b/utils/src/check/block.h @@ -0,0 +1,34 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_ +#define _SCOUTFS_UTILS_CHECK_BLOCK_H_ + +#include +#include + +struct block; + +#include "sparse.h" + +/* block flags passed to block_get() */ +enum { + BF_ZERO = (1 << 0), /* zero contents buf as block is returned */ + BF_DIRTY = (1 << 1), /* block will be written with transaction */ + BF_SM = (1 << 2), /* small 4k block instead of large 64k block */ + BF_OVERWRITE = (1 << 3), 
/* caller will overwrite contents, don't read */ +}; + +int block_get(struct block **blk_ret, u64 blkno, int bf); +void block_put(struct block **blkp); + +void *block_buf(struct block *blk); +size_t block_size(struct block *blk); +void block_drop(struct block **blkp); + +void block_readahead(u64 *blknos, size_t nr); +int block_try_commit(bool force); + +int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes); +void block_shutdown(void); + +int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic); + +#endif diff --git a/utils/src/check/btree.c b/utils/src/check/btree.c new file mode 100644 index 00000000..ebf05b8c --- /dev/null +++ b/utils/src/check/btree.c @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" +#include "avl.h" + +#include "block.h" +#include "btree.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "meta.h" +#include "problem.h" + +static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item) +{ + return (void *)bt + le16_to_cpu(item->val_off); +} + +static void readahead_refs(struct scoutfs_btree_block *bt) +{ + struct scoutfs_btree_item *item; + struct scoutfs_avl_node *node; + struct scoutfs_block_ref *ref; + u64 *blknos; + u64 blkno; + u16 valid = 0; + u16 nr = le16_to_cpu(bt->nr_items); + int i; + + blknos = calloc(nr, sizeof(blknos[0])); + if (!blknos) + return; + + node = avl_first(&bt->item_root); + + for (i = 0; i < nr; i++) { + item = container_of(node, struct scoutfs_btree_item, node); + ref = item_val(bt, item); + blkno = le64_to_cpu(ref->blkno); + + if (valid_meta_blkno(blkno)) + blknos[valid++] = blkno; + + node = avl_next(&bt->item_root, &item->node); + } + + if (valid > 0) + block_readahead(blknos, valid); + free(blknos); +} + +/* + * Call the callback on the referenced block. 
Then if the block + * contains referneces read it and recurse into all its references. + */ +static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb, + void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = cb(blkno, 1, cb_arg); + if (ret < 0) { + ret = xlate_iter_errno(ret); + return 0; + } + + if (level == 0) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + + sns_push("btree_parent", blkno, 0); + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead last level of parents */ + if (level == 2) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct scoutfs_btree_item, node); + ref = item_val(bt, item); + + ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg); + if (ret < 0) + goto out; + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg); +} + +static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level, + btree_item_cb_t cb, void *cb_arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct block *blk = NULL; + u64 blkno; + int ret; + int i; + + blkno = 
le64_to_cpu(ref->blkno); + if (!blkno) + return 0; + + ret = block_get(&blk, blkno, 0); + if (ret < 0) + return ret; + + if (level) + sns_push("btree_parent", blkno, 0); + else + sns_push("btree_leaf", blkno, 0); + + ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE); + if (ret < 0) + return ret; + + bt = block_buf(blk); + + /* XXX integrate verification with block cache */ + if (bt->level != level) { + problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level); + ret = -EINVAL; + goto out; + } + + /* read-ahead leaves that contain items */ + if (level == 1) + readahead_refs(bt); + + node = avl_first(&bt->item_root); + + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + item = container_of(node, struct scoutfs_btree_item, node); + + if (level) { + ref = item_val(bt, item); + ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg); + } else { + ret = cb(&item->key, item_val(bt, item), + le16_to_cpu(item->val_len), cb_arg); + debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret); + } + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + + node = avl_next(&bt->item_root, &item->node); + } + + ret = 0; +out: + block_put(&blk); + sns_pop(); + + return ret; +} + +int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg) +{ + /* XXX check root */ + if (root->height == 0) + return 0; + + return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg); +} diff --git a/utils/src/check/btree.h b/utils/src/check/btree.h new file mode 100644 index 00000000..dc0b3bf9 --- /dev/null +++ b/utils/src/check/btree.h @@ -0,0 +1,14 @@ +#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_ +#define _SCOUTFS_UTILS_CHECK_BTREE_H_ + +#include "util.h" +#include "format.h" + +#include "extent.h" + +typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg); + +int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg); +int btree_item_iter(struct scoutfs_btree_root 
*root, btree_item_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/check.c b/utils/src/check/check.c new file mode 100644 index 00000000..0fa8a870 --- /dev/null +++ b/utils/src/check/check.c @@ -0,0 +1,184 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "ioctl.h" +#include "cmd.h" +#include "dev.h" + +#include "alloc.h" +#include "block.h" +#include "debug.h" +#include "meta.h" +#include "super.h" +#include "problem.h" + +struct check_args { + char *meta_device; + char *data_device; + char *debug_path; +}; + +static int do_check(struct check_args *args) +{ + int debug_fd = -1; + int meta_fd = -1; + int data_fd = -1; + int ret; + + if (args->debug_path) { + if (strcmp(args->debug_path, "-") == 0) + debug_fd = dup(STDERR_FILENO); + else + debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (debug_fd < 0) { + ret = -errno; + fprintf(stderr, "error opening debug output file '%s': %s (%d)\n", + args->debug_path, strerror(errno), errno); + goto out; + } + + debug_enable(debug_fd); + } + + meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL); + if (meta_fd < 0) { + ret = -errno; + fprintf(stderr, "failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL); + if (data_fd < 0) { + ret = -errno; + fprintf(stderr, "failed to open data device '%s': %s (%d)\n", + args->data_device, strerror(errno), errno); + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024); + if (ret < 0) + goto out; + + /* + * At some point we may convert this to a multi-pass system where we may + * try and repair items, and, as long as repairs are made, we will rerun + * the checks more times. 
We may need to start counting how many problems we + * fix in the process of these loops, so that we don't stall on unrepairable + * problems and are making actual repair progress. IOW - when we do a full + * check loop without any problems fixed, we stop trying. + */ + ret = check_supers(data_fd) ?: + check_super_in_use(meta_fd) ?: + check_meta_alloc() ?: + check_super_crc(); + + if (ret < 0) + goto out; + + debug("problem count %lu", problems_count()); + if (problems_count() > 0) + printf("Problems detected.\n"); + +out: + /* and tear it all down */ + block_shutdown(); + super_shutdown(); + debug_disable(); + + if (meta_fd >= 0) + close(meta_fd); + if (data_fd >= 0) + close(data_fd); + if (debug_fd >= 0) + close(debug_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct check_args *args = state->input; + + switch (key) { + case 'd': + args->debug_path = strdup_or_error(state, arg); + break; + case 'e': + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else if (!args->data_device) + args->data_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + if (!args->data_device) + argp_error(state, "no data device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"}, + { NULL } +}; + +static struct argp argp = { + options, + parse_opt, + "META-DEVICE DATA-DEVICE", + "Check filesystem consistency" +}; + +/* Exit codes used by fsck-type programs */ +#define FSCK_EX_NONDESTRUCT 1 /* File system errors corrected */ +#define FSCK_EX_UNCORRECTED 4 /* File system errors left uncorrected */ +#define FSCK_EX_ERROR 8 /* Operational error */ +#define FSCK_EX_USAGE 16 
/* Usage or syntax error */ + +static int check_cmd(int argc, char **argv) +{ + struct check_args check_args = {NULL}; + int ret; + + ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args); + if (ret) + exit(FSCK_EX_USAGE); + + ret = do_check(&check_args); + if (ret < 0) + ret = FSCK_EX_ERROR; + + if (problems_count() > 0) + ret |= FSCK_EX_UNCORRECTED; + + exit(ret); +} + +static void __attribute__((constructor)) check_ctor(void) +{ + cmd_register_argp("check", &argp, GROUP_CORE, check_cmd); +} diff --git a/utils/src/check/debug.c b/utils/src/check/debug.c new file mode 100644 index 00000000..0017c1aa --- /dev/null +++ b/utils/src/check/debug.c @@ -0,0 +1,16 @@ +#include + +#include "debug.h" + +int debug_fd = -1; + +void debug_enable(int fd) +{ + debug_fd = fd; +} + +void debug_disable(void) +{ + if (debug_fd >= 0) + debug_fd = -1; +} diff --git a/utils/src/check/debug.h b/utils/src/check/debug.h new file mode 100644 index 00000000..a5103494 --- /dev/null +++ b/utils/src/check/debug.h @@ -0,0 +1,17 @@ +#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_ +#define _SCOUTFS_UTILS_CHECK_DEBUG_H_ + +#include + +#define debug(fmt, args...) 
\ +do { \ + if (debug_fd >= 0) \ + dprintf(debug_fd, fmt"\n", ##args); \ +} while (0) + +extern int debug_fd; + +void debug_enable(int fd); +void debug_disable(void); + +#endif diff --git a/utils/src/check/eno.h b/utils/src/check/eno.h new file mode 100644 index 00000000..14579fce --- /dev/null +++ b/utils/src/check/eno.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_ +#define _SCOUTFS_UTILS_CHECK_ENO_H_ + +#include + +#define ENO_FMT "%d (%s)" +#define ENO_ARG(eno) eno, strerror(eno) + +#endif diff --git a/utils/src/check/extent.c b/utils/src/check/extent.c new file mode 100644 index 00000000..bbbcc887 --- /dev/null +++ b/utils/src/check/extent.c @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include + +#include "util.h" +#include "lk_rbtree_wrapper.h" + +#include "debug.h" +#include "extent.h" + +/* + * In-memory extent management in rbtree nodes. + */ + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len) +{ + u64 a_end = a_start + a_len; + u64 b_end = b_start + b_len; + + return !((a_end <= b_start) || (b_end <= a_start)); +} + +static int ext_contains(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start <= start && ext->start + ext->len >= start + len; +} + +/* + * True if the given extent is bisected by the given range; there's + * leftover containing extents on both the left and right sides of the + * range in the extent. + */ +static int ext_bisected(struct extent_node *ext, u64 start, u64 len) +{ + return ext->start < start && ext->start + ext->len > start + len; +} + +static struct extent_node *ext_from_rbnode(struct rb_node *rbnode) +{ + return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL; +} + +static struct extent_node *next_ext(struct extent_node *ext) +{ + return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL; +} + +static struct extent_node *prev_ext(struct extent_node *ext) +{ + return ext ? 
ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL; +} + +struct walk_results { + unsigned bisect_to_leaf:1; + struct extent_node *found; + struct extent_node *next; + struct rb_node *parent; + struct rb_node **node; +}; + +static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk) +{ + struct rb_node **node = &root->rbroot.rb_node; + struct extent_node *ext; + u64 end = start + len; + int cmp; + + wlk->found = NULL; + wlk->next = NULL; + wlk->parent = NULL; + + while (*node) { + wlk->parent = *node; + ext = ext_from_rbnode(*node); + cmp = end <= ext->start ? -1 : + start >= ext->start + ext->len ? 1 : 0; + + if (cmp < 0) { + node = &ext->rbnode.rb_left; + wlk->next = ext; + } else if (cmp > 0) { + node = &ext->rbnode.rb_right; + } else { + wlk->found = ext; + if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len))) + break; + /* walk right so we can insert greater right from bisection */ + node = &ext->rbnode.rb_right; + } + } + + wlk->node = node; +} + +/* + * Return an extent that overlaps with the given range. + */ +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found) +{ + struct walk_results wlk = { 0, }; + int ret; + + walk_extents(root, start, len, &wlk); + if (wlk.found) { + memset(found, 0, sizeof(struct extent_node)); + found->start = wlk.found->start; + found->len = wlk.found->len; + ret = 0; + } else { + ret = -ENOENT; + } + + return ret; +} + +/* + * Callers can iterate through direct node references and are entirely + * responsible for consistency when doing so. + */ +struct extent_node *extent_first(struct extent_root *root) +{ + struct walk_results wlk = { 0, }; + + walk_extents(root, 0, 1, &wlk); + + return wlk.found ?: wlk.next; +} + +struct extent_node *extent_next(struct extent_node *ext) +{ + return next_ext(ext); +} + +struct extent_node *extent_prev(struct extent_node *ext) +{ + return prev_ext(ext); +} + +/* + * Insert a new extent into the tree. 
We can extend existing nodes, + * merge with neighbours, or remove existing extents entirely if we + * insert a range that fully spans existing nodes. + */ +static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err) +{ + struct walk_results wlk = { 0, }; + struct extent_node *ext; + struct extent_node *nei; + int ret; + + walk_extents(root, start, len, &wlk); + + ext = wlk.found; + if (ext && found_err) { + ret = found_err; + goto out; + } + + if (!ext) { + ext = malloc(sizeof(struct extent_node)); + if (!ext) { + ret = -ENOMEM; + goto out; + } + + ext->start = start; + ext->len = len; + + rb_link_node(&ext->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ext->rbnode, &root->rbroot); + } + + /* start by expanding an existing extent if our range is larger */ + if (start < ext->start) { + ext->len += ext->start - start; + ext->start = start; + } + if (ext->start + ext->len < start + len) + ext->len += (start + len) - (ext->start + ext->len); + + /* drop any fully spanned neighbors, possibly merging with a final adjacent one */ + + while ((nei = prev_ext(ext))) { + if (nei->start + nei->len < ext->start) + break; + + if (nei->start < ext->start) { + ext->len += ext->start - nei->start; + ext->start = nei->start; + } + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + while ((nei = next_ext(ext))) { + if (ext->start + ext->len < nei->start) + break; + + if (ext->start + ext->len < nei->start + nei->len) + ext->len += (nei->start + nei->len) - (ext->start + ext->len); + + rb_erase(&nei->rbnode, &root->rbroot); + free(nei); + } + + ret = 0; +out: + if (ret < 0) + debug("start %llu len %llu ret %d", start, len, ret); + return ret; +} + +/* + * Insert a new extent. The specified extent must not overlap with any + * existing extents or -EEXIST is returned. 
+ */ +int extent_insert_new(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, true); +} + +/* + * Insert an extent, extending any existing extents that may overlap. + */ +int extent_insert_extend(struct extent_root *root, u64 start, u64 len) +{ + return walk_insert(root, start, len, false); +} + +/* + * Remove the specified extent from an existing node. The given extent must be fully + * contained in a single node or -ENOENT is returned. + */ +int extent_remove(struct extent_root *root, u64 start, u64 len) +{ + struct extent_node *ext; + struct extent_node *ins; + struct walk_results wlk = { + .bisect_to_leaf = 1, + }; + int ret; + + walk_extents(root, start, len, &wlk); + + if (!(ext = wlk.found) || !ext_contains(ext, start, len)) { + ret = -ENOENT; + goto out; + } + + if (ext_bisected(ext, start, len)) { + debug("found bisected start %llu len %llu", ext->start, ext->len); + ins = malloc(sizeof(struct extent_node)); + if (!ins) { + ret = -ENOMEM; + goto out; + } + + ins->start = start + len; + ins->len = (ext->start + ext->len) - ins->start; + + rb_link_node(&ins->rbnode, wlk.parent, wlk.node); + rb_insert_color(&ins->rbnode, &root->rbroot); + } + + if (start > ext->start) { + ext->len = start - ext->start; + } else if (len < ext->len) { + ext->start += len; + ext->len -= len; + } else { + rb_erase(&ext->rbnode, &root->rbroot); + } + + ret = 0; +out: + debug("start %llu len %llu ret %d", start, len, ret); + + return ret; +} + +void extent_root_init(struct extent_root *root) +{ + root->rbroot = RB_ROOT; + root->total = 0; +} + +void extent_root_free(struct extent_root *root) +{ + struct extent_node *ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + rb_erase(&ext->rbnode, &root->rbroot); + free(ext); + } +} + +void extent_root_print(struct extent_root *root) +{ + struct extent_node 
*ext; + struct rb_node *node; + struct rb_node *tmp; + + for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) { + ext = rb_entry(node, struct extent_node, rbnode); + debug(" start %llu len %llu", ext->start, ext->len); + } +} diff --git a/utils/src/check/extent.h b/utils/src/check/extent.h new file mode 100644 index 00000000..2a38f765 --- /dev/null +++ b/utils/src/check/extent.h @@ -0,0 +1,38 @@ +#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_ +#define _SCOUTFS_UTILS_CHECK_EXTENT_H_ + +#include "lk_rbtree_wrapper.h" + +struct extent_root { + struct rb_root rbroot; + u64 total; +}; + +struct extent_node { + struct rb_node rbnode; + u64 start; + u64 len; +}; + +typedef int (*extent_cb_t)(u64 start, u64 len, void *arg); + +struct extent_cb_arg_t { + extent_cb_t cb; + void *cb_arg; +}; + +bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len); + +int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found); +struct extent_node *extent_first(struct extent_root *root); +struct extent_node *extent_next(struct extent_node *ext); +struct extent_node *extent_prev(struct extent_node *ext); +int extent_insert_new(struct extent_root *root, u64 start, u64 len); +int extent_insert_extend(struct extent_root *root, u64 start, u64 len); +int extent_remove(struct extent_root *root, u64 start, u64 len); + +void extent_root_init(struct extent_root *root); +void extent_root_free(struct extent_root *root); +void extent_root_print(struct extent_root *root); + +#endif diff --git a/utils/src/check/image.c b/utils/src/check/image.c new file mode 100644 index 00000000..0932ece6 --- /dev/null +++ b/utils/src/check/image.c @@ -0,0 +1,540 @@ +#define _GNU_SOURCE /* O_DIRECT */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "bitmap.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "cmd.h" +#include "dev.h" + +#include 
"alloc.h" +#include "block.h" +#include "btree.h" +#include "log_trees.h" +#include "super.h" + +/* huh. */ +#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1) + +#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL +#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL + +struct scoutfs_meta_image_header { + __le64 magic; + __le64 total_bytes; + __le32 version; +} __packed; + +struct scoutfs_meta_image_block_header { + __le64 magic; + __le64 offset; + __le32 size; + __le32 crc; +} __packed; + +struct image_args { + char *meta_device; + bool is_read; + bool show_header; + u64 ra_window; +}; + +struct block_bitmaps { + unsigned long *bits; + u64 size; + u64 count; +}; + +#define errf(fmt, args...) \ + dprintf(STDERR_FILENO, fmt, ##args) + +static int set_meta_bit(u64 start, u64 len, void *arg) +{ + struct block_bitmaps *bm = arg; + int ret; + + if (len != 1) { + ret = -EINVAL; + } else { + if (!test_bit(bm->bits, start)) { + set_bit(bm->bits, start); + bm->count++; + } + ret = 0; + } + + return ret; +} + +static int get_ref_bits(struct block_bitmaps *bm) +{ + struct scoutfs_super_block *super = global_super; + int ret; + u64 i; + + /* + * There are almost no small blocks we need to read, so we read + * them as the large blocks that contain them to simplify the + * block reading process. 
+ */ + set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) + set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm); + + ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?: + alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?: + alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?: + btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?: + btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?: + btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?: + btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?: + log_trees_meta_iter(set_meta_bit, bm); + + return ret; +} + +/* + * Note that this temporarily modifies the header that it's given. 
+ */ +static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size) +{ + __le32 saved = bh->crc; + u32 crc = ~0; + + bh->crc = 0; + crc = crc32c(crc, bh, sizeof(*bh)); + crc = crc32c(crc, buf, size); + bh->crc = saved; + + return cpu_to_le32(crc); +} + +static void printf_header(struct scoutfs_meta_image_header *hdr) +{ + errf("magic: 0x%016llx\n" + "total_bytes: %llu\n" + "version: %u\n", + le64_to_cpu(hdr->magic), + le64_to_cpu(hdr->total_bytes), + le32_to_cpu(hdr->version)); +} + +typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset); + +static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset) +{ + return read(fd, buf, count); +} + +static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset) +{ + return write(fd, buf, count); +} + +static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset) +{ + return pwrite(fd, buf, count, offset); +} + +static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset) +{ + ssize_t sret; + + while (count > 0) { + sret = func(fd, buf, count, offset); + if (sret <= 0 || sret > count) { + if (sret < 0) + return -errno; + else + return -EIO; + } + + if (tot) + *tot += sret; + buf += sret; + count -= sret; + } + + return 0; +} + +static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + u64 opening; + void *buf; + off_t off; + u64 bit; + u64 ra; + int ret; + + buf = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC); + hdr.total_bytes = cpu_to_le64(sizeof(hdr) + + (bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh)))); + hdr.version = cpu_to_le32(1); + + if 
(args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + opening = args->ra_window; + ra = 0; + bit = 0; + + for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) { + + /* readahead to open the full window, then a block at a time */ + do { + ra = find_next_set_bit(bm->bits, ra, bm->size); + if (ra < bm->size) { + off = ra << SCOUTFS_BLOCK_LG_SHIFT; + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED); + ra++; + if (opening) + opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE); + } + } while (opening > 0); + + off = bit << SCOUTFS_BLOCK_LG_SHIFT; + ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off); + if (ret < 0) + goto out; + + /* + * Might as well try to drop the pages we've used to + * reduce memory pressure on our read-ahead pages that + * are waiting. + */ + posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED); + + bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + bh.offset = cpu_to_le64(off); + bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE); + bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE); + + ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?: + rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0); + if (ret < 0) + goto out; + } + +out: + free(buf); + + return ret; +} + +static int invalid_header(struct scoutfs_meta_image_header *hdr) +{ + if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) { + errf("bad image header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC); + + } else if (le32_to_cpu(hdr->version) != 1) { + errf("unknown image header version %u\n", le32_to_cpu(hdr->version)); + + } else { + return 0; + } + + return -EIO; +} + +/* + * Doesn't catch offset+size overflowing, presumes pwrite() will return + * an error. 
+ */ +static int invalid_block_header(struct scoutfs_meta_image_block_header *bh) +{ + if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) { + errf("bad block header magic 0x%016llx (!= expected %016llx)\n", + le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC); + + } else if (le32_to_cpu(bh->size) == 0) { + errf("invalid block header size %u\n", le32_to_cpu(bh->size)); + + } else if (le32_to_cpu(bh->size) > SIZE_MAX) { + errf("block header size %u too large for size_t (> %zu)\n", + le32_to_cpu(bh->size), (size_t)SIZE_MAX); + + } else if (le64_to_cpu(bh->offset) > OFF_MAX) { + errf("block header offset %llu too large for off_t (> %llu)\n", + le64_to_cpu(bh->offset), (u64)OFF_MAX); + + } else { + return 0; + } + + return -EIO; +} + +static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm) +{ + struct scoutfs_meta_image_block_header bh; + struct scoutfs_meta_image_header hdr; + size_t writeback_batch = (2 * 1024 * 1024); + size_t buf_size; + size_t dirty; + size_t size; + off_t first; + off_t last; + off_t off; + __le32 calc; + void *buf; + u64 tot; + int ret; + + tot = 0; + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0); + if (ret < 0) + goto out; + + if (args->show_header) { + printf_header(&hdr); + ret = 0; + goto out; + } + + ret = invalid_header(&hdr); + if (ret < 0) + goto out; + + dirty = 0; + first = OFF_MAX; + last = 0; + buf = NULL; + buf_size = 0; + + while (tot < le64_to_cpu(hdr.total_bytes)) { + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0); + if (ret < 0) + goto out; + + ret = invalid_block_header(&bh); + if (ret < 0) + goto out; + + size = le32_to_cpu(bh.size); + if (buf_size < size) { + buf = realloc(buf, size); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + buf_size = size; + } + + ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0); + if (ret < 0) + goto out; + + calc = calc_crc(&bh, buf, size); + if (calc != bh.crc) { + errf("crc 
err"); + ret = -EIO; + goto out; + } + + off = le64_to_cpu(bh.offset); + + ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off); + if (ret < 0) + goto out; + + dirty += size; + first = min(first, off); + last = max(last, off); + if (dirty >= writeback_batch) { + posix_fadvise(fd, first, last, POSIX_FADV_DONTNEED); + dirty = 0; + first = OFF_MAX; + last = 0; + } + } + + ret = fsync(fd); + if (ret < 0) { + ret = -errno; + goto out; + } + +out: + return ret; +} + +static int do_image(struct image_args *args) +{ + struct block_bitmaps bm = { .bits = NULL }; + int meta_fd = -1; + u64 dev_size; + mode_t mode; + int ret; + + mode = args->is_read ? O_RDONLY : O_RDWR; + + meta_fd = open(args->meta_device, mode); + if (meta_fd < 0) { + ret = -errno; + errf("failed to open meta device '%s': %s (%d)\n", + args->meta_device, strerror(errno), errno); + goto out; + } + + if (args->is_read) { + ret = flush_device(meta_fd); + if (ret < 0) + goto out; + + ret = get_device_size(args->meta_device, meta_fd, &dev_size); + if (ret < 0) + goto out; + + bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE); + bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8); + if (!bm.bits) { + ret = -ENOMEM; + goto out; + } + + ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?: + check_supers(-1) ?: + get_ref_bits(&bm) ?: + read_image(args, meta_fd, &bm); + block_shutdown(); + } else { + ret = write_image(args, meta_fd, &bm); + } +out: + free(bm.bits); + + if (meta_fd >= 0) + close(meta_fd); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct image_args *args = state->input; + int ret; + + switch (key) { + case 'h': + args->show_header = true; + break; + case 'r': + ret = parse_u64(arg, &args->ra_window); + if (ret) + argp_error(state, "readahead winddoe parse error"); + break; + case ARGP_KEY_ARG: + if (!args->meta_device) + args->meta_device = strdup_or_error(state, arg); + else + argp_error(state, "more than two device 
arguments given"); + break; + case ARGP_KEY_FINI: + if (!args->meta_device) + argp_error(state, "no metadata device argument given"); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" }, + { "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" }, + { NULL } +}; + +static struct argp read_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Read metadata image stream from metadata device file" +}; + +#define DEFAULT_RA_WINDOW (512 * 1024) + +static int read_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = true, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static struct argp write_image_argp = { + options, + parse_opt, + "META-DEVICE", + "Write metadata image stream to metadata device file" +}; + +static int write_image_cmd(int argc, char **argv) +{ + struct image_args image_args = { + .is_read = false, + .ra_window = DEFAULT_RA_WINDOW, + }; + int ret; + + ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args); + if (ret) + return ret; + + return do_image(&image_args); +} + +static void __attribute__((constructor)) image_ctor(void) +{ + cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd); + cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd); +} diff --git a/utils/src/check/iter.h b/utils/src/check/iter.h new file mode 100644 index 00000000..54c5d13b --- /dev/null +++ b/utils/src/check/iter.h @@ -0,0 +1,15 @@ +#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_ +#define _SCOUTFS_UTILS_CHECK_ITER_H_ + +/* + * Callbacks can return a weird -errno that we'll never use to indicate + * that iteration can stop and return 0 for success. 
+ */ +#define ECHECK_ITER_DONE EL2HLT + +static inline int xlate_iter_errno(int ret) +{ + return ret == -ECHECK_ITER_DONE ? 0 : ret; +} + +#endif diff --git a/utils/src/check/log_trees.c b/utils/src/check/log_trees.c new file mode 100644 index 00000000..627052c7 --- /dev/null +++ b/utils/src/check/log_trees.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "iter.h" +#include "sns.h" +#include "log_trees.h" +#include "super.h" + +struct iter_args { + extent_cb_t cb; + void *cb_arg; +}; + +static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg) +{ + struct iter_args *ia = cb_arg; + struct scoutfs_log_trees *lt; + int ret; + + if (val_len != sizeof(struct scoutfs_log_trees)) + ; /* XXX */ + + lt = val; + + sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr)); + + sns_push("meta_avail", 0, 0); + ret = alloc_list_meta_iter(<->meta_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_freed", 0, 0); + ret = alloc_list_meta_iter(<->meta_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("item_root", 0, 0); + ret = btree_meta_iter(<->item_root, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + if (lt->bloom_ref.blkno) { + sns_push("bloom_ref", 0, 0); + ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg); + sns_pop(); + if (ret < 0) { + ret = xlate_iter_errno(ret); + goto out; + } + } + + sns_push("data_avail", 0, 0); + ret = alloc_root_meta_iter(<->data_avail, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_freed", 0, 0); + ret = alloc_root_meta_iter(<->data_freed, ia->cb, ia->cb_arg); + sns_pop(); + if (ret < 0) + goto out; + + ret = 0; +out: + 
sns_pop(); + + return ret; +} + +/* + * Call the callers callback with the extent of all the metadata block references contained + * in log btrees. We walk the logs_root btree items and walk all the metadata structures + * they reference. + */ +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg) +{ + struct scoutfs_super_block *super = global_super; + struct iter_args ia = { .cb = cb, .cb_arg = cb_arg }; + + return btree_item_iter(&super->logs_root, lt_meta_iter, &ia); +} diff --git a/utils/src/check/log_trees.h b/utils/src/check/log_trees.h new file mode 100644 index 00000000..7a7150b1 --- /dev/null +++ b/utils/src/check/log_trees.h @@ -0,0 +1,8 @@ +#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ +#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_ + +#include "extent.h" + +int log_trees_meta_iter(extent_cb_t cb, void *cb_arg); + +#endif diff --git a/utils/src/check/meta.c b/utils/src/check/meta.c new file mode 100644 index 00000000..40a2e5a5 --- /dev/null +++ b/utils/src/check/meta.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "bitmap.h" +#include "key.h" + +#include "alloc.h" +#include "btree.h" +#include "debug.h" +#include "extent.h" +#include "sns.h" +#include "log_trees.h" +#include "meta.h" +#include "problem.h" +#include "super.h" + +static struct meta_data { + struct extent_root meta_refed; + struct extent_root meta_free; + struct { + u64 ref_blocks; + u64 free_extents; + u64 free_blocks; + } stats; +} global_mdat; + +bool valid_meta_blkno(u64 blkno) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + + return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot; +} + +static bool valid_meta_extent(u64 start, u64 len) +{ + u64 tot = le64_to_cpu(global_super->total_meta_blocks); + bool valid; + + valid = len > 0 && + start >= SCOUTFS_META_DEV_START_BLKNO && + start < tot && + len <= tot && + ((start + len) <= tot) && + ((start + len) > start); + + 
debug("start %llu len %llu valid %u", start, len, !!valid); + + if (!valid) + problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len); + + return valid; +} + +/* + * Track references to individual metadata blocks. This uses the extent + * callback type but is only ever called for single block references. + * Any reference to a block that has already been referenced is + * considered invalid and is ignored. Later repair will resolve + * duplicate references. + */ +static int insert_meta_ref(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + /* this is tracking single metadata block references */ + if (len != 1) { + ret = -EINVAL; + goto out; + } + + if (valid_meta_blkno(start)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) + mdat->stats.ref_blocks++; + else if (ret == -EEXIST) + problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start); + } + +out: + return ret; +} + +static int insert_meta_free(u64 start, u64 len, void *arg) +{ + struct meta_data *mdat = &global_mdat; + struct extent_root *root = arg; + int ret = 0; + + if (valid_meta_extent(start, len)) { + ret = extent_insert_new(root, start, len); + if (ret == 0) { + mdat->stats.free_extents++; + mdat->stats.free_blocks++; + + } else if (ret == -EEXIST) { + problem(PB_META_FREE_OVERLAPS_EXISTING, + "start %llu llen %llu", start, len); + } + + } + + return ret; +} + +/* + * Walk all metadata references in the system. This walk doesn't need + * to read metadata that doesn't contain any metadata references so it + * can skip the bulk of metadata blocks. This gives us the set of + * referenced metadata blocks which we can then use to repair metadata + * allocator structures. 
+ */ +static int get_meta_refs(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_refed); + + /* XXX record reserved blocks around super as referenced */ + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("data_alloc", 1, 0); + ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_avail[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[0], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_meta_iter(&super->server_meta_freed[1], + insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("fs_root", 0, 0); + ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("logs_root", 0, 0); + ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("log_merge", 0, 0); + ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("mounted_clients", 0, 0); + ret = btree_meta_iter(&super->mounted_clients, 
insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("srch_root", 0, 0); + ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed); + sns_pop(); + if (ret < 0) + goto out; + + ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed); + if (ret < 0) + goto out; + + debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks); + ret = 0; +out: + return ret; +} + +static int get_meta_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct scoutfs_super_block *super = global_super; + int ret; + + extent_root_init(&mdat->meta_free); + + sns_push("meta_alloc", 0, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("meta_alloc", 1, 0); + ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_avail", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_avail[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 0, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[0], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + sns_push("server_meta_freed", 1, 0); + ret = alloc_list_extent_iter(&super->server_meta_freed[1], + insert_meta_free, &mdat->meta_free); + sns_pop(); + if (ret < 0) + goto out; + + debug("found %llu free metadata blocks in %llu extents", + mdat->stats.free_blocks, mdat->stats.free_extents); + ret = 0; +out: + return ret; +} + +/* + * All the space between referenced blocks must be recorded in the free + * extents. 
The free extent walk didn't check that the extents + * overlapped with references, we do that here. Remember that metadata + * block references were merged into extents here, the refed extents + * aren't necessarily all a single block. + */ +static int compare_refs_and_free(void) +{ + struct meta_data *mdat = &global_mdat; + struct extent_node *ref; + struct extent_node *free; + struct extent_node *next; + struct extent_node *prev; + u64 expect; + u64 start; + u64 end; + + expect = 0; + ref = extent_first(&mdat->meta_refed); + free = extent_first(&mdat->meta_free); + while (ref || free) { + + debug("exp %llu ref %llu.%llu free %llu.%llu", + expect, ref ? ref->start : 0, ref ? ref->len : 0, + free ? free->start : 0, free ? free->len : 0); + + /* referenced marked free, remove ref from free and continue from same point */ + if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) { + debug("ref extent %llu.%llu overlaps free %llu %llu", + ref->start, ref->len, free->start, free->len); + + start = max(ref->start, free->start); + end = min(ref->start + ref->len, free->start + free->len); + + prev = extent_prev(free); + + extent_remove(&mdat->meta_free, start, end - start); + + if (prev) + free = extent_next(prev); + else + free = extent_first(&mdat->meta_free); + continue; + } + + /* see which extent starts earlier */ + if (!free || (ref && ref->start <= free->start)) + next = ref; + else + next = free; + + /* untracked region before next extent */ + if (expect < next->start) { + debug("missing free extent %llu.%llu", expect, next->start - expect); + expect = next->start; + continue; + } + + + /* didn't overlap, advance past next extent */ + expect = next->start + next->len; + if (next == ref) + ref = extent_next(ref); + else + free = extent_next(free); + } + + return 0; +} + +/* + * Check the metadata allocators by comparing the set of referenced + * blocks with the set of free blocks that are stored in free btree + * items and alloc list 
blocks. + */ +int check_meta_alloc(void) +{ + int ret; + + ret = get_meta_refs(); + if (ret < 0) + goto out; + + ret = get_meta_free(); + if (ret < 0) + goto out; + + ret = compare_refs_and_free(); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} diff --git a/utils/src/check/meta.h b/utils/src/check/meta.h new file mode 100644 index 00000000..80c97a03 --- /dev/null +++ b/utils/src/check/meta.h @@ -0,0 +1,9 @@ +#ifndef _SCOUTFS_UTILS_CHECK_META_H_ +#define _SCOUTFS_UTILS_CHECK_META_H_ + +bool valid_meta_blkno(u64 blkno); + +int check_meta_alloc(void); + +#endif + diff --git a/utils/src/check/padding.c b/utils/src/check/padding.c new file mode 100644 index 00000000..81e12c33 --- /dev/null +++ b/utils/src/check/padding.c @@ -0,0 +1,23 @@ +#include +#include + +#include "util.h" +#include "padding.h" + +bool padding_is_zeros(const void *data, size_t sz) +{ + static char zeros[32] = {0,}; + const size_t batch = array_size(zeros); + + while (sz >= batch) { + if (memcmp(data, zeros, batch)) + return false; + data += batch; + sz -= batch; + } + + if (sz > 0 && memcmp(data, zeros, sz)) + return false; + + return true; +} diff --git a/utils/src/check/padding.h b/utils/src/check/padding.h new file mode 100644 index 00000000..9bf03a81 --- /dev/null +++ b/utils/src/check/padding.h @@ -0,0 +1,6 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_ +#define _SCOUTFS_UTILS_CHECK_PADDING_H_ + +bool padding_is_zeros(const void *data, size_t sz); + +#endif diff --git a/utils/src/check/problem.c b/utils/src/check/problem.c new file mode 100644 index 00000000..fd8d42a9 --- /dev/null +++ b/utils/src/check/problem.c @@ -0,0 +1,44 @@ +#include +#include + +#include "problem.h" + +#define PROB_STR(pb) [pb] = #pb +char *prob_strs[] = { + PROB_STR(PB_META_EXTENT_INVALID), + PROB_STR(PB_META_REF_OVERLAPS_EXISTING), + PROB_STR(PB_META_FREE_OVERLAPS_EXISTING), + PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL), + PROB_STR(PB_SB_HDR_CRC_INVALID), + PROB_STR(PB_SB_HDR_MAGIC_INVALID), + 
PROB_STR(PB_FS_IN_USE), + PROB_STR(PB_MOUNTED_CLIENTS_REF_BLKNO), + PROB_STR(PB_SB_BAD_FLAG), + PROB_STR(PB_SB_BAD_FMT_VERS), + PROB_STR(PB_QCONF_WRONG_VERSION), + PROB_STR(PB_QSLOT_BAD_FAM), + PROB_STR(PB_QSLOT_BAD_PORT), + PROB_STR(PB_QSLOT_NO_ADDR), + PROB_STR(PB_QSLOT_BAD_ADDR), + PROB_STR(PB_DATA_DEV_SB_INVALID), +}; + +static struct problem_data { + uint64_t counts[PB__NR]; + uint64_t count; +} global_pdat; + +void problem_record(prob_t pb) +{ + struct problem_data *pdat = &global_pdat; + + pdat->counts[pb]++; + pdat->count++; +} + +uint64_t problems_count(void) +{ + struct problem_data *pdat = &global_pdat; + + return pdat->count; +} diff --git a/utils/src/check/problem.h b/utils/src/check/problem.h new file mode 100644 index 00000000..6ac49bb5 --- /dev/null +++ b/utils/src/check/problem.h @@ -0,0 +1,38 @@ +#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_ +#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_ + +#include "debug.h" +#include "sns.h" + +typedef enum { + PB_META_EXTENT_INVALID, + PB_META_REF_OVERLAPS_EXISTING, + PB_META_FREE_OVERLAPS_EXISTING, + PB_BTREE_BLOCK_BAD_LEVEL, + PB_SB_HDR_CRC_INVALID, + PB_SB_HDR_MAGIC_INVALID, + PB_FS_IN_USE, + PB_MOUNTED_CLIENTS_REF_BLKNO, + PB_SB_BAD_FLAG, + PB_SB_BAD_FMT_VERS, + PB_QCONF_WRONG_VERSION, + PB_QSLOT_BAD_FAM, + PB_QSLOT_BAD_PORT, + PB_QSLOT_NO_ADDR, + PB_QSLOT_BAD_ADDR, + PB_DATA_DEV_SB_INVALID, + PB__NR, +} prob_t; + +extern char *prob_strs[]; + +#define problem(pb, fmt, ...) \ +do { \ + debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \ + problem_record(pb); \ +} while (0) + +void problem_record(prob_t pb); +uint64_t problems_count(void); + +#endif diff --git a/utils/src/check/sns.c b/utils/src/check/sns.c new file mode 100644 index 00000000..45f45453 --- /dev/null +++ b/utils/src/check/sns.c @@ -0,0 +1,118 @@ +#include +#include + +#include "sns.h" + +/* + * This "str num stack" is used to describe our location in metadata at + * any given time. 
+ * + * As we descend into structures we pop a string on decribing them, + * perhaps with associated numbers. Pushing and popping is very cheap + * and only rarely do we format the stack into a string, as an arbitrary + * example: + * super.fs_root.btree_parent:1231.btree_leaf:3231" + */ + +#define SNS_MAX_DEPTH 1000 +#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1)) + +static struct sns_data { + unsigned int depth; + + struct sns_entry { + char *str; + size_t len; + u64 a; + u64 b; + } ents[SNS_MAX_DEPTH]; + + char str[SNS_STR_SIZE]; + +} global_lsdat; + +void _sns_push(char *str, size_t len, u64 a, u64 b) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth < SNS_MAX_DEPTH) { + lsdat->ents[lsdat->depth++] = (struct sns_entry) { + .str = str, + .len = len, + .a = a, + .b = b, + }; + } +} + +void sns_pop(void) +{ + struct sns_data *lsdat = &global_lsdat; + + if (lsdat->depth > 0) + lsdat->depth--; +} + +static char *append_str(char *pos, char *str, size_t len) +{ + memcpy(pos, str, len); + return pos + len; +} + +/* + * This is not called for x = 0 so we don't need to emit an initial 0. + * We could by using do {} while instead of while {}. + */ +static char *append_u64x(char *pos, u64 x) +{ + static char hex[] = "0123456789abcdef"; + + while (x) { + *pos++ = hex[x & 0xf]; + x >>= 4; + } + + return pos; +} + +static char *append_char(char *pos, char c) +{ + *(pos++) = c; + return pos; +} + +/* + * Return a pointer to a null terminated string that describes the + * current location stack. The string buffer is global. 
+ */ +char *sns_str(void) +{ + struct sns_data *lsdat = &global_lsdat; + struct sns_entry *ent; + char *pos; + int i; + + pos = lsdat->str; + for (i = 0; i < lsdat->depth; i++) { + ent = &lsdat->ents[i]; + + if (i) + pos = append_char(pos, '.'); + + pos = append_str(pos, ent->str, ent->len); + + if (ent->a) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->a); + } + + if (ent->b) { + pos = append_char(pos, ':'); + pos = append_u64x(pos, ent->b); + } + } + + *pos = '\0'; + + return lsdat->str; +} diff --git a/utils/src/check/sns.h b/utils/src/check/sns.h new file mode 100644 index 00000000..34c1a2be --- /dev/null +++ b/utils/src/check/sns.h @@ -0,0 +1,20 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_ +#define _SCOUTFS_UTILS_CHECK_SNS_H_ + +#include + +#include "sparse.h" + +#define SNS_MAX_STR_LEN 20 + +#define sns_push(str, a, b) \ +do { \ + build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \ + _sns_push((str), sizeof(str) - 1, a, b); \ +} while (0) + +void _sns_push(char *str, size_t len, u64 a, u64 b); +void sns_pop(void); +char *sns_str(void); + +#endif diff --git a/utils/src/check/super.c b/utils/src/check/super.c new file mode 100644 index 00000000..e3c14fae --- /dev/null +++ b/utils/src/check/super.c @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "crc.h" + +#include "block.h" +#include "super.h" +#include "problem.h" + +/* + * After we check the super blocks we provide a global buffer to track + * the current super block. It is referenced to get static information + * about the system and is also modified and written as part of + * transactions. + */ +struct scoutfs_super_block *global_super; + +/* + * Check superblock crc. We can't use global_super here since it's not the + * whole block itself, but only the struct scoutfs_super_block, so it needs + * to reload a copy here. 
+ */ +int check_super_crc(void) +{ + struct scoutfs_super_block *super = NULL; + struct scoutfs_block_header *hdr; + struct block *blk = NULL; + u32 crc; + int ret; + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_DIRTY); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + return ret; + } + + super = block_buf(blk); + crc = crc_block((struct scoutfs_block_header *)super, block_size(blk)); + hdr = &global_super->hdr; + debug("superblock crc 0x%04x calculated 0x%04x " "%s", le32_to_cpu(hdr->crc), crc, le32_to_cpu(hdr->crc) == crc ? "(match)" : "(mismatch)"); + + if (crc != le32_to_cpu(hdr->crc)) + problem(PB_SB_HDR_CRC_INVALID, "crc 0x%04x calculated 0x%04x", le32_to_cpu(hdr->crc), crc); + block_put(&blk); + + return 0; +} + +/* + * Crude check for the unlikely cases where the fs appears to still be mounted. + */ +int check_super_in_use(int meta_fd) +{ + int ret = meta_super_in_use(meta_fd, global_super); + debug("meta_super_in_use ret %d", ret); + + if (ret < 0) + problem(PB_FS_IN_USE, "File system appears in use. ret %d", ret); + + debug("global_super->mounted_clients.ref.blkno 0x%08llx", global_super->mounted_clients.ref.blkno); + if (global_super->mounted_clients.ref.blkno != 0) + problem(PB_MOUNTED_CLIENTS_REF_BLKNO, "Mounted clients ref blkno 0x%08llx", + global_super->mounted_clients.ref.blkno); + + return ret; +} + +/* + * quick glance data device superblock checks. + * + * -EIO for crc failures, all others -EINVAL + * + * caller must have run check_supers() first so that global_super is + * setup, so that we can cross-ref to it. 
+ */ +static int check_data_super(int data_fd) +{ + struct scoutfs_super_block *super = NULL; + char *buf; + int ret = 0; + u32 crc; + ssize_t size = SCOUTFS_BLOCK_SM_SIZE; + off_t off = SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT; + + buf = aligned_alloc(4096, size); /* XXX static alignment :/ */ + if (!buf) + return -ENOMEM; + + memset(buf, 0, size); + + if (lseek(data_fd, off, SEEK_SET) != off) + return -errno; + + if (read(data_fd, buf, size) < 0) { + ret = -errno; + goto out; + } + + super = (struct scoutfs_super_block *)buf; + + crc = crc_block((struct scoutfs_block_header *)buf, size); + + debug("data fsid 0x%016llx", le64_to_cpu(super->hdr.fsid)); + debug("data super magic 0x%04x", super->hdr.magic); + debug("data crc calc 0x%08x exp 0x%08x %s", crc, le32_to_cpu(super->hdr.crc), + crc == le32_to_cpu(super->hdr.crc) ? "(match)" : "(mismatch)"); + debug("data flags %llu fmt_vers %llu", le64_to_cpu(super->flags), le64_to_cpu(super->fmt_vers)); + + if (crc != le32_to_cpu(super->hdr.crc)) + /* tis but a scratch */ + ret = -EIO; + + if (le64_to_cpu(super->hdr.fsid) != le64_to_cpu(global_super->hdr.fsid)) + /* mismatched data bdev? not good */ + ret = -EINVAL; + + if (le32_to_cpu(super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + /* fsid matched but not a superblock? yikes */ + ret = -EINVAL; + + if (le64_to_cpu(super->flags) != 0) /* !SCOUTFS_FLAG_IS_META_BDEV */ + ret = -EINVAL; + + if ((le64_to_cpu(super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + ret = -EINVAL; + + if (ret != 0) + problem(PB_DATA_DEV_SB_INVALID, "data device is invalid or corrupt (%d)", ret); +out: + free(buf); + return ret; +} + +/* + * After checking the supers we save a copy of it in a global buffer that's used by + * other modules to track the current super. It can be modified and written during commits. 
+ */ +int check_supers(int data_fd) +{ + struct scoutfs_super_block *super = NULL; + struct block *blk = NULL; + struct scoutfs_quorum_slot* slot = NULL; + struct in_addr in; + uint16_t family; + uint16_t port; + int ret; + + sns_push("supers", 0, 0); + + global_super = malloc(sizeof(struct scoutfs_super_block)); + if (!global_super) { + fprintf(stderr, "error allocating super block buffer\n"); + ret = -ENOMEM; + goto out; + } + + ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM); + if (ret < 0) { + fprintf(stderr, "error reading super block\n"); + goto out; + } + + ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER); + + super = block_buf(blk); + + if (ret < 0) { + /* */ + if (ret == -EINVAL) { + /* that's really bad */ + fprintf(stderr, "superblock invalid magic\n"); + goto out; + } else if (ret == -EIO) + /* just report/count a CRC error */ + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + } + + memcpy(global_super, super, sizeof(struct scoutfs_super_block)); + + debug("Superblock flag: %llu", global_super->flags); + if (le64_to_cpu(global_super->flags) != SCOUTFS_FLAG_IS_META_BDEV) + problem(PB_SB_BAD_FLAG, "Bad flag: %llu expecting: 1 or 0", global_super->flags); + + debug("Superblock fmt_vers: %llu", le64_to_cpu(global_super->fmt_vers)); + if ((le64_to_cpu(global_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) || + (le64_to_cpu(global_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX)) + problem(PB_SB_BAD_FMT_VERS, "Bad fmt_vers: %llu outside supported range (%d-%d)", + le64_to_cpu(global_super->fmt_vers), SCOUTFS_FORMAT_VERSION_MIN, + SCOUTFS_FORMAT_VERSION_MAX); + + debug("Quorum Config Version: %llu", global_super->qconf.version); + if (le64_to_cpu(global_super->qconf.version) != 1) + problem(PB_QCONF_WRONG_VERSION, "Wrong Version: %llu (expected 1)", global_super->qconf.version); + + for (int i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + slot = 
&global_super->qconf.slots[i]; + family = le16_to_cpu(slot->addr.v4.family); + port = le16_to_cpu(slot->addr.v4.port); + in.s_addr = le32_to_cpu(slot->addr.v4.addr); + + if (family == SCOUTFS_AF_NONE) { + debug("Quorum slot %u is empty", i); + continue; + } + + debug("Quorum slot %u family: %u, port: %u, address: %s", i, family, port, inet_ntoa(in)); + if (family != SCOUTFS_AF_IPV4) + problem(PB_QSLOT_BAD_FAM, "Quorum Slot %u doesn't have valid address", i); + + if (port == 0) + problem(PB_QSLOT_BAD_PORT, "Quorum Slot %u has bad port", i); + + if (!in.s_addr) { + problem(PB_QSLOT_NO_ADDR, "Quorum Slot %u has not been assigned ipv4 address", i); + } else if (!(in.s_addr & 0xff000000)) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } else if ((in.s_addr & 0xff) == 0xff) { + problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i); + } + } + + debug("super magic 0x%04x", global_super->hdr.magic); + if (le32_to_cpu(global_super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER) + problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x", + global_super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER); + + /* `scoutfs image` command doesn't open data_fd */ + if (data_fd < 0) + ret = 0; + else + ret = check_data_super(data_fd); +out: + block_put(&blk); + + sns_pop(); + + return ret; +} + +void super_shutdown(void) +{ + free(global_super); +} diff --git a/utils/src/check/super.h b/utils/src/check/super.h new file mode 100644 index 00000000..f14417ba --- /dev/null +++ b/utils/src/check/super.h @@ -0,0 +1,12 @@ +#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_ +#define _SCOUTFS_UTILS_CHECK_SUPER_H_ + +extern struct scoutfs_super_block *global_super; + +int check_super_crc(); +int check_supers(int data_fd); +int super_commit(void); +int check_super_in_use(int meta_fd); +void super_shutdown(void); + +#endif diff --git a/utils/src/parallel_restore.c b/utils/src/parallel_restore.c new file mode 100644 index 00000000..4a1ab3fd --- 
/dev/null +++ b/utils/src/parallel_restore.c @@ -0,0 +1,1986 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "util.h" +#include "format.h" +#include "crc.h" +#include "rand.h" +#include "key.h" +#include "bitops.h" +#include "btree.h" +#include "leaf_item_hash.h" +#include "name_hash.h" +#include "mode_types.h" +#include "srch.h" +#include "bloom.h" + +#include "parallel_restore.h" + +#include "list.h" +#include "lk_rbtree_wrapper.h" + +/* + * XXX + * - interface versioning? + * - next seq and next ino are both max ino + 1 + * - fix writer builder layout to match super, users except for build order + * - look into zeroing buffers consistently + * - init_alb looks weird? naming consistency? + * - make sure inode_count makes sense (fs root, log deltas) + * - audit file types + */ + +#define dprintf(fmt, args...) \ +do { \ + if (0) \ + printf(fmt, ##args); \ +} while (0) + +struct btree_item { + struct rb_node node; + struct scoutfs_key key; + unsigned int val_len; + void *val; +}; + +struct srch_node { + struct rb_node node; + u64 hash; + u64 ino; + u64 id; +}; + +struct block_builder; +typedef bool (*bld_empty_t)(struct block_builder *bld); +typedef void (*bld_reset_t)(struct block_builder *bld); +typedef spr_err_t (*bld_build_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno); +typedef spr_err_t (*bld_post_t)(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld); + +struct block_builder { + struct list_head head; + bld_empty_t empty; + bld_reset_t reset; + bld_build_t build; + bld_post_t post; +}; + +struct btree_builder { + struct block_builder bld; + + /* track all items */ + u64 total_items; + /* track total length of extent items */ + u64 total_len; + + /* eventual root that references built blocks */ + struct scoutfs_btree_root btroot; + + /* blocks are 
built as levels accumulate sufficient items */ + struct { + struct rb_root root; + unsigned long nr; + } items[SCOUTFS_BTREE_MAX_HEIGHT]; +}; + +struct alloc_list_builder { + struct block_builder bld; + u64 start; + u64 len; + struct scoutfs_alloc_list_head lhead; +}; + +/* + * srch parent radix fanout is really wide, it doesn't take many to have + * 2^64 bytes in entry blocks. + */ +#define MAX_SRCH_HEIGHT 6 + +struct srch_builder { + struct block_builder bld; + + /* accumulates blocks/entries as we build */ + struct scoutfs_srch_file sfl; + + /* no parents at level 0, [0] never used */ + u64 total_parent_refs; + struct { + struct scoutfs_block_ref *refs; + unsigned long nr; + } parents[MAX_SRCH_HEIGHT]; + + struct rb_root entries; +}; + +struct bloom_builder { + struct block_builder bld; + struct scoutfs_bloom_block *bloom; +}; + +struct scoutfs_parallel_restore_writer { + u64 inode_count; + u64 max_ino; + + __le64 fsid; + u64 meta_start; + u64 meta_len; + struct list_head meta_extents; + + struct list_head builders; + struct btree_builder meta_btb[2]; + struct btree_builder data_btb; + struct alloc_list_builder meta_alb[2]; + struct btree_builder root_btb; + struct btree_builder fs_btb; + struct btree_builder srch_btb; + struct btree_builder log_btb; + + struct srch_builder srch_sbld; + struct bloom_builder bloom_bbld; + + struct scoutfs_btree_root root_items; + struct scoutfs_super_block super; +}; + +struct extent_head { + struct list_head head; + u64 start; + u64 len; +}; + +static void init_builder(struct block_builder *bld, bld_empty_t empty, bld_reset_t reset, + bld_build_t build) +{ + INIT_LIST_HEAD(&bld->head); + bld->empty = empty; + bld->reset = reset; + bld->build = build; + bld->post = NULL; +} + +static spr_err_t meta_alloc_add(struct scoutfs_parallel_restore_writer *wri, + u64 start, u64 len) +{ + struct extent_head *eh; + + if (len == 0) + return 0; + + if (wri->meta_len == 0) { + wri->meta_start = start; + wri->meta_len = len; + } else { + eh = 
malloc(sizeof(struct extent_head)); + if (!eh) + return ENOMEM; + eh->start = start; + eh->len = len; + list_add_tail(&eh->head, &wri->meta_extents); + } + + return 0; +} + +static spr_err_t meta_alloc_contig(struct scoutfs_parallel_restore_writer *wri, + u64 prev, u64 *blkno_ret) +{ + struct extent_head *eh; + + if (prev && wri->meta_len && (wri->meta_start != prev + 1)) { + *blkno_ret = 0; + return 0; + } + + if (!wri->meta_len) { + *blkno_ret = 0; + return ENOSPC; + } + + *blkno_ret = wri->meta_start++; + + if (--wri->meta_len == 0 && !list_empty(&wri->meta_extents)) { + eh = list_entry(wri->meta_extents.next, struct extent_head, head); + wri->meta_start = eh->start; + wri->meta_len = eh->len; + free(eh); + } + + return 0; +} + +static spr_err_t bti_alloc(int val_len, struct btree_item **bti_ret) +{ + struct btree_item *bti; + spr_err_t err; + + bti = malloc(sizeof(struct btree_item) + val_len); + if (bti) { + bti->val = (void *)(bti + 1); + bti->val_len = val_len; + err = 0; + } else { + err = ENOMEM; + } + + *bti_ret = bti; + return err; +} + +static struct btree_item *bti_walk(struct rb_root *root, struct scoutfs_key *key, + struct btree_item *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct btree_item *found = NULL; + struct btree_item *bti; + int cmp; + + while (*node) { + parent = *node; + bti = container_of(*node, struct btree_item, node); + + cmp = scoutfs_key_compare(key, &bti->key); + if (cmp < 0) { + node = &(*node)->rb_left; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } else { + found = bti; + break; + } + } + + if (ins && !found) { + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + } + + return found; +} + +static struct btree_item *node_bti(struct rb_node *node) +{ + return node ? 
container_of(node, struct btree_item, node) : NULL; +} + +static struct btree_item *bti_first(struct rb_root *root) +{ + return node_bti(rb_first(root)); +} + +static struct btree_item *bti_next(struct btree_item *bti) +{ + return bti ? node_bti(rb_next(&bti->node)) : NULL; +} + +#define for_each_bti_safe(root, bti, tmp) \ + for (bti = bti_first(root); bti && ((tmp = bti_next(bti)), 1); bti = tmp) + +/* + * It's always an error to try and insert a key that was already tracked + * in a btree level. + */ +static spr_err_t btb_insert(struct btree_builder *btb, struct btree_item *bti, int level) +{ + struct btree_item *found; + + found = bti_walk(&btb->items[level].root, &bti->key, bti); + if (found) { + return EEXIST; + } else { + btb->items[level].nr++; + btb->total_items++; + return 0; + } +} + +static void btb_erase(struct btree_builder *btb, struct btree_item *bti, int level) +{ + rb_erase(&bti->node, &btb->items[level].root); + btb->items[level].nr--; + btb->total_items--; +} + +static void btb_destroy(struct btree_builder *btb) +{ + struct btree_item *bti; + struct btree_item *tmp; + int i; + + for (i = 0; i < array_size(btb->items); i++) { + for_each_bti_safe(&btb->items[i].root, bti, tmp) { + btb_erase(btb, bti, i); + free(bti); + } + } +} + +static void init_key(struct scoutfs_key *key, u8 zone, u8 type, u64 first, u64 second, + u64 third, u8 fourth) +{ + key->_sk_first = cpu_to_le64(first); + key->_sk_second = cpu_to_le64(second); + key->_sk_third = cpu_to_le64(third); + key->_sk_fourth = fourth; + key->sk_zone = zone; + key->sk_type = type; + memset(&key->__pad, 0, sizeof(key->__pad)); +} + +static u64 free_extent_order(u64 len) +{ + return (fls64(len | 1) - 1) / 3; +} + +static int insert_free_items(struct btree_builder *btb, u64 start, u64 len) +{ + struct scoutfs_key keys[2]; + struct btree_item *bti; + spr_err_t err; + u64 order; + u64 end; + int i; + + end = start + len - 1; + order = U64_MAX - free_extent_order(len); + + init_key(&keys[0], 
SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, end, len, 0, 0); + init_key(&keys[1], SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, order, end, len, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(0, &bti); + if (err) + goto out; + + bti->key = keys[i]; + + err = btb_insert(btb, bti, 0); + if (err) { + free(bti); + goto out; + } + } + + btb->total_len += len; + + err = 0; +out: + return err; +} + +static void set_alloc_root(struct scoutfs_alloc_root *root, struct btree_builder *btb) +{ + root->total_len = cpu_to_le64(btb->total_len); + root->flags = 0; + root->_pad = 0; + root->root = btb->btroot; +} + +static spr_err_t map_start_key(struct scoutfs_key *start, struct scoutfs_key *key) +{ + if (key->sk_zone == SCOUTFS_FS_ZONE) { + init_key(start, SCOUTFS_FS_ZONE, 0, + le64_to_cpu(key->_sk_first) & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK, + 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_XATTR_TOTL_ZONE) { + init_key(start, SCOUTFS_XATTR_TOTL_ZONE, 0, 0, 0, 0, 0); + + } else if (key->sk_zone == SCOUTFS_INODE_INDEX_ZONE) { + init_key(start, SCOUTFS_INODE_INDEX_ZONE, 0, 0, + le64_to_cpu(key->_sk_second) & ~(u64)SCOUTFS_LOCK_SEQ_GROUP_MASK, + 0, 0); + } else if (key->sk_zone == SCOUTFS_QUOTA_ZONE) { + init_key(start, SCOUTFS_QUOTA_ZONE, 0, 0, 0, 0, 0); + } else { + return EINVAL; + } + + return 0; +} + +static spr_err_t update_bloom(struct bloom_builder *bbld, struct scoutfs_key *key) +{ + struct scoutfs_bloom_block *bb = bbld->bloom; + unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS]; + struct scoutfs_key start; + spr_err_t err; + int i; + + err = map_start_key(&start, key); + if (err) + goto out; + + calc_bloom_nrs(&start, nrs); + + for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) { + if (!test_and_set_bit_le(nrs[i], bb->bits)) + le64_add_cpu(&bb->total_set, 1); + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_fs_item(struct scoutfs_parallel_restore_writer *wri, + struct btree_item *bti) +{ + spr_err_t err; + + if (bti->key.sk_zone == SCOUTFS_FS_ZONE && 
bti->key.sk_type == SCOUTFS_INODE_TYPE && + le64_to_cpu(bti->key.ski_ino) == SCOUTFS_ROOT_INO) { + err = btb_insert(&wri->root_btb, bti, 0); + } else { + err = btb_insert(&wri->fs_btb, bti, 0) ?: + update_bloom(&wri->bloom_bbld, &bti->key); + } + + return err; +} + +static spr_err_t insert_entry_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + struct scoutfs_dirent *dent = NULL; + struct scoutfs_key keys[3]; + struct btree_item *bti; + unsigned int bytes; + spr_err_t err = 0; + u64 dir_ino; + u64 hash; + u64 ino; + u64 pos; + int i; + + bytes = offsetof(struct scoutfs_dirent, name[entry->name_len]); + dent = malloc(bytes); + if (!dent) { + err = ENOMEM; + goto out; + } + + dir_ino = entry->dir_ino; + ino = entry->ino; + hash = dirent_name_hash(entry->name, entry->name_len); + pos = entry->pos; + + dent->ino = cpu_to_le64(ino); + dent->hash = cpu_to_le64(hash); + dent->pos = cpu_to_le64(pos); + dent->type = mode_to_type(entry->mode); + memset(&dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent->name, entry->name, entry->name_len); + + init_key(&keys[0], SCOUTFS_FS_ZONE, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos, 0); + init_key(&keys[1], SCOUTFS_FS_ZONE, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0, 0); + init_key(&keys[2], SCOUTFS_FS_ZONE, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos, 0); + + for (i = 0; i < array_size(keys); i++) { + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = keys[i]; + memcpy(bti->val, dent, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + free(dent); + return err; +} + +static spr_err_t insert_extent_item(struct scoutfs_parallel_restore_writer *wri, u64 ino, u64 len) +{ + struct scoutfs_data_extent_val *dv; + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_DATA_EXTENT_TYPE, ino, 0 + len - 1, len, 0); + + err = bti_alloc(sizeof(struct 
scoutfs_data_extent_val), &bti); + if (!err) { + bti->key = key; + dv = bti->val; + dv->blkno = 0; + dv->flags = SEF_OFFLINE; + + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return err; +} + +/* + * We're trusting that the caller hasn't made up garbage xattrs. + * All we have to do is check for the scoutfs prefix and then + * identify the sequence of known tags. There can be a lot more + * xattrs than files so this is a surprisingly hot path. + */ +#define HIDE_BE32 cpu_to_be32(0x68696465) +#define SRCH_BE32 cpu_to_be32(0x73726368) +#define TOTL_BE32 cpu_to_be32(0x746f746c) +#define TAG_LEN 5 +#define XTAG_SRCH (1 << 1) +#define XTAG_TOTL (1 << 2) +static int get_xattr_tags(char *name, int name_len) +{ + static const char prefix[] = "scoutfs."; + static const size_t prefix_len = array_size(prefix) - 1; + __be32 betag; + int xtags = 0; + + if (name_len < prefix_len || strncmp(name, prefix, prefix_len)) + return 0; + + name += prefix_len; + name_len -= prefix_len; + + while (name_len >= TAG_LEN && name[TAG_LEN - 1] == '.') { + memcpy(&betag, name, sizeof(betag)); + + dprintf("tag 0x%08x\n", be32_to_cpu(betag)); + + if (betag == HIDE_BE32) + ; + else if (betag == SRCH_BE32) + xtags |= XTAG_SRCH; + else if (betag == TOTL_BE32) + xtags |= XTAG_TOTL; + else + break; + + name += TAG_LEN; + name_len -= TAG_LEN; + } + + dprintf("xat name %.*s tags 0x%x\n", name_len, name, xtags); + + return xtags; +} + +static spr_err_t insert_xattr_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr, u32 hash) +{ + struct scoutfs_xattr xat; + struct iovec value[3] = { + { &xat, sizeof(xat) }, + { xattr->name, xattr->name_len, }, + { xattr->value, xattr->value_len, }, + }; + struct iovec *iov = value; + struct scoutfs_key key; + struct btree_item *bti; + unsigned int total; + unsigned int bytes; + unsigned int piece; + spr_err_t err; + char *buf; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_XATTR_TYPE, xattr->ino, hash, 
xattr->pos, 0); + total = value[0].iov_len + value[1].iov_len + value[2].iov_len; + + xat.val_len = cpu_to_le16(xattr->value_len); + xat.name_len = xattr->name_len; + memset(xat.__pad, 0, sizeof(xat.__pad)); + + while (total > 0) { + bytes = min(total, SCOUTFS_XATTR_MAX_PART_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + buf = bti->val; + + while (bytes) { + piece = min(bytes, iov->iov_len); + memcpy(buf, iov->iov_base, piece); + buf += piece; + bytes -= piece; + total -= piece; + iov->iov_base += piece; + iov->iov_len -= piece; + if (iov->iov_len == 0) + iov++; /* falls off array when done */ + } + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + key._sk_fourth++; + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_symlink_items(struct scoutfs_parallel_restore_writer *wri, + u64 ino, char *target, int target_len) +{ + struct scoutfs_key key; + struct btree_item *bti; + spr_err_t err; + int bytes; + int off = 0; + + init_key(&key, SCOUTFS_FS_ZONE, SCOUTFS_SYMLINK_TYPE, ino, 0, 0, 0); + + while (off < target_len) { + bytes = min(target_len - off, SCOUTFS_MAX_VAL_SIZE); + + err = bti_alloc(bytes, &bti); + if (err) + goto out; + + bti->key = key; + memcpy(bti->val, target + off, bytes); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + + off += bytes; + le64_add_cpu(&key._sk_second, 1); + } + + err = 0; +out: + return err; +} + +/* forbid the leading + that strtoull allows */ +static spr_err_t totl_strtoull(char *s, int len, unsigned long long *res) +{ + char str[SCOUTFS_XATTR_MAX_TOTL_U64 + 1]; + + if (len <= 0 || len >= array_size(str) || s[0] == '+') + return EINVAL; + + memcpy(str, s, len); + str[len] = '\0'; + + errno = 0; + *res = strtoull(str, NULL, 0); + return errno; +} + +/* + * .totl. xattrs turn into items with the key based on dotted u64s at the end of the + * name and a value in the .. value. 
+ */ +static spr_err_t insert_totl_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + static const char prefix[] = "scoutfs.totl."; + static const int prefix_len = sizeof(prefix) - 1; + struct scoutfs_xattr_totl_val *found_tval; + struct scoutfs_xattr_totl_val *tval; + struct btree_item *found; + struct btree_item *bti; + unsigned long long longs[3]; + unsigned long long v; + spr_err_t err; + int nr = 0; + int prev; + int i; + + prev = xattr->name_len; + for (i = xattr->name_len - 1; i > prefix_len; i--) { + if (xattr->name[i] == '.') { + err = totl_strtoull(&xattr->name[i + 1], prev - (i + 1), &longs[nr]); + if (err) + goto out; + if (++nr == array_size(longs)) + break; + prev = i; + } + } + if (nr != array_size(longs)) { + err = EINVAL; + goto out; + } + + err = totl_strtoull(xattr->value, xattr->value_len, &v); + if (err) + goto out; + + if (v == 0) { + err = 0; + goto out; + } + + err = bti_alloc(sizeof(struct scoutfs_xattr_totl_val), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_XATTR_TOTL_ZONE, 0, longs[2], longs[1], longs[0], 0); + tval = bti->val; + tval->total = cpu_to_le64(v); + tval->count = cpu_to_le64(1); + + found = bti_walk(&wri->fs_btb.items[0].root, &bti->key, NULL); + if (found) { + found_tval = found->val; + le64_add_cpu(&found_tval->total, le64_to_cpu(tval->total)); + le64_add_cpu(&found_tval->count, le64_to_cpu(tval->count)); + if (found_tval->total == 0) + btb_erase(&wri->fs_btb, found, 0); + free(bti); + } else { + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } + } + + err = 0; +out: + return err; +} + +static spr_err_t insert_inode_index_item(struct scoutfs_parallel_restore_writer *wri, + u8 type, u64 major, u64 ino) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(0, &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_INODE_INDEX_ZONE, type, 0, major, ino, 0); + err = insert_fs_item(wri, bti); + if (err) + free(bti); + } + + return 
err; +} + +static spr_err_t insert_inode_items(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + struct scoutfs_inode *si; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_inode), &bti); + if (err) + goto out; + + init_key(&bti->key, SCOUTFS_FS_ZONE, SCOUTFS_INODE_TYPE, inode->ino, 0, 0, 0); + + si = bti->val; + + si->size = 0; + si->meta_seq = cpu_to_le64(inode->meta_seq); + si->data_seq = cpu_to_le64(inode->data_seq); + si->data_version = 0; + si->online_blocks = 0; + si->offline_blocks = 0; + si->next_readdir_pos = 0; + si->next_xattr_id = cpu_to_le64(inode->nr_xattrs + 1); + si->version = cpu_to_le64(1); + si->nlink = cpu_to_le32(inode->nlink ? inode->nlink : 1); + si->uid = cpu_to_le32(inode->uid); + si->gid = cpu_to_le32(inode->gid); + si->mode = cpu_to_le32(inode->mode); + si->flags = 0; + si->flags = cpu_to_le32(inode->flags); + si->atime.sec = cpu_to_le64(inode->atime.tv_sec); + si->atime.nsec = cpu_to_le32(inode->atime.tv_nsec); + si->ctime.sec = cpu_to_le64(inode->ctime.tv_sec); + si->ctime.nsec = cpu_to_le32(inode->ctime.tv_nsec); + si->mtime.sec = cpu_to_le64(inode->mtime.tv_sec); + si->mtime.nsec = cpu_to_le32(inode->mtime.tv_nsec); + si->crtime.sec = cpu_to_le64(inode->crtime.tv_sec); + si->crtime.nsec = cpu_to_le32(inode->crtime.tv_nsec); + si->proj = cpu_to_le64(inode->proj); + + /* XXX make sure this works across all el7/8/9 due to glibc magic */ + si->rdev = (inode->rdev & 0xff) | ((inode->rdev & 0xffffff00) << 12); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_META_SEQ_TYPE, + le64_to_cpu(si->meta_seq), inode->ino); + if (err) + goto out; + + if (S_ISREG(inode->mode)) { + si->size = cpu_to_le64(inode->size); + si->data_version = cpu_to_le64(inode->data_version); + + err = insert_inode_index_item(wri, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, + le64_to_cpu(si->data_seq), inode->ino); + if (err) + goto out; + + if (inode->offline) { + si->offline_blocks = 
cpu_to_le64(DIV_ROUND_UP(inode->size, + SCOUTFS_BLOCK_SM_SIZE)); + err = insert_extent_item(wri, inode->ino, le64_to_cpu(si->offline_blocks)); + if (err) + goto out; + } + + } else if (S_ISDIR(inode->mode)) { + si->size = cpu_to_le64(inode->total_entry_name_bytes); + si->next_readdir_pos = cpu_to_le64(SCOUTFS_DIRENT_FIRST_POS + inode->nr_subdirs); + si->nlink = cpu_to_le32(2 + inode->nr_subdirs); + + } else if (S_ISLNK(inode->mode)) { + si->size = cpu_to_le64(inode->target_len); + + err = insert_symlink_items(wri, inode->ino, inode->target, inode->target_len); + if (err) + goto out; + } + + err = insert_fs_item(wri, bti); +out: + return err; +} + +static spr_err_t insert_log_trees_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + struct scoutfs_log_trees *lt; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_log_trees), &bti); + if (err) + goto out; + + lt = bti->val; + memset(lt, 0, sizeof(struct scoutfs_log_trees)); + lt->item_root = prog->fs_items; + lt->bloom_ref = prog->bloom_ref; + /* lt srch_file is blank once finalized, moved to srch_root items */ + lt->inode_count_delta = prog->inode_count; + lt->get_trans_seq = cpu_to_le64(1); + lt->commit_trans_seq = cpu_to_le64(1); + lt->max_item_seq = cpu_to_le64(1); + lt->finalize_seq = cpu_to_le64(1); + lt->rid = prog->max_ino; + lt->nr = cpu_to_le64(1); + lt->flags = cpu_to_le64(SCOUTFS_LOG_TREES_FINALIZED); + + init_key(&bti->key, SCOUTFS_LOG_TREES_ZONE, 0, + le64_to_cpu(lt->rid), le64_to_cpu(lt->nr), 0, 0); + + err = btb_insert(&wri->log_btb, bti, 0); +out: + return err; +} + +static spr_err_t insert_srch_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_srch_file *sfl) +{ + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_srch_file), &bti); + if (!err) { + init_key(&bti->key, SCOUTFS_SRCH_ZONE, SCOUTFS_SRCH_BLOCKS_TYPE, + 0, le64_to_cpu(sfl->blocks), 
le64_to_cpu(sfl->ref.blkno), 0); + memcpy(bti->val, sfl, sizeof(struct scoutfs_srch_file)); + err = btb_insert(&wri->srch_btb, bti, 0); + } + + return err; +} + +static spr_err_t insert_quota_item(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + struct scoutfs_quota_rule_val *rv; + struct btree_item *bti; + spr_err_t err; + + err = bti_alloc(sizeof(struct scoutfs_quota_rule_val), &bti); + if (err) + goto out; + + rv = bti->val; + memset(rv, 0, sizeof(struct scoutfs_quota_rule_val)); + rv->limit = cpu_to_le64(rule->limit); + rv->prio = rule->prio; + rv->op = rule->op; + rv->rule_flags = rule->rule_flags; + rv->name_val[0] = cpu_to_le64(rule->names[0].val); + rv->name_source[0] = rule->names[0].source; + rv->name_flags[0] = rule->names[0].flags; + rv->name_val[1] = cpu_to_le64(rule->names[1].val); + rv->name_source[1] = rule->names[1].source; + rv->name_flags[1] = rule->names[1].flags; + rv->name_val[2] = cpu_to_le64(rule->names[2].val); + rv->name_source[2] = rule->names[2].source; + rv->name_flags[2] = rule->names[2].flags; + memset(&rv->_pad, 0, sizeof(rv->_pad)); + + init_key(&bti->key, SCOUTFS_QUOTA_ZONE, SCOUTFS_QUOTA_RULE_TYPE, + 0, scoutfs_hash64(rv, sizeof(struct scoutfs_quota_rule_val)), 0, 0); + + err = insert_fs_item(wri, bti); + if (err) { + free(bti); + goto out; + } +out: + return err; +} + +#define UNLINKED_AVL_HEIGHT 255 + +static void link_avl_nodes(struct scoutfs_btree_block *bt, __le16 *parent, __le16 parent_off, + u8 height, int first, int last) +{ + int ind = (first + last) / 2; + struct scoutfs_avl_node *node = &bt->items[ind].node; + u64 off = (long)node - (long)&bt->item_root; + + dprintf("first %d ind %d last %d height %u\n", first, ind, last, height); + + if (ind < first || ind > last || node->height != UNLINKED_AVL_HEIGHT) + return; + + *parent = cpu_to_le16(off); + node->parent = parent_off; + node->height = height; + node->left = 0; + node->right = 0; + memset(node->__pad, 0, 
sizeof(node->__pad)); + + if (height > 1) { + link_avl_nodes(bt, &node->left, cpu_to_le16(off), height - 1, first, ind - 1); + link_avl_nodes(bt, &node->right, cpu_to_le16(off), height - 1, ind + 1, last); + } +} + +#define DEFINE_BUILDER_CONTAINER(type, name, ptr) \ + type *name = container_of(ptr, type, bld) + +static bool btree_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + + return btb->total_items == 0; +} + +static void btree_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + + btb->total_items = 0; + btb->total_len = 0; + memset(&btb->btroot, 0, sizeof(btb->btroot)); +} + +/* + * Incrementally build btrees. By the time we're called the builder has + * all the sorted leaf items in an rbtree at their level. We stream + * them into blocks and store parent items at the next highest level. + * Once we're out of leaf items we stream the parent items into blocks + * and store their parent items at the next highest level. Eventually + * we drain all the items and are left with the root's reference to the + * first block in the tree. 
+ */ +static spr_err_t build_btree_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct btree_builder, btb, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref *ref; + struct btree_item *bti; + struct btree_item *tmp; + unsigned long val_align; + unsigned long bytes; + unsigned long nr; + unsigned long min_items; + long item_bytes_after_block; + void *val_buf; + spr_err_t err; + u8 height; + int level; + int i; + + /* find next highest level to build items from */ + for (i = 0; i < SCOUTFS_BTREE_MAX_HEIGHT; i++) { + if (btb->items[i].nr == 0) + continue; + + level = i; + break; + } + + /* shouldn't be possible */ + if (i >= SCOUTFS_BTREE_MAX_HEIGHT) { + err = ENOBUFS; + goto out; + } + + dprintf("building btree blkno %llu level %u nr %lu tot %llu \n", + blkno, level, btb->items[level].nr, btb->total_items); + + /* + * XXX Be more careful about item filling.. can parents be entirely + * full? Should we let the last nodes on the right be under the + * min? We can see that there are < (nr + min) left and emit + * half the remaining in each. 
+ */ + + /* initialize the non-item parts of the block */ + bt = buf; + memset(bt, 0, sizeof(struct scoutfs_btree_block)); + hdr = &bt->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + bt->level = level; + btree_init_block(bt, level); + if (level == 0) + memset((char *)bt + SCOUTFS_BLOCK_LG_SIZE - SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES, 0, + SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES); + + /* find the items that fit in the leaf */ + item = &bt->items[0]; + nr = 0; + val_buf = (void *)item + le16_to_cpu(bt->mid_free_len); + + for_each_bti_safe(&btb->items[level].root, bti, tmp) { + val_align = round_up(bti->val_len, SCOUTFS_BTREE_VALUE_ALIGN); + bytes = sizeof(struct scoutfs_btree_item) + val_align; + /* + * total_items is a cpu-order counter (it's assigned, compared, + * and printed natively elsewhere); converting it with + * le64_to_cpu here would byte-swap it on big-endian hosts. + */ + item_bytes_after_block = (btb->total_items * bytes) - le16_to_cpu(bt->mid_free_len); + /* NOTE(review): despite the name, min_items is a byte threshold (quarter block) */ + min_items = (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4; + + if (le16_to_cpu(bt->mid_free_len) < bytes) + break; + + /* stop when there are not enough items to fill the next block */ + if (item_bytes_after_block > 0 && item_bytes_after_block < min_items) + break; + + item->node.height = UNLINKED_AVL_HEIGHT; + item->key = bti->key; + item->seq = cpu_to_le64(1); + item->val_len = cpu_to_le16(bti->val_len); + item->flags = 0; + memset(item->node.__pad, 0, sizeof(item->node.__pad)); + + if (bti->val_len) { + val_buf -= val_align; + item->val_off = cpu_to_le16((long)val_buf - (long)bt); + memcpy(val_buf, bti->val, bti->val_len); + } else { + item->val_off = 0; + } + + le16_add_cpu(&bt->nr_items, 1); + le16_add_cpu(&bt->total_item_bytes, bytes); + le16_add_cpu(&bt->mid_free_len, -bytes); + if (level == 0) + leaf_item_hash_insert(bt, &item->key, + cpu_to_le16((void *)item - (void *)bt)); + + item++; + nr++; + + btb_erase(btb, bti, level); + free(bti); + } + + /* zero the middle of the block without items */ + if (bt->mid_free_len) + memset(&bt->items[nr], 0, 
le16_to_cpu(bt->mid_free_len)); + + height = (int)ceil(log2(nr)) + 2; /* leaves are height 1 */ + link_avl_nodes(bt, &bt->item_root.node, 0, height - 1, 0, nr - 1); + + /* finish block */ + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (btb->total_items == 0) { + /* root refs highest/last block we build */ + btb->btroot.ref.blkno = hdr->blkno; + btb->btroot.ref.seq = hdr->seq; + btb->btroot.height = level + 1; + } else { + /* parent ref items will be built into parent blocks */ + /* we'll always need a parent ref for the block we're building */ + err = bti_alloc(sizeof(struct scoutfs_block_ref), &bti); + if (err) + goto out; + + /* refs to right spine blocks have all ones key */ + if (btb->items[level].nr == 0) + scoutfs_key_set_ones(&bti->key); + else + bti->key = bt->items[nr - 1].key; + ref = bti->val; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + /* don't drop an insertion failure on the floor; propagate it */ + err = btb_insert(btb, bti, level + 1); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +static void btb_init(struct btree_builder *btb) +{ + int i; + + init_builder(&btb->bld, btree_empty, btree_reset, build_btree_block); + + for (i = 0; i < array_size(btb->items); i++) + btb->items[i].root = RB_ROOT; +} + +/* + * This is how we get around the recursion of allocating blocks to write blocks that + * store the allocators. After we've written all other metadata blocks we know precisely + * how many allocation blocks we'll need. We modify the writer to only have that many + * free blocks remaining and put the rest in the alloc block builders. 
+ */ +static spr_err_t prepare_alloc_builders(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld) +{ +#define ALLOC_BLOCKS 5 /* 2 meta list, 2 meta btree, 1 data btree */ + struct extent_head *eh_tmp; + struct extent_head *eh; + spr_err_t err; + u64 start; + u64 skip; + u64 len; + int ind; + + dprintf("starting prepare with start %llu len %llu\n", wri->meta_start, wri->meta_len); + + skip = ALLOC_BLOCKS + (SCOUTFS_ALLOC_LIST_MAX_BLOCKS * 2); + if (wri->meta_len <= skip) + return ENOSPC; + + /* store remainder of meta alloc as a free extent */ + start = wri->meta_start + skip; + len = wri->meta_len - skip; + err = insert_free_items(&wri->meta_btb[0], start, len); + if (err) + goto out; + wri->meta_len -= len; + + /* the rest of the meta extents are items in the two meta trees */ + ind = 1; + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + err = insert_free_items(&wri->meta_btb[ind], eh->start, eh->len); + if (err) + goto out; + list_del_init(&eh->head); + free(eh); + ind ^= 1; + } + + /* fill the two server avail alloc list blocks */ + wri->meta_alb[0].start = wri->meta_start + ALLOC_BLOCKS; + wri->meta_alb[0].len = SCOUTFS_ALLOC_LIST_MAX_BLOCKS; + wri->meta_alb[1].start = wri->meta_alb[0].start + wri->meta_alb[0].len; + wri->meta_alb[1].len = wri->meta_alb[0].len; + + /* writer left with only meta allocation for remaining alloc blocks */ + wri->meta_len = ALLOC_BLOCKS; + + err = 0; +out: + return err; +} + +static bool alloc_list_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + + return alb->len == 0; +} + +static spr_err_t build_alloc_list_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct alloc_list_builder, alb, bld); + struct scoutfs_alloc_list_block *lblk; + struct scoutfs_block_header *hdr; + int i; + + if (alb->len > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) + return EOVERFLOW; + + lblk = 
buf; + memset(&lblk->next, 0, sizeof(lblk->next)); + lblk->start = 0; + lblk->nr = cpu_to_le32(alb->len); + + for (i = 0; i < alb->len; i++) + lblk->blknos[i] = cpu_to_le64(alb->start + i); + + hdr = &lblk->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_ALLOC_LIST); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + alb->lhead.ref.blkno = hdr->blkno; + alb->lhead.ref.seq = hdr->seq; + alb->lhead.first_nr = cpu_to_le32(alb->len); + alb->lhead.total_nr = cpu_to_le64(alb->len); + + alb->start = 0; + alb->len = 0; + + return 0; +} + +static void init_alb(struct alloc_list_builder *alb) +{ + init_builder(&alb->bld, alloc_list_empty, NULL, build_alloc_list_block); +} + +static struct srch_node *node_srn(struct rb_node *node) +{ + return node ? container_of(node, struct srch_node, node) : NULL; +} + +static struct srch_node *srn_first(struct rb_root *root) +{ + return node_srn(rb_first(root)); +} + +static struct srch_node *srn_next(struct srch_node *srn) +{ + return srn ? 
node_srn(rb_next(&srn->node)) : NULL; +} + +static spr_err_t insert_srch_entry(struct srch_builder *sbld, u64 hash, u64 ino, u64 id) +{ + struct rb_root *root = &sbld->entries; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct srch_node *ins; + struct srch_node *srn; + int cmp; + + ins = malloc(sizeof(struct srch_node)); + if (!ins) + return ENOMEM; + + ins->hash = hash; + ins->ino = ino; + ins->id = id; + + while (*node) { + parent = *node; + srn = node_srn(*node); + + cmp = scoutfs_cmp(ins->hash, srn->hash) ?: + scoutfs_cmp(ins->ino, srn->ino) ?: + scoutfs_cmp(ins->id, srn->id); + if (cmp < 0) + node = &(*node)->rb_left; + else if (cmp > 0) + node = &(*node)->rb_right; + else + return EEXIST; + } + + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + + return 0; +} + +static bool srch_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + return RB_EMPTY_ROOT(&sbld->entries) && sbld->total_parent_refs == 0; +} + +static void srch_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + + memset(&sbld->sfl, 0, sizeof(sbld->sfl)); +} + +#define for_each_sbld_parent(sbld, i) \ + for (i = 1; i < array_size(sbld->parents); i++) + +static spr_err_t build_srch_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct srch_builder, sbld, bld); + struct scoutfs_block_header *hdr; + struct scoutfs_srch_parent *par; + struct scoutfs_srch_block *srb; + struct scoutfs_srch_entry sre; + struct scoutfs_block_ref *ref; + struct srch_node *srn_tmp; + struct srch_node *srn; + unsigned int nr; + spr_err_t err; + u32 magic; + int level; + int tail; + int ret; + + dprintf("building srch blkno %llu empty_entries %u tot refs %llu parent nrs: ", + blkno, RB_EMPTY_ROOT(&sbld->entries), sbld->total_parent_refs); + for_each_sbld_parent(sbld, level) + 
dprintf("%u:%lu ", level, sbld->parents[level].nr); + dprintf("\n"); + + /* build parents with refs that are full or when we're out of entries */ + for_each_sbld_parent(sbld, level) { + + nr = sbld->parents[level].nr; + if (nr == 0 || (nr < SCOUTFS_SRCH_PARENT_REFS && !RB_EMPTY_ROOT(&sbld->entries))) + continue; + + /* copy parent refs */ + par = buf; + memcpy(par->refs, sbld->parents[level].refs, nr * sizeof(par->refs[0])); + sbld->total_parent_refs -= nr; + sbld->parents[level].nr = 0; + + /* zero the tail of the block */ + tail = SCOUTFS_BLOCK_LG_SIZE - offsetof(struct scoutfs_srch_parent, refs[nr]); + if (tail > 0) + memset(buf + SCOUTFS_BLOCK_LG_SIZE - tail, 0, tail); + + magic = SCOUTFS_BLOCK_MAGIC_SRCH_PARENT; + hdr = &par->hdr; + goto finish_hdr; + } + + /* no built parent, must have entries to build */ + level = 0; + if (RB_EMPTY_ROOT(&sbld->entries)) { + err = EINVAL; + goto out; + } + + srn = srn_first(&sbld->entries); + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + + srb = buf; + srb->entry_nr = 0; + srb->entry_bytes = 0; + srb->first = sre; + memset(&srb->tail, 0, sizeof(srb->tail)); + + if (sbld->sfl.blocks == 0) + sbld->sfl.first = sre; + + do { + if (le32_to_cpu(srb->entry_bytes) > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) + break; + + ret = srch_encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes), + &sre, &srb->tail); + + dprintf("%llu.%llu.%llu ret %d\n", srn->hash, srn->ino, srn->id, ret); + + le32_add_cpu(&srb->entry_bytes, ret); + le32_add_cpu(&srb->entry_nr, 1); + srb->tail = sre; + + srn_tmp = srn_next(srn); + rb_erase(&srn->node, &sbld->entries); + free(srn); + + if ((srn = srn_tmp)) { + sre.hash = cpu_to_le64(srn->hash); + sre.ino = cpu_to_le64(srn->ino); + sre.id = cpu_to_le64(srn->id); + } + } while (srn); + + srb->last = srb->tail; + sbld->sfl.last = srb->tail; + + le64_add_cpu(&sbld->sfl.blocks, 1); + le64_add_cpu(&sbld->sfl.entries, le32_to_cpu(srb->entry_nr)); + + magic = 
SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK; + hdr = &srb->hdr; + +finish_hdr: + hdr->magic = cpu_to_le32(magic); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + if (srch_empty(&sbld->bld)) { + /* the last block is referenced by the root */ + sbld->sfl.ref.blkno = hdr->blkno; + sbld->sfl.ref.seq = hdr->seq; + sbld->sfl.height = level + 1; + memset(sbld->sfl.__pad, 0, sizeof(sbld->sfl.__pad)); + } else { + /* store the parent ref to our block */ + nr = sbld->parents[level + 1].nr++; + ref = &sbld->parents[level + 1].refs[nr]; + ref->blkno = hdr->blkno; + ref->seq = hdr->seq; + sbld->total_parent_refs++; + } + + err = 0; +out: + return err; +} + +static spr_err_t sbld_create(struct srch_builder *sbld) +{ + spr_err_t err = 0; + int i; + + init_builder(&sbld->bld, srch_empty, srch_reset, build_srch_block); + + for_each_sbld_parent(sbld, i) { + sbld->parents[i].refs = malloc(SCOUTFS_SRCH_PARENT_REFS * + sizeof(struct scoutfs_block_ref)); + if (!sbld->parents[i].refs) { + while (--i >= 1) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } + err = ENOMEM; + break; + } + } + + return err; +} + +static void sbld_destroy(struct srch_builder *sbld) +{ + int i; + + for_each_sbld_parent(sbld, i) { + free(sbld->parents[i].refs); + sbld->parents[i].refs = NULL; + } +} + +/* + * We've written the bloom block if we've filled out its header. 
+ */ +static bool bloom_empty(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + return bbld->bloom->hdr.seq != 0; +} + +static void bloom_reset(struct block_builder *bld) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + + memset(bbld->bloom, 0, SCOUTFS_BLOCK_LG_SIZE); +} + +static spr_err_t build_bloom_block(struct scoutfs_parallel_restore_writer *wri, + struct block_builder *bld, void *buf, u64 blkno) +{ + DEFINE_BUILDER_CONTAINER(struct bloom_builder, bbld, bld); + struct scoutfs_block_header *hdr; + + hdr = &bbld->bloom->hdr; + hdr->magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BLOOM); + hdr->fsid = wri->fsid; + hdr->blkno = cpu_to_le64(blkno); + hdr->seq = cpu_to_le64(1); + hdr->crc = cpu_to_le32(crc_block(hdr, SCOUTFS_BLOCK_LG_SIZE)); + + memcpy(buf, bbld->bloom, SCOUTFS_BLOCK_LG_SIZE); + + return 0; +} + +static spr_err_t bbld_create(struct bloom_builder *bbld) +{ + init_builder(&bbld->bld, bloom_empty, bloom_reset, build_bloom_block); + + bbld->bloom = malloc(SCOUTFS_BLOCK_LG_SIZE); + if (!bbld->bloom) + return ENOMEM; + + memset(&bbld->bloom->hdr, 0, sizeof(bbld->bloom->hdr)); + + return 0; +} + +static void bbld_destroy(struct bloom_builder *bbld) +{ + free(bbld->bloom); +} + +static bool wri_has_super(struct scoutfs_parallel_restore_writer *wri) +{ + return wri->super.hdr.blkno != 0; +} + +static void reset_builders(struct scoutfs_parallel_restore_writer *wri) +{ + /* define block build order, different than struct layout order */ + struct block_builder *builders[] = { + /* fs items written in parallel by writers */ + &wri->fs_btb.bld, + &wri->bloom_bbld.bld, + &wri->srch_sbld.bld, + + /* global items written finally by global super writer */ + &wri->root_btb.bld, + &wri->srch_btb.bld, + /* log .post() prepares final allocators */ + &wri->log_btb.bld, + &wri->meta_alb[0].bld, + &wri->meta_alb[1].bld, + &wri->meta_btb[0].bld, + &wri->meta_btb[1].bld, + &wri->data_btb.bld, + }; + struct block_builder 
*bld; + int i; + + for (i = 0; i < array_size(builders); i++) { + bld = builders[i]; + + if (bld->reset) + bld->reset(bld); + + if (!list_empty(&bld->head)) + list_del_init(&bld->head); + list_add_tail(&bld->head, &wri->builders); + } +} + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri; + spr_err_t err; + + wri = calloc(1, sizeof(struct scoutfs_parallel_restore_writer)); + if (!wri) { + err = ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&wri->meta_extents); + INIT_LIST_HEAD(&wri->builders); + btb_init(&wri->root_btb); + btb_init(&wri->fs_btb); + btb_init(&wri->srch_btb); + btb_init(&wri->log_btb); + btb_init(&wri->meta_btb[0]); + btb_init(&wri->meta_btb[1]); + btb_init(&wri->data_btb); + init_alb(&wri->meta_alb[0]); + init_alb(&wri->meta_alb[1]); + + err = sbld_create(&wri->srch_sbld) ?: + bbld_create(&wri->bloom_bbld); + if (err) + goto out; + + reset_builders(wri); + err = 0; +out: + if (err) { + if (wri) { + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + free(wri); + } + wri = NULL; + } + *wrip = wri; + return err; +} + +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip) +{ + struct scoutfs_parallel_restore_writer *wri = *wrip; + struct extent_head *eh; + struct extent_head *eh_tmp; + + if (!wri) + return; + + btb_destroy(&wri->root_btb); + btb_destroy(&wri->fs_btb); + btb_destroy(&wri->srch_btb); + btb_destroy(&wri->log_btb); + btb_destroy(&wri->meta_btb[0]); + btb_destroy(&wri->meta_btb[1]); + btb_destroy(&wri->data_btb); + sbld_destroy(&wri->srch_sbld); + bbld_destroy(&wri->bloom_bbld); + + list_for_each_entry_safe(eh, eh_tmp, &wri->meta_extents, head) { + list_del_init(&eh->head); + free(eh); + } + + free(wri); + *wrip = NULL; +} + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr) +{ + u64 total = 
le64_to_cpu(wri->super.total_meta_blocks); + u64 start = SCOUTFS_META_DEV_START_BLKNO; + u64 each = (total - start) / nr; + int i; + + if (!wri_has_super(wri)) + return EINVAL; + + for (i = 0; i < nr - 1; i++) { + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(each); + start += each; + } + + slices[i].fsid = wri->super.hdr.fsid; + slices[i].meta_start = cpu_to_le64(start); + slices[i].meta_len = cpu_to_le64(total - start); + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + wri->fsid = slice->fsid; + + return meta_alloc_add(wri, le64_to_cpu(slice->meta_start), le64_to_cpu(slice->meta_len)); +} + +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice) +{ + slice->fsid = wri->fsid; + slice->meta_start = cpu_to_le64(wri->meta_start); + slice->meta_len = cpu_to_le64(wri->meta_len); + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode) +{ + spr_err_t err; + + if (wri_has_super(wri)) + return EINVAL; + + err = insert_inode_items(wri, inode); + if (err) + goto out; + + wri->inode_count++; + wri->max_ino = max(wri->max_ino, inode->ino); + err = 0; +out: + return err; +} + +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry) +{ + + if (wri_has_super(wri)) + return EINVAL; + + return insert_entry_items(wri, entry); +} + +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr) +{ + spr_err_t err; + int xtags; + u32 xat_hash; + u64 srch_hash; + + xat_hash = crc32c(U32_MAX, xattr->name, xattr->name_len); + srch_hash = scoutfs_hash64(xattr->name, 
xattr->name_len); + xtags = get_xattr_tags(xattr->name, xattr->name_len); + + err = insert_xattr_items(wri, xattr, xat_hash); + if (!err) { + if (xtags & XTAG_SRCH) + err = insert_srch_entry(&wri->srch_sbld, srch_hash, xattr->ino, xattr->pos); + if (!err && (xtags & XTAG_TOTL)) + err = insert_totl_item(wri, xattr); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + if (wri_has_super(wri)) + return EINVAL; + + memset(prog, 0, sizeof(struct scoutfs_parallel_restore_progress)); + prog->fs_items = wri->fs_btb.btroot; + prog->root_items = wri->root_btb.btroot; + prog->sfl = wri->srch_sbld.sfl; + prog->bloom_ref.blkno = wri->bloom_bbld.bloom->hdr.blkno; + prog->bloom_ref.seq = wri->bloom_bbld.bloom->hdr.seq; + prog->inode_count = cpu_to_le64(wri->inode_count); + prog->max_ino = cpu_to_le64(wri->max_ino); + + reset_builders(wri); + wri->inode_count = 0; + wri->max_ino = 0; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog) +{ + spr_err_t err; + + if (!wri_has_super(wri)) + return EINVAL; + + /* + * Only one writer's progress should contain the root inode. 
+ */ + if (prog->root_items.ref.blkno) { + if (wri->root_items.ref.blkno) + return EEXIST; + wri->root_items = prog->root_items; + } + + wri->max_ino = max(wri->max_ino, le64_to_cpu(prog->max_ino)); + + err = insert_log_trees_item(wri, prog); + if (!err && prog->sfl.ref.blkno) + err = insert_srch_item(wri, &prog->sfl); + + return err; +} + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule) +{ + return insert_quota_item(wri, rule); +} + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret) +{ + struct block_builder *bld; + off_t count = 0; + off_t off = 0; + u64 blkno = 0; + spr_err_t err; + + if (len < SCOUTFS_BLOCK_LG_SIZE) { + err = EINVAL; + goto out; + } + + while (len >= SCOUTFS_BLOCK_LG_SIZE) { + bld = list_first_entry_or_null(&wri->builders, struct block_builder, head); + if (!bld) { + err = 0; + break; + } + + if (bld->empty(bld)) { + if (bld->post && ((err = bld->post(wri, bld)))) + break; + list_del_init(&bld->head); + continue; + } + + err = meta_alloc_contig(wri, blkno, &blkno); + if (err || blkno == 0) + break; + + if (off == 0) + off = blkno << SCOUTFS_BLOCK_LG_SHIFT; + + err = bld->build(wri, bld, buf, blkno); + if (err) + break; + + buf += SCOUTFS_BLOCK_LG_SIZE; + len -= SCOUTFS_BLOCK_LG_SIZE; + count += SCOUTFS_BLOCK_LG_SIZE; + + dprintf("built blkno %llu off %llu count %llu\n", blkno, (u64)off, (u64)count); + } + +out: + *off_ret = off; + *count_ret = count; + return count > 0 ? 
0 : err; +} + +/* + * Here we take in a dev's fd and read its quorum blocks to see if the dev has + * been mounted before + */ +static spr_err_t scoutfs_check_if_previous_mount(int fd) +{ + struct scoutfs_quorum_block *blk = NULL; + struct scoutfs_quorum_block_event *ev; + u64 blkno; + int i, j; + spr_err_t err; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + blkno = SCOUTFS_QUORUM_BLKNO + i; + err = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk); + if (!blk) + return EINVAL; + + dprintf("quorum block read; quorum bklno: %llu, err_val: %d\n", blkno, err); + if (err) { + free(blk); + return err; + } + + /* any recorded quorum event timestamp means the dev was mounted */ + for (j = 0; j < SCOUTFS_QUORUM_EVENT_NR; j++) { + ev = &blk->events[j]; + if (ev->ts.sec || ev->ts.nsec) { + free(blk); + return EINVAL; + } + } + + free(blk); + } + + return err; +} + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd) +{ + spr_err_t err; + u64 start; + u64 len; + + /* + * check the device we are restoring into to make sure + * that it has never been mounted + */ + if (scoutfs_check_if_previous_mount(fd)) + return EINVAL; + + if (le64_to_cpu(super->fmt_vers) < 2) + return EINVAL; + + if ((le64_to_cpu(super->flags) & SCOUTFS_FLAG_IS_META_BDEV) == 0) + return EINVAL; + + if (wri_has_super(wri)) + return EINVAL; + + start = SCOUTFS_DATA_DEV_START_BLKNO; + len = le64_to_cpu(super->total_data_blocks) - start; + + /* make sure all data extents are free */ + if (le64_to_cpu(super->data_alloc.total_len) != len) + return ENOTEMPTY; + + /* we write new allocator blocks so that we don't have to read existing ones */ + err = insert_free_items(&wri->data_btb, start, len); + if (err) + return err; + + wri->super = *super; + + /* prepare alloc block builders only after other metadata blocks are built */ + wri->log_btb.bld.post = prepare_alloc_builders; + + return 0; +} + +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer 
*wri, + struct scoutfs_super_block *super) +{ + if (!wri_has_super(wri)) + return EINVAL; + + *super = wri->super; + + super->seq = cpu_to_le64(wri->max_ino + 1); + super->next_ino = cpu_to_le64(wri->max_ino + 1); + super->inode_count = cpu_to_le64(wri->inode_count); + set_alloc_root(&super->meta_alloc[0], &wri->meta_btb[0]); + set_alloc_root(&super->meta_alloc[1], &wri->meta_btb[1]); + set_alloc_root(&super->data_alloc, &wri->data_btb); + super->server_meta_avail[0] = wri->meta_alb[0].lhead; + super->server_meta_avail[1] = wri->meta_alb[1].lhead; + memset(super->server_meta_freed, 0, sizeof(super->server_meta_freed)); + super->fs_root = wri->root_items; + super->logs_root = wri->log_btb.btroot; + memset(&super->log_merge, 0, sizeof(super->log_merge)); + memset(&super->mounted_clients, 0, sizeof(super->mounted_clients)); + super->srch_root = wri->srch_btb.btroot; + /* test volopt? */ + + super->hdr.crc = cpu_to_le32(crc_block(&super->hdr, SCOUTFS_BLOCK_SM_SIZE)); + + return 0; +} diff --git a/utils/src/parallel_restore.h b/utils/src/parallel_restore.h new file mode 100644 index 00000000..8865e842 --- /dev/null +++ b/utils/src/parallel_restore.h @@ -0,0 +1,126 @@ +#ifndef _SCOUTFS_PARALLEL_RESTORE_H_ +#define _SCOUTFS_PARALLEL_RESTORE_H_ + +#include + +struct scoutfs_parallel_restore_progress { + struct scoutfs_btree_root fs_items; + struct scoutfs_btree_root root_items; + struct scoutfs_srch_file sfl; + struct scoutfs_block_ref bloom_ref; + __le64 inode_count; + __le64 max_ino; +}; + +struct scoutfs_parallel_restore_slice { + __le64 fsid; + __le64 meta_start; + __le64 meta_len; +}; + +struct scoutfs_parallel_restore_entry { + u64 dir_ino; + u64 pos; + u64 ino; + mode_t mode; + char *name; + unsigned int name_len; +}; + +struct scoutfs_parallel_restore_xattr { + u64 ino; + u64 pos; + char *name; + unsigned int name_len; + void *value; + unsigned int value_len; +}; + +struct scoutfs_parallel_restore_inode { + /* all inodes */ + u64 ino; + u64 meta_seq; + u64 
data_seq; + u64 nr_xattrs; + u32 uid; + u32 gid; + u32 mode; + u32 rdev; + u32 flags; + u8 pad[4]; + struct timespec atime; + struct timespec ctime; + struct timespec mtime; + struct timespec crtime; + u64 proj; + + /* regular files */ + u64 data_version; + u64 size; + bool offline; + u32 nlink; + + /* only used for directories */ + u64 nr_subdirs; + u64 total_entry_name_bytes; + + /* only used for symlinks */ + char *target; + unsigned int target_len; /* not including null terminator */ +}; + +struct scoutfs_parallel_restore_quota_rule { + u64 limit; + u8 prio; + u8 op; + u8 rule_flags; + struct quota_rule_name { + u64 val; + u8 source; + u8 flags; + } names [3]; + char *value; + unsigned int value_len; +}; + +typedef __typeof__(EINVAL) spr_err_t; + +struct scoutfs_parallel_restore_writer; + +spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip); +void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip); + +spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slices, + int nr); +spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); +spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_slice *slice); + +spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_inode *inode); +spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_entry *entry); +spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_xattr *xattr); + +spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); 
+spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_progress *prog); + +spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_parallel_restore_quota_rule *rule); + +spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri, + void *buf, size_t len, off_t *off_ret, + size_t *count_ret); + +spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super, int fd); +spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri, + struct scoutfs_super_block *super); + + +#endif