diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 0e5c31d3..b2ad0296 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -68,6 +68,8 @@ struct forest_info { struct delayed_work log_merge_dwork; atomic64_t inode_count_delta; + + u64 merge_dirty_limit_override; }; #define DECLARE_FOREST_INFO(sb, name) \ @@ -668,6 +670,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) struct scoutfs_key key; unsigned long delay; LIST_HEAD(inputs); + u64 dirty_limit; int ret; ret = scoutfs_client_get_log_merge(sb, &req); @@ -728,10 +731,14 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) goto out; } + dirty_limit = READ_ONCE(finf->merge_dirty_limit_override); + if (!dirty_limit) + dirty_limit = SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT; + ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end, &next, &comp.root, &inputs, !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)), - SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10, + dirty_limit, 10, (2 * 1024 * 1024)); if (ret == -ERANGE) { comp.remain = next; @@ -781,6 +788,9 @@ int scoutfs_forest_setup(struct super_block *sb) scoutfs_forest_log_merge_worker); sbi->forest_info = finf; + debugfs_create_u64("log_merge_dirty_limit", 0644, sbi->debug_root, + &finf->merge_dirty_limit_override); + finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT | WQ_UNBOUND | WQ_HIGHPRI, 0); if (!finf->workq) { diff --git a/tests/Makefile b/tests/Makefile index 3a2380dc..2323098f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,7 +15,8 @@ BIN := src/createmany \ src/o_tmpfile_umask \ src/o_tmpfile_linkat \ src/mmap_stress \ - src/mmap_validate + src/mmap_validate \ + src/totl_xattr_fill DEPS := $(wildcard src/*.d) diff --git a/tests/golden/totl-merge-negative b/tests/golden/totl-merge-negative new file mode 100644 index 00000000..e526dff6 --- /dev/null +++ b/tests/golden/totl-merge-negative @@ -0,0 +1,4 @@ +== setup +== update and merge +== verify totals +== cleanup diff --git a/tests/sequence b/tests/sequence index 8091b1b1..59cec775 100644 --- a/tests/sequence +++ b/tests/sequence @@ -26,6 +26,7 @@ simple-xattr-unit.sh retention-basic.sh totl-xattr-tag.sh quota.sh +totl-merge-negative.sh lock-refleak.sh lock-shrink-consistency.sh lock-shrink-read-race.sh diff --git a/tests/src/totl_xattr_fill.c b/tests/src/totl_xattr_fill.c new file mode 100644 index 00000000..6e3cecdb --- /dev/null +++ b/tests/src/totl_xattr_fill.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define error_exit(cond, fmt, args...) \ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +/* + * Doing setxattr() from a shell a few thousand times takes way too + * long, hence, this program + */ + +static void usage(void) +{ + printf("usage: totl_xattr_fill -d DIR -p PREFIX -n COUNT -v VALUE [-c]\n" + " -d DIR directory containing files\n" + " -p PREFIX file name prefix\n" + " -n COUNT number of files (numbered 1..COUNT)\n" + " -v VALUE xattr value string\n" + " -c create files before setting xattr\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + char *dir = NULL; + char *prefix = NULL; + char *value = NULL; + unsigned long count = 0; + int create = 0; + char path[4096]; + char name[256]; + unsigned long i; + int fd; + int ret; + int c; + + while ((c = getopt(argc, argv, "d:p:n:v:c")) != -1) { + switch (c) { + case 'd': + dir = optarg; + break; + case 'p': + prefix = optarg; + break; + case 'n': + count = strtoul(optarg, NULL, 0); + break; + case 'v': + value = optarg; + break; + case 'c': + create = 1; + break; + case '?': + usage(); + } + } + + error_exit(!dir, "must specify directory with -d"); + error_exit(!prefix, "must specify prefix with -p"); + error_exit(!count, "must specify count with -n"); + error_exit(!value, "must specify value with -v"); + + for (i = 1; i <= count; i++) { + ret = snprintf(path, sizeof(path), "%s/%s%lu", dir, prefix, i); + error_exit(ret >= (int)sizeof(path), "path too long"); + + if (create) { + fd = open(path, O_CREAT | O_WRONLY, 0644); + error_exit(fd < 0, "open %s failed"ERRF, path, ERRA); + close(fd); + } + + ret = snprintf(name, sizeof(name), + "scoutfs.totl.test.%lu.0.0", i); + error_exit(ret >= (int)sizeof(name), "xattr name too long"); + + ret = setxattr(path, name, value, strlen(value), 0); + error_exit(ret, "setxattr %s %s failed"ERRF, + path, name, ERRA); + } + + return 0; +} diff --git a/tests/tests/totl-merge-negative.sh b/tests/tests/totl-merge-negative.sh new file mode 100644 index 00000000..4902b015 --- /dev/null +++ b/tests/tests/totl-merge-negative.sh @@ -0,0 +1,118 @@ +# +# Test that merge_read_item() correctly updates the sequence number when +# combining delta items from multiple finalized log trees. +# +# A bug in merge_read_item() fails to update found->seq to the max of +# both items' seqs when combining deltas. The combined item retains +# a stale seq from the lower-RID tree processed first. +# +# Two mounts create totl delta items for the same keys. The lower-RID +# mount writes first so it acquires locks first and gets lower seqs. +# Finalized trees are iterated in (rid, nr) key order during merge, so +# the lower-RID tree (with lower seqs) is processed first -- the +# combined item keeps the stale seq from that tree. +# +# An initial round of writes + merge establishes baseline totl values +# and commits all transactions. The second round writes FIRST then +# SECOND in sequence so FIRST opens a fresh transaction first (lower +# trans_seq) and acquires locks first (lower write_seq). +# +# The dirty limit override forces partial merges so that stale-seq +# items are written to fs_root while finalized logs still exist, +# causing readers to double-count finalized deltas. +# +# The read-xattr-totals ioctl uses a sliding cursor that can return +# pre-merge values (< expected) or duplicate entries when btree data +# changes between batches. Only totals ABOVE the expected value +# indicate the double-counting bug. +# + +t_require_commands totl_xattr_fill scoutfs +t_require_mounts 2 + +NR_KEYS=50000 +EXPECTED_TOTAL=250 + +rid_0=$(t_mount_rid 0) +rid_1=$(t_mount_rid 1) + +if [[ "$rid_0" < "$rid_1" ]]; then + FIRST=0; SECOND=1 +else + FIRST=1; SECOND=0 +fi + +FIRST_D="$(eval echo \$T_D$FIRST)" +SECOND_D="$(eval echo \$T_D$SECOND)" + +echo "== setup" +totl_xattr_fill -d "$FIRST_D" -p file-first- -n $NR_KEYS -v 100 -c +totl_xattr_fill -d "$SECOND_D" -p file-second- -n $NR_KEYS -v 100 -c +sync +t_force_log_merge + +echo "== update and merge" + +# FIRST writes first: opens new transaction first (lower trans_seq), +# acquires locks first (lower write_seq). +totl_xattr_fill -d "$FIRST_D" -p file-first- -n $NR_KEYS -v 200 +totl_xattr_fill -d "$SECOND_D" -p file-second- -n $NR_KEYS -v 50 + +# Force partial merges by limiting dirty bytes to one block so that +# stale-seq combined items get written to fs_root while finalized +# logs still exist. +for i in $(t_fs_nrs); do + echo 4096 > $(t_debugfs_path $i)/log_merge_dirty_limit +done + +sync + +# Start the merge but don't wait for completion -- read totals while +# the merge is running so we can catch the double-counting window. +sv=$(t_server_nr) +last_complete=$(t_counter log_merge_complete $sv) + +t_trigger_arm_silent log_merge_force_finalize_ours $sv +t_sync_seq_index +while test "$(t_trigger_get log_merge_force_finalize_ours $sv)" == "1"; do + sleep .1 +done + +echo "== verify totals" + +# Read totals repeatedly while the merge runs. The stale-seq bug +# causes totl_merge_resolve() to double-count finalized deltas for +# keys spliced into fs_root with a stale sequence number, producing +# totals above the expected value. +# +# The sliding cursor may return pre-merge values (below expected) or +# duplicate entries -- these are not the bug. Only totals ABOVE the +# expected value indicate double-counting. +nr_reads=0 +while (( $(t_counter log_merge_complete $sv) == last_complete )); do + for i in $(t_fs_nrs); do + echo 1 > $(t_debugfs_path $i)/drop_weak_item_cache + done + bad=$(scoutfs read-xattr-totals -p "$T_M0" | \ + awk -F'[ =,]+' -v limit=$EXPECTED_TOTAL '$2+0 > limit+0') + nr_reads=$((nr_reads + 1)) + if [ -n "$bad" ]; then + echo "double-counted totals during merge (read $nr_reads):" + echo "$bad" + for i in $(t_fs_nrs); do + echo 0 > $(t_debugfs_path $i)/log_merge_dirty_limit + done + t_fail "stale seq caused double-counted totals" + fi +done + +# Restore normal dirty limit. +for i in $(t_fs_nrs); do + echo 0 > $(t_debugfs_path $i)/log_merge_dirty_limit +done + +echo "== cleanup" +find "$FIRST_D" -maxdepth 1 -name 'file-first-*' -delete +find "$SECOND_D" -maxdepth 1 -name 'file-second-*' -delete + +t_pass