From a8a55a43c3b467a33996bb62520719e6495b2f0a Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Wed, 18 Feb 2026 11:39:49 -0800 Subject: [PATCH] Add test and counter for stale seq in merge delta combining. merge_read_item() fails to update found->seq when combining delta items from multiple finalized log trees. Add a test case to replicate the conditions of this issue. The conditions to get this to reproduce are pretty tricky. We have to exceed the block limit and that's nearly impossible at VM scale without significant load. Instead, we lower the merge limit and create an executable to create the conditions for us. That keeps the test case reasonably short. Signed-off-by: Auke Kok --- kmod/src/forest.c | 12 ++- tests/Makefile | 3 +- tests/golden/totl-merge-negative | 4 + tests/sequence | 1 + tests/src/totl_xattr_fill.c | 98 ++++++++++++++++++++++++ tests/tests/totl-merge-negative.sh | 118 +++++++++++++++++++++++++++++ 6 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 tests/golden/totl-merge-negative create mode 100644 tests/src/totl_xattr_fill.c create mode 100644 tests/tests/totl-merge-negative.sh diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 0e5c31d36..b2ad0296f 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -68,6 +68,8 @@ struct forest_info { struct delayed_work log_merge_dwork; atomic64_t inode_count_delta; + + u64 merge_dirty_limit_override; }; #define DECLARE_FOREST_INFO(sb, name) \ @@ -668,6 +670,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) struct scoutfs_key key; unsigned long delay; LIST_HEAD(inputs); + u64 dirty_limit; int ret; ret = scoutfs_client_get_log_merge(sb, &req); @@ -728,10 +731,14 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) goto out; } + dirty_limit = READ_ONCE(finf->merge_dirty_limit_override); + if (!dirty_limit) + dirty_limit = SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT; + ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end, &next, 
&comp.root, &inputs, !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)), - SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10, + dirty_limit, 10, (2 * 1024 * 1024)); if (ret == -ERANGE) { comp.remain = next; @@ -781,6 +788,9 @@ int scoutfs_forest_setup(struct super_block *sb) scoutfs_forest_log_merge_worker); sbi->forest_info = finf; + debugfs_create_u64("log_merge_dirty_limit", 0644, sbi->debug_root, + &finf->merge_dirty_limit_override); + finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT | WQ_UNBOUND | WQ_HIGHPRI, 0); if (!finf->workq) { diff --git a/tests/Makefile b/tests/Makefile index 3a2380dc2..2323098f6 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,7 +15,8 @@ BIN := src/createmany \ src/o_tmpfile_umask \ src/o_tmpfile_linkat \ src/mmap_stress \ - src/mmap_validate + src/mmap_validate \ + src/totl_xattr_fill DEPS := $(wildcard src/*.d) diff --git a/tests/golden/totl-merge-negative b/tests/golden/totl-merge-negative new file mode 100644 index 000000000..e526dff66 --- /dev/null +++ b/tests/golden/totl-merge-negative @@ -0,0 +1,4 @@ +== setup +== update and merge +== verify totals +== cleanup diff --git a/tests/sequence b/tests/sequence index 8091b1b18..59cec7750 100644 --- a/tests/sequence +++ b/tests/sequence @@ -26,6 +26,7 @@ simple-xattr-unit.sh retention-basic.sh totl-xattr-tag.sh quota.sh +totl-merge-negative.sh lock-refleak.sh lock-shrink-consistency.sh lock-shrink-read-race.sh diff --git a/tests/src/totl_xattr_fill.c b/tests/src/totl_xattr_fill.c new file mode 100644 index 000000000..6e3cecdb3 --- /dev/null +++ b/tests/src/totl_xattr_fill.c @@ -0,0 +1,98 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/xattr.h> + +#define error_exit(cond, fmt, args...) 
\ +do { \ + if (cond) { \ + printf("error: "fmt"\n", ##args); \ + exit(1); \ + } \ +} while (0) + +#define ERRF " errno %d (%s)" +#define ERRA errno, strerror(errno) + +/* + * Doing setxattr() from a shell a few thousand times takes way too + * long, hence, this program + */ + +static void usage(void) +{ + printf("usage: totl_xattr_fill -d DIR -p PREFIX -n COUNT -v VALUE [-c]\n" + " -d DIR directory containing files\n" + " -p PREFIX file name prefix\n" + " -n COUNT number of files (numbered 1..COUNT)\n" + " -v VALUE xattr value string\n" + " -c create files before setting xattr\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + char *dir = NULL; + char *prefix = NULL; + char *value = NULL; + unsigned long count = 0; + int create = 0; + char path[4096]; + char name[256]; + unsigned long i; + int fd; + int ret; + int c; + + while ((c = getopt(argc, argv, "d:p:n:v:c")) != -1) { + switch (c) { + case 'd': + dir = optarg; + break; + case 'p': + prefix = optarg; + break; + case 'n': + count = strtoul(optarg, NULL, 0); + break; + case 'v': + value = optarg; + break; + case 'c': + create = 1; + break; + case '?': + usage(); + } + } + + error_exit(!dir, "must specify directory with -d"); + error_exit(!prefix, "must specify prefix with -p"); + error_exit(!count, "must specify count with -n"); + error_exit(!value, "must specify value with -v"); + + for (i = 1; i <= count; i++) { + ret = snprintf(path, sizeof(path), "%s/%s%lu", dir, prefix, i); + error_exit(ret >= (int)sizeof(path), "path too long"); + + if (create) { + fd = open(path, O_CREAT | O_WRONLY, 0644); + error_exit(fd < 0, "open %s failed"ERRF, path, ERRA); + close(fd); + } + + ret = snprintf(name, sizeof(name), + "scoutfs.totl.test.%lu.0.0", i); + error_exit(ret >= (int)sizeof(name), "xattr name too long"); + + ret = setxattr(path, name, value, strlen(value), 0); + error_exit(ret, "setxattr %s %s failed"ERRF, + path, name, ERRA); + } + + return 0; +} diff --git a/tests/tests/totl-merge-negative.sh 
b/tests/tests/totl-merge-negative.sh new file mode 100644 index 000000000..4902b015d --- /dev/null +++ b/tests/tests/totl-merge-negative.sh @@ -0,0 +1,118 @@ +# +# Test that merge_read_item() correctly updates the sequence number when +# combining delta items from multiple finalized log trees. +# +# A bug in merge_read_item() fails to update found->seq to the max of +# both items' seqs when combining deltas. The combined item retains +# a stale seq from the lower-RID tree processed first. +# +# Two mounts create totl delta items for the same keys. The lower-RID +# mount writes first so it acquires locks first and gets lower seqs. +# Finalized trees are iterated in (rid, nr) key order during merge, so +# the lower-RID tree (with lower seqs) is processed first -- the +# combined item keeps the stale seq from that tree. +# +# An initial round of writes + merge establishes baseline totl values +# and commits all transactions. The second round writes FIRST then +# SECOND in sequence so FIRST opens a fresh transaction first (lower +# trans_seq) and acquires locks first (lower write_seq). +# +# The dirty limit override forces partial merges so that stale-seq +# items are written to fs_root while finalized logs still exist, +# causing readers to double-count finalized deltas. +# +# The read-xattr-totals ioctl uses a sliding cursor that can return +# pre-merge values (< expected) or duplicate entries when btree data +# changes between batches. Only totals ABOVE the expected value +# indicate the double-counting bug. 
+# + +t_require_commands totl_xattr_fill scoutfs +t_require_mounts 2 + +NR_KEYS=50000 +EXPECTED_TOTAL=250 + +rid_0=$(t_mount_rid 0) +rid_1=$(t_mount_rid 1) + +if [[ "$rid_0" < "$rid_1" ]]; then + FIRST=0; SECOND=1 +else + FIRST=1; SECOND=0 +fi + +FIRST_D="$(eval echo \$T_D$FIRST)" +SECOND_D="$(eval echo \$T_D$SECOND)" + +echo "== setup" +totl_xattr_fill -d "$FIRST_D" -p file-first- -n $NR_KEYS -v 100 -c +totl_xattr_fill -d "$SECOND_D" -p file-second- -n $NR_KEYS -v 100 -c +sync +t_force_log_merge + +echo "== update and merge" + +# FIRST writes first: opens new transaction first (lower trans_seq), +# acquires locks first (lower write_seq). +totl_xattr_fill -d "$FIRST_D" -p file-first- -n $NR_KEYS -v 200 +totl_xattr_fill -d "$SECOND_D" -p file-second- -n $NR_KEYS -v 50 + +# Force partial merges by limiting dirty bytes to one block so that +# stale-seq combined items get written to fs_root while finalized +# logs still exist. +for i in $(t_fs_nrs); do + echo 4096 > $(t_debugfs_path $i)/log_merge_dirty_limit +done + +sync + +# Start the merge but don't wait for completion -- read totals while +# the merge is running so we can catch the double-counting window. +sv=$(t_server_nr) +last_complete=$(t_counter log_merge_complete $sv) + +t_trigger_arm_silent log_merge_force_finalize_ours $sv +t_sync_seq_index +while test "$(t_trigger_get log_merge_force_finalize_ours $sv)" == "1"; do + sleep .1 +done + +echo "== verify totals" + +# Read totals repeatedly while the merge runs. The stale-seq bug +# causes totl_merge_resolve() to double-count finalized deltas for +# keys spliced into fs_root with a stale sequence number, producing +# totals above the expected value. +# +# The sliding cursor may return pre-merge values (below expected) or +# duplicate entries -- these are not the bug. Only totals ABOVE the +# expected value indicate double-counting. 
+nr_reads=0 +while (( $(t_counter log_merge_complete $sv) == last_complete )); do + for i in $(t_fs_nrs); do + echo 1 > $(t_debugfs_path $i)/drop_weak_item_cache + done + bad=$(scoutfs read-xattr-totals -p "$T_M0" | \ + awk -F'[ =,]+' -v limit=$EXPECTED_TOTAL '$2+0 > limit+0') + nr_reads=$((nr_reads + 1)) + if [ -n "$bad" ]; then + echo "double-counted totals during merge (read $nr_reads):" + echo "$bad" + for i in $(t_fs_nrs); do + echo 0 > $(t_debugfs_path $i)/log_merge_dirty_limit + done + t_fail "stale seq caused double-counted totals" + fi +done + +# Restore normal dirty limit. +for i in $(t_fs_nrs); do + echo 0 > $(t_debugfs_path $i)/log_merge_dirty_limit +done + +echo "== cleanup" +find "$FIRST_D" -maxdepth 1 -name 'file-first-*' -delete +find "$SECOND_D" -maxdepth 1 -name 'file-second-*' -delete + +t_pass