From b7a3d0371171f5c8cb972de8476ec59f51eefe26 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 27 Jul 2020 17:03:51 -0400 Subject: [PATCH 01/14] Add support for read only mmap() Adds the required memory mapped ops struct and page fault handler for reads. Signed-off-by: Benjamin LaHaise Signed-off-by: Auke Kok --- kmod/src/Makefile.kernelcompat | 16 +++++++ kmod/src/data.c | 78 ++++++++++++++++++++++++++++++++++ kmod/src/kernelcompat.h | 16 +++++++ 3 files changed, 110 insertions(+) diff --git a/kmod/src/Makefile.kernelcompat b/kmod/src/Makefile.kernelcompat index 403a2f66..211ee0d4 100644 --- a/kmod/src/Makefile.kernelcompat +++ b/kmod/src/Makefile.kernelcompat @@ -431,3 +431,19 @@ endif ifneq (,$(shell grep 'struct file.*bdev_file_open_by_path.const char.*path' include/linux/blkdev.h)) ccflags-y += -DKC_BDEV_FILE_OPEN_BY_PATH endif + +# v4.0-rc7-1796-gfe0f07d08ee3 +# +# direct-io changes modify inode_dio_done to now be called inode_dio_end +ifneq (,$(shell grep 'void inode_dio_end.struct inode' include/linux/fs.h)) +ccflags-y += -DKC_INODE_DIO_END +endif + +# +# v5.0-6476-g3d3539018d2c +# +# page fault handlers return a bitmask vm_fault_t instead +# Note: el8's header has a slightly modified prefix here +ifneq (,$(shell grep 'typedef.*__bitwise unsigned.*int vm_fault_t' include/linux/mm_types.h)) +ccflags-y += -DKC_MM_VM_FAULT_T +endif diff --git a/kmod/src/data.c b/kmod/src/data.c index 909167bc..63ed830d 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1914,6 +1914,83 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, return ret; } +#ifdef KC_MM_VM_FAULT_T +static vm_fault_t scoutfs_data_filemap_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = 
inode->i_sb; + struct scoutfs_lock *inode_lock = NULL; + SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); + DECLARE_DATA_WAIT(dw); + loff_t pos; + int err; + vm_fault_t ret = VM_FAULT_SIGBUS; + + pos = vmf->pgoff; + pos <<= PAGE_SHIFT; + +retry: + err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, + SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); + if (err < 0) + return vmf_error(err); + + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) { + /* protect checked extents from stage/release */ + atomic_inc(&inode->i_dio_count); + + err = scoutfs_data_wait_check(inode, pos, PAGE_SIZE, + SEF_OFFLINE, SCOUTFS_IOC_DWO_READ, + &dw, inode_lock); + if (err != 0) { + if (err < 0) + ret = vmf_error(err); + goto out; + } + } + +#ifdef KC_MM_VM_FAULT_T + ret = filemap_fault(vmf); +#else + ret = filemap_fault(vma, vmf); +#endif + +out: + if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent)) + kc_inode_dio_end(inode); + scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + if (scoutfs_data_wait_found(&dw)) { + err = scoutfs_data_wait(inode, &dw); + if (err == 0) + goto retry; + + ret = VM_FAULT_RETRY; + } + + return ret; +} + +static const struct vm_operations_struct scoutfs_data_file_vm_ops = { + .fault = scoutfs_data_filemap_fault, + .remap_pages = generic_file_remap_pages, +}; + +static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + file_accessed(file); + vma->vm_ops = &scoutfs_data_file_vm_ops; + return 0; +} + const struct address_space_operations scoutfs_file_aops = { #ifdef KC_MPAGE_READ_FOLIO .dirty_folio = block_dirty_folio, @@ -1945,6 +2022,7 @@ const struct file_operations scoutfs_file_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, #endif + .mmap = scoutfs_file_mmap, .unlocked_ioctl = scoutfs_ioctl, .fsync = scoutfs_file_fsync, .llseek = scoutfs_file_llseek, diff --git a/kmod/src/kernelcompat.h 
b/kmod/src/kernelcompat.h index 8c17d31e..74ad0d66 100644 --- a/kmod/src/kernelcompat.h +++ b/kmod/src/kernelcompat.h @@ -438,4 +438,20 @@ static inline int kc_tcp_sock_set_nodelay(struct socket *sock) } #endif +#ifdef KC_INODE_DIO_END +#define kc_inode_dio_end inode_dio_end +#else +#define kc_inode_dio_end inode_dio_done +#endif + +#ifndef KC_MM_VM_FAULT_T +typedef unsigned int vm_fault_t; +static inline vm_fault_t vmf_error(int err) +{ + if (err == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; +} +#endif + #endif From 3788d67101ca692a2606d4a30cefab200a6c3971 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 27 Jul 2020 17:04:22 -0400 Subject: [PATCH 02/14] Add support for writable shared mmap()ings Add support for writable MAP_SHARED mmap()ings. Avoid issues with late writepage()s building transactions by doing the block_write_begin() work in scoutfs_data_page_mkwrite(). Ensure the page is marked dirty and prepared for write, then let the VM complete the write when the page is flushed or invalidated. 
Signed-off-by: Benjamin LaHaise Signed-off-by: Auke Kok --- kmod/src/data.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 3 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 63ed830d..ce3cafe8 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -560,7 +560,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, u64 offset; int ret; - WARN_ON_ONCE(create && !inode_is_locked(inode)); + WARN_ON_ONCE(create && !rwsem_is_locked(&si->extent_sem)); /* make sure caller holds a cluster lock */ lock = scoutfs_per_task_get(&si->pt_data_lock); @@ -1914,6 +1914,154 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, return ret; } +#ifdef KC_MM_VM_FAULT_T +static vm_fault_t scoutfs_data_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static int scoutfs_data_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ +#endif + struct page *page = vmf->page; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = inode->i_sb; + struct scoutfs_lock *lock = NULL; + SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); + DECLARE_DATA_WAIT(dw); + struct write_begin_data wbd; + u64 ind_seq; + loff_t pos; + loff_t size; + unsigned int len = PAGE_SIZE; + vm_fault_t ret = VM_FAULT_SIGBUS; + int err; + + pos = vmf->pgoff << PAGE_SHIFT; + + sb_start_pagefault(sb); + + err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, + SCOUTFS_LKF_REFRESH_INODE, inode, &lock); + if (err) { + ret = vmf_error(err); + goto out; + } + + size = i_size_read(inode); + + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, lock)) { + /* data_version is per inode, whole file must be online */ + err = scoutfs_data_wait_check(inode, 0, size, + SEF_OFFLINE, + SCOUTFS_IOC_DWO_WRITE, + &dw, lock); + if (err != 0) { + if (err < 0) + ret = vmf_error(err); + goto out_unlock; + } + } + + + /* 
scoutfs_write_begin */ + memset(&wbd, 0, sizeof(wbd)); + INIT_LIST_HEAD(&wbd.ind_locks); + wbd.lock = lock; + + /* + * Start transaction before taking page locks - we want to make sure we're + * not locking a page, then waiting for trans, because writeback might race + * against it and cause a lock inversion hang - as demonstrated by both + * holetest and fsstress tests in xfstests. + */ + do { + err = scoutfs_inode_index_start(sb, &ind_seq) ?: + scoutfs_inode_index_prepare(sb, &wbd.ind_locks, inode, + true) ?: + scoutfs_inode_index_try_lock_hold(sb, &wbd.ind_locks, + ind_seq, false); + } while (err > 0); + if (err < 0) { + ret = vmf_error(err); + goto out_trans; + } + + down_write(&si->extent_sem); + + if (!trylock_page(page)) { + ret = VM_FAULT_NOPAGE; + goto out_sem; + } + ret = VM_FAULT_LOCKED; + + if ((page->mapping != inode->i_mapping) || + (!PageUptodate(page)) || + (page_offset(page) > size)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out_sem; + } + + if (page->index == (size - 1) >> PAGE_SHIFT) + len = ((size - 1) & ~PAGE_MASK) + 1; + + err = __block_write_begin(page, pos, PAGE_SIZE, scoutfs_get_block); + if (err) { + ret = vmf_error(err); + unlock_page(page); + goto out_sem; + } + /* end scoutfs_write_begin */ + + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. 
+ */ + set_page_dirty(page); + wait_for_stable_page(page); + + /* scoutfs_write_end */ + scoutfs_inode_set_data_seq(inode); + scoutfs_inode_inc_data_version(inode); + + file_update_time(vma->vm_file); + + scoutfs_update_inode_item(inode, wbd.lock, &wbd.ind_locks); + scoutfs_inode_queue_writeback(inode); + +out_sem: + up_write(&si->extent_sem); +out_trans: + scoutfs_release_trans(sb); + scoutfs_inode_index_unlock(sb, &wbd.ind_locks); + /* end scoutfs_write_end */ + +out_unlock: + scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + +out: + sb_end_pagefault(sb); + + if (scoutfs_data_wait_found(&dw)) { + /* + * It'd be really nice to not hold the mmap_sem lock here + * before waiting for data, and then return VM_FAULT_RETRY + */ + err = scoutfs_data_wait(inode, &dw); + if (err == 0) + ret = VM_FAULT_NOPAGE; + else + ret = vmf_error(err); + } + + return ret; +} + #ifdef KC_MM_VM_FAULT_T static vm_fault_t scoutfs_data_filemap_fault(struct vm_fault *vmf) { @@ -1979,13 +2127,12 @@ static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_faul static const struct vm_operations_struct scoutfs_data_file_vm_ops = { .fault = scoutfs_data_filemap_fault, + .page_mkwrite = scoutfs_data_page_mkwrite, .remap_pages = generic_file_remap_pages, }; static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; file_accessed(file); vma->vm_ops = &scoutfs_data_file_vm_ops; return 0; From 311bf75902a4a32c6fb7486f6122dbd9b63712bd Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 10 Jun 2024 18:32:39 -0400 Subject: [PATCH 03/14] Add mmap tests. Two test programs are added. The run time is about 1min on my el7 instance. The test script finishes up with a read/write mmap test on offline extents to verify the data wait paths in those functions. 
One program will perform vfs read/write and mmap read/write calls on the same file from across 5 threads (mounts) repeatedly. The goal is to assure there are no locking issues between read/write paths. The second test program performs consistency checking on a file that is repeatedly written/read using memory maps and normal reads and writes, and the content is verified after every operation. Signed-off-by: Auke Kok --- tests/.gitignore | 2 + tests/Makefile | 8 +- tests/golden/mmap | 27 ++++++ tests/sequence | 1 + tests/src/mmap_stress.c | 181 ++++++++++++++++++++++++++++++++++++++ tests/src/mmap_validate.c | 159 +++++++++++++++++++++++++++++++++ tests/tests/mmap.sh | 54 ++++++++++++ 7 files changed, 430 insertions(+), 2 deletions(-) create mode 100644 tests/golden/mmap create mode 100644 tests/src/mmap_stress.c create mode 100644 tests/src/mmap_validate.c create mode 100644 tests/tests/mmap.sh diff --git a/tests/.gitignore b/tests/.gitignore index b19b962a..32ad161c 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -10,3 +10,5 @@ src/stage_tmpfile src/create_xattr_loop src/o_tmpfile_umask src/o_tmpfile_linkat +src/mmap_stress +src/mmap_validate diff --git a/tests/Makefile b/tests/Makefile index 4c61a0b3..3a2380dc 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -13,7 +13,9 @@ BIN := src/createmany \ src/create_xattr_loop \ src/fragmented_data_extents \ src/o_tmpfile_umask \ - src/o_tmpfile_linkat + src/o_tmpfile_linkat \ + src/mmap_stress \ + src/mmap_validate DEPS := $(wildcard src/*.d) @@ -23,8 +25,10 @@ ifneq ($(DEPS),) -include $(DEPS) endif +src/mmap_stress: LIBS+=-lpthread + $(BIN): %: %.c Makefile - gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ + gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS) .PHONY: clean clean: diff --git a/tests/golden/mmap b/tests/golden/mmap new file mode 100644 index 00000000..8d5a058e --- /dev/null +++ b/tests/golden/mmap @@ -0,0 +1,27 @@ +== mmap_stress +thread 0 complete +thread 1 complete +thread 2 complete +thread 3 complete 
+thread 4 complete +== basic mmap/read/write consistency checks +== mmap read from offline extent +0: offset: 0 length: 2 flags: O.L +extents: 1 +1 +00000200: ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ................ +0 +0: offset: 0 length: 2 flags: ..L +extents: 1 +== mmap write to an offline extent +0: offset: 0 length: 2 flags: O.L +extents: 1 +1 +0 +0: offset: 0 length: 2 flags: ..L +extents: 1 +00000000 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea |................| +00000010 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 |................| +00000020 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea |................| +00000030 +== done diff --git a/tests/sequence b/tests/sequence index 78001e59..6b78ff41 100644 --- a/tests/sequence +++ b/tests/sequence @@ -17,6 +17,7 @@ projects.sh large-fragmented-free.sh format-version-forward-back.sh enospc.sh +mmap.sh srch-safe-merge-pos.sh srch-basic-functionality.sh simple-xattr-unit.sh diff --git a/tests/src/mmap_stress.c b/tests/src/mmap_stress.c new file mode 100644 index 00000000..94a41484 --- /dev/null +++ b/tests/src/mmap_stress.c @@ -0,0 +1,181 @@ +#define _GNU_SOURCE +/* + * mmap() stress test for scoutfs + * + * This test exercises the scoutfs kernel module's locking by + * repeatedly reading/writing using mmap and pread/write calls + * across 5 clients (mounts). + * + * Each thread operates on a single thread/client, and performs + * operations in a random order on the file. + * + * The goal is to assure that locking between _page_mkwrite vfs + * calls and the normal read/write paths do not cause deadlocks. + * + * There is no content validation performed. All that is done is + * assure that the programs continues without errors. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int size = 0; +static int count = 0; /* XXX make this duration instead */ + +struct thread_info { + int nr; + int fd; +}; + +static void *run_test_func(void *ptr) +{ + void *buf = NULL; + char *addr = NULL; + struct thread_info *tinfo = ptr; + int c = 0; + int fd; + ssize_t read, written, ret; + int preads = 0, pwrites = 0, mreads = 0, mwrites = 0; + + fd = tinfo->fd; + + if (posix_memalign(&buf, 4096, size) != 0) { + perror("posix_memalign"); + exit(-1); + } + + addr = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(-1); + } + + usleep(100000); /* 0.1sec to allow all threads to start roughly at the same time */ + + for (;;) { + if (++c > count) + break; + + switch (rand() % 4) { + case 0: /* pread */ + preads++; + for (read = 0; read < size;) { + ret = pread(fd, buf, size - read, read); + if (ret < 0) { + perror("pread"); + exit(-1); + } + read += ret; + } + break; + case 1: /* pwrite */ + pwrites++; + memset(buf, (char)(c & 0xff), size); + for (written = 0; written < size;) { + ret = pwrite(fd, buf, size - written, written); + if (ret < 0) { + perror("pwrite"); + exit(-1); + } + written += ret; + } + break; + case 2: /* mmap read */ + mreads++; + memcpy(buf, addr, size); /* noerr */ + break; + case 3: /* mmap write */ + mwrites++; + memset(buf, (char)(c & 0xff), size); + memcpy(addr, buf, size); /* noerr */ + break; + } + } + + munmap(addr, size); + + free(buf); + + printf("thread %d complete: preads %d pwrites %d mreads %d mwrites %d\n", tinfo->nr, + preads, pwrites, mreads, mwrites); + + return NULL; +} + +int main(int argc, char **argv) +{ + pthread_t thread[5]; + struct thread_info tinfo[5]; + int fd[5]; + int ret; + int i; + + if (argc != 8) { + fprintf(stderr, "%s requires 7 arguments - size count file1 file2 file3 file4 file5\n", argv[0]); + exit(-1); + } + + size = 
atoi(argv[1]); + if (size <= 0) { + fprintf(stderr, "invalid size, must be greater than 0\n"); + exit(-1); + } + + count = atoi(argv[2]); + if (count < 0) { + fprintf(stderr, "invalid count, must be 0 or greater\n"); + exit(-1); + } + + /* create and truncate one fd */ + fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644); + if (fd[0] < 0) { + perror("open"); + exit(-1); + } + + /* make it the test size */ + if (posix_fallocate(fd[0], 0, size) != 0) { + perror("fallocate"); + exit(-1); + } + + /* now open the rest of the fds */ + for (i = 1; i < 5; i++) { + fd[i] = open(argv[3+i], O_RDWR); + if (fd[i] < 0) { + perror("open"); + exit(-1); + } + } + + /* start threads */ + for (i = 0; i < 5; i++) { + tinfo[i].fd = fd[i]; + tinfo[i].nr = i; + ret = pthread_create(&thread[i], NULL, run_test_func, (void*)&tinfo[i]); + + if (ret) { + perror("pthread_create"); + exit(-1); + } + } + + /* wait for complete */ + for (i = 0; i < 5; i++) + pthread_join(thread[i], NULL); + + for (i = 0; i < 5; i++) + close(fd[i]); + + exit(0); +} diff --git a/tests/src/mmap_validate.c b/tests/src/mmap_validate.c new file mode 100644 index 00000000..40f7435d --- /dev/null +++ b/tests/src/mmap_validate.c @@ -0,0 +1,159 @@ +#define _GNU_SOURCE +/* + * mmap() content consistency checking for scoutfs + * + * This test program validates that content from memory mappings + * is consistent across clients, whether written/read with mmap or + * normal writes/reads. + * + * One side of (read/write) will always be memory mapped. It may + * be that both sides are memory mapped (33% of the time). 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static int count = 0; +static int size = 0; + +static void run_test_func(int fd1, int fd2) +{ + void *buf1 = NULL; + void *buf2 = NULL; + char *addr1 = NULL; + char *addr2 = NULL; + int c = 0; + ssize_t read, written, ret; + + /* buffers for both sides to compare */ + if (posix_memalign(&buf1, 4096, size) != 0) { + perror("posix_memalign1"); + exit(-1); + } + + if (posix_memalign(&buf2, 4096, size) != 0) { + perror("posix_memalign2"); + exit(-1); + } + + /* memory maps for both sides */ + addr1 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd1, 0); + if (addr1 == MAP_FAILED) { + perror("mmap1"); + exit(-1); + } + + addr2 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd2, 0); + if (addr2 == MAP_FAILED) { + perror("mmap2"); + exit(-1); + } + + for (;;) { + if (++c > count) /* run count iterations */ + break; + + /* put a pattern in buf1 */ + memset(buf1, c & 0xff, size); + + /* pwrite or mmap write from buf1 */ + switch (c % 3) { + case 0: /* pwrite */ + for (written = 0; written < size;) { + ret = pwrite(fd1, (char *)buf1 + written, size - written, written); + if (ret < 0) { + perror("pwrite"); + exit(-1); + } + written += ret; + } + break; + default: /* mmap write */ + memcpy(addr1, buf1, size); + break; + } + + /* pread or mmap read to buf2 */ + switch (c % 3) { + case 2: /* pread */ + for (read = 0; read < size;) { + ret = pread(fd2, (char *)buf2 + read, size - read, read); + if (ret < 0) { + perror("pread"); + exit(-1); + } + read += ret; + } + break; + default: /* mmap read */ + memcpy(buf2, addr2, size); + break; + } + + /* compare bufs */ + if (memcmp(buf1, buf2, size) != 0) { + fprintf(stderr, "memcmp() failed\n"); + exit(-1); + } + } + + munmap(addr1, size); + munmap(addr2, size); + + free(buf1); + free(buf2); +} + +int main(int argc, char **argv) +{ + int fd[2]; + + if (argc != 5) { + fprintf(stderr, "%s requires 4 arguments - size count file1 file2\n", argv[0]); + exit(-1); + } + + size = atoi(argv[1]); + if 
(size <= 0) { + fprintf(stderr, "invalid size, must be greater than 0\n"); + exit(-1); + } + + count = atoi(argv[2]); + if (count < 3) { + fprintf(stderr, "invalid count, must be at least 3\n"); + exit(-1); + } + + /* create and truncate one fd */ + fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644); + if (fd[0] < 0) { + perror("open"); + exit(-1); + } + + fd[1] = open(argv[4], O_RDWR , 00644); + if (fd[1] < 0) { + perror("open"); + exit(-1); + } + + /* make it the test size */ + if (posix_fallocate(fd[0], 0, size) != 0) { + perror("fallocate"); + exit(-1); + } + + /* run the test function */ + run_test_func(fd[0], fd[1]); + + close(fd[0]); + close(fd[1]); + + exit(0); +} diff --git a/tests/tests/mmap.sh b/tests/tests/mmap.sh new file mode 100644 index 00000000..bf465ce9 --- /dev/null +++ b/tests/tests/mmap.sh @@ -0,0 +1,54 @@ +# +# test mmap() and normal read/write consistency between different nodes +# + +t_require_commands mmap_stress mmap_validate scoutfs xfs_io + +echo "== mmap_stress" +mmap_stress 8192 2000 "$T_D0/mmap_stress" "$T_D1/mmap_stress" "$T_D2/mmap_stress" "$T_D3/mmap_stress" "$T_D4/mmap_stress" | sed 's/:.*//g' | sort + +echo "== basic mmap/read/write consistency checks" +mmap_validate 256 1000 "$T_D0/mmap_val1" "$T_D1/mmap_val1" +mmap_validate 8192 1000 "$T_D0/mmap_val2" "$T_D1/mmap_val2" +mmap_validate 88400 1000 "$T_D0/mmap_val3" "$T_D1/mmap_val3" + +echo "== mmap read from offline extent" +F="$T_D0/mmap-offline" +touch "$F" +xfs_io -c "pwrite -S 0xEA 0 8192" "$F" > /dev/null +cp "$F" "${F}-stage" +vers=$(scoutfs stat -s data_version "$F") +scoutfs release "$F" -V "$vers" -o 0 -l 8192 +scoutfs get-fiemap -L "$F" +xfs_io -c "mmap -rwx 0 8192" \ + -c "mread -v 512 16" "$F" & +sleep 1 +# should be 1 - data waiting +jobs | wc -l +scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192 +# xfs_io thread will output 16 bytes of read data +sleep 1 +# should be 0 - no more waiting jobs, xfs_io should have exited +jobs | wc -l +scoutfs 
get-fiemap -L "$F" + +echo "== mmap write to an offline extent" +# reuse the same file +scoutfs release "$F" -V "$vers" -o 0 -l 8192 +scoutfs get-fiemap -L "$F" +xfs_io -c "mmap -rwx 0 8192" \ + -c "mwrite -S 0x11 528 16" "$F" & +sleep 1 +# should be 1 job waiting +jobs | wc -l +scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192 +# no output here from write +sleep 1 +# should be 0 - no more waiting jobs, xfs_io should have exited +jobs | wc -l +scoutfs get-fiemap -L "$F" +# read back contents to assure write changed the file +dd status=none if="$F" bs=1 count=48 skip=512 | hexdump -C + +echo "== done" +t_pass From 92f704d35a4ba39f1fd751b210315b6e238382b3 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 22 Jul 2024 15:18:49 -0400 Subject: [PATCH 04/14] Enable all xfstests mmap() tests. Now that all of these should be passing, we enable all mmap() tests in xfstests, and update the golden output with the new tests. Signed-off-by: Auke Kok --- tests/golden/xfstests | 19 ++++++++++++++++++- tests/tests/xfstests.sh | 19 +------------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/golden/xfstests b/tests/golden/xfstests index 8247aa14..c4032ca9 100644 --- a/tests/golden/xfstests +++ b/tests/golden/xfstests @@ -22,6 +22,8 @@ generic/024 generic/025 generic/026 generic/028 +generic/029 +generic/030 generic/031 generic/032 generic/033 @@ -53,6 +55,7 @@ generic/073 generic/076 generic/078 generic/079 +generic/080 generic/081 generic/082 generic/084 @@ -81,10 +84,12 @@ generic/116 generic/117 generic/118 generic/119 +generic/120 generic/121 generic/122 generic/123 generic/124 +generic/126 generic/128 generic/129 generic/130 @@ -95,6 +100,7 @@ generic/136 generic/138 generic/139 generic/140 +generic/141 generic/142 generic/143 generic/144 @@ -153,6 +159,7 @@ generic/210 generic/211 generic/212 generic/214 +generic/215 generic/216 generic/217 generic/218 @@ -173,6 +180,9 @@ generic/238 generic/240 generic/244 generic/245 +generic/246 
+generic/247 +generic/248 generic/249 generic/250 generic/252 @@ -231,6 +241,7 @@ generic/317 generic/319 generic/322 generic/324 +generic/325 generic/326 generic/327 generic/328 @@ -244,6 +255,7 @@ generic/337 generic/341 generic/342 generic/343 +generic/346 generic/348 generic/353 generic/355 @@ -305,7 +317,9 @@ generic/424 generic/425 generic/426 generic/427 +generic/428 generic/436 +generic/437 generic/439 generic/440 generic/443 @@ -315,6 +329,7 @@ generic/448 generic/449 generic/450 generic/451 +generic/452 generic/453 generic/454 generic/456 @@ -438,6 +453,7 @@ generic/610 generic/611 generic/612 generic/613 +generic/614 generic/618 generic/621 generic/623 @@ -451,6 +467,7 @@ generic/632 generic/634 generic/635 generic/637 +generic/638 generic/639 generic/640 generic/644 @@ -862,4 +879,4 @@ generic/688 generic/689 shared/002 shared/032 -Passed all 495 tests +Passed all 512 tests diff --git a/tests/tests/xfstests.sh b/tests/tests/xfstests.sh index f2850a62..b0ae44cf 100644 --- a/tests/tests/xfstests.sh +++ b/tests/tests/xfstests.sh @@ -65,26 +65,14 @@ EOF cat << EOF > local.exclude generic/003 # missing atime update in buffered read -generic/029 # mmap missing -generic/030 # mmap missing generic/075 # file content mismatch failures (fds, etc) -generic/080 # mmap missing generic/103 # enospc causes trans commit failures generic/108 # mount fails on failing device? generic/112 # file content mismatch failures (fds, etc) -generic/120 # (can't exec 'cause no mmap) -generic/126 # (can't exec 'cause no mmap) -generic/141 # mmap missing generic/213 # enospc causes trans commit failures -generic/215 # mmap missing -generic/246 # mmap missing -generic/247 # mmap missing -generic/248 # mmap missing generic/318 # can't support user namespaces until v5.11 generic/321 # requires selinux enabled for '+' in ls? -generic/325 # mmap missing generic/338 # BUG_ON update inode error handling -generic/346 # mmap missing generic/347 # _dmthin_mount doesn't work? 
generic/356 # swap generic/357 # swap @@ -92,16 +80,13 @@ generic/409 # bind mounts not scripted yet generic/410 # bind mounts not scripted yet generic/411 # bind mounts not scripted yet generic/423 # symlink inode size is strlen() + 1 on scoutfs -generic/428 # mmap missing generic/430 # xfs_io copy_range missing in el7 generic/431 # xfs_io copy_range missing in el7 generic/432 # xfs_io copy_range missing in el7 generic/433 # xfs_io copy_range missing in el7 generic/434 # xfs_io copy_range missing in el7 -generic/437 # mmap missing generic/441 # dm-mapper generic/444 # el9's posix_acl_update_mode is buggy ? -generic/452 # exec test - no mmap generic/467 # open_by_handle ESTALE generic/472 # swap generic/484 # dm-mapper @@ -118,11 +103,9 @@ generic/565 # xfs_io copy_range missing in el7 generic/568 # falloc not resulting in block count increase generic/569 # swap generic/570 # swap -generic/614 # mmap missing generic/620 # dm-hugedisk -generic/633 # mmap, id-mapped mounts missing in el7 +generic/633 # id-mapped mounts missing in el7 generic/636 # swap -generic/638 # mmap missing generic/641 # swap generic/643 # swap EOF From 519b47a53c80e3dc58cf6e7d8dd5402e67111367 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 5 Aug 2024 17:59:34 -0400 Subject: [PATCH 05/14] mmap() trace events. We merely trace exit values and position, and ignore length. Because vm_fault_t is __bitwise, sparse will loudly complain about a plain cast to u32, so we must __force (on el8). ret will be 512 in normal cases. 
Signed-off-by: Auke Kok --- kmod/src/data.c | 4 ++++ kmod/src/scoutfs_trace.h | 46 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/kmod/src/data.c b/kmod/src/data.c index ce3cafe8..907d4fdf 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -2059,6 +2059,8 @@ static int scoutfs_data_page_mkwrite(struct vm_area_struct *vma, ret = vmf_error(err); } + trace_scoutfs_data_page_mkwrite(sb, scoutfs_ino(inode), pos, (__force u32)ret); + return ret; } @@ -2122,6 +2124,8 @@ static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_faul ret = VM_FAULT_RETRY; } + trace_scoutfs_data_filemap_fault(sb, scoutfs_ino(inode), pos, (__force u32)ret); + return ret; } diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index e9c09750..930a275a 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -286,6 +286,52 @@ TRACE_EVENT(scoutfs_data_alloc_block_enter, STE_ENTRY_ARGS(ext)) ); +TRACE_EVENT(scoutfs_data_page_mkwrite, + TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret), + + TP_ARGS(sb, ino, pos, ret), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, ino) + __field(__u64, pos) + __field(__u32, ret) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; + __entry->pos = pos; + __entry->ret = ret; + ), + + TP_printk(SCSBF" ino %llu pos %llu ret %u ", + SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret) +); + +TRACE_EVENT(scoutfs_data_filemap_fault, + TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret), + + TP_ARGS(sb, ino, pos, ret), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, ino) + __field(__u64, pos) + __field(__u32, ret) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; + __entry->pos = pos; + __entry->ret = ret; + ), + + TP_printk(SCSBF" ino %llu pos %llu ret %u ", + SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret) +); + DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class, 
TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), From b944f609aaaac8ed6f6361e1de20247d595781dc Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Fri, 31 May 2024 13:29:25 -0400 Subject: [PATCH 06/14] remap_pages ops becomes obsolete. --- kmod/src/Makefile.kernelcompat | 7 +++++++ kmod/src/data.c | 2 ++ 2 files changed, 9 insertions(+) diff --git a/kmod/src/Makefile.kernelcompat b/kmod/src/Makefile.kernelcompat index 211ee0d4..b248658c 100644 --- a/kmod/src/Makefile.kernelcompat +++ b/kmod/src/Makefile.kernelcompat @@ -447,3 +447,10 @@ endif ifneq (,$(shell grep 'typedef.*__bitwise unsigned.*int vm_fault_t' include/linux/mm_types.h)) ccflags-y += -DKC_MM_VM_FAULT_T endif + +# v3.19-499-gd83a08db5ba6 +# +# .remap pages becomes obsolete +ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h)) +ccflags-y += -DKC_MM_REMAP_PAGES +endif diff --git a/kmod/src/data.c b/kmod/src/data.c index 907d4fdf..3ebccc9b 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -2132,7 +2132,9 @@ static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_faul static const struct vm_operations_struct scoutfs_data_file_vm_ops = { .fault = scoutfs_data_filemap_fault, .page_mkwrite = scoutfs_data_page_mkwrite, +#ifdef KC_MM_REMAP_PAGES .remap_pages = generic_file_remap_pages, +#endif }; static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma) From 1bcd1d4d009c74c570f3b96f56b6aea641e5141a Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Fri, 12 Jul 2024 15:18:12 -0400 Subject: [PATCH 07/14] Drop readdir pre-.iterate() compat (el7.5ish). These 2 sections of compat for readdir are wholly obsolete and can be hard dropped, which restores the method to look like current upstream code. This was added in ddd1a4e. 
Signed-off-by: Auke Kok --- kmod/src/Makefile.kernelcompat | 20 ---------------- kmod/src/dir.c | 17 +++++++------ kmod/src/kernelcompat.h | 44 ---------------------------------- 3 files changed, 8 insertions(+), 73 deletions(-) diff --git a/kmod/src/Makefile.kernelcompat b/kmod/src/Makefile.kernelcompat index b248658c..d21b51f5 100644 --- a/kmod/src/Makefile.kernelcompat +++ b/kmod/src/Makefile.kernelcompat @@ -6,26 +6,6 @@ ccflags-y += -include $(src)/kernelcompat.h -# -# v3.10-rc6-21-gbb6f619b3a49 -# -# _readdir changes from fop->readdir() to fop->iterate() and from -# filldir(dirent) to dir_emit(ctx). -# -ifneq (,$(shell grep 'iterate.*dir_context' include/linux/fs.h)) -ccflags-y += -DKC_ITERATE_DIR_CONTEXT -endif - -# -# v3.10-rc6-23-g5f99f4e79abc -# -# Helpers including dir_emit_dots() are added in the process of -# switching dcache_readdir() from fop->readdir() to fop->iterate() -# -ifneq (,$(shell grep 'dir_emit_dots' include/linux/fs.h)) -ccflags-y += -DKC_DIR_EMIT_DOTS -endif - # # v3.18-rc2-19-gb5ae6b15bd73 # diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 21c20274..bd1540ff 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -441,8 +441,7 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, * It will need to be careful not to read past the region of the dirent * hash offset keys that it has access to. 
*/ -static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file, - void *dirent, kc_readdir_ctx_t ctx) +static int scoutfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; @@ -454,7 +453,7 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file, u64 pos; int ret; - if (!kc_dir_emit_dots(file, dirent, ctx)) + if (!dir_emit_dots(file, ctx)) return 0; dent = alloc_dirent(SCOUTFS_NAME_LEN); @@ -471,7 +470,7 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file, for (;;) { init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), - kc_readdir_pos(file, ctx), 0); + ctx->pos, 0); ret = scoutfs_item_next(sb, &key, &last_key, dent, dirent_bytes(SCOUTFS_NAME_LEN), @@ -488,23 +487,23 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file, corrupt_dirent_readdir_name_len, "dir_ino %llu pos %llu key "SK_FMT" len %d", scoutfs_ino(inode), - kc_readdir_pos(file, ctx), + ctx->pos, SK_ARG(&key), name_len); ret = -EIO; goto out; } pos = le64_to_cpu(key.skd_major); - kc_readdir_pos(file, ctx) = pos; + ctx->pos = pos; - if (!kc_dir_emit(ctx, dirent, dent->name, name_len, pos, + if (!dir_emit(ctx, dent->name, name_len, le64_to_cpu(dent->ino), dentry_type(dent->type))) { ret = 0; break; } - kc_readdir_pos(file, ctx) = pos + 1; + ctx->pos = pos + 1; } out: @@ -1973,7 +1972,7 @@ const struct inode_operations scoutfs_symlink_iops = { }; const struct file_operations scoutfs_dir_fops = { - .KC_FOP_READDIR = scoutfs_readdir, + .iterate = scoutfs_readdir, #ifdef KC_FMODE_KABI_ITERATE .open = scoutfs_dir_open, #endif diff --git a/kmod/src/kernelcompat.h b/kmod/src/kernelcompat.h index 74ad0d66..691db6b4 100644 --- a/kmod/src/kernelcompat.h +++ b/kmod/src/kernelcompat.h @@ -29,50 +29,6 @@ do { \ }) #endif -#ifndef KC_ITERATE_DIR_CONTEXT -typedef filldir_t kc_readdir_ctx_t; -#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, dirent, ctx) -#define 
KC_FOP_READDIR readdir -#define kc_readdir_pos(filp, ctx) (filp)->f_pos -#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, dirent, ctx) -#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \ - (ctx(dirent, name, name_len, pos, ino, dt) == 0) -#else -typedef struct dir_context * kc_readdir_ctx_t; -#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, ctx) -#define KC_FOP_READDIR iterate -#define kc_readdir_pos(filp, ctx) (ctx)->pos -#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, ctx) -#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \ - dir_emit(ctx, name, name_len, ino, dt) -#endif - -#ifndef KC_DIR_EMIT_DOTS -/* - * Kernels before ->iterate and don't have dir_emit_dots so we give them - * one that works with the ->readdir() filldir() method. - */ -static inline int dir_emit_dots(struct file *file, void *dirent, - filldir_t filldir) -{ - if (file->f_pos == 0) { - if (filldir(dirent, ".", 1, 1, - file->f_path.dentry->d_inode->i_ino, DT_DIR)) - return 0; - file->f_pos = 1; - } - - if (file->f_pos == 1) { - if (filldir(dirent, "..", 2, 1, - parent_ino(file->f_path.dentry), DT_DIR)) - return 0; - file->f_pos = 2; - } - - return 1; -} -#endif - #ifdef KC_POSIX_ACL_VALID_USER_NS #define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(user_ns, acl) #else From e59a5f8ebda5b17c3cce3afbeecb76da5f01cc05 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 21 Jan 2025 18:06:39 -0500 Subject: [PATCH 08/14] Readdir w/offset validation. Verify using xfs_io that readdir offsets match expected output. 
Signed-off-by: Auke Kok --- tests/golden/simple-readdir | 97 +++++++++++++++++++++++++++++++++++ tests/sequence | 1 + tests/tests/simple-readdir.sh | 37 +++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tests/golden/simple-readdir create mode 100644 tests/tests/simple-readdir.sh diff --git a/tests/golden/simple-readdir b/tests/golden/simple-readdir new file mode 100644 index 00000000..c2661939 --- /dev/null +++ b/tests/golden/simple-readdir @@ -0,0 +1,97 @@ +== create content +== readdir all +00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: . +00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: .. +00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a +00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa +00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa +00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa +00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000009: d_off: 0x0000000a d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000a: d_off: 0x0000000b d_reclen: 0x50 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000b: d_off: 0x0000000c d_reclen: 0x58 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000c: d_off: 0x0000000d d_reclen: 0x60 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000d: d_off: 0x0000000e d_reclen: 0x68 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000e: d_off: 0x0000000f d_reclen: 0x70 d_type: 
DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000000f: d_off: 0x00000010 d_reclen: 0x70 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000010: d_off: 0x00000011 d_reclen: 0x78 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000011: d_off: 0x00000012 d_reclen: 0x80 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000012: d_off: 0x00000013 d_reclen: 0x88 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000013: d_off: 0x00000014 d_reclen: 0x90 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +== readdir offset +00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +== readdir len (bytes) +00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: . +00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: .. +00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a +00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa +00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa +00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa +00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa +== introduce gap +00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: . +00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: .. 
+00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a +00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa +00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa +00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa +00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000009: d_off: 0x00000014 d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000021: d_off: 0x00000022 
d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +== cleanup diff --git a/tests/sequence b/tests/sequence index 6b78ff41..18eff7cf 100644 --- a/tests/sequence +++ b/tests/sequence @@ -6,6 +6,7 @@ inode-items-updated.sh simple-inode-index.sh simple-staging.sh simple-release-extents.sh +simple-readdir.sh get-referring-entries.sh fallocate.sh basic-truncate.sh diff --git a/tests/tests/simple-readdir.sh b/tests/tests/simple-readdir.sh new file mode 100644 index 00000000..03ccb3be --- /dev/null +++ b/tests/tests/simple-readdir.sh @@ -0,0 +1,37 @@ +# +# verify d_off output 
of xfs_io is consistent. +# + +t_require_commands xfs_io + +filt() +{ + grep d_off | cut -d ' ' -f 1,4- +} + +echo "== create content" +for s in $(seq 1 7 250); do + f=$(printf '%*s' $s | tr ' ' 'a') + touch ${T_D0}/$f +done + +echo "== readdir all" +xfs_io -c "readdir -v" $T_D0 | filt + +echo "== readdir offset" +xfs_io -c "readdir -v -o 20" $T_D0 | filt + +echo "== readdir len (bytes)" +xfs_io -c "readdir -v -l 193" $T_D0 | filt + +echo "== introduce gap" +for s in $(seq 57 7 120); do + f=$(printf '%*s' $s | tr ' ' 'a') + rm -f ${T_D0}/$f +done +xfs_io -c "readdir -v" $T_D0 | filt + +echo "== cleanup" +rm -rf $T_D0 + +t_pass From cad12d5ce80ea9d4af15b8f10b385dbb6e1358ec Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 16 Jul 2024 19:44:09 -0400 Subject: [PATCH 09/14] Avoid deadlock in _readdir() due to copy_to_user(). dir_emit() will copy_to_user, which can pagefault. If this happens while cluster locked, we could deadlock. We use a single page to stage dir_emit data, and iterate between fetching dirents while locked, and emitting them while not locked. Signed-off-by: Auke Kok --- kmod/src/dir.c | 119 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 37 deletions(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index bd1540ff..fb468df1 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -11,11 +11,13 @@ * General Public License for more details. 
*/ #include +#include #include #include #include #include #include +#include #include "format.h" #include "file.h" @@ -434,6 +436,15 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } +/* + * Helper to make iterating through dirent ptrs aligned + */ +static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *dent, u8 len) +{ + return (void *)dent + + ALIGN(offsetof(struct scoutfs_dirent, name[len]), __alignof__(struct scoutfs_dirent)); +} + /* * readdir simply iterates over the dirent items for the dir inode and * uses their offset as the readdir position. @@ -447,69 +458,103 @@ static int scoutfs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_dirent *dent = NULL; +/* we'll store name_len in dent->__pad[0] */ +#define hacky_name_len __pad[0] struct scoutfs_key last_key; struct scoutfs_key key; + struct page *page = NULL; int name_len; u64 pos; + int entries = 0; int ret; + int complete = 0; + struct scoutfs_dirent *end; if (!dir_emit_dots(file, ctx)) return 0; - dent = alloc_dirent(SCOUTFS_NAME_LEN); - if (!dent) { + page = alloc_page(GFP_KERNEL); + if (!page) return -ENOMEM; - } + + end = page_address(page) + PAGE_SIZE; init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), SCOUTFS_DIRENT_LAST_POS, 0); - ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock); - if (ret) - goto out; - + /* + * lock and fetch dirent items, until the page no longer fits + * a max size dirent (288b). Then unlock and dir_emit the ones + * we stored in the page. 
+ */ for (;;) { - init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), - ctx->pos, 0); - - ret = scoutfs_item_next(sb, &key, &last_key, dent, - dirent_bytes(SCOUTFS_NAME_LEN), - dir_lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; + /* lock */ + ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock); + if (ret) break; - } - name_len = ret - sizeof(struct scoutfs_dirent); - if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) { - scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN, - corrupt_dirent_readdir_name_len, - "dir_ino %llu pos %llu key "SK_FMT" len %d", - scoutfs_ino(inode), - ctx->pos, - SK_ARG(&key), name_len); - ret = -EIO; - goto out; + dent = page_address(page); + pos = ctx->pos; + while (next_aligned_dirent(dent, SCOUTFS_NAME_LEN) < end) { + init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), + pos, 0); + + ret = scoutfs_item_next(sb, &key, &last_key, dent, + dirent_bytes(SCOUTFS_NAME_LEN), + dir_lock); + if (ret < 0) { + if (ret == -ENOENT) { + ret = 0; + complete = 1; + } + break; + } + + name_len = ret - sizeof(struct scoutfs_dirent); + dent->hacky_name_len = name_len; + if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) { + scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN, + corrupt_dirent_readdir_name_len, + "dir_ino %llu pos %llu key "SK_FMT" len %d", + scoutfs_ino(inode), + pos, + SK_ARG(&key), name_len); + ret = -EIO; + break; + } + + pos = le64_to_cpu(dent->pos) + 1; + + dent = next_aligned_dirent(dent, name_len); + entries++; } - pos = le64_to_cpu(key.skd_major); - ctx->pos = pos; + /* unlock */ + scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ); - if (!dir_emit(ctx, dent->name, name_len, - le64_to_cpu(dent->ino), - dentry_type(dent->type))) { - ret = 0; + if (ret < 0) break; + + dent = page_address(page); + for (; entries > 0; entries--) { + if (!dir_emit(ctx, dent->name, dent->hacky_name_len, + le64_to_cpu(dent->ino), + dentry_type(dent->type))) { + ret = 0; + goto out; + } + ctx->pos = 
le64_to_cpu(dent->pos) + 1; + + dent = next_aligned_dirent(dent, dent->hacky_name_len); } - ctx->pos = pos + 1; + if (complete) + break; } out: - scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ); - - kfree(dent); + if (page) + __free_page(page); return ret; } From 8cb08507d6eaaa9ae2dc678683b60870f5880594 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Fri, 2 Aug 2024 14:11:10 -0400 Subject: [PATCH 10/14] Do not copy to user while holding locks in scoutfs_data_fiemap() Now that we support mmap writes, at any point in time we could pagefault and lock for writes. That means - just like readdir - we can no longer lock and copy_to_user, since it also may page fault and thus deadlock. We allocate a single page to stage extent entries and use it to shuffle out fiemap entries a batch at a time, locking and unlocking around collecting them, and calling fill_extent() only while unlocked. Signed-off-by: Auke Kok --- kmod/src/data.c | 112 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 3ebccc9b..7903e8d7 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1551,13 +1551,17 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct scoutfs_lock *lock = NULL; + struct scoutfs_extent *info = NULL; + struct page *page = NULL; struct scoutfs_extent ext; struct scoutfs_extent cur; struct data_ext_args args; u32 last_flags; u64 iblock; u64 last; + int entries = 0; int ret; + int complete = 0; if (len == 0) { ret = 0; @@ -1568,16 +1572,11 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) goto out; - inode_lock(inode); - down_read(&si->extent_sem); - - ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock); - if (ret) - goto unlock; - - args.ino = ino; - args.inode = inode; - args.lock = lock; + page = alloc_page(GFP_KERNEL); + if (!page) { + ret =
-ENOMEM; + goto out; + } /* use a dummy extent to track */ memset(&cur, 0, sizeof(cur)); @@ -1586,48 +1585,93 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, iblock = start >> SCOUTFS_BLOCK_SM_SHIFT; last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; + args.ino = ino; + args.inode = inode; + + /* outer loop */ while (iblock <= last) { - ret = scoutfs_ext_next(sb, &data_ext_ops, &args, - iblock, 1, &ext); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - last_flags = FIEMAP_EXTENT_LAST; + /* lock */ + inode_lock(inode); + down_read(&si->extent_sem); + + ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock); + if (ret) { + up_read(&si->extent_sem); + inode_unlock(inode); break; } - trace_scoutfs_data_fiemap_extent(sb, ino, &ext); + args.lock = lock; - if (ext.start > last) { - /* not setting _LAST, it's for end of file */ - ret = 0; - break; + /* collect entries */ + info = page_address(page); + memset(info, 0, PAGE_SIZE); + while (entries < (PAGE_SIZE / sizeof(struct fiemap_extent)) - 1) { + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + complete = 1; + last_flags = FIEMAP_EXTENT_LAST; + break; + } + + trace_scoutfs_data_fiemap_extent(sb, ino, &ext); + + if (ext.start > last) { + /* not setting _LAST, it's for end of file */ + ret = 0; + complete = 1; + break; + } + + if (scoutfs_ext_can_merge(&cur, &ext)) { + /* merged extents could be greater than input len */ + cur.len += ext.len; + } else { + /* fill it */ + memcpy(info, &cur, sizeof(cur)); + + entries++; + info++; + + cur = ext; + } + + iblock = ext.start + ext.len; } - if (scoutfs_ext_can_merge(&cur, &ext)) { - /* merged extents could be greater than input len */ - cur.len += ext.len; - } else { - ret = fill_extent(fieinfo, &cur, 0); + /* unlock */ + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + up_read(&si->extent_sem); + inode_unlock(inode); + + if (ret) + break; + + /* emit entries */ 
+ info = page_address(page); + for (; entries > 0; entries--) { + ret = fill_extent(fieinfo, info, 0); if (ret != 0) - goto unlock; - cur = ext; + goto out; + info++; } - iblock = ext.start + ext.len; + if (complete) + break; } + /* still one left, it's in cur */ if (cur.len) ret = fill_extent(fieinfo, &cur, last_flags); -unlock: - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); - up_read(&si->extent_sem); - inode_unlock(inode); out: if (ret == 1) ret = 0; - + if (page) + __free_page(page); trace_scoutfs_data_fiemap(sb, start, len, ret); return ret; From e76a171c40a366e118d7e4962db0cfa5d42425aa Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 20 Aug 2024 16:18:03 -0400 Subject: [PATCH 11/14] Avoid faulting while cluster locked in _walk_inodes. Similar to readdir and fiemap vfs methods, we can't copy to user while holding cluster locks. The previous comment about it being safe no longer applies, and this could deadlock. Rewrite the loop to iterate and store entries in a page, then flush the page contents while not holding a clusterlock. Signed-off-by: Auke Kok --- kmod/src/ioctl.c | 144 +++++++++++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index ebde0a05..dd4b5eb8 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -58,25 +58,23 @@ * key space after we find no items in a given lock region. This is * relatively cheap because reading is going to check the segments * anyway. - * - * This is copying to userspace while holding a read lock. This is safe - * because faulting can send a request for a write lock while the read - * lock is being used. The cluster locks don't block tasks in a node, - * they match and the tasks fall back to local locking. In this case - * the spin locks around the item cache. 
*/ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg) { struct super_block *sb = file_inode(file)->i_sb; struct scoutfs_ioctl_walk_inodes __user *uwalk = (void __user *)arg; struct scoutfs_ioctl_walk_inodes walk; - struct scoutfs_ioctl_walk_inodes_entry ent; + struct scoutfs_ioctl_walk_inodes_entry *ent = NULL; + struct scoutfs_ioctl_walk_inodes_entry *end; struct scoutfs_key next_key; struct scoutfs_key last_key; struct scoutfs_key key; struct scoutfs_lock *lock; + struct page *page = NULL; u64 last_seq; + u64 entries = 0; int ret = 0; + int complete = 0; u32 nr = 0; u8 type; @@ -107,6 +105,10 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg) } } + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + scoutfs_inode_init_index_key(&key, type, walk.first.major, walk.first.minor, walk.first.ino); scoutfs_inode_init_index_key(&last_key, type, walk.last.major, @@ -115,77 +117,107 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg) /* cap nr to the max the ioctl can return to a compat task */ walk.nr_entries = min_t(u64, walk.nr_entries, INT_MAX); - ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type, - walk.first.major, walk.first.ino, - &lock); - if (ret < 0) - goto out; + end = page_address(page) + PAGE_SIZE; - for (nr = 0; nr < walk.nr_entries; ) { + /* outer loop */ + for (nr = 0;;) { + ent = page_address(page); + /* make sure _pad and minor are zeroed */ + memset(ent, 0, PAGE_SIZE); - ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock); - if (ret < 0 && ret != -ENOENT) + ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type, + le64_to_cpu(key.skii_major), + le64_to_cpu(key.skii_ino), + &lock); + if (ret) break; - if (ret == -ENOENT) { - - /* done if lock covers last iteration key */ - if (scoutfs_key_compare(&last_key, &lock->end) <= 0) { - ret = 0; + /* inner loop 1 */ + while (ent + 1 < end) { + ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock); + 
if (ret < 0 && ret != -ENOENT) break; + + if (ret == -ENOENT) { + /* done if lock covers last iteration key */ + if (scoutfs_key_compare(&last_key, &lock->end) <= 0) { + ret = 0; + complete = 1; + break; + } + + /* continue iterating after locked empty region */ + key = lock->end; + scoutfs_key_inc(&key); + + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + /* avoid double-unlocking here after break */ + lock = NULL; + + ret = scoutfs_forest_next_hint(sb, &key, &next_key); + if (ret < 0 && ret != -ENOENT) + break; + + if (ret == -ENOENT || + scoutfs_key_compare(&next_key, &last_key) > 0) { + ret = 0; + complete = 1; + break; + } + + key = next_key; + + ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, + type, + le64_to_cpu(key.skii_major), + le64_to_cpu(key.skii_ino), + &lock); + if (ret) + break; + + continue; } - /* continue iterating after locked empty region */ - key = lock->end; - scoutfs_key_inc(&key); + ent->major = le64_to_cpu(key.skii_major); + ent->ino = le64_to_cpu(key.skii_ino); - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + scoutfs_key_inc(&key); - ret = scoutfs_forest_next_hint(sb, &key, &next_key); - if (ret < 0 && ret != -ENOENT) - goto out; + ent++; + entries++; - if (ret == -ENOENT || - scoutfs_key_compare(&next_key, &last_key) > 0) { - ret = 0; - goto out; + if (nr + entries >= walk.nr_entries) { + complete = 1; + break; } + } - key = next_key; + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + if (ret < 0) + break; - ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, - key.sk_type, - le64_to_cpu(key.skii_major), - le64_to_cpu(key.skii_ino), - &lock); - if (ret < 0) + /* inner loop 2 */ + ent = page_address(page); + for (; entries > 0; entries--) { + if (copy_to_user((void __user *)walk.entries_ptr, ent, + sizeof(struct scoutfs_ioctl_walk_inodes_entry))) { + ret = -EFAULT; goto out; - - continue; + } + walk.entries_ptr += sizeof(struct scoutfs_ioctl_walk_inodes_entry); + ent++; + nr++; } - ent.major = le64_to_cpu(key.skii_major); - ent.minor 
= 0; - ent.ino = le64_to_cpu(key.skii_ino); - - if (copy_to_user((void __user *)walk.entries_ptr, &ent, - sizeof(ent))) { - ret = -EFAULT; + if (complete) break; - } - - nr++; - walk.entries_ptr += sizeof(ent); - - scoutfs_key_inc(&key); } - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); - out: + if (page) + __free_page(page); if (nr > 0) ret = nr; - return ret; } From 8b76a53cf34eb0ea9075855d012d21399d648034 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Wed, 21 Aug 2024 18:24:33 -0400 Subject: [PATCH 12/14] Avoid cluster locking while put_user() in _allocated_inos. Similar to fiemap, readdir and walk_inodes, this method called put_user() while holding a cluster lock. put_user() can page fault, which could potentially deadlock. Collect the inode numbers in a page while locked and copy them to userspace after unlocking. Signed-off-by: Auke Kok --- kmod/src/ioctl.c | 86 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index dd4b5eb8..fea7aae3 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -1195,11 +1195,15 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg) struct scoutfs_lock *lock = NULL; struct scoutfs_key key; struct scoutfs_key end; + struct page *page = NULL; u64 __user *uinos; u64 bytes; - u64 ino; + u64 *ino; + u64 *ino_end; + int entries = 0; int nr; int ret; + int complete = 0; if (!(file->f_mode & FMODE_READ)) { ret = -EBADF; @@ -1221,47 +1225,83 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg) goto out; } + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + ino_end = page_address(page) + PAGE_SIZE; + scoutfs_inode_init_key(&key, gai.start_ino); scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK); uinos = (void __user *)gai.inos_ptr; bytes = gai.inos_bytes; nr = 0; - ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock); - if (ret < 0) - goto out; + for (;;) { + + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock); + if
(ret < 0) + goto out; - while (bytes >= sizeof(*uinos)) { + ino = page_address(page); + while (ino < ino_end) { - ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock); - if (ret < 0) { - if (ret == -ENOENT) + ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock); + if (ret < 0) { + if (ret == -ENOENT) { + ret = 0; + complete = 1; + } + break; + } + + if (key.sk_zone != SCOUTFS_FS_ZONE) { ret = 0; - break; + complete = 1; + break; + } + + /* all fs items are owned by allocated inodes, and _first is always ino */ + *ino = le64_to_cpu(key._sk_first); + scoutfs_inode_init_key(&key, *ino + 1); + + ino++; + entries++; + nr++; + + bytes -= sizeof(*uinos); + if (bytes < sizeof(*uinos)) { + complete = 1; + break; + } + + if (nr == INT_MAX) { + complete = 1; + break; + } } - if (key.sk_zone != SCOUTFS_FS_ZONE) { - ret = 0; + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + + if (ret < 0) break; - } - /* all fs items are owned by allocated inodes, and _first is always ino */ - ino = le64_to_cpu(key._sk_first); - if (put_user(ino, uinos)) { + ino = page_address(page); + if (copy_to_user(uinos, ino, entries * sizeof(*uinos))) { ret = -EFAULT; - break; + goto out; } - uinos++; - bytes -= sizeof(*uinos); - if (++nr == INT_MAX) - break; + uinos += entries; + entries = 0; - scoutfs_inode_init_key(&key, ino + 1); + if (complete) + break; } - - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); out: + if (page) + __free_page(page); return ret ?: nr; } From 6c85879489f1c590b816645fbf8fee1c39e33472 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 9 Sep 2024 19:50:20 -0400 Subject: [PATCH 13/14] Assert unlock doesn't underflow lock user count. While debugging a double unlock error we hit this condition and debugging would have been a lot easier had we enforced this simple constraint that we can't decrement the lock users count if it's already 0. 
Signed-off-by: Auke Kok --- kmod/src/lock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 5f280054..ce77bb51 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -302,6 +302,7 @@ static void lock_inc_count(unsigned int *counts, enum scoutfs_lock_mode mode) static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode) { BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES); + BUG_ON(counts[mode] == 0); counts[mode]--; } From e9d147260c2796d21b18b111f1e3b62d01ddda21 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Mon, 27 Jan 2025 14:46:29 -0500 Subject: [PATCH 14/14] Fix ctx->pos updating to properly handle dent gaps We need to ensure we're emitting dents with the proper position, and we already have that position stored as part of each dent. The only caveat is to increment ctx->pos once beyond the list to make sure the caller doesn't call us once more. Signed-off-by: Auke Kok --- kmod/src/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index fb468df1..95ba9db0 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -537,15 +537,18 @@ static int scoutfs_readdir(struct file *file, struct dir_context *ctx) dent = page_address(page); for (; entries > 0; entries--) { + ctx->pos = le64_to_cpu(dent->pos); if (!dir_emit(ctx, dent->name, dent->hacky_name_len, le64_to_cpu(dent->ino), dentry_type(dent->type))) { ret = 0; goto out; } - ctx->pos = le64_to_cpu(dent->pos) + 1; dent = next_aligned_dirent(dent, dent->hacky_name_len); + + /* always advance ctx->pos past */ + ctx->pos++; } if (complete)