From 14f8ac16e5e3e84c6b3f102cfba01a7ad459ed74 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 08:31:42 +0100 Subject: [PATCH 01/12] break out options to a separate file --- Makefile.am | 4 +- Options.cc | 229 +++++++++++++++++++++++++++++ Options.hh | 41 ++++++ inofficial_cmake/CMakeLists.txt | 2 + rdfind.cc | 252 +------------------------------- 5 files changed, 275 insertions(+), 253 deletions(-) create mode 100644 Options.cc create mode 100644 Options.hh diff --git a/Makefile.am b/Makefile.am index 3bcea57..ea0ac3c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,7 +4,7 @@ AUTOMAKE_OPTIONS = gnu # I would like dist-bzip2 here, but automake complains bin_PROGRAMS = rdfind rdfind_SOURCES = rdfind.cc Checksum.cc Dirlist.cc Fileinfo.cc Rdutil.cc \ - EasyRandom.cc UndoableUnlink.cc CmdlineParser.cc + EasyRandom.cc UndoableUnlink.cc CmdlineParser.cc Options.cc LDADD = @LIBXXHASH@ #these are the test scripts to execute - I do not know how to glob here, @@ -41,7 +41,7 @@ AUXFILES=testcases/common_funcs.sh \ EXTRA_DIST = \ Dirlist.hh Checksum.hh Fileinfo.hh \ Rdutil.hh bootstrap.sh RdfindDebug.hh EasyRandom.hh UndoableUnlink.hh \ - CmdlineParser.hh \ + CmdlineParser.hh Options.hh \ $(TESTS) \ $(AUXFILES) \ rdfind.1 LICENSE \ diff --git a/Options.cc b/Options.cc new file mode 100644 index 0000000..f1ae69f --- /dev/null +++ b/Options.cc @@ -0,0 +1,229 @@ +#include "config.h" //header file from autoconf, must come first to make large file support work properly + +#include +#include + +#include "CmdlineParser.hh" +#include "Options.hh" + +void +usage() +{ + const auto indent = " "; + std::cout + << "Usage: rdfind [options] FILE ...\n" + << '\n' + << "Finds duplicate files recursively in the given FILEs (directories),\n" + << "and takes appropriate action (by default, nothing).\n" + << "Directories listed first are ranked higher, meaning that if a\n" + << "file is found on several places, the file found in the directory " + "first\n" + << "encountered on the 
command line is kept, and the others are " + "considered duplicate.\n" + << '\n' + << "options are (default choice within parentheses)\n" + << '\n' + << " -ignoreempty (true)| false ignore empty files (true implies " + "-minsize 1,\n" + << " false implies -minsize 0)\n" + << " -minsize N (N=1) ignores files with size less than N " + "bytes\n" + << " -maxsize N (N=0) ignores files with size N " + "bytes and larger (use 0 to disable this check).\n" + << " -followsymlinks true |(false) follow symlinks\n" + << " -removeidentinode (true)| false ignore files with nonunique " + "device and inode\n" + << " -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128\n" + << indent << "checksum type\n" + << indent << "xxh128 is very fast, but is noncryptographic.\n" + << " -buffersize N\n" + << indent << "chunksize in bytes when calculating the checksum.\n" + << indent << "The default is 1 MiB, can be up to 128 MiB.\n" + << " -deterministic (true)| false makes results independent of order\n" + << " from listing the filesystem\n" + << " -makesymlinks true |(false) replace duplicate files with " + "symbolic links\n" + << " -makehardlinks true |(false) replace duplicate files with " + "hard links\n" + << " -makeresultsfile (true)| false makes a results file\n" + << " -outputname name sets the results file name to \"name\" " + "(default results.txt)\n" + << " -progress true |(false) output progress information" + << " -deleteduplicates true |(false) delete duplicate files\n" + << " -sleep Xms sleep for X milliseconds between " + "file reads.\n" + << " Default is 0. Only a few values\n" + << " are supported; 0,1-5,10,25,50,100\n" + << " -dryrun|-n true |(false) print to stdout instead of " + "changing anything\n" + << " -h|-help|--help show this help and exit\n" + << " -v|--version display version number and exit\n" + << '\n' + << "If properly installed, a man page should be available as man rdfind.\n" + << '\n' + << "rdfind is written by Paul Dreik 2006 onwards. 
License: GPL v2 or " + "later (at your option).\n" + << "version is " << VERSION << '\n'; +} + +Options +parseOptions(Parser& parser) +{ + Options o; + for (; parser.has_args_left(); parser.advance()) { + // empty strings are forbidden as input since they can not be file names or + // options + if (parser.get_current_arg()[0] == '\0') { + std::cerr << "bad argument " << parser.get_current_index() << '\n'; + std::exit(EXIT_FAILURE); + } + + // if we reach the end of the argument list - exit the loop and proceed with + // the file list instead. + if (parser.get_current_arg()[0] != '-') { + // end of argument list - exit! + break; + } + if (parser.try_parse_bool("-makesymlinks")) { + o.makesymlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-makehardlinks")) { + o.makehardlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-makeresultsfile")) { + o.makeresultsfile = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-outputname")) { + o.resultsfile = parser.get_parsed_string(); + } else if (parser.try_parse_bool("-ignoreempty")) { + if (parser.get_parsed_bool()) { + o.minimumfilesize = 1; + } else { + o.minimumfilesize = 0; + } + } else if (parser.try_parse_string("-minsize")) { + const long long minsize = std::stoll(parser.get_parsed_string()); + if (minsize < 0) { + throw std::runtime_error("negative value of minsize not allowed"); + } + o.minimumfilesize = minsize; + } else if (parser.try_parse_string("-maxsize")) { + const long long maxsize = std::stoll(parser.get_parsed_string()); + if (maxsize < 0) { + throw std::runtime_error("negative value of maxsize not allowed"); + } + o.maximumfilesize = maxsize; + } else if (parser.try_parse_bool("-deleteduplicates")) { + o.deleteduplicates = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-followsymlinks")) { + o.followsymlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-dryrun")) { + o.dryrun = parser.get_parsed_bool(); + } else if 
(parser.try_parse_bool("-n")) { + o.dryrun = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-removeidentinode")) { + o.remove_identical_inode = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-deterministic")) { + o.deterministic = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-checksum")) { + if (parser.parsed_string_is("md5")) { + o.usemd5 = true; + } else if (parser.parsed_string_is("sha1")) { + o.usesha1 = true; + } else if (parser.parsed_string_is("sha256")) { + o.usesha256 = true; + } else if (parser.parsed_string_is("sha512")) { + o.usesha512 = true; + } else if (parser.parsed_string_is("xxh128")) { +#ifdef HAVE_LIBXXHASH + o.usexxh128 = true; +#else + std::cerr << "not compiled with xxhash, to make use of xxh128 please " + "reconfigure and rebuild '--with-xxhash'\n"; + std::exit(EXIT_FAILURE); +#endif + } else if (parser.parsed_string_is("none")) { + std::cout + << "DANGER! -checksum none given, will skip the checksumming stage\n"; + o.nochecksum = true; + } else { + std::cerr << "expected none/md5/sha1/sha256/sha512/xxh128, not \"" + << parser.get_parsed_string() << "\"\n"; + std::exit(EXIT_FAILURE); + } + } else if (parser.try_parse_string("-buffersize")) { + const long buffersize = std::stoll(parser.get_parsed_string()); + constexpr long max_buffersize = 128 << 20; + if (buffersize <= 0) { + std::cerr << "a negative or zero buffersize is not allowed\n"; + std::exit(EXIT_FAILURE); + } else if (buffersize > max_buffersize) { + std::cerr << "a maximum of " << (max_buffersize >> 20) + << " MiB buffersize is allowed, got " << (buffersize >> 20) + << " MiB\n"; + std::exit(EXIT_FAILURE); + } + o.buffersize = static_cast(buffersize); + } else if (parser.try_parse_string("-sleep")) { + const auto nextarg = std::string(parser.get_parsed_string()); + if (nextarg == "1ms") { + o.nsecsleep = 1000000; + } else if (nextarg == "2ms") { + o.nsecsleep = 2000000; + } else if (nextarg == "3ms") { + o.nsecsleep = 3000000; + } 
else if (nextarg == "4ms") { + o.nsecsleep = 4000000; + } else if (nextarg == "5ms") { + o.nsecsleep = 5000000; + } else if (nextarg == "10ms") { + o.nsecsleep = 10000000; + } else if (nextarg == "25ms") { + o.nsecsleep = 25000000; + } else if (nextarg == "50ms") { + o.nsecsleep = 50000000; + } else if (nextarg == "100ms") { + o.nsecsleep = 100000000; + } else { + std::cerr << "sorry, can only understand a few sleep values for " + "now. \"" + << nextarg << "\" is not among them.\n"; + std::exit(EXIT_FAILURE); + } + } else if (parser.try_parse_bool("-progress")) { + o.showprogress = parser.get_parsed_bool(); + } else if (parser.current_arg_is("-help") || parser.current_arg_is("-h") || + parser.current_arg_is("--help")) { + usage(); + std::exit(EXIT_SUCCESS); + } else if (parser.current_arg_is("-version") || + parser.current_arg_is("--version") || + parser.current_arg_is("-v")) { + std::cout << "This is rdfind version " << VERSION << '\n'; + std::exit(EXIT_SUCCESS); + } else { + std::cerr << "did not understand option " << parser.get_current_index() + << ":\"" << parser.get_current_arg() << "\"\n"; + std::exit(EXIT_FAILURE); + } + } + + // fix default values + if (o.maximumfilesize == 0) { + o.maximumfilesize = std::numeric_limits::max(); + } + + // verify conflicting arguments + if (!(o.minimumfilesize < o.maximumfilesize)) { + std::cerr << "maximum filesize " << o.maximumfilesize + << " must be larger than minimum filesize " << o.minimumfilesize + << "\n"; + std::exit(EXIT_FAILURE); + } + + // done with parsing of options. remaining arguments are files and dirs. 
+ + // decide what checksum to use, default to sha1 + if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128 && + !o.nochecksum) { + o.usesha1 = true; + } + return o; +} diff --git a/Options.hh b/Options.hh new file mode 100644 index 0000000..c1d6b4f --- /dev/null +++ b/Options.hh @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#include "Fileinfo.hh" + +class Parser; + +struct Options +{ + // operation mode and default values + bool makesymlinks = false; // turn duplicates into symbolic links + bool makehardlinks = false; // turn duplicates into hard links + bool makeresultsfile = true; // write a results file + Fileinfo::filesizetype minimumfilesize = + 1; // minimum file size to be noticed (0 - include empty files) + Fileinfo::filesizetype maximumfilesize = + 0; // if nonzero, files this size or larger are ignored + bool deleteduplicates = false; // delete duplicate files + bool followsymlinks = false; // follow symlinks + bool dryrun = false; // only dryrun, don't destroy anything + bool remove_identical_inode = true; // remove files with identical inodes + bool usemd5 = false; // use md5 checksum to check for similarity + bool usesha1 = false; // use sha1 checksum to check for similarity + bool usesha256 = false; // use sha256 checksum to check for similarity + bool usesha512 = false; // use sha512 checksum to check for similarity + bool usexxh128 = false; // use xxh128 checksum to check for similarity + bool nochecksum = false; // skip using checksumming (unsafe!) + bool deterministic = true; // be independent of filesystem order + bool showprogress = false; // show progress while reading file contents + std::size_t buffersize = 1 << 20; // chunksize to use when reading files + long nsecsleep = 0; // number of nanoseconds to sleep between each file read. + std::string resultsfile = "results.txt"; // results file name. 
+}; + +void +usage(); + +Options +parseOptions(Parser& parser); diff --git a/inofficial_cmake/CMakeLists.txt b/inofficial_cmake/CMakeLists.txt index 8fc7906..b6eaf12 100644 --- a/inofficial_cmake/CMakeLists.txt +++ b/inofficial_cmake/CMakeLists.txt @@ -32,6 +32,8 @@ add_library( ../EasyRandom.hh ../Fileinfo.cc ../Fileinfo.hh + ../Options.cc + ../Options.hh ../RdfindDebug.hh ../Rdutil.cc ../Rdutil.hh diff --git a/rdfind.cc b/rdfind.cc index 1d5f23d..343392b 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -11,7 +11,6 @@ static_assert(__cplusplus >= 201703L, // std #include -#include #include #include #include @@ -20,6 +19,7 @@ static_assert(__cplusplus >= 201703L, #include "CmdlineParser.hh" #include "Dirlist.hh" //to find files #include "Fileinfo.hh" //file container +#include "Options.hh" // #include "RdfindDebug.hh" //debug macro #include "Rdutil.hh" //to do some work @@ -27,7 +27,6 @@ static_assert(__cplusplus >= 201703L, // this vector holds the information about all files found std::vector filelist; -struct Options; const Options* global_options{}; /** @@ -37,255 +36,6 @@ const Options* global_options{}; */ int current_cmdline_index = 0; -static void -usage() -{ - const auto indent = " "; - std::cout - << "Usage: rdfind [options] FILE ...\n" - << '\n' - << "Finds duplicate files recursively in the given FILEs (directories),\n" - << "and takes appropriate action (by default, nothing).\n" - << "Directories listed first are ranked higher, meaning that if a\n" - << "file is found on several places, the file found in the directory " - "first\n" - << "encountered on the command line is kept, and the others are " - "considered duplicate.\n" - << '\n' - << "options are (default choice within parentheses)\n" - << '\n' - << " -ignoreempty (true)| false ignore empty files (true implies " - "-minsize 1,\n" - << " false implies -minsize 0)\n" - << " -minsize N (N=1) ignores files with size less than N " - "bytes\n" - << " -maxsize N (N=0) ignores files with size N " - "bytes and 
larger (use 0 to disable this check).\n" - << " -followsymlinks true |(false) follow symlinks\n" - << " -removeidentinode (true)| false ignore files with nonunique " - "device and inode\n" - << " -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128\n" - << indent << "checksum type\n" - << indent << "xxh128 is very fast, but is noncryptographic.\n" - << " -buffersize N\n" - << indent << "chunksize in bytes when calculating the checksum.\n" - << indent << "The default is 1 MiB, can be up to 128 MiB.\n" - << " -deterministic (true)| false makes results independent of order\n" - << " from listing the filesystem\n" - << " -makesymlinks true |(false) replace duplicate files with " - "symbolic links\n" - << " -makehardlinks true |(false) replace duplicate files with " - "hard links\n" - << " -makeresultsfile (true)| false makes a results file\n" - << " -outputname name sets the results file name to \"name\" " - "(default results.txt)\n" - << " -progress true |(false) output progress information" - << " -deleteduplicates true |(false) delete duplicate files\n" - << " -sleep Xms sleep for X milliseconds between " - "file reads.\n" - << " Default is 0. Only a few values\n" - << " are supported; 0,1-5,10,25,50,100\n" - << " -dryrun|-n true |(false) print to stdout instead of " - "changing anything\n" - << " -h|-help|--help show this help and exit\n" - << " -v|--version display version number and exit\n" - << '\n' - << "If properly installed, a man page should be available as man rdfind.\n" - << '\n' - << "rdfind is written by Paul Dreik 2006 onwards. 
License: GPL v2 or " - "later (at your option).\n" - << "version is " << VERSION << '\n'; -} - -struct Options -{ - // operation mode and default values - bool makesymlinks = false; // turn duplicates into symbolic links - bool makehardlinks = false; // turn duplicates into hard links - bool makeresultsfile = true; // write a results file - Fileinfo::filesizetype minimumfilesize = - 1; // minimum file size to be noticed (0 - include empty files) - Fileinfo::filesizetype maximumfilesize = - 0; // if nonzero, files this size or larger are ignored - bool deleteduplicates = false; // delete duplicate files - bool followsymlinks = false; // follow symlinks - bool dryrun = false; // only dryrun, don't destroy anything - bool remove_identical_inode = true; // remove files with identical inodes - bool usemd5 = false; // use md5 checksum to check for similarity - bool usesha1 = false; // use sha1 checksum to check for similarity - bool usesha256 = false; // use sha256 checksum to check for similarity - bool usesha512 = false; // use sha512 checksum to check for similarity - bool usexxh128 = false; // use xxh128 checksum to check for similarity - bool nochecksum = false; // skip using checksumming (unsafe!) - bool deterministic = true; // be independent of filesystem order - bool showprogress = false; // show progress while reading file contents - std::size_t buffersize = 1 << 20; // chunksize to use when reading files - long nsecsleep = 0; // number of nanoseconds to sleep between each file read. - std::string resultsfile = "results.txt"; // results file name. 
-}; - -Options -parseOptions(Parser& parser) -{ - Options o; - for (; parser.has_args_left(); parser.advance()) { - // empty strings are forbidden as input since they can not be file names or - // options - if (parser.get_current_arg()[0] == '\0') { - std::cerr << "bad argument " << parser.get_current_index() << '\n'; - std::exit(EXIT_FAILURE); - } - - // if we reach the end of the argument list - exit the loop and proceed with - // the file list instead. - if (parser.get_current_arg()[0] != '-') { - // end of argument list - exit! - break; - } - if (parser.try_parse_bool("-makesymlinks")) { - o.makesymlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-makehardlinks")) { - o.makehardlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-makeresultsfile")) { - o.makeresultsfile = parser.get_parsed_bool(); - } else if (parser.try_parse_string("-outputname")) { - o.resultsfile = parser.get_parsed_string(); - } else if (parser.try_parse_bool("-ignoreempty")) { - if (parser.get_parsed_bool()) { - o.minimumfilesize = 1; - } else { - o.minimumfilesize = 0; - } - } else if (parser.try_parse_string("-minsize")) { - const long long minsize = std::stoll(parser.get_parsed_string()); - if (minsize < 0) { - throw std::runtime_error("negative value of minsize not allowed"); - } - o.minimumfilesize = minsize; - } else if (parser.try_parse_string("-maxsize")) { - const long long maxsize = std::stoll(parser.get_parsed_string()); - if (maxsize < 0) { - throw std::runtime_error("negative value of maxsize not allowed"); - } - o.maximumfilesize = maxsize; - } else if (parser.try_parse_bool("-deleteduplicates")) { - o.deleteduplicates = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-followsymlinks")) { - o.followsymlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-dryrun")) { - o.dryrun = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-n")) { - o.dryrun = parser.get_parsed_bool(); - } else if 
(parser.try_parse_bool("-removeidentinode")) { - o.remove_identical_inode = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-deterministic")) { - o.deterministic = parser.get_parsed_bool(); - } else if (parser.try_parse_string("-checksum")) { - if (parser.parsed_string_is("md5")) { - o.usemd5 = true; - } else if (parser.parsed_string_is("sha1")) { - o.usesha1 = true; - } else if (parser.parsed_string_is("sha256")) { - o.usesha256 = true; - } else if (parser.parsed_string_is("sha512")) { - o.usesha512 = true; - } else if (parser.parsed_string_is("xxh128")) { -#ifdef HAVE_LIBXXHASH - o.usexxh128 = true; -#else - std::cerr << "not compiled with xxhash, to make use of xxh128 please " - "reconfigure and rebuild '--with-xxhash'\n"; - std::exit(EXIT_FAILURE); -#endif - } else if (parser.parsed_string_is("none")) { - std::cout - << "DANGER! -checksum none given, will skip the checksumming stage\n"; - o.nochecksum = true; - } else { - std::cerr << "expected none/md5/sha1/sha256/sha512/xxh128, not \"" - << parser.get_parsed_string() << "\"\n"; - std::exit(EXIT_FAILURE); - } - } else if (parser.try_parse_string("-buffersize")) { - const long buffersize = std::stoll(parser.get_parsed_string()); - constexpr long max_buffersize = 128 << 20; - if (buffersize <= 0) { - std::cerr << "a negative or zero buffersize is not allowed\n"; - std::exit(EXIT_FAILURE); - } else if (buffersize > max_buffersize) { - std::cerr << "a maximum of " << (max_buffersize >> 20) - << " MiB buffersize is allowed, got " << (buffersize >> 20) - << " MiB\n"; - std::exit(EXIT_FAILURE); - } - o.buffersize = static_cast(buffersize); - } else if (parser.try_parse_string("-sleep")) { - const auto nextarg = std::string(parser.get_parsed_string()); - if (nextarg == "1ms") { - o.nsecsleep = 1000000; - } else if (nextarg == "2ms") { - o.nsecsleep = 2000000; - } else if (nextarg == "3ms") { - o.nsecsleep = 3000000; - } else if (nextarg == "4ms") { - o.nsecsleep = 4000000; - } else if (nextarg == "5ms") 
{ - o.nsecsleep = 5000000; - } else if (nextarg == "10ms") { - o.nsecsleep = 10000000; - } else if (nextarg == "25ms") { - o.nsecsleep = 25000000; - } else if (nextarg == "50ms") { - o.nsecsleep = 50000000; - } else if (nextarg == "100ms") { - o.nsecsleep = 100000000; - } else { - std::cerr << "sorry, can only understand a few sleep values for " - "now. \"" - << nextarg << "\" is not among them.\n"; - std::exit(EXIT_FAILURE); - } - } else if (parser.try_parse_bool("-progress")) { - o.showprogress = parser.get_parsed_bool(); - } else if (parser.current_arg_is("-help") || parser.current_arg_is("-h") || - parser.current_arg_is("--help")) { - usage(); - std::exit(EXIT_SUCCESS); - } else if (parser.current_arg_is("-version") || - parser.current_arg_is("--version") || - parser.current_arg_is("-v")) { - std::cout << "This is rdfind version " << VERSION << '\n'; - std::exit(EXIT_SUCCESS); - } else { - std::cerr << "did not understand option " << parser.get_current_index() - << ":\"" << parser.get_current_arg() << "\"\n"; - std::exit(EXIT_FAILURE); - } - } - - // fix default values - if (o.maximumfilesize == 0) { - o.maximumfilesize = std::numeric_limits::max(); - } - - // verify conflicting arguments - if (!(o.minimumfilesize < o.maximumfilesize)) { - std::cerr << "maximum filesize " << o.maximumfilesize - << " must be larger than minimum filesize " << o.minimumfilesize - << "\n"; - std::exit(EXIT_FAILURE); - } - - // done with parsing of options. remaining arguments are files and dirs. 
- - // decide what checksum to use, default to sha1 - if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128 && - !o.nochecksum) { - o.usesha1 = true; - } - return o; -} - // function to add items to the list of all files static int report(const std::string& path, const std::string& name, int depth) From 9137d117e2b680d0c23abb0703425e7972b79740 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 09:12:06 +0100 Subject: [PATCH 02/12] pass options to Fileinfo::fillwithbytes --- Fileinfo.cc | 4 +++- Fileinfo.hh | 5 ++++- Rdutil.cc | 12 ++++++------ Rdutil.hh | 5 +++-- rdfind.cc | 3 +-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/Fileinfo.cc b/Fileinfo.cc index c8ee097..933cd33 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -20,12 +20,14 @@ // project #include "Checksum.hh" //checksum calculation #include "Fileinfo.hh" +#include "Options.hh" #include "UndoableUnlink.hh" int Fileinfo::fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer) + std::vector& buffer, + const Options& options) { // Decide if we are going to read from file or not. diff --git a/Fileinfo.hh b/Fileinfo.hh index 0ece9f1..4167719 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -15,6 +15,8 @@ // os specific headers #include //for off_t and others. +struct Options; + /** Holds information about a file. 
Keeping this small is probably beneficial for performance, because the @@ -143,7 +145,8 @@ public: */ int fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer); + std::vector& buffer, + const Options& options); /// get a pointer to the bytes read from the file const char* getbyteptr() const { return m_somebytes.data(); } diff --git a/Rdutil.cc b/Rdutil.cc index 6a4902c..2ff65d2 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -20,6 +20,7 @@ // project #include "Fileinfo.hh" //file container +#include "Options.hh" #include "RdfindDebug.hh" // class declaration @@ -542,16 +543,15 @@ Rdutil::saveablespace(std::ostream& out) const int Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, - const long nsecsleep, - const std::size_t buffersize, + const Options& options, std::function progress_cb) { // first sort on inode (to read efficiently from the hard drive) sortOnDeviceAndInode(); - const auto duration = std::chrono::nanoseconds{ nsecsleep }; + const auto duration = std::chrono::nanoseconds{ options.nsecsleep }; - std::vector buffer(buffersize, '\0'); + std::vector buffer(options.buffersize, '\0'); std::size_t progress_count = 0; for (auto& elem : m_list) { @@ -559,8 +559,8 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, ++progress_count; progress_cb(progress_count); } - elem.fillwithbytes(type, lasttype, buffer); - if (nsecsleep > 0) { + elem.fillwithbytes(type, lasttype, buffer, options); + if (options.nsecsleep > 0) { std::this_thread::sleep_for(duration); } } diff --git a/Rdutil.hh b/Rdutil.hh index 47f6303..2bd629f 100644 --- a/Rdutil.hh +++ b/Rdutil.hh @@ -14,6 +14,8 @@ #include "Fileinfo.hh" //file container +struct Options; + class Rdutil { public: @@ -90,8 +92,7 @@ public: // nanoseconds can be made between each file. 
int fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, - long nsecsleep, - std::size_t buffersize, + const Options& options, std::function progress_cb); /// make symlinks of duplicates. diff --git a/rdfind.cc b/rdfind.cc index 343392b..271c464 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -191,8 +191,7 @@ main(int narg, const char* argv[]) } // read bytes (destroys the sorting, for disk reading efficiency) - gswd.fillwithbytes( - it[0].first, it[-1].first, o.nsecsleep, o.buffersize, progress_callback); + gswd.fillwithbytes(it[0].first, it[-1].first, o, progress_callback); // remove non-duplicates std::cout << "removed " << gswd.removeUniqSizeAndBuffer() From 9901aa88ba352a11c2f88df57274200f72dbd659 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 09:24:26 +0100 Subject: [PATCH 03/12] add options for first and last byte size --- Options.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Options.hh b/Options.hh index c1d6b4f..392c640 100644 --- a/Options.hh +++ b/Options.hh @@ -32,6 +32,10 @@ struct Options std::size_t buffersize = 1 << 20; // chunksize to use when reading files long nsecsleep = 0; // number of nanoseconds to sleep between each file read. std::string resultsfile = "results.txt"; // results file name. 
+ std::uint64_t first_bytes_size = + 64; // how much to read during the "read first bytes" step + std::uint64_t last_bytes_size = + 64; // how much to read during the "read last bytes" step }; void From 21268bb1bab950036f362c2fc138852a5c003f1c Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 16:51:25 +0100 Subject: [PATCH 04/12] make checksum resettable --- Checksum.cc | 56 ++++++++++++++++++++++---------------- Checksum.hh | 3 ++ unittests/test_checksum.cc | 22 +++++++++++++++ 3 files changed, 58 insertions(+), 23 deletions(-) diff --git a/Checksum.cc b/Checksum.cc index ece1268..22bd270 100644 --- a/Checksum.cc +++ b/Checksum.cc @@ -20,31 +20,13 @@ Checksum::Checksum(checksumtypes type) : m_checksumtype(type) { - switch (m_checksumtype) { - case checksumtypes::SHA1: { - sha1_init(&m_state.sha1); - } break; - case checksumtypes::SHA256: { - sha256_init(&m_state.sha256); - } break; - case checksumtypes::SHA512: { - sha512_init(&m_state.sha512); - } break; - case checksumtypes::MD5: { - md5_init(&m_state.md5); - } break; #ifdef HAVE_LIBXXHASH - case checksumtypes::XXH128: { - m_state.xxh128 = XXH3_createState(); - assert(m_state.xxh128 != NULL && "Out of memory!"); - [[maybe_unused]] const auto ret = XXH3_128bits_reset(m_state.xxh128); - assert(ret == XXH_OK); - } break; -#endif - default: - // not allowed to have something that is not recognized. 
- throw std::runtime_error("wrong checksum type - programming error"); + if (m_checksumtype == checksumtypes::XXH128) { + m_state.xxh128 = XXH3_createState(); + assert(m_state.xxh128 != NULL && "Out of memory!"); } +#endif + reset(); } Checksum::Checksum(Checksum&& other) @@ -120,6 +102,34 @@ Checksum::update(std::size_t length, const char* buffer) static_cast(static_cast(buffer))); } +void +Checksum::reset() +{ + switch (m_checksumtype) { + case checksumtypes::SHA1: { + sha1_init(&m_state.sha1); + } break; + case checksumtypes::SHA256: { + sha256_init(&m_state.sha256); + } break; + case checksumtypes::SHA512: { + sha512_init(&m_state.sha512); + } break; + case checksumtypes::MD5: { + md5_init(&m_state.md5); + } break; +#ifdef HAVE_LIBXXHASH + case checksumtypes::XXH128: { + [[maybe_unused]] const auto ret = XXH3_128bits_reset(m_state.xxh128); + assert(ret == XXH_OK); + } break; +#endif + default: + // not allowed to have something that is not recognized. + throw std::runtime_error("wrong checksum type - programming error"); + } +} + #if 0 // prints checksum to stdout static void diff --git a/Checksum.hh b/Checksum.hh index 83e1f85..b154f0c 100644 --- a/Checksum.hh +++ b/Checksum.hh @@ -45,6 +45,9 @@ public: int update(std::size_t length, const unsigned char* buffer); int update(std::size_t length, const char* buffer); + /// makes the object behave as if it was newly constructed + void reset(); + #if 0 /// prints the checksum on stdout int print(); diff --git a/unittests/test_checksum.cc b/unittests/test_checksum.cc index 253aeb0..0c69571 100644 --- a/unittests/test_checksum.cc +++ b/unittests/test_checksum.cc @@ -94,3 +94,25 @@ TEST_CASE("copy from an rval is fine") REQUIRE(expected == finalize_checksum(movedto)); } } + +TEST_CASE("resetting the state works as intended") +{ + static const char* content = "abcd"; + for (auto type : types) { + + Checksum ck1(type); + const auto v1 = finalize_checksum(ck1); + + Checksum ck2(type); + REQUIRE(0 == 
ck2.update(std::strlen(content), content)); + const auto v2 = finalize_checksum(ck2); + REQUIRE(v1 != v2); + + // resetting should get the same checksum as an empty object. + Checksum ck3(type); + REQUIRE(0 == ck3.update(std::strlen(content), content)); + ck3.reset(); + const auto v3 = finalize_checksum(ck3); + REQUIRE(v1 == v3); + } +} From 9e68c6441fcbec1ec570c15c62077c73b9f34883 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 18:02:06 +0100 Subject: [PATCH 05/12] break out checksumtypes into it's own file --- Checksum.hh | 13 ++----------- ChecksumTypes.hh | 12 ++++++++++++ Fileinfo.cc | 14 +++++++------- Makefile.am | 2 +- inofficial_cmake/CMakeLists.txt | 1 + unittests/test_checksum.cc | 2 +- 6 files changed, 24 insertions(+), 20 deletions(-) create mode 100644 ChecksumTypes.hh diff --git a/Checksum.hh b/Checksum.hh index b154f0c..484ef32 100644 --- a/Checksum.hh +++ b/Checksum.hh @@ -20,23 +20,14 @@ #include #endif +#include "ChecksumTypes.hh" + /** * class for checksum calculation */ class Checksum { public: - // these are the checksums that can be calculated - enum class checksumtypes - { - NOTSET = 0, - MD5, - SHA1, - SHA256, - SHA512, - XXH128 - }; - explicit Checksum(checksumtypes type); Checksum(const Checksum& other); Checksum(Checksum&& other); diff --git a/ChecksumTypes.hh b/ChecksumTypes.hh new file mode 100644 index 0000000..87f20e1 --- /dev/null +++ b/ChecksumTypes.hh @@ -0,0 +1,12 @@ +#pragma once + +/// these are the checksums that can be calculated. 
see class Checksum +enum class checksumtypes +{ + NOTSET = 0, + MD5, + SHA1, + SHA256, + SHA512, + XXH128 +}; diff --git a/Fileinfo.cc b/Fileinfo.cc index 933cd33..f70dacc 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -51,7 +51,7 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, return -1; } - auto checksumtype = Checksum::checksumtypes::NOTSET; + auto checksumtype = checksumtypes::NOTSET; // read some bytes switch (filltype) { case readtobuffermode::READ_FIRST_BYTES: @@ -64,26 +64,26 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, f1.read(m_somebytes.data(), SomeByteSize); break; case readtobuffermode::CREATE_MD5_CHECKSUM: - checksumtype = Checksum::checksumtypes::MD5; + checksumtype = checksumtypes::MD5; break; case readtobuffermode::CREATE_SHA1_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA1; + checksumtype = checksumtypes::SHA1; break; case readtobuffermode::CREATE_SHA256_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA256; + checksumtype = checksumtypes::SHA256; break; case readtobuffermode::CREATE_SHA512_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA512; + checksumtype = checksumtypes::SHA512; break; case readtobuffermode::CREATE_XXH128_CHECKSUM: - checksumtype = Checksum::checksumtypes::XXH128; + checksumtype = checksumtypes::XXH128; break; default: std::cerr << "does not know how to do that filltype:" << static_cast(filltype) << std::endl; } - if (checksumtype != Checksum::checksumtypes::NOTSET) { + if (checksumtype != checksumtypes::NOTSET) { Checksum chk(checksumtype); while (f1) { diff --git a/Makefile.am b/Makefile.am index ea0ac3c..b298979 100644 --- a/Makefile.am +++ b/Makefile.am @@ -41,7 +41,7 @@ AUXFILES=testcases/common_funcs.sh \ EXTRA_DIST = \ Dirlist.hh Checksum.hh Fileinfo.hh \ Rdutil.hh bootstrap.sh RdfindDebug.hh EasyRandom.hh UndoableUnlink.hh \ - CmdlineParser.hh Options.hh \ + CmdlineParser.hh Options.hh ChecksumTypes.hh \ $(TESTS) \ $(AUXFILES) \ rdfind.1 LICENSE \ diff --git 
a/inofficial_cmake/CMakeLists.txt b/inofficial_cmake/CMakeLists.txt index b6eaf12..8d8b8b4 100644 --- a/inofficial_cmake/CMakeLists.txt +++ b/inofficial_cmake/CMakeLists.txt @@ -24,6 +24,7 @@ add_library( rdfindimpl OBJECT ../Checksum.cc ../Checksum.hh + ../ChecksumTypes.hh ../CmdlineParser.cc ../CmdlineParser.hh ../Dirlist.cc diff --git a/unittests/test_checksum.cc b/unittests/test_checksum.cc index 0c69571..920d586 100644 --- a/unittests/test_checksum.cc +++ b/unittests/test_checksum.cc @@ -4,7 +4,7 @@ #include namespace { -using enum Checksum::checksumtypes; +using enum checksumtypes; const auto types = { MD5, SHA1, SHA256, From ef36fe02b3aa7a3098ebf2e260f2c44e3abffbc8 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 18:23:33 +0100 Subject: [PATCH 06/12] add checksum algorithm for first and last bytes to options --- Options.hh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Options.hh b/Options.hh index 392c640..c8a243a 100644 --- a/Options.hh +++ b/Options.hh @@ -1,8 +1,11 @@ #pragma once +#include "config.h" + #include #include +#include "ChecksumTypes.hh" #include "Fileinfo.hh" class Parser; @@ -36,6 +39,14 @@ struct Options 64; // how much to read during the "read first bytes" step std::uint64_t last_bytes_size = 64; // how much to read during the "read last bytes" step + /// checksum used for first and last bytes + checksumtypes checksum_for_firstlast_bytes = +#ifdef HAVE_LIBXXHASH + checksumtypes::XXH128; +#else + // the fastest one after xxh128 is sha1 + checksumtypes::SHA1; +#endif }; void From 02e1efc0c32edc15d8c835eda418fa5a38d9c0f0 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 18:41:25 +0100 Subject: [PATCH 07/12] add accessor to checksum type --- Checksum.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Checksum.hh b/Checksum.hh index 484ef32..3d529e0 100644 --- a/Checksum.hh +++ b/Checksum.hh @@ -51,6 +51,8 @@ public: // returns negative if something is wrong. 
[[gnu::pure]] int getDigestLength() const; + checksumtypes getType() const noexcept { return m_checksumtype; } + private: // to know what type of checksum we are doing const checksumtypes m_checksumtype = checksumtypes::NOTSET; From 616729464ac7ba9af8fdd5efed9ff9000cd26984 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 19:52:42 +0100 Subject: [PATCH 08/12] replace reading first/last bytes with checksumming them --- Fileinfo.cc | 96 ++++++++++++++++++++++++++--------------------------- Fileinfo.hh | 2 ++ Rdutil.cc | 34 ++++++++++++++++++- 3 files changed, 82 insertions(+), 50 deletions(-) diff --git a/Fileinfo.cc b/Fileinfo.cc index f70dacc..ab239e2 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -27,22 +27,26 @@ int Fileinfo::fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, std::vector& buffer, + Checksum& chk, const Options& options) { - - // Decide if we are going to read from file or not. - // If file is short, first bytes might be ALL bytes! - if (lasttype != readtobuffermode::NOT_DEFINED) { - if (this->size() <= static_cast(m_somebytes.size())) { - // pointless to read - all bytes in the file are in the field - // m_somebytes, or checksum is calculated! + const auto filesize = this->size(); + const auto ufilesize = static_cast(filesize); + // we might already have checksummed the entire file in the previous step, if + // it was smaller than the buffer. + if (chk.getType() == options.checksum_for_firstlast_bytes) { + if (lasttype == readtobuffermode::READ_FIRST_BYTES && + options.first_bytes_size >= ufilesize) { + // already checksummed! + return 0; + } + if (lasttype == readtobuffermode::READ_LAST_BYTES && + options.last_bytes_size >= ufilesize) { + // already checksummed! 
return 0; } } - // set memory to zero - m_somebytes.fill('\0'); - std::fstream f1; f1.open(m_filename, std::ios_base::in); if (!f1.is_open()) { @@ -51,56 +55,50 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, return -1; } - auto checksumtype = checksumtypes::NOTSET; - // read some bytes - switch (filltype) { - case readtobuffermode::READ_FIRST_BYTES: - // read at start of file - f1.read(m_somebytes.data(), SomeByteSize); - break; - case readtobuffermode::READ_LAST_BYTES: - // read at end of file - f1.seekg(-SomeByteSize, std::ios_base::end); - f1.read(m_somebytes.data(), SomeByteSize); - break; - case readtobuffermode::CREATE_MD5_CHECKSUM: - checksumtype = checksumtypes::MD5; - break; - case readtobuffermode::CREATE_SHA1_CHECKSUM: - checksumtype = checksumtypes::SHA1; - break; - case readtobuffermode::CREATE_SHA256_CHECKSUM: - checksumtype = checksumtypes::SHA256; - break; - case readtobuffermode::CREATE_SHA512_CHECKSUM: - checksumtype = checksumtypes::SHA512; - break; - case readtobuffermode::CREATE_XXH128_CHECKSUM: - checksumtype = checksumtypes::XXH128; - break; - default: - std::cerr << "does not know how to do that filltype:" - << static_cast(filltype) << std::endl; + bool read_entire_file = true; + std::streamsize bytes_to_read{}; + if (filltype == readtobuffermode::READ_FIRST_BYTES) { + bytes_to_read = static_cast(options.first_bytes_size); + if (filesize > bytes_to_read) { + read_entire_file = false; + } + } else if (filltype == readtobuffermode::READ_LAST_BYTES) { + bytes_to_read = static_cast(options.last_bytes_size); + if (filesize > bytes_to_read) { + read_entire_file = false; + f1.seekg(-options.last_bytes_size, std::ios_base::end); + } } - if (checksumtype != checksumtypes::NOTSET) { - Checksum chk(checksumtype); + // set memory to zero + m_somebytes.fill('\0'); + + // ensure the checksum object is in a good state + chk.reset(); + if (read_entire_file) { while (f1) { f1.read(buffer.data(), static_cast(buffer.size())); // gcount is never 
negative, the cast is safe. chk.update(static_cast(f1.gcount()), buffer.data()); } - - // store the result of the checksum calculation in somebytes - assert(chk.getDigestLength() > 0); - assert(static_cast(chk.getDigestLength()) <= - m_somebytes.size()); - if (chk.printToBuffer(m_somebytes.data(), m_somebytes.size())) { - std::cerr << "failed writing digest to buffer!!" << std::endl; + } else { + const auto bufsize = static_cast(buffer.size()); + while (f1 && bytes_to_read > 0) { + f1.read(buffer.data(), std::min(bufsize, bytes_to_read)); + // gcount is never negative, the cast is safe. + bytes_to_read -= f1.gcount(); + chk.update(static_cast(f1.gcount()), buffer.data()); } } + // store the result of the checksum calculation in somebytes + assert(chk.getDigestLength() > 0); + assert(static_cast(chk.getDigestLength()) <= m_somebytes.size()); + if (chk.printToBuffer(m_somebytes.data(), m_somebytes.size())) { + std::cerr << "failed writing digest to buffer!!" << std::endl; + } + return 0; } diff --git a/Fileinfo.hh b/Fileinfo.hh index 4167719..10c1672 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -15,6 +15,7 @@ // os specific headers #include //for off_t and others. 
+class Checksum; struct Options; /** @@ -146,6 +147,7 @@ public: int fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, std::vector& buffer, + Checksum& cksum, const Options& options); /// get a pointer to the bytes read from the file diff --git a/Rdutil.cc b/Rdutil.cc index 2ff65d2..4142ffe 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -19,6 +19,7 @@ #include //sleep // project +#include "Checksum.hh" #include "Fileinfo.hh" //file container #include "Options.hh" #include "RdfindDebug.hh" @@ -549,6 +550,37 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, // first sort on inode (to read efficiently from the hard drive) sortOnDeviceAndInode(); + // make a checksum object which can be reused to avoid creating an object + // per processed file + checksumtypes cktype{}; + switch (type) { + case Fileinfo::readtobuffermode::READ_FIRST_BYTES: + cktype = options.checksum_for_firstlast_bytes; + break; + case Fileinfo::readtobuffermode::READ_LAST_BYTES: + cktype = options.checksum_for_firstlast_bytes; + break; + case Fileinfo::readtobuffermode::CREATE_XXH128_CHECKSUM: + cktype = checksumtypes::XXH128; + break; + case Fileinfo::readtobuffermode::CREATE_SHA1_CHECKSUM: + cktype = checksumtypes::SHA1; + break; + case Fileinfo::readtobuffermode::CREATE_SHA256_CHECKSUM: + cktype = checksumtypes::SHA256; + break; + case Fileinfo::readtobuffermode::CREATE_SHA512_CHECKSUM: + cktype = checksumtypes::SHA512; + break; + case Fileinfo::readtobuffermode::CREATE_MD5_CHECKSUM: + cktype = checksumtypes::MD5; + break; + default: + throw std::runtime_error("bad readtobuffermode"); + } + + Checksum cksum(cktype); + const auto duration = std::chrono::nanoseconds{ options.nsecsleep }; std::vector buffer(options.buffersize, '\0'); @@ -559,7 +591,7 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, ++progress_count; progress_cb(progress_count); } - elem.fillwithbytes(type, lasttype, buffer, options); + elem.fillwithbytes(type, lasttype, buffer, cksum, 
options); if (options.nsecsleep > 0) { std::this_thread::sleep_for(duration); } From 47b44bfbe98cbea4657364b4304e3f7971e9e581 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 20:07:03 +0100 Subject: [PATCH 09/12] polish the usage text --- Options.cc | 116 ++++++++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/Options.cc b/Options.cc index f1ae69f..842b6c3 100644 --- a/Options.cc +++ b/Options.cc @@ -6,64 +6,70 @@ #include "CmdlineParser.hh" #include "Options.hh" +namespace { +constexpr auto usagetext = R"( +Usage: rdfind [options] FILE ... + +Finds duplicate files recursively in the given FILEs (directories), and takes +appropriate action (by default, nothing). Directories listed first are ranked +higher, meaning that if a file is found on several places, the file found in +the directory first encountered on the command line is kept, and the others are +considered duplicate. + +options are (default choice within parentheses) + + Searching options: + + -ignoreempty (true)| false ignore empty files (true implies -minsize 1, + false implies -minsize 0) + -minsize N (N=1) ignores files with size less than N bytes + -maxsize N (N=0) ignores files with size N bytes and larger + (use 0 to disable this check). + -followsymlinks true |(false) follow symlinks + -removeidentinode (true)| false ignore files with nonunique device and inode + + Processing options: + + -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128 + checksum type + xxh128 is very fast, but is noncryptographic. + -buffersize N chunksize in bytes when calculating the + checksum. The default is 1 MiB, can be up + to 128 MiB. 
+ -deterministic (true)| false makes results independent of order + from listing the filesystem + + Action options: + + -makeresultsfile (true)| false makes a results file + -makesymlinks true |(false) replace duplicate files with symbolic links + -makehardlinks true |(false) replace duplicate files with hard links + -deleteduplicates true |(false) delete duplicate files + -dryrun|-n true |(false) print to stdout instead of changing anything + + General options: + + -outputname NAME sets the results file name to NAME, + default is results.txt + -sleep Xms sleep for X milliseconds between file reads. + Default is 0. Only a few values + are supported; 0, 1-5, 10, 25, 50, 100 + -progress true |(false) output progress information + -h|-help|--help show this help and exit + -v|--version display version number and exit + +If properly installed, a man page should be available as man rdfind. + +rdfind is written by Paul Dreik 2006 onwards. +License: GPL v2 or later (at your option). +)"; + +} + void usage() { - const auto indent = " "; - std::cout - << "Usage: rdfind [options] FILE ...\n" - << '\n' - << "Finds duplicate files recursively in the given FILEs (directories),\n" - << "and takes appropriate action (by default, nothing).\n" - << "Directories listed first are ranked higher, meaning that if a\n" - << "file is found on several places, the file found in the directory " - "first\n" - << "encountered on the command line is kept, and the others are " - "considered duplicate.\n" - << '\n' - << "options are (default choice within parentheses)\n" - << '\n' - << " -ignoreempty (true)| false ignore empty files (true implies " - "-minsize 1,\n" - << " false implies -minsize 0)\n" - << " -minsize N (N=1) ignores files with size less than N " - "bytes\n" - << " -maxsize N (N=0) ignores files with size N " - "bytes and larger (use 0 to disable this check).\n" - << " -followsymlinks true |(false) follow symlinks\n" - << " -removeidentinode (true)| false ignore files with
nonunique " - "device and inode\n" - << " -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128\n" - << indent << "checksum type\n" - << indent << "xxh128 is very fast, but is noncryptographic.\n" - << " -buffersize N\n" - << indent << "chunksize in bytes when calculating the checksum.\n" - << indent << "The default is 1 MiB, can be up to 128 MiB.\n" - << " -deterministic (true)| false makes results independent of order\n" - << " from listing the filesystem\n" - << " -makesymlinks true |(false) replace duplicate files with " - "symbolic links\n" - << " -makehardlinks true |(false) replace duplicate files with " - "hard links\n" - << " -makeresultsfile (true)| false makes a results file\n" - << " -outputname name sets the results file name to \"name\" " - "(default results.txt)\n" - << " -progress true |(false) output progress information" - << " -deleteduplicates true |(false) delete duplicate files\n" - << " -sleep Xms sleep for X milliseconds between " - "file reads.\n" - << " Default is 0. Only a few values\n" - << " are supported; 0,1-5,10,25,50,100\n" - << " -dryrun|-n true |(false) print to stdout instead of " - "changing anything\n" - << " -h|-help|--help show this help and exit\n" - << " -v|--version display version number and exit\n" - << '\n' - << "If properly installed, a man page should be available as man rdfind.\n" - << '\n' - << "rdfind is written by Paul Dreik 2006 onwards. 
License: GPL v2 or " - "later (at your option).\n" - << "version is " << VERSION << '\n'; + std::cout << usagetext + 1 << "version is " << VERSION << '\n'; } Options From 4d87fac3b2fd39dfdffdd61a2694358987a8e341 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 20:30:04 +0100 Subject: [PATCH 10/12] allow setting the first/last bytes reading size --- NEWS | 1 + Options.cc | 19 ++++++++++++ README.md | 4 +-- rdfind.1 | 8 +++++ rdfind.cc | 10 ++++-- testcases/verify_skipfirstbytes.sh | 49 ++++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 4 deletions(-) create mode 100755 testcases/verify_skipfirstbytes.sh diff --git a/NEWS b/NEWS index 0c8ac0b..3ae8862 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,7 @@ next optionally disable the checksum step by giving -checksum none optionally show progress +optionally adjust the size of first/last bytes, or disable it completely. 1.7.0 requires a C++17 capable compiler. new fast non-cryptographic hash xxh diff --git a/Options.cc b/Options.cc index 842b6c3..ac5bcee 100644 --- a/Options.cc +++ b/Options.cc @@ -30,6 +30,12 @@ options are (default choice within parentheses) Processing options: + -firstbytessize N sets the size in bytes when comparing the + beginning of files, prior to full checksumming. + default is 64 bytes. Use 0 to disable the stage. + -lastbytessize N sets the size in bytes when comparing the + end of files, prior to full checksumming. + default is 64 bytes. Use 0 to disable the stage. -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128 checksum type xxh128 is very fast, but is noncryptographic.
@@ -128,6 +134,19 @@ parseOptions(Parser& parser) o.remove_identical_inode = parser.get_parsed_bool(); } else if (parser.try_parse_bool("-deterministic")) { o.deterministic = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-firstbytessize")) { + const auto tmp = std::stoll(parser.get_parsed_string()); + if (tmp < 0) { + throw std::runtime_error( + "negative value of firstbytessize not allowed"); + } + o.first_bytes_size = tmp; + } else if (parser.try_parse_string("-lastbytessize")) { + const auto tmp = std::stoll(parser.get_parsed_string()); + if (tmp < 0) { + throw std::runtime_error("negative value of lastbytessize not allowed"); + } + o.last_bytes_size = tmp; } else if (parser.try_parse_string("-checksum")) { if (parser.parsed_string_is("md5")) { o.usemd5 = true; diff --git a/README.md b/README.md index 582fc51..c2c95d9 100644 --- a/README.md +++ b/README.md @@ -86,9 +86,9 @@ Rdfind uses the following algorithm. If N is the number of files to search throu 5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. Also see the comment on hardlinks under ”caveats below”! 6. Sort files on size. Remove files from the list, which have unique sizes. 7. Sort on device and inode(speeds up file reading). Read a few bytes from the beginning of each file (first bytes). -8. Remove files from list that have the same size but different first bytes. +8. Remove files from list that have the same size but different first bytes. (This step can be disabled with -firstbytessize 0.) 9. Sort on device and inode(speeds up file reading). Read a few bytes from the end of each file (last bytes). -10. Remove files from list that have the same size but different last bytes. +10. Remove files from list that have the same size but different last bytes.
(This step can be disabled with -lastbytessize 0.) 11. Sort on device and inode(speeds up file reading). Perform a checksum calculation for each file (unless disabled with -checksum none). 12. Only keep files on the list with the same size and checksum. These are duplicates. 13. Sort list on size, priority number, and depth. The first file for every set of duplicates is considered to be the original. diff --git a/rdfind.1 b/rdfind.1 index 0804cc4..3e99aea 100644 --- a/rdfind.1 +++ b/rdfind.1 @@ -91,6 +91,14 @@ for files, smaller or bigger can improve performance dependent on filesystem and checksum algorithm. The default is 1 MiB, the maximum allowed is 128MiB (inclusive). .TP +.BR \-firstbytessize " " \fIN\fR +Size in bytes when scanning the first bytes of each file, prior to full +checksumming. Setting this to 0 means skipping the step entirely. +.TP +.BR \-lastbytessize " " \fIN\fR +Size in bytes when scanning the last bytes of each file, prior to full +checksumming. Setting this to 0 means skipping the step entirely. +.TP .BR \-deterministic " " \fItrue\fR|\fIfalse\fR If set (the default), sort files of equal rank in an unspecified but deterministic order. This makes the behaviour independent of in which diff --git a/rdfind.cc b/rdfind.cc index 271c464..c64ad42 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -146,9 +146,15 @@ main(int narg, const char* argv[]) // candidates. start looking at the contents.
std::vector> modes{ { Fileinfo::readtobuffermode::NOT_DEFINED, "" }, - { Fileinfo::readtobuffermode::READ_FIRST_BYTES, "first bytes" }, - { Fileinfo::readtobuffermode::READ_LAST_BYTES, "last bytes" }, }; + if (o.first_bytes_size > 0) { + modes.emplace_back(Fileinfo::readtobuffermode::READ_FIRST_BYTES, + "first bytes"); + } + if (o.last_bytes_size > 0) { + modes.emplace_back(Fileinfo::readtobuffermode::READ_LAST_BYTES, + "last bytes"); + } if (o.usemd5) { modes.emplace_back(Fileinfo::readtobuffermode::CREATE_MD5_CHECKSUM, "md5 checksum"); diff --git a/testcases/verify_skipfirstbytes.sh b/testcases/verify_skipfirstbytes.sh new file mode 100755 index 0000000..689e895 --- /dev/null +++ b/testcases/verify_skipfirstbytes.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# Ensures the skip first bytes step checks +# + +set -e +. "$(dirname "$0")/common_funcs.sh" + +FIRSTBYTES=1000 +MIDDLEBYTES=1000 +LASTBYTES=1000 + +# make a file which is longer than "first bytes" and "last bytes" together, +# so we can make two files that differ only in the middle and will +# need checksumming to see they are different. 
+makefiles() { + for f in a b; do + ( + head -c$FIRSTBYTES $f + done +} + +reset_teststate +makefiles + +defaultfirst="-firstbytessize 64" +defaultlast="-lastbytessize 64" + +# with no checksum, we should falsely believe the files are equal +# shellcheck disable=SC2086 +$rdfind -checksum none $defaultfirst $defaultlast a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ] + +# if we set the first bytes size to be very large, we will detect it +# shellcheck disable=SC2086 +$rdfind -checksum none -firstbytessize $((FIRSTBYTES + MIDDLEBYTES)) $defaultlast a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] + +# if we set the last bytes size to be very large, we will also detect it +# shellcheck disable=SC2086 +$rdfind -checksum none $defaultfirst -lastbytessize $((MIDDLEBYTES + LASTBYTES)) a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] + +dbgecho "all is good for the skip first bytes step check!" From ee81dd83c18e41c2511a785b2496d421673a3fd8 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 21:05:15 +0100 Subject: [PATCH 11/12] default first/last bytes size to 4096 --- Options.hh | 4 ++-- testcases/md5collisions.sh | 9 +++++++-- testcases/sha1collisions.sh | 9 +++++++-- testcases/verify_nochecksum.sh | 9 +++++++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Options.hh b/Options.hh index c8a243a..4905546 100644 --- a/Options.hh +++ b/Options.hh @@ -36,9 +36,9 @@ struct Options long nsecsleep = 0; // number of nanoseconds to sleep between each file read. std::string resultsfile = "results.txt"; // results file name. 
std::uint64_t first_bytes_size = - 64; // how much to read during the "read first bytes" step + 4096; // how much to read during the "read first bytes" step std::uint64_t last_bytes_size = - 64; // how much to read during the "read last bytes" step + 4096; // how much to read during the "read last bytes" step /// checksum used for first and last bytes checksumtypes checksum_for_firstlast_bytes = #ifdef HAVE_LIBXXHASH diff --git a/testcases/md5collisions.sh b/testcases/md5collisions.sh index 4030ecf..f820505 100755 --- a/testcases/md5collisions.sh +++ b/testcases/md5collisions.sh @@ -11,12 +11,17 @@ mkdir md5coll cp "$testscriptsdir/md5collisions/"*.ps md5coll sync +# set small first/last bytes sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + #make sure nothing happens when using sha -$rdfind -checksum sha1 -deleteduplicates true md5coll 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 -deleteduplicates true md5coll 2>&1 | tee rdfind.out grep -q "^Deleted 0 files.$" rdfind.out dbgecho "using sha1 did not delete any files, as expected" -$rdfind -checksum md5 -deleteduplicates true md5coll 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum md5 -deleteduplicates true md5coll 2>&1 | tee rdfind.out grep -q "^Deleted 1 files.$" rdfind.out dbgecho "using md5 did delete files, as expected" diff --git a/testcases/sha1collisions.sh b/testcases/sha1collisions.sh index 3c1b817..de32687 100755 --- a/testcases/sha1collisions.sh +++ b/testcases/sha1collisions.sh @@ -6,15 +6,20 @@ set -e reset_teststate +# set small first/last bytes sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + #unpack collisions example from https://shattered.it/static/shattered.pdf base64 --decode <"$testscriptsdir/sha1collisions/coll.tar.bz2.b64" | tar xvfj - #make sure nothing happens when using sha256 -$rdfind -checksum sha256 -deleteduplicates true . 
2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha256 -deleteduplicates true . 2>&1 | tee rdfind.out grep -q "^Deleted 0 files.$" rdfind.out dbgecho "using sha256 did not delete any files, as expected" -$rdfind -checksum sha1 -deleteduplicates true . 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 -deleteduplicates true . 2>&1 | tee rdfind.out grep -q "^Deleted 1 files.$" rdfind.out dbgecho "using sha1 did delete the files, as expected" diff --git a/testcases/verify_nochecksum.sh b/testcases/verify_nochecksum.sh index 5b90075..bd1913e 100755 --- a/testcases/verify_nochecksum.sh +++ b/testcases/verify_nochecksum.sh @@ -24,14 +24,19 @@ makefiles() { reset_teststate makefiles +# set small first/last bytes sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + # with no checksum, we should falsely believe the files are equal -$rdfind -checksum none a* b* \ +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum none a* b* \ | grep "files that are not unique" >output.log verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ] # with checksumming (the default) the files should not be considered equal.
-$rdfind -checksum sha1 a* b* \ +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 a* b* \ | grep "files that are not unique" >output.log verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] From 4d47d337f982806ddad89c38da1acba8b7e50562 Mon Sep 17 00:00:00 2001 From: Paul Dreik Date: Sun, 15 Feb 2026 21:13:16 +0100 Subject: [PATCH 12/12] add new tests to cmake and automake --- Makefile.am | 18 ++++++++++-------- inofficial_cmake/CMakeLists.txt | 3 ++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Makefile.am b/Makefile.am index b298979..a539317 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,19 +9,21 @@ rdfind_SOURCES = rdfind.cc Checksum.cc Dirlist.cc Fileinfo.cc Rdutil.cc \ LDADD = @LIBXXHASH@ #these are the test scripts to execute - I do not know how to glob here, #feedback welcome. -TESTS=testcases/largefilesupport.sh \ +TESTS=testcases/checksum_buffersize.sh \ + testcases/checksum_options.sh \ testcases/hardlink_fails.sh \ + testcases/largefilesupport.sh \ + testcases/md5collisions.sh \ + testcases/sha1collisions.sh \ testcases/symlinking_action.sh \ + testcases/verify_deterministic_operation.sh \ + testcases/verify_dryrun_option.sh \ testcases/verify_filesize_option.sh \ testcases/verify_maxfilesize_option.sh \ - testcases/verify_dryrun_option.sh \ + testcases/verify_nochecksum.sh \ testcases/verify_ranking.sh \ - testcases/verify_deterministic_operation.sh \ - testcases/checksum_options.sh \ - testcases/md5collisions.sh \ - testcases/sha1collisions.sh \ - testcases/checksum_buffersize.sh \ - testcases/verify_nochecksum.sh + testcases/verify_size_savings.sh \ + testcases/verify_skipfirstbytes.sh AUXFILES=testcases/common_funcs.sh \ diff --git a/inofficial_cmake/CMakeLists.txt b/inofficial_cmake/CMakeLists.txt index 8d8b8b4..a88847d 100644 --- a/inofficial_cmake/CMakeLists.txt +++ b/inofficial_cmake/CMakeLists.txt @@ -81,7 +81,8 @@ set(testscripts testcases/verify_maxfilesize_option.sh 
testcases/verify_nochecksum.sh testcases/verify_ranking.sh - testcases/verify_size_savings.sh) + testcases/verify_size_savings.sh + testcases/verify_skipfirstbytes.sh) foreach(testscript ${testscripts}) cmake_path(GET testscript STEM testname)