diff --git a/Checksum.cc b/Checksum.cc index ece1268..22bd270 100644 --- a/Checksum.cc +++ b/Checksum.cc @@ -20,31 +20,13 @@ Checksum::Checksum(checksumtypes type) : m_checksumtype(type) { - switch (m_checksumtype) { - case checksumtypes::SHA1: { - sha1_init(&m_state.sha1); - } break; - case checksumtypes::SHA256: { - sha256_init(&m_state.sha256); - } break; - case checksumtypes::SHA512: { - sha512_init(&m_state.sha512); - } break; - case checksumtypes::MD5: { - md5_init(&m_state.md5); - } break; #ifdef HAVE_LIBXXHASH - case checksumtypes::XXH128: { - m_state.xxh128 = XXH3_createState(); - assert(m_state.xxh128 != NULL && "Out of memory!"); - [[maybe_unused]] const auto ret = XXH3_128bits_reset(m_state.xxh128); - assert(ret == XXH_OK); - } break; -#endif - default: - // not allowed to have something that is not recognized. - throw std::runtime_error("wrong checksum type - programming error"); + if (m_checksumtype == checksumtypes::XXH128) { + m_state.xxh128 = XXH3_createState(); + assert(m_state.xxh128 != NULL && "Out of memory!"); } +#endif + reset(); } Checksum::Checksum(Checksum&& other) @@ -120,6 +102,34 @@ Checksum::update(std::size_t length, const char* buffer) static_cast(static_cast(buffer))); } +void +Checksum::reset() +{ + switch (m_checksumtype) { + case checksumtypes::SHA1: { + sha1_init(&m_state.sha1); + } break; + case checksumtypes::SHA256: { + sha256_init(&m_state.sha256); + } break; + case checksumtypes::SHA512: { + sha512_init(&m_state.sha512); + } break; + case checksumtypes::MD5: { + md5_init(&m_state.md5); + } break; +#ifdef HAVE_LIBXXHASH + case checksumtypes::XXH128: { + [[maybe_unused]] const auto ret = XXH3_128bits_reset(m_state.xxh128); + assert(ret == XXH_OK); + } break; +#endif + default: + // not allowed to have something that is not recognized. + throw std::runtime_error("wrong checksum type - programming error"); + } +} + #if 0 // prints checksum to stdout static void diff --git a/Checksum.hh b/Checksum.hh index 83e1f85..3d529e0 100644 --- a/Checksum.hh +++ b/Checksum.hh @@ -20,23 +20,14 @@ #include #endif +#include "ChecksumTypes.hh" + /** * class for checksum calculation */ class Checksum { public: - // these are the checksums that can be calculated - enum class checksumtypes - { - NOTSET = 0, - MD5, - SHA1, - SHA256, - SHA512, - XXH128 - }; - explicit Checksum(checksumtypes type); Checksum(const Checksum& other); Checksum(Checksum&& other); @@ -45,6 +36,9 @@ public: int update(std::size_t length, const unsigned char* buffer); int update(std::size_t length, const char* buffer); + /// makes the object behave as if it was newly constructed + void reset(); + #if 0 /// prints the checksum on stdout int print(); @@ -57,6 +51,8 @@ public: // returns negative if something is wrong. [[gnu::pure]] int getDigestLength() const; + checksumtypes getType() const noexcept { return m_checksumtype; } + private: // to know what type of checksum we are doing const checksumtypes m_checksumtype = checksumtypes::NOTSET; diff --git a/ChecksumTypes.hh b/ChecksumTypes.hh new file mode 100644 index 0000000..87f20e1 --- /dev/null +++ b/ChecksumTypes.hh @@ -0,0 +1,12 @@ +#pragma once + +/// these are the checksums that can be calculated. see class Checksum +enum class checksumtypes +{ + NOTSET = 0, + MD5, + SHA1, + SHA256, + SHA512, + XXH128 +}; diff --git a/Fileinfo.cc b/Fileinfo.cc index c8ee097..ab239e2 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -20,27 +20,33 @@ // project #include "Checksum.hh" //checksum calculation #include "Fileinfo.hh" +#include "Options.hh" #include "UndoableUnlink.hh" int Fileinfo::fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer) + std::vector& buffer, + Checksum& chk, + const Options& options) { - - // Decide if we are going to read from file or not. - // If file is short, first bytes might be ALL bytes! - if (lasttype != readtobuffermode::NOT_DEFINED) { - if (this->size() <= static_cast(m_somebytes.size())) { - // pointless to read - all bytes in the file are in the field - // m_somebytes, or checksum is calculated! + const auto filesize = this->size(); + const auto ufilesize = static_cast(filesize); + // we might already have checksummed the entire file in the previous step, if + // it was smaller than the buffer. + if (chk.getType() == options.checksum_for_firstlast_bytes) { + if (lasttype == readtobuffermode::READ_FIRST_BYTES && + options.first_bytes_size >= ufilesize) { + // already checksummed! + return 0; + } + if (lasttype == readtobuffermode::READ_LAST_BYTES && + options.last_bytes_size >= ufilesize) { + // already checksummed! return 0; } } - // set memory to zero - m_somebytes.fill('\0'); - std::fstream f1; f1.open(m_filename, std::ios_base::in); if (!f1.is_open()) { @@ -49,56 +55,50 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, return -1; } - auto checksumtype = Checksum::checksumtypes::NOTSET; - // read some bytes - switch (filltype) { - case readtobuffermode::READ_FIRST_BYTES: - // read at start of file - f1.read(m_somebytes.data(), SomeByteSize); - break; - case readtobuffermode::READ_LAST_BYTES: - // read at end of file - f1.seekg(-SomeByteSize, std::ios_base::end); - f1.read(m_somebytes.data(), SomeByteSize); - break; - case readtobuffermode::CREATE_MD5_CHECKSUM: - checksumtype = Checksum::checksumtypes::MD5; - break; - case readtobuffermode::CREATE_SHA1_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA1; - break; - case readtobuffermode::CREATE_SHA256_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA256; - break; - case readtobuffermode::CREATE_SHA512_CHECKSUM: - checksumtype = Checksum::checksumtypes::SHA512; - break; - case readtobuffermode::CREATE_XXH128_CHECKSUM: - checksumtype = Checksum::checksumtypes::XXH128; - break; - default: - std::cerr << "does not know how to do that filltype:" - << static_cast(filltype) << std::endl; + bool read_entire_file = true; + std::streamsize bytes_to_read{}; + if (filltype == readtobuffermode::READ_FIRST_BYTES) { + bytes_to_read = static_cast(options.first_bytes_size); + if (filesize > bytes_to_read) { + read_entire_file = false; + } + } else if (filltype == readtobuffermode::READ_LAST_BYTES) { + bytes_to_read = static_cast(options.last_bytes_size); + if (filesize > bytes_to_read) { + read_entire_file = false; + f1.seekg(-options.last_bytes_size, std::ios_base::end); + } } - if (checksumtype != Checksum::checksumtypes::NOTSET) { - Checksum chk(checksumtype); + // set memory to zero + m_somebytes.fill('\0'); + + // ensure the checksum object is in a good state + chk.reset(); + if (read_entire_file) { while (f1) { f1.read(buffer.data(), static_cast(buffer.size())); // gcount is never negative, the cast is safe. chk.update(static_cast(f1.gcount()), buffer.data()); } - - // store the result of the checksum calculation in somebytes - assert(chk.getDigestLength() > 0); - assert(static_cast(chk.getDigestLength()) <= - m_somebytes.size()); - if (chk.printToBuffer(m_somebytes.data(), m_somebytes.size())) { - std::cerr << "failed writing digest to buffer!!" << std::endl; + } else { + const auto bufsize = static_cast(buffer.size()); + while (f1 && bytes_to_read > 0) { + f1.read(buffer.data(), std::min(bufsize, bytes_to_read)); + // gcount is never negative, the cast is safe. + bytes_to_read -= f1.gcount(); + chk.update(static_cast(f1.gcount()), buffer.data()); } } + // store the result of the checksum calculation in somebytes + assert(chk.getDigestLength() > 0); + assert(static_cast(chk.getDigestLength()) <= m_somebytes.size()); + if (chk.printToBuffer(m_somebytes.data(), m_somebytes.size())) { + std::cerr << "failed writing digest to buffer!!" << std::endl; + } + return 0; } diff --git a/Fileinfo.hh b/Fileinfo.hh index 0ece9f1..10c1672 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -15,6 +15,9 @@ // os specific headers #include //for off_t and others. +class Checksum; +struct Options; + /** Holds information about a file. Keeping this small is probably beneficial for performance, because the @@ -143,7 +146,9 @@ public: */ int fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer); + std::vector& buffer, + Checksum& cksum, + const Options& options); /// get a pointer to the bytes read from the file const char* getbyteptr() const { return m_somebytes.data(); } diff --git a/Makefile.am b/Makefile.am index 3bcea57..a539317 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,24 +4,26 @@ AUTOMAKE_OPTIONS = gnu # I would like dist-bzip2 here, but automake complains bin_PROGRAMS = rdfind rdfind_SOURCES = rdfind.cc Checksum.cc Dirlist.cc Fileinfo.cc Rdutil.cc \ - EasyRandom.cc UndoableUnlink.cc CmdlineParser.cc + EasyRandom.cc UndoableUnlink.cc CmdlineParser.cc Options.cc LDADD = @LIBXXHASH@ #these are the test scripts to execute - I do not know how to glob here, #feedback welcome. -TESTS=testcases/largefilesupport.sh \ +TESTS=testcases/checksum_buffersize.sh \ + testcases/checksum_options.sh \ testcases/hardlink_fails.sh \ + testcases/largefilesupport.sh \ + testcases/md5collisions.sh \ + testcases/sha1collisions.sh \ testcases/symlinking_action.sh \ + testcases/verify_deterministic_operation.sh \ + testcases/verify_dryrun_option.sh \ testcases/verify_filesize_option.sh \ testcases/verify_maxfilesize_option.sh \ - testcases/verify_dryrun_option.sh \ + testcases/verify_nochecksum.sh \ testcases/verify_ranking.sh \ - testcases/verify_deterministic_operation.sh \ - testcases/checksum_options.sh \ - testcases/md5collisions.sh \ - testcases/sha1collisions.sh \ - testcases/checksum_buffersize.sh \ - testcases/verify_nochecksum.sh + testcases/verify_size_savings.sh \ + testcases/verify_skipfirstbytes.sh AUXFILES=testcases/common_funcs.sh \ @@ -41,7 +43,7 @@ AUXFILES=testcases/common_funcs.sh \ EXTRA_DIST = \ Dirlist.hh Checksum.hh Fileinfo.hh \ Rdutil.hh bootstrap.sh RdfindDebug.hh EasyRandom.hh UndoableUnlink.hh \ - CmdlineParser.hh \ + CmdlineParser.hh Options.hh ChecksumTypes.hh \ $(TESTS) \ $(AUXFILES) \ rdfind.1 LICENSE \ diff --git a/NEWS b/NEWS index 0c8ac0b..3ae8862 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,7 @@ next optionally disable the checksum step by giving -checksum none optionally show progress +optionally adjust the size of first/last bytes, or disable it completely. 1.7.0 requires a C++17 capable compiler. new fast non-cryptographic hash xxh diff --git a/Options.cc b/Options.cc new file mode 100644 index 0000000..ac5bcee --- /dev/null +++ b/Options.cc @@ -0,0 +1,254 @@ +#include "config.h" //header file from autoconf, must come first to make large file support work properly + +#include +#include + +#include "CmdlineParser.hh" +#include "Options.hh" + +namespace { +constexpr auto usagetext = R"( +Usage: rdfind [options] FILE ... + +Finds duplicate files recursively in the given FILEs (directories), and takes +appropriate action (by default, nothing). Directories listed first are ranked +higher, meaning that if a file is found on several places, the file found in +the directory first encountered on the command line is kept, and the others are +considered duplicate. + +options are (default choice within parentheses) + + Searching options: + + -ignoreempty (true)| false ignore empty files (true implies -minsize 1, + false implies -minsize 0) + -minsize N (N=1) ignores files with size less than N bytes + -maxsize N (N=0) ignores files with size N bytes and larger + (use 0 to disable this check). + -followsymlinks true |(false) follow symlinks + -removeidentinode (true)| false ignore files with nonunique device and inode + + Processing options: + + -firstbytessize N sets the size in bytes when comparing the + beginning of files, prior to full checksumming. + default is 64 byte. Use 0 to disable the stage. + -lastbytessize N sets the size in bytes when comparing the + end of files, prior to full checksumming. + default is 64 byte. Use 0 to disable the stage. + -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128 + checksum type + xxh128 is very fast, but is noncryptographic. + -buffersize N chunksize in bytes when calculating the + checksum. The default is 1 MiB, can be up + to 128 MiB. + -deterministic (true)| false makes results independent of order + from listing the filesystem + + Action options: + + -makeresultsfile (true)| false makes a results file + -makesymlinks true |(false) replace duplicate files with symbolic links + -makehardlinks true |(false) replace duplicate files with hard links + -deleteduplicates true |(false) delete duplicate files + Default is 0. Only a few values + are supported; 0, 1-5, 10, 25, 50, 100 + -dryrun|-n true |(false) print to stdout instead of changing anything + + General options: + + -outputname NAME sets the results file name to NAME, + default is results.txt + -sleep Xms sleep for X milliseconds between file reads. + -progress true |(false) output progress information + -h|-help|--help show this help and exit + -v|--version display version number and exit + +If properly installed, a man page should be available as man rdfind. + +rdfind is written by Paul Dreik 2006 onwards. +License: GPL v2 or later (at your option). +)"; + +} + +void +usage() +{ + std::cout << usagetext + 1 << "version is " << VERSION << '\n'; +} + +Options +parseOptions(Parser& parser) +{ + Options o; + for (; parser.has_args_left(); parser.advance()) { + // empty strings are forbidden as input since they can not be file names or + // options + if (parser.get_current_arg()[0] == '\0') { + std::cerr << "bad argument " << parser.get_current_index() << '\n'; + std::exit(EXIT_FAILURE); + } + + // if we reach the end of the argument list - exit the loop and proceed with + // the file list instead. + if (parser.get_current_arg()[0] != '-') { + // end of argument list - exit! + break; + } + if (parser.try_parse_bool("-makesymlinks")) { + o.makesymlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-makehardlinks")) { + o.makehardlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-makeresultsfile")) { + o.makeresultsfile = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-outputname")) { + o.resultsfile = parser.get_parsed_string(); + } else if (parser.try_parse_bool("-ignoreempty")) { + if (parser.get_parsed_bool()) { + o.minimumfilesize = 1; + } else { + o.minimumfilesize = 0; + } + } else if (parser.try_parse_string("-minsize")) { + const long long minsize = std::stoll(parser.get_parsed_string()); + if (minsize < 0) { + throw std::runtime_error("negative value of minsize not allowed"); + } + o.minimumfilesize = minsize; + } else if (parser.try_parse_string("-maxsize")) { + const long long maxsize = std::stoll(parser.get_parsed_string()); + if (maxsize < 0) { + throw std::runtime_error("negative value of maxsize not allowed"); + } + o.maximumfilesize = maxsize; + } else if (parser.try_parse_bool("-deleteduplicates")) { + o.deleteduplicates = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-followsymlinks")) { + o.followsymlinks = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-dryrun")) { + o.dryrun = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-n")) { + o.dryrun = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-removeidentinode")) { + o.remove_identical_inode = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-deterministic")) { + o.deterministic = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-firstbytessize")) { + const auto tmp = std::stoll(parser.get_parsed_string()); + if (tmp < 0) { + throw std::runtime_error( + "negative value of firstbytessize not allowed"); + } + o.first_bytes_size = tmp; + } else if (parser.try_parse_string("-lastbytessize")) { + const auto tmp = std::stoll(parser.get_parsed_string()); + if (tmp < 0) { + throw std::runtime_error("negative value of lastbytessize not allowed"); + } + o.last_bytes_size = tmp; + } else if (parser.try_parse_string("-checksum")) { + if (parser.parsed_string_is("md5")) { + o.usemd5 = true; + } else if (parser.parsed_string_is("sha1")) { + o.usesha1 = true; + } else if (parser.parsed_string_is("sha256")) { + o.usesha256 = true; + } else if (parser.parsed_string_is("sha512")) { + o.usesha512 = true; + } else if (parser.parsed_string_is("xxh128")) { +#ifdef HAVE_LIBXXHASH + o.usexxh128 = true; +#else + std::cerr << "not compiled with xxhash, to make use of xxh128 please " + "reconfigure and rebuild '--with-xxhash'\n"; + std::exit(EXIT_FAILURE); +#endif + } else if (parser.parsed_string_is("none")) { + std::cout + << "DANGER! -checksum none given, will skip the checksumming stage\n"; + o.nochecksum = true; + } else { + std::cerr << "expected none/md5/sha1/sha256/sha512/xxh128, not \"" + << parser.get_parsed_string() << "\"\n"; + std::exit(EXIT_FAILURE); + } + } else if (parser.try_parse_string("-buffersize")) { + const long buffersize = std::stoll(parser.get_parsed_string()); + constexpr long max_buffersize = 128 << 20; + if (buffersize <= 0) { + std::cerr << "a negative or zero buffersize is not allowed\n"; + std::exit(EXIT_FAILURE); + } else if (buffersize > max_buffersize) { + std::cerr << "a maximum of " << (max_buffersize >> 20) + << " MiB buffersize is allowed, got " << (buffersize >> 20) + << " MiB\n"; + std::exit(EXIT_FAILURE); + } + o.buffersize = static_cast(buffersize); + } else if (parser.try_parse_string("-sleep")) { + const auto nextarg = std::string(parser.get_parsed_string()); + if (nextarg == "1ms") { + o.nsecsleep = 1000000; + } else if (nextarg == "2ms") { + o.nsecsleep = 2000000; + } else if (nextarg == "3ms") { + o.nsecsleep = 3000000; + } else if (nextarg == "4ms") { + o.nsecsleep = 4000000; + } else if (nextarg == "5ms") { + o.nsecsleep = 5000000; + } else if (nextarg == "10ms") { + o.nsecsleep = 10000000; + } else if (nextarg == "25ms") { + o.nsecsleep = 25000000; + } else if (nextarg == "50ms") { + o.nsecsleep = 50000000; + } else if (nextarg == "100ms") { + o.nsecsleep = 100000000; + } else { + std::cerr << "sorry, can only understand a few sleep values for " + "now. \"" + << nextarg << "\" is not among them.\n"; + std::exit(EXIT_FAILURE); + } + } else if (parser.try_parse_bool("-progress")) { + o.showprogress = parser.get_parsed_bool(); + } else if (parser.current_arg_is("-help") || parser.current_arg_is("-h") || + parser.current_arg_is("--help")) { + usage(); + std::exit(EXIT_SUCCESS); + } else if (parser.current_arg_is("-version") || + parser.current_arg_is("--version") || + parser.current_arg_is("-v")) { + std::cout << "This is rdfind version " << VERSION << '\n'; + std::exit(EXIT_SUCCESS); + } else { + std::cerr << "did not understand option " << parser.get_current_index() + << ":\"" << parser.get_current_arg() << "\"\n"; + std::exit(EXIT_FAILURE); + } + } + + // fix default values + if (o.maximumfilesize == 0) { + o.maximumfilesize = std::numeric_limits::max(); + } + + // verify conflicting arguments + if (!(o.minimumfilesize < o.maximumfilesize)) { + std::cerr << "maximum filesize " << o.maximumfilesize + << " must be larger than minimum filesize " << o.minimumfilesize + << "\n"; + std::exit(EXIT_FAILURE); + } + + // done with parsing of options. remaining arguments are files and dirs. + + // decide what checksum to use, default to sha1 + if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128 && + !o.nochecksum) { + o.usesha1 = true; + } + return o; +} diff --git a/Options.hh b/Options.hh new file mode 100644 index 0000000..4905546 --- /dev/null +++ b/Options.hh @@ -0,0 +1,56 @@ +#pragma once + +#include "config.h" + +#include +#include + +#include "ChecksumTypes.hh" +#include "Fileinfo.hh" + +class Parser; + +struct Options +{ + // operation mode and default values + bool makesymlinks = false; // turn duplicates into symbolic links + bool makehardlinks = false; // turn duplicates into hard links + bool makeresultsfile = true; // write a results file + Fileinfo::filesizetype minimumfilesize = + 1; // minimum file size to be noticed (0 - include empty files) + Fileinfo::filesizetype maximumfilesize = + 0; // if nonzero, files this size or larger are ignored + bool deleteduplicates = false; // delete duplicate files + bool followsymlinks = false; // follow symlinks + bool dryrun = false; // only dryrun, don't destroy anything + bool remove_identical_inode = true; // remove files with identical inodes + bool usemd5 = false; // use md5 checksum to check for similarity + bool usesha1 = false; // use sha1 checksum to check for similarity + bool usesha256 = false; // use sha256 checksum to check for similarity + bool usesha512 = false; // use sha512 checksum to check for similarity + bool usexxh128 = false; // use xxh128 checksum to check for similarity + bool nochecksum = false; // skip using checksumming (unsafe!) + bool deterministic = true; // be independent of filesystem order + bool showprogress = false; // show progress while reading file contents + std::size_t buffersize = 1 << 20; // chunksize to use when reading files + long nsecsleep = 0; // number of nanoseconds to sleep between each file read. + std::string resultsfile = "results.txt"; // results file name. + std::uint64_t first_bytes_size = + 4096; // how much to read during the "read first bytes" step + std::uint64_t last_bytes_size = + 4096; // how much to read during the "read last bytes" step + /// checksum used for first and last bytes + checksumtypes checksum_for_firstlast_bytes = +#ifdef HAVE_LIBXXHASH + checksumtypes::XXH128; +#else + // the fastest one after xxh128 is sha1 + checksumtypes::SHA1; +#endif +}; + +void +usage(); + +Options +parseOptions(Parser& parser); diff --git a/README.md b/README.md index 582fc51..c2c95d9 100644 --- a/README.md +++ b/README.md @@ -86,9 +86,9 @@ Rdfind uses the following algorithm. If N is the number of files to search throu 5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. Also see the comment on hardlinks under ”caveats below”! 6. Sort files on size. Remove files from the list, which have unique sizes. 7. Sort on device and inode(speeds up file reading). Read a few bytes from the beginning of each file (first bytes). -8. Remove files from list that have the same size but different first bytes. +8. Remove files from list that have the same size but different first bytes. (This step is possible to disable by using -firstbytessize 0). 9. Sort on device and inode(speeds up file reading). Read a few bytes from the end of each file (last bytes). -10. Remove files from list that have the same size but different last bytes. +10. Remove files from list that have the same size but different last bytes. (This step is possible to disable by using -lastbytessize 0). 11. Sort on device and inode(speeds up file reading). Perform a checksum calculation for each file (unless disabled with -checksum none). 12. Only keep files on the list with the same size and checksum. These are duplicates. 13. Sort list on size, priority number, and depth. The first file for every set of duplicates is considered to be the original. diff --git a/Rdutil.cc b/Rdutil.cc index 6a4902c..4142ffe 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -19,7 +19,9 @@ #include //sleep // project +#include "Checksum.hh" #include "Fileinfo.hh" //file container +#include "Options.hh" #include "RdfindDebug.hh" // class declaration @@ -542,16 +544,46 @@ Rdutil::saveablespace(std::ostream& out) const int Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, - const long nsecsleep, - const std::size_t buffersize, + const Options& options, std::function progress_cb) { // first sort on inode (to read efficiently from the hard drive) sortOnDeviceAndInode(); - const auto duration = std::chrono::nanoseconds{ nsecsleep }; + // make a checksum object which can be reused to avoid creating an object + // per processed file + checksumtypes cktype{}; + switch (type) { + case Fileinfo::readtobuffermode::READ_FIRST_BYTES: + cktype = options.checksum_for_firstlast_bytes; + break; + case Fileinfo::readtobuffermode::READ_LAST_BYTES: + cktype = options.checksum_for_firstlast_bytes; + break; + case Fileinfo::readtobuffermode::CREATE_XXH128_CHECKSUM: + cktype = checksumtypes::XXH128; + break; + case Fileinfo::readtobuffermode::CREATE_SHA1_CHECKSUM: + cktype = checksumtypes::SHA1; + break; + case Fileinfo::readtobuffermode::CREATE_SHA256_CHECKSUM: + cktype = checksumtypes::SHA256; + break; + case Fileinfo::readtobuffermode::CREATE_SHA512_CHECKSUM: + cktype = checksumtypes::SHA512; + break; + case Fileinfo::readtobuffermode::CREATE_MD5_CHECKSUM: + cktype = checksumtypes::MD5; + break; + default: + throw std::runtime_error("bad readtobuffermode"); + } + + Checksum cksum(cktype); + + const auto duration = std::chrono::nanoseconds{ options.nsecsleep }; - std::vector buffer(buffersize, '\0'); + std::vector buffer(options.buffersize, '\0'); std::size_t progress_count = 0; for (auto& elem : m_list) { @@ -559,8 +591,8 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, ++progress_count; progress_cb(progress_count); } - elem.fillwithbytes(type, lasttype, buffer); - if (nsecsleep > 0) { + elem.fillwithbytes(type, lasttype, buffer, cksum, options); + if (options.nsecsleep > 0) { std::this_thread::sleep_for(duration); } } diff --git a/Rdutil.hh b/Rdutil.hh index 47f6303..2bd629f 100644 --- a/Rdutil.hh +++ b/Rdutil.hh @@ -14,6 +14,8 @@ #include "Fileinfo.hh" //file container +struct Options; + class Rdutil { public: @@ -90,8 +92,7 @@ public: // nanoseconds can be made between each file. int fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, - long nsecsleep, - std::size_t buffersize, + const Options& options, std::function progress_cb); /// make symlinks of duplicates. diff --git a/inofficial_cmake/CMakeLists.txt b/inofficial_cmake/CMakeLists.txt index 8fc7906..a88847d 100644 --- a/inofficial_cmake/CMakeLists.txt +++ b/inofficial_cmake/CMakeLists.txt @@ -24,6 +24,7 @@ add_library( rdfindimpl OBJECT ../Checksum.cc ../Checksum.hh + ../ChecksumTypes.hh ../CmdlineParser.cc ../CmdlineParser.hh ../Dirlist.cc @@ -32,6 +33,8 @@ add_library( ../EasyRandom.hh ../Fileinfo.cc ../Fileinfo.hh + ../Options.cc + ../Options.hh ../RdfindDebug.hh ../Rdutil.cc ../Rdutil.hh @@ -78,7 +81,8 @@ set(testscripts testcases/verify_maxfilesize_option.sh testcases/verify_nochecksum.sh testcases/verify_ranking.sh - testcases/verify_size_savings.sh) + testcases/verify_size_savings.sh + testcases/verify_skipfirstbytes.sh) foreach(testscript ${testscripts}) cmake_path(GET testscript STEM testname) diff --git a/rdfind.1 b/rdfind.1 index 0804cc4..3e99aea 100644 --- a/rdfind.1 +++ b/rdfind.1 @@ -91,6 +91,14 @@ for files, smaller or bigger can improve performance dependent on filesystem and checksum algorithm. The default is 1 MiB, the maximum allowed is 128MiB (inclusive). .TP +.BR \-firstbytessize " " \fIN\fR +Size in bytes when scanning the first bytes of each file, prior to full +checksumming. Setting this to 0 means skipping the step entirely. +.TP +.BR \-lastbytessize " " \fIN\fR +Size in bytes when scanning the last bytes of each file, prior to full +checksumming. Setting this to 0 means skipping the step entirely. +.TP .BR \-deterministic " " \fItrue\fR|\fIfalse\fR If set (the default), sort files of equal rank in an unspecified but deterministic order. This makes the behaviour independent of in which diff --git a/rdfind.cc b/rdfind.cc index 1d5f23d..c64ad42 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -11,7 +11,6 @@ static_assert(__cplusplus >= 201703L, // std #include -#include #include #include #include @@ -20,6 +19,7 @@ static_assert(__cplusplus >= 201703L, #include "CmdlineParser.hh" #include "Dirlist.hh" //to find files #include "Fileinfo.hh" //file container +#include "Options.hh" // #include "RdfindDebug.hh" //debug macro #include "Rdutil.hh" //to do some work @@ -27,7 +27,6 @@ static_assert(__cplusplus >= 201703L, // this vector holds the information about all files found std::vector filelist; -struct Options; const Options* global_options{}; /** @@ -37,255 +36,6 @@ const Options* global_options{}; */ int current_cmdline_index = 0; -static void -usage() -{ - const auto indent = " "; - std::cout - << "Usage: rdfind [options] FILE ...\n" - << '\n' - << "Finds duplicate files recursively in the given FILEs (directories),\n" - << "and takes appropriate action (by default, nothing).\n" - << "Directories listed first are ranked higher, meaning that if a\n" - << "file is found on several places, the file found in the directory " - "first\n" - << "encountered on the command line is kept, and the others are " - "considered duplicate.\n" - << '\n' - << "options are (default choice within parentheses)\n" - << '\n' - << " -ignoreempty (true)| false ignore empty files (true implies " - "-minsize 1,\n" - << " false implies -minsize 0)\n" - << " -minsize N (N=1) ignores files with size less than N " - "bytes\n" - << " -maxsize N (N=0) ignores files with size N " - "bytes and larger (use 0 to disable this check).\n" - << " -followsymlinks true |(false) follow symlinks\n" - << " -removeidentinode (true)| false ignore files with nonunique " - "device and inode\n" - << " -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128\n" - << indent << "checksum type\n" - << indent << "xxh128 is very fast, but is noncryptographic.\n" - << " -buffersize N\n" - << indent << "chunksize in bytes when calculating the checksum.\n" - << indent << "The default is 1 MiB, can be up to 128 MiB.\n" - << " -deterministic (true)| false makes results independent of order\n" - << " from listing the filesystem\n" - << " -makesymlinks true |(false) replace duplicate files with " - "symbolic links\n" - << " -makehardlinks true |(false) replace duplicate files with " - "hard links\n" - << " -makeresultsfile (true)| false makes a results file\n" - << " -outputname name sets the results file name to \"name\" " - "(default results.txt)\n" - << " -progress true |(false) output progress information" - << " -deleteduplicates true |(false) delete duplicate files\n" - << " -sleep Xms sleep for X milliseconds between " - "file reads.\n" - << " Default is 0. Only a few values\n" - << " are supported; 0,1-5,10,25,50,100\n" - << " -dryrun|-n true |(false) print to stdout instead of " - "changing anything\n" - << " -h|-help|--help show this help and exit\n" - << " -v|--version display version number and exit\n" - << '\n' - << "If properly installed, a man page should be available as man rdfind.\n" - << '\n' - << "rdfind is written by Paul Dreik 2006 onwards. License: GPL v2 or " - "later (at your option).\n" - << "version is " << VERSION << '\n'; -} - -struct Options -{ - // operation mode and default values - bool makesymlinks = false; // turn duplicates into symbolic links - bool makehardlinks = false; // turn duplicates into hard links - bool makeresultsfile = true; // write a results file - Fileinfo::filesizetype minimumfilesize = - 1; // minimum file size to be noticed (0 - include empty files) - Fileinfo::filesizetype maximumfilesize = - 0; // if nonzero, files this size or larger are ignored - bool deleteduplicates = false; // delete duplicate files - bool followsymlinks = false; // follow symlinks - bool dryrun = false; // only dryrun, don't destroy anything - bool remove_identical_inode = true; // remove files with identical inodes - bool usemd5 = false; // use md5 checksum to check for similarity - bool usesha1 = false; // use sha1 checksum to check for similarity - bool usesha256 = false; // use sha256 checksum to check for similarity - bool usesha512 = false; // use sha512 checksum to check for similarity - bool usexxh128 = false; // use xxh128 checksum to check for similarity - bool nochecksum = false; // skip using checksumming (unsafe!) - bool deterministic = true; // be independent of filesystem order - bool showprogress = false; // show progress while reading file contents - std::size_t buffersize = 1 << 20; // chunksize to use when reading files - long nsecsleep = 0; // number of nanoseconds to sleep between each file read. - std::string resultsfile = "results.txt"; // results file name. -}; - -Options -parseOptions(Parser& parser) -{ - Options o; - for (; parser.has_args_left(); parser.advance()) { - // empty strings are forbidden as input since they can not be file names or - // options - if (parser.get_current_arg()[0] == '\0') { - std::cerr << "bad argument " << parser.get_current_index() << '\n'; - std::exit(EXIT_FAILURE); - } - - // if we reach the end of the argument list - exit the loop and proceed with - // the file list instead. - if (parser.get_current_arg()[0] != '-') { - // end of argument list - exit! - break; - } - if (parser.try_parse_bool("-makesymlinks")) { - o.makesymlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-makehardlinks")) { - o.makehardlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-makeresultsfile")) { - o.makeresultsfile = parser.get_parsed_bool(); - } else if (parser.try_parse_string("-outputname")) { - o.resultsfile = parser.get_parsed_string(); - } else if (parser.try_parse_bool("-ignoreempty")) { - if (parser.get_parsed_bool()) { - o.minimumfilesize = 1; - } else { - o.minimumfilesize = 0; - } - } else if (parser.try_parse_string("-minsize")) { - const long long minsize = std::stoll(parser.get_parsed_string()); - if (minsize < 0) { - throw std::runtime_error("negative value of minsize not allowed"); - } - o.minimumfilesize = minsize; - } else if (parser.try_parse_string("-maxsize")) { - const long long maxsize = std::stoll(parser.get_parsed_string()); - if (maxsize < 0) { - throw std::runtime_error("negative value of maxsize not allowed"); - } - o.maximumfilesize = maxsize; - } else if (parser.try_parse_bool("-deleteduplicates")) { - o.deleteduplicates = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-followsymlinks")) { - o.followsymlinks = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-dryrun")) { - o.dryrun = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-n")) { - o.dryrun = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-removeidentinode")) { - o.remove_identical_inode = parser.get_parsed_bool(); - } else if (parser.try_parse_bool("-deterministic")) { - o.deterministic = parser.get_parsed_bool(); - } else if (parser.try_parse_string("-checksum")) { - if (parser.parsed_string_is("md5")) { - o.usemd5 = true; - } else if (parser.parsed_string_is("sha1")) { - o.usesha1 = true; - } else if (parser.parsed_string_is("sha256")) { - o.usesha256 = true; - } else if (parser.parsed_string_is("sha512")) { - o.usesha512 = true; - } else if (parser.parsed_string_is("xxh128")) { -#ifdef HAVE_LIBXXHASH - o.usexxh128 = true; -#else - std::cerr << "not compiled with xxhash, to make use of xxh128 please " - "reconfigure and rebuild '--with-xxhash'\n"; - std::exit(EXIT_FAILURE); -#endif - } else if (parser.parsed_string_is("none")) { - std::cout - << "DANGER! -checksum none given, will skip the checksumming stage\n"; - o.nochecksum = true; - } else { - std::cerr << "expected none/md5/sha1/sha256/sha512/xxh128, not \"" - << parser.get_parsed_string() << "\"\n"; - std::exit(EXIT_FAILURE); - } - } else if (parser.try_parse_string("-buffersize")) { - const long buffersize = std::stoll(parser.get_parsed_string()); - constexpr long max_buffersize = 128 << 20; - if (buffersize <= 0) { - std::cerr << "a negative or zero buffersize is not allowed\n"; - std::exit(EXIT_FAILURE); - } else if (buffersize > max_buffersize) { - std::cerr << "a maximum of " << (max_buffersize >> 20) - << " MiB buffersize is allowed, got " << (buffersize >> 20) - << " MiB\n"; - std::exit(EXIT_FAILURE); - } - o.buffersize = static_cast(buffersize); - } else if (parser.try_parse_string("-sleep")) { - const auto nextarg = std::string(parser.get_parsed_string()); - if (nextarg == "1ms") { - o.nsecsleep = 1000000; - } else if (nextarg == "2ms") { - o.nsecsleep = 2000000; - } else if (nextarg == "3ms") { - o.nsecsleep = 3000000; - } else if (nextarg == "4ms") { - o.nsecsleep = 4000000; - } else if (nextarg == "5ms") { - o.nsecsleep = 5000000; - } else if (nextarg == "10ms") { - o.nsecsleep = 10000000; - } else if (nextarg == "25ms") { - o.nsecsleep = 25000000; - } else if (nextarg == "50ms") { - o.nsecsleep = 50000000; - } else if (nextarg == "100ms") { - o.nsecsleep = 100000000; - } else { - std::cerr << "sorry, can only understand a few sleep values for " - "now. \"" - << nextarg << "\" is not among them.\n"; - std::exit(EXIT_FAILURE); - } - } else if (parser.try_parse_bool("-progress")) { - o.showprogress = parser.get_parsed_bool(); - } else if (parser.current_arg_is("-help") || parser.current_arg_is("-h") || - parser.current_arg_is("--help")) { - usage(); - std::exit(EXIT_SUCCESS); - } else if (parser.current_arg_is("-version") || - parser.current_arg_is("--version") || - parser.current_arg_is("-v")) { - std::cout << "This is rdfind version " << VERSION << '\n'; - std::exit(EXIT_SUCCESS); - } else { - std::cerr << "did not understand option " << parser.get_current_index() - << ":\"" << parser.get_current_arg() << "\"\n"; - std::exit(EXIT_FAILURE); - } - } - - // fix default values - if (o.maximumfilesize == 0) { - o.maximumfilesize = std::numeric_limits::max(); - } - - // verify conflicting arguments - if (!(o.minimumfilesize < o.maximumfilesize)) { - std::cerr << "maximum filesize " << o.maximumfilesize - << " must be larger than minimum filesize " << o.minimumfilesize - << "\n"; - std::exit(EXIT_FAILURE); - } - - // done with parsing of options. remaining arguments are files and dirs. - - // decide what checksum to use, default to sha1 - if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128 && - !o.nochecksum) { - o.usesha1 = true; - } - return o; -} - // function to add items to the list of all files static int report(const std::string& path, const std::string& name, int depth) @@ -396,9 +146,15 @@ main(int narg, const char* argv[]) // candidates. start looking at the contents. std::vector> modes{ { Fileinfo::readtobuffermode::NOT_DEFINED, "" }, - { Fileinfo::readtobuffermode::READ_FIRST_BYTES, "first bytes" }, - { Fileinfo::readtobuffermode::READ_LAST_BYTES, "last bytes" }, }; + if (o.first_bytes_size > 0) { + modes.emplace_back(Fileinfo::readtobuffermode::READ_FIRST_BYTES, + "first bytes"); + } + if (o.last_bytes_size > 0) { + modes.emplace_back(Fileinfo::readtobuffermode::READ_LAST_BYTES, + "last bytes"); + } if (o.usemd5) { modes.emplace_back(Fileinfo::readtobuffermode::CREATE_MD5_CHECKSUM, "md5 checksum"); @@ -441,8 +197,7 @@ main(int narg, const char* argv[]) } // read bytes (destroys the sorting, for disk reading efficiency) - gswd.fillwithbytes( - it[0].first, it[-1].first, o.nsecsleep, o.buffersize, progress_callback); + gswd.fillwithbytes(it[0].first, it[-1].first, o, progress_callback); // remove non-duplicates std::cout << "removed " << gswd.removeUniqSizeAndBuffer() diff --git a/testcases/md5collisions.sh b/testcases/md5collisions.sh index 4030ecf..f820505 100755 --- a/testcases/md5collisions.sh +++ b/testcases/md5collisions.sh @@ -11,12 +11,17 @@ mkdir md5coll cp "$testscriptsdir/md5collisions/"*.ps md5coll sync +# set small first/last bytes sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + #make sure nothing happens when using sha -$rdfind -checksum sha1 -deleteduplicates true md5coll 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 -deleteduplicates true md5coll 2>&1 | tee rdfind.out grep -q "^Deleted 0 files.$" rdfind.out dbgecho "using sha1 did not delete any files, as expected" -$rdfind -checksum md5 -deleteduplicates true md5coll 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum md5 -deleteduplicates true md5coll 2>&1 | tee rdfind.out grep -q "^Deleted 1 files.$" rdfind.out dbgecho "using md5 did delete files, as expected" diff --git a/testcases/sha1collisions.sh b/testcases/sha1collisions.sh index 3c1b817..de32687 100755 --- a/testcases/sha1collisions.sh +++ b/testcases/sha1collisions.sh @@ -6,15 +6,20 @@ set -e reset_teststate +# set small first/last bytes sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + #unpack collisions example from https://shattered.it/static/shattered.pdf base64 --decode <"$testscriptsdir/sha1collisions/coll.tar.bz2.b64" | tar xvfj - #make sure nothing happens when using sha256 -$rdfind -checksum sha256 -deleteduplicates true . 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha256 -deleteduplicates true . 2>&1 | tee rdfind.out grep -q "^Deleted 0 files.$" rdfind.out dbgecho "using sha256 did not delete any files, as expected" -$rdfind -checksum sha1 -deleteduplicates true . 2>&1 | tee rdfind.out +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 -deleteduplicates true . 2>&1 | tee rdfind.out grep -q "^Deleted 1 files.$" rdfind.out dbgecho "using sha1 did delete the files, as expected" diff --git a/testcases/verify_nochecksum.sh b/testcases/verify_nochecksum.sh index 5b90075..bd1913e 100755 --- a/testcases/verify_nochecksum.sh +++ b/testcases/verify_nochecksum.sh @@ -24,14 +24,19 @@ makefiles() { reset_teststate makefiles +# set small first/last to small sizes +firstlastoptions="-firstbytessize 64 -lastbytessize 64" + # with no checksum, we should falsely believe the files are equal -$rdfind -checksum none a* b* \ +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum none a* b* \ | grep "files that are not unique" >output.log verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ] # with checksumming (the default) the files should not be considered equal. -$rdfind -checksum sha1 a* b* \ +# shellcheck disable=SC2086 +$rdfind $firstlastoptions -checksum sha1 a* b* \ | grep "files that are not unique" >output.log verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] diff --git a/testcases/verify_skipfirstbytes.sh b/testcases/verify_skipfirstbytes.sh new file mode 100755 index 0000000..689e895 --- /dev/null +++ b/testcases/verify_skipfirstbytes.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# Ensures the skip first bytes step checks +# + +set -e +. "$(dirname "$0")/common_funcs.sh" + +FIRSTBYTES=1000 +MIDDLEBYTES=1000 +LASTBYTES=1000 + +# make a file which is longer than "first bytes" and "last bytes" together, +# so we can make two files that differ only in the middle and will +# need checksumming to see they are different. +makefiles() { + for f in a b; do + ( + head -c$FIRSTBYTES $f + done +} + +reset_teststate +makefiles + +defaultfirst="-firstbytessize 64" +defaultlast="-lastbytessize 64" + +# with no checksum, we should falsely believe the files are equal +# shellcheck disable=SC2086 +$rdfind -checksum none $defaultfirst $defaultlast a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ] + +# if we set the first bytes size to be very large, we will detect it +# shellcheck disable=SC2086 +$rdfind -checksum none -firstbytessize $((FIRSTBYTES + MIDDLEBYTES)) $defaultlast a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] + +# if we set the last bytes size to be very large, we will also detect it +# shellcheck disable=SC2086 +$rdfind -checksum none $defaultfirst -lastbytessize $((MIDDLEBYTES + LASTBYTES)) a* b* \ + | grep "files that are not unique" >output.log +verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ] + +dbgecho "all is good for the skip first bytes step check!" diff --git a/unittests/test_checksum.cc b/unittests/test_checksum.cc index 253aeb0..920d586 100644 --- a/unittests/test_checksum.cc +++ b/unittests/test_checksum.cc @@ -4,7 +4,7 @@ #include namespace { -using enum Checksum::checksumtypes; +using enum checksumtypes; const auto types = { MD5, SHA1, SHA256, @@ -94,3 +94,25 @@ TEST_CASE("copy from an rval is fine") REQUIRE(expected == finalize_checksum(movedto)); } } + +TEST_CASE("resetting the state works as intended") +{ + static const char* content = "abcd"; + for (auto type : types) { + + Checksum ck1(type); + const auto v1 = finalize_checksum(ck1); + + Checksum ck2(type); + REQUIRE(0 == ck2.update(std::strlen(content), content)); + const auto v2 = finalize_checksum(ck2); + REQUIRE(v1 != v2); + + // resetting should get the same checksum as an empty object. + Checksum ck3(type); + REQUIRE(0 == ck3.update(std::strlen(content), content)); + ck3.reset(); + const auto v3 = finalize_checksum(ck3); + REQUIRE(v1 == v3); + } +}