From 88214af3f16945ba6089a87f18c93b790eb4b5d1 Mon Sep 17 00:00:00 2001 From: Alex Mi Date: Tue, 3 Jun 2025 13:03:56 -0400 Subject: [PATCH] Add option to partially hash files Adds the `-partialchecksum` option which takes in the number of MiB to be hashed from the start and end of the file instead of hashing the whole file. This option is set to 0 by default, which means it is disabled. When it is set to a value like 1, the first 1 MiB of the file is read, then we skip ahead to file_size - 1 MiB and read the last 1 MiB to compute the checksum. This option will only take effect if file_size > 2 * partial checksum size; smaller files are still hashed in full. This has the tradeoff of much quicker hashing times (the different hashing algos don't matter much where you're limited by disk speed) at the risk of false positives: files that differ only in the skipped middle section will produce the same checksum. --- .gitignore | 3 +++ Fileinfo.cc | 40 +++++++++++++++++++++++++++++++++++----- Fileinfo.hh | 5 ++++- Rdutil.cc | 5 +++-- Rdutil.hh | 3 ++- rdfind.cc | 13 ++++++++++++- 6 files changed, 59 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 37fed47..365bf4e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ aclocal* autom4te* config.h ./config.h.in +config.h.in config.log config.status configure @@ -17,9 +18,11 @@ missing rdfind stamp-h1 *~ +.idea compile test-driver nettle32bit/ *.log rdfind-*.tar.gz +results.txt diff --git a/Fileinfo.cc b/Fileinfo.cc index c8ee097..14a2c0e 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -25,7 +25,8 @@ int Fileinfo::fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer) + std::vector& buffer, + int partialchecksum) { // Decide if we are going to read from file or not. @@ -84,10 +85,39 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, if (checksumtype != Checksum::checksumtypes::NOTSET) { Checksum chk(checksumtype); - while (f1) { - f1.read(buffer.data(), static_cast(buffer.size())); - // gcount is never negative, the cast is safe. 
- chk.update(static_cast(f1.gcount()), buffer.data()); + const std::size_t max_bytes = 1024 * 1024 * partialchecksum; + const std::size_t file_size = size(); + + if (partialchecksum > 0 && file_size > 2 * max_bytes) { + // Read first amount of MiB + std::size_t bytes_read = 0; + while (bytes_read < max_bytes && f1) { + f1.read(buffer.data(), static_cast(buffer.size())); + std::streamsize count = f1.gcount(); + chk.update(static_cast(count), buffer.data()); + bytes_read += count; + } + + // Seek to last amount of MiB + f1.clear(); // Clear any EOF flags + f1.seekg(static_cast(file_size - max_bytes), + std::ios::beg); + + // Read last amount of MiB + bytes_read = 0; + while (bytes_read < max_bytes && f1) { + f1.read(buffer.data(), static_cast(buffer.size())); + std::streamsize count = f1.gcount(); + chk.update(static_cast(count), buffer.data()); + bytes_read += count; + } + } else { + // Original behavior: read entire file + while (f1) { + f1.read(buffer.data(), static_cast(buffer.size())); + // gcount is never negative, the cast is safe. 
+ chk.update(static_cast(f1.gcount()), buffer.data()); + } } // store the result of the checksum calculation in somebytes diff --git a/Fileinfo.hh b/Fileinfo.hh index 0ece9f1..2d8e3d9 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -139,11 +139,14 @@ public: * @param lasttype * @param buffer will be used as a scratch buffer - provided from the outside * to avoid having to reallocate it for each file + * @param partialchecksum when filltype is a CREATE_XXX_CHECKSUM type then + * only hash the first and last N MiB of the file skipping contents inbetween * @return zero on success */ int fillwithbytes(enum readtobuffermode filltype, enum readtobuffermode lasttype, - std::vector& buffer); + std::vector& buffer, + int partialchecksum); /// get a pointer to the bytes read from the file const char* getbyteptr() const { return m_somebytes.data(); } diff --git a/Rdutil.cc b/Rdutil.cc index 7b9ddef..d3284a8 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -543,7 +543,8 @@ int Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, const long nsecsleep, - const std::size_t buffersize) + const std::size_t buffersize, + int partialchecksum) { // first sort on inode (to read efficiently from the hard drive) sortOnDeviceAndInode(); @@ -553,7 +554,7 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, std::vector buffer(buffersize, '\0'); for (auto& elem : m_list) { - elem.fillwithbytes(type, lasttype, buffer); + elem.fillwithbytes(type, lasttype, buffer, partialchecksum); if (nsecsleep > 0) { std::this_thread::sleep_for(duration); } diff --git a/Rdutil.hh b/Rdutil.hh index 6f5899a..3d68de8 100644 --- a/Rdutil.hh +++ b/Rdutil.hh @@ -90,7 +90,8 @@ public: int fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, long nsecsleep, - std::size_t buffersize); + std::size_t buffersize, + int partialchecksum); /// make symlinks of duplicates. 
std::size_t makesymlinks(bool dryrun) const; diff --git a/rdfind.cc b/rdfind.cc index 71d7588..125b5aa 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -69,6 +69,9 @@ usage() << " -buffersize N\n" << indent << "chunksize in bytes when calculating the checksum.\n" << indent << "The default is 1 MiB, can be up to 128 MiB.\n" + << " -partialchecksum (N=0) number of MiB to checksum from the " + "start and end of a file instead of hashing the whole file.\n" + << indent << "Zero means hash the whole file. 1 would be 2MiB hashed\n" << " -deterministic (true)| false makes results independent of order\n" << " from listing the filesystem\n" << " -makesymlinks true |(false) replace duplicate files with " @@ -109,6 +112,7 @@ struct Options bool followsymlinks = false; // follow symlinks bool dryrun = false; // only dryrun, don't destroy anything bool remove_identical_inode = true; // remove files with identical inodes + int partialchecksum = 0; // compute checksum for only part of the file bool usemd5 = false; // use md5 checksum to check for similarity bool usesha1 = false; // use sha1 checksum to check for similarity bool usesha256 = false; // use sha256 checksum to check for similarity @@ -176,6 +180,12 @@ parseOptions(Parser& parser) o.remove_identical_inode = parser.get_parsed_bool(); } else if (parser.try_parse_bool("-deterministic")) { o.deterministic = parser.get_parsed_bool(); + } else if (parser.try_parse_string("-partialchecksum")) { + const int partialchecksumsize = std::stoi(parser.get_parsed_string()); + if (partialchecksumsize < 0) { + throw std::runtime_error("negative value of partialchecksum not allowed"); + } + o.partialchecksum = partialchecksumsize; } else if (parser.try_parse_string("-checksum")) { if (parser.parsed_string_is("md5")) { o.usemd5 = true; @@ -414,7 +424,8 @@ main(int narg, const char* argv[]) << it->second << ": " << std::flush; // read bytes (destroys the sorting, for disk reading efficiency) - gswd.fillwithbytes(it[0].first, it[-1].first, 
o.nsecsleep, o.buffersize); + gswd.fillwithbytes( + it[0].first, it[-1].first, o.nsecsleep, o.buffersize, o.partialchecksum); // remove non-duplicates std::cout << "removed " << gswd.removeUniqSizeAndBuffer()