From 88214af3f16945ba6089a87f18c93b790eb4b5d1 Mon Sep 17 00:00:00 2001
From: Alex Mi <alexmi3.14@gmail.com>
Date: Tue, 3 Jun 2025 13:03:56 -0400
Subject: [PATCH] Add option to partially hash files

Adds the `-partialchecksum` option which takes in the number
of MB to be hashed from the start and end of the file instead of
hashing the whole file.

This option is set to 0 by default which means it is disabled.

When it is set to a value like 1, the first 1mb of the file is
read, then we skip ahead to file_size - 1mb and read that last
1mb to compute the checksum.
This option will only take effect if file_size > 2 * partial
checksum size.

This has the tradeoff of much quicker hashing times (the different
hashing algos don't matter much where you're limited by disk speed)
at the risk of false positives.
---
 .gitignore  |  3 +++
 Fileinfo.cc | 40 +++++++++++++++++++++++++++++++++++-----
 Fileinfo.hh |  5 ++++-
 Rdutil.cc   |  5 +++--
 Rdutil.hh   |  3 ++-
 rdfind.cc   | 13 ++++++++++++-
 6 files changed, 59 insertions(+), 10 deletions(-)
diff --git a/.gitignore b/.gitignore
index 37fed47..365bf4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ aclocal*
 autom4te*
 config.h
 ./config.h.in
+config.h.in
 config.log
 config.status
 configure
@@ -17,9 +18,11 @@ missing
 rdfind
 stamp-h1
 *~
+.idea
 
 compile
 test-driver
 nettle32bit/
 *.log
 rdfind-*.tar.gz
+results.txt
diff --git a/Fileinfo.cc b/Fileinfo.cc
index c8ee097..14a2c0e 100644
--- a/Fileinfo.cc
+++ b/Fileinfo.cc
@@ -25,7 +25,8 @@
 int
 Fileinfo::fillwithbytes(enum readtobuffermode filltype,
                         enum readtobuffermode lasttype,
-                        std::vector<char>& buffer)
+                        std::vector<char>& buffer,
+                        int partialchecksum)
 {
 
   // Decide if we are going to read from file or not.
@@ -84,10 +85,39 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
   if (checksumtype != Checksum::checksumtypes::NOTSET) {
     Checksum chk(checksumtype);
 
-    while (f1) {
-      f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
-      // gcount is never negative, the cast is safe.
-      chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
+    const std::size_t max_bytes = 1024 * 1024 * partialchecksum;
+    const std::size_t file_size = size();
+
+    if (partialchecksum > 0 && file_size > 2 * max_bytes) {
+      // Read first amount of MiB
+      std::size_t bytes_read = 0;
+      while (bytes_read < max_bytes && f1) {
+        f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        std::streamsize count = f1.gcount();
+        chk.update(static_cast<std::size_t>(count), buffer.data());
+        bytes_read += count;
+      }
+
+      // Seek to last amount of MiB
+      f1.clear(); // Clear any EOF flags
+      f1.seekg(static_cast<std::streamoff>(file_size - max_bytes),
+               std::ios::beg);
+
+      // Read last amount of MiB
+      bytes_read = 0;
+      while (bytes_read < max_bytes && f1) {
+        f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        std::streamsize count = f1.gcount();
+        chk.update(static_cast<std::size_t>(count), buffer.data());
+        bytes_read += count;
+      }
+    } else {
+      // Original behavior: read entire file
+      while (f1) {
+        f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        // gcount is never negative, the cast is safe.
+        chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
+      }
     }
 
     // store the result of the checksum calculation in somebytes
diff --git a/Fileinfo.hh b/Fileinfo.hh
index 0ece9f1..2d8e3d9 100644
--- a/Fileinfo.hh
+++ b/Fileinfo.hh
@@ -139,11 +139,14 @@ public:
    * @param lasttype
    * @param buffer will be used as a scratch buffer - provided from the outside
    * to avoid having to reallocate it for each file
+   * @param partialchecksum when filltype is a CREATE_XXX_CHECKSUM type then
+   * only hash the first and last N MiB of the file skipping contents in between
    * @return zero on success
    */
   int fillwithbytes(enum readtobuffermode filltype,
                     enum readtobuffermode lasttype,
-                    std::vector<char>& buffer);
+                    std::vector<char>& buffer,
+                    int partialchecksum);
 
   /// get a pointer to the bytes read from the file
   const char* getbyteptr() const { return m_somebytes.data(); }
diff --git a/Rdutil.cc b/Rdutil.cc
index 7b9ddef..d3284a8 100644
--- a/Rdutil.cc
+++ b/Rdutil.cc
@@ -543,7 +543,8 @@ int
 Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
                       enum Fileinfo::readtobuffermode lasttype,
                       const long nsecsleep,
-                      const std::size_t buffersize)
+                      const std::size_t buffersize,
+                      int partialchecksum)
 {
   // first sort on inode (to read efficiently from the hard drive)
   sortOnDeviceAndInode();
@@ -553,7 +554,7 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
   std::vector<char> buffer(buffersize, '\0');
 
   for (auto& elem : m_list) {
-    elem.fillwithbytes(type, lasttype, buffer);
+    elem.fillwithbytes(type, lasttype, buffer, partialchecksum);
     if (nsecsleep > 0) {
       std::this_thread::sleep_for(duration);
     }
diff --git a/Rdutil.hh b/Rdutil.hh
index 6f5899a..3d68de8 100644
--- a/Rdutil.hh
+++ b/Rdutil.hh
@@ -90,7 +90,8 @@ public:
   int fillwithbytes(enum Fileinfo::readtobuffermode type,
                     enum Fileinfo::readtobuffermode lasttype,
                     long nsecsleep,
-                    std::size_t buffersize);
+                    std::size_t buffersize,
+                    int partialchecksum);
 
   /// make symlinks of duplicates.
   std::size_t makesymlinks(bool dryrun) const;
diff --git a/rdfind.cc b/rdfind.cc
index 71d7588..125b5aa 100644
--- a/rdfind.cc
+++ b/rdfind.cc
@@ -69,6 +69,9 @@ usage()
     << " -buffersize N\n"
     << indent << "chunksize in bytes when calculating the checksum.\n"
     << indent << "The default is 1 MiB, can be up to 128 MiB.\n"
+    << " -partialchecksum  (N=0)          number of MiB to checksum from the "
+       "start and end of a file instead of hashing the whole file.\n"
+    << indent << "Zero means hash the whole file. 1 would be 2MiB hashed\n"
     << " -deterministic    (true)| false  makes results independent of order\n"
     << "                                  from listing the filesystem\n"
     << " -makesymlinks      true |(false) replace duplicate files with "
@@ -109,6 +112,7 @@ struct Options
   bool followsymlinks = false;        // follow symlinks
   bool dryrun = false;                // only dryrun, don't destroy anything
   bool remove_identical_inode = true; // remove files with identical inodes
+  int partialchecksum = 0;   // compute checksum for only part of the file
   bool usemd5 = false;       // use md5 checksum to check for similarity
   bool usesha1 = false;      // use sha1 checksum to check for similarity
   bool usesha256 = false;    // use sha256 checksum to check for similarity
@@ -176,6 +180,12 @@ parseOptions(Parser& parser)
       o.remove_identical_inode = parser.get_parsed_bool();
     } else if (parser.try_parse_bool("-deterministic")) {
       o.deterministic = parser.get_parsed_bool();
+    } else if (parser.try_parse_string("-partialchecksum")) {
+      const int partialchecksumsize = std::stoi(parser.get_parsed_string());
+      if (partialchecksumsize < 0) {
+        throw std::runtime_error("negative value of partialchecksum not allowed");
+      }
+      o.partialchecksum = partialchecksumsize;
     } else if (parser.try_parse_string("-checksum")) {
       if (parser.parsed_string_is("md5")) {
         o.usemd5 = true;
@@ -414,7 +424,8 @@ main(int narg, const char* argv[])
               << it->second << ": " << std::flush;
 
     // read bytes (destroys the sorting, for disk reading efficiency)
-    gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);
+    gswd.fillwithbytes(
+      it[0].first, it[-1].first, o.nsecsleep, o.buffersize, o.partialchecksum);
 
     // remove non-duplicates
     std::cout << "removed " << gswd.removeUniqSizeAndBuffer()