pauldreik · alexmi256 · Jun 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ aclocal*
 autom4te*
 config.h
 ./config.h.in
+config.h.in
 config.log
 config.status
 configure
@@ -17,9 +18,11 @@ missing
 rdfind
 stamp-h1
 *~
+.idea
 
 compile
 test-driver
 nettle32bit/
 *.log
 rdfind-*.tar.gz
+results.txt
diff --git a/Fileinfo.cc b/Fileinfo.cc
@@ -25,7 +25,8 @@
 int
 Fileinfo::fillwithbytes(enum readtobuffermode filltype,
                         enum readtobuffermode lasttype,
-                        std::vector<char>& buffer)
+                        std::vector<char>& buffer,
+                        int partialchecksum)
 {
 
   // Decide if we are going to read from file or not.
@@ -84,10 +85,53 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
   if (checksumtype != Checksum::checksumtypes::NOTSET) {
     Checksum chk(checksumtype);
 
-    while (f1) {
-      f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
-      // gcount is never negative, the cast is safe.
-      chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
+    // Number of bytes to hash at each end of the file. Widen before
+    // multiplying: with a 32-bit int the product overflows once
+    // partialchecksum reaches 2048.
+    const std::size_t max_bytes =
+      partialchecksum > 0
+        ? static_cast<std::size_t>(partialchecksum) * 1024 * 1024
+        : 0;
+    const std::size_t file_size = size();
+
+    // Feeds at most limit bytes, starting at the current read position of
+    // f1, into chk. Each read is capped at the bytes still wanted, so the
+    // amount hashed does not depend on the size of the scratch buffer.
+    const auto hash_up_to = [&](const std::size_t limit) {
+      std::size_t remaining = limit;
+      while (remaining > 0 && f1) {
+        const std::size_t chunk =
+          remaining < buffer.size() ? remaining : buffer.size();
+        f1.read(buffer.data(), static_cast<std::streamsize>(chunk));
+        // gcount is never negative, the cast is safe.
+        const std::size_t count = static_cast<std::size_t>(f1.gcount());
+        if (count == 0) {
+          break; // no progress (EOF or empty buffer), do not spin
+        }
+        chk.update(count, buffer.data());
+        remaining -= count;
+      }
+    };
+
+    // Partial mode needs file_size > 2 * max_bytes so the two regions do
+    // not overlap; the test is phrased so that it cannot overflow.
+    if (max_bytes > 0 && file_size > max_bytes &&
+        file_size - max_bytes > max_bytes) {
+      // Hash the first max_bytes bytes
+      hash_up_to(max_bytes);
+
+      // Seek to the last max_bytes bytes and hash them
+      f1.clear(); // Clear any EOF/fail flags so that seekg is not ignored
+      f1.seekg(static_cast<std::streamoff>(file_size - max_bytes),
+               std::ios::beg);
+      hash_up_to(max_bytes);
+    } else {
+      // Original behavior: read entire file
+      while (f1) {
+        f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        // gcount is never negative, the cast is safe.
+        chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
+      }
     }
 
     // store the result of the checksum calculation in somebytes

diff --git a/Fileinfo.hh b/Fileinfo.hh
@@ -139,11 +139,14 @@ public:
    * @param lasttype
    * @param buffer will be used as a scratch buffer - provided from the outside
    * to avoid having to reallocate it for each file
+   * @param partialchecksum when filltype is a CREATE_XXX_CHECKSUM type, hash
+   * only the first and last N MiB of the file and skip the contents in between
    * @return zero on success
    */
   int fillwithbytes(enum readtobuffermode filltype,
                     enum readtobuffermode lasttype,
-                    std::vector<char>& buffer);
+                    std::vector<char>& buffer,
+                    int partialchecksum);
 
   /// get a pointer to the bytes read from the file
   const char* getbyteptr() const { return m_somebytes.data(); }

diff --git a/Rdutil.cc b/Rdutil.cc
@@ -543,7 +543,8 @@ int
 Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
                       enum Fileinfo::readtobuffermode lasttype,
                       const long nsecsleep,
-                      const std::size_t buffersize)
+                      const std::size_t buffersize,
+                      int partialchecksum)
 {
   // first sort on inode (to read efficiently from the hard drive)
   sortOnDeviceAndInode();
@@ -553,7 +554,7 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
   std::vector<char> buffer(buffersize, '\0');
 
   for (auto& elem : m_list) {
-    elem.fillwithbytes(type, lasttype, buffer);
+    elem.fillwithbytes(type, lasttype, buffer, partialchecksum);
     if (nsecsleep > 0) {
       std::this_thread::sleep_for(duration);
     }

diff --git a/Rdutil.hh b/Rdutil.hh
@@ -90,7 +90,8 @@ public:
   int fillwithbytes(enum Fileinfo::readtobuffermode type,
                     enum Fileinfo::readtobuffermode lasttype,
                     long nsecsleep,
-                    std::size_t buffersize);
+                    std::size_t buffersize,
+                    int partialchecksum);
 
   /// make symlinks of duplicates.
   std::size_t makesymlinks(bool dryrun) const;

diff --git a/rdfind.cc b/rdfind.cc
@@ -69,6 +69,9 @@ usage()
     << " -buffersize N\n"
     << indent << "chunksize in bytes when calculating the checksum.\n"
     << indent << "The default is 1 MiB, can be up to 128 MiB.\n"
+    << " -partialchecksum N\n"
+    << indent << "only checksum the first and last N MiB of files larger\n"
+    << indent << "than 2*N MiB. The default is 0, checksum the whole file.\n"
     << " -deterministic    (true)| false  makes results independent of order\n"
     << "                                  from listing the filesystem\n"
     << " -makesymlinks      true |(false) replace duplicate files with "
@@ -109,6 +112,7 @@ struct Options
   bool followsymlinks = false;        // follow symlinks
   bool dryrun = false;                // only dryrun, don't destroy anything
   bool remove_identical_inode = true; // remove files with identical inodes
+  int partialchecksum = 0;   // compute checksum for only part of the file
   bool usemd5 = false;       // use md5 checksum to check for similarity
   bool usesha1 = false;      // use sha1 checksum to check for similarity
   bool usesha256 = false;    // use sha256 checksum to check for similarity
@@ -176,6 +180,12 @@ parseOptions(Parser& parser)
       o.remove_identical_inode = parser.get_parsed_bool();
     } else if (parser.try_parse_bool("-deterministic")) {
       o.deterministic = parser.get_parsed_bool();
+    } else if (parser.try_parse_string("-partialchecksum")) {
+      const int partialchecksumsize = std::stoi(parser.get_parsed_string());
+      if (partialchecksumsize < 0) {
+        throw std::runtime_error("negative value of partialchecksum not allowed");
+      }
+      o.partialchecksum = partialchecksumsize;
     } else if (parser.try_parse_string("-checksum")) {
       if (parser.parsed_string_is("md5")) {
         o.usemd5 = true;
@@ -414,7 +424,8 @@ main(int narg, const char* argv[])
               << it->second << ": " << std::flush;
 
     // read bytes (destroys the sorting, for disk reading efficiency)
-    gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);
+    gswd.fillwithbytes(
+      it[0].first, it[-1].first, o.nsecsleep, o.buffersize, o.partialchecksum);
 
     // remove non-duplicates
     std::cout << "removed " << gswd.removeUniqSizeAndBuffer()