Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ aclocal*
autom4te*
config.h
./config.h.in
config.h.in
config.log
config.status
configure
Expand All @@ -17,9 +18,11 @@ missing
rdfind
stamp-h1
*~
.idea

compile
test-driver
nettle32bit/
*.log
rdfind-*.tar.gz
results.txt
40 changes: 35 additions & 5 deletions Fileinfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
int
Fileinfo::fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype,
std::vector<char>& buffer)
std::vector<char>& buffer,
int partialchecksum)
{

// Decide if we are going to read from file or not.
Expand Down Expand Up @@ -84,10 +85,39 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
if (checksumtype != Checksum::checksumtypes::NOTSET) {
Checksum chk(checksumtype);

while (f1) {
f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
// gcount is never negative, the cast is safe.
chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
const std::size_t max_bytes = 1024 * 1024 * partialchecksum;
const std::size_t file_size = size();

if (partialchecksum > 0 && file_size > 2 * max_bytes) {
// Read first amount of MiB
std::size_t bytes_read = 0;
while (bytes_read < max_bytes && f1) {
f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
std::streamsize count = f1.gcount();
chk.update(static_cast<std::size_t>(count), buffer.data());
bytes_read += count;
}

// Seek to last amount of MiB
f1.clear(); // Clear any EOF flags
f1.seekg(static_cast<std::streamoff>(file_size - max_bytes),
std::ios::beg);

// Read last amount of MiB
bytes_read = 0;
while (bytes_read < max_bytes && f1) {
f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
std::streamsize count = f1.gcount();
chk.update(static_cast<std::size_t>(count), buffer.data());
bytes_read += count;
}
} else {
// Original behavior: read entire file
while (f1) {
f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
// gcount is never negative, the cast is safe.
chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
}
}

// store the result of the checksum calculation in somebytes
Expand Down
5 changes: 4 additions & 1 deletion Fileinfo.hh
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,14 @@ public:
* @param lasttype
* @param buffer will be used as a scratch buffer - provided from the outside
* to avoid having to reallocate it for each file
* @param partialchecksum when filltype is a CREATE_XXX_CHECKSUM type and
* this value N is greater than zero, only the first and last N MiB of files
* larger than 2*N MiB are hashed, skipping the contents in between; zero
* hashes the whole file
* @return zero on success
*/
int fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype,
std::vector<char>& buffer);
std::vector<char>& buffer,
int partialchecksum);

/// get a pointer to the bytes read from the file
const char* getbyteptr() const { return m_somebytes.data(); }
Expand Down
5 changes: 3 additions & 2 deletions Rdutil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,8 @@ int
Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype,
const long nsecsleep,
const std::size_t buffersize)
const std::size_t buffersize,
int partialchecksum)
{
// first sort on inode (to read efficiently from the hard drive)
sortOnDeviceAndInode();
Expand All @@ -553,7 +554,7 @@ Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
std::vector<char> buffer(buffersize, '\0');

for (auto& elem : m_list) {
elem.fillwithbytes(type, lasttype, buffer);
elem.fillwithbytes(type, lasttype, buffer, partialchecksum);
if (nsecsleep > 0) {
std::this_thread::sleep_for(duration);
}
Expand Down
3 changes: 2 additions & 1 deletion Rdutil.hh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ public:
int fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype,
long nsecsleep,
std::size_t buffersize);
std::size_t buffersize,
int partialchecksum);

/// make symlinks of duplicates.
std::size_t makesymlinks(bool dryrun) const;
Expand Down
13 changes: 12 additions & 1 deletion rdfind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ usage()
<< " -buffersize N\n"
<< indent << "chunksize in bytes when calculating the checksum.\n"
<< indent << "The default is 1 MiB, can be up to 128 MiB.\n"
<< " -partialchecksum (N=0) number of MiB to checksum from the "
"start and end of a file instead of hashing the whole file.\n"
<< indent << "Zero means hash the whole file. 1 would be 2MiB hashed\n"
<< " -deterministic (true)| false makes results independent of order\n"
<< " from listing the filesystem\n"
<< " -makesymlinks true |(false) replace duplicate files with "
Expand Down Expand Up @@ -109,6 +112,7 @@ struct Options
bool followsymlinks = false; // follow symlinks
bool dryrun = false; // only dryrun, don't destroy anything
bool remove_identical_inode = true; // remove files with identical inodes
int partialchecksum = 0; // compute checksum for only part of the file
bool usemd5 = false; // use md5 checksum to check for similarity
bool usesha1 = false; // use sha1 checksum to check for similarity
bool usesha256 = false; // use sha256 checksum to check for similarity
Expand Down Expand Up @@ -176,6 +180,12 @@ parseOptions(Parser& parser)
o.remove_identical_inode = parser.get_parsed_bool();
} else if (parser.try_parse_bool("-deterministic")) {
o.deterministic = parser.get_parsed_bool();
} else if (parser.try_parse_string("-partialchecksum")) {
const int partialchecksumsize = std::stoi(parser.get_parsed_string());
if (partialchecksumsize < 0) {
throw std::runtime_error("negative value of partialchecksum not allowed");
}
o.partialchecksum = partialchecksumsize;
} else if (parser.try_parse_string("-checksum")) {
if (parser.parsed_string_is("md5")) {
o.usemd5 = true;
Expand Down Expand Up @@ -414,7 +424,8 @@ main(int narg, const char* argv[])
<< it->second << ": " << std::flush;

// read bytes (destroys the sorting, for disk reading efficiency)
gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);
gswd.fillwithbytes(
it[0].first, it[-1].first, o.nsecsleep, o.buffersize, o.partialchecksum);

// remove non-duplicates
std::cout << "removed " << gswd.removeUniqSizeAndBuffer()
Expand Down