From a433cb10cd4e633289a448649f9509e1b24d9c7e Mon Sep 17 00:00:00 2001
From: Zygo Blaxell <zblaxell@waya.furryterror.org>
Date: Fri, 8 Jan 2010 23:01:23 -0500
Subject: [PATCH] Add a threshold to skip-hash

Hashing usually performs well except in special cases where many
large files have the same size but very different contents near
the beginning of the file.  In these cases, it is usually faster
to execute the O(N^2) comparisons between all the files, thereby
avoiding reading most of their data.

In cases where there are many small files, the opposite setting of
the skip-hash option is usually better, so skip-hash is now a
threshold parameter with a default of 1MB.

Unfortunately it is not trivial to detect this condition without
doing sufficient work to negate the benefit; hence, we require
the operator to specify a preference.
---
 faster-dupemerge | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/faster-dupemerge b/faster-dupemerge
index 7e899dd..14360dc 100755
--- a/faster-dupemerge
+++ b/faster-dupemerge
@@ -87,6 +87,7 @@ my $collapse_timestamp = 0;
 my $collapse_zero = 0;
 my $skip_compares = 0;
 my $skip_hashes = 0;
+my $skip_hashes_threshold = 0;
 my $progress = 0;
 my $verbose = 0;
 my $debug = 0;
@@ -160,7 +161,10 @@ hard links).
 
         --skip-compare  skip byte-by-byte file comparisons
 
-        --skip-hash     skip calculation of hash function on files
+        --skip-hash[=N] skip calculation of hash function on files
+                        larger than N bytes (default 1M).
+                        Scalars KMGT specify KiB, MiB, GiB, and TiB.
+                        Scalars kmgt specify KB, MB, GB, and TB.
 
         --trust         old name for --skip-compare
                         (trust the hash function)
@@ -181,8 +185,22 @@ while ($#ARGV >= 0) {
 		$collapse_zero = 1;
 	} elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
 		$skip_compares = 1;
-	} elsif ($arg eq '--skip-hash') {
-		$skip_hashes = 1;
+	} elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) {
+		my ($quantity, $unit) = ($1, $2);
+		$unit ||= '_';
+		$quantity ||= 1048576;
+		my %scale = (
+			_ => 1,
+			k => 1000,
+			K => 1024,
+			m => 1000*1000,
+			M => 1024*1024,
+			g => 1000*1000*1000,
+			G => 1024*1024*1024,
+			t => 1000*1000*1000*1000,
+			T => 1024*1024*1024*1024,
+		);
+		$skip_hashes = $skip_hashes_threshold = $quantity * $scale{$unit};
 	} elsif ($arg eq '--progress') {
 		$progress = 1;
 	} elsif ($arg eq '--verbose') {
@@ -572,13 +590,15 @@ end_merge:
 }
 
 while (<FIND>) {
-	my ($weak_key, $dev, $ino, $name) = m/^(\d+ \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
+	my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
 	die "read error: $!\nLast input line was '$_'" unless defined($name);
 
 	my $inode = format_inode($dev, $ino);
 
 	print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
 
+	$skip_hashes = $size >= $skip_hashes_threshold;
+
 	$input_links++;
 	merge_files if $weak_key ne $current_key;
 	$current_key = $weak_key;
-- 
2.39.5