Add a threshold to skip-hash

author Zygo Blaxell <zblaxell@waya.furryterror.org>

Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)

committer Zygo Blaxell <zblaxell@waya.furryterror.org>

Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)
author Zygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)
committer Zygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)
diff --git a/faster-dupemerge b/faster-dupemerge

index 7e899ddc8eb8ae4f87770d42a33efe4acca43b95..14360dc8c802d83c89132fe5780b7070fd0896f0 100755 (executable)
--- a/faster-dupemerge
+++ b/faster-dupemerge
@@ -87,6 +87,7 @@ my $collapse_timestamp = 0;
  my $collapse_zero = 0;
  my $skip_compares = 0;
  my $skip_hashes = 0;
+my $skip_hashes_threshold = 0;
  my $progress = 0;
  my $verbose = 0;
  my $debug = 0;
@@ -160,7 +161,10 @@ hard links).
  
          --skip-compare  skip byte-by-byte file comparisons
  
-        --skip-hash     skip calculation of hash function on files
+        --skip-hash[=N] skip calculation of hash function on files
+                        larger than N bytes (default 1M).
+                        Scalars KMGT specify KiB, MiB, GiB, and TiB.
+                        Scalars kmgt specify KB, MB, GB, and TB.
  
          --trust         old name for --skip-compare
                          (trust the hash function)
@@ -181,8 +185,22 @@ while ($#ARGV >= 0) {
                 $collapse_zero = 1;
         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
                 $skip_compares = 1;
-       } elsif ($arg eq '--skip-hash') {
-               $skip_hashes = 1;
+       } elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) {
+               my ($quantity, $unit) = ($1, $2);
+               $unit ||= '_';
+               $quantity ||= 1048576;
+               my %scale = (
+                       _ => 1,
+                       k => 1000,
+                       K => 1024,
+                       m => 1000*1000,
+                       M => 1024*1024,
+                       g => 1000*1000*1000,
+                       G => 1024*1024*1024,
+                       t => 1000*1000*1000*1000,
+                       T => 1024*1024*1024*1024,
+               );
+               $skip_hashes = $skip_hashes_threshold = $quantity * $scale{$unit};
         } elsif ($arg eq '--progress') {
                 $progress = 1;
         } elsif ($arg eq '--verbose') {
@@ -572,13 +590,15 @@ end_merge:
  }
  
  while (<FIND>) {
-       my ($weak_key, $dev, $ino, $name) = m/^(\d+ \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
+       my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
         die "read error: $!\nLast input line was '$_'" unless defined($name);
  
         my $inode = format_inode($dev, $ino);
  
         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
  
+       $skip_hashes = $size >= $skip_hashes_threshold;
+
         $input_links++;
         merge_files if $weak_key ne $current_key;
         $current_key = $weak_key;
author	Zygo Blaxell <zblaxell@waya.furryterror.org>
	Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)
committer	Zygo Blaxell <zblaxell@waya.furryterror.org>
	Sat, 9 Jan 2010 04:01:23 +0000 (23:01 -0500)