Replace --trust with --skip-compare and add --skip-hash and copyright statement

author Zygo Blaxell <zblaxell@waya.furryterror.org>

Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)

committer Zygo Blaxell <zblaxell@waya.furryterror.org>

Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
author Zygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
committer Zygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
diff --git a/faster-dupemerge b/faster-dupemerge

index 6508fc0da801e8d66cb8476cbfb0bd5ce25ce373..a007540b3ace88bd5ba7009b3b4c7d6e66d163c3 100755 (executable)
--- a/faster-dupemerge
+++ b/faster-dupemerge
@@ -1,14 +1,22 @@
  #!/usr/bin/perl -w
  #!/usr/bin/perl -w
+# $Id$
+
+# Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
+# Use, modification, and distribution permitted
+# under the terms of the GNU GPL.
+
  use strict;
  use Fcntl qw(:DEFAULT :flock);
  use File::Compare;
  use File::Temp;
  
  use strict;
  use Fcntl qw(:DEFAULT :flock);
  use File::Compare;
  use File::Temp;
  
-my $links_input = 0;
-my $files_input = 0;
-my $bogons_input = 0;
-my $files_hashed = 0;
+my $input_links = 0;
+my $input_files = 0;
+my $input_bogons = 0;
+my $hash_bytes = 0;
+my $hash_files = 0;
  my $hash_errors = 0;
  my $hash_errors = 0;
+my $compare_bytes = 0;
  my $compare_count = 0;
  my $compare_errors = 0;
  my $compare_differences = 0;
  my $compare_count = 0;
  my $compare_errors = 0;
  my $compare_differences = 0;
@@ -17,10 +25,10 @@ my $merges_attempted = 0;
  my $hard_links = 0;
  my $link_errors = 0;
  my $link_retries = 0;
  my $hard_links = 0;
  my $link_errors = 0;
  my $link_retries = 0;
-my $bytes_recovered = 0;
-my $files_recovered = 0;
-my $files_lost = 0;
-my $bytes_lost = 0;
+my $recovered_bytes = 0;
+my $recovered_files = 0;
+my $lost_files = 0;
+my $lost_bytes = 0;
  my $surprises = 0;
  
  eval '
  my $surprises = 0;
  
  eval '
@@ -31,7 +39,7 @@ if ($@) {
         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
  
         eval <<'DIGEST';
         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
  
         eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                         my ($filename) = (@_);
                         my $fv = open(MD5SUM, "-|");    
                         die "fork: $!" unless defined($fv);
                         my ($filename) = (@_);
                         my $fv = open(MD5SUM, "-|");    
                         die "fork: $!" unless defined($fv);
@@ -51,7 +59,7 @@ if ($@) {
  DIGEST
  } else {
         eval <<'DIGEST';
  DIGEST
  } else {
         eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                         my ($filename) = (@_);
                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
                         my $ctx = Digest::SHA1->new;
                         my ($filename) = (@_);
                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
                         my $ctx = Digest::SHA1->new;
@@ -67,7 +75,8 @@ DIGEST
  my $collapse_access = 0;
  my $collapse_timestamp = 0;
  my $collapse_zero = 0;
  my $collapse_access = 0;
  my $collapse_timestamp = 0;
  my $collapse_zero = 0;
-my $trust_hashes = 0;
+my $skip_compares = 0;
+my $skip_hashes = 0;
  my $verbose = 0;
  my $debug = 0;
  my @extra_find_opts = ();
  my $verbose = 0;
  my $debug = 0;
  my @extra_find_opts = ();
@@ -76,6 +85,17 @@ my $lock_file;
  my $lock_rm = 0;
  my $lock_obtained = 0;
  
  my $lock_rm = 0;
  my $lock_obtained = 0;
  
+sub digest {
+       my ($filename) = (@_);
+       if ($skip_hashes) {
+               return "SKIPPING HASHES";
+       } else {
+               &really_digest($filename);
+               $hash_bytes += -s $filename;
+               $hash_files++;
+       }
+}
+
  my @directories;
  
  sub usage {
  my @directories;
  
  sub usage {
@@ -110,7 +130,11 @@ hard links).
  
          --timestamps    mtime may be different for identical files
  
  
          --timestamps    mtime may be different for identical files
  
-        --trust         skip byte-by-byte file comparisons
+       --skip-compare  skip byte-by-byte file comparisons
+
+       --skip-hash     skip calculation of hash function on files
+
+        --trust         old name for --skip-compare
                          (trust the hash function)
  
          --verbose       report files as they are considered
                          (trust the hash function)
  
          --verbose       report files as they are considered
@@ -127,8 +151,10 @@ while ($#ARGV >= 0) {
                 $collapse_timestamp = 1;
         } elsif ($arg eq '--zeros') {
                 $collapse_zero = 1;
                 $collapse_timestamp = 1;
         } elsif ($arg eq '--zeros') {
                 $collapse_zero = 1;
-       } elsif ($arg eq '--trust') {
-               $trust_hashes = 1;
+       } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
+               $skip_compares = 1;
+       } elsif ($arg eq '--skip-hash') {
+               $skip_hashes = 1;
         } elsif ($arg eq '--verbose') {
                 $verbose = 1;
         } elsif ($arg eq '--lock-rm') {
         } elsif ($arg eq '--verbose') {
                 $verbose = 1;
         } elsif ($arg eq '--lock-rm') {
@@ -161,6 +187,10 @@ while ($#ARGV >= 0) {
         }
  }
  
         }
  }
  
+if ($skip_hashes && $skip_compares) {
+       die "Cannot skip both hashes and compares.\n";
+}
+
  if (defined($lock_file)) {
         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
  if (defined($lock_file)) {
         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
@@ -240,7 +270,7 @@ sub merge_files {
         my %stop_loop;
  
         my @candidate_list = keys(%inode_to_file_name);
         my %stop_loop;
  
         my @candidate_list = keys(%inode_to_file_name);
-       $files_input += @candidate_list;
+       $input_files += @candidate_list;
         if (@candidate_list < 2) {
                 print STDERR "Merging...only one candidate to merge..." if $debug;
                 $trivially_unique++;
         if (@candidate_list < 2) {
                 print STDERR "Merging...only one candidate to merge..." if $debug;
                 $trivially_unique++;
@@ -262,13 +292,14 @@ hash_file:
                                 $surprises++;
                                 next;
                         }
                                 $surprises++;
                                 next;
                         }
-                       eval { $digest = digest($filename); };
+                       eval { 
+                               $digest = digest($filename); 
+                       };
                         if ($@) {
                                 warn "Digest($filename)(#$candidate) failed: $@";
                                 $hash_errors++;
                         } else {
                                 $ok = 1;
                         if ($@) {
                                 warn "Digest($filename)(#$candidate) failed: $@";
                                 $hash_errors++;
                         } else {
                                 $ok = 1;
-                               $files_hashed++;
                                 last hash_file;
                         }
                 }
                                 last hash_file;
                         }
                 }
@@ -289,7 +320,7 @@ link_start:
                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
  
                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
  
-       incumbent_file:
+incumbent_file:
  
                                         foreach my $incumbent_file (@incumbent_names) {
                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
  
                                         foreach my $incumbent_file (@incumbent_names) {
                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
@@ -307,10 +338,10 @@ link_start:
                                                         next incumbent_file;
                                                 }
  
                                                         next incumbent_file;
                                                 }
  
-       candidate_file:
-
                                                 my $at_least_one_link_done = 0;
  
                                                 my $at_least_one_link_done = 0;
  
+candidate_file:
+
                                                 foreach my $candidate_file (@candidate_names) {
                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
                                                 foreach my $candidate_file (@candidate_names) {
                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
@@ -336,21 +367,26 @@ link_start:
                                                         my $identical;
  
                                                         eval {
                                                         my $identical;
  
                                                         eval {
-                                                               if ($trust_hashes) {
-                                                                       print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+                                                               if ($skip_compares) {
+                                                                       print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
                                                                         $identical = 1;
                                                                 } else {
                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
                                                                         $identical = 1;
                                                                 } else {
                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
-                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
                                                                         if (compare($incumbent_file, $candidate_file)) {
                                                                                 $compare_differences++;
                                                                                 $identical = 0;
                                                                         if (compare($incumbent_file, $candidate_file)) {
                                                                                 $compare_differences++;
                                                                                 $identical = 0;
-                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+                                                                               # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+                                                                               # Some kind of I/O error is more likely, so this message cannot be turned off.
+                                                                               # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+                                                                               # so the warning in that case is quite silly.  Hmmm.
+                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
                                                                         } else {
                                                                                 $identical = 1;
                                                                         }
                                                                         $compare_count++;
                                                                         } else {
                                                                                 $identical = 1;
                                                                         }
                                                                         $compare_count++;
+                                                                       $compare_bytes += $incumbent_size;
                                                                 }
                                                         };
                                                         if ($@) {
                                                                 }
                                                         };
                                                         if ($@) {
@@ -430,8 +466,8 @@ link_start:
  
                                                                                 $hard_links++;
                                                                                 if ($to_nlink == 1) {
  
                                                                                 $hard_links++;
                                                                                 if ($to_nlink == 1) {
-                                                                                       $files_recovered++;
-                                                                                       $bytes_recovered += $incumbent_size;
+                                                                                       $recovered_files++;
+                                                                                       $recovered_bytes += $incumbent_size;
                                                                                 }
  
                                                                                 # FIXME:  Now we're really confused for some reason.
                                                                                 }
  
                                                                                 # FIXME:  Now we're really confused for some reason.
@@ -444,8 +480,8 @@ link_start:
                                                                                 # tried all possible ways to hardlink the file out of existence first;
                                                                                 # however, that is complex and only benefits a silly statistic.
                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
                                                                                 # tried all possible ways to hardlink the file out of existence first;
                                                                                 # however, that is complex and only benefits a silly statistic.
                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
-                                                                                       $files_lost++;
-                                                                                       $bytes_lost += $incumbent_size;
+                                                                                       $lost_files++;
+                                                                                       $lost_bytes += $incumbent_size;
                                                                                 }
                                                                         }
                                                                 }
                                                                                 }
                                                                         }
                                                                 }
@@ -478,11 +514,11 @@ while (<FIND>) {
  
         unless (! (-l $name) && (-f _)) {
                 warn "Bogon file " . tick_quote($name);
  
         unless (! (-l $name) && (-f _)) {
                 warn "Bogon file " . tick_quote($name);
-               $bogons_input++;
+               $input_bogons++;
                 next;
         }
  
                 next;
         }
  
-       $links_input++;
+       $input_links++;
         merge_files if $weak_key ne $current_key;
         $current_key = $weak_key;
  
         merge_files if $weak_key ne $current_key;
         $current_key = $weak_key;
  
@@ -493,25 +529,31 @@ while (<FIND>) {
  
  merge_files;
  
  
  merge_files;
  
-print STDERR <<STATS;
-links_input             $links_input
-files_input ........... $files_input
-bogons_input            $bogons_input
-merges_attempted ...... $merges_attempted
-trivially_unique        $trivially_unique
-files_hashed .......... $files_hashed
-hash_errors             $hash_errors
-surprises ............. $surprises
+my $stats_blob = <<STATS;
+compare_bytes           $compare_bytes
  compare_count           $compare_count
  compare_count           $compare_count
-compare_differences ... $compare_differences
+compare_differences     $compare_differences
  compare_errors          $compare_errors
  compare_errors          $compare_errors
-hard_links ............ $hard_links
+hard_links              $hard_links
+hash_bytes              $hash_bytes
+hash_errors             $hash_errors
+hash_files              $hash_files
+input_bogons            $input_bogons
+input_files             $input_files
+input_links             $input_links
  link_errors             $link_errors
  link_errors             $link_errors
-link_retries .......... $link_retries
-bytes_recovered         $bytes_recovered
-files_recovered ....... $files_recovered
-bytes_lost              $bytes_lost
-files_lost ............ $files_lost
+link_retries            $link_retries
+lost_bytes              $lost_bytes
+lost_files              $lost_files
+merges_attempted        $merges_attempted
+recovered_bytes         $recovered_bytes
+recovered_files         $recovered_files
+surprises               $surprises
+trivially_unique        $trivially_unique
  STATS
  
  STATS
  
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
+
+print STDERR $stats_blob;
+
  exit(0);
  exit(0);
author	Zygo Blaxell <zblaxell@waya.furryterror.org>
	Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
committer	Zygo Blaxell <zblaxell@waya.furryterror.org>
	Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)