Replace --trust with --skip-compare and add --skip-hash and copyright statement
authorZygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
committerZygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 01:04:54 +0000 (20:04 -0500)
git-svn-id: svn+ssh://svn.furryterror.org/r/trunk/mokona/zblaxell@3225 a5e33b96-951a-0410-ae88-c0fe16d076bb

Conflicts:

faster-dupemerge

faster-dupemerge

index 6508fc0da801e8d66cb8476cbfb0bd5ce25ce373..a007540b3ace88bd5ba7009b3b4c7d6e66d163c3 100755 (executable)
@@ -1,14 +1,22 @@
 #!/usr/bin/perl -w
+# $Id$
+
+# Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
+# Use, modification, and distribution permitted
+# under the terms of the GNU GPL.
+
 use strict;
 use Fcntl qw(:DEFAULT :flock);
 use File::Compare;
 use File::Temp;
 
-my $links_input = 0;
-my $files_input = 0;
-my $bogons_input = 0;
-my $files_hashed = 0;
+my $input_links = 0;
+my $input_files = 0;
+my $input_bogons = 0;
+my $hash_bytes = 0;
+my $hash_files = 0;
 my $hash_errors = 0;
+my $compare_bytes = 0;
 my $compare_count = 0;
 my $compare_errors = 0;
 my $compare_differences = 0;
@@ -17,10 +25,10 @@ my $merges_attempted = 0;
 my $hard_links = 0;
 my $link_errors = 0;
 my $link_retries = 0;
-my $bytes_recovered = 0;
-my $files_recovered = 0;
-my $files_lost = 0;
-my $bytes_lost = 0;
+my $recovered_bytes = 0;
+my $recovered_files = 0;
+my $lost_files = 0;
+my $lost_bytes = 0;
 my $surprises = 0;
 
 eval '
@@ -31,7 +39,7 @@ if ($@) {
        warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
 
        eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                        my ($filename) = (@_);
                        my $fv = open(MD5SUM, "-|");    
                        die "fork: $!" unless defined($fv);
@@ -51,7 +59,7 @@ if ($@) {
 DIGEST
 } else {
        eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                        my ($filename) = (@_);
                        die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
                        my $ctx = Digest::SHA1->new;
@@ -67,7 +75,8 @@ DIGEST
 my $collapse_access = 0;
 my $collapse_timestamp = 0;
 my $collapse_zero = 0;
-my $trust_hashes = 0;
+my $skip_compares = 0;
+my $skip_hashes = 0;
 my $verbose = 0;
 my $debug = 0;
 my @extra_find_opts = ();
@@ -76,6 +85,17 @@ my $lock_file;
 my $lock_rm = 0;
 my $lock_obtained = 0;
 
+sub digest {
+       my ($filename) = (@_);
+       if ($skip_hashes) {
+               return "SKIPPING HASHES";
+       } else {
+               &really_digest($filename);
+               $hash_bytes += -s $filename;
+               $hash_files++;
+       }
+}
+
 my @directories;
 
 sub usage {
@@ -110,7 +130,11 @@ hard links).
 
         --timestamps    mtime may be different for identical files
 
-        --trust         skip byte-by-byte file comparisons
+       --skip-compare  skip byte-by-byte file comparisons
+
+       --skip-hash     skip calculation of hash function on files
+
+        --trust         old name for --skip-compare
                         (trust the hash function)
 
         --verbose       report files as they are considered
@@ -127,8 +151,10 @@ while ($#ARGV >= 0) {
                $collapse_timestamp = 1;
        } elsif ($arg eq '--zeros') {
                $collapse_zero = 1;
-       } elsif ($arg eq '--trust') {
-               $trust_hashes = 1;
+       } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
+               $skip_compares = 1;
+       } elsif ($arg eq '--skip-hash') {
+               $skip_hashes = 1;
        } elsif ($arg eq '--verbose') {
                $verbose = 1;
        } elsif ($arg eq '--lock-rm') {
@@ -161,6 +187,10 @@ while ($#ARGV >= 0) {
        }
 }
 
+if ($skip_hashes && $skip_compares) {
+       die "Cannot skip both hashes and compares.\n";
+}
+
 if (defined($lock_file)) {
        sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
        flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
@@ -240,7 +270,7 @@ sub merge_files {
        my %stop_loop;
 
        my @candidate_list = keys(%inode_to_file_name);
-       $files_input += @candidate_list;
+       $input_files += @candidate_list;
        if (@candidate_list < 2) {
                print STDERR "Merging...only one candidate to merge..." if $debug;
                $trivially_unique++;
@@ -262,13 +292,14 @@ hash_file:
                                $surprises++;
                                next;
                        }
-                       eval { $digest = digest($filename); };
+                       eval { 
+                               $digest = digest($filename); 
+                       };
                        if ($@) {
                                warn "Digest($filename)(#$candidate) failed: $@";
                                $hash_errors++;
                        } else {
                                $ok = 1;
-                               $files_hashed++;
                                last hash_file;
                        }
                }
@@ -289,7 +320,7 @@ link_start:
                                        print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                        print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
-       incumbent_file:
+incumbent_file:
 
                                        foreach my $incumbent_file (@incumbent_names) {
                                                my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
@@ -307,10 +338,10 @@ link_start:
                                                        next incumbent_file;
                                                }
 
-       candidate_file:
-
                                                my $at_least_one_link_done = 0;
 
+candidate_file:
+
                                                foreach my $candidate_file (@candidate_names) {
                                                        my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
                                                        print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
@@ -336,21 +367,26 @@ link_start:
                                                        my $identical;
 
                                                        eval {
-                                                               if ($trust_hashes) {
-                                                                       print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+                                                               if ($skip_compares) {
+                                                                       print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
                                                                        $identical = 1;
                                                                } else {
                                                                        my $quoted_incumbent_file = tick_quote($incumbent_file);
                                                                        my $quoted_candidate_file = tick_quote($candidate_file);
-                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
                                                                        if (compare($incumbent_file, $candidate_file)) {
                                                                                $compare_differences++;
                                                                                $identical = 0;
-                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+                                                                               # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+                                                                               # Some kind of I/O error is more likely, so this message cannot be turned off.
+                                                                               # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+                                                                               # so the warning in that case is quite silly.  Hmmm.
+                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
                                                                        } else {
                                                                                $identical = 1;
                                                                        }
                                                                        $compare_count++;
+                                                                       $compare_bytes += $incumbent_size;
                                                                }
                                                        };
                                                        if ($@) {
@@ -430,8 +466,8 @@ link_start:
 
                                                                                $hard_links++;
                                                                                if ($to_nlink == 1) {
-                                                                                       $files_recovered++;
-                                                                                       $bytes_recovered += $incumbent_size;
+                                                                                       $recovered_files++;
+                                                                                       $recovered_bytes += $incumbent_size;
                                                                                }
 
                                                                                # FIXME:  Now we're really confused for some reason.
@@ -444,8 +480,8 @@ link_start:
                                                                                # tried all possible ways to hardlink the file out of existence first;
                                                                                # however, that is complex and only benefits a silly statistic.
                                                                                if ($to_nlink == 1 || $from_nlink == 1) {
-                                                                                       $files_lost++;
-                                                                                       $bytes_lost += $incumbent_size;
+                                                                                       $lost_files++;
+                                                                                       $lost_bytes += $incumbent_size;
                                                                                }
                                                                        }
                                                                }
@@ -478,11 +514,11 @@ while (<FIND>) {
 
        unless (! (-l $name) && (-f _)) {
                warn "Bogon file " . tick_quote($name);
-               $bogons_input++;
+               $input_bogons++;
                next;
        }
 
-       $links_input++;
+       $input_links++;
        merge_files if $weak_key ne $current_key;
        $current_key = $weak_key;
 
@@ -493,25 +529,31 @@ while (<FIND>) {
 
 merge_files;
 
-print STDERR <<STATS;
-links_input             $links_input
-files_input ........... $files_input
-bogons_input            $bogons_input
-merges_attempted ...... $merges_attempted
-trivially_unique        $trivially_unique
-files_hashed .......... $files_hashed
-hash_errors             $hash_errors
-surprises ............. $surprises
+my $stats_blob = <<STATS;
+compare_bytes           $compare_bytes
 compare_count           $compare_count
-compare_differences ... $compare_differences
+compare_differences     $compare_differences
 compare_errors          $compare_errors
-hard_links ............ $hard_links
+hard_links              $hard_links
+hash_bytes              $hash_bytes
+hash_errors             $hash_errors
+hash_files              $hash_files
+input_bogons            $input_bogons
+input_files             $input_files
+input_links             $input_links
 link_errors             $link_errors
-link_retries .......... $link_retries
-bytes_recovered         $bytes_recovered
-files_recovered ....... $files_recovered
-bytes_lost              $bytes_lost
-files_lost ............ $files_lost
+link_retries            $link_retries
+lost_bytes              $lost_bytes
+lost_files              $lost_files
+merges_attempted        $merges_attempted
+recovered_bytes         $recovered_bytes
+recovered_files         $recovered_files
+surprises               $surprises
+trivially_unique        $trivially_unique
 STATS
 
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
+
+print STDERR $stats_blob;
+
 exit(0);