tick_quote: properly quote the string '\''
[dupemerge] / faster-dupemerge
index 6508fc0da801e8d66cb8476cbfb0bd5ce25ce373..0ae1032f2192b4fe4d7556f8b5be4ffd9ce671a8 100755 (executable)
@@ -1,14 +1,22 @@
 #!/usr/bin/perl -w
+# $Id$
+
+# Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
+# Use, modification, and distribution permitted
+# under the terms of the GNU GPL.
+
 use strict;
 use Fcntl qw(:DEFAULT :flock);
 use File::Compare;
 use File::Temp;
 
-my $links_input = 0;
-my $files_input = 0;
-my $bogons_input = 0;
-my $files_hashed = 0;
+my $input_links = 0;
+my $input_files = 0;
+my $input_bogons = 0;
+my $hash_bytes = 0;
+my $hash_files = 0;
 my $hash_errors = 0;
+my $compare_bytes = 0;
 my $compare_count = 0;
 my $compare_errors = 0;
 my $compare_differences = 0;
@@ -17,10 +25,10 @@ my $merges_attempted = 0;
 my $hard_links = 0;
 my $link_errors = 0;
 my $link_retries = 0;
-my $bytes_recovered = 0;
-my $files_recovered = 0;
-my $files_lost = 0;
-my $bytes_lost = 0;
+my $recovered_bytes = 0;
+my $recovered_files = 0;
+my $lost_files = 0;
+my $lost_bytes = 0;
 my $surprises = 0;
 
 eval '
@@ -31,7 +39,7 @@ if ($@) {
        warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
 
        eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                        my ($filename) = (@_);
                        my $fv = open(MD5SUM, "-|");    
                        die "fork: $!" unless defined($fv);
@@ -51,7 +59,7 @@ if ($@) {
 DIGEST
 } else {
        eval <<'DIGEST';
-               sub digest {
+               sub really_digest {
                        my ($filename) = (@_);
                        die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
                        my $ctx = Digest::SHA1->new;
@@ -67,15 +75,30 @@ DIGEST
 my $collapse_access = 0;
 my $collapse_timestamp = 0;
 my $collapse_zero = 0;
-my $trust_hashes = 0;
+my $skip_compares = 0;
+my $skip_hashes = 0;
 my $verbose = 0;
 my $debug = 0;
+my $dry_run = 0;
+my $humane = 0;
 my @extra_find_opts = ();
 my @extra_sort_opts = ();
 my $lock_file;
 my $lock_rm = 0;
 my $lock_obtained = 0;
 
+# Return the hash of $filename, or a fixed placeholder when --skip-hash is set.
+sub digest {
+       my ($filename) = (@_);
+       return "SKIPPING HASHES" if $skip_hashes;
+       # really_digest dies on failure, so the counters below only see hashed files.
+       my $digest = really_digest($filename);
+       # -s yields undef if the file vanished after hashing; count 0, don't warn.
+       $hash_bytes += (-s $filename) || 0;
+       $hash_files++;
+       return $digest;
+}
+
 my @directories;
 
 sub usage {
@@ -99,8 +122,12 @@ hard links).
         --debug         show all steps in duplication discovery process
                         (implies --verbose)
 
+       --dry-run       do not lock files or make changes to filesystem
+
         --find          pass next options (up to --) to find command
 
+       --humane        human-readable statistics (e.g. 1 048 576)
+
         --lock FILE     exit immediately (status 10) if unable to obtain a 
                         flock(LOCK_EX|LOCK_NB) on FILE
 
@@ -110,7 +137,11 @@ hard links).
 
         --timestamps    mtime may be different for identical files
 
-        --trust         skip byte-by-byte file comparisons
+       --skip-compare  skip byte-by-byte file comparisons
+
+       --skip-hash     skip calculation of hash function on files
+
+        --trust         old name for --skip-compare
                         (trust the hash function)
 
         --verbose       report files as they are considered
@@ -127,8 +158,10 @@ while ($#ARGV >= 0) {
                $collapse_timestamp = 1;
        } elsif ($arg eq '--zeros') {
                $collapse_zero = 1;
-       } elsif ($arg eq '--trust') {
-               $trust_hashes = 1;
+       } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
+               $skip_compares = 1;
+       } elsif ($arg eq '--skip-hash') {
+               $skip_hashes = 1;
        } elsif ($arg eq '--verbose') {
                $verbose = 1;
        } elsif ($arg eq '--lock-rm') {
@@ -141,6 +174,10 @@ while ($#ARGV >= 0) {
                }
        } elsif ($arg eq '--debug') {
                $debug = $verbose = 1;
+       } elsif ($arg eq '--dry-run') {
+               $dry_run = 1;
+       } elsif ($arg eq '--humane') {
+               $humane = 1;
        } elsif ($arg eq '--find') {
                while ($#ARGV >= 0) {
                        my $extra_arg = shift(@ARGV);
@@ -161,7 +198,13 @@ while ($#ARGV >= 0) {
        }
 }
 
-if (defined($lock_file)) {
+if ($skip_hashes && $skip_compares) {  # both skipped would mean file contents are never examined
+       die "Cannot skip both hashes and compares.\n";
+}
+
+@directories or usage;  # call usage when @directories is empty
+
+if (defined($lock_file) && !$dry_run) {
        sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
        flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
        print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
@@ -169,7 +212,7 @@ if (defined($lock_file)) {
 }
 
 END {
-       if ($lock_obtained) {
+       if ($lock_obtained && !$dry_run) {
                print STDERR "Removing '$lock_file'.\n" if $verbose;
                unlink($lock_file) or warn "unlink: $lock_file: $!";
        }
@@ -177,7 +220,7 @@ END {
 
 sub tick_quote {
        my ($text) = (@_);
-       $text =~ s/'/'\''/go;
+       $text =~ s/'/'\\''/go;  # ' -> '\'' (end quote, backslash-escaped quote, reopen quote)
        return "'$text'";
 }
 
@@ -213,14 +256,18 @@ my %inode_to_file_name = ();
 # Link files
 sub link_files {
        my ($from, $to) = (@_);
+
+       my $quoted_from = tick_quote($from);
+       my $quoted_to = tick_quote($to);
+       print STDERR "ln -f $quoted_from $quoted_to\n";
+
+       return if $dry_run;
+
        my $inode_dir = $to;
        my $inode_base = $to;
        $inode_dir =~ s:[^/]*$::o;
        $inode_base =~ s:^.*/::os;
        my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
-       my $quoted_from = tick_quote($from);
-       my $quoted_to = tick_quote($to);
-       print STDERR "ln -f $quoted_from $quoted_to\n";
        print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
        link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
        print STDERR "\trename: $tmp_to -> $to\n" if $debug;
@@ -240,7 +287,7 @@ sub merge_files {
        my %stop_loop;
 
        my @candidate_list = keys(%inode_to_file_name);
-       $files_input += @candidate_list;
+       $input_files += @candidate_list;
        if (@candidate_list < 2) {
                print STDERR "Merging...only one candidate to merge..." if $debug;
                $trivially_unique++;
@@ -262,13 +309,14 @@ hash_file:
                                $surprises++;
                                next;
                        }
-                       eval { $digest = digest($filename); };
+                       eval { 
+                               $digest = digest($filename); 
+                       };
                        if ($@) {
                                warn "Digest($filename)(#$candidate) failed: $@";
                                $hash_errors++;
                        } else {
                                $ok = 1;
-                               $files_hashed++;
                                last hash_file;
                        }
                }
@@ -289,7 +337,7 @@ link_start:
                                        print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                        print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
-       incumbent_file:
+incumbent_file:
 
                                        foreach my $incumbent_file (@incumbent_names) {
                                                my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
@@ -307,10 +355,10 @@ link_start:
                                                        next incumbent_file;
                                                }
 
-       candidate_file:
-
                                                my $at_least_one_link_done = 0;
 
+candidate_file:
+
                                                foreach my $candidate_file (@candidate_names) {
                                                        my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
                                                        print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
@@ -336,21 +384,26 @@ link_start:
                                                        my $identical;
 
                                                        eval {
-                                                               if ($trust_hashes) {
-                                                                       print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+                                                               if ($skip_compares) {
+                                                                       print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
                                                                        $identical = 1;
                                                                } else {
                                                                        my $quoted_incumbent_file = tick_quote($incumbent_file);
                                                                        my $quoted_candidate_file = tick_quote($candidate_file);
-                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
                                                                        if (compare($incumbent_file, $candidate_file)) {
                                                                                $compare_differences++;
                                                                                $identical = 0;
-                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+                                                                               # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+                                                                               # Some kind of I/O error is more likely, so this message cannot be turned off.
+                                                                               # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+                                                                               # so the warning in that case is quite silly.  Hmmm.
+                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
                                                                        } else {
                                                                                $identical = 1;
                                                                        }
                                                                        $compare_count++;
+                                                                       $compare_bytes += $incumbent_size;
                                                                }
                                                        };
                                                        if ($@) {
@@ -424,14 +477,18 @@ link_start:
                                                                        # My random number generator chooses the incumbent's size.
 
                                                                        if ($link_done) {
+                                                                               # Since we're in a dry run, the filesystem doesn't change.
+                                                                               # Our notion of what the filesystem should look like should not change either.
                                                                                delete $inode_to_file_name{$to_inode}->{$to_file};
-                                                                               $inode_to_file_name{$from_inode}->{$to_file} = undef;
-                                                                               $hash_to_inode{$digest} = $from_inode;
+                                                                               unless ($dry_run) {
+                                                                                       $inode_to_file_name{$from_inode}->{$to_file} = undef;
+                                                                                       $hash_to_inode{$digest} = $from_inode;
+                                                                               }
 
                                                                                $hard_links++;
                                                                                if ($to_nlink == 1) {
-                                                                                       $files_recovered++;
-                                                                                       $bytes_recovered += $incumbent_size;
+                                                                                       $recovered_files++;
+                                                                                       $recovered_bytes += $incumbent_size;
                                                                                }
 
                                                                                # FIXME:  Now we're really confused for some reason.
@@ -444,8 +501,8 @@ link_start:
                                                                                # tried all possible ways to hardlink the file out of existence first;
                                                                                # however, that is complex and only benefits a silly statistic.
                                                                                if ($to_nlink == 1 || $from_nlink == 1) {
-                                                                                       $files_lost++;
-                                                                                       $bytes_lost += $incumbent_size;
+                                                                                       $lost_files++;
+                                                                                       $lost_bytes += $incumbent_size;
                                                                                }
                                                                        }
                                                                }
@@ -478,11 +535,11 @@ while (<FIND>) {
 
        unless (! (-l $name) && (-f _)) {
                warn "Bogon file " . tick_quote($name);
-               $bogons_input++;
+               $input_bogons++;
                next;
        }
 
-       $links_input++;
+       $input_links++;
        merge_files if $weak_key ne $current_key;
        $current_key = $weak_key;
 
@@ -493,25 +550,53 @@ while (<FIND>) {
 
 merge_files;
 
-print STDERR <<STATS;
-links_input             $links_input
-files_input ........... $files_input
-bogons_input            $bogons_input
-merges_attempted ...... $merges_attempted
-trivially_unique        $trivially_unique
-files_hashed .......... $files_hashed
-hash_errors             $hash_errors
-surprises ............. $surprises
+my $stats_blob = <<STATS;
+compare_bytes           $compare_bytes
 compare_count           $compare_count
-compare_differences ... $compare_differences
+compare_differences     $compare_differences
 compare_errors          $compare_errors
-hard_links ............ $hard_links
+hard_links              $hard_links
+hash_bytes              $hash_bytes
+hash_errors             $hash_errors
+hash_files              $hash_files
+input_bogons            $input_bogons
+input_files             $input_files
+input_links             $input_links
 link_errors             $link_errors
-link_retries .......... $link_retries
-bytes_recovered         $bytes_recovered
-files_recovered ....... $files_recovered
-bytes_lost              $bytes_lost
-files_lost ............ $files_lost
+link_retries            $link_retries
+lost_bytes              $lost_bytes
+lost_files              $lost_files
+merges_attempted        $merges_attempted
+recovered_bytes         $recovered_bytes
+recovered_files         $recovered_files
+surprises               $surprises
+trivially_unique        $trivially_unique
 STATS
 
+if ($humane) {
+       # --humane: print numbers as "1 048 576", right-aligned to the widest one.
+       my $max_num_len = 0;
+       # Raise $max_num_len to the width $num will have once separators are added.
+       sub measure_numbers {
+               my ($num) = @_;
+               my $len = length($num) + int( (length($num) - 1) / 3);
+               $max_num_len = $len if $len > $max_num_len;
+       }
+
+       (my $dummy = $stats_blob) =~ s/(\d+)/measure_numbers($1)/geos;  # measuring pass on a throwaway copy
+       # Return $num with its digits grouped in threes, left-padded to $max_num_len.
+       sub space_numbers {
+               my ($num) = @_;
+               # $3 wraps the whole run of finished " ddd" groups; a bare ( \d\d\d)* keeps only the last one and lost digits of 10+ digit numbers.
+               1 while $num =~ s/(\d)(\d\d\d)((?: \d\d\d)*)$/$1 $2$3/os;
+               return ' ' x ($max_num_len - length($num)) . $num;
+       }
+
+       $stats_blob =~ s/(\d+)/space_numbers($1)/geos;
+}
+# On every second line, fill the gap between label and value with dots, keeping one space at each end.
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
+
+print STDERR $stats_blob;
+
 exit(0);