Digest::SHA: deprecate Digest::SHA1, bump version to 0.20120914 and copyright year...
[dupemerge] / faster-dupemerge
index 7e899ddc8eb8ae4f87770d42a33efe4acca43b95..62f4bc99c47b401bff653a252366eb9ef11e28e2 100755 (executable)
@@ -4,7 +4,7 @@ use Fcntl qw(:DEFAULT :flock);
 use File::Compare;
 use File::Temp;
 
-# Copyright (C) 2002-2010 Zygo Blaxell <faster-dupemerge@mailtoo.hungrycats.org>
+# Copyright (C) 2002-2012 Zygo Blaxell <faster-dupemerge@mailtoo.hungrycats.org>
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -42,11 +42,11 @@ my $lost_bytes = 0;
 my $surprises = 0;
 
 eval '
-       use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+       use Digest::SHA qw(sha1 sha1_hex sha1_base64);
 ';
 
 if ($@) {
-       warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+       warn "Digest::SHA: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA";
 
        eval <<'DIGEST';
                sub really_digest {
@@ -72,7 +72,7 @@ DIGEST
                sub really_digest {
                        my ($filename) = (@_);
                        die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
-                       my $ctx = Digest::SHA1->new;
+                       my $ctx = Digest::SHA->new;
                        sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
                        binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
                        $ctx->addfile(\*FILE);
@@ -86,7 +86,9 @@ my $collapse_access = 0;
 my $collapse_timestamp = 0;
 my $collapse_zero = 0;
 my $skip_compares = 0;
+my $skip_compare_preference = 0;
 my $skip_hashes = 0;
+my $skip_hashes_threshold = 0;
 my $progress = 0;
 my $verbose = 0;
 my $debug = 0;
@@ -153,14 +155,19 @@ hard links).
                         S, s - lstat(2) (see source for details)
                         U - unlink(2)
                         . - all inodes with similar attributes done
+                        (123456) - current file size in bytes
 
         --sort          pass next options (up to --) to sort command
 
         --timestamps    mtime may be different for identical files
 
-        --skip-compare  skip byte-by-byte file comparisons
+        --skip-compare  skip byte-by-byte file comparisons,
+                        compare only file hashes
 
-        --skip-hash     skip calculation of hash function on files
+        --skip-hash[=N] skip calculation of hash function on files
+                        of N bytes or larger (default 1M).
+                        Suffixes KMGT specify KiB, MiB, GiB, and TiB.
+                        Suffixes kmgt specify KB, MB, GB, and TB.
 
         --trust         old name for --skip-compare
                         (trust the hash function)
@@ -168,6 +175,12 @@ hard links).
         --verbose       report files as they are considered
 
         --zeros         hard-link zero-length files too
+
+--skip-compare and --skip-hash can be combined, in which case a file is
+either hashed (if it is below the --skip-hash size threshold) or compared
+(if it is at or above it), but never both.
+
+Version:  0.20120914
 USAGE
 }
 
@@ -180,9 +193,24 @@ while ($#ARGV >= 0) {
        } elsif ($arg eq '--zeros') {
                $collapse_zero = 1;
        } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
-               $skip_compares = 1;
-       } elsif ($arg eq '--skip-hash') {
-               $skip_hashes = 1;
+               $skip_compares = $skip_compare_preference = 1;
+       } elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) {
+               my ($quantity, $unit) = ($1, $2);
+               $unit ||= '_';
+               $quantity ||= 1048576;
+               my %scale = (
+                       _ => 1,
+                       k => 1000,
+                       K => 1024,
+                       m => 1000*1000,
+                       M => 1024*1024,
+                       g => 1000*1000*1000,
+                       G => 1024*1024*1024,
+                       t => 1000*1000*1000*1000,
+                       T => 1024*1024*1024*1024,
+               );
+               $skip_hashes = 0;
+               $skip_hashes_threshold = $quantity * $scale{$unit};
        } elsif ($arg eq '--progress') {
                $progress = 1;
        } elsif ($arg eq '--verbose') {
@@ -221,11 +249,7 @@ while ($#ARGV >= 0) {
        }
 }
 
-if ($skip_hashes && $skip_compares) {
-       die "Cannot skip both hashes and compares.\n";
-}
-
-@directories or usage;
+@directories or usage($0);
 
 if (defined($lock_file) && !$dry_run) {
        sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
@@ -339,6 +363,7 @@ hash_file:
                foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) {
                        print STDERR "\t\tDigesting file $filename\n" if $debug;
                        if ((-l $filename) || ! -f _) {
+                               print STDERR "\n" if $progress;
                                warn "Bogon file " . tick_quote($filename);
                                $input_bogons++;
                                delete $inode_to_file_name{$candidate}->{$filename};
@@ -387,7 +412,7 @@ incumbent_file:
                                                }
 
                                                if (format_inode($incumbent_dev, $incumbent_ino) ne $incumbent) {
-                                                       warn "$incumbent_file: expected inode $incumbent, found $incumbent_dev:$incumbent_ino";
+                                                       warn "$incumbent_file: expected inode $incumbent, found ".format_inode($incumbent_dev, $incumbent_ino);
                                                        $surprises++;
                                                        next incumbent_file;
                                                }
@@ -408,7 +433,7 @@ candidate_file:
                                                        }
 
                                                        if (format_inode($candidate_dev, $candidate_ino) ne $candidate) {
-                                                               warn "$candidate_file: expected inode $candidate, found $candidate_dev:$candidate_ino";
+                                                               warn "$candidate_file: expected inode $candidate, found ".format_inode($candidate_dev, $candidate_ino);
                                                                $surprises++;
                                                                next candidate_file;
                                                        }
@@ -524,7 +549,7 @@ candidate_file:
 
                                                                        if ($link_done) {
                                                                                delete $inode_to_file_name{$to_inode}->{$to_file};
-                                                                               $inode_to_file_name{$from_inode}->{$to_file} = undef;
+                                                                               $inode_to_file_name{$from_inode}->{$to_file} = undef unless ($dry_run);
                                                                                $hash_to_inode{$digest} = [ $from_inode ];
 
                                                                                $hard_links++;
@@ -571,14 +596,34 @@ end_merge:
        undef %inode_to_file_name;
 }
 
+my $last_time = 0;
+my $last_size = 0;
+
 while (<FIND>) {
-       my ($weak_key, $dev, $ino, $name) = m/^(\d+ \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
+       my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
        die "read error: $!\nLast input line was '$_'" unless defined($name);
 
        my $inode = format_inode($dev, $ino);
 
        print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
 
+       if ($skip_hashes_threshold && $size >= $skip_hashes_threshold) {
+               $skip_hashes = 1;
+               $skip_compares = 0;
+       } else {
+               $skip_hashes = 0;
+               $skip_compares = $skip_compare_preference;
+       }
+
+       if ($progress) {
+               my $this_time = time();
+               if ($this_time != $last_time && $size != $last_size) {
+                       $last_time = $this_time;
+                       $last_size = $size;
+                       print STDERR "($size)";
+               }
+       }
+
        $input_links++;
        merge_files if $weak_key ne $current_key;
        $current_key = $weak_key;
@@ -627,7 +672,7 @@ if ($humane) {
 
        sub space_numbers {
                my ($num) = @_;
-               1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os;
+               1 while $num =~ s/(\d)(\d\d\d)((?: \d\d\d)*)$/$1 $2$3/os;
                $num = ' ' x ($max_num_len - length($num)) . $num;
                return $num;
        }