Merge branch 'dm6'
author Zygo Blaxell <zblaxell@faye.furryterror.org>
Fri, 14 May 2010 16:57:42 +0000 (12:57 -0400)
committer Zygo Blaxell <zblaxell@faye.furryterror.org>
Fri, 14 May 2010 16:57:42 +0000 (12:57 -0400)
faster-dupemerge [new file with mode: 0755]

diff --git a/faster-dupemerge b/faster-dupemerge
new file mode 100755 (executable)
index 0000000..4cdef97
--- /dev/null
@@ -0,0 +1,1026 @@
+#!/usr/bin/perl -w
+use strict;
+use Fcntl qw(:DEFAULT :flock);
+use File::Compare;
+use File::Temp;
+
+# Copyright (C) 2002-2010 Zygo Blaxell <faster-dupemerge@mailtoo.hungrycats.org>
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+# Run statistics.  Every counter starts at zero.
+
+# Input scanning
+my ($input_links, $input_files, $input_bogons) = (0, 0, 0);
+
+# Hashing
+my ($hash_bytes, $hash_files, $hash_errors) = (0, 0, 0);
+
+# Byte-by-byte comparison
+my ($compare_bytes, $compare_count, $compare_errors, $compare_differences) = (0, 0, 0, 0);
+
+# Merge passes and hard-link attempts
+my ($trivially_unique, $merges_attempted) = (0, 0);
+my ($hard_links, $link_errors, $link_retries) = (0, 0, 0);
+
+# Space accounting
+my ($recovered_bytes, $recovered_files) = (0, 0);
+my ($lost_files, $lost_bytes) = (0, 0);
+
+# Unexpected conditions (failed lstat, inode or size mismatch)
+my $surprises = 0;
+
+# Define really_digest($filename), which returns a hash string for the file
+# or dies.  The in-process Digest::SHA1 module is preferred; if it cannot be
+# loaded, each file is piped through an external md5sum process instead.
+eval '
+       use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+';
+
+if ($@) {
+       warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+
+       eval <<'DIGEST';
+               sub really_digest {
+                       my ($filename) = (@_);
+                       # Load POSIX in the parent so the child never has to.
+                       require POSIX;
+                       my $fv = open(MD5SUM, "-|");
+                       die "fork: $!" unless defined($fv);
+                       if ($fv) {
+                               # Parent: read the child's output and check its exit status.
+                               my ($sum_line) = <MD5SUM>;
+                               close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
+                               die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
+                               my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
+                               die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
+                               return $sum;
+                       } else {
+                               # Child: it must never return into the caller.  A die here
+                               # would be caught by the caller's eval, and exec simply
+                               # returns false on failure; either way a second copy of the
+                               # whole program would keep running.  Always leave through
+                               # _exit, which also skips END blocks.
+                               unless (sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK)) {
+                                       print STDERR "open: $filename: $!\n";
+                                       POSIX::_exit(127);
+                               }
+                               {
+                                       no warnings 'exec';
+                                       exec('md5sum');
+                               }
+                               print STDERR "exec: md5sum: $!\n";
+                               POSIX::_exit(127);
+                       }
+               }
+DIGEST
+       # Fail early if the fallback itself did not compile.
+       die "really_digest (md5sum): $@" if $@;
+} else {
+       eval <<'DIGEST';
+               sub really_digest {
+                       my ($filename) = (@_);
+                       die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
+                       my $ctx = Digest::SHA1->new;
+                       sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+                       binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
+                       $ctx->addfile(\*FILE);
+                       close(FILE) or die "close: $filename: $!";
+                       return $ctx->b64digest;
+               }
+DIGEST
+       # Fail early if the sub did not compile.
+       die "really_digest (Digest::SHA1): $@" if $@;
+}
+       
+# Options that widen the definition of "duplicate"
+my ($collapse_access, $collapse_timestamp, $collapse_zero) = (0, 0, 0);
+
+# Compare/hash skipping.  The *_preference and *_threshold values come from
+# the command line; the other two are recomputed while input is read.
+my ($skip_compares, $skip_compare_preference) = (0, 0);
+my ($skip_hashes, $skip_hashes_threshold) = (0, 0);
+
+# Output and safety switches
+my ($progress, $verbose, $debug, $dry_run, $humane) = (0, 0, 0, 0, 0);
+
+# Extra arguments handed to the find and sort commands
+my @extra_find_opts;
+my @extra_sort_opts;
+
+# Lock file state
+my $lock_file;
+my ($lock_rm, $lock_obtained) = (0, 0);
+
+# Return the digest of $filename, or a fixed placeholder string while
+# hashing is being skipped.  For every file actually hashed this emits an
+# 'H' progress mark (with --progress) and updates the hash statistics.
+sub digest {
+       my ($filename) = (@_);
+
+       return "SKIPPING HASHES" if $skip_hashes;
+
+       print STDERR 'H' if $progress;
+       my $result = really_digest($filename);
+       $hash_bytes += -s $filename;
+       $hash_files++;
+       return $result;
+}
+
+# Directories to scan, collected from the non-option command-line arguments.
+my @directories;
+
+# Die with the usage text.  The single argument is the program name shown
+# on the first line of the message.
+sub usage {
+       my ($program_name) = @_;
+       my $usage_text = <<USAGE;
+Usage: $program_name [--opts] directory [directory...]
+Finds duplicate files in the given directories, and replaces all identical
+copies of a file with hard-links to a single file.
+
+Several options modify the definition of a "duplicate".  By default, files
+which have differences in owner uid or gid, permission (mode), or
+modification time (mtime) are considered different, so that hardlinking
+files does not also change their attributes.  Additionally, all files of
+zero size are ignored for performance reasons (there tend to be many
+of them, and they tend not to release any space when replaced with
+hard links).
+
+        --access        uid, gid, and mode may be different for identical
+                        files
+
+        --debug         show all steps in duplication discovery process
+                        (implies --verbose)
+
+        --dry-run       do not lock files or make changes to filesystem
+
+        --find          pass next options (up to --) to find command
+
+        --humane        human-readable statistics (e.g. 1 048 576)
+
+        --lock FILE     exit immediately (status 10) if unable to obtain a 
+                        flock(LOCK_EX|LOCK_NB) on FILE
+
+        --lock-rm       remove lock file at exit
+
+        --progress      output single-character progress indicators:
+                        C - compare
+                        H - hash
+                        L - link(2)
+                        R - rename(2)
+                        S, s - lstat(2) (see source for details)
+                        U - unlink(2)
+                        . - all inodes with similar attributes done
+                        (123456) - current file size in bytes
+
+        --sort          pass next options (up to --) to sort command
+
+        --timestamps    mtime may be different for identical files
+
+        --skip-compare  skip byte-by-byte file comparisons
+
+        --skip-hash[=N] skip calculation of hash function on files
+                        larger than N bytes (default 1M).
+                        Scalars KMGT specify KiB, MiB, GiB, and TiB.
+                        Scalars kmgt specify KB, MB, GB, and TB.
+
+        --trust         old name for --skip-compare
+                        (trust the hash function)
+
+        --verbose       report files as they are considered
+
+        --zeros         hard-link zero-length files too
+USAGE
+       die $usage_text;
+}
+
+# Command-line parsing.  Anything that does not start with '-' is taken as
+# a directory to scan; an unrecognised option prints the usage text.
+while (@ARGV) {
+       my $arg = shift(@ARGV);
+
+       if    ($arg eq '--access')     { $collapse_access = 1 }
+       elsif ($arg eq '--timestamps') { $collapse_timestamp = 1 }
+       elsif ($arg eq '--zeros')      { $collapse_zero = 1 }
+       elsif ($arg eq '--progress')   { $progress = 1 }
+       elsif ($arg eq '--verbose')    { $verbose = 1 }
+       elsif ($arg eq '--humane')     { $humane = 1 }
+       elsif ($arg eq '--dry-run')    { $dry_run = 1 }
+       elsif ($arg eq '--lock-rm')    { $lock_rm = 1 }
+       elsif ($arg eq '--debug') {
+               # --debug implies --verbose
+               $verbose = 1;
+               $debug = 1;
+       }
+       elsif ($arg eq '--skip-compare' || $arg eq '--trust') {
+               $skip_compare_preference = 1;
+               $skip_compares = 1;
+       }
+       elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) {
+               # A missing (or zero) quantity falls back to 1 MiB; a missing
+               # unit suffix means plain bytes.
+               my $quantity = $1 || 1048576;
+               my $unit = $2 || '_';
+               my %scale = (
+                       _ => 1,
+                       k => 1000,
+                       m => 1000*1000,
+                       g => 1000*1000*1000,
+                       t => 1000*1000*1000*1000,
+                       K => 1024,
+                       M => 1024*1024,
+                       G => 1024*1024*1024,
+                       T => 1024*1024*1024*1024,
+               );
+               $skip_hashes = 0;
+               $skip_hashes_threshold = $quantity * $scale{$unit};
+       }
+       elsif ($arg eq '--lock') {
+               $lock_file = shift(@ARGV);
+               if (!defined($lock_file)) {
+                       usage($0);
+                       exit(1);
+               }
+       }
+       elsif ($arg eq '--find' || $arg eq '--sort') {
+               # Collect everything up to the next '--' (or the end of the
+               # arguments) for the corresponding external command.
+               my $extra_opts = ($arg eq '--find') ? \@extra_find_opts : \@extra_sort_opts;
+               while (@ARGV) {
+                       my $extra_arg = shift(@ARGV);
+                       last if $extra_arg eq '--';
+                       push(@$extra_opts, $extra_arg);
+               }
+       }
+       elsif ($arg =~ /^-/o) {
+               usage($0);
+               exit(1);
+       }
+       else {
+               push(@directories, $arg);
+       }
+}
+
+@directories or usage($0);
+
+# Take the exclusive, non-blocking lock requested with --lock before any
+# work is done.  Nothing is locked in --dry-run mode.  As the usage text
+# promises, failure to obtain the lock exits with status 10.
+if (defined($lock_file) && !$dry_run) {
+       sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
+       unless (flock(LOCK_FILE, LOCK_EX|LOCK_NB)) {
+               warn "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
+               exit(10);
+       }
+       print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
+       $lock_obtained = 1;
+}
+
+# Remove the lock file at exit, but only when --lock-rm asked for it and
+# this process actually holds the lock.
+END {
+       if ($lock_obtained && $lock_rm && !$dry_run) {
+               print STDERR "Removing '$lock_file'.\n" if $verbose;
+               unlink($lock_file) or warn "unlink: $lock_file: $!";
+       }
+}
+
+# Quote $text for the shell: wrap it in single quotes and rewrite every
+# embedded single quote as the four characters '\''.
+sub tick_quote {
+       my ($text) = (@_);
+       (my $escaped = $text) =~ s/'/'\\''/g;
+       return q(') . $escaped . q(');
+}
+
+# Build the "find ... | sort ..." pipeline that produces the input.
+my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
+
+# One NUL-terminated record per file.  The leading fields (size, then
+# owner/group/mode and mtime unless those are collapsed to constants) are
+# the attributes that must match before two files are merge candidates;
+# the device, inode number and path follow.
+my $printf_string = '%s ' .
+       ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
+       ($collapse_timestamp ? '0 '     : '%T@ ') .
+       '%D %i %p\0';
+
+push(@find_command, '!', '-empty') unless $collapse_zero;
+push(@find_command, '-printf', $printf_string);
+
+my @sort_command = ('sort', '-znr', @extra_sort_opts);
+
+# Shell-quote every word.  The pipeline goes through the shell, so a
+# directory name or extra option containing a single quote must be escaped,
+# not merely wrapped in quotes.
+my @quoted_sort_command = map { tick_quote($_) } @sort_command;
+my $quoted_sort_command = join(' ', @quoted_sort_command);
+
+my @quoted_find_command = map { tick_quote($_) } @find_command;
+my $quoted_find_command = join(' ', @quoted_find_command);
+print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
+
+open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
+
+# Records are NUL-terminated, so file names may contain newlines.
+$/ = "\0";
+
+# Input is sorted so that all weak keys are contiguous.
+# When the key changes, we have to process all files we previously know about.
+my $current_key = -1;
+
+# $inode_to_file_name{$inode}->{$file_name} = undef
+# (a set of file names for each inode, keyed by the format_inode string)
+my %inode_to_file_name = ();
+
+# Replace $to with a hard link to $from.
+#
+# The command is always echoed to STDERR as an equivalent "ln -f"; with
+# --dry-run nothing else happens.  Otherwise $from is first linked to a
+# temporary name in $to's directory and then renamed over $to, so $to is
+# never missing in between.  Dies if the link or the rename fails; after a
+# failed rename the temporary link is removed on a best-effort basis.
+sub link_files {
+       my ($from, $to) = (@_);
+
+       my $quoted_from = tick_quote($from);
+       my $quoted_to = tick_quote($to);
+       print STDERR "\n" if $progress;
+       print STDERR "ln -f $quoted_from $quoted_to\n";
+
+       return if $dry_run;
+
+       # Split $to into its directory part (with trailing slash) and base name.
+       my $inode_dir = $to;
+       my $inode_base = $to;
+       $inode_dir =~ s:[^/]*$::o;
+       $inode_base =~ s:^.*/::os;
+       # A path with no slash (a plain file given as a starting point) leaves
+       # the directory empty; name the current directory explicitly so the
+       # temporary link is not created under the filesystem root.
+       $inode_dir = './' if $inode_dir eq '';
+       my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+       print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
+       print STDERR 'L' if $progress;
+       link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
+       print STDERR "\trename: $tmp_to -> $to\n" if $debug;
+       print STDERR 'R' if $progress;
+       unless (rename($tmp_to, $to)) {
+               # Save errno before unlink can overwrite it.
+               my $saved_bang = $!;
+               print STDERR 'U' if $progress;
+               unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
+               die "rename: $tmp_to -> $to: $saved_bang";
+       }
+}
+
+# Render a (device, inode) pair as "dev:ino", each part a 16-digit
+# zero-padded hexadecimal number, so that sorting the strings lexically
+# gives the same order as sorting the numbers.
+sub format_inode ($$) {
+       my $dev = shift;
+       my $ino = shift;
+       # 64 bits ought to be enough for everybody!
+       return join(':', map { sprintf('%016x', $_) } $dev, $ino);
+}
+
+# Process all known files so far.
+#
+# Works on the inodes currently collected in %inode_to_file_name (the main
+# loop calls this whenever the weak key changes, and once more at the end).
+# One file name per inode is digested; inodes with equal digests are
+# compared (unless compares are being skipped) and, when identical,
+# hard-linked together via link_files.  Takes no arguments, updates the
+# statistics counters, and empties %inode_to_file_name before returning.
+sub merge_files {
+       $merges_attempted++;
+
+       # $hash_to_inode{$digest} = [ inodes already seen with that digest ]
+       my %hash_to_inode;
+       # Used to stop link retry loops (there is a goto in here!  Actually two...)
+       my %stop_loop;
+
+       my @candidate_list = keys(%inode_to_file_name);
+       $input_files += @candidate_list;
+       if (@candidate_list < 2) {
+               print STDERR "Merging...only one candidate to merge..." if $debug;
+               $trivially_unique++;
+               goto end_merge;
+       }
+
+       print STDERR "Merging...\n" if $debug;
+       foreach my $candidate (sort @candidate_list) {
+               print STDERR "\tDigesting candidate $candidate\n" if $debug;
+               my $ok = 0;
+               my $digest;
+
+               # Try this inode's names in sorted order until one can be digested.
+hash_file:
+
+               foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) {
+                       print STDERR "\t\tDigesting file $filename\n" if $debug;
+                       # Names that are now symlinks or not plain files are dropped.
+                       if ((-l $filename) || ! -f _) {
+                               print STDERR "\n" if $progress;
+                               warn "Bogon file " . tick_quote($filename);
+                               $input_bogons++;
+                               delete $inode_to_file_name{$candidate}->{$filename};
+                               next;
+                       }
+                       eval {
+                               $digest = digest($filename); 
+                       };
+                       if ($@) {
+                               warn "Digest($filename)(#$candidate) failed: $@";
+                               $hash_errors++;
+                       } else {
+                               $ok = 1;
+                               last hash_file;
+                       }
+               }
+               if ($ok) {
+                       print STDERR "\t\tDigest is $digest\n" if $debug;
+
+                       # NOTE(review): a successful link below replaces
+                       # $hash_to_inode{$digest} with a new array, after which
+                       # $incumbent_list still refers to the old one - confirm intended.
+                       my $incumbent_list = ($hash_to_inode{$digest} ||= []);
+                       # Only set by the byte-by-byte compare branch below.
+                       my $incumbent_matched = 0;
+                       for my $incumbent (sort @$incumbent_list) {
+                               print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
+
+                               my $finished = 0;
+
+                               # Restarted with "next link_start" after every successful link,
+                               # so both name lists are rebuilt from the updated state.
+link_start:
+
+                               until ($finished) {
+                                       my @incumbent_names = sort keys(%{$inode_to_file_name{$incumbent}});
+                                       my @candidate_names = sort keys(%{$inode_to_file_name{$candidate}});
+                                       print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
+                                       print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
+
+incumbent_file:
+
+                                       foreach my $incumbent_file (@incumbent_names) {
+                                               print STDERR 'S' if $progress;
+                                               my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
+                                               print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
+
+                                               # Skip names whose lstat failed or that are no longer plain files.
+                                               if (!defined($incumbent_blocks) || ! (-f _)) {
+                                                       warn "lstat: $incumbent_file: $!";
+                                                       $surprises++;
+                                                       next incumbent_file;
+                                               }
+
+                                               # Skip names that no longer refer to the inode recorded for them.
+                                               if (format_inode($incumbent_dev, $incumbent_ino) ne $incumbent) {
+                                                       warn "$incumbent_file: expected inode $incumbent, found ".format_inode($incumbent_dev, $incumbent_ino);
+                                                       $surprises++;
+                                                       next incumbent_file;
+                                               }
+
+                                               my $at_least_one_link_done = 0;
+
+candidate_file:
+
+                                               foreach my $candidate_file (@candidate_names) {
+                                                       print STDERR 's' if $progress;
+                                                       my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
+                                                       print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
+
+                                                       # Same sanity checks as for the incumbent name.
+                                                       if (!defined($candidate_blocks) || ! (-f _)) {
+                                                               warn "lstat: $candidate_file: $!";
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if (format_inode($candidate_dev, $candidate_ino) ne $candidate) {
+                                                               warn "$candidate_file: expected inode $candidate, found ".format_inode($candidate_dev, $candidate_ino);
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if ($candidate_size != $incumbent_size) {
+                                                               warn "$candidate_file, $incumbent_file: file sizes are different";
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       my $identical;
+
+                                                       # Decide whether the two files are identical: assumed when
+                                                       # compares are skipped, otherwise checked with compare().
+                                                       eval {
+                                                               if ($skip_compares) {
+                                                                       print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
+                                                                       $identical = 1;
+                                                               } else {
+                                                                       my $quoted_incumbent_file = tick_quote($incumbent_file);
+                                                                       my $quoted_candidate_file = tick_quote($candidate_file);
+                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
+                                                                       print STDERR 'C' if $progress;
+                                                                       if (compare($incumbent_file, $candidate_file)) {
+                                                                               $compare_differences++;
+                                                                               $identical = 0;
+                                                                               # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+                                                                               # Some kind of I/O error is more likely, so this message cannot be turned off.
+                                                                               # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+                                                                               # so the warning in that case is quite silly.  Hmmm.
+                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
+                                                                       } else {
+                                                                               $identical = 1;
+                                                                               $incumbent_matched = 1;
+                                                                       }
+                                                                       $compare_count++;
+                                                                       $compare_bytes += $incumbent_size;
+                                                               }
+                                                       };
+                                                       if ($@) {
+                                                               warn $@;
+                                                               $compare_errors++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if ($identical) {
+                                                               print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
+
+                                                               # We have to do this to break out of a possible infinite loop.
+                                                               # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
+                                                               # such that A1 and B1 are in non-writable directories, we will loop
+                                                               # forever hardlinking A2 with A and B.
+                                                               # To break the loop, we never attempt to hardlink any files X and Y twice.
+
+                                                               if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
+                                                                   defined($stop_loop{$candidate_file}->{$incumbent_file})) {
+                                                                       print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
+                                                               } else {
+                                                                       $stop_loop{$incumbent_file}->{$candidate_file} = 1;
+                                                                       $stop_loop{$candidate_file}->{$incumbent_file} = 1;
+
+                                                                       my $link_done = 0;
+
+                                                                       my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+
+                                                                       # If the candidate has more links than incumbent, replace incumbent with candidate.
+                                                                       # If the incumbent has more links than candidate, replace candidate with incumbent.
+                                                                       # If the link counts are equal, we saw incumbent first, so keep the incumbent.
+                                                                       # "We saw incumbent first" is significant because we explicitly sort the inodes.
+                                                                       # Thank Johannes Niess for this idea.
+                                                                       if ($candidate_nlink > $incumbent_nlink) {
+                                                                               $from_file = $candidate_file;
+                                                                               $to_file = $incumbent_file;
+                                                                               $from_inode = $candidate;
+                                                                               $to_inode = $incumbent;
+                                                                               $from_nlink = $candidate_nlink;
+                                                                               $to_nlink = $incumbent_nlink;
+                                                                       } else {
+                                                                               $to_file = $candidate_file;
+                                                                               $from_file = $incumbent_file;
+                                                                               $to_inode = $candidate;
+                                                                               $from_inode = $incumbent;
+                                                                               $to_nlink = $candidate_nlink;
+                                                                               $from_nlink = $incumbent_nlink;
+                                                                       }
+
+                                                                       eval {
+                                                                               link_files($from_file, $to_file);
+                                                                               $link_done = 1;
+                                                                       };
+
+                                                                       # On failure, try once more in the opposite direction.
+                                                                       if ($@) {
+                                                                               warn $@;
+                                                                               $link_errors++;
+
+                                                                               print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
+                                                                               $link_retries++;
+
+                                                                               eval {
+                                                                                       ($from_file, $to_file) = ($to_file, $from_file);
+                                                                                       ($from_inode, $to_inode) = ($to_inode, $from_inode);
+                                                                                       ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
+                                                                                       link_files($from_file, $to_file);
+                                                                                       $link_done = 1;
+                                                                               };
+
+                                                                               if ($@) {
+                                                                                       warn $@;
+                                                                                       $link_errors++;
+                                                                               }
+                                                                       }
+
+                                                                       # Note since the files are presumably identical, they both have the same size.
+                                                                       # My random number generator chooses the incumbent's size.
+
+                                                                       if ($link_done) {
+                                                                               # Move the name from the replaced inode to the surviving one
+                                                                               # (in --dry-run it is only removed, not re-added).
+                                                                               delete $inode_to_file_name{$to_inode}->{$to_file};
+                                                                               $inode_to_file_name{$from_inode}->{$to_file} = undef unless ($dry_run);
+                                                                               $hash_to_inode{$digest} = [ $from_inode ];
+
+                                                                               $hard_links++;
+                                                                               # The replaced name was the inode's last link.
+                                                                               if ($to_nlink == 1) {
+                                                                                       $recovered_files++;
+                                                                                       $recovered_bytes += $incumbent_size;
+                                                                               }
+
+                                                                               # FIXME:  Now we're really confused for some reason.
+                                                                               # Start over to rebuild state.
+                                                                               next link_start;
+                                                                       } else {
+                                                                               warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
+
+                                                                               # FIXME:  This is a lame heuristic.  We really need to know if we've
+                                                                               # tried all possible ways to hardlink the file out of existence first;
+                                                                               # however, that is complex and only benefits a silly statistic.
+                                                                               if ($to_nlink == 1 || $from_nlink == 1) {
+                                                                                       $lost_files++;
+                                                                                       $lost_bytes += $incumbent_size;
+                                                                               }
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                                       $finished = 1;
+                               }
+                       }
+                       # No incumbent compared equal: record this inode under its digest.
+                       unless ($incumbent_matched) {
+                               print STDERR "\t\tNew hash entered\n" if $debug;
+                               push(@$incumbent_list, $candidate);
+                       }
+               } else {
+                       warn "No digests found for inode $candidate\n";
+                       delete $inode_to_file_name{$candidate};
+               }
+       }
+       print STDERR '.' if $progress;
+
+end_merge:
+
+       print STDERR "Merge done.\n" if $debug;
+       undef %inode_to_file_name;
+}
+
+# Throttle state for the progress display: the time and the size that were
+# last echoed to STDERR.
+my $last_time = 0;
+my $last_size = 0;
+
+# Main input loop.  Every record read from FIND must match the pattern below:
+# a "weak key" of five space-separated fields (the first of which is also
+# captured as the size), two numeric fields that are handed to format_inode()
+# as device and inode number, and finally the file name followed by a NUL.
+# Names are collected per inode in %inode_to_file_name; whenever the weak key
+# changes, merge_files is called before the new record is stored.
+while (<FIND>) {
+       my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
+       defined($name) or die "read error: $!\nLast input line was '$_'";
+
+       my $inode = format_inode($dev, $ino);
+
+       if ($debug) {
+               print STDERR "weak_key=$weak_key inode=$inode name=$name\n";
+       }
+
+       # At or above the (non-zero) threshold: set $skip_hashes and clear
+       # $skip_compares.  Otherwise clear $skip_hashes and fall back to the
+       # configured compare preference.
+       # NOTE(review): these flags are updated for the record just read, before
+       # merge_files runs for the previous key group below -- confirm whether
+       # merge_files consults them and whether that ordering is intended.
+       my $over_threshold = $skip_hashes_threshold && $size >= $skip_hashes_threshold;
+       $skip_hashes   = $over_threshold ? 1 : 0;
+       $skip_compares = $over_threshold ? 0 : $skip_compare_preference;
+
+       if ($progress) {
+               # Echo the size only when both the clock second and the size
+               # differ from the ones echoed last time.
+               my $now = time();
+               my $worth_showing = ($now != $last_time) && ($size != $last_size);
+               if ($worth_showing) {
+                       ($last_time, $last_size) = ($now, $size);
+                       print STDERR "($size)";
+               }
+       }
+
+       $input_links++;
+
+       # A new weak key closes the group collected so far.
+       if ($weak_key ne $current_key) {
+               merge_files();
+       }
+       $current_key = $weak_key;
+
+       $inode_to_file_name{$inode}->{$name} = undef;
+
+       print STDERR "$name\n" if $verbose;
+}
+
+# Process whatever group is still pending after the last record.
+merge_files();
+
+# Final statistics report: one "name value" line per counter, listed in
+# alphabetical order.  The whitespace between name and value is part of the
+# report text: the substitutions further down right-align the numbers (under
+# $humane) and turn the padding on every second line into dot leaders, which
+# only happens where at least three whitespace characters separate the name
+# from the value.
+my $stats_blob = <<STATS;
+compare_bytes           $compare_bytes
+compare_count           $compare_count
+compare_differences     $compare_differences
+compare_errors          $compare_errors
+hard_links              $hard_links
+hash_bytes              $hash_bytes
+hash_errors             $hash_errors
+hash_files              $hash_files
+input_bogons            $input_bogons
+input_files             $input_files
+input_links             $input_links
+link_errors             $link_errors
+link_retries            $link_retries
+lost_bytes              $lost_bytes
+lost_files              $lost_files
+merges_attempted        $merges_attempted
+recovered_bytes         $recovered_bytes
+recovered_files         $recovered_files
+surprises               $surprises
+trivially_unique        $trivially_unique
+STATS
+
+# "Humane" output: group the digits of every number in the report into
+# thousands separated by spaces, and right-align all numbers to the width of
+# the widest one.
+#
+# The digit runs are picked up with an explicit capture group instead of $&.
+# On perls older than 5.20 a single mention of $& anywhere in the program
+# makes every successful regex match save a copy of its target string, and
+# this script runs one match per input record.
+if ($humane) {
+       # Width of the widest number once its separators have been inserted.
+       my $max_num_len = 0;
+
+       # Record the display width that $num (a string of digits) will need:
+       # its own length plus one separator per complete group of three digits
+       # to the left of the last group.
+       sub measure_numbers {
+               my ($num) = @_;
+               my $digits = length($num);
+               my $len = $digits + int(($digits - 1) / 3);
+               $max_num_len = $len if $len > $max_num_len;
+       }
+
+       # First pass: measure only.  A scalar-context //g scan visits every
+       # digit run without modifying (or copying) the report.
+       measure_numbers($1) while $stats_blob =~ m/(\d+)/g;
+
+       # Return $num with a space between each group of three digits (counted
+       # from the right), left-padded with spaces to $max_num_len characters.
+       sub space_numbers {
+               my ($num) = @_;
+               # Each iteration splits one more three-digit group off the
+               # still-unseparated leading digits.
+               1 while $num =~ s/(\d)(\d\d\d)((?: \d\d\d)*)$/$1 $2$3/;
+               $num = ' ' x ($max_num_len - length($num)) . $num;
+               return $num;
+       }
+
+       # Second pass: rewrite every number in place.
+       $stats_blob =~ s/(\d+)/space_numbers($1)/ge;
+}
+
+# Dot leaders on every second line.  Each match consumes two lines: the first
+# is kept untouched; on the second, the whitespace between the name and the
+# value -- except for one space on either side -- is replaced by the same
+# number of dots.
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/join('', $1, '.' x length($2), $3)/oemg;
+
+# Finish the progress line, if one was being drawn, before the report.
+if ($progress) {
+       print STDERR "\n";
+}
+print STDERR $stats_blob;
+
+exit(0);
+
+__END__
+
+#################################################################################
+#                     GNU GENERAL PUBLIC LICENSE                                #
+#                        Version 2, June 1991                                   #
+#                                                                               #
+#  Copyright (C) 1989, 1991 Free Software Foundation, Inc.                      #
+#      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA                  #
+#  Everyone is permitted to copy and distribute verbatim copies                 #
+#  of this license document, but changing it is not allowed.                    #
+#                                                                               #
+#                             Preamble                                          #
+#                                                                               #
+#   The licenses for most software are designed to take away your               #
+# freedom to share and change it.  By contrast, the GNU General Public          #
+# License is intended to guarantee your freedom to share and change free        #
+# software--to make sure the software is free for all its users.  This          #
+# General Public License applies to most of the Free Software                   #
+# Foundation's software and to any other program whose authors commit to        #
+# using it.  (Some other Free Software Foundation software is covered by        #
+# the GNU Library General Public License instead.)  You can apply it to         #
+# your programs, too.                                                           #
+#                                                                               #
+#   When we speak of free software, we are referring to freedom, not            #
+# price.  Our General Public Licenses are designed to make sure that you        #
+# have the freedom to distribute copies of free software (and charge for        #
+# this service if you wish), that you receive source code or can get it         #
+# if you want it, that you can change the software or use pieces of it          #
+# in new free programs; and that you know you can do these things.              #
+#                                                                               #
+#   To protect your rights, we need to make restrictions that forbid            #
+# anyone to deny you these rights or to ask you to surrender the rights.        #
+# These restrictions translate to certain responsibilities for you if you       #
+# distribute copies of the software, or if you modify it.                       #
+#                                                                               #
+#   For example, if you distribute copies of such a program, whether            #
+# gratis or for a fee, you must give the recipients all the rights that         #
+# you have.  You must make sure that they, too, receive or can get the          #
+# source code.  And you must show them these terms so they know their           #
+# rights.                                                                       #
+#                                                                               #
+#   We protect your rights with two steps: (1) copyright the software, and      #
+# (2) offer you this license which gives you legal permission to copy,          #
+# distribute and/or modify the software.                                        #
+#                                                                               #
+#   Also, for each author's protection and ours, we want to make certain        #
+# that everyone understands that there is no warranty for this free             #
+# software.  If the software is modified by someone else and passed on, we      #
+# want its recipients to know that what they have is not the original, so       #
+# that any problems introduced by others will not reflect on the original       #
+# authors' reputations.                                                         #
+#                                                                               #
+#   Finally, any free program is threatened constantly by software              #
+# patents.  We wish to avoid the danger that redistributors of a free           #
+# program will individually obtain patent licenses, in effect making the        #
+# program proprietary.  To prevent this, we have made it clear that any         #
+# patent must be licensed for everyone's free use or not licensed at all.       #
+#                                                                               #
+#   The precise terms and conditions for copying, distribution and              #
+# modification follow.                                                          #
+#                                                                               #
+#                     GNU GENERAL PUBLIC LICENSE                                #
+#    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION            #
+#                                                                               #
+#   0. This License applies to any program or other work which contains         #
+# a notice placed by the copyright holder saying it may be distributed          #
+# under the terms of this General Public License.  The "Program", below,        #
+# refers to any such program or work, and a "work based on the Program"         #
+# means either the Program or any derivative work under copyright law:          #
+# that is to say, a work containing the Program or a portion of it,             #
+# either verbatim or with modifications and/or translated into another          #
+# language.  (Hereinafter, translation is included without limitation in        #
+# the term "modification".)  Each licensee is addressed as "you".               #
+#                                                                               #
+# Activities other than copying, distribution and modification are not          #
+# covered by this License; they are outside its scope.  The act of              #
+# running the Program is not restricted, and the output from the Program        #
+# is covered only if its contents constitute a work based on the                #
+# Program (independent of having been made by running the Program).             #
+# Whether that is true depends on what the Program does.                        #
+#                                                                               #
+#   1. You may copy and distribute verbatim copies of the Program's             #
+# source code as you receive it, in any medium, provided that you               #
+# conspicuously and appropriately publish on each copy an appropriate           #
+# copyright notice and disclaimer of warranty; keep intact all the              #
+# notices that refer to this License and to the absence of any warranty;        #
+# and give any other recipients of the Program a copy of this License           #
+# along with the Program.                                                       #
+#                                                                               #
+# You may charge a fee for the physical act of transferring a copy, and         #
+# you may at your option offer warranty protection in exchange for a fee.       #
+#                                                                               #
+#   2. You may modify your copy or copies of the Program or any portion         #
+# of it, thus forming a work based on the Program, and copy and                 #
+# distribute such modifications or work under the terms of Section 1            #
+# above, provided that you also meet all of these conditions:                   #
+#                                                                               #
+#     a) You must cause the modified files to carry prominent notices           #
+#     stating that you changed the files and the date of any change.            #
+#                                                                               #
+#     b) You must cause any work that you distribute or publish, that in        #
+#     whole or in part contains or is derived from the Program or any           #
+#     part thereof, to be licensed as a whole at no charge to all third         #
+#     parties under the terms of this License.                                  #
+#                                                                               #
+#     c) If the modified program normally reads commands interactively          #
+#     when run, you must cause it, when started running for such                #
+#     interactive use in the most ordinary way, to print or display an          #
+#     announcement including an appropriate copyright notice and a              #
+#     notice that there is no warranty (or else, saying that you provide        #
+#     a warranty) and that users may redistribute the program under             #
+#     these conditions, and telling the user how to view a copy of this         #
+#     License.  (Exception: if the Program itself is interactive but            #
+#     does not normally print such an announcement, your work based on          #
+#     the Program is not required to print an announcement.)                    #
+#                                                                               #
+# These requirements apply to the modified work as a whole.  If                 #
+# identifiable sections of that work are not derived from the Program,          #
+# and can be reasonably considered independent and separate works in            #
+# themselves, then this License, and its terms, do not apply to those           #
+# sections when you distribute them as separate works.  But when you            #
+# distribute the same sections as part of a whole which is a work based         #
+# on the Program, the distribution of the whole must be on the terms of         #
+# this License, whose permissions for other licensees extend to the             #
+# entire whole, and thus to each and every part regardless of who wrote it.     #
+#                                                                               #
+# Thus, it is not the intent of this section to claim rights or contest         #
+# your rights to work written entirely by you; rather, the intent is to         #
+# exercise the right to control the distribution of derivative or               #
+# collective works based on the Program.                                        #
+#                                                                               #
+# In addition, mere aggregation of another work not based on the Program        #
+# with the Program (or with a work based on the Program) on a volume of         #
+# a storage or distribution medium does not bring the other work under          #
+# the scope of this License.                                                    #
+#                                                                               #
+#   3. You may copy and distribute the Program (or a work based on it,          #
+# under Section 2) in object code or executable form under the terms of         #
+# Sections 1 and 2 above provided that you also do one of the following:        #
+#                                                                               #
+#     a) Accompany it with the complete corresponding machine-readable          #
+#     source code, which must be distributed under the terms of Sections        #
+#     1 and 2 above on a medium customarily used for software interchange; or,  #
+#                                                                               #
+#     b) Accompany it with a written offer, valid for at least three            #
+#     years, to give any third party, for a charge no more than your            #
+#     cost of physically performing source distribution, a complete             #
+#     machine-readable copy of the corresponding source code, to be             #
+#     distributed under the terms of Sections 1 and 2 above on a medium         #
+#     customarily used for software interchange; or,                            #
+#                                                                               #
+#     c) Accompany it with the information you received as to the offer         #
+#     to distribute corresponding source code.  (This alternative is            #
+#     allowed only for noncommercial distribution and only if you               #
+#     received the program in object code or executable form with such          #
+#     an offer, in accord with Subsection b above.)                             #
+#                                                                               #
+# The source code for a work means the preferred form of the work for           #
+# making modifications to it.  For an executable work, complete source          #
+# code means all the source code for all modules it contains, plus any          #
+# associated interface definition files, plus the scripts used to               #
+# control compilation and installation of the executable.  However, as a        #
+# special exception, the source code distributed need not include               #
+# anything that is normally distributed (in either source or binary             #
+# form) with the major components (compiler, kernel, and so on) of the          #
+# operating system on which the executable runs, unless that component          #
+# itself accompanies the executable.                                            #
+#                                                                               #
+# If distribution of executable or object code is made by offering              #
+# access to copy from a designated place, then offering equivalent              #
+# access to copy the source code from the same place counts as                  #
+# distribution of the source code, even though third parties are not            #
+# compelled to copy the source along with the object code.                      #
+#                                                                               #
+#   4. You may not copy, modify, sublicense, or distribute the Program          #
+# except as expressly provided under this License.  Any attempt                 #
+# otherwise to copy, modify, sublicense or distribute the Program is            #
+# void, and will automatically terminate your rights under this License.        #
+# However, parties who have received copies, or rights, from you under          #
+# this License will not have their licenses terminated so long as such          #
+# parties remain in full compliance.                                            #
+#                                                                               #
+#   5. You are not required to accept this License, since you have not          #
+# signed it.  However, nothing else grants you permission to modify or          #
+# distribute the Program or its derivative works.  These actions are            #
+# prohibited by law if you do not accept this License.  Therefore, by           #
+# modifying or distributing the Program (or any work based on the               #
+# Program), you indicate your acceptance of this License to do so, and          #
+# all its terms and conditions for copying, distributing or modifying           #
+# the Program or works based on it.                                             #
+#                                                                               #
+#   6. Each time you redistribute the Program (or any work based on the         #
+# Program), the recipient automatically receives a license from the             #
+# original licensor to copy, distribute or modify the Program subject to        #
+# these terms and conditions.  You may not impose any further                   #
+# restrictions on the recipients' exercise of the rights granted herein.        #
+# You are not responsible for enforcing compliance by third parties to          #
+# this License.                                                                 #
+#                                                                               #
+#   7. If, as a consequence of a court judgment or allegation of patent         #
+# infringement or for any other reason (not limited to patent issues),          #
+# conditions are imposed on you (whether by court order, agreement or           #
+# otherwise) that contradict the conditions of this License, they do not        #
+# excuse you from the conditions of this License.  If you cannot                #
+# distribute so as to satisfy simultaneously your obligations under this        #
+# License and any other pertinent obligations, then as a consequence you        #
+# may not distribute the Program at all.  For example, if a patent              #
+# license would not permit royalty-free redistribution of the Program by        #
+# all those who receive copies directly or indirectly through you, then         #
+# the only way you could satisfy both it and this License would be to           #
+# refrain entirely from distribution of the Program.                            #
+#                                                                               #
+# If any portion of this section is held invalid or unenforceable under         #
+# any particular circumstance, the balance of the section is intended to        #
+# apply and the section as a whole is intended to apply in other                #
+# circumstances.                                                                #
+#                                                                               #
+# It is not the purpose of this section to induce you to infringe any           #
+# patents or other property right claims or to contest validity of any          #
+# such claims; this section has the sole purpose of protecting the              #
+# integrity of the free software distribution system, which is                  #
+# implemented by public license practices.  Many people have made               #
+# generous contributions to the wide range of software distributed              #
+# through that system in reliance on consistent application of that             #
+# system; it is up to the author/donor to decide if he or she is willing        #
+# to distribute software through any other system and a licensee cannot         #
+# impose that choice.                                                           #
+#                                                                               #
+# This section is intended to make thoroughly clear what is believed to         #
+# be a consequence of the rest of this License.                                 #
+#                                                                               #
+#   8. If the distribution and/or use of the Program is restricted in           #
+# certain countries either by patents or by copyrighted interfaces, the         #
+# original copyright holder who places the Program under this License           #
+# may add an explicit geographical distribution limitation excluding            #
+# those countries, so that distribution is permitted only in or among           #
+# countries not thus excluded.  In such case, this License incorporates         #
+# the limitation as if written in the body of this License.                     #
+#                                                                               #
+#   9. The Free Software Foundation may publish revised and/or new versions     #
+# of the General Public License from time to time.  Such new versions will      #
+# be similar in spirit to the present version, but may differ in detail to      #
+# address new problems or concerns.                                             #
+#                                                                               #
+# Each version is given a distinguishing version number.  If the Program        #
+# specifies a version number of this License which applies to it and "any       #
+# later version", you have the option of following the terms and conditions     #
+# either of that version or of any later version published by the Free          #
+# Software Foundation.  If the Program does not specify a version number of     #
+# this License, you may choose any version ever published by the Free Software  #
+# Foundation.                                                                   #
+#                                                                               #
+#   10. If you wish to incorporate parts of the Program into other free         #
+# programs whose distribution conditions are different, write to the author     #
+# to ask for permission.  For software which is copyrighted by the Free         #
+# Software Foundation, write to the Free Software Foundation; we sometimes      #
+# make exceptions for this.  Our decision will be guided by the two goals       #
+# of preserving the free status of all derivatives of our free software and     #
+# of promoting the sharing and reuse of software generally.                     #
+#                                                                               #
+#                             NO WARRANTY                                       #
+#                                                                               #
+#   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY    #
+# FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN      #
+# OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES        #
+# PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED    #
+# OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          #
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS     #
+# TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE        #
+# PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,      #
+# REPAIR OR CORRECTION.                                                         #
+#                                                                               #
+#   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING   #
+# WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR           #
+# REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,    #
+# INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING   #
+# OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED     #
+# TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY      #
+# YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER    #
+# PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE         #
+# POSSIBILITY OF SUCH DAMAGES.                                                  #
+#                                                                               #
+#                      END OF TERMS AND CONDITIONS                              #
+#                                                                               #
+#             How to Apply These Terms to Your New Programs                     #
+#                                                                               #
+#   If you develop a new program, and you want it to be of the greatest         #
+# possible use to the public, the best way to achieve this is to make it        #
+# free software which everyone can redistribute and change under these terms.   #
+#                                                                               #
+#   To do so, attach the following notices to the program.  It is safest        #
+# to attach them to the start of each source file to most effectively           #
+# convey the exclusion of warranty; and each file should have at least          #
+# the "copyright" line and a pointer to where the full notice is found.         #
+#                                                                               #
+#     <one line to give the program's name and a brief idea of what it does.>   #
+#     Copyright (C) <year>  <name of author>                                    #
+#                                                                               #
+#     This program is free software; you can redistribute it and/or modify      #
+#     it under the terms of the GNU General Public License as published by      #
+#     the Free Software Foundation; either version 2 of the License, or         #
+#     (at your option) any later version.                                       #
+#                                                                               #
+#     This program is distributed in the hope that it will be useful,           #
+#     but WITHOUT ANY WARRANTY; without even the implied warranty of            #
+#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
+#     GNU General Public License for more details.                              #
+#                                                                               #
+#     You should have received a copy of the GNU General Public License         #
+#     along with this program; if not, write to the Free Software               #
+#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA #
+#                                                                               #
+#                                                                               #
+# Also add information on how to contact you by electronic and paper mail.      #
+#                                                                               #
+# If the program is interactive, make it output a short notice like this        #
+# when it starts in an interactive mode:                                        #
+#                                                                               #
+#     Gnomovision version 69, Copyright (C) year  name of author                #
+#     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. #
+#     This is free software, and you are welcome to redistribute it             #
+#     under certain conditions; type `show c' for details.                      #
+#                                                                               #
+# The hypothetical commands `show w' and `show c' should show the appropriate   #
+# parts of the General Public License.  Of course, the commands you use may     #
+# be called something other than `show w' and `show c'; they could even be      #
+# mouse-clicks or menu items--whatever suits your program.                      #
+#                                                                               #
+# You should also get your employer (if you work as a programmer) or your       #
+# school, if any, to sign a "copyright disclaimer" for the program, if          #
+# necessary.  Here is a sample; alter the names:                                #
+#                                                                               #
+#   Yoyodyne, Inc., hereby disclaims all copyright interest in the program      #
+#   `Gnomovision' (which makes passes at compilers) written by James Hacker.    #
+#                                                                               #
+#   <signature of Ty Coon>, 1 April 1989                                        #
+#   Ty Coon, President of Vice                                                  #
+#                                                                               #
+# This General Public License does not permit incorporating your program into   #
+# proprietary programs.  If your program is a subroutine library, you may       #
+# consider it more useful to permit linking proprietary applications with the   #
+# library.  If this is what you want to do, use the GNU Library General         #
+# Public License instead of this License.                                       #
+#################################################################################