Initial commit
authorroot <root@ashura.furryterror.org>
Tue, 23 Dec 2008 19:52:04 +0000 (14:52 -0500)
committerZygo Blaxell <zblaxell@faye.furryterror.org>
Fri, 8 Jan 2010 22:44:58 +0000 (17:44 -0500)
faster-dupemerge [new file with mode: 0755]

diff --git a/faster-dupemerge b/faster-dupemerge
new file mode 100755 (executable)
index 0000000..6508fc0
--- /dev/null
@@ -0,0 +1,517 @@
+#!/usr/bin/perl -w
+use strict;
+use Fcntl qw(:DEFAULT :flock);
+use File::Compare;
+use File::Temp;
+
+my $links_input = 0;
+my $files_input = 0;
+my $bogons_input = 0;
+my $files_hashed = 0;
+my $hash_errors = 0;
+my $compare_count = 0;
+my $compare_errors = 0;
+my $compare_differences = 0;
+my $trivially_unique = 0;
+my $merges_attempted = 0;
+my $hard_links = 0;
+my $link_errors = 0;
+my $link_retries = 0;
+my $bytes_recovered = 0;
+my $files_recovered = 0;
+my $files_lost = 0;
+my $bytes_lost = 0;
+my $surprises = 0;
+
+eval '
+       use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+';
+
+if ($@) {
+       warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+
+       eval <<'DIGEST';
+               sub digest {
+                       my ($filename) = (@_);
+                       my $fv = open(MD5SUM, "-|");    
+                       die "fork: $!" unless defined($fv);
+                       if ($fv) {
+                               my ($sum_line) = <MD5SUM>;
+                               close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
+                               die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
+                               my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
+                               die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
+                               return $sum;
+                       } else {
+                               sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+                               exec('md5sum');
+                               # Perl guarantees it will die here
+                       }
+               }
+DIGEST
+} else {
+       eval <<'DIGEST';
+               sub digest {
+                       my ($filename) = (@_);
+                       die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
+                       my $ctx = Digest::SHA1->new;
+                       sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+                       binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
+                       $ctx->addfile(\*FILE);
+                       close(FILE) or die "close: $filename: $!";
+                       return $ctx->b64digest;
+               }
+DIGEST
+}
+       
+my $collapse_access = 0;
+my $collapse_timestamp = 0;
+my $collapse_zero = 0;
+my $trust_hashes = 0;
+my $verbose = 0;
+my $debug = 0;
+my @extra_find_opts = ();
+my @extra_sort_opts = ();
+my $lock_file;
+my $lock_rm = 0;
+my $lock_obtained = 0;
+
+my @directories;
+
+sub usage {
+       my $name = shift(@_);
+       die <<USAGE;
+Usage: $name [--opts] directory [directory...]
+Finds duplicate files in the given directories, and replaces all identical
+copies of a file with hard-links to a single file.
+
+Several options modify the definition of a "duplicate".  By default, files
+which have differences in owner uid or gid, permission (mode), or
+modification time (mtime) are considered different, so that hardlinking
+files does not also change their attributes.  Additionally, all files of
+zero size are ignored for performance reasons (there tend to be many
+of them, and they tend not to release any space when replaced with
+hard links).
+
+        --access        uid, gid, and mode may be different for identical
+                        files
+
+        --debug         show all steps in duplication discovery process
+                        (implies --verbose)
+
+        --find          pass next options (up to --) to find command
+
+        --lock FILE     exit immediately (status 10) if unable to obtain a 
+                        flock(LOCK_EX|LOCK_NB) on FILE
+
+       --lock-rm       remove lock file at exit
+
+        --sort          pass next options (up to --) to sort command
+
+        --timestamps    mtime may be different for identical files
+
+        --trust         skip byte-by-byte file comparisons
+                        (trust the hash function)
+
+        --verbose       report files as they are considered
+
+        --zeros         hard-link zero-length files too
+USAGE
+}
+
+while ($#ARGV >= 0) {
+       my $arg = shift(@ARGV);
+       if ($arg eq '--access') {
+               $collapse_access = 1;
+       } elsif ($arg eq '--timestamps') {
+               $collapse_timestamp = 1;
+       } elsif ($arg eq '--zeros') {
+               $collapse_zero = 1;
+       } elsif ($arg eq '--trust') {
+               $trust_hashes = 1;
+       } elsif ($arg eq '--verbose') {
+               $verbose = 1;
+       } elsif ($arg eq '--lock-rm') {
+               $lock_rm = 1;
+       } elsif ($arg eq '--lock') {
+               $lock_file = shift(@ARGV);
+               unless (defined($lock_file)) {
+                       usage($0);
+                       exit(1);
+               }
+       } elsif ($arg eq '--debug') {
+               $debug = $verbose = 1;
+       } elsif ($arg eq '--find') {
+               while ($#ARGV >= 0) {
+                       my $extra_arg = shift(@ARGV);
+                       last if $extra_arg eq '--';
+                       push(@extra_find_opts, $extra_arg);
+               }
+       } elsif ($arg eq '--sort') {
+               while ($#ARGV >= 0) {
+                       my $extra_arg = shift(@ARGV);
+                       last if $extra_arg eq '--';
+                       push(@extra_sort_opts, $extra_arg);
+               }
+       } elsif ($arg =~ /^-/o) {
+               usage($0);
+               exit(1);
+       } else {
+               push(@directories, $arg);
+       }
+}
+
+if (defined($lock_file)) {
+       sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
+       flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
+       print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
+       $lock_obtained = 1;
+}
+
+END {
+       if ($lock_obtained) {
+               print STDERR "Removing '$lock_file'.\n" if $verbose;
+               unlink($lock_file) or warn "unlink: $lock_file: $!";
+       }
+}
+
+sub tick_quote {
+       my ($text) = (@_);
+       $text =~ s/'/'\''/go;
+       return "'$text'";
+}
+
+my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
+my $printf_string = '%s ' .
+       ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
+       ($collapse_timestamp ? '0 '     : '%T@ ') .
+       '%i %p\0';
+
+push(@find_command, '!', '-empty') unless $collapse_zero;
+push(@find_command, '-printf', $printf_string);
+
+my @sort_command = ('sort', '-znr', @extra_sort_opts);
+my @quoted_sort_command = @sort_command;
+grep(tick_quote($_), @quoted_sort_command);
+my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
+
+my @quoted_find_command = @find_command;
+grep(tick_quote($_), @quoted_find_command);
+my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
+print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
+
+open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
+$/ = "\0";
+
+# Input is sorted so that all weak keys are contiguous.
+# When the key changes, we have to process all files we previously know about.
+my $current_key = -1;
+
+# $inode_to_file_name{$inode} = [@file_names]
+my %inode_to_file_name = ();
+
+# Link files
+sub link_files {
+       my ($from, $to) = (@_);
+       my $inode_dir = $to;
+       my $inode_base = $to;
+       $inode_dir =~ s:[^/]*$::o;
+       $inode_base =~ s:^.*/::os;
+       my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+       my $quoted_from = tick_quote($from);
+       my $quoted_to = tick_quote($to);
+       print STDERR "ln -f $quoted_from $quoted_to\n";
+       print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
+       link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
+       print STDERR "\trename: $tmp_to -> $to\n" if $debug;
+       unless (rename($tmp_to, $to)) {
+               my $saved_bang = $!;
+               unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
+               die "rename: $tmp_to -> $from: $saved_bang";
+       }
+}
+
+# Process all known files so far.
+sub merge_files {
+       $merges_attempted++;
+
+       my %hash_to_inode;
+       # Used to stop link retry loops (there is a goto in here!  Actually two...)
+       my %stop_loop;
+
+       my @candidate_list = keys(%inode_to_file_name);
+       $files_input += @candidate_list;
+       if (@candidate_list < 2) {
+               print STDERR "Merging...only one candidate to merge..." if $debug;
+               $trivially_unique++;
+               goto end_merge;
+       }
+
+       print STDERR "Merging...\n" if $debug;
+       foreach my $candidate (@candidate_list) {
+               print STDERR "\tDigesting candidate $candidate\n" if $debug;
+               my $ok = 0;
+               my $digest;
+
+hash_file:
+
+               foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+                       print STDERR "\t\tDigesting file $filename\n" if $debug;
+                       if ((-l $filename) || ! -f _) {
+                               warn "Bogon file " . tick_quote($filename);
+                               $surprises++;
+                               next;
+                       }
+                       eval { $digest = digest($filename); };
+                       if ($@) {
+                               warn "Digest($filename)(#$candidate) failed: $@";
+                               $hash_errors++;
+                       } else {
+                               $ok = 1;
+                               $files_hashed++;
+                               last hash_file;
+                       }
+               }
+               if ($ok) {
+                       print STDERR "\t\tDigest is $digest\n" if $debug;
+
+                       my $incumbent = $hash_to_inode{$digest};
+                       if (defined($incumbent)) {
+                               print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
+
+                               my $finished = 0;
+
+link_start:
+
+                               until ($finished) {
+                                       my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
+                                       my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+                                       print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
+                                       print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
+
+       incumbent_file:
+
+                                       foreach my $incumbent_file (@incumbent_names) {
+                                               my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
+                                               print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
+
+                                               if (!defined($incumbent_blocks)) {
+                                                       warn "lstat: $incumbent_file: $!";
+                                                       $surprises++;
+                                                       next incumbent_file;
+                                               }
+
+                                               if ($incumbent_ino != $incumbent) {
+                                                       warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
+                                                       $surprises++;
+                                                       next incumbent_file;
+                                               }
+
+       candidate_file:
+
+                                               my $at_least_one_link_done = 0;
+
+                                               foreach my $candidate_file (@candidate_names) {
+                                                       my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
+                                                       print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
+
+                                                       if (!defined($candidate_blocks)) {
+                                                               warn "lstat: $candidate_file: $!";
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if ($candidate_ino != $candidate) {
+                                                               warn "$candidate_file: expected inode $candidate, found $candidate_ino";
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if ($candidate_size != $incumbent_size) {
+                                                               warn "$candidate_file, $incumbent_file: file sizes are different";
+                                                               $surprises++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       my $identical;
+
+                                                       eval {
+                                                               if ($trust_hashes) {
+                                                                       print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+                                                                       $identical = 1;
+                                                               } else {
+                                                                       my $quoted_incumbent_file = tick_quote($incumbent_file);
+                                                                       my $quoted_candidate_file = tick_quote($candidate_file);
+                                                                       print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+                                                                       if (compare($incumbent_file, $candidate_file)) {
+                                                                               $compare_differences++;
+                                                                               $identical = 0;
+                                                                               print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+                                                                       } else {
+                                                                               $identical = 1;
+                                                                       }
+                                                                       $compare_count++;
+                                                               }
+                                                       };
+                                                       if ($@) {
+                                                               warn $@;
+                                                               $compare_errors++;
+                                                               next candidate_file;
+                                                       }
+
+                                                       if ($identical) {
+                                                               print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
+
+                                                               # We have to do this to break out of a possible infinite loop.
+                                                               # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
+                                                               # such that A1 and B1 are in non-writable directories, we will loop
+                                                               # forever hardlinking A2 with A and B.
+                                                               # To break the loop, we never attempt to hardlink any files X and Y twice.
+
+                                                               if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
+                                                                   defined($stop_loop{$candidate_file}->{$incumbent_file})) {
+                                                                       print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
+                                                               } else {
+                                                                       $stop_loop{$incumbent_file}->{$candidate_file} = 1;
+                                                                       $stop_loop{$candidate_file}->{$incumbent_file} = 1;
+
+                                                                       my $link_done = 0;
+
+                                                                       my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+                                                                       if ($candidate_nlink > $incumbent_nlink) {
+                                                                               $from_file = $candidate_file;
+                                                                               $to_file = $incumbent_file;
+                                                                               $from_inode = $candidate;
+                                                                               $to_inode = $incumbent;
+                                                                               $from_nlink = $candidate_nlink;
+                                                                               $to_nlink = $incumbent_nlink;
+                                                                       } else {
+                                                                               $to_file = $candidate_file;
+                                                                               $from_file = $incumbent_file;
+                                                                               $to_inode = $candidate;
+                                                                               $from_inode = $incumbent;
+                                                                               $to_nlink = $candidate_nlink;
+                                                                               $from_nlink = $incumbent_nlink;
+                                                                       }
+
+                                                                       eval {
+                                                                               link_files($from_file, $to_file);
+                                                                               $link_done = 1;
+                                                                       };
+
+                                                                       if ($@) {
+                                                                               warn $@;
+                                                                               $link_errors++;
+
+                                                                               print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
+                                                                               $link_retries++;
+
+                                                                               eval {
+                                                                                       ($from_file, $to_file) = ($to_file, $from_file);
+                                                                                       ($from_inode, $to_inode) = ($to_inode, $from_inode);
+                                                                                       ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
+                                                                                       link_files($from_file, $to_file);
+                                                                                       $link_done = 1;
+                                                                               };
+
+                                                                               if ($@) {
+                                                                                       warn $@;
+                                                                                       $link_errors++;
+                                                                               }
+                                                                       }
+
+                                                                       # Note since the files are presumably identical, they both have the same size.
+                                                                       # My random number generator chooses the incumbent's size.
+
+                                                                       if ($link_done) {
+                                                                               delete $inode_to_file_name{$to_inode}->{$to_file};
+                                                                               $inode_to_file_name{$from_inode}->{$to_file} = undef;
+                                                                               $hash_to_inode{$digest} = $from_inode;
+
+                                                                               $hard_links++;
+                                                                               if ($to_nlink == 1) {
+                                                                                       $files_recovered++;
+                                                                                       $bytes_recovered += $incumbent_size;
+                                                                               }
+
+                                                                               # FIXME:  Now we're really confused for some reason.
+                                                                               # Start over to rebuild state.
+                                                                               next link_start;
+                                                                       } else {
+                                                                               warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
+
+                                                                               # FIXME:  This is a lame heuristic.  We really need to know if we've
+                                                                               # tried all possible ways to hardlink the file out of existence first;
+                                                                               # however, that is complex and only benefits a silly statistic.
+                                                                               if ($to_nlink == 1 || $from_nlink == 1) {
+                                                                                       $files_lost++;
+                                                                                       $bytes_lost += $incumbent_size;
+                                                                               }
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                                       $finished = 1;
+                               }
+                       } else {
+                               print STDERR "\t\tNew hash entered\n" if $debug;
+                               $hash_to_inode{$digest} = $candidate;
+                       }
+               } else {
+                       warn "No digests found for inode $candidate\n";
+                       delete $inode_to_file_name{$candidate};
+               }
+       }
+
+end_merge:
+
+       print STDERR "Merge done.\n" if $debug;
+       undef %inode_to_file_name;
+}
+
+while (<FIND>) {
+       my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
+       die "read error: $!\nLast input line was '$_'" unless defined($name);
+
+       print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
+
+       unless (! (-l $name) && (-f _)) {
+               warn "Bogon file " . tick_quote($name);
+               $bogons_input++;
+               next;
+       }
+
+       $links_input++;
+       merge_files if $weak_key ne $current_key;
+       $current_key = $weak_key;
+
+       $inode_to_file_name{$inode}->{$name} = undef;
+
+       print STDERR "$name\n" if $verbose;
+}
+
+merge_files;
+
+print STDERR <<STATS;
+links_input             $links_input
+files_input ........... $files_input
+bogons_input            $bogons_input
+merges_attempted ...... $merges_attempted
+trivially_unique        $trivially_unique
+files_hashed .......... $files_hashed
+hash_errors             $hash_errors
+surprises ............. $surprises
+compare_count           $compare_count
+compare_differences ... $compare_differences
+compare_errors          $compare_errors
+hard_links ............ $hard_links
+link_errors             $link_errors
+link_retries .......... $link_retries
+bytes_recovered         $bytes_recovered
+files_recovered ....... $files_recovered
+bytes_lost              $bytes_lost
+files_lost ............ $files_lost
+STATS
+
+exit(0);