From 92a22d87fd9fb90d125cea123e093fa900883c84 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 23 Dec 2008 14:52:04 -0500
Subject: [PATCH] Initial commit

---
 faster-dupemerge | 517 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 517 insertions(+)
 create mode 100755 faster-dupemerge

diff --git a/faster-dupemerge b/faster-dupemerge
new file mode 100755
index 0000000..6508fc0
--- /dev/null
+++ b/faster-dupemerge
@@ -0,0 +1,517 @@
+#!/usr/bin/perl -w
+use strict;
+use Fcntl qw(:DEFAULT :flock);
+use File::Compare;
+use File::Temp;
+
+my $links_input = 0;
+my $files_input = 0;
+my $bogons_input = 0;
+my $files_hashed = 0;
+my $hash_errors = 0;
+my $compare_count = 0;
+my $compare_errors = 0;
+my $compare_differences = 0;
+my $trivially_unique = 0;
+my $merges_attempted = 0;
+my $hard_links = 0;
+my $link_errors = 0;
+my $link_retries = 0;
+my $bytes_recovered = 0;
+my $files_recovered = 0;
+my $files_lost = 0;
+my $bytes_lost = 0;
+my $surprises = 0;
+
+eval '
+	use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+';
+
+if ($@) {
+	warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		my $fv = open(MD5SUM, "-|");
+		die "fork: $!" unless defined($fv);
+		if ($fv) {
+			my ($sum_line) = <MD5SUM>;
+			close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
+			die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
+			my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
+			die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
+			return $sum;
+		} else {
+			sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+			exec('md5sum');
+			# Perl guarantees it will die here
+		}
+	}
+DIGEST
+} else {
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
+		my $ctx = Digest::SHA1->new;
+		sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+		binmode(FILE);	# FIXME: Necessary?  Probably harmless...
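+		# addfile() reads the remainder of the filehandle and feeds it to the
+		# SHA-1 context, so the file is hashed without slurping it into memory.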
+		$ctx->addfile(\*FILE);
+		close(FILE) or die "close: $filename: $!";
+		return $ctx->b64digest;
+	}
+DIGEST
+}
+
+my $collapse_access = 0;
+my $collapse_timestamp = 0;
+my $collapse_zero = 0;
+my $trust_hashes = 0;
+my $verbose = 0;
+my $debug = 0;
+my @extra_find_opts = ();
+my @extra_sort_opts = ();
+my $lock_file;
+my $lock_rm = 0;
+my $lock_obtained = 0;
+
+my @directories;
+
+sub usage {
+	my $name = shift(@_);
+	die <<USAGE;
+Usage: $name [options] directory [directory...]
+
+Finds files with identical content in the given directories and merges
+them into hard links to a single inode.
+
+Options:
+    --access          ignore owner, group, and permission differences
+    --timestamps      ignore modification time differences
+    --zeros           also merge empty files
+    --trust           trust matching hashes; skip byte-by-byte comparison
+    --lock FILE       hold an exclusive flock on FILE while running
+    --lock-rm         remove the lock file when finished
+    --verbose         report progress on stderr
+    --debug           copious debugging output (implies --verbose)
+    --find ARGS... -- pass extra arguments to find
+    --sort ARGS... -- pass extra arguments to sort
+USAGE
+}
+
+while ($#ARGV >= 0) {
+	my $arg = shift(@ARGV);
+	if ($arg eq '--access') {
+		$collapse_access = 1;
+	} elsif ($arg eq '--timestamps') {
+		$collapse_timestamp = 1;
+	} elsif ($arg eq '--zeros') {
+		$collapse_zero = 1;
+	} elsif ($arg eq '--trust') {
+		$trust_hashes = 1;
+	} elsif ($arg eq '--verbose') {
+		$verbose = 1;
+	} elsif ($arg eq '--lock-rm') {
+		$lock_rm = 1;
+	} elsif ($arg eq '--lock') {
+		$lock_file = shift(@ARGV);
+		unless (defined($lock_file)) {
+			usage($0);
+			exit(1);
+		}
+	} elsif ($arg eq '--debug') {
+		$debug = $verbose = 1;
+	} elsif ($arg eq '--find') {
+		while ($#ARGV >= 0) {
+			my $extra_arg = shift(@ARGV);
+			last if $extra_arg eq '--';
+			push(@extra_find_opts, $extra_arg);
+		}
+	} elsif ($arg eq '--sort') {
+		while ($#ARGV >= 0) {
+			my $extra_arg = shift(@ARGV);
+			last if $extra_arg eq '--';
+			push(@extra_sort_opts, $extra_arg);
+		}
+	} elsif ($arg =~ /^-/o) {
+		usage($0);
+		exit(1);
+	} else {
+		push(@directories, $arg);
+	}
+}
+
+if (defined($lock_file)) {
+	sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
+	flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
+	print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
+	$lock_obtained = 1;
+}
+
+END {
+	if ($lock_obtained) {
+		print STDERR "Removing '$lock_file'.\n" if $verbose;
+		unlink($lock_file) or warn "unlink: $lock_file: $!";
+	}
+}
+
+sub tick_quote {
+	my ($text) = (@_);
+	$text =~ s/'/'\\''/go;
+	return "'$text'";
+}
+
+my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
+my $printf_string = '%s ' .
+	($collapse_access ? '0 0 0 ' : '%U %G %m ') .
+	($collapse_timestamp ? '0 ' : '%T@ ') .
+	'%i %p\0';
+
+push(@find_command, '!', '-empty') unless $collapse_zero;
+push(@find_command, '-printf', $printf_string);
+
+my @sort_command = ('sort', '-znr', @extra_sort_opts);
+my @quoted_sort_command = map { tick_quote($_) } @sort_command;
+my $quoted_sort_command = join(' ', @quoted_sort_command);
+
+my @quoted_find_command = map { tick_quote($_) } @find_command;
+my $quoted_find_command = join(' ', @quoted_find_command);
+print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
+
+open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
+$/ = "\0";
+
+# Input is sorted so that all weak keys are contiguous.
+# When the key changes, we have to process all files we previously know about.
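+# A "weak key" is the size/owner/mode/mtime prefix emitted by find's -printf
+# above; only files whose weak keys match can possibly be duplicates, so each
+# run of equal keys is hashed and merged as an independent group.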
+my $current_key = -1;
+
+# %inode_to_file_name maps each inode number to a hash whose keys are the
+# path names currently known for that inode.
+my %inode_to_file_name = ();
+
+# Replace $to with a hard link to $from.  The link is created under a
+# temporary name in $to's directory and then renamed over $to, so $to is
+# never missing, even momentarily.
+sub link_files {
+	my ($from, $to) = (@_);
+	my $inode_dir = $to;
+	my $inode_base = $to;
+	$inode_dir =~ s:[^/]*$::o;
+	$inode_base =~ s:^.*/::os;
+	my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+	my $quoted_from = tick_quote($from);
+	my $quoted_to = tick_quote($to);
+	print STDERR "ln -f $quoted_from $quoted_to\n";
+	print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
+	link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
+	print STDERR "\trename: $tmp_to -> $to\n" if $debug;
+	unless (rename($tmp_to, $to)) {
+		my $saved_bang = $!;
+		unlink($tmp_to) or warn "unlink: $tmp_to: $!";	# Try, possibly in vain, to clean up
+		die "rename: $tmp_to -> $to: $saved_bang";
+	}
+}
+
+# Process all files known so far (one weak-key group at a time).
+sub merge_files {
+	$merges_attempted++;
+
+	my %hash_to_inode;
+	# Used to stop link retry loops (there is a goto in here!  Actually two...)
+	my %stop_loop;
+
+	my @candidate_list = keys(%inode_to_file_name);
+	$files_input += @candidate_list;
+	if (@candidate_list < 2) {
+		print STDERR "Merging...only one candidate to merge...\n" if $debug;
+		$trivially_unique++;
+		goto end_merge;
+	}
+
+	print STDERR "Merging...\n" if $debug;
+	foreach my $candidate (@candidate_list) {
+		print STDERR "\tDigesting candidate $candidate\n" if $debug;
+		my $ok = 0;
+		my $digest;
+
+		hash_file:
+		foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+			print STDERR "\t\tDigesting file $filename\n" if $debug;
+			if ((-l $filename) || ! -f _) {
+				warn "Bogon file " . tick_quote($filename);
+				$surprises++;
+				next;
+			}
+			eval { $digest = digest($filename); };
+			if ($@) {
+				warn "Digest($filename)(#$candidate) failed: $@";
+				$hash_errors++;
+			} else {
+				$ok = 1;
+				$files_hashed++;
+				last hash_file;
+			}
+		}
+		if ($ok) {
+			print STDERR "\t\tDigest is $digest\n" if $debug;
+
+			my $incumbent = $hash_to_inode{$digest};
+			if (defined($incumbent)) {
+				print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
+
+				my $finished = 0;
+
+				link_start:
+				until ($finished) {
+					my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
+					my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+					print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
+					print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
+
+					incumbent_file:
+					foreach my $incumbent_file (@incumbent_names) {
+						my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
+						print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
+
+						if (!defined($incumbent_blocks)) {
+							warn "lstat: $incumbent_file: $!";
+							$surprises++;
+							next incumbent_file;
+						}
+
+						if ($incumbent_ino != $incumbent) {
+							warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
+							$surprises++;
+							next incumbent_file;
+						}
+
+						my $at_least_one_link_done = 0;
+
+						candidate_file:
+						foreach my $candidate_file (@candidate_names) {
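+							# lstat() the candidate path again: it may have been renamed or replaced
+							# since find emitted this record, so its inode is re-checked below.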
+							my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
+							print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
+
+							if (!defined($candidate_blocks)) {
+								warn "lstat: $candidate_file: $!";
+								$surprises++;
+								next candidate_file;
+							}
+
+							if ($candidate_ino != $candidate) {
+								warn "$candidate_file: expected inode $candidate, found $candidate_ino";
+								$surprises++;
+								next candidate_file;
+							}
+
+							if ($candidate_size != $incumbent_size) {
+								warn "$candidate_file, $incumbent_file: file sizes are different";
+								$surprises++;
+								next candidate_file;
+							}
+
+							my $identical;
+
+							eval {
+								if ($trust_hashes) {
+									print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+									$identical = 1;
+								} else {
+									my $quoted_incumbent_file = tick_quote($incumbent_file);
+									my $quoted_candidate_file = tick_quote($candidate_file);
+									print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+									if (compare($incumbent_file, $candidate_file)) {
+										$compare_differences++;
+										$identical = 0;
+										print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n";
+									} else {
+										$identical = 1;
+									}
+									$compare_count++;
+								}
+							};
+							if ($@) {
+								warn $@;
+								$compare_errors++;
+								next candidate_file;
+							}
+
+							if ($identical) {
+								print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
+
+								# We have to do this to break out of a possible infinite loop.
+								# Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
+								# such that A1 and B1 are in non-writable directories, we will loop
+								# forever hardlinking A2 with A and B.
+								# To break the loop, we never attempt to hardlink any files X and Y twice.
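+								# %stop_loop records each (incumbent, candidate) pair already attempted,
+								# keyed in both orders, so a pair that cannot be linked is tried only once.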
+
+								if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
+								    defined($stop_loop{$candidate_file}->{$incumbent_file})) {
+									print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
+								} else {
+									$stop_loop{$incumbent_file}->{$candidate_file} = 1;
+									$stop_loop{$candidate_file}->{$incumbent_file} = 1;
+
+									my $link_done = 0;
+
+									# Link from the inode with more names to the one with fewer,
+									# so the inode with fewer names is the one that disappears.
+									my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+									if ($candidate_nlink > $incumbent_nlink) {
+										$from_file = $candidate_file;
+										$to_file = $incumbent_file;
+										$from_inode = $candidate;
+										$to_inode = $incumbent;
+										$from_nlink = $candidate_nlink;
+										$to_nlink = $incumbent_nlink;
+									} else {
+										$to_file = $candidate_file;
+										$from_file = $incumbent_file;
+										$to_inode = $candidate;
+										$from_inode = $incumbent;
+										$to_nlink = $candidate_nlink;
+										$from_nlink = $incumbent_nlink;
+									}
+
+									eval {
+										link_files($from_file, $to_file);
+										$link_done = 1;
+									};
+
+									if ($@) {
+										warn $@;
+										$link_errors++;
+
+										print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
+										$link_retries++;
+
+										eval {
+											($from_file, $to_file) = ($to_file, $from_file);
+											($from_inode, $to_inode) = ($to_inode, $from_inode);
+											($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
+											link_files($from_file, $to_file);
+											$link_done = 1;
+										};
+
+										if ($@) {
+											warn $@;
+											$link_errors++;
+										}
+									}
+
+									# Note since the files are presumably identical, they both have the same size.
+									# My random number generator chooses the incumbent's size.
+
+									if ($link_done) {
+										delete $inode_to_file_name{$to_inode}->{$to_file};
+										$inode_to_file_name{$from_inode}->{$to_file} = undef;
+										$hash_to_inode{$digest} = $from_inode;
+
+										$hard_links++;
+										if ($to_nlink == 1) {
+											$files_recovered++;
+											$bytes_recovered += $incumbent_size;
+										}
+
+										# The name lists we are iterating over are now stale.
+										# Start over to rebuild state.
+										next link_start;
+									} else {
+										warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
+
+										# FIXME: This is a lame heuristic.  We really need to know if we've
+										# tried all possible ways to hardlink the file out of existence first;
+										# however, that is complex and only benefits a silly statistic.
+										if ($to_nlink == 1 || $from_nlink == 1) {
+											$files_lost++;
+											$bytes_lost += $incumbent_size;
+										}
+									}
+								}
+							}
+						}
+					}
+					$finished = 1;
+				}
+			} else {
+				print STDERR "\t\tNew hash entered\n" if $debug;
+				$hash_to_inode{$digest} = $candidate;
+			}
+		} else {
+			warn "No digests found for inode $candidate\n";
+			delete $inode_to_file_name{$candidate};
+		}
+	}
+
+	end_merge:
+	print STDERR "Merge done.\n" if $debug;
+	undef %inode_to_file_name;
+}
+
+while (<FIND>) {
+	my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
+	die "read error: $!\nLast input line was '$_'" unless defined($name);
+
+	print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
+
+	unless (! (-l $name) && (-f _)) {
+		warn "Bogon file " . tick_quote($name);
+		$bogons_input++;
+		next;
+	}
+
+	$links_input++;
+	merge_files if $weak_key ne $current_key;
+	$current_key = $weak_key;
+
+	$inode_to_file_name{$inode}->{$name} = undef;
+
+	print STDERR "$name\n" if $verbose;
+}
+
+merge_files;
+
+print STDERR <<STATS;
+Statistics:
+	hard links input:     $links_input
+	files (inodes) input: $files_input
+	bogons input:         $bogons_input
+	files hashed:         $files_hashed
+	hash errors:          $hash_errors
+	comparisons:          $compare_count
+	compare errors:       $compare_errors
+	compare differences:  $compare_differences
+	trivially unique:     $trivially_unique
+	merges attempted:     $merges_attempted
+	hard links created:   $hard_links
+	link errors:          $link_errors
+	link retries:         $link_retries
+	surprises:            $surprises
+	files recovered:      $files_recovered
+	bytes recovered:      $bytes_recovered
+	files lost:           $files_lost
+	bytes lost:           $bytes_lost
+STATS