X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=faster-dupemerge;h=0ae1032f2192b4fe4d7556f8b5be4ffd9ce671a8;hb=51db2c8ce235245d7eb2c02e337d9d18b94c9813;hp=6508fc0da801e8d66cb8476cbfb0bd5ce25ce373;hpb=92a22d87fd9fb90d125cea123e093fa900883c84;p=dupemerge diff --git a/faster-dupemerge b/faster-dupemerge index 6508fc0..0ae1032 100755 --- a/faster-dupemerge +++ b/faster-dupemerge @@ -1,14 +1,22 @@ #!/usr/bin/perl -w +# $Id$ + +# Copyright (C) 2002-2003 by Zygo Blaxell +# Use, modification, and distribution permitted +# under the terms of the GNU GPL. + use strict; use Fcntl qw(:DEFAULT :flock); use File::Compare; use File::Temp; -my $links_input = 0; -my $files_input = 0; -my $bogons_input = 0; -my $files_hashed = 0; +my $input_links = 0; +my $input_files = 0; +my $input_bogons = 0; +my $hash_bytes = 0; +my $hash_files = 0; my $hash_errors = 0; +my $compare_bytes = 0; my $compare_count = 0; my $compare_errors = 0; my $compare_differences = 0; @@ -17,10 +25,10 @@ my $merges_attempted = 0; my $hard_links = 0; my $link_errors = 0; my $link_retries = 0; -my $bytes_recovered = 0; -my $files_recovered = 0; -my $files_lost = 0; -my $bytes_lost = 0; +my $recovered_bytes = 0; +my $recovered_files = 0; +my $lost_files = 0; +my $lost_bytes = 0; my $surprises = 0; eval ' @@ -31,7 +39,7 @@ if ($@) { warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)"; eval <<'DIGEST'; - sub digest { + sub really_digest { my ($filename) = (@_); my $fv = open(MD5SUM, "-|"); die "fork: $!" unless defined($fv); @@ -51,7 +59,7 @@ if ($@) { DIGEST } else { eval <<'DIGEST'; - sub digest { + sub really_digest { my ($filename) = (@_); die "'$filename' is not a plain file" if (-l $filename) || ! (-f _); my $ctx = Digest::SHA1->new; @@ -67,15 +75,30 @@ DIGEST my $collapse_access = 0; my $collapse_timestamp = 0; my $collapse_zero = 0; -my $trust_hashes = 0; +my $skip_compares = 0; +my $skip_hashes = 0; my $verbose = 0; my $debug = 0; +my $dry_run = 0; +my $humane = 0; my @extra_find_opts = (); my @extra_sort_opts = (); my $lock_file; my $lock_rm = 0; my $lock_obtained = 0; +sub digest { + my ($filename) = (@_); + if ($skip_hashes) { + return "SKIPPING HASHES"; + } else { + my $digest = &really_digest($filename); + $hash_bytes += -s $filename; + $hash_files++; + return $digest + } +} + my @directories; sub usage { @@ -99,8 +122,12 @@ hard links). --debug show all steps in duplication discovery process (implies --verbose) + --dry-run do not lock files or make changes to filesystem + --find pass next options (up to --) to find command + --humane human-readable statistics (e.g. 1 048 576) + --lock FILE exit immediately (status 10) if unable to obtain a flock(LOCK_EX|LOCK_NB) on FILE @@ -110,7 +137,11 @@ hard links). --timestamps mtime may be different for identical files - --trust skip byte-by-byte file comparisons + --skip-compare skip byte-by-byte file comparisons + + --skip-hash skip calculation of hash function on files + + --trust old name for --skip-compare (trust the hash function) --verbose report files as they are considered @@ -127,8 +158,10 @@ while ($#ARGV >= 0) { $collapse_timestamp = 1; } elsif ($arg eq '--zeros') { $collapse_zero = 1; - } elsif ($arg eq '--trust') { - $trust_hashes = 1; + } elsif ($arg eq '--trust' || $arg eq '--skip-compare') { + $skip_compares = 1; + } elsif ($arg eq '--skip-hash') { + $skip_hashes = 1; } elsif ($arg eq '--verbose') { $verbose = 1; } elsif ($arg eq '--lock-rm') { @@ -141,6 +174,10 @@ while ($#ARGV >= 0) { } } elsif ($arg eq '--debug') { $debug = $verbose = 1; + } elsif ($arg eq '--dry-run') { + $dry_run = 1; + } elsif ($arg eq '--humane') { + $humane = 1; } elsif ($arg eq '--find') { while ($#ARGV >= 0) { my $extra_arg = shift(@ARGV); @@ -161,7 +198,13 @@ while ($#ARGV >= 0) { } } -if (defined($lock_file)) { +if ($skip_hashes && $skip_compares) { + die "Cannot skip both hashes and compares.\n"; +} + +@directories or usage; + +if (defined($lock_file) && !$dry_run) { sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!"; flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!"; print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose; @@ -169,7 +212,7 @@ if (defined($lock_file)) { } END { - if ($lock_obtained) { + if ($lock_obtained && !$dry_run) { print STDERR "Removing '$lock_file'.\n" if $verbose; unlink($lock_file) or warn "unlink: $lock_file: $!"; } @@ -177,7 +220,7 @@ END { sub tick_quote { my ($text) = (@_); - $text =~ s/'/'\''/go; + $text =~ s/'/'\\''/go; return "'$text'"; } @@ -213,14 +256,18 @@ my %inode_to_file_name = (); # Link files sub link_files { my ($from, $to) = (@_); + + my $quoted_from = tick_quote($from); + my $quoted_to = tick_quote($to); + print STDERR "ln -f $quoted_from $quoted_to\n"; + + return if $dry_run; + my $inode_dir = $to; my $inode_base = $to; $inode_dir =~ s:[^/]*$::o; $inode_base =~ s:^.*/::os; my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base."); - my $quoted_from = tick_quote($from); - my $quoted_to = tick_quote($to); - print STDERR "ln -f $quoted_from $quoted_to\n"; print STDERR "\tlink: $from -> $tmp_to\n" if $debug; link($from, $tmp_to) or die "link: $from -> $tmp_to: $!"; print STDERR "\trename: $tmp_to -> $to\n" if $debug; @@ -240,7 +287,7 @@ sub merge_files { my %stop_loop; my @candidate_list = keys(%inode_to_file_name); - $files_input += @candidate_list; + $input_files += @candidate_list; if (@candidate_list < 2) { print STDERR "Merging...only one candidate to merge..." if $debug; $trivially_unique++; @@ -262,13 +309,14 @@ hash_file: $surprises++; next; } - eval { $digest = digest($filename); }; + eval { + $digest = digest($filename); + }; if ($@) { warn "Digest($filename)(#$candidate) failed: $@"; $hash_errors++; } else { $ok = 1; - $files_hashed++; last hash_file; } } @@ -289,7 +337,7 @@ link_start: print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug; print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug; - incumbent_file: +incumbent_file: foreach my $incumbent_file (@incumbent_names) { my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file); @@ -307,10 +355,10 @@ link_start: next incumbent_file; } - candidate_file: - my $at_least_one_link_done = 0; +candidate_file: + foreach my $candidate_file (@candidate_names) { my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file); print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug; @@ -336,21 +384,26 @@ link_start: my $identical; eval { - if ($trust_hashes) { - print STDERR "\t\t\t\tTrusting hashes!\n" if $debug; + if ($skip_compares) { + print STDERR "\t\t\t\tSkipping compare!\n" if $debug; $identical = 1; } else { my $quoted_incumbent_file = tick_quote($incumbent_file); my $quoted_candidate_file = tick_quote($candidate_file); - print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n"; + print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug; if (compare($incumbent_file, $candidate_file)) { $compare_differences++; $identical = 0; - print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" + # It is significant for two non-identical files to have identical SHA1 or MD5 hashes. + # Some kind of I/O error is more likely, so this message cannot be turned off. + # On the other hand, if we're skipping hashes, _all_ files will have the same hash, + # so the warning in that case is quite silly. Hmmm. + print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes; } else { $identical = 1; } $compare_count++; + $compare_bytes += $incumbent_size; } }; if ($@) { @@ -424,14 +477,18 @@ link_start: # My random number generator chooses the incumbent's size. if ($link_done) { + # Since we're in a dry run, the filesystem doesn't change. + # Our notion of what the filesystem should look like should not change either. delete $inode_to_file_name{$to_inode}->{$to_file}; - $inode_to_file_name{$from_inode}->{$to_file} = undef; - $hash_to_inode{$digest} = $from_inode; + unless ($dry_run) { + $inode_to_file_name{$from_inode}->{$to_file} = undef; + $hash_to_inode{$digest} = $from_inode; + } $hard_links++; if ($to_nlink == 1) { - $files_recovered++; - $bytes_recovered += $incumbent_size; + $recovered_files++; + $recovered_bytes += $incumbent_size; } # FIXME: Now we're really confused for some reason. @@ -444,8 +501,8 @@ link_start: # tried all possible ways to hardlink the file out of existence first; # however, that is complex and only benefits a silly statistic. if ($to_nlink == 1 || $from_nlink == 1) { - $files_lost++; - $bytes_lost += $incumbent_size; + $lost_files++; + $lost_bytes += $incumbent_size; } } } @@ -478,11 +535,11 @@ while () { unless (! (-l $name) && (-f _)) { warn "Bogon file " . tick_quote($name); - $bogons_input++; + $input_bogons++; next; } - $links_input++; + $input_links++; merge_files if $weak_key ne $current_key; $current_key = $weak_key; @@ -493,25 +550,53 @@ while () { merge_files; -print STDERR < $max_num_len; + } + + (my $dummy = $stats_blob) =~ s/\d+/measure_numbers($&)/geos; + + sub space_numbers { + my ($num) = @_; + 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os; + $num = ' ' x ($max_num_len - length($num)) . $num; + return $num; + } + + $stats_blob =~ s/\d+/space_numbers($&)/geos; +} + +$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg; + +print STDERR $stats_blob; + exit(0);