From aa42755213d1f38be21a9446de9cfe1b4c011410 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 8 Jan 2010 20:04:54 -0500 Subject: [PATCH] Replace --trust with --skip-compare and add --skip-hash and copyright statement git-svn-id: svn+ssh://svn.furryterror.org/r/trunk/mokona/zblaxell@3225 a5e33b96-951a-0410-ae88-c0fe16d076bb Conflicts: faster-dupemerge --- faster-dupemerge | 134 +++++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 46 deletions(-) diff --git a/faster-dupemerge b/faster-dupemerge index 6508fc0..a007540 100755 --- a/faster-dupemerge +++ b/faster-dupemerge @@ -1,14 +1,22 @@ #!/usr/bin/perl -w +# $Id$ + +# Copyright (C) 2002-2003 by Zygo Blaxell +# Use, modification, and distribution permitted +# under the terms of the GNU GPL. + use strict; use Fcntl qw(:DEFAULT :flock); use File::Compare; use File::Temp; -my $links_input = 0; -my $files_input = 0; -my $bogons_input = 0; -my $files_hashed = 0; +my $input_links = 0; +my $input_files = 0; +my $input_bogons = 0; +my $hash_bytes = 0; +my $hash_files = 0; my $hash_errors = 0; +my $compare_bytes = 0; my $compare_count = 0; my $compare_errors = 0; my $compare_differences = 0; @@ -17,10 +25,10 @@ my $merges_attempted = 0; my $hard_links = 0; my $link_errors = 0; my $link_retries = 0; -my $bytes_recovered = 0; -my $files_recovered = 0; -my $files_lost = 0; -my $bytes_lost = 0; +my $recovered_bytes = 0; +my $recovered_files = 0; +my $lost_files = 0; +my $lost_bytes = 0; my $surprises = 0; eval ' @@ -31,7 +39,7 @@ if ($@) { warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)"; eval <<'DIGEST'; - sub digest { + sub really_digest { my ($filename) = (@_); my $fv = open(MD5SUM, "-|"); die "fork: $!" unless defined($fv); @@ -51,7 +59,7 @@ if ($@) { DIGEST } else { eval <<'DIGEST'; - sub digest { + sub really_digest { my ($filename) = (@_); die "'$filename' is not a plain file" if (-l $filename) || ! (-f _); my $ctx = Digest::SHA1->new; @@ -67,7 +75,8 @@ DIGEST my $collapse_access = 0; my $collapse_timestamp = 0; my $collapse_zero = 0; -my $trust_hashes = 0; +my $skip_compares = 0; +my $skip_hashes = 0; my $verbose = 0; my $debug = 0; my @extra_find_opts = (); @@ -76,6 +85,17 @@ my $lock_file; my $lock_rm = 0; my $lock_obtained = 0; +sub digest { + my ($filename) = (@_); + if ($skip_hashes) { + return "SKIPPING HASHES"; + } else { + &really_digest($filename); + $hash_bytes += -s $filename; + $hash_files++; + } +} + my @directories; sub usage { @@ -110,7 +130,11 @@ hard links). --timestamps mtime may be different for identical files - --trust skip byte-by-byte file comparisons + --skip-compare skip byte-by-byte file comparisons + + --skip-hash skip calculation of hash function on files + + --trust old name for --skip-compare (trust the hash function) --verbose report files as they are considered @@ -127,8 +151,10 @@ while ($#ARGV >= 0) { $collapse_timestamp = 1; } elsif ($arg eq '--zeros') { $collapse_zero = 1; - } elsif ($arg eq '--trust') { - $trust_hashes = 1; + } elsif ($arg eq '--trust' || $arg eq '--skip-compare') { + $skip_compares = 1; + } elsif ($arg eq '--skip-hash') { + $skip_hashes = 1; } elsif ($arg eq '--verbose') { $verbose = 1; } elsif ($arg eq '--lock-rm') { @@ -161,6 +187,10 @@ while ($#ARGV >= 0) { } } +if ($skip_hashes && $skip_compares) { + die "Cannot skip both hashes and compares.\n"; +} + if (defined($lock_file)) { sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!"; flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!"; @@ -240,7 +270,7 @@ sub merge_files { my %stop_loop; my @candidate_list = keys(%inode_to_file_name); - $files_input += @candidate_list; + $input_files += @candidate_list; if (@candidate_list < 2) { print STDERR "Merging...only one candidate to merge..." if $debug; $trivially_unique++; @@ -262,13 +292,14 @@ hash_file: $surprises++; next; } - eval { $digest = digest($filename); }; + eval { + $digest = digest($filename); + }; if ($@) { warn "Digest($filename)(#$candidate) failed: $@"; $hash_errors++; } else { $ok = 1; - $files_hashed++; last hash_file; } } @@ -289,7 +320,7 @@ link_start: print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug; print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug; - incumbent_file: +incumbent_file: foreach my $incumbent_file (@incumbent_names) { my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file); @@ -307,10 +338,10 @@ link_start: next incumbent_file; } - candidate_file: - my $at_least_one_link_done = 0; +candidate_file: + foreach my $candidate_file (@candidate_names) { my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file); print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug; @@ -336,21 +367,26 @@ link_start: my $identical; eval { - if ($trust_hashes) { - print STDERR "\t\t\t\tTrusting hashes!\n" if $debug; + if ($skip_compares) { + print STDERR "\t\t\t\tSkipping compare!\n" if $debug; $identical = 1; } else { my $quoted_incumbent_file = tick_quote($incumbent_file); my $quoted_candidate_file = tick_quote($candidate_file); - print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n"; + print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug; if (compare($incumbent_file, $candidate_file)) { $compare_differences++; $identical = 0; - print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" + # It is significant for two non-identical files to have identical SHA1 or MD5 hashes. + # Some kind of I/O error is more likely, so this message cannot be turned off. + # On the other hand, if we're skipping hashes, _all_ files will have the same hash, + # so the warning in that case is quite silly. Hmmm. + print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes; } else { $identical = 1; } $compare_count++; + $compare_bytes += $incumbent_size; } }; if ($@) { @@ -430,8 +466,8 @@ link_start: $hard_links++; if ($to_nlink == 1) { - $files_recovered++; - $bytes_recovered += $incumbent_size; + $recovered_files++; + $recovered_bytes += $incumbent_size; } # FIXME: Now we're really confused for some reason. @@ -444,8 +480,8 @@ link_start: # tried all possible ways to hardlink the file out of existence first; # however, that is complex and only benefits a silly statistic. if ($to_nlink == 1 || $from_nlink == 1) { - $files_lost++; - $bytes_lost += $incumbent_size; + $lost_files++; + $lost_bytes += $incumbent_size; } } } @@ -478,11 +514,11 @@ while () { unless (! (-l $name) && (-f _)) { warn "Bogon file " . tick_quote($name); - $bogons_input++; + $input_bogons++; next; } - $links_input++; + $input_links++; merge_files if $weak_key ne $current_key; $current_key = $weak_key; @@ -493,25 +529,31 @@ while () { merge_files; -print STDERR <