git-svn-id: svn+ssh://svn.furryterror.org/r/trunk/mokona/zblaxell@3225
a5e33b96-951a-0410-ae88-
c0fe16d076bb
Conflicts:
faster-dupemerge
+# $Id$
+
+# Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
+# Use, modification, and distribution permitted
+# under the terms of the GNU GPL.
+
use strict;
use Fcntl qw(:DEFAULT :flock);
use File::Compare;
use File::Temp;
use strict;
use Fcntl qw(:DEFAULT :flock);
use File::Compare;
use File::Temp;
-my $links_input = 0;
-my $files_input = 0;
-my $bogons_input = 0;
-my $files_hashed = 0;
+my $input_links = 0;
+my $input_files = 0;
+my $input_bogons = 0;
+my $hash_bytes = 0;
+my $hash_files = 0;
my $compare_count = 0;
my $compare_errors = 0;
my $compare_differences = 0;
my $compare_count = 0;
my $compare_errors = 0;
my $compare_differences = 0;
my $hard_links = 0;
my $link_errors = 0;
my $link_retries = 0;
my $hard_links = 0;
my $link_errors = 0;
my $link_retries = 0;
-my $bytes_recovered = 0;
-my $files_recovered = 0;
-my $files_lost = 0;
-my $bytes_lost = 0;
+my $recovered_bytes = 0;
+my $recovered_files = 0;
+my $lost_files = 0;
+my $lost_bytes = 0;
my $surprises = 0;
eval '
my $surprises = 0;
eval '
warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
eval <<'DIGEST';
warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
eval <<'DIGEST';
my ($filename) = (@_);
my $fv = open(MD5SUM, "-|");
die "fork: $!" unless defined($fv);
my ($filename) = (@_);
my $fv = open(MD5SUM, "-|");
die "fork: $!" unless defined($fv);
DIGEST
} else {
eval <<'DIGEST';
DIGEST
} else {
eval <<'DIGEST';
my ($filename) = (@_);
die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
my $ctx = Digest::SHA1->new;
my ($filename) = (@_);
die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
my $ctx = Digest::SHA1->new;
my $collapse_access = 0;
my $collapse_timestamp = 0;
my $collapse_zero = 0;
my $collapse_access = 0;
my $collapse_timestamp = 0;
my $collapse_zero = 0;
+my $skip_compares = 0;
+my $skip_hashes = 0;
my $verbose = 0;
my $debug = 0;
my @extra_find_opts = ();
my $verbose = 0;
my $debug = 0;
my @extra_find_opts = ();
my $lock_rm = 0;
my $lock_obtained = 0;
my $lock_rm = 0;
my $lock_obtained = 0;
+sub digest {
+ my ($filename) = (@_);
+ if ($skip_hashes) {
+ return "SKIPPING HASHES";
+ } else {
+ &really_digest($filename);
+ $hash_bytes += -s $filename;
+ $hash_files++;
+ }
+}
+
my @directories;
sub usage {
my @directories;
sub usage {
--timestamps mtime may be different for identical files
--timestamps mtime may be different for identical files
- --trust skip byte-by-byte file comparisons
+ --skip-compare skip byte-by-byte file comparisons
+
+ --skip-hash skip calculation of hash function on files
+
+ --trust old name for --skip-compare
(trust the hash function)
--verbose report files as they are considered
(trust the hash function)
--verbose report files as they are considered
$collapse_timestamp = 1;
} elsif ($arg eq '--zeros') {
$collapse_zero = 1;
$collapse_timestamp = 1;
} elsif ($arg eq '--zeros') {
$collapse_zero = 1;
- } elsif ($arg eq '--trust') {
- $trust_hashes = 1;
+ } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
+ $skip_compares = 1;
+ } elsif ($arg eq '--skip-hash') {
+ $skip_hashes = 1;
} elsif ($arg eq '--verbose') {
$verbose = 1;
} elsif ($arg eq '--lock-rm') {
} elsif ($arg eq '--verbose') {
$verbose = 1;
} elsif ($arg eq '--lock-rm') {
+if ($skip_hashes && $skip_compares) {
+ die "Cannot skip both hashes and compares.\n";
+}
+
if (defined($lock_file)) {
sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
if (defined($lock_file)) {
sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
my %stop_loop;
my @candidate_list = keys(%inode_to_file_name);
my %stop_loop;
my @candidate_list = keys(%inode_to_file_name);
- $files_input += @candidate_list;
+ $input_files += @candidate_list;
if (@candidate_list < 2) {
print STDERR "Merging...only one candidate to merge..." if $debug;
$trivially_unique++;
if (@candidate_list < 2) {
print STDERR "Merging...only one candidate to merge..." if $debug;
$trivially_unique++;
- eval { $digest = digest($filename); };
+ eval {
+ $digest = digest($filename);
+ };
if ($@) {
warn "Digest($filename)(#$candidate) failed: $@";
$hash_errors++;
} else {
$ok = 1;
if ($@) {
warn "Digest($filename)(#$candidate) failed: $@";
$hash_errors++;
} else {
$ok = 1;
print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
foreach my $incumbent_file (@incumbent_names) {
my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
foreach my $incumbent_file (@incumbent_names) {
my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
my $at_least_one_link_done = 0;
my $at_least_one_link_done = 0;
foreach my $candidate_file (@candidate_names) {
my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
foreach my $candidate_file (@candidate_names) {
my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
- if ($trust_hashes) {
- print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+ if ($skip_compares) {
+ print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
$identical = 1;
} else {
my $quoted_incumbent_file = tick_quote($incumbent_file);
my $quoted_candidate_file = tick_quote($candidate_file);
$identical = 1;
} else {
my $quoted_incumbent_file = tick_quote($incumbent_file);
my $quoted_candidate_file = tick_quote($candidate_file);
- print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+ print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
if (compare($incumbent_file, $candidate_file)) {
$compare_differences++;
$identical = 0;
if (compare($incumbent_file, $candidate_file)) {
$compare_differences++;
$identical = 0;
- print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+ # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+ # Some kind of I/O error is more likely, so this message cannot be turned off.
+ # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+ # so the warning in that case is quite silly. Hmmm.
+ print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
} else {
$identical = 1;
}
$compare_count++;
} else {
$identical = 1;
}
$compare_count++;
+ $compare_bytes += $incumbent_size;
$hard_links++;
if ($to_nlink == 1) {
$hard_links++;
if ($to_nlink == 1) {
- $files_recovered++;
- $bytes_recovered += $incumbent_size;
+ $recovered_files++;
+ $recovered_bytes += $incumbent_size;
}
# FIXME: Now we're really confused for some reason.
}
# FIXME: Now we're really confused for some reason.
# tried all possible ways to hardlink the file out of existence first;
# however, that is complex and only benefits a silly statistic.
if ($to_nlink == 1 || $from_nlink == 1) {
# tried all possible ways to hardlink the file out of existence first;
# however, that is complex and only benefits a silly statistic.
if ($to_nlink == 1 || $from_nlink == 1) {
- $files_lost++;
- $bytes_lost += $incumbent_size;
+ $lost_files++;
+ $lost_bytes += $incumbent_size;
unless (! (-l $name) && (-f _)) {
warn "Bogon file " . tick_quote($name);
unless (! (-l $name) && (-f _)) {
warn "Bogon file " . tick_quote($name);
merge_files if $weak_key ne $current_key;
$current_key = $weak_key;
merge_files if $weak_key ne $current_key;
$current_key = $weak_key;
-print STDERR <<STATS;
-links_input $links_input
-files_input ........... $files_input
-bogons_input $bogons_input
-merges_attempted ...... $merges_attempted
-trivially_unique $trivially_unique
-files_hashed .......... $files_hashed
-hash_errors $hash_errors
-surprises ............. $surprises
+my $stats_blob = <<STATS;
+compare_bytes $compare_bytes
compare_count $compare_count
compare_count $compare_count
-compare_differences ... $compare_differences
+compare_differences $compare_differences
compare_errors $compare_errors
compare_errors $compare_errors
-hard_links ............ $hard_links
+hard_links $hard_links
+hash_bytes $hash_bytes
+hash_errors $hash_errors
+hash_files $hash_files
+input_bogons $input_bogons
+input_files $input_files
+input_links $input_links
-link_retries .......... $link_retries
-bytes_recovered $bytes_recovered
-files_recovered ....... $files_recovered
-bytes_lost $bytes_lost
-files_lost ............ $files_lost
+link_retries $link_retries
+lost_bytes $lost_bytes
+lost_files $lost_files
+merges_attempted $merges_attempted
+recovered_bytes $recovered_bytes
+recovered_files $recovered_files
+surprises $surprises
+trivially_unique $trivially_unique
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
+
+print STDERR $stats_blob;
+