use File::Compare;
use File::Temp;
-# Copyright (C) 2002-2010 Zygo Blaxell <faster-dupemerge@mailtoo.hungrycats.org>
+# Copyright (C) 2002-2012 Zygo Blaxell <faster-dupemerge@mailtoo.hungrycats.org>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
my $surprises = 0;
eval '
- use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+ use Digest::SHA qw(sha1 sha1_hex sha1_base64);
';
if ($@) {
- warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+ warn "Digest::SHA: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA";
eval <<'DIGEST';
sub really_digest {
sub really_digest {
my ($filename) = (@_);
die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
- my $ctx = Digest::SHA1->new;
+ my $ctx = Digest::SHA->new;
sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
binmode(FILE); # FIXME: Necessary? Probably harmless...
$ctx->addfile(\*FILE);
# Command-line option state.  Every flag starts out off (0).
my $collapse_timestamp = 0;	# presumably set by --timestamps; the assignment is not visible here -- TODO confirm
my $collapse_zero = 0;	# set to 1 by --zeros
my $skip_compares = 0;	# set by --trust / --skip-compare, then re-derived for each input line read from FIND
+my $skip_compare_preference = 0;	# remembers the --trust / --skip-compare choice so $skip_compares can be restored per file
my $skip_hashes = 0;	# recomputed for each input line from the file size and $skip_hashes_threshold
my $skip_hashes_threshold = 0;	# size in bytes from --skip-hash[=N]; 0 means no threshold is applied
my $progress = 0;	# set to 1 by --progress; enables progress output on STDERR
S, s - lstat(2) (see source for details)
U - unlink(2)
. - all inodes with similar attributes done
+ (123456) - current file size in bytes
--sort pass next options (up to --) to sort command
--timestamps mtime may be different for identical files
- --skip-compare skip byte-by-byte file comparisons
+ --skip-compare skip byte-by-byte file comparisons,
+ compare only file hashes
--skip-hash[=N] skip calculation of hash function on files
larger than N bytes (default 1M).
--verbose report files as they are considered
--zeros hard-link zero-length files too
+
+--skip-compare and --skip-hash can be combined, in which case a file is
+either hashed (if it is below the --skip-hash size threshold) or compared
+(if it is above), but never both.
+
+Version: 0.20120914
USAGE
}
} elsif ($arg eq '--zeros') {
$collapse_zero = 1;
} elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
- $skip_compares = 1;
+ $skip_compares = $skip_compare_preference = 1;
} elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) {
my ($quantity, $unit) = ($1, $2);
$unit ||= '_';
t => 1000*1000*1000*1000,
T => 1024*1024*1024*1024,
);
- $skip_hashes = $skip_hashes_threshold = $quantity * $scale{$unit};
+ $skip_hashes = 0;
+ $skip_hashes_threshold = $quantity * $scale{$unit};
} elsif ($arg eq '--progress') {
$progress = 1;
} elsif ($arg eq '--verbose') {
}
}
-if ($skip_hashes && $skip_compares) {
- die "Cannot skip both hashes and compares.\n";
-}
-
-@directories or usage;
+@directories or usage($0);
if (defined($lock_file) && !$dry_run) {
sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) {
print STDERR "\t\tDigesting file $filename\n" if $debug;
if ((-l $filename) || ! -f _) {
+ print STDERR "\n" if $progress;
warn "Bogon file " . tick_quote($filename);
$input_bogons++;
delete $inode_to_file_name{$candidate}->{$filename};
}
if (format_inode($incumbent_dev, $incumbent_ino) ne $incumbent) {
- warn "$incumbent_file: expected inode $incumbent, found $incumbent_dev:$incumbent_ino";
+ warn "$incumbent_file: expected inode $incumbent, found ".format_inode($incumbent_dev, $incumbent_ino);
$surprises++;
next incumbent_file;
}
}
if (format_inode($candidate_dev, $candidate_ino) ne $candidate) {
- warn "$candidate_file: expected inode $candidate, found $candidate_dev:$candidate_ino";
+ warn "$candidate_file: expected inode $candidate, found ".format_inode($candidate_dev, $candidate_ino);
$surprises++;
next candidate_file;
}
if ($link_done) {
delete $inode_to_file_name{$to_inode}->{$to_file};
- $inode_to_file_name{$from_inode}->{$to_file} = undef;
+ $inode_to_file_name{$from_inode}->{$to_file} = undef unless ($dry_run);
$hash_to_inode{$digest} = [ $from_inode ];
$hard_links++;
undef %inode_to_file_name;
}
+my $last_time = 0;
+my $last_size = 0;
+
while (<FIND>) {
my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so;
die "read error: $!\nLast input line was '$_'" unless defined($name);
print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
- $skip_hashes = $size >= $skip_hashes_threshold;
+ if ($skip_hashes_threshold && $size >= $skip_hashes_threshold) {
+ $skip_hashes = 1;
+ $skip_compares = 0;
+ } else {
+ $skip_hashes = 0;
+ $skip_compares = $skip_compare_preference;
+ }
+
+ if ($progress) {
+ my $this_time = time();
+ if ($this_time != $last_time && $size != $last_size) {
+ $last_time = $this_time;
+ $last_size = $size;
+ print STDERR "($size)";
+ }
+ }
$input_links++;
merge_files if $weak_key ne $current_key;
# space_numbers($num)
#
# Returns $num with its digits grouped in threes, separated by single
# spaces, and left-padded with spaces to $max_num_len characters
# ($max_num_len is defined outside this block).
#
# The substitution is repeated until it no longer matches; each pass
# splits one more three-digit group off the leading run of digits.  The
# groups that were already separated must be carried over in $3.  A
# quantified capture group such as "( \d\d\d)*" keeps only its last
# repetition, so the added line wraps the repetition in a single capture,
# "((?: \d\d\d)*)", which preserves every trailing group.
sub space_numbers {
my ($num) = @_;
- 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os;
+ 1 while $num =~ s/(\d)(\d\d\d)((?: \d\d\d)*)$/$1 $2$3/os;
# "x" binds tighter than ".", so this is (padding) . $num; a count of
# zero or less yields an empty padding string.
$num = ' ' x ($max_num_len - length($num)) . $num;
return $num;
}