X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?p=dupemerge;a=blobdiff_plain;f=faster-dupemerge;h=62f4bc99c47b401bff653a252366eb9ef11e28e2;hp=9c159eec69ac48863a4e392a258ba767822bab81;hb=4f5affec900edd9b4b9acc4c390bfed4c91fb248;hpb=e92b42b97ce1be0e2fc8accc46a2a944db923a2f diff --git a/faster-dupemerge b/faster-dupemerge index 9c159ee..62f4bc9 100755 --- a/faster-dupemerge +++ b/faster-dupemerge @@ -4,7 +4,7 @@ use Fcntl qw(:DEFAULT :flock); use File::Compare; use File::Temp; -# Copyright (C) 2002-2010 Zygo Blaxell +# Copyright (C) 2002-2012 Zygo Blaxell # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -42,11 +42,11 @@ my $lost_bytes = 0; my $surprises = 0; eval ' - use Digest::SHA1 qw(sha1 sha1_hex sha1_base64); + use Digest::SHA qw(sha1 sha1_hex sha1_base64); '; if ($@) { - warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)"; + warn "Digest::SHA: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA"; eval <<'DIGEST'; sub really_digest { @@ -72,7 +72,7 @@ DIGEST sub really_digest { my ($filename) = (@_); die "'$filename' is not a plain file" if (-l $filename) || ! (-f _); - my $ctx = Digest::SHA1->new; + my $ctx = Digest::SHA->new; sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!"; binmode(FILE); # FIXME: Necessary? Probably harmless... $ctx->addfile(\*FILE); @@ -86,7 +86,9 @@ my $collapse_access = 0; my $collapse_timestamp = 0; my $collapse_zero = 0; my $skip_compares = 0; +my $skip_compare_preference = 0; my $skip_hashes = 0; +my $skip_hashes_threshold = 0; my $progress = 0; my $verbose = 0; my $debug = 0; @@ -153,14 +155,19 @@ hard links). S, s - lstat(2) (see source for details) U - unlink(2) . - all inodes with similar attributes done + (123456) - current file size in bytes --sort pass next options (up to --) to sort command --timestamps mtime may be different for identical files - --skip-compare skip byte-by-byte file comparisons + --skip-compare skip byte-by-byte file comparisons, + compare only file hashes - --skip-hash skip calculation of hash function on files + --skip-hash[=N] skip calculation of hash function on files + larger than N bytes (default 1M). + Scalars KMGT specify KiB, MiB, GiB, and TiB. + Scalars kmgt specify KB, MB, GB, and TB. --trust old name for --skip-compare (trust the hash function) @@ -168,6 +175,12 @@ hard links). --verbose report files as they are considered --zeros hard-link zero-length files too + +--skip-compare and --skip-hash can be combined, in which case a file is +either hashed (if it is below the --skip-hash size threshold) or compared +(if it is above), but never both. + +Version: 0.20120914 USAGE } @@ -180,9 +193,24 @@ while ($#ARGV >= 0) { } elsif ($arg eq '--zeros') { $collapse_zero = 1; } elsif ($arg eq '--trust' || $arg eq '--skip-compare') { - $skip_compares = 1; - } elsif ($arg eq '--skip-hash') { - $skip_hashes = 1; + $skip_compares = $skip_compare_preference = 1; + } elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) { + my ($quantity, $unit) = ($1, $2); + $unit ||= '_'; + $quantity ||= 1048576; + my %scale = ( + _ => 1, + k => 1000, + K => 1024, + m => 1000*1000, + M => 1024*1024, + g => 1000*1000*1000, + G => 1024*1024*1024, + t => 1000*1000*1000*1000, + T => 1024*1024*1024*1024, + ); + $skip_hashes = 0; + $skip_hashes_threshold = $quantity * $scale{$unit}; } elsif ($arg eq '--progress') { $progress = 1; } elsif ($arg eq '--verbose') { @@ -221,11 +249,7 @@ while ($#ARGV >= 0) { } } -if ($skip_hashes && $skip_compares) { - die "Cannot skip both hashes and compares.\n"; -} - -@directories or usage; +@directories or usage($0); if (defined($lock_file) && !$dry_run) { sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!"; @@ -339,6 +363,7 @@ hash_file: foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) { print STDERR "\t\tDigesting file $filename\n" if $debug; if ((-l $filename) || ! -f _) { + print STDERR "\n" if $progress; warn "Bogon file " . tick_quote($filename); $input_bogons++; delete $inode_to_file_name{$candidate}->{$filename}; @@ -380,14 +405,14 @@ incumbent_file: my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file); print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug; - if (!defined($incumbent_blocks)) { + if (!defined($incumbent_blocks) || ! (-f _)) { warn "lstat: $incumbent_file: $!"; $surprises++; next incumbent_file; } if (format_inode($incumbent_dev, $incumbent_ino) ne $incumbent) { - warn "$incumbent_file: expected inode $incumbent, found $incumbent_dev:$incumbent_ino"; + warn "$incumbent_file: expected inode $incumbent, found ".format_inode($incumbent_dev, $incumbent_ino); $surprises++; next incumbent_file; } @@ -401,14 +426,14 @@ candidate_file: my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file); print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug; - if (!defined($candidate_blocks)) { + if (!defined($candidate_blocks) || ! (-f _)) { warn "lstat: $candidate_file: $!"; $surprises++; next candidate_file; } if (format_inode($candidate_dev, $candidate_ino) ne $candidate) { - warn "$candidate_file: expected inode $candidate, found $candidate_dev:$candidate_ino"; + warn "$candidate_file: expected inode $candidate, found ".format_inode($candidate_dev, $candidate_ino); $surprises++; next candidate_file; } @@ -524,7 +549,7 @@ candidate_file: if ($link_done) { delete $inode_to_file_name{$to_inode}->{$to_file}; - $inode_to_file_name{$from_inode}->{$to_file} = undef; + $inode_to_file_name{$from_inode}->{$to_file} = undef unless ($dry_run); $hash_to_inode{$digest} = [ $from_inode ]; $hard_links++; @@ -571,14 +596,34 @@ end_merge: undef %inode_to_file_name; } +my $last_time = 0; +my $last_size = 0; + while () { - my ($weak_key, $dev, $ino, $name) = m/^(\d+ \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so; + my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so; die "read error: $!\nLast input line was '$_'" unless defined($name); my $inode = format_inode($dev, $ino); print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug; + if ($skip_hashes_threshold && $size >= $skip_hashes_threshold) { + $skip_hashes = 1; + $skip_compares = 0; + } else { + $skip_hashes = 0; + $skip_compares = $skip_compare_preference; + } + + if ($progress) { + my $this_time = time(); + if ($this_time != $last_time && $size != $last_size) { + $last_time = $this_time; + $last_size = $size; + print STDERR "($size)"; + } + } + $input_links++; merge_files if $weak_key ne $current_key; $current_key = $weak_key; @@ -627,7 +672,7 @@ if ($humane) { sub space_numbers { my ($num) = @_; - 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os; + 1 while $num =~ s/(\d)(\d\d\d)((?: \d\d\d)*)$/$1 $2$3/os; $num = ' ' x ($max_num_len - length($num)) . $num; return $num; }