X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=faster-dupemerge;h=14360dc8c802d83c89132fe5780b7070fd0896f0;hb=a433cb10cd4e633289a448649f9509e1b24d9c7e;hp=fe433f79283d5646cb5c3781aae75505f6710940;hpb=2dbbadc3d2baeaf6ea7138ba88800499f0ce603a;p=dupemerge diff --git a/faster-dupemerge b/faster-dupemerge index fe433f7..14360dc 100755 --- a/faster-dupemerge +++ b/faster-dupemerge @@ -4,7 +4,7 @@ use Fcntl qw(:DEFAULT :flock); use File::Compare; use File::Temp; -# Copyright (C) 2003-2010 Zygo Blaxell +# Copyright (C) 2002-2010 Zygo Blaxell # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -87,6 +87,7 @@ my $collapse_timestamp = 0; my $collapse_zero = 0; my $skip_compares = 0; my $skip_hashes = 0; +my $skip_hashes_threshold = 0; my $progress = 0; my $verbose = 0; my $debug = 0; @@ -160,7 +161,10 @@ hard links). --skip-compare skip byte-by-byte file comparisons - --skip-hash skip calculation of hash function on files + --skip-hash[=N] skip calculation of hash function on files + larger than N bytes (default 1M). + Scalars KMGT specify KiB, MiB, GiB, and TiB. + Scalars kmgt specify KB, MB, GB, and TB. 
--trust old name for --skip-compare (trust the hash function) @@ -181,8 +185,22 @@ while ($#ARGV >= 0) { $collapse_zero = 1; } elsif ($arg eq '--trust' || $arg eq '--skip-compare') { $skip_compares = 1; - } elsif ($arg eq '--skip-hash') { - $skip_hashes = 1; + } elsif ($arg =~ /^--skip-hash(?:=(\d+)([KkMmGgTt]?))?$/os) { + my ($quantity, $unit) = ($1, $2); + $unit ||= '_'; + $quantity ||= 1048576; + my %scale = ( + _ => 1, + k => 1000, + K => 1024, + m => 1000*1000, + M => 1024*1024, + g => 1000*1000*1000, + G => 1024*1024*1024, + t => 1000*1000*1000*1000, + T => 1024*1024*1024*1024, + ); + $skip_hashes = $skip_hashes_threshold = $quantity * $scale{$unit}; } elsif ($arg eq '--progress') { $progress = 1; } elsif ($arg eq '--verbose') { @@ -380,7 +398,7 @@ incumbent_file: my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file); print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug; - if (!defined($incumbent_blocks)) { + if (!defined($incumbent_blocks) || ! 
(-f _)) { warn "lstat: $incumbent_file: $!"; $surprises++; next incumbent_file; @@ -401,7 +419,7 @@ candidate_file: my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file); print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug; - if (!defined($candidate_blocks)) { + if (!defined($candidate_blocks) || ! (-f _)) { warn "lstat: $candidate_file: $!"; $surprises++; next candidate_file; @@ -572,13 +590,19 @@ end_merge: } while () { - my ($weak_key, $dev, $ino, $name) = m/^(\d+ \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so; + my ($weak_key, $size, $dev, $ino, $name) = m/^((\d+) \d+ \d+ \d+ -?[\d.]+) (\d+) (\d+) (.+)\0$/so; die "read error: $!\nLast input line was '$_'" unless defined($name); my $inode = format_inode($dev, $ino); print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug; $input_links++; merge_files if $weak_key ne $current_key; + # Set this only after merge_files above: a merge triggered by the key + # change presumably covers the files read earlier, so it must not see a + # decision based on this file's size. A threshold of 0 means --skip-hash + # was not given; without that check every size is >= 0 and hashing would + # always be skipped. + $skip_hashes = ($skip_hashes_threshold > 0 && $size >= $skip_hashes_threshold) ? 1 : 0; $current_key = $weak_key;