4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
9 use Fcntl qw(:DEFAULT :flock);
# Global statistics counters, reported in the summary blob at exit.
19 my $compare_bytes = 0;        # bytes accounted to byte-by-byte compares
20 my $compare_count = 0;        # number of compares performed -- incremented on elided lines
21 my $compare_errors = 0;       # compares that failed with an error -- incremented on elided lines
22 my $compare_differences = 0;  # compares that found the files actually differ
23 my $trivially_unique = 0;     # presumably keys with only one candidate inode -- TODO confirm (lines elided)
24 my $merges_attempted = 0;     # presumably merge passes started -- TODO confirm (lines elided)
28 my $recovered_bytes = 0;      # bytes freed when an inode's last link was replaced
29 my $recovered_files = 0;      # inodes freed by hardlinking duplicates
# Prefer Digest::SHA1 for hashing; if loading it failed ($@ set by an elided
# eval/require), warn and fall back to the external md5sum program.
35 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
39 warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
# Fallback hasher: fork an external md5sum process (open '-|') and parse the
# 32-hex-digit sum from its output.  Sub header/footer and the child's exec
# are elided in this excerpt.
43 my ($filename) = (@_);
44 my $fv = open(MD5SUM, "-|");  # fork: parent reads child's stdout via MD5SUM
45 die "fork: $!" unless defined($fv);
# Parent branch: read and validate one line of md5sum output.
47 my ($sum_line) = <MD5SUM>;
48 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49 die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
50 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
# NOTE(review): \Q...\E inside a plain double-quoted die string quotemeta's the
# interpolated text, so the message will contain backslashes -- verify intended.
51 die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
# Child branch: reopen STDIN onto the target file so md5sum reads it.
54 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
56 # Perl guarantees it will die here
56 # Perl guarantees it will die here
# Digest::SHA1-based hasher: return the base64 SHA-1 digest of $filename.
# Refuses symlinks and non-plain files; "-f _" reuses the stat from "-l".
63 my ($filename) = (@_);
64 die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65 my $ctx = Digest::SHA1->new;
# O_NONBLOCK keeps the open from hanging on a FIFO/device that races past
# the plain-file check above.
66 sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67 binmode(FILE); # FIXME: Necessary? Probably harmless...
68 $ctx->addfile(\*FILE);
69 close(FILE) or die "close: $filename: $!";
70 return $ctx->b64digest;
# Command-line option flags, set by the argument loop below.
75 my $collapse_access = 0;     # --access: ignore uid/gid/mode differences
76 my $collapse_timestamp = 0;  # --timestamps: ignore mtime differences
77 my $collapse_zero = 0;       # --zeros: also consider zero-length files
78 my $skip_compares = 0;       # --skip-compare/--trust: trust the hash, skip cmp
84 my @extra_find_opts = ();    # pass-through args for find(1)  (--find ... --)
85 my @extra_sort_opts = ();    # pass-through args for sort(1)  (--sort ... --)
88 my $lock_obtained = 0;       # set once --lock FILE is flock'd (set on elided lines)
# digest(): front end to really_digest(); honors --skip-hash and accounts
# hashed bytes.  Sub header/footer and the guard condition are elided.
91 my ($filename) = (@_);
# Under --skip-hash every file gets this same constant "digest", so the
# byte-by-byte compare stage does all the discrimination work.
93 return "SKIPPING HASHES";
95 my $digest = &really_digest($filename);
96 $hash_bytes += -s $filename;
105 my $name = shift(@_);
107 Usage: $name [--opts] directory [directory...]
108 Finds duplicate files in the given directories, and replaces all identical
109 copies of a file with hard-links to a single file.
111 Several options modify the definition of a "duplicate". By default, files
112 which have differences in owner uid or gid, permission (mode), or
113 modification time (mtime) are considered different, so that hardlinking
114 files does not also change their attributes. Additionally, all files of
115 zero size are ignored for performance reasons (there tend to be many
116 of them, and they tend not to release any space when replaced with
119 --access uid, gid, and mode may be different for identical
122 --debug show all steps in duplication discovery process
125 --dry-run do not lock files or make changes to filesystem
127 --find pass next options (up to --) to find command
129 --humane human-readable statistics (e.g. 1 048 576)
131 --lock FILE exit immediately (status 10) if unable to obtain a
132 flock(LOCK_EX|LOCK_NB) on FILE
134 --lock-rm remove lock file at exit
136 --sort pass next options (up to --) to sort command
138 --timestamps mtime may be different for identical files
140 --skip-compare skip byte-by-byte file comparisons
142 --skip-hash skip calculation of hash function on files
144 --trust old name for --skip-compare
145 (trust the hash function)
147 --verbose report files as they are considered
149 --zeros hard-link zero-length files too
# Parse command-line arguments.  Several branch bodies (and the unknown-option
# error path) are elided in this excerpt; anything not starting with '-' is
# collected as a directory to scan.
153 while ($#ARGV >= 0) {
154 my $arg = shift(@ARGV);
155 if ($arg eq '--access') {
156 $collapse_access = 1;
157 } elsif ($arg eq '--timestamps') {
158 $collapse_timestamp = 1;
159 } elsif ($arg eq '--zeros') {
161 } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
163 } elsif ($arg eq '--skip-hash') {
165 } elsif ($arg eq '--verbose') {
167 } elsif ($arg eq '--lock-rm') {
169 } elsif ($arg eq '--lock') {
170 $lock_file = shift(@ARGV);
171 unless (defined($lock_file)) {
175 } elsif ($arg eq '--debug') {
176 $debug = $verbose = 1;       # --debug implies --verbose
177 } elsif ($arg eq '--dry-run') {
179 } elsif ($arg eq '--humane') {
181 } elsif ($arg eq '--find') {
# Collect everything up to a literal '--' as extra find(1) options.
182 while ($#ARGV >= 0) {
183 my $extra_arg = shift(@ARGV);
184 last if $extra_arg eq '--';
185 push(@extra_find_opts, $extra_arg);
187 } elsif ($arg eq '--sort') {
# Likewise for sort(1).
188 while ($#ARGV >= 0) {
189 my $extra_arg = shift(@ARGV);
190 last if $extra_arg eq '--';
191 push(@extra_sort_opts, $extra_arg);
193 } elsif ($arg =~ /^-/o) {
197 push(@directories, $arg);
# Sanity check: with both hashes and compares skipped there is no way left
# to tell files apart.
201 if ($skip_hashes && $skip_compares) {
202 die "Cannot skip both hashes and compares.\n";
205 @directories or usage;
# --lock: take an exclusive, non-blocking flock on the lock file so only one
# instance runs at a time.  Skipped entirely during --dry-run.
207 if (defined($lock_file) && !$dry_run) {
208 sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
209 flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
210 print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
# Cleanup path (presumably run at exit when --lock-rm was given -- surrounding
# lines elided): remove the lock file only if we are the one who locked it.
215 if ($lock_obtained && !$dry_run) {
216 print STDERR "Removing '$lock_file'.\n" if $verbose;
217 unlink($lock_file) or warn "unlink: $lock_file: $!";
# Escape $text for interpolation between single quotes in a shell command:
# each embedded ' becomes '\'' (close quote, escaped quote, reopen quote).
# Bug fix: the replacement needs a doubled backslash so Perl emits a literal
# backslash; with a single one, "\'" interpolates to just "'" and the
# resulting shell quoting silently drops the quote character.
223 $text =~ s/'/'\\''/go;
# Build the find(1) command: list plain files under the given directories,
# printing a sortable "weak key" (size, then uid/gid/mode and mtime -- each
# collapsed to 0 when --access/--timestamps says to ignore it), followed by
# inode and name (trailing printf fields elided).
227 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
228 my $printf_string = '%s ' .
229 ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
230 ($collapse_timestamp ? '0 ' : '%T@ ') .
233 push(@find_command, '!', '-empty') unless $collapse_zero;
234 push(@find_command, '-printf', $printf_string);
# Sort NUL-terminated records numerically, descending, so records with equal
# weak keys end up adjacent (largest files first).
236 my @sort_command = ('sort', '-znr', @extra_sort_opts);
237 my @quoted_sort_command = @sort_command;
# NOTE(review): grep() in void context for its side effect -- this quotes the
# array elements only if tick_quote() modifies its argument through @_
# aliasing; verify against tick_quote's definition (header elided here).
238 grep(tick_quote($_), @quoted_sort_command);
239 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
241 my @quoted_find_command = @find_command;
242 grep(tick_quote($_), @quoted_find_command);
243 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
244 print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
# The find|sort pipeline feeds the main input loop below.
246 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
249 # Input is sorted so that all weak keys are contiguous.
250 # When the key changes, we have to process all files we previously know about.
251 my $current_key = -1;
253 # $inode_to_file_name{$inode} = [@file_names]
# NOTE(review): despite the comment above, names are stored as hash keys
# ($inode_to_file_name{$inode}{$name} = undef), i.e. a set, not an arrayref.
254 my %inode_to_file_name = ();
# link_files($from, $to): atomically replace $to with a hard link to $from.
# The link is created first under a temporary dot-name in $to's directory,
# then rename()d over $to, so $to never ceases to exist.
258 my ($from, $to) = (@_);
260 my $quoted_from = tick_quote($from);
261 my $quoted_to = tick_quote($to);
262 print STDERR "ln -f $quoted_from $quoted_to\n";
# Split $to into directory and basename to build the temp-link name
# ($inode_dir is initialized from $to on an elided line).
267 my $inode_base = $to;
268 $inode_dir =~ s:[^/]*$::o;
269 $inode_base =~ s:^.*/::os;
270 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
271 print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
272 link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
273 print STDERR "\trename: $tmp_to -> $to\n" if $debug;
# If the rename fails, try to remove the temporary link, then die with the
# saved errno ($saved_bang captures $! from rename on an elided line, so the
# unlink cannot clobber it).
274 unless (rename($tmp_to, $to)) {
276 unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
# Bug fix: the failed rename's target is $to, not $from -- report the
# correct path in the error message.
277 die "rename: $tmp_to -> $to: $saved_bang";
281 # Process all known files so far.
286 # Used to stop link retry loops (there is a goto in here! Actually two...)
# One merge pass: digest every file of every candidate inode collected under
# the current weak key, then link inodes whose digests (and contents) match.
289 my @candidate_list = keys(%inode_to_file_name);
# NOTE(review): this adds the number of candidate *inodes*, not file names,
# to $input_files -- confirm that is the intended meaning of the statistic.
290 $input_files += @candidate_list;
# A key with fewer than two inodes cannot contain duplicates.
291 if (@candidate_list < 2) {
292 print STDERR "Merging...only one candidate to merge..." if $debug;
297 print STDERR "Merging...\n" if $debug;
298 foreach my $candidate (@candidate_list) {
299 print STDERR "\tDigesting candidate $candidate\n" if $debug;
305 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
306 print STDERR "\t\tDigesting file $filename\n" if $debug;
# Skip anything that is no longer a plain file (may have changed since scan).
307 if ((-l $filename) || ! -f _) {
308 warn "Bogon file " . tick_quote($filename);
# Digest failures are non-fatal: warn and move on (guard lines elided).
313 $digest = digest($filename);
316 warn "Digest($filename)(#$candidate) failed: $@";
324 print STDERR "\t\tDigest is $digest\n" if $debug;
# If another inode (the "incumbent") already produced this digest, the two
# are potential duplicates: their file names are validated and linked below.
326 my $incumbent = $hash_to_inode{$digest};
327 if (defined($incumbent)) {
328 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
335 my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
336 my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
337 print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
338 print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
# For each name believed to link to the incumbent inode, lstat it and verify
# it still exists and still refers to that inode (skip handling elided).
342 foreach my $incumbent_file (@incumbent_names) {
343 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
344 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
# $incumbent_blocks undefined means the lstat itself failed.
346 if (!defined($incumbent_blocks)) {
347 warn "lstat: $incumbent_file: $!";
# The name may have been replaced since we scanned it.
352 if ($incumbent_ino != $incumbent) {
353 warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
358 my $at_least_one_link_done = 0;
# Same validation for every name believed to link to the candidate inode.
362 foreach my $candidate_file (@candidate_names) {
363 my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
364 print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
366 if (!defined($candidate_blocks)) {
367 warn "lstat: $candidate_file: $!";
372 if ($candidate_ino != $candidate) {
373 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
# Equal digests but unequal sizes is a contradiction worth reporting.
378 if ($candidate_size != $incumbent_size) {
379 warn "$candidate_file, $incumbent_file: file sizes are different";
# --skip-compare/--trust: accept the hash match without reading either file.
387 if ($skip_compares) {
388 print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
391 my $quoted_incumbent_file = tick_quote($incumbent_file);
392 my $quoted_candidate_file = tick_quote($candidate_file);
393 print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
# compare() (presumably File::Compare -- import elided) is nonzero when the
# files differ or an error occurred; a true hash collision is reported loudly.
394 if (compare($incumbent_file, $candidate_file)) {
395 $compare_differences++;
397 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
398 # Some kind of I/O error is more likely, so this message cannot be turned off.
399 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
400 # so the warning in that case is quite silly. Hmmm.
401 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
# Account the bytes read by a compare that ran to completion.
406 $compare_bytes += $incumbent_size;
416 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
418 # We have to do this to break out of a possible infinite loop.
419 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
420 # such that A1 and B1 are in non-writable directories, we will loop
421 # forever hardlinking A2 with A and B.
422 # To break the loop, we never attempt to hardlink any files X and Y twice.
# %stop_loop records both orderings of the (X, Y) pair.
424 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
425 defined($stop_loop{$candidate_file}->{$incumbent_file})) {
426 print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
428 $stop_loop{$incumbent_file}->{$candidate_file} = 1;
429 $stop_loop{$candidate_file}->{$incumbent_file} = 1;
# Choose the link direction: the inode with more existing links becomes the
# source ($from), so the other inode loses a link and can eventually be
# freed.  (The "} else {" line of this branch pair is elided.)
433 my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
434 if ($candidate_nlink > $incumbent_nlink) {
435 $from_file = $candidate_file;
436 $to_file = $incumbent_file;
437 $from_inode = $candidate;
438 $to_inode = $incumbent;
439 $from_nlink = $candidate_nlink;
440 $to_nlink = $incumbent_nlink;
442 $to_file = $candidate_file;
443 $from_file = $incumbent_file;
444 $to_inode = $candidate;
445 $from_inode = $incumbent;
446 $to_nlink = $candidate_nlink;
447 $from_nlink = $incumbent_nlink;
451 link_files($from_file, $to_file);
# On failure (failure detection elided), swap roles and try linking in the
# opposite direction before giving up.
459 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
463 ($from_file, $to_file) = ($to_file, $from_file);
464 ($from_inode, $to_inode) = ($to_inode, $from_inode);
465 ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
466 link_files($from_file, $to_file);
476 # Note since the files are presumably identical, they both have the same size.
477 # My random number generator chooses the incumbent's size.
480 # Since we're in a dry run, the filesystem doesn't change.
481 # Our notion of what the filesystem should look like should not change either.
# Keep the in-memory map consistent with the link just made (or, in a dry
# run, the link that would have been made): $to_file now names $from_inode.
482 delete $inode_to_file_name{$to_inode}->{$to_file};
484 $inode_to_file_name{$from_inode}->{$to_file} = undef;
485 $hash_to_inode{$digest} = $from_inode;
# If the replaced name was the last link to its inode, that inode's storage
# has been freed.
489 if ($to_nlink == 1) {
491 $recovered_bytes += $incumbent_size;
494 # FIXME: Now we're really confused for some reason.
495 # Start over to rebuild state.
498 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
500 # FIXME: This is a lame heuristic. We really need to know if we've
501 # tried all possible ways to hardlink the file out of existence first;
502 # however, that is complex and only benefits a silly statistic.
503 if ($to_nlink == 1 || $from_nlink == 1) {
505 $lost_bytes += $incumbent_size;
# No other inode has this digest yet: the candidate becomes the incumbent.
515 print STDERR "\t\tNew hash entered\n" if $debug;
516 $hash_to_inode{$digest} = $candidate;
# An inode whose every file failed to digest is dropped from consideration.
519 warn "No digests found for inode $candidate\n";
520 delete $inode_to_file_name{$candidate};
526 print STDERR "Merge done.\n" if $debug;
# Forget this key's inodes; the next weak key starts fresh.
527 undef %inode_to_file_name;
# Parse one NUL-terminated record from the find|sort pipe: five space-
# separated weak-key fields (size, uid, gid, mode, mtime -- the last may be
# negative), then inode number and file name.
531 my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
532 die "read error: $!\nLast input line was '$_'" unless defined($name);
534 print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
# Re-check the file type: it may have changed since find(1) saw it.
536 unless (! (-l $name) && (-f _)) {
537 warn "Bogon file " . tick_quote($name);
# Input is sorted by weak key, so a key change means every file with the
# previous key has been seen: merge that batch now.
543 merge_files if $weak_key ne $current_key;
544 $current_key = $weak_key;
# Record this name as one of $inode's links (names stored as hash keys).
546 $inode_to_file_name{$inode}->{$name} = undef;
548 print STDERR "$name\n" if $verbose;
553 my $stats_blob = <<STATS;
554 compare_bytes $compare_bytes
555 compare_count $compare_count
556 compare_differences $compare_differences
557 compare_errors $compare_errors
558 hard_links $hard_links
559 hash_bytes $hash_bytes
560 hash_errors $hash_errors
561 hash_files $hash_files
562 input_bogons $input_bogons
563 input_files $input_files
564 input_links $input_links
565 link_errors $link_errors
566 link_retries $link_retries
567 lost_bytes $lost_bytes
568 lost_files $lost_files
569 merges_attempted $merges_attempted
570 recovered_bytes $recovered_bytes
571 recovered_files $recovered_files
573 trivially_unique $trivially_unique
579 sub measure_numbers {
# Width of $num once thousands separators (spaces) are inserted: one extra
# column per complete group of three digits after the first ($num is
# unpacked from @_ on an elided line).  Tracks the global maximum.
581 my $len = length($num);
582 $len += int( (length($num) - 1) / 3);
583 $max_num_len = $len if $len > $max_num_len;
# First pass over a throwaway copy: find the widest formatted number.
586 (my $dummy = $stats_blob) =~ s/\d+/measure_numbers($&)/geos;
# space_numbers (sub header elided): insert a space before every trailing
# group of three digits, then left-pad to the common width so numbers
# line up right-justified.
590 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os;
591 $num = ' ' x ($max_num_len - length($num)) . $num;
# Second pass: rewrite the report with grouped, aligned numbers.
595 $stats_blob =~ s/\d+/space_numbers($&)/geos;
# Replace the run of padding spaces between label and value with a dotted
# leader on alternating lines, then emit the report.
598 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
600 print STDERR $stats_blob;