4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
9 use Fcntl qw(:DEFAULT :flock);
# Statistics accumulators for the report printed at exit.
# Declared together and zeroed in a single list assignment.
my (
    $compare_bytes,
    $compare_count,
    $compare_errors,
    $compare_differences,
    $trivially_unique,
    $merges_attempted,
) = (0) x 6;
# Running totals reported in the exit statistics.
my ($recovered_bytes, $recovered_files) = (0) x 2;
35 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
39 warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
# md5sum fallback (used when Digest::SHA1 is unavailable):
# open(HANDLE, "-|") forks; the child's STDOUT is piped back to the
# parent's MD5SUM handle.  Returns the child PID in the parent, 0 in
# the child, undef if the fork itself failed.
# NOTE(review): fragment of the external-hasher routine; the sub
# boundaries and the child's exec are outside this view.
my ($filename) = (@_);
my $fv = open(MD5SUM, "-|");
die "fork: $!" unless defined($fv);
# Parent side: read the single output line from md5sum, then reap the
# child; close() on a pipe handle waits for the child and sets $?
# from its exit status.
my ($sum_line) = <MD5SUM>;
close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
# An MD5 digest is exactly 32 hex digits at the start of the line.
my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
# NOTE(review): \Q...\E inside this double-quoted string quotemetas
# the interpolated line (backslash-escapes every metacharacter)
# rather than visually quoting it -- confirm that is the intended
# form of the error output.
die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
54 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
56 # Perl guarantees it will die here
# Compute the content hash of one plain file with Digest::SHA1 and
# return it base64-encoded.  Dies if the path is a symlink or not a
# plain file, or on any I/O error.
# NOTE(review): fragment -- the enclosing sub's header and closing
# brace are outside this view.
my ($filename) = (@_);
# -l does a fresh lstat; -f _ reuses that stat buffer, so a symlink
# (even one pointing at a plain file) is rejected here.
die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
my $ctx = Digest::SHA1->new;
# O_NONBLOCK keeps the open from blocking should the name now refer
# to a FIFO (guards the gap between the -f test and the open).
sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
binmode(FILE); # No-op on POSIX; matters only where CRLF translation exists.
$ctx->addfile(\*FILE);
close(FILE) or die "close: $filename: $!";
return $ctx->b64digest;
# Behavior flags, all off by default; turned on by the option parser.
my (
    $collapse_access,     # --access: ignore uid/gid/mode differences
    $collapse_timestamp,  # --timestamps: ignore mtime differences
    $collapse_zero,       # --zeros: also hard-link zero-length files
    $skip_compares,       # --skip-compare / --trust: no byte compares
) = (0) x 4;
# Pass-through argument lists for the external find(1)/sort(1)
# commands (filled by --find and --sort).  A bare "my" leaves the
# arrays empty, identical to assigning ().
my (@extra_find_opts, @extra_sort_opts);
86 my $lock_obtained = 0;
89 my ($filename) = (@_);
91 return "SKIPPING HASHES";
93 my $digest = &really_digest($filename);
94 $hash_bytes += -s $filename;
103 my $name = shift(@_);
105 Usage: $name [--opts] directory [directory...]
106 Finds duplicate files in the given directories, and replaces all identical
107 copies of a file with hard-links to a single file.
109 Several options modify the definition of a "duplicate". By default, files
110 which have differences in owner uid or gid, permission (mode), or
111 modification time (mtime) are considered different, so that hardlinking
112 files does not also change their attributes. Additionally, all files of
113 zero size are ignored for performance reasons (there tend to be many
114 of them, and they tend not to release any space when replaced with
117 --access uid, gid, and mode may be different for identical
120 --debug show all steps in duplication discovery process
123 --find pass next options (up to --) to find command
125 --lock FILE exit immediately (status 10) if unable to obtain a
126 flock(LOCK_EX|LOCK_NB) on FILE
128 --lock-rm remove lock file at exit
130 --sort pass next options (up to --) to sort command
132 --timestamps mtime may be different for identical files
134 --skip-compare skip byte-by-byte file comparisons
136 --skip-hash skip calculation of hash function on files
138 --trust old name for --skip-compare
139 (trust the hash function)
141 --verbose report files as they are considered
143 --zeros hard-link zero-length files too
# Hand-rolled command-line parser: consume @ARGV one token at a time.
# (Getopt::Long is the conventional tool, but this loop also handles
# the "-- terminated" pass-through lists for --find/--sort.)
# NOTE(review): fragment -- the rest of the elsif chain and the loop
# close continue outside this view.
while ($#ARGV >= 0) {
    my $arg = shift(@ARGV);
    if ($arg eq '--access') {
        $collapse_access = 1;       # uid/gid/mode may differ
    } elsif ($arg eq '--timestamps') {
        $collapse_timestamp = 1;    # mtime may differ
    } elsif ($arg eq '--zeros') {
155 } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
157 } elsif ($arg eq '--skip-hash') {
159 } elsif ($arg eq '--verbose') {
161 } elsif ($arg eq '--lock-rm') {
163 } elsif ($arg eq '--lock') {
164 $lock_file = shift(@ARGV);
165 unless (defined($lock_file)) {
169 } elsif ($arg eq '--debug') {
170 $debug = $verbose = 1;
# --find: everything up to a literal "--" (or the end of @ARGV) is
# forwarded verbatim to the find(1) command line.
} elsif ($arg eq '--find') {
    while ($#ARGV >= 0) {
        my $extra_arg = shift(@ARGV);
        last if $extra_arg eq '--';   # "--" ends the pass-through list
        push(@extra_find_opts, $extra_arg);
# --sort: same pass-through convention as --find, but the collected
# arguments go to the sort(1) command line.
} elsif ($arg eq '--sort') {
    while ($#ARGV >= 0) {
        my $extra_arg = shift(@ARGV);
        last if $extra_arg eq '--';   # "--" ends the pass-through list
        push(@extra_sort_opts, $extra_arg);
183 } elsif ($arg =~ /^-/o) {
187 push(@directories, $arg);
# Sanity check: with hashes skipped, every file gets the same dummy
# digest, so the byte-by-byte compare is the only remaining test;
# disabling both would merge everything under a key.  The trailing
# \n on die suppresses the "at FILE line N" suffix for this
# user-facing message.
if ($skip_hashes && $skip_compares) {
    die "Cannot skip both hashes and compares.\n";
# Optional advisory lock: create/open the lock file (mode 0666
# before umask) and take an exclusive, non-blocking flock.  The
# handle is deliberately left open so the lock is held until exit.
# NOTE(review): the usage text promises exit status 10 when the lock
# cannot be taken, but this die exits with Perl's normal error
# status -- confirm a wrapper outside this view maps it to 10.
if (defined($lock_file)) {
    sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
    flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
    print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
203 if ($lock_obtained) {
204 print STDERR "Removing '$lock_file'.\n" if $verbose;
205 unlink($lock_file) or warn "unlink: $lock_file: $!";
211 $text =~ s/'/'\''/go;
# Build the find(1) invocation: the caller-supplied directories plus
# any --find pass-through options, restricted to plain files.
my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
# -printf format producing the sortable "weak key": fields the user
# chose to collapse are emitted as a constant '0' so they compare
# equal for every file.  %s=size %U=uid %G=gid %m=mode %T@=mtime.
# NOTE(review): this concatenation continues on lines outside this
# view; the expression is incomplete as shown here.
my $printf_string = '%s ' .
    ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
    ($collapse_timestamp ? '0 ' : '%T@ ') .
221 push(@find_command, '!', '-empty') unless $collapse_zero;
222 push(@find_command, '-printf', $printf_string);
# sort(1) invocation: -z NUL-terminated records, -n numeric,
# -r reverse (largest keys first); user options are appended.
my @sort_command = ('sort', '-znr', @extra_sort_opts);
# Shell-quote a copy of the command for display and for the shell
# pipeline built later.
my @quoted_sort_command = @sort_command;
# NOTE(review): grep in void context for its side effect is an
# anti-pattern ("tick_quote($_) for @quoted_sort_command" is the
# conventional spelling).  It only quotes in place if tick_quote
# mutates its argument through @_ aliasing -- confirm against the
# (not fully visible) tick_quote definition.
grep(tick_quote($_), @quoted_sort_command);
my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
# Shell-quote a copy of the find command for display and for the
# shell pipeline built later.
my @quoted_find_command = @find_command;
# NOTE(review): void-context grep used for its side effect; this
# relies on tick_quote mutating its argument via @_ aliasing --
# confirm, and prefer "tick_quote($_) for @quoted_find_command".
grep(tick_quote($_), @quoted_find_command);
my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
# Launch the producer pipeline.  This is a 2-arg shell open; the
# command strings were built with tick_quote, which is the only
# defense against shell metacharacters in directory names --
# security-sensitive.  (List-form open cannot express a two-stage
# pipeline without an explicit fork, which is presumably why a
# shell is used here.)
open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
237 # Input is sorted so that all weak keys are contiguous.
238 # When the key changes, we have to process all files we previously know about.
239 my $current_key = -1;
241 # $inode_to_file_name{$inode} = [@file_names]
242 my %inode_to_file_name = ();
246 my ($from, $to) = (@_);
248 my $inode_base = $to;
249 $inode_dir =~ s:[^/]*$::o;
250 $inode_base =~ s:^.*/::os;
251 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
# Replace $to with a hard link to $from without ever leaving $to
# missing: first link $from to a temporary name in $to's directory,
# then rename() the temp over $to (rename is atomic on POSIX).
# NOTE(review): fragment of link_files -- the temp-name setup above
# and the rename-failure recovery below are outside this view.
my $quoted_from = tick_quote($from);
my $quoted_to = tick_quote($to);
# The logical operation is always announced (not gated on $debug).
print STDERR "ln -f $quoted_from $quoted_to\n";
print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
print STDERR "\trename: $tmp_to -> $to\n" if $debug;
unless (rename($tmp_to, $to)) {
260 unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
261 die "rename: $tmp_to -> $from: $saved_bang";
265 # Process all known files so far.
270 # Used to stop link retry loops (there is a goto in here! Actually two...)
# One merge pass over everything collected for the current weak key.
my @candidate_list = keys(%inode_to_file_name);
# NOTE(review): despite the name, this adds the count of distinct
# inodes (one hash entry per inode), not of file names -- confirm
# that is what the input_files statistic is meant to report.
$input_files += @candidate_list;
# A single inode under this key cannot be a duplicate of anything.
# (fragment: the remainder of this branch is outside this view)
if (@candidate_list < 2) {
    print STDERR "Merging...only one candidate to merge..." if $debug;
281 print STDERR "Merging...\n" if $debug;
282 foreach my $candidate (@candidate_list) {
283 print STDERR "\tDigesting candidate $candidate\n" if $debug;
289 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
290 print STDERR "\t\tDigesting file $filename\n" if $debug;
291 if ((-l $filename) || ! -f _) {
292 warn "Bogon file " . tick_quote($filename);
297 $digest = digest($filename);
300 warn "Digest($filename)(#$candidate) failed: $@";
308 print STDERR "\t\tDigest is $digest\n" if $debug;
310 my $incumbent = $hash_to_inode{$digest};
311 if (defined($incumbent)) {
312 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
319 my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
320 my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
321 print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
322 print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
326 foreach my $incumbent_file (@incumbent_names) {
327 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
328 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
330 if (!defined($incumbent_blocks)) {
331 warn "lstat: $incumbent_file: $!";
336 if ($incumbent_ino != $incumbent) {
337 warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
342 my $at_least_one_link_done = 0;
346 foreach my $candidate_file (@candidate_names) {
347 my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
348 print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
350 if (!defined($candidate_blocks)) {
351 warn "lstat: $candidate_file: $!";
356 if ($candidate_ino != $candidate) {
357 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
362 if ($candidate_size != $incumbent_size) {
363 warn "$candidate_file, $incumbent_file: file sizes are different";
371 if ($skip_compares) {
372 print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
375 my $quoted_incumbent_file = tick_quote($incumbent_file);
376 my $quoted_candidate_file = tick_quote($candidate_file);
377 print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
378 if (compare($incumbent_file, $candidate_file)) {
379 $compare_differences++;
381 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
382 # Some kind of I/O error is more likely, so this message cannot be turned off.
383 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
384 # so the warning in that case is quite silly. Hmmm.
385 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
390 $compare_bytes += $incumbent_size;
400 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
402 # We have to do this to break out of a possible infinite loop.
403 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
404 # such that A1 and B1 are in non-writable directories, we will loop
405 # forever hardlinking A2 with A and B.
406 # To break the loop, we never attempt to hardlink any files X and Y twice.
408 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
409 defined($stop_loop{$candidate_file}->{$incumbent_file})) {
410 print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
412 $stop_loop{$incumbent_file}->{$candidate_file} = 1;
413 $stop_loop{$candidate_file}->{$incumbent_file} = 1;
# Decide link direction: keep the inode that already has more links
# ($from) and repoint the other name ($to) at it, so fewer existing
# links need to be rewritten.
# NOTE(review): fragment -- the matching else branch (swapped
# assignments) lies outside this view.
my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
if ($candidate_nlink > $incumbent_nlink) {
    $from_file = $candidate_file;
    $to_file = $incumbent_file;
    $from_inode = $candidate;
    $to_inode = $incumbent;
    $from_nlink = $candidate_nlink;
    $to_nlink = $incumbent_nlink;
426 $to_file = $candidate_file;
427 $from_file = $incumbent_file;
428 $to_inode = $candidate;
429 $from_inode = $incumbent;
430 $to_nlink = $candidate_nlink;
431 $from_nlink = $incumbent_nlink;
435 link_files($from_file, $to_file);
443 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
447 ($from_file, $to_file) = ($to_file, $from_file);
448 ($from_inode, $to_inode) = ($to_inode, $from_inode);
449 ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
450 link_files($from_file, $to_file);
460 # Note since the files are presumably identical, they both have the same size.
461 # My random number generator chooses the incumbent's size.
# Bookkeeping after a successful relink: $to_file is now a name for
# the surviving inode ($from_inode), and future hits on this digest
# should resolve to that inode.
delete $inode_to_file_name{$to_inode}->{$to_file};
$inode_to_file_name{$from_inode}->{$to_file} = undef; # value unused; the keys form the name set
$hash_to_inode{$digest} = $from_inode;
469 if ($to_nlink == 1) {
471 $recovered_bytes += $incumbent_size;
474 # FIXME: Now we're really confused for some reason.
475 # Start over to rebuild state.
478 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
480 # FIXME: This is a lame heuristic. We really need to know if we've
481 # tried all possible ways to hardlink the file out of existence first;
482 # however, that is complex and only benefits a silly statistic.
483 if ($to_nlink == 1 || $from_nlink == 1) {
485 $lost_bytes += $incumbent_size;
495 print STDERR "\t\tNew hash entered\n" if $debug;
496 $hash_to_inode{$digest} = $candidate;
499 warn "No digests found for inode $candidate\n";
500 delete $inode_to_file_name{$candidate};
506 print STDERR "Merge done.\n" if $debug;
507 undef %inode_to_file_name;
511 my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
512 die "read error: $!\nLast input line was '$_'" unless defined($name);
514 print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
516 unless (! (-l $name) && (-f _)) {
517 warn "Bogon file " . tick_quote($name);
523 merge_files if $weak_key ne $current_key;
524 $current_key = $weak_key;
526 $inode_to_file_name{$inode}->{$name} = undef;
528 print STDERR "$name\n" if $verbose;
533 my $stats_blob = <<STATS;
534 compare_bytes $compare_bytes
535 compare_count $compare_count
536 compare_differences $compare_differences
537 compare_errors $compare_errors
538 hard_links $hard_links
539 hash_bytes $hash_bytes
540 hash_errors $hash_errors
541 hash_files $hash_files
542 input_bogons $input_bogons
543 input_files $input_files
544 input_links $input_links
545 link_errors $link_errors
546 link_retries $link_retries
547 lost_bytes $lost_bytes
548 lost_files $lost_files
549 merges_attempted $merges_attempted
550 recovered_bytes $recovered_bytes
551 recovered_files $recovered_files
553 trivially_unique $trivially_unique
# Pretty-print the stats blob: replace each interior run of spaces
# with an equal-length run of dots, producing "label .... value"
# leader lines.  /e evaluates the replacement as Perl code; /m and
# /g apply it throughout the multi-line string.
# NOTE(review): each match consumes two lines ("[^\n]*\n" plus the
# line being dotted), so with /g the leader appears to land on
# alternating lines -- confirm the rendered report is as intended.
$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
558 print STDERR $stats_blob;