4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
9 use Fcntl qw(:DEFAULT :flock);
# Global statistics counters, printed in the report at exit.
# NOTE(review): this is an excerpt; other counters referenced later
# ($hash_bytes, $input_files, $hard_links, ...) are declared on lines
# not visible here.
19 my $compare_bytes = 0;
20 my $compare_count = 0;
21 my $compare_errors = 0;
22 my $compare_differences = 0;
23 my $trivially_unique = 0;
24 my $merges_attempted = 0;
28 my $recovered_bytes = 0;
29 my $recovered_files = 0;
# Prefer Digest::SHA1 for hashing; the warn below reports $@, so this
# `use` is presumably wrapped in an eval/require fallback on lines not
# visible in this excerpt — TODO confirm against the full file.
35 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
# Fallback path: if Digest::SHA1 failed to load, hash with external md5sum.
39 warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
# Fragment of the md5sum-based digest routine (its `sub ... {` opener is
# outside this excerpt). open("-|") forks: the parent reads the child's
# stdout; the child (bottom of this fragment) reopens STDIN on the target
# file and presumably execs md5sum — the exec line falls in a gap here.
43 my ($filename) = (@_);
44 my $fv = open(MD5SUM, "-|");
45 die "fork: $!" unless defined($fv);
# Parent: read exactly one line of md5sum output, then reap the child.
47 my ($sum_line) = <MD5SUM>;
48 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49 die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
# Extract the leading 32 hex digits; anything else is a hash failure.
50 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
51 die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
# Child: make the file the child's stdin so md5sum reads it directly.
# O_NONBLOCK guards against blocking on non-regular files at open time.
54 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
56 # Perl guarantees it will die here
# Body of really_digest (the `sub really_digest {` opener and closing
# brace fall outside this excerpt): SHA-1 of a plain file's contents,
# returned base64-encoded. Dies on symlinks/non-plain files so that
# only regular-file content is ever hashed.
63 my ($filename) = (@_);
64 die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65 my $ctx = Digest::SHA1->new;
# O_NONBLOCK: avoid blocking at open time if the path turns out odd.
66 sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67 binmode(FILE); # FIXME: Necessary? Probably harmless...
68 $ctx->addfile(\*FILE);
69 close(FILE) or die "close: $filename: $!";
70 return $ctx->b64digest;
# Command-line option state, set by the argument-parsing loop below.
# "collapse" flags widen the definition of "duplicate" (see usage text).
75 my $collapse_access = 0;
76 my $collapse_timestamp = 0;
77 my $collapse_zero = 0;
78 my $skip_compares = 0;
# Extra pass-through options for the external find/sort commands.
82 my @extra_find_opts = ();
83 my @extra_sort_opts = ();
# Set once the --lock file is flocked, so cleanup knows to remove it.
86 my $lock_obtained = 0;
# Fragment of the digest() wrapper around really_digest().
89 my ($filename) = (@_);
# Presumably guarded by $skip_hashes (guard line not visible here —
# TODO confirm): returning one constant string makes every file collide
# at the hash stage, deferring all work to the byte-by-byte compare.
91 return "SKIPPING HASHES";
93 &really_digest($filename);
94 $hash_bytes += -s $filename;
# Start of the usage routine; $name is the program name interpolated
# into the heredoc help text that follows. The heredoc body is runtime
# output and is deliberately left unannotated.
102 my $name = shift(@_);
104 Usage: $name [--opts] directory [directory...]
105 Finds duplicate files in the given directories, and replaces all identical
106 copies of a file with hard-links to a single file.
108 Several options modify the definition of a "duplicate". By default, files
109 which have differences in owner uid or gid, permission (mode), or
110 modification time (mtime) are considered different, so that hardlinking
111 files does not also change their attributes. Additionally, all files of
112 zero size are ignored for performance reasons (there tend to be many
113 of them, and they tend not to release any space when replaced with
116 --access uid, gid, and mode may be different for identical
119 --debug show all steps in duplication discovery process
122 --find pass next options (up to --) to find command
124 --lock FILE exit immediately (status 10) if unable to obtain a
125 flock(LOCK_EX|LOCK_NB) on FILE
127 --lock-rm remove lock file at exit
129 --sort pass next options (up to --) to sort command
131 --timestamps mtime may be different for identical files
133 --skip-compare skip byte-by-byte file comparisons
135 --skip-hash skip calculation of hash function on files
137 --trust old name for --skip-compare
138 (trust the hash function)
140 --verbose report files as they are considered
142 --zeros hard-link zero-length files too
# Command-line argument parsing. Flags set the globals declared above;
# anything not starting with '-' is collected as a directory to scan.
# NOTE(review): several flag-assignment lines (for --zeros, --trust/
# --skip-compare, --skip-hash, --verbose, --lock-rm) fall in gaps of
# this excerpt.
146 while ($#ARGV >= 0) {
147 my $arg = shift(@ARGV);
148 if ($arg eq '--access') {
149 $collapse_access = 1;
150 } elsif ($arg eq '--timestamps') {
151 $collapse_timestamp = 1;
152 } elsif ($arg eq '--zeros') {
154 } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
156 } elsif ($arg eq '--skip-hash') {
158 } elsif ($arg eq '--verbose') {
160 } elsif ($arg eq '--lock-rm') {
162 } elsif ($arg eq '--lock') {
# --lock takes a filename argument; missing argument is an error
# (the error-handling body is not visible here).
163 $lock_file = shift(@ARGV);
164 unless (defined($lock_file)) {
168 } elsif ($arg eq '--debug') {
169 $debug = $verbose = 1;
# --find and --sort swallow following args up to a literal '--' and
# pass them through to the external find/sort commands.
170 } elsif ($arg eq '--find') {
171 while ($#ARGV >= 0) {
172 my $extra_arg = shift(@ARGV);
173 last if $extra_arg eq '--';
174 push(@extra_find_opts, $extra_arg);
176 } elsif ($arg eq '--sort') {
177 while ($#ARGV >= 0) {
178 my $extra_arg = shift(@ARGV);
179 last if $extra_arg eq '--';
180 push(@extra_sort_opts, $extra_arg);
182 } elsif ($arg =~ /^-/o) {
186 push(@directories, $arg);
# Sanity check: with both hashes and compares skipped there would be no
# way at all to establish that two files are identical.
190 if ($skip_hashes && $skip_compares) {
191 die "Cannot skip both hashes and compares.\n";
# Optional single-instance lock: non-blocking exclusive flock on the
# --lock file. NOTE(review): the usage text promises exit status 10 on
# lock failure, but the visible code just dies — the status-10 handling
# presumably lives in lines not shown here; confirm against full file.
194 if (defined($lock_file)) {
195 sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
196 flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
197 print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
# Cleanup fragment (--lock-rm): remove the lock file only if we are the
# process that actually obtained the lock.
202 if ($lock_obtained) {
203 print STDERR "Removing '$lock_file'.\n" if $verbose;
204 unlink($lock_file) or warn "unlink: $lock_file: $!";
# Interior of tick_quote(): escape embedded single quotes for safe
# embedding inside a single-quoted shell word ( ' -> '\'' ). Callers
# like grep(tick_quote($_), @list) appear to rely on this modifying the
# argument in place via @_ aliasing — the sub opener is not visible
# here, so confirm $text is aliased rather than copied.
210 $text =~ s/'/'\''/go;
# Build the external pipeline:  find ... -printf KEY | sort -znr
# The -printf string emits the "weak key" (size, then uid/gid/mode and
# mtime — each collapsible to 0 by the --access/--timestamps options)
# followed by inode and filename; the tail of the format string falls
# in a gap of this excerpt.
214 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
215 my $printf_string = '%s ' .
216 ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
217 ($collapse_timestamp ? '0 ' : '%T@ ') .
# Zero-length files are excluded unless --zeros was given.
220 push(@find_command, '!', '-empty') unless $collapse_zero;
221 push(@find_command, '-printf', $printf_string);
# -z: NUL-terminated records; -nr: numeric, descending (largest first).
223 my @sort_command = ('sort', '-znr', @extra_sort_opts);
224 my @quoted_sort_command = @sort_command;
# grep in void context for its side effect (tick_quote edits each
# element in place) — a for-loop would be the clearer idiom.
225 grep(tick_quote($_), @quoted_sort_command);
226 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
228 my @quoted_find_command = @find_command;
229 grep(tick_quote($_), @quoted_find_command);
230 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
231 print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
# Two-arg pipe open of a shell command; safe only because every word
# was single-quote-escaped by tick_quote above.
233 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
236 # Input is sorted so that all weak keys are contiguous.
237 # When the key changes, we have to process all files we previously know about.
238 my $current_key = -1;
240 # $inode_to_file_name{$inode}{$file_name} = undef  (a hash used as a set of names per inode)
241 my %inode_to_file_name = ();
# Fragment of link_files($from, $to): replace $to with a hardlink to
# $from, atomically, by linking $from to a temp name in $to's directory
# and then rename()ing the temp name over $to.
245 my ($from, $to) = (@_);
# NOTE(review): $inode_dir is declared on a line not visible here
# (presumably `my $inode_dir = $to;`).
247 my $inode_base = $to;
248 $inode_dir =~ s:[^/]*$::o;
249 $inode_base =~ s:^.*/::os;
250 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
251 my $quoted_from = tick_quote($from);
252 my $quoted_to = tick_quote($to);
253 print STDERR "ln -f $quoted_from $quoted_to\n";
254 print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
255 link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
256 print STDERR "\trename: $tmp_to -> $to\n" if $debug;
257 unless (rename($tmp_to, $to)) {
259 unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
# NOTE(review): probable message bug — the failed rename target was $to
# (see the rename above), so this should likely read "$tmp_to -> $to".
# Left unchanged here because the surrounding sub is incomplete in this
# excerpt ($saved_bang is captured on a line not shown).
260 die "rename: $tmp_to -> $from: $saved_bang";
264 # Process all known files so far.
269 # Used to stop link retry loops (there is a goto in here! Actually two...)
# One "candidate" per distinct inode seen under the current weak key.
272 my @candidate_list = keys(%inode_to_file_name);
273 $input_files += @candidate_list;
# A single inode under this key cannot have a duplicate: skip digesting.
274 if (@candidate_list < 2) {
275 print STDERR "Merging...only one candidate to merge..." if $debug;
280 print STDERR "Merging...\n" if $debug;
281 foreach my $candidate (@candidate_list) {
282 print STDERR "\tDigesting candidate $candidate\n" if $debug;
# Try each name of the inode until one can be digested successfully.
288 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
289 print STDERR "\t\tDigesting file $filename\n" if $debug;
# Re-check the file type: it may have changed since find(1) ran.
290 if ((-l $filename) || ! -f _) {
291 warn "Bogon file " . tick_quote($filename);
# digest() dies on error; presumably wrapped in an eval on lines not
# visible here (the warn below reports $@).
296 $digest = digest($filename);
299 warn "Digest($filename)(#$candidate) failed: $@";
307 print STDERR "\t\tDigest is $digest\n" if $debug;
# Incumbent: the inode previously registered for this digest, i.e. the
# surviving copy that new candidates get linked to.
309 my $incumbent = $hash_to_inode{$digest};
310 if (defined($incumbent)) {
311 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
318 my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
319 my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
320 print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
321 print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
# Re-lstat every incumbent/candidate name just before linking: the
# filesystem may have changed since find(1) ran, so verify the name
# still exists and still refers to the inode we recorded for it.
325 foreach my $incumbent_file (@incumbent_names) {
326 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
327 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
# lstat failure (e.g. file deleted meanwhile): warn and presumably skip
# this name — the recovery branch falls in a gap of this excerpt.
329 if (!defined($incumbent_blocks)) {
330 warn "lstat: $incumbent_file: $!";
# Inode mismatch means the name was replaced since we recorded it.
335 if ($incumbent_ino != $incumbent) {
336 warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
341 my $at_least_one_link_done = 0;
345 foreach my $candidate_file (@candidate_names) {
346 my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
347 print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
349 if (!defined($candidate_blocks)) {
350 warn "lstat: $candidate_file: $!";
355 if ($candidate_ino != $candidate) {
356 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
# Equal sizes are a precondition for identical content; differing sizes
# here mean one of the files changed after the weak key was computed.
361 if ($candidate_size != $incumbent_size) {
362 warn "$candidate_file, $incumbent_file: file sizes are different";
# Byte-by-byte verification of the hash match, unless --skip-compare
# (trust the hash) was requested.
370 if ($skip_compares) {
371 print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
374 my $quoted_incumbent_file = tick_quote($incumbent_file);
375 my $quoted_candidate_file = tick_quote($candidate_file);
376 print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
# compare() returns true when the files differ (File::Compare
# convention, presumably — the import is not visible in this excerpt).
377 if (compare($incumbent_file, $candidate_file)) {
378 $compare_differences++;
380 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
381 # Some kind of I/O error is more likely, so this message cannot be turned off.
382 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
383 # so the warning in that case is quite silly. Hmmm.
384 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
389 $compare_bytes += $incumbent_size;
399 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
401 # We have to do this to break out of a possible infinite loop.
402 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
403 # such that A1 and B1 are in non-writable directories, we will loop
404 # forever hardlinking A2 with A and B.
405 # To break the loop, we never attempt to hardlink any files X and Y twice.
# %stop_loop records every (X, Y) pair already attempted, both ways.
407 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
408 defined($stop_loop{$candidate_file}->{$incumbent_file})) {
409 print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
411 $stop_loop{$incumbent_file}->{$candidate_file} = 1;
412 $stop_loop{$candidate_file}->{$incumbent_file} = 1;
# Decide link direction: keep the file that already has more hardlinks
# as the link source ($from), so the inode with fewer links is the one
# eliminated.
416 my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
417 if ($candidate_nlink > $incumbent_nlink) {
418 $from_file = $candidate_file;
419 $to_file = $incumbent_file;
420 $from_inode = $candidate;
421 $to_inode = $incumbent;
422 $from_nlink = $candidate_nlink;
423 $to_nlink = $incumbent_nlink;
425 $to_file = $candidate_file;
426 $from_file = $incumbent_file;
427 $to_inode = $candidate;
428 $from_inode = $incumbent;
429 $to_nlink = $candidate_nlink;
430 $from_nlink = $incumbent_nlink;
# First link attempt; presumably inside an eval, since link_files dies
# on failure and a retry follows (eval lines fall in gaps here).
434 link_files($from_file, $to_file);
# On failure (e.g. $to_file's directory not writable), retry once with
# the roles swapped.
442 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
446 ($from_file, $to_file) = ($to_file, $from_file);
447 ($from_inode, $to_inode) = ($to_inode, $from_inode);
448 ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
449 link_files($from_file, $to_file);
459 # Note since the files are presumably identical, they both have the same size.
460 # My random number generator chooses the incumbent's size.
# Bookkeeping after a successful link: $to_file now names $from_inode,
# and the digest maps to the surviving inode.
463 delete $inode_to_file_name{$to_inode}->{$to_file};
464 $inode_to_file_name{$from_inode}->{$to_file} = undef;
465 $hash_to_inode{$digest} = $from_inode;
# If the replaced name was the inode's last link, its space is freed.
468 if ($to_nlink == 1) {
470 $recovered_bytes += $incumbent_size;
473 # FIXME: Now we're really confused for some reason.
474 # Start over to rebuild state.
477 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
479 # FIXME: This is a lame heuristic. We really need to know if we've
480 # tried all possible ways to hardlink the file out of existence first;
481 # however, that is complex and only benefits a silly statistic.
482 if ($to_nlink == 1 || $from_nlink == 1) {
484 $lost_bytes += $incumbent_size;
# No incumbent for this digest yet: the candidate becomes the incumbent.
494 print STDERR "\t\tNew hash entered\n" if $debug;
495 $hash_to_inode{$digest} = $candidate;
# Every name of the inode failed to digest: drop the inode entirely.
498 warn "No digests found for inode $candidate\n";
499 delete $inode_to_file_name{$candidate};
# Reset per-key state for the next weak-key group.
505 print STDERR "Merge done.\n" if $debug;
506 undef %inode_to_file_name;
# Main input loop body: parse one NUL-terminated record from the
# find|sort pipeline. The weak key is five numbers (size uid gid mode
# mtime — mtime may be negative), then the inode, then the filename.
510 my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
511 die "read error: $!\nLast input line was '$_'" unless defined($name);
513 print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
# Re-check the file type: find's view may already be stale.
515 unless (! (-l $name) && (-f _)) {
516 warn "Bogon file " . tick_quote($name);
# Input is sorted, so a key change means the previous group is complete
# and can be merged now.
522 merge_files if $weak_key ne $current_key;
523 $current_key = $weak_key;
# Record this name under its inode (hash used as a set of names).
525 $inode_to_file_name{$inode}->{$name} = undef;
527 print STDERR "$name\n" if $verbose;
# Final statistics report, printed to STDERR at exit. The heredoc body
# is runtime output and left unannotated; the substitution after it
# (visible near the end of this excerpt) pads each row with dots to
# right-align the numbers.
532 my $stats_blob = <<STATS;
533 compare_bytes $compare_bytes
534 compare_count $compare_count
535 compare_differences $compare_differences
536 compare_errors $compare_errors
537 hard_links $hard_links
538 hash_bytes $hash_bytes
539 hash_errors $hash_errors
540 hash_files $hash_files
541 input_bogons $input_bogons
542 input_files $input_files
543 input_links $input_links
544 link_errors $link_errors
545 link_retries $link_retries
546 lost_bytes $lost_bytes
547 lost_files $lost_files
548 merges_attempted $merges_attempted
549 recovered_bytes $recovered_bytes
550 recovered_files $recovered_files
552 trivially_unique $trivially_unique
555 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
557 print STDERR $stats_blob;