3 use Fcntl qw(:DEFAULT :flock);
12 my $compare_count = 0;
13 my $compare_errors = 0;
14 my $compare_differences = 0;
15 my $trivially_unique = 0;
16 my $merges_attempted = 0;
20 my $bytes_recovered = 0;
21 my $files_recovered = 0;
27 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
31 warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
35 my ($filename) = (@_);
36 my $fv = open(MD5SUM, "-|");
37 die "fork: $!" unless defined($fv);
39 my ($sum_line) = <MD5SUM>;
40 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
41 die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
42 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
43 die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
46 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
48 # Perl guarantees it will die here
55 my ($filename) = (@_);
56 die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
57 my $ctx = Digest::SHA1->new;
58 sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
59 binmode(FILE); # FIXME: Necessary? Probably harmless...
60 $ctx->addfile(\*FILE);
61 close(FILE) or die "close: $filename: $!";
62 return $ctx->b64digest;
67 my $collapse_access = 0;
68 my $collapse_timestamp = 0;
69 my $collapse_zero = 0;
73 my @extra_find_opts = ();
74 my @extra_sort_opts = ();
77 my $lock_obtained = 0;
84 Usage: $name [--opts] directory [directory...]
85 Finds duplicate files in the given directories, and replaces all identical
86 copies of a file with hard-links to a single file.
88 Several options modify the definition of a "duplicate". By default, files
89 which have differences in owner uid or gid, permission (mode), or
90 modification time (mtime) are considered different, so that hardlinking
91 files does not also change their attributes. Additionally, all files of
92 zero size are ignored for performance reasons (there tend to be many
93 of them, and they tend not to release any space when replaced with
96 --access uid, gid, and mode may be different for identical
99 --debug show all steps in duplication discovery process
102 --find pass next options (up to --) to find command
104 --lock FILE exit immediately (status 10) if unable to obtain a
105 flock(LOCK_EX|LOCK_NB) on FILE
107 --lock-rm remove lock file at exit
109 --sort pass next options (up to --) to sort command
111 --timestamps mtime may be different for identical files
113 --trust skip byte-by-byte file comparisons
114 (trust the hash function)
116 --verbose report files as they are considered
118 --zeros hard-link zero-length files too
122 while ($#ARGV >= 0) {
123 my $arg = shift(@ARGV);
124 if ($arg eq '--access') {
125 $collapse_access = 1;
126 } elsif ($arg eq '--timestamps') {
127 $collapse_timestamp = 1;
128 } elsif ($arg eq '--zeros') {
130 } elsif ($arg eq '--trust') {
132 } elsif ($arg eq '--verbose') {
134 } elsif ($arg eq '--lock-rm') {
136 } elsif ($arg eq '--lock') {
137 $lock_file = shift(@ARGV);
138 unless (defined($lock_file)) {
142 } elsif ($arg eq '--debug') {
143 $debug = $verbose = 1;
144 } elsif ($arg eq '--find') {
145 while ($#ARGV >= 0) {
146 my $extra_arg = shift(@ARGV);
147 last if $extra_arg eq '--';
148 push(@extra_find_opts, $extra_arg);
150 } elsif ($arg eq '--sort') {
151 while ($#ARGV >= 0) {
152 my $extra_arg = shift(@ARGV);
153 last if $extra_arg eq '--';
154 push(@extra_sort_opts, $extra_arg);
156 } elsif ($arg =~ /^-/o) {
160 push(@directories, $arg);
164 if (defined($lock_file)) {
165 sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
166 flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
167 print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
172 if ($lock_obtained) {
173 print STDERR "Removing '$lock_file'.\n" if $verbose;
174 unlink($lock_file) or warn "unlink: $lock_file: $!";
180 $text =~ s/'/'\''/go;
184 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
185 my $printf_string = '%s ' .
186 ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
187 ($collapse_timestamp ? '0 ' : '%T@ ') .
190 push(@find_command, '!', '-empty') unless $collapse_zero;
191 push(@find_command, '-printf', $printf_string);
193 my @sort_command = ('sort', '-znr', @extra_sort_opts);
194 my @quoted_sort_command = @sort_command;
195 grep(tick_quote($_), @quoted_sort_command);
196 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
198 my @quoted_find_command = @find_command;
199 grep(tick_quote($_), @quoted_find_command);
200 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
201 print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
203 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
206 # Input is sorted so that all weak keys are contiguous.
207 # When the key changes, we have to process all files we previously know about.
208 my $current_key = -1;
210 # $inode_to_file_name{$inode} = [@file_names]
211 my %inode_to_file_name = ();
215 my ($from, $to) = (@_);
217 my $inode_base = $to;
218 $inode_dir =~ s:[^/]*$::o;
219 $inode_base =~ s:^.*/::os;
220 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
221 my $quoted_from = tick_quote($from);
222 my $quoted_to = tick_quote($to);
223 print STDERR "ln -f $quoted_from $quoted_to\n";
224 print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
225 link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
226 print STDERR "\trename: $tmp_to -> $to\n" if $debug;
227 unless (rename($tmp_to, $to)) {
229 unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
230 die "rename: $tmp_to -> $from: $saved_bang";
234 # Process all known files so far.
239 # Used to stop link retry loops (there is a goto in here! Actually two...)
242 my @candidate_list = keys(%inode_to_file_name);
243 $files_input += @candidate_list;
244 if (@candidate_list < 2) {
245 print STDERR "Merging...only one candidate to merge..." if $debug;
250 print STDERR "Merging...\n" if $debug;
251 foreach my $candidate (@candidate_list) {
252 print STDERR "\tDigesting candidate $candidate\n" if $debug;
258 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
259 print STDERR "\t\tDigesting file $filename\n" if $debug;
260 if ((-l $filename) || ! -f _) {
261 warn "Bogon file " . tick_quote($filename);
265 eval { $digest = digest($filename); };
267 warn "Digest($filename)(#$candidate) failed: $@";
276 print STDERR "\t\tDigest is $digest\n" if $debug;
278 my $incumbent = $hash_to_inode{$digest};
279 if (defined($incumbent)) {
280 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
287 my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
288 my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
289 print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
290 print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
294 foreach my $incumbent_file (@incumbent_names) {
295 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
296 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
298 if (!defined($incumbent_blocks)) {
299 warn "lstat: $incumbent_file: $!";
304 if ($incumbent_ino != $incumbent) {
305 warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
312 my $at_least_one_link_done = 0;
314 foreach my $candidate_file (@candidate_names) {
315 my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
316 print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
318 if (!defined($candidate_blocks)) {
319 warn "lstat: $candidate_file: $!";
324 if ($candidate_ino != $candidate) {
325 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
330 if ($candidate_size != $incumbent_size) {
331 warn "$candidate_file, $incumbent_file: file sizes are different";
340 print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
343 my $quoted_incumbent_file = tick_quote($incumbent_file);
344 my $quoted_candidate_file = tick_quote($candidate_file);
345 print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
346 if (compare($incumbent_file, $candidate_file)) {
347 $compare_differences++;
349 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
363 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
365 # We have to do this to break out of a possible infinite loop.
366 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
367 # such that A1 and B1 are in non-writable directories, we will loop
368 # forever hardlinking A2 with A and B.
369 # To break the loop, we never attempt to hardlink any files X and Y twice.
371 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
372 defined($stop_loop{$candidate_file}->{$incumbent_file})) {
373 print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
375 $stop_loop{$incumbent_file}->{$candidate_file} = 1;
376 $stop_loop{$candidate_file}->{$incumbent_file} = 1;
380 my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
381 if ($candidate_nlink > $incumbent_nlink) {
382 $from_file = $candidate_file;
383 $to_file = $incumbent_file;
384 $from_inode = $candidate;
385 $to_inode = $incumbent;
386 $from_nlink = $candidate_nlink;
387 $to_nlink = $incumbent_nlink;
389 $to_file = $candidate_file;
390 $from_file = $incumbent_file;
391 $to_inode = $candidate;
392 $from_inode = $incumbent;
393 $to_nlink = $candidate_nlink;
394 $from_nlink = $incumbent_nlink;
398 link_files($from_file, $to_file);
406 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
410 ($from_file, $to_file) = ($to_file, $from_file);
411 ($from_inode, $to_inode) = ($to_inode, $from_inode);
412 ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
413 link_files($from_file, $to_file);
423 # Note since the files are presumably identical, they both have the same size.
424 # My random number generator chooses the incumbent's size.
427 delete $inode_to_file_name{$to_inode}->{$to_file};
428 $inode_to_file_name{$from_inode}->{$to_file} = undef;
429 $hash_to_inode{$digest} = $from_inode;
432 if ($to_nlink == 1) {
434 $bytes_recovered += $incumbent_size;
437 # FIXME: Now we're really confused for some reason.
438 # Start over to rebuild state.
441 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
443 # FIXME: This is a lame heuristic. We really need to know if we've
444 # tried all possible ways to hardlink the file out of existence first;
445 # however, that is complex and only benefits a silly statistic.
446 if ($to_nlink == 1 || $from_nlink == 1) {
448 $bytes_lost += $incumbent_size;
458 print STDERR "\t\tNew hash entered\n" if $debug;
459 $hash_to_inode{$digest} = $candidate;
462 warn "No digests found for inode $candidate\n";
463 delete $inode_to_file_name{$candidate};
469 print STDERR "Merge done.\n" if $debug;
470 undef %inode_to_file_name;
474 my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
475 die "read error: $!\nLast input line was '$_'" unless defined($name);
477 print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
479 unless (! (-l $name) && (-f _)) {
480 warn "Bogon file " . tick_quote($name);
486 merge_files if $weak_key ne $current_key;
487 $current_key = $weak_key;
489 $inode_to_file_name{$inode}->{$name} = undef;
491 print STDERR "$name\n" if $verbose;
496 print STDERR <<STATS;
497 links_input $links_input
498 files_input ........... $files_input
499 bogons_input $bogons_input
500 merges_attempted ...... $merges_attempted
501 trivially_unique $trivially_unique
502 files_hashed .......... $files_hashed
503 hash_errors $hash_errors
504 surprises ............. $surprises
505 compare_count $compare_count
506 compare_differences ... $compare_differences
507 compare_errors $compare_errors
508 hard_links ............ $hard_links
509 link_errors $link_errors
510 link_retries .......... $link_retries
511 bytes_recovered $bytes_recovered
512 files_recovered ....... $files_recovered
513 bytes_lost $bytes_lost
514 files_lost ............ $files_lost