digest: Fix incorrect statistics when hashes fail
[dupemerge] / faster-dupemerge
1 #!/usr/bin/perl -w
2 # $Id$
3
4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
7
8 use strict;
9 use Fcntl qw(:DEFAULT :flock);
10 use File::Compare;
11 use File::Temp;
12
13 my $input_links = 0;
14 my $input_files = 0;
15 my $input_bogons = 0;
16 my $hash_bytes = 0;
17 my $hash_files = 0;
18 my $hash_errors = 0;
19 my $compare_bytes = 0;
20 my $compare_count = 0;
21 my $compare_errors = 0;
22 my $compare_differences = 0;
23 my $trivially_unique = 0;
24 my $merges_attempted = 0;
25 my $hard_links = 0;
26 my $link_errors = 0;
27 my $link_retries = 0;
28 my $recovered_bytes = 0;
29 my $recovered_files = 0;
30 my $lost_files = 0;
31 my $lost_bytes = 0;
32 my $surprises = 0;
33
34 eval '
35         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
36 ';
37
38 if ($@) {
39         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
40
41         eval <<'DIGEST';
42                 sub really_digest {
43                         my ($filename) = (@_);
44                         my $fv = open(MD5SUM, "-|");    
45                         die "fork: $!" unless defined($fv);
46                         if ($fv) {
47                                 my ($sum_line) = <MD5SUM>;
48                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
50                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
51                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
52                                 return $sum;
53                         } else {
54                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
55                                 exec('md5sum');
56                                 # Perl guarantees it will die here
57                         }
58                 }
59 DIGEST
60 } else {
61         eval <<'DIGEST';
62                 sub really_digest {
63                         my ($filename) = (@_);
64                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65                         my $ctx = Digest::SHA1->new;
66                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
68                         $ctx->addfile(\*FILE);
69                         close(FILE) or die "close: $filename: $!";
70                         return $ctx->b64digest;
71                 }
72 DIGEST
73 }
74         
75 my $collapse_access = 0;
76 my $collapse_timestamp = 0;
77 my $collapse_zero = 0;
78 my $skip_compares = 0;
79 my $skip_hashes = 0;
80 my $verbose = 0;
81 my $debug = 0;
82 my @extra_find_opts = ();
83 my @extra_sort_opts = ();
84 my $lock_file;
85 my $lock_rm = 0;
86 my $lock_obtained = 0;
87
88 sub digest {
89         my ($filename) = (@_);
90         if ($skip_hashes) {
91                 return "SKIPPING HASHES";
92         } else {
93                 my $digest = &really_digest($filename);
94                 $hash_bytes += -s $filename;
95                 $hash_files++;
96                 return $digest
97         }
98 }
99
100 my @directories;
101
102 sub usage {
103         my $name = shift(@_);
104         die <<USAGE;
105 Usage: $name [--opts] directory [directory...]
106 Finds duplicate files in the given directories, and replaces all identical
107 copies of a file with hard-links to a single file.
108
109 Several options modify the definition of a "duplicate".  By default, files
110 which have differences in owner uid or gid, permission (mode), or
111 modification time (mtime) are considered different, so that hardlinking
112 files does not also change their attributes.  Additionally, all files of
113 zero size are ignored for performance reasons (there tend to be many
114 of them, and they tend not to release any space when replaced with
115 hard links).
116
117         --access        uid, gid, and mode may be different for identical
118                         files
119
120         --debug         show all steps in duplication discovery process
121                         (implies --verbose)
122
123         --find          pass next options (up to --) to find command
124
125         --lock FILE     exit immediately (status 10) if unable to obtain a 
126                         flock(LOCK_EX|LOCK_NB) on FILE
127
128         --lock-rm       remove lock file at exit
129
130         --sort          pass next options (up to --) to sort command
131
132         --timestamps    mtime may be different for identical files
133
134         --skip-compare  skip byte-by-byte file comparisons
135
136         --skip-hash     skip calculation of hash function on files
137
138         --trust         old name for --skip-compare
139                         (trust the hash function)
140
141         --verbose       report files as they are considered
142
143         --zeros         hard-link zero-length files too
144 USAGE
145 }
146
147 while ($#ARGV >= 0) {
148         my $arg = shift(@ARGV);
149         if ($arg eq '--access') {
150                 $collapse_access = 1;
151         } elsif ($arg eq '--timestamps') {
152                 $collapse_timestamp = 1;
153         } elsif ($arg eq '--zeros') {
154                 $collapse_zero = 1;
155         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
156                 $skip_compares = 1;
157         } elsif ($arg eq '--skip-hash') {
158                 $skip_hashes = 1;
159         } elsif ($arg eq '--verbose') {
160                 $verbose = 1;
161         } elsif ($arg eq '--lock-rm') {
162                 $lock_rm = 1;
163         } elsif ($arg eq '--lock') {
164                 $lock_file = shift(@ARGV);
165                 unless (defined($lock_file)) {
166                         usage($0);
167                         exit(1);
168                 }
169         } elsif ($arg eq '--debug') {
170                 $debug = $verbose = 1;
171         } elsif ($arg eq '--find') {
172                 while ($#ARGV >= 0) {
173                         my $extra_arg = shift(@ARGV);
174                         last if $extra_arg eq '--';
175                         push(@extra_find_opts, $extra_arg);
176                 }
177         } elsif ($arg eq '--sort') {
178                 while ($#ARGV >= 0) {
179                         my $extra_arg = shift(@ARGV);
180                         last if $extra_arg eq '--';
181                         push(@extra_sort_opts, $extra_arg);
182                 }
183         } elsif ($arg =~ /^-/o) {
184                 usage($0);
185                 exit(1);
186         } else {
187                 push(@directories, $arg);
188         }
189 }
190
191 if ($skip_hashes && $skip_compares) {
192         die "Cannot skip both hashes and compares.\n";
193 }
194
195 if (defined($lock_file)) {
196         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
197         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
198         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
199         $lock_obtained = 1;
200 }
201
202 END {
203         if ($lock_obtained) {
204                 print STDERR "Removing '$lock_file'.\n" if $verbose;
205                 unlink($lock_file) or warn "unlink: $lock_file: $!";
206         }
207 }
208
209 sub tick_quote {
210         my ($text) = (@_);
211         $text =~ s/'/'\''/go;
212         return "'$text'";
213 }
214
215 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
216 my $printf_string = '%s ' .
217         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
218         ($collapse_timestamp ? '0 '     : '%T@ ') .
219         '%i %p\0';
220
221 push(@find_command, '!', '-empty') unless $collapse_zero;
222 push(@find_command, '-printf', $printf_string);
223
224 my @sort_command = ('sort', '-znr', @extra_sort_opts);
225 my @quoted_sort_command = @sort_command;
226 grep(tick_quote($_), @quoted_sort_command);
227 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
228
229 my @quoted_find_command = @find_command;
230 grep(tick_quote($_), @quoted_find_command);
231 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
232 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
233
234 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
235 $/ = "\0";
236
237 # Input is sorted so that all weak keys are contiguous.
238 # When the key changes, we have to process all files we previously know about.
239 my $current_key = -1;
240
241 # $inode_to_file_name{$inode} = [@file_names]
242 my %inode_to_file_name = ();
243
244 # Link files
245 sub link_files {
246         my ($from, $to) = (@_);
247         my $inode_dir = $to;
248         my $inode_base = $to;
249         $inode_dir =~ s:[^/]*$::o;
250         $inode_base =~ s:^.*/::os;
251         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
252         my $quoted_from = tick_quote($from);
253         my $quoted_to = tick_quote($to);
254         print STDERR "ln -f $quoted_from $quoted_to\n";
255         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
256         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
257         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
258         unless (rename($tmp_to, $to)) {
259                 my $saved_bang = $!;
260                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
261                 die "rename: $tmp_to -> $from: $saved_bang";
262         }
263 }
264
265 # Process all known files so far.
266 sub merge_files {
267         $merges_attempted++;
268
269         my %hash_to_inode;
270         # Used to stop link retry loops (there is a goto in here!  Actually two...)
271         my %stop_loop;
272
273         my @candidate_list = keys(%inode_to_file_name);
274         $input_files += @candidate_list;
275         if (@candidate_list < 2) {
276                 print STDERR "Merging...only one candidate to merge..." if $debug;
277                 $trivially_unique++;
278                 goto end_merge;
279         }
280
281         print STDERR "Merging...\n" if $debug;
282         foreach my $candidate (@candidate_list) {
283                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
284                 my $ok = 0;
285                 my $digest;
286
287 hash_file:
288
289                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
290                         print STDERR "\t\tDigesting file $filename\n" if $debug;
291                         if ((-l $filename) || ! -f _) {
292                                 warn "Bogon file " . tick_quote($filename);
293                                 $surprises++;
294                                 next;
295                         }
296                         eval { 
297                                 $digest = digest($filename); 
298                         };
299                         if ($@) {
300                                 warn "Digest($filename)(#$candidate) failed: $@";
301                                 $hash_errors++;
302                         } else {
303                                 $ok = 1;
304                                 last hash_file;
305                         }
306                 }
307                 if ($ok) {
308                         print STDERR "\t\tDigest is $digest\n" if $debug;
309
310                         my $incumbent = $hash_to_inode{$digest};
311                         if (defined($incumbent)) {
312                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
313
314                                 my $finished = 0;
315
316 link_start:
317
318                                 until ($finished) {
319                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
320                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
321                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
322                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
323
324 incumbent_file:
325
326                                         foreach my $incumbent_file (@incumbent_names) {
327                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
328                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
329
330                                                 if (!defined($incumbent_blocks)) {
331                                                         warn "lstat: $incumbent_file: $!";
332                                                         $surprises++;
333                                                         next incumbent_file;
334                                                 }
335
336                                                 if ($incumbent_ino != $incumbent) {
337                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
338                                                         $surprises++;
339                                                         next incumbent_file;
340                                                 }
341
342                                                 my $at_least_one_link_done = 0;
343
344 candidate_file:
345
346                                                 foreach my $candidate_file (@candidate_names) {
347                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
348                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
349
350                                                         if (!defined($candidate_blocks)) {
351                                                                 warn "lstat: $candidate_file: $!";
352                                                                 $surprises++;
353                                                                 next candidate_file;
354                                                         }
355
356                                                         if ($candidate_ino != $candidate) {
357                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
358                                                                 $surprises++;
359                                                                 next candidate_file;
360                                                         }
361
362                                                         if ($candidate_size != $incumbent_size) {
363                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
364                                                                 $surprises++;
365                                                                 next candidate_file;
366                                                         }
367
368                                                         my $identical;
369
370                                                         eval {
371                                                                 if ($skip_compares) {
372                                                                         print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
373                                                                         $identical = 1;
374                                                                 } else {
375                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
376                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
377                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
378                                                                         if (compare($incumbent_file, $candidate_file)) {
379                                                                                 $compare_differences++;
380                                                                                 $identical = 0;
381                                                                                 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
382                                                                                 # Some kind of I/O error is more likely, so this message cannot be turned off.
383                                                                                 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
384                                                                                 # so the warning in that case is quite silly.  Hmmm.
385                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
386                                                                         } else {
387                                                                                 $identical = 1;
388                                                                         }
389                                                                         $compare_count++;
390                                                                         $compare_bytes += $incumbent_size;
391                                                                 }
392                                                         };
393                                                         if ($@) {
394                                                                 warn $@;
395                                                                 $compare_errors++;
396                                                                 next candidate_file;
397                                                         }
398
399                                                         if ($identical) {
400                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
401
402                                                                 # We have to do this to break out of a possible infinite loop.
403                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
404                                                                 # such that A1 and B1 are in non-writable directories, we will loop
405                                                                 # forever hardlinking A2 with A and B.
406                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
407
408                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
409                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
410                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
411                                                                 } else {
412                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
413                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
414
415                                                                         my $link_done = 0;
416
417                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
418                                                                         if ($candidate_nlink > $incumbent_nlink) {
419                                                                                 $from_file = $candidate_file;
420                                                                                 $to_file = $incumbent_file;
421                                                                                 $from_inode = $candidate;
422                                                                                 $to_inode = $incumbent;
423                                                                                 $from_nlink = $candidate_nlink;
424                                                                                 $to_nlink = $incumbent_nlink;
425                                                                         } else {
426                                                                                 $to_file = $candidate_file;
427                                                                                 $from_file = $incumbent_file;
428                                                                                 $to_inode = $candidate;
429                                                                                 $from_inode = $incumbent;
430                                                                                 $to_nlink = $candidate_nlink;
431                                                                                 $from_nlink = $incumbent_nlink;
432                                                                         }
433
434                                                                         eval {
435                                                                                 link_files($from_file, $to_file);
436                                                                                 $link_done = 1;
437                                                                         };
438
439                                                                         if ($@) {
440                                                                                 warn $@;
441                                                                                 $link_errors++;
442
443                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
444                                                                                 $link_retries++;
445
446                                                                                 eval {
447                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
448                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
449                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
450                                                                                         link_files($from_file, $to_file);
451                                                                                         $link_done = 1;
452                                                                                 };
453
454                                                                                 if ($@) {
455                                                                                         warn $@;
456                                                                                         $link_errors++;
457                                                                                 }
458                                                                         }
459
460                                                                         # Note since the files are presumably identical, they both have the same size.
461                                                                         # My random number generator chooses the incumbent's size.
462
463                                                                         if ($link_done) {
464                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
465                                                                                 $inode_to_file_name{$from_inode}->{$to_file} = undef;
466                                                                                 $hash_to_inode{$digest} = $from_inode;
467
468                                                                                 $hard_links++;
469                                                                                 if ($to_nlink == 1) {
470                                                                                         $recovered_files++;
471                                                                                         $recovered_bytes += $incumbent_size;
472                                                                                 }
473
474                                                                                 # FIXME:  Now we're really confused for some reason.
475                                                                                 # Start over to rebuild state.
476                                                                                 next link_start;
477                                                                         } else {
478                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
479
480                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
481                                                                                 # tried all possible ways to hardlink the file out of existence first;
482                                                                                 # however, that is complex and only benefits a silly statistic.
483                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
484                                                                                         $lost_files++;
485                                                                                         $lost_bytes += $incumbent_size;
486                                                                                 }
487                                                                         }
488                                                                 }
489                                                         }
490                                                 }
491                                         }
492                                         $finished = 1;
493                                 }
494                         } else {
495                                 print STDERR "\t\tNew hash entered\n" if $debug;
496                                 $hash_to_inode{$digest} = $candidate;
497                         }
498                 } else {
499                         warn "No digests found for inode $candidate\n";
500                         delete $inode_to_file_name{$candidate};
501                 }
502         }
503
504 end_merge:
505
506         print STDERR "Merge done.\n" if $debug;
507         undef %inode_to_file_name;
508 }
509
510 while (<FIND>) {
511         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
512         die "read error: $!\nLast input line was '$_'" unless defined($name);
513
514         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
515
516         unless (! (-l $name) && (-f _)) {
517                 warn "Bogon file " . tick_quote($name);
518                 $input_bogons++;
519                 next;
520         }
521
522         $input_links++;
523         merge_files if $weak_key ne $current_key;
524         $current_key = $weak_key;
525
526         $inode_to_file_name{$inode}->{$name} = undef;
527
528         print STDERR "$name\n" if $verbose;
529 }
530
531 merge_files;
532
533 my $stats_blob = <<STATS;
534 compare_bytes           $compare_bytes
535 compare_count           $compare_count
536 compare_differences     $compare_differences
537 compare_errors          $compare_errors
538 hard_links              $hard_links
539 hash_bytes              $hash_bytes
540 hash_errors             $hash_errors
541 hash_files              $hash_files
542 input_bogons            $input_bogons
543 input_files             $input_files
544 input_links             $input_links
545 link_errors             $link_errors
546 link_retries            $link_retries
547 lost_bytes              $lost_bytes
548 lost_files              $lost_files
549 merges_attempted        $merges_attempted
550 recovered_bytes         $recovered_bytes
551 recovered_files         $recovered_files
552 surprises               $surprises
553 trivially_unique        $trivially_unique
554 STATS
555
556 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
557
558 print STDERR $stats_blob;
559
560 exit(0);