Replace --trust with --skip-compare and add --skip-hash and copyright statement
[dupemerge] / faster-dupemerge
1 #!/usr/bin/perl -w
2 # $Id$
3
4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
7
8 use strict;
9 use Fcntl qw(:DEFAULT :flock);
10 use File::Compare;
11 use File::Temp;
12
13 my $input_links = 0;
14 my $input_files = 0;
15 my $input_bogons = 0;
16 my $hash_bytes = 0;
17 my $hash_files = 0;
18 my $hash_errors = 0;
19 my $compare_bytes = 0;
20 my $compare_count = 0;
21 my $compare_errors = 0;
22 my $compare_differences = 0;
23 my $trivially_unique = 0;
24 my $merges_attempted = 0;
25 my $hard_links = 0;
26 my $link_errors = 0;
27 my $link_retries = 0;
28 my $recovered_bytes = 0;
29 my $recovered_files = 0;
30 my $lost_files = 0;
31 my $lost_bytes = 0;
32 my $surprises = 0;
33
34 eval '
35         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
36 ';
37
38 if ($@) {
39         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
40
41         eval <<'DIGEST';
42                 sub really_digest {
43                         my ($filename) = (@_);
44                         my $fv = open(MD5SUM, "-|");    
45                         die "fork: $!" unless defined($fv);
46                         if ($fv) {
47                                 my ($sum_line) = <MD5SUM>;
48                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
50                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
51                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
52                                 return $sum;
53                         } else {
54                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
55                                 exec('md5sum');
56                                 # Perl guarantees it will die here
57                         }
58                 }
59 DIGEST
60 } else {
61         eval <<'DIGEST';
62                 sub really_digest {
63                         my ($filename) = (@_);
64                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65                         my $ctx = Digest::SHA1->new;
66                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
68                         $ctx->addfile(\*FILE);
69                         close(FILE) or die "close: $filename: $!";
70                         return $ctx->b64digest;
71                 }
72 DIGEST
73 }
74         
75 my $collapse_access = 0;
76 my $collapse_timestamp = 0;
77 my $collapse_zero = 0;
78 my $skip_compares = 0;
79 my $skip_hashes = 0;
80 my $verbose = 0;
81 my $debug = 0;
82 my @extra_find_opts = ();
83 my @extra_sort_opts = ();
84 my $lock_file;
85 my $lock_rm = 0;
86 my $lock_obtained = 0;
87
88 sub digest {
89         my ($filename) = (@_);
90         if ($skip_hashes) {
91                 return "SKIPPING HASHES";
92         } else {
93                 &really_digest($filename);
94                 $hash_bytes += -s $filename;
95                 $hash_files++;
96         }
97 }
98
99 my @directories;
100
101 sub usage {
102         my $name = shift(@_);
103         die <<USAGE;
104 Usage: $name [--opts] directory [directory...]
105 Finds duplicate files in the given directories, and replaces all identical
106 copies of a file with hard-links to a single file.
107
108 Several options modify the definition of a "duplicate".  By default, files
109 which have differences in owner uid or gid, permission (mode), or
110 modification time (mtime) are considered different, so that hardlinking
111 files does not also change their attributes.  Additionally, all files of
112 zero size are ignored for performance reasons (there tend to be many
113 of them, and they tend not to release any space when replaced with
114 hard links).
115
116         --access        uid, gid, and mode may be different for identical
117                         files
118
119         --debug         show all steps in duplication discovery process
120                         (implies --verbose)
121
122         --find          pass next options (up to --) to find command
123
124         --lock FILE     exit immediately (status 10) if unable to obtain a 
125                         flock(LOCK_EX|LOCK_NB) on FILE
126
127         --lock-rm       remove lock file at exit
128
129         --sort          pass next options (up to --) to sort command
130
131         --timestamps    mtime may be different for identical files
132
133         --skip-compare  skip byte-by-byte file comparisons
134
135         --skip-hash     skip calculation of hash function on files
136
137         --trust         old name for --skip-compare
138                         (trust the hash function)
139
140         --verbose       report files as they are considered
141
142         --zeros         hard-link zero-length files too
143 USAGE
144 }
145
146 while ($#ARGV >= 0) {
147         my $arg = shift(@ARGV);
148         if ($arg eq '--access') {
149                 $collapse_access = 1;
150         } elsif ($arg eq '--timestamps') {
151                 $collapse_timestamp = 1;
152         } elsif ($arg eq '--zeros') {
153                 $collapse_zero = 1;
154         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
155                 $skip_compares = 1;
156         } elsif ($arg eq '--skip-hash') {
157                 $skip_hashes = 1;
158         } elsif ($arg eq '--verbose') {
159                 $verbose = 1;
160         } elsif ($arg eq '--lock-rm') {
161                 $lock_rm = 1;
162         } elsif ($arg eq '--lock') {
163                 $lock_file = shift(@ARGV);
164                 unless (defined($lock_file)) {
165                         usage($0);
166                         exit(1);
167                 }
168         } elsif ($arg eq '--debug') {
169                 $debug = $verbose = 1;
170         } elsif ($arg eq '--find') {
171                 while ($#ARGV >= 0) {
172                         my $extra_arg = shift(@ARGV);
173                         last if $extra_arg eq '--';
174                         push(@extra_find_opts, $extra_arg);
175                 }
176         } elsif ($arg eq '--sort') {
177                 while ($#ARGV >= 0) {
178                         my $extra_arg = shift(@ARGV);
179                         last if $extra_arg eq '--';
180                         push(@extra_sort_opts, $extra_arg);
181                 }
182         } elsif ($arg =~ /^-/o) {
183                 usage($0);
184                 exit(1);
185         } else {
186                 push(@directories, $arg);
187         }
188 }
189
190 if ($skip_hashes && $skip_compares) {
191         die "Cannot skip both hashes and compares.\n";
192 }
193
194 if (defined($lock_file)) {
195         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
196         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
197         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
198         $lock_obtained = 1;
199 }
200
201 END {
202         if ($lock_obtained) {
203                 print STDERR "Removing '$lock_file'.\n" if $verbose;
204                 unlink($lock_file) or warn "unlink: $lock_file: $!";
205         }
206 }
207
208 sub tick_quote {
209         my ($text) = (@_);
210         $text =~ s/'/'\''/go;
211         return "'$text'";
212 }
213
214 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
215 my $printf_string = '%s ' .
216         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
217         ($collapse_timestamp ? '0 '     : '%T@ ') .
218         '%i %p\0';
219
220 push(@find_command, '!', '-empty') unless $collapse_zero;
221 push(@find_command, '-printf', $printf_string);
222
223 my @sort_command = ('sort', '-znr', @extra_sort_opts);
224 my @quoted_sort_command = @sort_command;
225 grep(tick_quote($_), @quoted_sort_command);
226 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
227
228 my @quoted_find_command = @find_command;
229 grep(tick_quote($_), @quoted_find_command);
230 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
231 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
232
233 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
234 $/ = "\0";
235
236 # Input is sorted so that all weak keys are contiguous.
237 # When the key changes, we have to process all files we previously know about.
238 my $current_key = -1;
239
240 # $inode_to_file_name{$inode} = [@file_names]
241 my %inode_to_file_name = ();
242
243 # Link files
244 sub link_files {
245         my ($from, $to) = (@_);
246         my $inode_dir = $to;
247         my $inode_base = $to;
248         $inode_dir =~ s:[^/]*$::o;
249         $inode_base =~ s:^.*/::os;
250         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
251         my $quoted_from = tick_quote($from);
252         my $quoted_to = tick_quote($to);
253         print STDERR "ln -f $quoted_from $quoted_to\n";
254         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
255         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
256         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
257         unless (rename($tmp_to, $to)) {
258                 my $saved_bang = $!;
259                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
260                 die "rename: $tmp_to -> $from: $saved_bang";
261         }
262 }
263
264 # Process all known files so far.
265 sub merge_files {
266         $merges_attempted++;
267
268         my %hash_to_inode;
269         # Used to stop link retry loops (there is a goto in here!  Actually two...)
270         my %stop_loop;
271
272         my @candidate_list = keys(%inode_to_file_name);
273         $input_files += @candidate_list;
274         if (@candidate_list < 2) {
275                 print STDERR "Merging...only one candidate to merge..." if $debug;
276                 $trivially_unique++;
277                 goto end_merge;
278         }
279
280         print STDERR "Merging...\n" if $debug;
281         foreach my $candidate (@candidate_list) {
282                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
283                 my $ok = 0;
284                 my $digest;
285
286 hash_file:
287
288                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
289                         print STDERR "\t\tDigesting file $filename\n" if $debug;
290                         if ((-l $filename) || ! -f _) {
291                                 warn "Bogon file " . tick_quote($filename);
292                                 $surprises++;
293                                 next;
294                         }
295                         eval { 
296                                 $digest = digest($filename); 
297                         };
298                         if ($@) {
299                                 warn "Digest($filename)(#$candidate) failed: $@";
300                                 $hash_errors++;
301                         } else {
302                                 $ok = 1;
303                                 last hash_file;
304                         }
305                 }
306                 if ($ok) {
307                         print STDERR "\t\tDigest is $digest\n" if $debug;
308
309                         my $incumbent = $hash_to_inode{$digest};
310                         if (defined($incumbent)) {
311                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
312
313                                 my $finished = 0;
314
315 link_start:
316
317                                 until ($finished) {
318                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
319                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
320                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
321                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
322
323 incumbent_file:
324
325                                         foreach my $incumbent_file (@incumbent_names) {
326                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
327                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
328
329                                                 if (!defined($incumbent_blocks)) {
330                                                         warn "lstat: $incumbent_file: $!";
331                                                         $surprises++;
332                                                         next incumbent_file;
333                                                 }
334
335                                                 if ($incumbent_ino != $incumbent) {
336                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
337                                                         $surprises++;
338                                                         next incumbent_file;
339                                                 }
340
341                                                 my $at_least_one_link_done = 0;
342
343 candidate_file:
344
345                                                 foreach my $candidate_file (@candidate_names) {
346                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
347                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
348
349                                                         if (!defined($candidate_blocks)) {
350                                                                 warn "lstat: $candidate_file: $!";
351                                                                 $surprises++;
352                                                                 next candidate_file;
353                                                         }
354
355                                                         if ($candidate_ino != $candidate) {
356                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
357                                                                 $surprises++;
358                                                                 next candidate_file;
359                                                         }
360
361                                                         if ($candidate_size != $incumbent_size) {
362                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
363                                                                 $surprises++;
364                                                                 next candidate_file;
365                                                         }
366
367                                                         my $identical;
368
369                                                         eval {
370                                                                 if ($skip_compares) {
371                                                                         print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
372                                                                         $identical = 1;
373                                                                 } else {
374                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
375                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
376                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
377                                                                         if (compare($incumbent_file, $candidate_file)) {
378                                                                                 $compare_differences++;
379                                                                                 $identical = 0;
380                                                                                 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
381                                                                                 # Some kind of I/O error is more likely, so this message cannot be turned off.
382                                                                                 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
383                                                                                 # so the warning in that case is quite silly.  Hmmm.
384                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
385                                                                         } else {
386                                                                                 $identical = 1;
387                                                                         }
388                                                                         $compare_count++;
389                                                                         $compare_bytes += $incumbent_size;
390                                                                 }
391                                                         };
392                                                         if ($@) {
393                                                                 warn $@;
394                                                                 $compare_errors++;
395                                                                 next candidate_file;
396                                                         }
397
398                                                         if ($identical) {
399                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
400
401                                                                 # We have to do this to break out of a possible infinite loop.
402                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
403                                                                 # such that A1 and B1 are in non-writable directories, we will loop
404                                                                 # forever hardlinking A2 with A and B.
405                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
406
407                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
408                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
409                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
410                                                                 } else {
411                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
412                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
413
414                                                                         my $link_done = 0;
415
416                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
417                                                                         if ($candidate_nlink > $incumbent_nlink) {
418                                                                                 $from_file = $candidate_file;
419                                                                                 $to_file = $incumbent_file;
420                                                                                 $from_inode = $candidate;
421                                                                                 $to_inode = $incumbent;
422                                                                                 $from_nlink = $candidate_nlink;
423                                                                                 $to_nlink = $incumbent_nlink;
424                                                                         } else {
425                                                                                 $to_file = $candidate_file;
426                                                                                 $from_file = $incumbent_file;
427                                                                                 $to_inode = $candidate;
428                                                                                 $from_inode = $incumbent;
429                                                                                 $to_nlink = $candidate_nlink;
430                                                                                 $from_nlink = $incumbent_nlink;
431                                                                         }
432
433                                                                         eval {
434                                                                                 link_files($from_file, $to_file);
435                                                                                 $link_done = 1;
436                                                                         };
437
438                                                                         if ($@) {
439                                                                                 warn $@;
440                                                                                 $link_errors++;
441
442                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
443                                                                                 $link_retries++;
444
445                                                                                 eval {
446                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
447                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
448                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
449                                                                                         link_files($from_file, $to_file);
450                                                                                         $link_done = 1;
451                                                                                 };
452
453                                                                                 if ($@) {
454                                                                                         warn $@;
455                                                                                         $link_errors++;
456                                                                                 }
457                                                                         }
458
459                                                                         # Note since the files are presumably identical, they both have the same size.
460                                                                         # My random number generator chooses the incumbent's size.
461
462                                                                         if ($link_done) {
463                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
464                                                                                 $inode_to_file_name{$from_inode}->{$to_file} = undef;
465                                                                                 $hash_to_inode{$digest} = $from_inode;
466
467                                                                                 $hard_links++;
468                                                                                 if ($to_nlink == 1) {
469                                                                                         $recovered_files++;
470                                                                                         $recovered_bytes += $incumbent_size;
471                                                                                 }
472
473                                                                                 # FIXME:  Now we're really confused for some reason.
474                                                                                 # Start over to rebuild state.
475                                                                                 next link_start;
476                                                                         } else {
477                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
478
479                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
480                                                                                 # tried all possible ways to hardlink the file out of existence first;
481                                                                                 # however, that is complex and only benefits a silly statistic.
482                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
483                                                                                         $lost_files++;
484                                                                                         $lost_bytes += $incumbent_size;
485                                                                                 }
486                                                                         }
487                                                                 }
488                                                         }
489                                                 }
490                                         }
491                                         $finished = 1;
492                                 }
493                         } else {
494                                 print STDERR "\t\tNew hash entered\n" if $debug;
495                                 $hash_to_inode{$digest} = $candidate;
496                         }
497                 } else {
498                         warn "No digests found for inode $candidate\n";
499                         delete $inode_to_file_name{$candidate};
500                 }
501         }
502
503 end_merge:
504
505         print STDERR "Merge done.\n" if $debug;
506         undef %inode_to_file_name;
507 }
508
509 while (<FIND>) {
510         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
511         die "read error: $!\nLast input line was '$_'" unless defined($name);
512
513         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
514
515         unless (! (-l $name) && (-f _)) {
516                 warn "Bogon file " . tick_quote($name);
517                 $input_bogons++;
518                 next;
519         }
520
521         $input_links++;
522         merge_files if $weak_key ne $current_key;
523         $current_key = $weak_key;
524
525         $inode_to_file_name{$inode}->{$name} = undef;
526
527         print STDERR "$name\n" if $verbose;
528 }
529
530 merge_files;
531
532 my $stats_blob = <<STATS;
533 compare_bytes           $compare_bytes
534 compare_count           $compare_count
535 compare_differences     $compare_differences
536 compare_errors          $compare_errors
537 hard_links              $hard_links
538 hash_bytes              $hash_bytes
539 hash_errors             $hash_errors
540 hash_files              $hash_files
541 input_bogons            $input_bogons
542 input_files             $input_files
543 input_links             $input_links
544 link_errors             $link_errors
545 link_retries            $link_retries
546 lost_bytes              $lost_bytes
547 lost_files              $lost_files
548 merges_attempted        $merges_attempted
549 recovered_bytes         $recovered_bytes
550 recovered_files         $recovered_files
551 surprises               $surprises
552 trivially_unique        $trivially_unique
553 STATS
554
555 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
556
557 print STDERR $stats_blob;
558
559 exit(0);