tick_quote: properly quote the string '\''
[dupemerge] / faster-dupemerge
1 #!/usr/bin/perl -w
2 # $Id$
3
4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
7
8 use strict;
9 use Fcntl qw(:DEFAULT :flock);
10 use File::Compare;
11 use File::Temp;
12
13 my $input_links = 0;
14 my $input_files = 0;
15 my $input_bogons = 0;
16 my $hash_bytes = 0;
17 my $hash_files = 0;
18 my $hash_errors = 0;
19 my $compare_bytes = 0;
20 my $compare_count = 0;
21 my $compare_errors = 0;
22 my $compare_differences = 0;
23 my $trivially_unique = 0;
24 my $merges_attempted = 0;
25 my $hard_links = 0;
26 my $link_errors = 0;
27 my $link_retries = 0;
28 my $recovered_bytes = 0;
29 my $recovered_files = 0;
30 my $lost_files = 0;
31 my $lost_bytes = 0;
32 my $surprises = 0;
33
34 eval '
35         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
36 ';
37
38 if ($@) {
39         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
40
41         eval <<'DIGEST';
42                 sub really_digest {
43                         my ($filename) = (@_);
44                         my $fv = open(MD5SUM, "-|");    
45                         die "fork: $!" unless defined($fv);
46                         if ($fv) {
47                                 my ($sum_line) = <MD5SUM>;
48                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
50                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
51                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
52                                 return $sum;
53                         } else {
54                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
55                                 exec('md5sum');
56                                 # Perl guarantees it will die here
57                         }
58                 }
59 DIGEST
60 } else {
61         eval <<'DIGEST';
62                 sub really_digest {
63                         my ($filename) = (@_);
64                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65                         my $ctx = Digest::SHA1->new;
66                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
68                         $ctx->addfile(\*FILE);
69                         close(FILE) or die "close: $filename: $!";
70                         return $ctx->b64digest;
71                 }
72 DIGEST
73 }
74         
75 my $collapse_access = 0;
76 my $collapse_timestamp = 0;
77 my $collapse_zero = 0;
78 my $skip_compares = 0;
79 my $skip_hashes = 0;
80 my $verbose = 0;
81 my $debug = 0;
82 my $dry_run = 0;
83 my $humane = 0;
84 my @extra_find_opts = ();
85 my @extra_sort_opts = ();
86 my $lock_file;
87 my $lock_rm = 0;
88 my $lock_obtained = 0;
89
90 sub digest {
91         my ($filename) = (@_);
92         if ($skip_hashes) {
93                 return "SKIPPING HASHES";
94         } else {
95                 my $digest = &really_digest($filename);
96                 $hash_bytes += -s $filename;
97                 $hash_files++;
98                 return $digest
99         }
100 }
101
102 my @directories;
103
104 sub usage {
105         my $name = shift(@_);
106         die <<USAGE;
107 Usage: $name [--opts] directory [directory...]
108 Finds duplicate files in the given directories, and replaces all identical
109 copies of a file with hard-links to a single file.
110
111 Several options modify the definition of a "duplicate".  By default, files
112 which have differences in owner uid or gid, permission (mode), or
113 modification time (mtime) are considered different, so that hardlinking
114 files does not also change their attributes.  Additionally, all files of
115 zero size are ignored for performance reasons (there tend to be many
116 of them, and they tend not to release any space when replaced with
117 hard links).
118
119         --access        uid, gid, and mode may be different for identical
120                         files
121
122         --debug         show all steps in duplication discovery process
123                         (implies --verbose)
124
125         --dry-run       do not lock files or make changes to filesystem
126
127         --find          pass next options (up to --) to find command
128
129         --humane        human-readable statistics (e.g. 1 048 576)
130
131         --lock FILE     exit immediately (status 10) if unable to obtain a 
132                         flock(LOCK_EX|LOCK_NB) on FILE
133
134         --lock-rm       remove lock file at exit
135
136         --sort          pass next options (up to --) to sort command
137
138         --timestamps    mtime may be different for identical files
139
140         --skip-compare  skip byte-by-byte file comparisons
141
142         --skip-hash     skip calculation of hash function on files
143
144         --trust         old name for --skip-compare
145                         (trust the hash function)
146
147         --verbose       report files as they are considered
148
149         --zeros         hard-link zero-length files too
150 USAGE
151 }
152
153 while ($#ARGV >= 0) {
154         my $arg = shift(@ARGV);
155         if ($arg eq '--access') {
156                 $collapse_access = 1;
157         } elsif ($arg eq '--timestamps') {
158                 $collapse_timestamp = 1;
159         } elsif ($arg eq '--zeros') {
160                 $collapse_zero = 1;
161         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
162                 $skip_compares = 1;
163         } elsif ($arg eq '--skip-hash') {
164                 $skip_hashes = 1;
165         } elsif ($arg eq '--verbose') {
166                 $verbose = 1;
167         } elsif ($arg eq '--lock-rm') {
168                 $lock_rm = 1;
169         } elsif ($arg eq '--lock') {
170                 $lock_file = shift(@ARGV);
171                 unless (defined($lock_file)) {
172                         usage($0);
173                         exit(1);
174                 }
175         } elsif ($arg eq '--debug') {
176                 $debug = $verbose = 1;
177         } elsif ($arg eq '--dry-run') {
178                 $dry_run = 1;
179         } elsif ($arg eq '--humane') {
180                 $humane = 1;
181         } elsif ($arg eq '--find') {
182                 while ($#ARGV >= 0) {
183                         my $extra_arg = shift(@ARGV);
184                         last if $extra_arg eq '--';
185                         push(@extra_find_opts, $extra_arg);
186                 }
187         } elsif ($arg eq '--sort') {
188                 while ($#ARGV >= 0) {
189                         my $extra_arg = shift(@ARGV);
190                         last if $extra_arg eq '--';
191                         push(@extra_sort_opts, $extra_arg);
192                 }
193         } elsif ($arg =~ /^-/o) {
194                 usage($0);
195                 exit(1);
196         } else {
197                 push(@directories, $arg);
198         }
199 }
200
201 if ($skip_hashes && $skip_compares) {
202         die "Cannot skip both hashes and compares.\n";
203 }
204
205 @directories or usage;
206
207 if (defined($lock_file) && !$dry_run) {
208         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
209         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
210         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
211         $lock_obtained = 1;
212 }
213
214 END {
215         if ($lock_obtained && !$dry_run) {
216                 print STDERR "Removing '$lock_file'.\n" if $verbose;
217                 unlink($lock_file) or warn "unlink: $lock_file: $!";
218         }
219 }
220
221 sub tick_quote {
222         my ($text) = (@_);
223         $text =~ s/'/'\\''/go;
224         return "'$text'";
225 }
226
227 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
228 my $printf_string = '%s ' .
229         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
230         ($collapse_timestamp ? '0 '     : '%T@ ') .
231         '%i %p\0';
232
233 push(@find_command, '!', '-empty') unless $collapse_zero;
234 push(@find_command, '-printf', $printf_string);
235
236 my @sort_command = ('sort', '-znr', @extra_sort_opts);
237 my @quoted_sort_command = @sort_command;
238 grep(tick_quote($_), @quoted_sort_command);
239 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
240
241 my @quoted_find_command = @find_command;
242 grep(tick_quote($_), @quoted_find_command);
243 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
244 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
245
246 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
247 $/ = "\0";
248
249 # Input is sorted so that all weak keys are contiguous.
250 # When the key changes, we have to process all files we previously know about.
251 my $current_key = -1;
252
253 # $inode_to_file_name{$inode} = [@file_names]
254 my %inode_to_file_name = ();
255
256 # Link files
257 sub link_files {
258         my ($from, $to) = (@_);
259
260         my $quoted_from = tick_quote($from);
261         my $quoted_to = tick_quote($to);
262         print STDERR "ln -f $quoted_from $quoted_to\n";
263
264         return if $dry_run;
265
266         my $inode_dir = $to;
267         my $inode_base = $to;
268         $inode_dir =~ s:[^/]*$::o;
269         $inode_base =~ s:^.*/::os;
270         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
271         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
272         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
273         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
274         unless (rename($tmp_to, $to)) {
275                 my $saved_bang = $!;
276                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
277                 die "rename: $tmp_to -> $from: $saved_bang";
278         }
279 }
280
281 # Process all known files so far.
282 sub merge_files {
283         $merges_attempted++;
284
285         my %hash_to_inode;
286         # Used to stop link retry loops (there is a goto in here!  Actually two...)
287         my %stop_loop;
288
289         my @candidate_list = keys(%inode_to_file_name);
290         $input_files += @candidate_list;
291         if (@candidate_list < 2) {
292                 print STDERR "Merging...only one candidate to merge..." if $debug;
293                 $trivially_unique++;
294                 goto end_merge;
295         }
296
297         print STDERR "Merging...\n" if $debug;
298         foreach my $candidate (@candidate_list) {
299                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
300                 my $ok = 0;
301                 my $digest;
302
303 hash_file:
304
305                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
306                         print STDERR "\t\tDigesting file $filename\n" if $debug;
307                         if ((-l $filename) || ! -f _) {
308                                 warn "Bogon file " . tick_quote($filename);
309                                 $surprises++;
310                                 next;
311                         }
312                         eval { 
313                                 $digest = digest($filename); 
314                         };
315                         if ($@) {
316                                 warn "Digest($filename)(#$candidate) failed: $@";
317                                 $hash_errors++;
318                         } else {
319                                 $ok = 1;
320                                 last hash_file;
321                         }
322                 }
323                 if ($ok) {
324                         print STDERR "\t\tDigest is $digest\n" if $debug;
325
326                         my $incumbent = $hash_to_inode{$digest};
327                         if (defined($incumbent)) {
328                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
329
330                                 my $finished = 0;
331
332 link_start:
333
334                                 until ($finished) {
335                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
336                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
337                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
338                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
339
340 incumbent_file:
341
342                                         foreach my $incumbent_file (@incumbent_names) {
343                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
344                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
345
346                                                 if (!defined($incumbent_blocks)) {
347                                                         warn "lstat: $incumbent_file: $!";
348                                                         $surprises++;
349                                                         next incumbent_file;
350                                                 }
351
352                                                 if ($incumbent_ino != $incumbent) {
353                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
354                                                         $surprises++;
355                                                         next incumbent_file;
356                                                 }
357
358                                                 my $at_least_one_link_done = 0;
359
360 candidate_file:
361
362                                                 foreach my $candidate_file (@candidate_names) {
363                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
364                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
365
366                                                         if (!defined($candidate_blocks)) {
367                                                                 warn "lstat: $candidate_file: $!";
368                                                                 $surprises++;
369                                                                 next candidate_file;
370                                                         }
371
372                                                         if ($candidate_ino != $candidate) {
373                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
374                                                                 $surprises++;
375                                                                 next candidate_file;
376                                                         }
377
378                                                         if ($candidate_size != $incumbent_size) {
379                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
380                                                                 $surprises++;
381                                                                 next candidate_file;
382                                                         }
383
384                                                         my $identical;
385
386                                                         eval {
387                                                                 if ($skip_compares) {
388                                                                         print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
389                                                                         $identical = 1;
390                                                                 } else {
391                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
392                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
393                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
394                                                                         if (compare($incumbent_file, $candidate_file)) {
395                                                                                 $compare_differences++;
396                                                                                 $identical = 0;
397                                                                                 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
398                                                                                 # Some kind of I/O error is more likely, so this message cannot be turned off.
399                                                                                 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
400                                                                                 # so the warning in that case is quite silly.  Hmmm.
401                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
402                                                                         } else {
403                                                                                 $identical = 1;
404                                                                         }
405                                                                         $compare_count++;
406                                                                         $compare_bytes += $incumbent_size;
407                                                                 }
408                                                         };
409                                                         if ($@) {
410                                                                 warn $@;
411                                                                 $compare_errors++;
412                                                                 next candidate_file;
413                                                         }
414
415                                                         if ($identical) {
416                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
417
418                                                                 # We have to do this to break out of a possible infinite loop.
419                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
420                                                                 # such that A1 and B1 are in non-writable directories, we will loop
421                                                                 # forever hardlinking A2 with A and B.
422                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
423
424                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
425                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
426                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
427                                                                 } else {
428                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
429                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
430
431                                                                         my $link_done = 0;
432
433                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
434                                                                         if ($candidate_nlink > $incumbent_nlink) {
435                                                                                 $from_file = $candidate_file;
436                                                                                 $to_file = $incumbent_file;
437                                                                                 $from_inode = $candidate;
438                                                                                 $to_inode = $incumbent;
439                                                                                 $from_nlink = $candidate_nlink;
440                                                                                 $to_nlink = $incumbent_nlink;
441                                                                         } else {
442                                                                                 $to_file = $candidate_file;
443                                                                                 $from_file = $incumbent_file;
444                                                                                 $to_inode = $candidate;
445                                                                                 $from_inode = $incumbent;
446                                                                                 $to_nlink = $candidate_nlink;
447                                                                                 $from_nlink = $incumbent_nlink;
448                                                                         }
449
450                                                                         eval {
451                                                                                 link_files($from_file, $to_file);
452                                                                                 $link_done = 1;
453                                                                         };
454
455                                                                         if ($@) {
456                                                                                 warn $@;
457                                                                                 $link_errors++;
458
459                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
460                                                                                 $link_retries++;
461
462                                                                                 eval {
463                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
464                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
465                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
466                                                                                         link_files($from_file, $to_file);
467                                                                                         $link_done = 1;
468                                                                                 };
469
470                                                                                 if ($@) {
471                                                                                         warn $@;
472                                                                                         $link_errors++;
473                                                                                 }
474                                                                         }
475
476                                                                         # Note since the files are presumably identical, they both have the same size.
477                                                                         # My random number generator chooses the incumbent's size.
478
479                                                                         if ($link_done) {
480                                                                                 # Since we're in a dry run, the filesystem doesn't change.
481                                                                                 # Our notion of what the filesystem should look like should not change either.
482                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
483                                                                                 unless ($dry_run) {
484                                                                                         $inode_to_file_name{$from_inode}->{$to_file} = undef;
485                                                                                         $hash_to_inode{$digest} = $from_inode;
486                                                                                 }
487
488                                                                                 $hard_links++;
489                                                                                 if ($to_nlink == 1) {
490                                                                                         $recovered_files++;
491                                                                                         $recovered_bytes += $incumbent_size;
492                                                                                 }
493
494                                                                                 # FIXME:  Now we're really confused for some reason.
495                                                                                 # Start over to rebuild state.
496                                                                                 next link_start;
497                                                                         } else {
498                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
499
500                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
501                                                                                 # tried all possible ways to hardlink the file out of existence first;
502                                                                                 # however, that is complex and only benefits a silly statistic.
503                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
504                                                                                         $lost_files++;
505                                                                                         $lost_bytes += $incumbent_size;
506                                                                                 }
507                                                                         }
508                                                                 }
509                                                         }
510                                                 }
511                                         }
512                                         $finished = 1;
513                                 }
514                         } else {
515                                 print STDERR "\t\tNew hash entered\n" if $debug;
516                                 $hash_to_inode{$digest} = $candidate;
517                         }
518                 } else {
519                         warn "No digests found for inode $candidate\n";
520                         delete $inode_to_file_name{$candidate};
521                 }
522         }
523
524 end_merge:
525
526         print STDERR "Merge done.\n" if $debug;
527         undef %inode_to_file_name;
528 }
529
530 while (<FIND>) {
531         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
532         die "read error: $!\nLast input line was '$_'" unless defined($name);
533
534         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
535
536         unless (! (-l $name) && (-f _)) {
537                 warn "Bogon file " . tick_quote($name);
538                 $input_bogons++;
539                 next;
540         }
541
542         $input_links++;
543         merge_files if $weak_key ne $current_key;
544         $current_key = $weak_key;
545
546         $inode_to_file_name{$inode}->{$name} = undef;
547
548         print STDERR "$name\n" if $verbose;
549 }
550
551 merge_files;
552
553 my $stats_blob = <<STATS;
554 compare_bytes           $compare_bytes
555 compare_count           $compare_count
556 compare_differences     $compare_differences
557 compare_errors          $compare_errors
558 hard_links              $hard_links
559 hash_bytes              $hash_bytes
560 hash_errors             $hash_errors
561 hash_files              $hash_files
562 input_bogons            $input_bogons
563 input_files             $input_files
564 input_links             $input_links
565 link_errors             $link_errors
566 link_retries            $link_retries
567 lost_bytes              $lost_bytes
568 lost_files              $lost_files
569 merges_attempted        $merges_attempted
570 recovered_bytes         $recovered_bytes
571 recovered_files         $recovered_files
572 surprises               $surprises
573 trivially_unique        $trivially_unique
574 STATS
575
576 if ($humane) {
577         my $max_num_len = 0;
578
579         sub measure_numbers {
580                 my ($num) = @_;
581                 my $len = length($num);
582                 $len += int( (length($num) - 1) / 3);
583                 $max_num_len = $len if $len > $max_num_len;
584         }
585
586         (my $dummy = $stats_blob) =~ s/\d+/measure_numbers($&)/geos;
587
588         sub space_numbers {
589                 my ($num) = @_;
590                 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os;
591                 $num = ' ' x ($max_num_len - length($num)) . $num;
592                 return $num;
593         }
594
595         $stats_blob =~ s/\d+/space_numbers($&)/geos;
596 }
597
598 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
599
600 print STDERR $stats_blob;
601
602 exit(0);