Initial commit
[dupemerge] / faster-dupemerge
1 #!/usr/bin/perl -w
2 use strict;
3 use Fcntl qw(:DEFAULT :flock);
4 use File::Compare;
5 use File::Temp;
6
7 my $links_input = 0;
8 my $files_input = 0;
9 my $bogons_input = 0;
10 my $files_hashed = 0;
11 my $hash_errors = 0;
12 my $compare_count = 0;
13 my $compare_errors = 0;
14 my $compare_differences = 0;
15 my $trivially_unique = 0;
16 my $merges_attempted = 0;
17 my $hard_links = 0;
18 my $link_errors = 0;
19 my $link_retries = 0;
20 my $bytes_recovered = 0;
21 my $files_recovered = 0;
22 my $files_lost = 0;
23 my $bytes_lost = 0;
24 my $surprises = 0;
25
26 eval '
27         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
28 ';
29
30 if ($@) {
31         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
32
33         eval <<'DIGEST';
34                 sub digest {
35                         my ($filename) = (@_);
36                         my $fv = open(MD5SUM, "-|");    
37                         die "fork: $!" unless defined($fv);
38                         if ($fv) {
39                                 my ($sum_line) = <MD5SUM>;
40                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
41                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
42                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
43                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
44                                 return $sum;
45                         } else {
46                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
47                                 exec('md5sum');
48                                 # Perl guarantees it will die here
49                         }
50                 }
51 DIGEST
52 } else {
53         eval <<'DIGEST';
54                 sub digest {
55                         my ($filename) = (@_);
56                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
57                         my $ctx = Digest::SHA1->new;
58                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
59                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
60                         $ctx->addfile(\*FILE);
61                         close(FILE) or die "close: $filename: $!";
62                         return $ctx->b64digest;
63                 }
64 DIGEST
65 }
66         
67 my $collapse_access = 0;
68 my $collapse_timestamp = 0;
69 my $collapse_zero = 0;
70 my $trust_hashes = 0;
71 my $verbose = 0;
72 my $debug = 0;
73 my @extra_find_opts = ();
74 my @extra_sort_opts = ();
75 my $lock_file;
76 my $lock_rm = 0;
77 my $lock_obtained = 0;
78
79 my @directories;
80
81 sub usage {
82         my $name = shift(@_);
83         die <<USAGE;
84 Usage: $name [--opts] directory [directory...]
85 Finds duplicate files in the given directories, and replaces all identical
86 copies of a file with hard-links to a single file.
87
88 Several options modify the definition of a "duplicate".  By default, files
89 which have differences in owner uid or gid, permission (mode), or
90 modification time (mtime) are considered different, so that hardlinking
91 files does not also change their attributes.  Additionally, all files of
92 zero size are ignored for performance reasons (there tend to be many
93 of them, and they tend not to release any space when replaced with
94 hard links).
95
96         --access        uid, gid, and mode may be different for identical
97                         files
98
99         --debug         show all steps in duplication discovery process
100                         (implies --verbose)
101
102         --find          pass next options (up to --) to find command
103
104         --lock FILE     exit immediately (status 10) if unable to obtain a 
105                         flock(LOCK_EX|LOCK_NB) on FILE
106
107         --lock-rm       remove lock file at exit
108
109         --sort          pass next options (up to --) to sort command
110
111         --timestamps    mtime may be different for identical files
112
113         --trust         skip byte-by-byte file comparisons
114                         (trust the hash function)
115
116         --verbose       report files as they are considered
117
118         --zeros         hard-link zero-length files too
119 USAGE
120 }
121
122 while ($#ARGV >= 0) {
123         my $arg = shift(@ARGV);
124         if ($arg eq '--access') {
125                 $collapse_access = 1;
126         } elsif ($arg eq '--timestamps') {
127                 $collapse_timestamp = 1;
128         } elsif ($arg eq '--zeros') {
129                 $collapse_zero = 1;
130         } elsif ($arg eq '--trust') {
131                 $trust_hashes = 1;
132         } elsif ($arg eq '--verbose') {
133                 $verbose = 1;
134         } elsif ($arg eq '--lock-rm') {
135                 $lock_rm = 1;
136         } elsif ($arg eq '--lock') {
137                 $lock_file = shift(@ARGV);
138                 unless (defined($lock_file)) {
139                         usage($0);
140                         exit(1);
141                 }
142         } elsif ($arg eq '--debug') {
143                 $debug = $verbose = 1;
144         } elsif ($arg eq '--find') {
145                 while ($#ARGV >= 0) {
146                         my $extra_arg = shift(@ARGV);
147                         last if $extra_arg eq '--';
148                         push(@extra_find_opts, $extra_arg);
149                 }
150         } elsif ($arg eq '--sort') {
151                 while ($#ARGV >= 0) {
152                         my $extra_arg = shift(@ARGV);
153                         last if $extra_arg eq '--';
154                         push(@extra_sort_opts, $extra_arg);
155                 }
156         } elsif ($arg =~ /^-/o) {
157                 usage($0);
158                 exit(1);
159         } else {
160                 push(@directories, $arg);
161         }
162 }
163
164 if (defined($lock_file)) {
165         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
166         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
167         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
168         $lock_obtained = 1;
169 }
170
171 END {
172         if ($lock_obtained) {
173                 print STDERR "Removing '$lock_file'.\n" if $verbose;
174                 unlink($lock_file) or warn "unlink: $lock_file: $!";
175         }
176 }
177
178 sub tick_quote {
179         my ($text) = (@_);
180         $text =~ s/'/'\''/go;
181         return "'$text'";
182 }
183
184 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
185 my $printf_string = '%s ' .
186         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
187         ($collapse_timestamp ? '0 '     : '%T@ ') .
188         '%i %p\0';
189
190 push(@find_command, '!', '-empty') unless $collapse_zero;
191 push(@find_command, '-printf', $printf_string);
192
193 my @sort_command = ('sort', '-znr', @extra_sort_opts);
194 my @quoted_sort_command = @sort_command;
195 grep(tick_quote($_), @quoted_sort_command);
196 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
197
198 my @quoted_find_command = @find_command;
199 grep(tick_quote($_), @quoted_find_command);
200 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
201 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
202
203 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
204 $/ = "\0";
205
206 # Input is sorted so that all weak keys are contiguous.
207 # When the key changes, we have to process all files we previously know about.
208 my $current_key = -1;
209
210 # $inode_to_file_name{$inode} = [@file_names]
211 my %inode_to_file_name = ();
212
213 # Link files
214 sub link_files {
215         my ($from, $to) = (@_);
216         my $inode_dir = $to;
217         my $inode_base = $to;
218         $inode_dir =~ s:[^/]*$::o;
219         $inode_base =~ s:^.*/::os;
220         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
221         my $quoted_from = tick_quote($from);
222         my $quoted_to = tick_quote($to);
223         print STDERR "ln -f $quoted_from $quoted_to\n";
224         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
225         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
226         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
227         unless (rename($tmp_to, $to)) {
228                 my $saved_bang = $!;
229                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
230                 die "rename: $tmp_to -> $from: $saved_bang";
231         }
232 }
233
234 # Process all known files so far.
235 sub merge_files {
236         $merges_attempted++;
237
238         my %hash_to_inode;
239         # Used to stop link retry loops (there is a goto in here!  Actually two...)
240         my %stop_loop;
241
242         my @candidate_list = keys(%inode_to_file_name);
243         $files_input += @candidate_list;
244         if (@candidate_list < 2) {
245                 print STDERR "Merging...only one candidate to merge..." if $debug;
246                 $trivially_unique++;
247                 goto end_merge;
248         }
249
250         print STDERR "Merging...\n" if $debug;
251         foreach my $candidate (@candidate_list) {
252                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
253                 my $ok = 0;
254                 my $digest;
255
256 hash_file:
257
258                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
259                         print STDERR "\t\tDigesting file $filename\n" if $debug;
260                         if ((-l $filename) || ! -f _) {
261                                 warn "Bogon file " . tick_quote($filename);
262                                 $surprises++;
263                                 next;
264                         }
265                         eval { $digest = digest($filename); };
266                         if ($@) {
267                                 warn "Digest($filename)(#$candidate) failed: $@";
268                                 $hash_errors++;
269                         } else {
270                                 $ok = 1;
271                                 $files_hashed++;
272                                 last hash_file;
273                         }
274                 }
275                 if ($ok) {
276                         print STDERR "\t\tDigest is $digest\n" if $debug;
277
278                         my $incumbent = $hash_to_inode{$digest};
279                         if (defined($incumbent)) {
280                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
281
282                                 my $finished = 0;
283
284 link_start:
285
286                                 until ($finished) {
287                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
288                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
289                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
290                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
291
292         incumbent_file:
293
294                                         foreach my $incumbent_file (@incumbent_names) {
295                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
296                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
297
298                                                 if (!defined($incumbent_blocks)) {
299                                                         warn "lstat: $incumbent_file: $!";
300                                                         $surprises++;
301                                                         next incumbent_file;
302                                                 }
303
304                                                 if ($incumbent_ino != $incumbent) {
305                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
306                                                         $surprises++;
307                                                         next incumbent_file;
308                                                 }
309
310         candidate_file:
311
312                                                 my $at_least_one_link_done = 0;
313
314                                                 foreach my $candidate_file (@candidate_names) {
315                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
316                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
317
318                                                         if (!defined($candidate_blocks)) {
319                                                                 warn "lstat: $candidate_file: $!";
320                                                                 $surprises++;
321                                                                 next candidate_file;
322                                                         }
323
324                                                         if ($candidate_ino != $candidate) {
325                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
326                                                                 $surprises++;
327                                                                 next candidate_file;
328                                                         }
329
330                                                         if ($candidate_size != $incumbent_size) {
331                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
332                                                                 $surprises++;
333                                                                 next candidate_file;
334                                                         }
335
336                                                         my $identical;
337
338                                                         eval {
339                                                                 if ($trust_hashes) {
340                                                                         print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
341                                                                         $identical = 1;
342                                                                 } else {
343                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
344                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
345                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
346                                                                         if (compare($incumbent_file, $candidate_file)) {
347                                                                                 $compare_differences++;
348                                                                                 $identical = 0;
349                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
350                                                                         } else {
351                                                                                 $identical = 1;
352                                                                         }
353                                                                         $compare_count++;
354                                                                 }
355                                                         };
356                                                         if ($@) {
357                                                                 warn $@;
358                                                                 $compare_errors++;
359                                                                 next candidate_file;
360                                                         }
361
362                                                         if ($identical) {
363                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
364
365                                                                 # We have to do this to break out of a possible infinite loop.
366                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
367                                                                 # such that A1 and B1 are in non-writable directories, we will loop
368                                                                 # forever hardlinking A2 with A and B.
369                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
370
371                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
372                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
373                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
374                                                                 } else {
375                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
376                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
377
378                                                                         my $link_done = 0;
379
380                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
381                                                                         if ($candidate_nlink > $incumbent_nlink) {
382                                                                                 $from_file = $candidate_file;
383                                                                                 $to_file = $incumbent_file;
384                                                                                 $from_inode = $candidate;
385                                                                                 $to_inode = $incumbent;
386                                                                                 $from_nlink = $candidate_nlink;
387                                                                                 $to_nlink = $incumbent_nlink;
388                                                                         } else {
389                                                                                 $to_file = $candidate_file;
390                                                                                 $from_file = $incumbent_file;
391                                                                                 $to_inode = $candidate;
392                                                                                 $from_inode = $incumbent;
393                                                                                 $to_nlink = $candidate_nlink;
394                                                                                 $from_nlink = $incumbent_nlink;
395                                                                         }
396
397                                                                         eval {
398                                                                                 link_files($from_file, $to_file);
399                                                                                 $link_done = 1;
400                                                                         };
401
402                                                                         if ($@) {
403                                                                                 warn $@;
404                                                                                 $link_errors++;
405
406                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
407                                                                                 $link_retries++;
408
409                                                                                 eval {
410                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
411                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
412                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
413                                                                                         link_files($from_file, $to_file);
414                                                                                         $link_done = 1;
415                                                                                 };
416
417                                                                                 if ($@) {
418                                                                                         warn $@;
419                                                                                         $link_errors++;
420                                                                                 }
421                                                                         }
422
423                                                                         # Note since the files are presumably identical, they both have the same size.
424                                                                         # My random number generator chooses the incumbent's size.
425
426                                                                         if ($link_done) {
427                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
428                                                                                 $inode_to_file_name{$from_inode}->{$to_file} = undef;
429                                                                                 $hash_to_inode{$digest} = $from_inode;
430
431                                                                                 $hard_links++;
432                                                                                 if ($to_nlink == 1) {
433                                                                                         $files_recovered++;
434                                                                                         $bytes_recovered += $incumbent_size;
435                                                                                 }
436
437                                                                                 # FIXME:  Now we're really confused for some reason.
438                                                                                 # Start over to rebuild state.
439                                                                                 next link_start;
440                                                                         } else {
441                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
442
443                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
444                                                                                 # tried all possible ways to hardlink the file out of existence first;
445                                                                                 # however, that is complex and only benefits a silly statistic.
446                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
447                                                                                         $files_lost++;
448                                                                                         $bytes_lost += $incumbent_size;
449                                                                                 }
450                                                                         }
451                                                                 }
452                                                         }
453                                                 }
454                                         }
455                                         $finished = 1;
456                                 }
457                         } else {
458                                 print STDERR "\t\tNew hash entered\n" if $debug;
459                                 $hash_to_inode{$digest} = $candidate;
460                         }
461                 } else {
462                         warn "No digests found for inode $candidate\n";
463                         delete $inode_to_file_name{$candidate};
464                 }
465         }
466
467 end_merge:
468
469         print STDERR "Merge done.\n" if $debug;
470         undef %inode_to_file_name;
471 }
472
473 while (<FIND>) {
474         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
475         die "read error: $!\nLast input line was '$_'" unless defined($name);
476
477         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
478
479         unless (! (-l $name) && (-f _)) {
480                 warn "Bogon file " . tick_quote($name);
481                 $bogons_input++;
482                 next;
483         }
484
485         $links_input++;
486         merge_files if $weak_key ne $current_key;
487         $current_key = $weak_key;
488
489         $inode_to_file_name{$inode}->{$name} = undef;
490
491         print STDERR "$name\n" if $verbose;
492 }
493
494 merge_files;
495
496 print STDERR <<STATS;
497 links_input             $links_input
498 files_input ........... $files_input
499 bogons_input            $bogons_input
500 merges_attempted ...... $merges_attempted
501 trivially_unique        $trivially_unique
502 files_hashed .......... $files_hashed
503 hash_errors             $hash_errors
504 surprises ............. $surprises
505 compare_count           $compare_count
506 compare_differences ... $compare_differences
507 compare_errors          $compare_errors
508 hard_links ............ $hard_links
509 link_errors             $link_errors
510 link_retries .......... $link_retries
511 bytes_recovered         $bytes_recovered
512 files_recovered ....... $files_recovered
513 bytes_lost              $bytes_lost
514 files_lost ............ $files_lost
515 STATS
516
517 exit(0);