git.hungrycats.org Git - dupemerge/blob - faster-dupemerge

   1 #!/usr/bin/perl -w
   2 # $Id$
   3
   4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
   5 # Use, modification, and distribution permitted
   6 # under the terms of the GNU GPL.
   7
   8 use strict;
   9 use Fcntl qw(:DEFAULT :flock);
  10 use File::Compare;
  11 use File::Temp;
  12
  13 my $input_links = 0;
  14 my $input_files = 0;
  15 my $input_bogons = 0;
  16 my $hash_bytes = 0;
  17 my $hash_files = 0;
  18 my $hash_errors = 0;
  19 my $compare_bytes = 0;
  20 my $compare_count = 0;
  21 my $compare_errors = 0;
  22 my $compare_differences = 0;
  23 my $trivially_unique = 0;
  24 my $merges_attempted = 0;
  25 my $hard_links = 0;
  26 my $link_errors = 0;
  27 my $link_retries = 0;
  28 my $recovered_bytes = 0;
  29 my $recovered_files = 0;
  30 my $lost_files = 0;
  31 my $lost_bytes = 0;
  32 my $surprises = 0;
  33
  34 eval '
  35         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
  36 ';
  37
  38 if ($@) {
  39         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
  40
  41         eval <<'DIGEST';
  42                 sub really_digest {
  43                         my ($filename) = (@_);
  44                         my $fv = open(MD5SUM, "-|");
  45                         die "fork: $!" unless defined($fv);
  46                         if ($fv) {
  47                                 my ($sum_line) = <MD5SUM>;
  48                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
  49                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
  50                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
  51                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
  52                                 return $sum;
  53                         } else {
  54                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
  55                                 exec('md5sum');
  56                                 # Perl guarantees it will die here
  57                         }
  58                 }
  59 DIGEST
  60 } else {
  61         eval <<'DIGEST';
  62                 sub really_digest {
  63                         my ($filename) = (@_);
  64                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
  65                         my $ctx = Digest::SHA1->new;
  66                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
  67                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
  68                         $ctx->addfile(\*FILE);
  69                         close(FILE) or die "close: $filename: $!";
  70                         return $ctx->b64digest;
  71                 }
  72 DIGEST
  73 }
  74
  75 my $collapse_access = 0;
  76 my $collapse_timestamp = 0;
  77 my $collapse_zero = 0;
  78 my $skip_compares = 0;
  79 my $skip_hashes = 0;
  80 my $verbose = 0;
  81 my $debug = 0;
  82 my @extra_find_opts = ();
  83 my @extra_sort_opts = ();
  84 my $lock_file;
  85 my $lock_rm = 0;
  86 my $lock_obtained = 0;
  87
  88 sub digest {
  89         my ($filename) = (@_);
  90         if ($skip_hashes) {
  91                 return "SKIPPING HASHES";
  92         } else {
  93                 my $digest = &really_digest($filename);
  94                 $hash_bytes += -s $filename;
  95                 $hash_files++;
  96                 return $digest
  97         }
  98 }
  99
 100 my @directories;
 101
 102 sub usage {
 103         my $name = shift(@_);
 104         die <<USAGE;
 105 Usage: $name [--opts] directory [directory...]
 106 Finds duplicate files in the given directories, and replaces all identical
 107 copies of a file with hard-links to a single file.
 108
 109 Several options modify the definition of a "duplicate".  By default, files
 110 which have differences in owner uid or gid, permission (mode), or
 111 modification time (mtime) are considered different, so that hardlinking
 112 files does not also change their attributes.  Additionally, all files of
 113 zero size are ignored for performance reasons (there tend to be many
 114 of them, and they tend not to release any space when replaced with
 115 hard links).
 116
 117         --access        uid, gid, and mode may be different for identical
 118                         files
 119
 120         --debug         show all steps in duplication discovery process
 121                         (implies --verbose)
 122
 123         --find          pass next options (up to --) to find command
 124
 125         --lock FILE     exit immediately (status 10) if unable to obtain a
 126                         flock(LOCK_EX|LOCK_NB) on FILE
 127
 128         --lock-rm       remove lock file at exit
 129
 130         --sort          pass next options (up to --) to sort command
 131
 132         --timestamps    mtime may be different for identical files
 133
 134         --skip-compare  skip byte-by-byte file comparisons
 135
 136         --skip-hash     skip calculation of hash function on files
 137
 138         --trust         old name for --skip-compare
 139                         (trust the hash function)
 140
 141         --verbose       report files as they are considered
 142
 143         --zeros         hard-link zero-length files too
 144 USAGE
 145 }
 146
 147 while ($#ARGV >= 0) {
 148         my $arg = shift(@ARGV);
 149         if ($arg eq '--access') {
 150                 $collapse_access = 1;
 151         } elsif ($arg eq '--timestamps') {
 152                 $collapse_timestamp = 1;
 153         } elsif ($arg eq '--zeros') {
 154                 $collapse_zero = 1;
 155         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
 156                 $skip_compares = 1;
 157         } elsif ($arg eq '--skip-hash') {
 158                 $skip_hashes = 1;
 159         } elsif ($arg eq '--verbose') {
 160                 $verbose = 1;
 161         } elsif ($arg eq '--lock-rm') {
 162                 $lock_rm = 1;
 163         } elsif ($arg eq '--lock') {
 164                 $lock_file = shift(@ARGV);
 165                 unless (defined($lock_file)) {
 166                         usage($0);
 167                         exit(1);
 168                 }
 169         } elsif ($arg eq '--debug') {
 170                 $debug = $verbose = 1;
 171         } elsif ($arg eq '--find') {
 172                 while ($#ARGV >= 0) {
 173                         my $extra_arg = shift(@ARGV);
 174                         last if $extra_arg eq '--';
 175                         push(@extra_find_opts, $extra_arg);
 176                 }
 177         } elsif ($arg eq '--sort') {
 178                 while ($#ARGV >= 0) {
 179                         my $extra_arg = shift(@ARGV);
 180                         last if $extra_arg eq '--';
 181                         push(@extra_sort_opts, $extra_arg);
 182                 }
 183         } elsif ($arg =~ /^-/o) {
 184                 usage($0);
 185                 exit(1);
 186         } else {
 187                 push(@directories, $arg);
 188         }
 189 }
 190
 191 if ($skip_hashes && $skip_compares) {
 192         die "Cannot skip both hashes and compares.\n";
 193 }
 194
 195 if (defined($lock_file)) {
 196         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
 197         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
 198         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
 199         $lock_obtained = 1;
 200 }
 201
 202 END {
 203         if ($lock_obtained) {
 204                 print STDERR "Removing '$lock_file'.\n" if $verbose;
 205                 unlink($lock_file) or warn "unlink: $lock_file: $!";
 206         }
 207 }
 208
 209 sub tick_quote {
 210         my ($text) = (@_);
 211         $text =~ s/'/'\''/go;
 212         return "'$text'";
 213 }
 214
 215 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
 216 my $printf_string = '%s ' .
 217         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
 218         ($collapse_timestamp ? '0 '     : '%T@ ') .
 219         '%i %p\0';
 220
 221 push(@find_command, '!', '-empty') unless $collapse_zero;
 222 push(@find_command, '-printf', $printf_string);
 223
 224 my @sort_command = ('sort', '-znr', @extra_sort_opts);
 225 my @quoted_sort_command = @sort_command;
 226 grep(tick_quote($_), @quoted_sort_command);
 227 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
 228
 229 my @quoted_find_command = @find_command;
 230 grep(tick_quote($_), @quoted_find_command);
 231 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
 232 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
 233
 234 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
 235 $/ = "\0";
 236
 237 # Input is sorted so that all weak keys are contiguous.
 238 # When the key changes, we have to process all files we previously know about.
 239 my $current_key = -1;
 240
 241 # $inode_to_file_name{$inode} = [@file_names]
 242 my %inode_to_file_name = ();
 243
 244 # Link files
 245 sub link_files {
 246         my ($from, $to) = (@_);
 247         my $inode_dir = $to;
 248         my $inode_base = $to;
 249         $inode_dir =~ s:[^/]*$::o;
 250         $inode_base =~ s:^.*/::os;
 251         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
 252         my $quoted_from = tick_quote($from);
 253         my $quoted_to = tick_quote($to);
 254         print STDERR "ln -f $quoted_from $quoted_to\n";
 255         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
 256         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
 257         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
 258         unless (rename($tmp_to, $to)) {
 259                 my $saved_bang = $!;
 260                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
 261                 die "rename: $tmp_to -> $from: $saved_bang";
 262         }
 263 }
 264
 265 # Process all known files so far.
 266 sub merge_files {
 267         $merges_attempted++;
 268
 269         my %hash_to_inode;
 270         # Used to stop link retry loops (there is a goto in here!  Actually two...)
 271         my %stop_loop;
 272
 273         my @candidate_list = keys(%inode_to_file_name);
 274         $input_files += @candidate_list;
 275         if (@candidate_list < 2) {
 276                 print STDERR "Merging...only one candidate to merge..." if $debug;
 277                 $trivially_unique++;
 278                 goto end_merge;
 279         }
 280
 281         print STDERR "Merging...\n" if $debug;
 282         foreach my $candidate (@candidate_list) {
 283                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
 284                 my $ok = 0;
 285                 my $digest;
 286
 287 hash_file:
 288
 289                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
 290                         print STDERR "\t\tDigesting file $filename\n" if $debug;
 291                         if ((-l $filename) || ! -f _) {
 292                                 warn "Bogon file " . tick_quote($filename);
 293                                 $surprises++;
 294                                 next;
 295                         }
 296                         eval {
 297                                 $digest = digest($filename);
 298                         };
 299                         if ($@) {
 300                                 warn "Digest($filename)(#$candidate) failed: $@";
 301                                 $hash_errors++;
 302                         } else {
 303                                 $ok = 1;
 304                                 last hash_file;
 305                         }
 306                 }
 307                 if ($ok) {
 308                         print STDERR "\t\tDigest is $digest\n" if $debug;
 309
 310                         my $incumbent = $hash_to_inode{$digest};
 311                         if (defined($incumbent)) {
 312                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
 313
 314                                 my $finished = 0;
 315
 316 link_start:
 317
 318                                 until ($finished) {
 319                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
 320                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
 321                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
 322                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 323
 324 incumbent_file:
 325
 326                                         foreach my $incumbent_file (@incumbent_names) {
 327                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
 328                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
 329
 330                                                 if (!defined($incumbent_blocks)) {
 331                                                         warn "lstat: $incumbent_file: $!";
 332                                                         $surprises++;
 333                                                         next incumbent_file;
 334                                                 }
 335
 336                                                 if ($incumbent_ino != $incumbent) {
 337                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
 338                                                         $surprises++;
 339                                                         next incumbent_file;
 340                                                 }
 341
 342                                                 my $at_least_one_link_done = 0;
 343
 344 candidate_file:
 345
 346                                                 foreach my $candidate_file (@candidate_names) {
 347                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
 348                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
 349
 350                                                         if (!defined($candidate_blocks)) {
 351                                                                 warn "lstat: $candidate_file: $!";
 352                                                                 $surprises++;
 353                                                                 next candidate_file;
 354                                                         }
 355
 356                                                         if ($candidate_ino != $candidate) {
 357                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
 358                                                                 $surprises++;
 359                                                                 next candidate_file;
 360                                                         }
 361
 362                                                         if ($candidate_size != $incumbent_size) {
 363                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
 364                                                                 $surprises++;
 365                                                                 next candidate_file;
 366                                                         }
 367
 368                                                         my $identical;
 369
 370                                                         eval {
 371                                                                 if ($skip_compares) {
 372                                                                         print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
 373                                                                         $identical = 1;
 374                                                                 } else {
 375                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
 376                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
 377                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
 378                                                                         if (compare($incumbent_file, $candidate_file)) {
 379                                                                                 $compare_differences++;
 380                                                                                 $identical = 0;
 381                                                                                 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
 382                                                                                 # Some kind of I/O error is more likely, so this message cannot be turned off.
 383                                                                                 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
 384                                                                                 # so the warning in that case is quite silly.  Hmmm.
 385                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
 386                                                                         } else {
 387                                                                                 $identical = 1;
 388                                                                         }
 389                                                                         $compare_count++;
 390                                                                         $compare_bytes += $incumbent_size;
 391                                                                 }
 392                                                         };
 393                                                         if ($@) {
 394                                                                 warn $@;
 395                                                                 $compare_errors++;
 396                                                                 next candidate_file;
 397                                                         }
 398
 399                                                         if ($identical) {
 400                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
 401
 402                                                                 # We have to do this to break out of a possible infinite loop.
 403                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
 404                                                                 # such that A1 and B1 are in non-writable directories, we will loop
 405                                                                 # forever hardlinking A2 with A and B.
 406                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
 407
 408                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
 409                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
 410                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
 411                                                                 } else {
 412                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
 413                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
 414
 415                                                                         my $link_done = 0;
 416
 417                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
 418                                                                         if ($candidate_nlink > $incumbent_nlink) {
 419                                                                                 $from_file = $candidate_file;
 420                                                                                 $to_file = $incumbent_file;
 421                                                                                 $from_inode = $candidate;
 422                                                                                 $to_inode = $incumbent;
 423                                                                                 $from_nlink = $candidate_nlink;
 424                                                                                 $to_nlink = $incumbent_nlink;
 425                                                                         } else {
 426                                                                                 $to_file = $candidate_file;
 427                                                                                 $from_file = $incumbent_file;
 428                                                                                 $to_inode = $candidate;
 429                                                                                 $from_inode = $incumbent;
 430                                                                                 $to_nlink = $candidate_nlink;
 431                                                                                 $from_nlink = $incumbent_nlink;
 432                                                                         }
 433
 434                                                                         eval {
 435                                                                                 link_files($from_file, $to_file);
 436                                                                                 $link_done = 1;
 437                                                                         };
 438
 439                                                                         if ($@) {
 440                                                                                 warn $@;
 441                                                                                 $link_errors++;
 442
 443                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
 444                                                                                 $link_retries++;
 445
 446                                                                                 eval {
 447                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
 448                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
 449                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
 450                                                                                         link_files($from_file, $to_file);
 451                                                                                         $link_done = 1;
 452                                                                                 };
 453
 454                                                                                 if ($@) {
 455                                                                                         warn $@;
 456                                                                                         $link_errors++;
 457                                                                                 }
 458                                                                         }
 459
 460                                                                         # Note since the files are presumably identical, they both have the same size.
 461                                                                         # My random number generator chooses the incumbent's size.
 462
 463                                                                         if ($link_done) {
 464                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
 465                                                                                 $inode_to_file_name{$from_inode}->{$to_file} = undef;
 466                                                                                 $hash_to_inode{$digest} = $from_inode;
 467
 468                                                                                 $hard_links++;
 469                                                                                 if ($to_nlink == 1) {
 470                                                                                         $recovered_files++;
 471                                                                                         $recovered_bytes += $incumbent_size;
 472                                                                                 }
 473
 474                                                                                 # FIXME:  Now we're really confused for some reason.
 475                                                                                 # Start over to rebuild state.
 476                                                                                 next link_start;
 477                                                                         } else {
 478                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
 479
 480                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
 481                                                                                 # tried all possible ways to hardlink the file out of existence first;
 482                                                                                 # however, that is complex and only benefits a silly statistic.
 483                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
 484                                                                                         $lost_files++;
 485                                                                                         $lost_bytes += $incumbent_size;
 486                                                                                 }
 487                                                                         }
 488                                                                 }
 489                                                         }
 490                                                 }
 491                                         }
 492                                         $finished = 1;
 493                                 }
 494                         } else {
 495                                 print STDERR "\t\tNew hash entered\n" if $debug;
 496                                 $hash_to_inode{$digest} = $candidate;
 497                         }
 498                 } else {
 499                         warn "No digests found for inode $candidate\n";
 500                         delete $inode_to_file_name{$candidate};
 501                 }
 502         }
 503
 504 end_merge:
 505
 506         print STDERR "Merge done.\n" if $debug;
 507         undef %inode_to_file_name;
 508 }
 509
 510 while (<FIND>) {
 511         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
 512         die "read error: $!\nLast input line was '$_'" unless defined($name);
 513
 514         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
 515
 516         unless (! (-l $name) && (-f _)) {
 517                 warn "Bogon file " . tick_quote($name);
 518                 $input_bogons++;
 519                 next;
 520         }
 521
 522         $input_links++;
 523         merge_files if $weak_key ne $current_key;
 524         $current_key = $weak_key;
 525
 526         $inode_to_file_name{$inode}->{$name} = undef;
 527
 528         print STDERR "$name\n" if $verbose;
 529 }
 530
 531 merge_files;
 532
 533 my $stats_blob = <<STATS;
 534 compare_bytes           $compare_bytes
 535 compare_count           $compare_count
 536 compare_differences     $compare_differences
 537 compare_errors          $compare_errors
 538 hard_links              $hard_links
 539 hash_bytes              $hash_bytes
 540 hash_errors             $hash_errors
 541 hash_files              $hash_files
 542 input_bogons            $input_bogons
 543 input_files             $input_files
 544 input_links             $input_links
 545 link_errors             $link_errors
 546 link_retries            $link_retries
 547 lost_bytes              $lost_bytes
 548 lost_files              $lost_files
 549 merges_attempted        $merges_attempted
 550 recovered_bytes         $recovered_bytes
 551 recovered_files         $recovered_files
 552 surprises               $surprises
 553 trivially_unique        $trivially_unique
 554 STATS
 555
 556 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
 557
 558 print STDERR $stats_blob;
 559
 560 exit(0);