git.hungrycats.org Git - dupemerge/blob - faster-dupemerge

   1 #!/usr/bin/perl -w
   2 # $Id$
   3
   4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
   5 # Use, modification, and distribution permitted
   6 # under the terms of the GNU GPL.
   7
   8 use strict;
   9 use Fcntl qw(:DEFAULT :flock);
  10 use File::Compare;
  11 use File::Temp;
  12
  13 my $input_links = 0;
  14 my $input_files = 0;
  15 my $input_bogons = 0;
  16 my $hash_bytes = 0;
  17 my $hash_files = 0;
  18 my $hash_errors = 0;
  19 my $compare_bytes = 0;
  20 my $compare_count = 0;
  21 my $compare_errors = 0;
  22 my $compare_differences = 0;
  23 my $trivially_unique = 0;
  24 my $merges_attempted = 0;
  25 my $hard_links = 0;
  26 my $link_errors = 0;
  27 my $link_retries = 0;
  28 my $recovered_bytes = 0;
  29 my $recovered_files = 0;
  30 my $lost_files = 0;
  31 my $lost_bytes = 0;
  32 my $surprises = 0;
  33
  34 eval '
  35         use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
  36 ';
  37
  38 if ($@) {
  39         warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
  40
  41         eval <<'DIGEST';
  42                 sub really_digest {
  43                         my ($filename) = (@_);
  44                         my $fv = open(MD5SUM, "-|");
  45                         die "fork: $!" unless defined($fv);
  46                         if ($fv) {
  47                                 my ($sum_line) = <MD5SUM>;
  48                                 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
  49                                 die "hash error:  got EOF instead of md5sum output" unless defined($sum_line);
  50                                 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
  51                                 die "hash error:  got \Q$sum_line\E instead of md5sum output" unless defined($sum);
  52                                 return $sum;
  53                         } else {
  54                                 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
  55                                 exec('md5sum');
  56                                 # Perl guarantees it will die here
  57                         }
  58                 }
  59 DIGEST
  60 } else {
  61         eval <<'DIGEST';
  62                 sub really_digest {
  63                         my ($filename) = (@_);
  64                         die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
  65                         my $ctx = Digest::SHA1->new;
  66                         sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
  67                         binmode(FILE);          # FIXME:  Necessary?  Probably harmless...
  68                         $ctx->addfile(\*FILE);
  69                         close(FILE) or die "close: $filename: $!";
  70                         return $ctx->b64digest;
  71                 }
  72 DIGEST
  73 }
  74
  75 my $collapse_access = 0;
  76 my $collapse_timestamp = 0;
  77 my $collapse_zero = 0;
  78 my $skip_compares = 0;
  79 my $skip_hashes = 0;
  80 my $verbose = 0;
  81 my $debug = 0;
  82 my @extra_find_opts = ();
  83 my @extra_sort_opts = ();
  84 my $lock_file;
  85 my $lock_rm = 0;
  86 my $lock_obtained = 0;
  87
  88 sub digest {
  89         my ($filename) = (@_);
  90         if ($skip_hashes) {
  91                 return "SKIPPING HASHES";
  92         } else {
  93                 &really_digest($filename);
  94                 $hash_bytes += -s $filename;
  95                 $hash_files++;
  96         }
  97 }
  98
  99 my @directories;
 100
 101 sub usage {
 102         my $name = shift(@_);
 103         die <<USAGE;
 104 Usage: $name [--opts] directory [directory...]
 105 Finds duplicate files in the given directories, and replaces all identical
 106 copies of a file with hard-links to a single file.
 107
 108 Several options modify the definition of a "duplicate".  By default, files
 109 which have differences in owner uid or gid, permission (mode), or
 110 modification time (mtime) are considered different, so that hardlinking
 111 files does not also change their attributes.  Additionally, all files of
 112 zero size are ignored for performance reasons (there tend to be many
 113 of them, and they tend not to release any space when replaced with
 114 hard links).
 115
 116         --access        uid, gid, and mode may be different for identical
 117                         files
 118
 119         --debug         show all steps in duplication discovery process
 120                         (implies --verbose)
 121
 122         --find          pass next options (up to --) to find command
 123
 124         --lock FILE     exit immediately (status 10) if unable to obtain a
 125                         flock(LOCK_EX|LOCK_NB) on FILE
 126
 127         --lock-rm       remove lock file at exit
 128
 129         --sort          pass next options (up to --) to sort command
 130
 131         --timestamps    mtime may be different for identical files
 132
 133         --skip-compare  skip byte-by-byte file comparisons
 134
 135         --skip-hash     skip calculation of hash function on files
 136
 137         --trust         old name for --skip-compare
 138                         (trust the hash function)
 139
 140         --verbose       report files as they are considered
 141
 142         --zeros         hard-link zero-length files too
 143 USAGE
 144 }
 145
 146 while ($#ARGV >= 0) {
 147         my $arg = shift(@ARGV);
 148         if ($arg eq '--access') {
 149                 $collapse_access = 1;
 150         } elsif ($arg eq '--timestamps') {
 151                 $collapse_timestamp = 1;
 152         } elsif ($arg eq '--zeros') {
 153                 $collapse_zero = 1;
 154         } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
 155                 $skip_compares = 1;
 156         } elsif ($arg eq '--skip-hash') {
 157                 $skip_hashes = 1;
 158         } elsif ($arg eq '--verbose') {
 159                 $verbose = 1;
 160         } elsif ($arg eq '--lock-rm') {
 161                 $lock_rm = 1;
 162         } elsif ($arg eq '--lock') {
 163                 $lock_file = shift(@ARGV);
 164                 unless (defined($lock_file)) {
 165                         usage($0);
 166                         exit(1);
 167                 }
 168         } elsif ($arg eq '--debug') {
 169                 $debug = $verbose = 1;
 170         } elsif ($arg eq '--find') {
 171                 while ($#ARGV >= 0) {
 172                         my $extra_arg = shift(@ARGV);
 173                         last if $extra_arg eq '--';
 174                         push(@extra_find_opts, $extra_arg);
 175                 }
 176         } elsif ($arg eq '--sort') {
 177                 while ($#ARGV >= 0) {
 178                         my $extra_arg = shift(@ARGV);
 179                         last if $extra_arg eq '--';
 180                         push(@extra_sort_opts, $extra_arg);
 181                 }
 182         } elsif ($arg =~ /^-/o) {
 183                 usage($0);
 184                 exit(1);
 185         } else {
 186                 push(@directories, $arg);
 187         }
 188 }
 189
 190 if ($skip_hashes && $skip_compares) {
 191         die "Cannot skip both hashes and compares.\n";
 192 }
 193
 194 if (defined($lock_file)) {
 195         sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
 196         flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
 197         print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
 198         $lock_obtained = 1;
 199 }
 200
 201 END {
 202         if ($lock_obtained) {
 203                 print STDERR "Removing '$lock_file'.\n" if $verbose;
 204                 unlink($lock_file) or warn "unlink: $lock_file: $!";
 205         }
 206 }
 207
 208 sub tick_quote {
 209         my ($text) = (@_);
 210         $text =~ s/'/'\''/go;
 211         return "'$text'";
 212 }
 213
 214 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
 215 my $printf_string = '%s ' .
 216         ($collapse_access    ? '0 0 0 ' : '%U %G %m ') .
 217         ($collapse_timestamp ? '0 '     : '%T@ ') .
 218         '%i %p\0';
 219
 220 push(@find_command, '!', '-empty') unless $collapse_zero;
 221 push(@find_command, '-printf', $printf_string);
 222
 223 my @sort_command = ('sort', '-znr', @extra_sort_opts);
 224 my @quoted_sort_command = @sort_command;
 225 grep(tick_quote($_), @quoted_sort_command);
 226 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
 227
 228 my @quoted_find_command = @find_command;
 229 grep(tick_quote($_), @quoted_find_command);
 230 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
 231 print STDERR "find command:  $quoted_find_command | $quoted_sort_command\n" if $verbose;
 232
 233 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
 234 $/ = "\0";
 235
 236 # Input is sorted so that all weak keys are contiguous.
 237 # When the key changes, we have to process all files we previously know about.
 238 my $current_key = -1;
 239
 240 # $inode_to_file_name{$inode} = [@file_names]
 241 my %inode_to_file_name = ();
 242
 243 # Link files
 244 sub link_files {
 245         my ($from, $to) = (@_);
 246         my $inode_dir = $to;
 247         my $inode_base = $to;
 248         $inode_dir =~ s:[^/]*$::o;
 249         $inode_base =~ s:^.*/::os;
 250         my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
 251         my $quoted_from = tick_quote($from);
 252         my $quoted_to = tick_quote($to);
 253         print STDERR "ln -f $quoted_from $quoted_to\n";
 254         print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
 255         link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
 256         print STDERR "\trename: $tmp_to -> $to\n" if $debug;
 257         unless (rename($tmp_to, $to)) {
 258                 my $saved_bang = $!;
 259                 unlink($tmp_to) or warn "unlink: $tmp_to: $!";  # Try, possibly in vain, to clean up
 260                 die "rename: $tmp_to -> $from: $saved_bang";
 261         }
 262 }
 263
 264 # Process all known files so far.
 265 sub merge_files {
 266         $merges_attempted++;
 267
 268         my %hash_to_inode;
 269         # Used to stop link retry loops (there is a goto in here!  Actually two...)
 270         my %stop_loop;
 271
 272         my @candidate_list = keys(%inode_to_file_name);
 273         $input_files += @candidate_list;
 274         if (@candidate_list < 2) {
 275                 print STDERR "Merging...only one candidate to merge..." if $debug;
 276                 $trivially_unique++;
 277                 goto end_merge;
 278         }
 279
 280         print STDERR "Merging...\n" if $debug;
 281         foreach my $candidate (@candidate_list) {
 282                 print STDERR "\tDigesting candidate $candidate\n" if $debug;
 283                 my $ok = 0;
 284                 my $digest;
 285
 286 hash_file:
 287
 288                 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
 289                         print STDERR "\t\tDigesting file $filename\n" if $debug;
 290                         if ((-l $filename) || ! -f _) {
 291                                 warn "Bogon file " . tick_quote($filename);
 292                                 $surprises++;
 293                                 next;
 294                         }
 295                         eval {
 296                                 $digest = digest($filename);
 297                         };
 298                         if ($@) {
 299                                 warn "Digest($filename)(#$candidate) failed: $@";
 300                                 $hash_errors++;
 301                         } else {
 302                                 $ok = 1;
 303                                 last hash_file;
 304                         }
 305                 }
 306                 if ($ok) {
 307                         print STDERR "\t\tDigest is $digest\n" if $debug;
 308
 309                         my $incumbent = $hash_to_inode{$digest};
 310                         if (defined($incumbent)) {
 311                                 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
 312
 313                                 my $finished = 0;
 314
 315 link_start:
 316
 317                                 until ($finished) {
 318                                         my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
 319                                         my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
 320                                         print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
 321                                         print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 322
 323 incumbent_file:
 324
 325                                         foreach my $incumbent_file (@incumbent_names) {
 326                                                 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
 327                                                 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
 328
 329                                                 if (!defined($incumbent_blocks)) {
 330                                                         warn "lstat: $incumbent_file: $!";
 331                                                         $surprises++;
 332                                                         next incumbent_file;
 333                                                 }
 334
 335                                                 if ($incumbent_ino != $incumbent) {
 336                                                         warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
 337                                                         $surprises++;
 338                                                         next incumbent_file;
 339                                                 }
 340
 341                                                 my $at_least_one_link_done = 0;
 342
 343 candidate_file:
 344
 345                                                 foreach my $candidate_file (@candidate_names) {
 346                                                         my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
 347                                                         print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
 348
 349                                                         if (!defined($candidate_blocks)) {
 350                                                                 warn "lstat: $candidate_file: $!";
 351                                                                 $surprises++;
 352                                                                 next candidate_file;
 353                                                         }
 354
 355                                                         if ($candidate_ino != $candidate) {
 356                                                                 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
 357                                                                 $surprises++;
 358                                                                 next candidate_file;
 359                                                         }
 360
 361                                                         if ($candidate_size != $incumbent_size) {
 362                                                                 warn "$candidate_file, $incumbent_file: file sizes are different";
 363                                                                 $surprises++;
 364                                                                 next candidate_file;
 365                                                         }
 366
 367                                                         my $identical;
 368
 369                                                         eval {
 370                                                                 if ($skip_compares) {
 371                                                                         print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
 372                                                                         $identical = 1;
 373                                                                 } else {
 374                                                                         my $quoted_incumbent_file = tick_quote($incumbent_file);
 375                                                                         my $quoted_candidate_file = tick_quote($candidate_file);
 376                                                                         print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
 377                                                                         if (compare($incumbent_file, $candidate_file)) {
 378                                                                                 $compare_differences++;
 379                                                                                 $identical = 0;
 380                                                                                 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
 381                                                                                 # Some kind of I/O error is more likely, so this message cannot be turned off.
 382                                                                                 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
 383                                                                                 # so the warning in that case is quite silly.  Hmmm.
 384                                                                                 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
 385                                                                         } else {
 386                                                                                 $identical = 1;
 387                                                                         }
 388                                                                         $compare_count++;
 389                                                                         $compare_bytes += $incumbent_size;
 390                                                                 }
 391                                                         };
 392                                                         if ($@) {
 393                                                                 warn $@;
 394                                                                 $compare_errors++;
 395                                                                 next candidate_file;
 396                                                         }
 397
 398                                                         if ($identical) {
 399                                                                 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
 400
 401                                                                 # We have to do this to break out of a possible infinite loop.
 402                                                                 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
 403                                                                 # such that A1 and B1 are in non-writable directories, we will loop
 404                                                                 # forever hardlinking A2 with A and B.
 405                                                                 # To break the loop, we never attempt to hardlink any files X and Y twice.
 406
 407                                                                 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
 408                                                                     defined($stop_loop{$candidate_file}->{$incumbent_file})) {
 409                                                                         print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
 410                                                                 } else {
 411                                                                         $stop_loop{$incumbent_file}->{$candidate_file} = 1;
 412                                                                         $stop_loop{$candidate_file}->{$incumbent_file} = 1;
 413
 414                                                                         my $link_done = 0;
 415
 416                                                                         my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
 417                                                                         if ($candidate_nlink > $incumbent_nlink) {
 418                                                                                 $from_file = $candidate_file;
 419                                                                                 $to_file = $incumbent_file;
 420                                                                                 $from_inode = $candidate;
 421                                                                                 $to_inode = $incumbent;
 422                                                                                 $from_nlink = $candidate_nlink;
 423                                                                                 $to_nlink = $incumbent_nlink;
 424                                                                         } else {
 425                                                                                 $to_file = $candidate_file;
 426                                                                                 $from_file = $incumbent_file;
 427                                                                                 $to_inode = $candidate;
 428                                                                                 $from_inode = $incumbent;
 429                                                                                 $to_nlink = $candidate_nlink;
 430                                                                                 $from_nlink = $incumbent_nlink;
 431                                                                         }
 432
 433                                                                         eval {
 434                                                                                 link_files($from_file, $to_file);
 435                                                                                 $link_done = 1;
 436                                                                         };
 437
 438                                                                         if ($@) {
 439                                                                                 warn $@;
 440                                                                                 $link_errors++;
 441
 442                                                                                 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
 443                                                                                 $link_retries++;
 444
 445                                                                                 eval {
 446                                                                                         ($from_file, $to_file) = ($to_file, $from_file);
 447                                                                                         ($from_inode, $to_inode) = ($to_inode, $from_inode);
 448                                                                                         ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
 449                                                                                         link_files($from_file, $to_file);
 450                                                                                         $link_done = 1;
 451                                                                                 };
 452
 453                                                                                 if ($@) {
 454                                                                                         warn $@;
 455                                                                                         $link_errors++;
 456                                                                                 }
 457                                                                         }
 458
 459                                                                         # Note since the files are presumably identical, they both have the same size.
 460                                                                         # My random number generator chooses the incumbent's size.
 461
 462                                                                         if ($link_done) {
 463                                                                                 delete $inode_to_file_name{$to_inode}->{$to_file};
 464                                                                                 $inode_to_file_name{$from_inode}->{$to_file} = undef;
 465                                                                                 $hash_to_inode{$digest} = $from_inode;
 466
 467                                                                                 $hard_links++;
 468                                                                                 if ($to_nlink == 1) {
 469                                                                                         $recovered_files++;
 470                                                                                         $recovered_bytes += $incumbent_size;
 471                                                                                 }
 472
 473                                                                                 # FIXME:  Now we're really confused for some reason.
 474                                                                                 # Start over to rebuild state.
 475                                                                                 next link_start;
 476                                                                         } else {
 477                                                                                 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
 478
 479                                                                                 # FIXME:  This is a lame heuristic.  We really need to know if we've
 480                                                                                 # tried all possible ways to hardlink the file out of existence first;
 481                                                                                 # however, that is complex and only benefits a silly statistic.
 482                                                                                 if ($to_nlink == 1 || $from_nlink == 1) {
 483                                                                                         $lost_files++;
 484                                                                                         $lost_bytes += $incumbent_size;
 485                                                                                 }
 486                                                                         }
 487                                                                 }
 488                                                         }
 489                                                 }
 490                                         }
 491                                         $finished = 1;
 492                                 }
 493                         } else {
 494                                 print STDERR "\t\tNew hash entered\n" if $debug;
 495                                 $hash_to_inode{$digest} = $candidate;
 496                         }
 497                 } else {
 498                         warn "No digests found for inode $candidate\n";
 499                         delete $inode_to_file_name{$candidate};
 500                 }
 501         }
 502
 503 end_merge:
 504
 505         print STDERR "Merge done.\n" if $debug;
 506         undef %inode_to_file_name;
 507 }
 508
 509 while (<FIND>) {
 510         my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
 511         die "read error: $!\nLast input line was '$_'" unless defined($name);
 512
 513         print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
 514
 515         unless (! (-l $name) && (-f _)) {
 516                 warn "Bogon file " . tick_quote($name);
 517                 $input_bogons++;
 518                 next;
 519         }
 520
 521         $input_links++;
 522         merge_files if $weak_key ne $current_key;
 523         $current_key = $weak_key;
 524
 525         $inode_to_file_name{$inode}->{$name} = undef;
 526
 527         print STDERR "$name\n" if $verbose;
 528 }
 529
 530 merge_files;
 531
 532 my $stats_blob = <<STATS;
 533 compare_bytes           $compare_bytes
 534 compare_count           $compare_count
 535 compare_differences     $compare_differences
 536 compare_errors          $compare_errors
 537 hard_links              $hard_links
 538 hash_bytes              $hash_bytes
 539 hash_errors             $hash_errors
 540 hash_files              $hash_files
 541 input_bogons            $input_bogons
 542 input_files             $input_files
 543 input_links             $input_links
 544 link_errors             $link_errors
 545 link_retries            $link_retries
 546 lost_bytes              $lost_bytes
 547 lost_files              $lost_files
 548 merges_attempted        $merges_attempted
 549 recovered_bytes         $recovered_bytes
 550 recovered_files         $recovered_files
 551 surprises               $surprises
 552 trivially_unique        $trivially_unique
 553 STATS
 554
 555 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
 556
 557 print STDERR $stats_blob;
 558
 559 exit(0);