4 # Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
5 # Use, modification, and distribution permitted
6 # under the terms of the GNU GPL.
9 use Fcntl qw(:DEFAULT :flock);
# Global statistics counters, reported in the summary blob at exit.
19 my $compare_bytes = 0;        # bytes accounted to byte-by-byte compares
20 my $compare_count = 0;        # number of compares performed -- incremented on elided lines
21 my $compare_errors = 0;       # compares that failed with an error -- incremented on elided lines
22 my $compare_differences = 0;  # compares that found the files actually differ
23 my $trivially_unique = 0;     # presumably keys with only one candidate inode -- TODO confirm (lines elided)
24 my $merges_attempted = 0;     # presumably merge passes started -- TODO confirm (lines elided)
28 my $recovered_bytes = 0;      # bytes freed when an inode's last link was replaced
29 my $recovered_files = 0;      # inodes freed by hardlinking duplicates
# Prefer Digest::SHA1 for hashing; if loading it failed ($@ set by an elided
# eval/require), warn and fall back to the external md5sum program.
35 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
39 warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
# Fallback hasher: fork an external md5sum process (open '-|') and parse the
# 32-hex-digit sum from its output.  Sub header/footer and the child's exec
# are elided in this excerpt.
43 my ($filename) = (@_);
44 my $fv = open(MD5SUM, "-|");  # fork: parent reads child's stdout via MD5SUM
45 die "fork: $!" unless defined($fv);
# Parent branch: read and validate one line of md5sum output.
47 my ($sum_line) = <MD5SUM>;
48 close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
49 die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
50 my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
# NOTE(review): \Q...\E inside a plain double-quoted die string quotemeta's the
# interpolated text, so the message will contain backslashes -- verify intended.
51 die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
# Child branch: reopen STDIN onto the target file so md5sum reads it.
54 sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
56 # Perl guarantees it will die here
56 # Perl guarantees it will die here
# Digest::SHA1-based hasher: return the base64 SHA-1 digest of $filename.
# Refuses symlinks and non-plain files; "-f _" reuses the stat from "-l".
63 my ($filename) = (@_);
64 die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
65 my $ctx = Digest::SHA1->new;
# O_NONBLOCK keeps the open from hanging on a FIFO/device that races past
# the plain-file check above.
66 sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
67 binmode(FILE); # FIXME: Necessary? Probably harmless...
68 $ctx->addfile(\*FILE);
69 close(FILE) or die "close: $filename: $!";
70 return $ctx->b64digest;
# Command-line option flags, set by the argument loop below.
75 my $collapse_access = 0;     # --access: ignore uid/gid/mode differences
76 my $collapse_timestamp = 0;  # --timestamps: ignore mtime differences
77 my $collapse_zero = 0;       # --zeros: also consider zero-length files
78 my $skip_compares = 0;       # --skip-compare/--trust: trust the hash, skip cmp
84 my @extra_find_opts = ();    # pass-through args for find(1)  (--find ... --)
85 my @extra_sort_opts = ();    # pass-through args for sort(1)  (--sort ... --)
88 my $lock_obtained = 0;       # set once --lock FILE is flock'd (set on elided lines)
# digest(): front end to really_digest(); honors --skip-hash and accounts
# hashed bytes.  Sub header/footer and the guard condition are elided.
91 my ($filename) = (@_);
# Under --skip-hash every file gets this same constant "digest", so the
# byte-by-byte compare stage does all the discrimination work.
93 return "SKIPPING HASHES";
95 my $digest = &really_digest($filename);
96 $hash_bytes += -s $filename;
105 my $name = shift(@_);
107 Usage: $name [--opts] directory [directory...]
108 Finds duplicate files in the given directories, and replaces all identical
109 copies of a file with hard-links to a single file.
111 Several options modify the definition of a "duplicate". By default, files
112 which have differences in owner uid or gid, permission (mode), or
113 modification time (mtime) are considered different, so that hardlinking
114 files does not also change their attributes. Additionally, all files of
115 zero size are ignored for performance reasons (there tend to be many
116 of them, and they tend not to release any space when replaced with
119 --access uid, gid, and mode may be different for identical
122 --debug show all steps in duplication discovery process
125 --dry-run do not lock files or make changes to filesystem
127 --find pass next options (up to --) to find command
129 --humane human-readable statistics (e.g. 1 048 576)
131 --lock FILE exit immediately (status 10) if unable to obtain a
132 flock(LOCK_EX|LOCK_NB) on FILE
134 --lock-rm remove lock file at exit
136 --sort pass next options (up to --) to sort command
138 --timestamps mtime may be different for identical files
140 --skip-compare skip byte-by-byte file comparisons
142 --skip-hash skip calculation of hash function on files
144 --trust old name for --skip-compare
145 (trust the hash function)
147 --verbose report files as they are considered
149 --zeros hard-link zero-length files too
# Parse command-line arguments.  Several branch bodies (and the unknown-option
# error path) are elided in this excerpt; anything not starting with '-' is
# collected as a directory to scan.
153 while ($#ARGV >= 0) {
154 my $arg = shift(@ARGV);
155 if ($arg eq '--access') {
156 $collapse_access = 1;
157 } elsif ($arg eq '--timestamps') {
158 $collapse_timestamp = 1;
159 } elsif ($arg eq '--zeros') {
161 } elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
163 } elsif ($arg eq '--skip-hash') {
165 } elsif ($arg eq '--verbose') {
167 } elsif ($arg eq '--lock-rm') {
169 } elsif ($arg eq '--lock') {
170 $lock_file = shift(@ARGV);
171 unless (defined($lock_file)) {
175 } elsif ($arg eq '--debug') {
176 $debug = $verbose = 1;       # --debug implies --verbose
177 } elsif ($arg eq '--dry-run') {
179 } elsif ($arg eq '--humane') {
181 } elsif ($arg eq '--find') {
# Collect everything up to a literal '--' as extra find(1) options.
182 while ($#ARGV >= 0) {
183 my $extra_arg = shift(@ARGV);
184 last if $extra_arg eq '--';
185 push(@extra_find_opts, $extra_arg);
187 } elsif ($arg eq '--sort') {
# Likewise for sort(1).
188 while ($#ARGV >= 0) {
189 my $extra_arg = shift(@ARGV);
190 last if $extra_arg eq '--';
191 push(@extra_sort_opts, $extra_arg);
193 } elsif ($arg =~ /^-/o) {
197 push(@directories, $arg);
# Sanity check: with both hashes and compares skipped there is no way left
# to tell files apart.
201 if ($skip_hashes && $skip_compares) {
202 die "Cannot skip both hashes and compares.\n";
205 @directories or usage;
# --lock: take an exclusive, non-blocking flock on the lock file so only one
# instance runs at a time.  Skipped entirely during --dry-run.
207 if (defined($lock_file) && !$dry_run) {
208 sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
209 flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
210 print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
# Cleanup path (presumably run at exit when --lock-rm was given -- surrounding
# lines elided): remove the lock file only if we are the one who locked it.
215 if ($lock_obtained && !$dry_run) {
216 print STDERR "Removing '$lock_file'.\n" if $verbose;
217 unlink($lock_file) or warn "unlink: $lock_file: $!";
# Escape $text for interpolation between single quotes in a shell command:
# each embedded ' becomes '\'' (close quote, escaped quote, reopen quote).
# Bug fix: the replacement needs a doubled backslash so Perl emits a literal
# backslash; with a single one, "\'" interpolates to just "'" and the
# resulting shell quoting silently drops the quote character.
223 $text =~ s/'/'\\''/go;
# Build the find(1) command: list plain files under the given directories,
# printing a sortable "weak key" (size, then uid/gid/mode and mtime -- each
# collapsed to 0 when --access/--timestamps says to ignore it), followed by
# inode and name (trailing printf fields elided).
227 my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
228 my $printf_string = '%s ' .
229 ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
230 ($collapse_timestamp ? '0 ' : '%T@ ') .
233 push(@find_command, '!', '-empty') unless $collapse_zero;
234 push(@find_command, '-printf', $printf_string);
# Sort NUL-terminated records numerically, descending, so records with equal
# weak keys end up adjacent (largest files first).
236 my @sort_command = ('sort', '-znr', @extra_sort_opts);
237 my @quoted_sort_command = @sort_command;
# NOTE(review): grep() in void context for its side effect -- this quotes the
# array elements only if tick_quote() modifies its argument through @_
# aliasing; verify against tick_quote's definition (header elided here).
238 grep(tick_quote($_), @quoted_sort_command);
239 my $quoted_sort_command = "'" . join("' '", @quoted_sort_command) . "'";
241 my @quoted_find_command = @find_command;
242 grep(tick_quote($_), @quoted_find_command);
243 my $quoted_find_command = "'" . join("' '", @quoted_find_command) . "'";
244 print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
# The find|sort pipeline feeds the main input loop below.
246 open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
249 # Input is sorted so that all weak keys are contiguous.
250 # When the key changes, we have to process all files we previously know about.
251 my $current_key = -1;
253 # $inode_to_file_name{$inode} = [@file_names]
# NOTE(review): despite the comment above, names are stored as hash keys
# ($inode_to_file_name{$inode}{$name} = undef), i.e. a set, not an arrayref.
254 my %inode_to_file_name = ();
# link_files($from, $to): atomically replace $to with a hard link to $from.
# The link is created first under a temporary dot-name in $to's directory,
# then rename()d over $to, so $to never ceases to exist.
258 my ($from, $to) = (@_);
260 my $quoted_from = tick_quote($from);
261 my $quoted_to = tick_quote($to);
262 print STDERR "ln -f $quoted_from $quoted_to\n";
# Split $to into directory and basename to build the temp-link name
# ($inode_dir is initialized from $to on an elided line).
267 my $inode_base = $to;
268 $inode_dir =~ s:[^/]*$::o;
269 $inode_base =~ s:^.*/::os;
270 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
271 print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
272 link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
273 print STDERR "\trename: $tmp_to -> $to\n" if $debug;
# If the rename fails, try to remove the temporary link, then die with the
# saved errno ($saved_bang captures $! from rename on an elided line, so the
# unlink cannot clobber it).
274 unless (rename($tmp_to, $to)) {
276 unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
# Bug fix: the failed rename's target is $to, not $from -- report the
# correct path in the error message.
277 die "rename: $tmp_to -> $to: $saved_bang";
281 # Process all known files so far.
286 # Used to stop link retry loops (there is a goto in here! Actually two...)
# One merge pass: digest every file of every candidate inode collected under
# the current weak key, then link inodes whose digests (and contents) match.
289 my @candidate_list = keys(%inode_to_file_name);
# NOTE(review): this adds the number of candidate *inodes*, not file names,
# to $input_files -- confirm that is the intended meaning of the statistic.
290 $input_files += @candidate_list;
# A key with fewer than two inodes cannot contain duplicates.
291 if (@candidate_list < 2) {
292 print STDERR "Merging...only one candidate to merge..." if $debug;
297 print STDERR "Merging...\n" if $debug;
298 foreach my $candidate (@candidate_list) {
299 print STDERR "\tDigesting candidate $candidate\n" if $debug;
305 foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
306 print STDERR "\t\tDigesting file $filename\n" if $debug;
# Skip anything that is no longer a plain file (may have changed since scan).
307 if ((-l $filename) || ! -f _) {
308 warn "Bogon file " . tick_quote($filename);
# Digest failures are non-fatal: warn and move on (guard lines elided).
313 $digest = digest($filename);
316 warn "Digest($filename)(#$candidate) failed: $@";
324 print STDERR "\t\tDigest is $digest\n" if $debug;
# If another inode (the "incumbent") already produced this digest, the two
# are potential duplicates: their file names are validated and linked below.
326 my $incumbent = $hash_to_inode{$digest};
327 if (defined($incumbent)) {
328 print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
335 my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
336 my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
337 print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
338 print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
# For each name believed to link to the incumbent inode, lstat it and verify
# it still exists and still refers to that inode (skip handling elided).
342 foreach my $incumbent_file (@incumbent_names) {
343 my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
344 print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
# $incumbent_blocks undefined means the lstat itself failed.
346 if (!defined($incumbent_blocks)) {
347 warn "lstat: $incumbent_file: $!";
# The name may have been replaced since we scanned it.
352 if ($incumbent_ino != $incumbent) {
353 warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
358 my $at_least_one_link_done = 0;
# Same validation for every name believed to link to the candidate inode.
362 foreach my $candidate_file (@candidate_names) {
363 my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
364 print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
366 if (!defined($candidate_blocks)) {
367 warn "lstat: $candidate_file: $!";
372 if ($candidate_ino != $candidate) {
373 warn "$candidate_file: expected inode $candidate, found $candidate_ino";
# Equal digests but unequal sizes is a contradiction worth reporting.
378 if ($candidate_size != $incumbent_size) {
379 warn "$candidate_file, $incumbent_file: file sizes are different";
# --skip-compare/--trust: accept the hash match without reading either file.
387 if ($skip_compares) {
388 print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
391 my $quoted_incumbent_file = tick_quote($incumbent_file);
392 my $quoted_candidate_file = tick_quote($candidate_file);
393 print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
# compare() (presumably File::Compare -- import elided) is nonzero when the
# files differ or an error occurred; a true hash collision is reported loudly.
394 if (compare($incumbent_file, $candidate_file)) {
395 $compare_differences++;
397 # It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
398 # Some kind of I/O error is more likely, so this message cannot be turned off.
399 # On the other hand, if we're skipping hashes, _all_ files will have the same hash,
400 # so the warning in that case is quite silly. Hmmm.
401 print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
# Account the bytes read by a compare that ran to completion.
406 $compare_bytes += $incumbent_size;
416 print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
418 # We have to do this to break out of a possible infinite loop.
419 # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
420 # such that A1 and B1 are in non-writable directories, we will loop
421 # forever hardlinking A2 with A and B.
422 # To break the loop, we never attempt to hardlink any files X and Y twice.
# %stop_loop records both orderings of the (X, Y) pair.
424 if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
425 defined($stop_loop{$candidate_file}->{$incumbent_file})) {
426 print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
428 $stop_loop{$incumbent_file}->{$candidate_file} = 1;
429 $stop_loop{$candidate_file}->{$incumbent_file} = 1;
# Choose the link direction: the inode with more existing links becomes the
# source ($from), so the other inode loses a link and can eventually be
# freed.  (The "} else {" line of this branch pair is elided.)
433 my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
434 if ($candidate_nlink > $incumbent_nlink) {
435 $from_file = $candidate_file;
436 $to_file = $incumbent_file;
437 $from_inode = $candidate;
438 $to_inode = $incumbent;
439 $from_nlink = $candidate_nlink;
440 $to_nlink = $incumbent_nlink;
442 $to_file = $candidate_file;
443 $from_file = $incumbent_file;
444 $to_inode = $candidate;
445 $from_inode = $incumbent;
446 $to_nlink = $candidate_nlink;
447 $from_nlink = $incumbent_nlink;
451 link_files($from_file, $to_file);
# On failure (failure detection elided), swap roles and try linking in the
# opposite direction before giving up.
459 print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
463 ($from_file, $to_file) = ($to_file, $from_file);
464 ($from_inode, $to_inode) = ($to_inode, $from_inode);
465 ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
466 link_files($from_file, $to_file);
476 # Note since the files are presumably identical, they both have the same size.
477 # My random number generator chooses the incumbent's size.
480 # Since we're in a dry run, the filesystem doesn't change.
481 # Our notion of what the filesystem should look like should not change either.
# Keep the in-memory map consistent with the link just made (or, in a dry
# run, the link that would have been made): $to_file now names $from_inode.
482 delete $inode_to_file_name{$to_inode}->{$to_file};
484 $inode_to_file_name{$from_inode}->{$to_file} = undef;
485 $hash_to_inode{$digest} = $from_inode;
# If the replaced name was the last link to its inode, that inode's storage
# has been freed.
489 if ($to_nlink == 1) {
491 $recovered_bytes += $incumbent_size;
494 # FIXME: Now we're really confused for some reason.
495 # Start over to rebuild state.
498 warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
500 # FIXME: This is a lame heuristic. We really need to know if we've
501 # tried all possible ways to hardlink the file out of existence first;
502 # however, that is complex and only benefits a silly statistic.
503 if ($to_nlink == 1 || $from_nlink == 1) {
505 $lost_bytes += $incumbent_size;
# No other inode has this digest yet: the candidate becomes the incumbent.
515 print STDERR "\t\tNew hash entered\n" if $debug;
516 $hash_to_inode{$digest} = $candidate;
# An inode whose every file failed to digest is dropped from consideration.
519 warn "No digests found for inode $candidate\n";
520 delete $inode_to_file_name{$candidate};
526 print STDERR "Merge done.\n" if $debug;
# Forget this key's inodes; the next weak key starts fresh.
527 undef %inode_to_file_name;
# Parse one NUL-terminated record from the find|sort pipe: five space-
# separated weak-key fields (size, uid, gid, mode, mtime -- the last may be
# negative), then inode number and file name.
531 my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
532 die "read error: $!\nLast input line was '$_'" unless defined($name);
534 print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
# Re-check the file type: it may have changed since find(1) saw it.
536 unless (! (-l $name) && (-f _)) {
537 warn "Bogon file " . tick_quote($name);
# Input is sorted by weak key, so a key change means every file with the
# previous key has been seen: merge that batch now.
543 merge_files if $weak_key ne $current_key;
544 $current_key = $weak_key;
# Record this name as one of $inode's links (names stored as hash keys).
546 $inode_to_file_name{$inode}->{$name} = undef;
548 print STDERR "$name\n" if $verbose;
553 my $stats_blob = <<STATS;
554 compare_bytes $compare_bytes
555 compare_count $compare_count
556 compare_differences $compare_differences
557 compare_errors $compare_errors
558 hard_links $hard_links
559 hash_bytes $hash_bytes
560 hash_errors $hash_errors
561 hash_files $hash_files
562 input_bogons $input_bogons
563 input_files $input_files
564 input_links $input_links
565 link_errors $link_errors
566 link_retries $link_retries
567 lost_bytes $lost_bytes
568 lost_files $lost_files
569 merges_attempted $merges_attempted
570 recovered_bytes $recovered_bytes
571 recovered_files $recovered_files
573 trivially_unique $trivially_unique
579 sub measure_numbers {
# Width of $num once thousands separators (spaces) are inserted: one extra
# column per complete group of three digits after the first ($num is
# unpacked from @_ on an elided line).  Tracks the global maximum.
581 my $len = length($num);
582 $len += int( (length($num) - 1) / 3);
583 $max_num_len = $len if $len > $max_num_len;
# First pass over a throwaway copy: find the widest formatted number.
586 (my $dummy = $stats_blob) =~ s/\d+/measure_numbers($&)/geos;
# space_numbers (sub header elided): insert a space before every trailing
# group of three digits, then left-pad to the common width so numbers
# line up right-justified.
590 1 while $num =~ s/(\d)(\d\d\d)( \d\d\d)*$/$1 $2$3/os;
591 $num = ' ' x ($max_num_len - length($num)) . $num;
# Second pass: rewrite the report with grouped, aligned numbers.
595 $stats_blob =~ s/\d+/space_numbers($&)/geos;
# Replace the run of padding spaces between label and value with a dotted
# leader on alternating lines, then emit the report.
598 $stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
600 print STDERR $stats_blob;