--- /dev/null
+#!/usr/bin/perl -w
+use strict;
+use Fcntl qw(:DEFAULT :flock);
+use File::Compare;
+use File::Temp;
+
+# Statistics counters; all are reported to STDERR just before exit.
+my $links_input = 0;		# file names (hard links) read from find
+my $files_input = 0;		# inodes examined by merge_files()
+my $bogons_input = 0;		# names that were not plain files when checked
+my $files_hashed = 0;		# successful digest() calls
+my $hash_errors = 0;		# digest() failures
+my $compare_count = 0;		# byte-by-byte comparisons performed
+my $compare_errors = 0;		# comparisons that raised an error
+my $compare_differences = 0;	# same hash but different content
+my $trivially_unique = 0;	# merge groups containing a single inode
+my $merges_attempted = 0;	# calls to merge_files()
+my $hard_links = 0;		# successful link-and-rename operations
+my $link_errors = 0;		# failed link attempts
+my $link_retries = 0;		# retries with from/to swapped
+my $bytes_recovered = 0;	# bytes freed when an inode's last name was replaced
+my $files_recovered = 0;	# inodes whose last name was replaced (nlink was 1)
+my $files_lost = 0;		# inodes we failed to hardlink out of existence
+my $bytes_lost = 0;		# bytes of those inodes
+my $surprises = 0;		# unexpected filesystem states (lstat/inode mismatches)
+
+# Pick a hashing implementation.  Prefer Digest::SHA1 (in-process); if the
+# module is unavailable, fall back to forking the external md5sum(1)
+# program.  Either way a digest($filename) sub is defined via string eval;
+# it returns a hash string for the file's contents or dies on any error.
+eval '
+	use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+';
+
+if ($@) {
+	warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		# Fork a child whose stdout we read; the child execs md5sum.
+		my $fv = open(MD5SUM, "-|");
+		die "fork: $!" unless defined($fv);
+		if ($fv) {
+			# Parent: read the "<32 hex digits>  <name>" line and
+			# extract the hash.
+			my ($sum_line) = <MD5SUM>;
+			close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
+			die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
+			my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
+			die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
+			return $sum;
+		} else {
+			# Child: make the target file our stdin so md5sum reads it.
+			sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+			exec('md5sum');
+			# Perl guarantees it will die here
+		}
+	}
+DIGEST
+} else {
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		# Refuse symlinks and non-regular files outright.
+		die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
+		my $ctx = Digest::SHA1->new;
+		sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+		binmode(FILE); # FIXME: Necessary? Probably harmless...
+		$ctx->addfile(\*FILE);
+		close(FILE) or die "close: $filename: $!";
+		return $ctx->b64digest;
+	}
+DIGEST
+}
+
+# Command-line option state (see usage() below for the user-facing text).
+my $collapse_access = 0;	# --access: ignore uid/gid/mode differences
+my $collapse_timestamp = 0;	# --timestamps: ignore mtime differences
+my $collapse_zero = 0;		# --zeros: also consider zero-length files
+my $trust_hashes = 0;		# --trust: skip byte-by-byte comparison
+my $verbose = 0;		# --verbose: report progress on STDERR
+my $debug = 0;			# --debug: trace every step (implies --verbose)
+my @extra_find_opts = ();	# --find ... -- : extra args passed to find(1)
+my @extra_sort_opts = ();	# --sort ... -- : extra args passed to sort(1)
+my $lock_file;			# --lock FILE: flock this file exclusively
+my $lock_rm = 0;		# --lock-rm: remove the lock file at exit
+my $lock_obtained = 0;		# set once the flock has succeeded
+
+# Directories to scan, from the remaining command-line arguments.
+my @directories;
+
+# Print the usage text (via die, so the caller's exit path is taken) for
+# the program name given in $name — normally $0.
+sub usage {
+	my $name = shift(@_);
+	die <<USAGE;
+Usage: $name [--opts] directory [directory...]
+Finds duplicate files in the given directories, and replaces all identical
+copies of a file with hard-links to a single file.
+
+Several options modify the definition of a "duplicate". By default, files
+which have differences in owner uid or gid, permission (mode), or
+modification time (mtime) are considered different, so that hardlinking
+files does not also change their attributes. Additionally, all files of
+zero size are ignored for performance reasons (there tend to be many
+of them, and they tend not to release any space when replaced with
+hard links).
+
+  --access            uid, gid, and mode may be different for identical
+                      files
+
+  --debug             show all steps in duplication discovery process
+                      (implies --verbose)
+
+  --find              pass next options (up to --) to find command
+
+  --lock FILE         exit immediately (status 10) if unable to obtain a
+                      flock(LOCK_EX|LOCK_NB) on FILE
+
+  --lock-rm           remove lock file at exit
+
+  --sort              pass next options (up to --) to sort command
+
+  --timestamps        mtime may be different for identical files
+
+  --trust             skip byte-by-byte file comparisons
+                      (trust the hash function)
+
+  --verbose           report files as they are considered
+
+  --zeros             hard-link zero-length files too
+USAGE
+}
+
+# Parse the command line.  Simple boolean flags are handled through a
+# dispatch table; --lock takes a value, --debug sets two flags, and
+# --find/--sort swallow arguments up to a literal "--".  Anything that
+# does not look like an option is collected as a directory to scan.
+my %flag_for = (
+	'--access'     => \$collapse_access,
+	'--timestamps' => \$collapse_timestamp,
+	'--zeros'      => \$collapse_zero,
+	'--trust'      => \$trust_hashes,
+	'--verbose'    => \$verbose,
+	'--lock-rm'    => \$lock_rm,
+);
+
+while (@ARGV) {
+	my $opt = shift(@ARGV);
+	if (exists $flag_for{$opt}) {
+		${ $flag_for{$opt} } = 1;
+	} elsif ($opt eq '--lock') {
+		$lock_file = shift(@ARGV);
+		unless (defined($lock_file)) {
+			usage($0);
+			exit(1);
+		}
+	} elsif ($opt eq '--debug') {
+		$debug = $verbose = 1;
+	} elsif ($opt eq '--find' or $opt eq '--sort') {
+		# Collect pass-through arguments until "--" or end of line.
+		my $sink = $opt eq '--find' ? \@extra_find_opts : \@extra_sort_opts;
+		while (@ARGV) {
+			my $passthrough = shift(@ARGV);
+			last if $passthrough eq '--';
+			push(@$sink, $passthrough);
+		}
+	} elsif ($opt =~ /^-/) {
+		# Unrecognised option.
+		usage($0);
+		exit(1);
+	} else {
+		push(@directories, $opt);
+	}
+}
+
+# Obtain the --lock advisory lock before doing any work.  The handle is
+# deliberately left open for the life of the process so the lock is held
+# until exit.
+if (defined($lock_file)) {
+	sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
+	unless (flock(LOCK_FILE, LOCK_EX|LOCK_NB)) {
+		# usage() promises exit status 10 when the lock cannot be
+		# obtained; a plain die would exit with 255 instead.
+		warn "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
+		exit(10);
+	}
+	print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
+	$lock_obtained = 1;
+}
+
+END {
+	# Remove the lock file only when --lock-rm was given.  $lock_rm was
+	# previously parsed (and documented) but never consulted, so the
+	# lock file was unlinked at exit even without --lock-rm.
+	if ($lock_obtained && $lock_rm) {
+		print STDERR "Removing '$lock_file'.\n" if $verbose;
+		unlink($lock_file) or warn "unlink: $lock_file: $!";
+	}
+}
+
+# Quote a string for safe interpolation into a Bourne-shell command line:
+# wrap it in single quotes, replacing each embedded single quote with the
+# sequence '\'' (close quote, escaped quote, reopen quote).
+sub tick_quote {
+	my ($text) = (@_);
+	# The s/// replacement is parsed like a double-quoted string, so a
+	# doubled backslash is needed to emit a literal backslash.  The
+	# previous s/'/'\''/go interpolated \' to ' and emitted ''' — which
+	# the shell collapses, silently deleting the quote character.
+	$text =~ s/'/'\\''/g;
+	return "'$text'";
+}
+
+# Build the find(1) pipeline.  Each record is
+#   "<size> <uid> <gid> <mode> <mtime> <inode> <path>\0"
+# (uid/gid/mode and mtime collapse to 0 under --access / --timestamps so
+# that they no longer distinguish files).  sort -znr groups identical
+# weak keys together, largest first.
+my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
+my $printf_string = '%s ' .
+    ($collapse_access ? '0 0 0 ' : '%U %G %m ') .
+    ($collapse_timestamp ? '0 ' : '%T@ ') .
+    '%i %p\0';
+
+push(@find_command, '!', '-empty') unless $collapse_zero;
+push(@find_command, '-printf', $printf_string);
+
+my @sort_command = ('sort', '-znr', @extra_sort_opts);
+
+# Shell-quote every word individually.  The previous code used
+# grep(tick_quote($_), @copy) in void context; tick_quote() copies its
+# argument, so the arrays were never modified and arguments containing
+# shell metacharacters reached the shell unescaped.
+my $quoted_sort_command = join(' ', map { tick_quote($_) } @sort_command);
+my $quoted_find_command = join(' ', map { tick_quote($_) } @find_command);
+print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
+
+open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
+$/ = "\0";	# records are NUL-terminated (find -printf '\0', sort -z)
+
+# Input is sorted so that all weak keys are contiguous.
+# When the key changes, we have to process all files we previously know about.
+my $current_key = -1;
+
+# $inode_to_file_name{$inode}->{$file_name} = undef
+# (a hash of hashes: the inner keys form the set of names for the inode;
+# the old "[@file_names]" arrayref description was stale)
+my %inode_to_file_name = ();
+
+# Link files
+# Atomically replace $to with a hard link to $from: create a temporary
+# link in $to's directory, then rename() it over $to, so there is no
+# window in which $to does not exist.  Dies (after attempting to clean up
+# the temporary link) if either step fails.
+sub link_files {
+	my ($from, $to) = (@_);
+	my $inode_dir = $to;
+	my $inode_base = $to;
+	$inode_dir =~ s:[^/]*$::o;	# directory part of $to
+	$inode_base =~ s:^.*/::os;	# basename part of $to
+	my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+	my $quoted_from = tick_quote($from);
+	my $quoted_to = tick_quote($to);
+	print STDERR "ln -f $quoted_from $quoted_to\n";
+	print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
+	link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
+	print STDERR "\trename: $tmp_to -> $to\n" if $debug;
+	unless (rename($tmp_to, $to)) {
+		my $saved_bang = $!;
+		unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
+		# Report the actual rename destination; the message previously
+		# named $from, which was never a rename operand.
+		die "rename: $tmp_to -> $to: $saved_bang";
+	}
+}
+
+# Process all known files so far.
+sub merge_files {
+ $merges_attempted++;
+
+ my %hash_to_inode;
+ # Used to stop link retry loops (there is a goto in here! Actually two...)
+ my %stop_loop;
+
+ my @candidate_list = keys(%inode_to_file_name);
+ $files_input += @candidate_list;
+ if (@candidate_list < 2) {
+ print STDERR "Merging...only one candidate to merge..." if $debug;
+ $trivially_unique++;
+ goto end_merge;
+ }
+
+ print STDERR "Merging...\n" if $debug;
+ foreach my $candidate (@candidate_list) {
+ print STDERR "\tDigesting candidate $candidate\n" if $debug;
+ my $ok = 0;
+ my $digest;
+
+hash_file:
+
+ foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+ print STDERR "\t\tDigesting file $filename\n" if $debug;
+ if ((-l $filename) || ! -f _) {
+ warn "Bogon file " . tick_quote($filename);
+ $surprises++;
+ next;
+ }
+ eval { $digest = digest($filename); };
+ if ($@) {
+ warn "Digest($filename)(#$candidate) failed: $@";
+ $hash_errors++;
+ } else {
+ $ok = 1;
+ $files_hashed++;
+ last hash_file;
+ }
+ }
+ if ($ok) {
+ print STDERR "\t\tDigest is $digest\n" if $debug;
+
+ my $incumbent = $hash_to_inode{$digest};
+ if (defined($incumbent)) {
+ print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
+
+ my $finished = 0;
+
+link_start:
+
+ until ($finished) {
+ my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
+ my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+ print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
+ print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
+
+ incumbent_file:
+
+ foreach my $incumbent_file (@incumbent_names) {
+ my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
+ print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
+
+ if (!defined($incumbent_blocks)) {
+ warn "lstat: $incumbent_file: $!";
+ $surprises++;
+ next incumbent_file;
+ }
+
+ if ($incumbent_ino != $incumbent) {
+ warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
+ $surprises++;
+ next incumbent_file;
+ }
+
+ candidate_file:
+
+ my $at_least_one_link_done = 0;
+
+ foreach my $candidate_file (@candidate_names) {
+ my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
+ print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
+
+ if (!defined($candidate_blocks)) {
+ warn "lstat: $candidate_file: $!";
+ $surprises++;
+ next candidate_file;
+ }
+
+ if ($candidate_ino != $candidate) {
+ warn "$candidate_file: expected inode $candidate, found $candidate_ino";
+ $surprises++;
+ next candidate_file;
+ }
+
+ if ($candidate_size != $incumbent_size) {
+ warn "$candidate_file, $incumbent_file: file sizes are different";
+ $surprises++;
+ next candidate_file;
+ }
+
+ my $identical;
+
+ eval {
+ if ($trust_hashes) {
+ print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+ $identical = 1;
+ } else {
+ my $quoted_incumbent_file = tick_quote($incumbent_file);
+ my $quoted_candidate_file = tick_quote($candidate_file);
+ print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+ if (compare($incumbent_file, $candidate_file)) {
+ $compare_differences++;
+ $identical = 0;
+ print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+ } else {
+ $identical = 1;
+ }
+ $compare_count++;
+ }
+ };
+ if ($@) {
+ warn $@;
+ $compare_errors++;
+ next candidate_file;
+ }
+
+ if ($identical) {
+ print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
+
+ # We have to do this to break out of a possible infinite loop.
+ # Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
+ # such that A1 and B1 are in non-writable directories, we will loop
+ # forever hardlinking A2 with A and B.
+ # To break the loop, we never attempt to hardlink any files X and Y twice.
+
+ if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
+ defined($stop_loop{$candidate_file}->{$incumbent_file})) {
+ print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
+ } else {
+ $stop_loop{$incumbent_file}->{$candidate_file} = 1;
+ $stop_loop{$candidate_file}->{$incumbent_file} = 1;
+
+ my $link_done = 0;
+
+ my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+ if ($candidate_nlink > $incumbent_nlink) {
+ $from_file = $candidate_file;
+ $to_file = $incumbent_file;
+ $from_inode = $candidate;
+ $to_inode = $incumbent;
+ $from_nlink = $candidate_nlink;
+ $to_nlink = $incumbent_nlink;
+ } else {
+ $to_file = $candidate_file;
+ $from_file = $incumbent_file;
+ $to_inode = $candidate;
+ $from_inode = $incumbent;
+ $to_nlink = $candidate_nlink;
+ $from_nlink = $incumbent_nlink;
+ }
+
+ eval {
+ link_files($from_file, $to_file);
+ $link_done = 1;
+ };
+
+ if ($@) {
+ warn $@;
+ $link_errors++;
+
+ print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
+ $link_retries++;
+
+ eval {
+ ($from_file, $to_file) = ($to_file, $from_file);
+ ($from_inode, $to_inode) = ($to_inode, $from_inode);
+ ($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
+ link_files($from_file, $to_file);
+ $link_done = 1;
+ };
+
+ if ($@) {
+ warn $@;
+ $link_errors++;
+ }
+ }
+
+ # Note since the files are presumably identical, they both have the same size.
+ # My random number generator chooses the incumbent's size.
+
+ if ($link_done) {
+ delete $inode_to_file_name{$to_inode}->{$to_file};
+ $inode_to_file_name{$from_inode}->{$to_file} = undef;
+ $hash_to_inode{$digest} = $from_inode;
+
+ $hard_links++;
+ if ($to_nlink == 1) {
+ $files_recovered++;
+ $bytes_recovered += $incumbent_size;
+ }
+
+ # FIXME: Now we're really confused for some reason.
+ # Start over to rebuild state.
+ next link_start;
+ } else {
+ warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
+
+ # FIXME: This is a lame heuristic. We really need to know if we've
+ # tried all possible ways to hardlink the file out of existence first;
+ # however, that is complex and only benefits a silly statistic.
+ if ($to_nlink == 1 || $from_nlink == 1) {
+ $files_lost++;
+ $bytes_lost += $incumbent_size;
+ }
+ }
+ }
+ }
+ }
+ }
+ $finished = 1;
+ }
+ } else {
+ print STDERR "\t\tNew hash entered\n" if $debug;
+ $hash_to_inode{$digest} = $candidate;
+ }
+ } else {
+ warn "No digests found for inode $candidate\n";
+ delete $inode_to_file_name{$candidate};
+ }
+ }
+
+end_merge:
+
+ print STDERR "Merge done.\n" if $debug;
+ undef %inode_to_file_name;
+}
+
+# Main loop: read NUL-terminated records produced by the find|sort
+# pipeline.  Records with the same weak key are contiguous (input is
+# sorted), so a key change means the accumulated group can be merged.
+while (<FIND>) {
+	# Modern GNU find prints %T@ with fractional seconds (e.g.
+	# "123.456789"); the original pattern required an integer mtime and
+	# made every record fail to parse.  Accept an optional fraction.
+	my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+(?:\.\d+)?) (\d+) (.+)\0$/so;
+	die "read error: $!\nLast input line was '$_'" unless defined($name);
+
+	print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
+
+	# Re-check the file type: find's view may already be stale.
+	unless (! (-l $name) && (-f _)) {
+		warn "Bogon file " . tick_quote($name);
+		$bogons_input++;
+		next;
+	}
+
+	$links_input++;
+	merge_files if $weak_key ne $current_key;
+	$current_key = $weak_key;
+
+	$inode_to_file_name{$inode}->{$name} = undef;
+
+	print STDERR "$name\n" if $verbose;
+}
+
+# Flush the final merge group.
+merge_files;
+
+# Report run statistics on STDERR (the dotted leaders on alternating
+# lines only aid visual alignment).
+print STDERR <<STATS;
+links_input $links_input
+files_input ........... $files_input
+bogons_input $bogons_input
+merges_attempted ...... $merges_attempted
+trivially_unique $trivially_unique
+files_hashed .......... $files_hashed
+hash_errors $hash_errors
+surprises ............. $surprises
+compare_count $compare_count
+compare_differences ... $compare_differences
+compare_errors $compare_errors
+hard_links ............ $hard_links
+link_errors $link_errors
+link_retries .......... $link_retries
+bytes_recovered $bytes_recovered
+files_recovered ....... $files_recovered
+bytes_lost $bytes_lost
+files_lost ............ $files_lost
+STATS
+
+exit(0);