From 92a22d87fd9fb90d125cea123e093fa900883c84 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 23 Dec 2008 14:52:04 -0500
Subject: [PATCH] Initial commit

---
 faster-dupemerge | 517 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 517 insertions(+)
 create mode 100755 faster-dupemerge

diff --git a/faster-dupemerge b/faster-dupemerge
new file mode 100755
index 0000000..6508fc0
--- /dev/null
+++ b/faster-dupemerge
@@ -0,0 +1,517 @@
+#!/usr/bin/perl -w
+use strict;
+use Fcntl qw(:DEFAULT :flock);
+use File::Compare;
+use File::Temp;
+
+my $links_input = 0;
+my $files_input = 0;
+my $bogons_input = 0;
+my $files_hashed = 0;
+my $hash_errors = 0;
+my $compare_count = 0;
+my $compare_errors = 0;
+my $compare_differences = 0;
+my $trivially_unique = 0;
+my $merges_attempted = 0;
+my $hard_links = 0;
+my $link_errors = 0;
+my $link_retries = 0;
+my $bytes_recovered = 0;
+my $files_recovered = 0;
+my $files_lost = 0;
+my $bytes_lost = 0;
+my $surprises = 0;
+
+eval '
+	use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+';
+
+if ($@) {
+	warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
+
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		my $fv = open(MD5SUM, "-|");
+		die "fork: $!" unless defined($fv);
+		if ($fv) {
+			my ($sum_line) = <MD5SUM>;
+			close(MD5SUM) or die "md5sum: exit status $? (error status $!)";
+			die "hash error: got EOF instead of md5sum output" unless defined($sum_line);
+			my ($sum) = $sum_line =~ m/^([a-fA-F0-9]{32})/o;
+			die "hash error: got \Q$sum_line\E instead of md5sum output" unless defined($sum);
+			return $sum;
+		} else {
+			sysopen(STDIN, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+			exec('md5sum');
+			# Perl guarantees it will die here
+		}
+	}
+DIGEST
+} else {
+	eval <<'DIGEST';
+	sub digest {
+		my ($filename) = (@_);
+		die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
+		my $ctx = Digest::SHA1->new;
+		sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
+		binmode(FILE);	# FIXME: Necessary?  Probably harmless...
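+		# addfile() reads the remainder of the filehandle and feeds it to the
+		# SHA-1 context, so the file is hashed without slurping it into memory.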
+		$ctx->addfile(\*FILE);
+		close(FILE) or die "close: $filename: $!";
+		return $ctx->b64digest;
+	}
+DIGEST
+}
+
+my $collapse_access = 0;
+my $collapse_timestamp = 0;
+my $collapse_zero = 0;
+my $trust_hashes = 0;
+my $verbose = 0;
+my $debug = 0;
+my @extra_find_opts = ();
+my @extra_sort_opts = ();
+my $lock_file;
+my $lock_rm = 0;
+my $lock_obtained = 0;
+
+my @directories;
+
+sub usage {
+	my $name = shift(@_);
+	die <<USAGE;
+Usage: $name [options] directory [directory...]
+
+Finds files with identical content in the given directories and merges
+them into hard links to a single inode.
+
+Options:
+    --access          ignore owner, group, and permission differences
+    --timestamps      ignore modification time differences
+    --zeros           also merge empty files
+    --trust           trust matching hashes; skip byte-by-byte comparison
+    --lock FILE       hold an exclusive flock on FILE while running
+    --lock-rm         remove the lock file when finished
+    --verbose         report progress on stderr
+    --debug           copious debugging output (implies --verbose)
+    --find ARGS... -- pass extra arguments to find
+    --sort ARGS... -- pass extra arguments to sort
+USAGE
+}
+
+while ($#ARGV >= 0) {
+	my $arg = shift(@ARGV);
+	if ($arg eq '--access') {
+		$collapse_access = 1;
+	} elsif ($arg eq '--timestamps') {
+		$collapse_timestamp = 1;
+	} elsif ($arg eq '--zeros') {
+		$collapse_zero = 1;
+	} elsif ($arg eq '--trust') {
+		$trust_hashes = 1;
+	} elsif ($arg eq '--verbose') {
+		$verbose = 1;
+	} elsif ($arg eq '--lock-rm') {
+		$lock_rm = 1;
+	} elsif ($arg eq '--lock') {
+		$lock_file = shift(@ARGV);
+		unless (defined($lock_file)) {
+			usage($0);
+			exit(1);
+		}
+	} elsif ($arg eq '--debug') {
+		$debug = $verbose = 1;
+	} elsif ($arg eq '--find') {
+		while ($#ARGV >= 0) {
+			my $extra_arg = shift(@ARGV);
+			last if $extra_arg eq '--';
+			push(@extra_find_opts, $extra_arg);
+		}
+	} elsif ($arg eq '--sort') {
+		while ($#ARGV >= 0) {
+			my $extra_arg = shift(@ARGV);
+			last if $extra_arg eq '--';
+			push(@extra_sort_opts, $extra_arg);
+		}
+	} elsif ($arg =~ /^-/o) {
+		usage($0);
+		exit(1);
+	} else {
+		push(@directories, $arg);
+	}
+}
+
+if (defined($lock_file)) {
+	sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
+	flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
+	print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
+	$lock_obtained = 1;
+}
+
+END {
+	if ($lock_obtained) {
+		print STDERR "Removing '$lock_file'.\n" if $verbose;
+		unlink($lock_file) or warn "unlink: $lock_file: $!";
+	}
+}
+
+sub tick_quote {
+	my ($text) = (@_);
+	$text =~ s/'/'\\''/go;
+	return "'$text'";
+}
+
+my @find_command = ('find', @directories, @extra_find_opts, '-type', 'f');
+my $printf_string = '%s ' .
+	($collapse_access ? '0 0 0 ' : '%U %G %m ') .
+	($collapse_timestamp ? '0 ' : '%T@ ') .
+	'%i %p\0';
+
+push(@find_command, '!', '-empty') unless $collapse_zero;
+push(@find_command, '-printf', $printf_string);
+
+my @sort_command = ('sort', '-znr', @extra_sort_opts);
+my @quoted_sort_command = map { tick_quote($_) } @sort_command;
+my $quoted_sort_command = join(' ', @quoted_sort_command);
+
+my @quoted_find_command = map { tick_quote($_) } @find_command;
+my $quoted_find_command = join(' ', @quoted_find_command);
+print STDERR "find command: $quoted_find_command | $quoted_sort_command\n" if $verbose;
+
+open(FIND, "$quoted_find_command | $quoted_sort_command |") or die "open: $!";
+$/ = "\0";
+
+# Input is sorted so that all weak keys are contiguous.
+# When the key changes, we have to process all files we previously know about.
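+# A "weak key" is the size/owner/mode/mtime prefix emitted by find's -printf
+# above; only files whose weak keys match can possibly be duplicates, so each
+# run of equal keys is hashed and merged as an independent group.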
+my $current_key = -1;
+
+# %inode_to_file_name maps each inode number to a hash whose keys are the
+# path names currently known for that inode.
+my %inode_to_file_name = ();
+
+# Replace $to with a hard link to $from.  The link is created under a
+# temporary name in $to's directory and then renamed over $to, so $to is
+# never missing, even momentarily.
+sub link_files {
+	my ($from, $to) = (@_);
+	my $inode_dir = $to;
+	my $inode_base = $to;
+	$inode_dir =~ s:[^/]*$::o;
+	$inode_base =~ s:^.*/::os;
+	my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+	my $quoted_from = tick_quote($from);
+	my $quoted_to = tick_quote($to);
+	print STDERR "ln -f $quoted_from $quoted_to\n";
+	print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
+	link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
+	print STDERR "\trename: $tmp_to -> $to\n" if $debug;
+	unless (rename($tmp_to, $to)) {
+		my $saved_bang = $!;
+		unlink($tmp_to) or warn "unlink: $tmp_to: $!";	# Try, possibly in vain, to clean up
+		die "rename: $tmp_to -> $to: $saved_bang";
+	}
+}
+
+# Process all files known so far (one weak-key group at a time).
+sub merge_files {
+	$merges_attempted++;
+
+	my %hash_to_inode;
+	# Used to stop link retry loops (there is a goto in here!  Actually two...)
+	my %stop_loop;
+
+	my @candidate_list = keys(%inode_to_file_name);
+	$files_input += @candidate_list;
+	if (@candidate_list < 2) {
+		print STDERR "Merging...only one candidate to merge...\n" if $debug;
+		$trivially_unique++;
+		goto end_merge;
+	}
+
+	print STDERR "Merging...\n" if $debug;
+	foreach my $candidate (@candidate_list) {
+		print STDERR "\tDigesting candidate $candidate\n" if $debug;
+		my $ok = 0;
+		my $digest;
+
+		hash_file:
+		foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+			print STDERR "\t\tDigesting file $filename\n" if $debug;
+			if ((-l $filename) || ! -f _) {
+				warn "Bogon file " . tick_quote($filename);
+				$surprises++;
+				next;
+			}
+			eval { $digest = digest($filename); };
+			if ($@) {
+				warn "Digest($filename)(#$candidate) failed: $@";
+				$hash_errors++;
+			} else {
+				$ok = 1;
+				$files_hashed++;
+				last hash_file;
+			}
+		}
+		if ($ok) {
+			print STDERR "\t\tDigest is $digest\n" if $debug;
+
+			my $incumbent = $hash_to_inode{$digest};
+			if (defined($incumbent)) {
+				print STDERR "\t\tInodes $incumbent and $candidate have same hash\n" if $debug;
+
+				my $finished = 0;
+
+				link_start:
+				until ($finished) {
+					my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
+					my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+					print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug;
+					print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
+
+					incumbent_file:
+					foreach my $incumbent_file (@incumbent_names) {
+						my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
+						print STDERR "\t\tINCUMBENT dev=$incumbent_dev ino=$incumbent_ino mode=$incumbent_mode nlink=$incumbent_nlink uid=$incumbent_uid gid=$incumbent_gid rdev=$incumbent_rdev size=$incumbent_size atime=$incumbent_atime mtime=$incumbent_mtime ctime=$incumbent_ctime blksize=$incumbent_blksize blocks=$incumbent_blocks _=$incumbent_file\n" if $debug;
+
+						if (!defined($incumbent_blocks)) {
+							warn "lstat: $incumbent_file: $!";
+							$surprises++;
+							next incumbent_file;
+						}
+
+						if ($incumbent_ino != $incumbent) {
+							warn "$incumbent_file: expected inode $incumbent, found $incumbent_ino";
+							$surprises++;
+							next incumbent_file;
+						}
+
+						my $at_least_one_link_done = 0;
+
+						candidate_file:
+						foreach my $candidate_file (@candidate_names) {
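+							# lstat() the candidate path again: it may have been renamed or replaced
+							# since find emitted this record, so its inode is re-checked below.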
+							my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
+							print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
+
+							if (!defined($candidate_blocks)) {
+								warn "lstat: $candidate_file: $!";
+								$surprises++;
+								next candidate_file;
+							}
+
+							if ($candidate_ino != $candidate) {
+								warn "$candidate_file: expected inode $candidate, found $candidate_ino";
+								$surprises++;
+								next candidate_file;
+							}
+
+							if ($candidate_size != $incumbent_size) {
+								warn "$candidate_file, $incumbent_file: file sizes are different";
+								$surprises++;
+								next candidate_file;
+							}
+
+							my $identical;
+
+							eval {
+								if ($trust_hashes) {
+									print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+									$identical = 1;
+								} else {
+									my $quoted_incumbent_file = tick_quote($incumbent_file);
+									my $quoted_candidate_file = tick_quote($candidate_file);
+									print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+									if (compare($incumbent_file, $candidate_file)) {
+										$compare_differences++;
+										$identical = 0;
+										print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n";
+									} else {
+										$identical = 1;
+									}
+									$compare_count++;
+								}
+							};
+							if ($@) {
+								warn $@;
+								$compare_errors++;
+								next candidate_file;
+							}
+
+							if ($identical) {
+								print STDERR "\t\t\t\tincumbent_nlink=$incumbent_nlink, candidate_nlink=$candidate_nlink\n" if $debug;
+
+								# We have to do this to break out of a possible infinite loop.
+								# Given file A, with hardlinks A1 and A2, and file B, with hardlink B1,
+								# such that A1 and B1 are in non-writable directories, we will loop
+								# forever hardlinking A2 with A and B.
+								# To break the loop, we never attempt to hardlink any files X and Y twice.
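+								# %stop_loop records each (incumbent, candidate) pair already attempted,
+								# keyed in both orders, so a pair that cannot be linked is tried only once.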
+
+								if (defined($stop_loop{$incumbent_file}->{$candidate_file}) ||
+								    defined($stop_loop{$candidate_file}->{$incumbent_file})) {
+									print STDERR "Already considered linking '$incumbent_file' and '$candidate_file', not trying again now\n";
+								} else {
+									$stop_loop{$incumbent_file}->{$candidate_file} = 1;
+									$stop_loop{$candidate_file}->{$incumbent_file} = 1;
+
+									my $link_done = 0;
+
+									# Link from the inode with more names to the one with fewer,
+									# so the inode with fewer names is the one that disappears.
+									my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+									if ($candidate_nlink > $incumbent_nlink) {
+										$from_file = $candidate_file;
+										$to_file = $incumbent_file;
+										$from_inode = $candidate;
+										$to_inode = $incumbent;
+										$from_nlink = $candidate_nlink;
+										$to_nlink = $incumbent_nlink;
+									} else {
+										$to_file = $candidate_file;
+										$from_file = $incumbent_file;
+										$to_inode = $candidate;
+										$from_inode = $incumbent;
+										$to_nlink = $candidate_nlink;
+										$from_nlink = $incumbent_nlink;
+									}
+
+									eval {
+										link_files($from_file, $to_file);
+										$link_done = 1;
+									};
+
+									if ($@) {
+										warn $@;
+										$link_errors++;
+
+										print STDERR "\t\t\t\t...retrying with swapped from/to files...\n" if $debug;
+										$link_retries++;
+
+										eval {
+											($from_file, $to_file) = ($to_file, $from_file);
+											($from_inode, $to_inode) = ($to_inode, $from_inode);
+											($from_nlink, $to_nlink) = ($to_nlink, $from_nlink);
+											link_files($from_file, $to_file);
+											$link_done = 1;
+										};
+
+										if ($@) {
+											warn $@;
+											$link_errors++;
+										}
+									}
+
+									# Note since the files are presumably identical, they both have the same size.
+									# My random number generator chooses the incumbent's size.
+
+									if ($link_done) {
+										delete $inode_to_file_name{$to_inode}->{$to_file};
+										$inode_to_file_name{$from_inode}->{$to_file} = undef;
+										$hash_to_inode{$digest} = $from_inode;
+
+										$hard_links++;
+										if ($to_nlink == 1) {
+											$files_recovered++;
+											$bytes_recovered += $incumbent_size;
+										}
+
+										# The name lists we are iterating over are now stale.
+										# Start over to rebuild state.
+										next link_start;
+									} else {
+										warn "Could not hardlink '$incumbent_file' and '$candidate_file'";
+
+										# FIXME: This is a lame heuristic.  We really need to know if we've
+										# tried all possible ways to hardlink the file out of existence first;
+										# however, that is complex and only benefits a silly statistic.
+										if ($to_nlink == 1 || $from_nlink == 1) {
+											$files_lost++;
+											$bytes_lost += $incumbent_size;
+										}
+									}
+								}
+							}
+						}
+					}
+					$finished = 1;
+				}
+			} else {
+				print STDERR "\t\tNew hash entered\n" if $debug;
+				$hash_to_inode{$digest} = $candidate;
+			}
+		} else {
+			warn "No digests found for inode $candidate\n";
+			delete $inode_to_file_name{$candidate};
+		}
+	}
+
+	end_merge:
+	print STDERR "Merge done.\n" if $debug;
+	undef %inode_to_file_name;
+}
+
+while (<FIND>) {
+	my ($weak_key, $inode, $name) = m/^(\d+ \d+ \d+ \d+ -?\d+) (\d+) (.+)\0$/so;
+	die "read error: $!\nLast input line was '$_'" unless defined($name);
+
+	print STDERR "weak_key=$weak_key inode=$inode name=$name\n" if $debug;
+
+	unless (! (-l $name) && (-f _)) {
+		warn "Bogon file " . tick_quote($name);
+		$bogons_input++;
+		next;
+	}
+
+	$links_input++;
+	merge_files if $weak_key ne $current_key;
+	$current_key = $weak_key;
+
+	$inode_to_file_name{$inode}->{$name} = undef;
+
+	print STDERR "$name\n" if $verbose;
+}
+
+merge_files;
+
+print STDERR <<STATS;
+Statistics:
+	hard links input:     $links_input
+	files (inodes) input: $files_input
+	bogons input:         $bogons_input
+	files hashed:         $files_hashed
+	hash errors:          $hash_errors
+	comparisons:          $compare_count
+	compare errors:       $compare_errors
+	compare differences:  $compare_differences
+	trivially unique:     $trivially_unique
+	merges attempted:     $merges_attempted
+	hard links created:   $hard_links
+	link errors:          $link_errors
+	link retries:         $link_retries
+	surprises:            $surprises
+	files recovered:      $files_recovered
+	bytes recovered:      $bytes_recovered
+	files lost:           $files_lost
+	bytes lost:           $bytes_lost
+STATS