From 95742bfa9d5a1f33ce77689f0b2fb93bdf888e85 Mon Sep 17 00:00:00 2001
From: Zygo Blaxell <zblaxell@faye.furryterror.org>
Date: Wed, 6 Jan 2010 11:10:04 -0500
Subject: [PATCH] dupemerge: maybe improve seek performance by sorting perl
 hashes

Thank Johannes Niess <Linux@johannes-niess.de> for this idea.

To improve seek performance, choose inodes for linking in a fixed order.
This will mean that two directories with multiple identical files will
end up with links to the copies with lower inode numbers.  This is an
improvement over the previous result, which was that both directories
would end up with randomly chosen files from both directories.

The sort order isn't strictly numeric; however, it's hopefully close
enough.

As a crude heuristic, we assume that inode numbers approximate file
position on disk, and file names approximate typical usage patterns.
Previously we used the perl hash semantics, which are mostly random
and might change depending on the numbers of files considered.
---
 faster-dupemerge | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/faster-dupemerge b/faster-dupemerge
index 6371fd0..6223ab0 100755
--- a/faster-dupemerge
+++ b/faster-dupemerge
@@ -305,14 +305,14 @@ sub merge_files {
 	}
 
 	print STDERR "Merging...\n" if $debug;
-	foreach my $candidate (@candidate_list) {
+	foreach my $candidate (sort @candidate_list) {
 		print STDERR "\tDigesting candidate $candidate\n" if $debug;
 		my $ok = 0;
 		my $digest;
 
 hash_file:
 
-		foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+		foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) {
 			print STDERR "\t\tDigesting file $filename\n" if $debug;
 			if ((-l $filename) || ! -f _) {
 				warn "Bogon file " . tick_quote($filename);
@@ -342,8 +342,8 @@ hash_file:
 link_start:
 
 				until ($finished) {
-					my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
-					my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+					my @incumbent_names = sort keys(%{$inode_to_file_name{$incumbent}});
+					my @candidate_names = sort keys(%{$inode_to_file_name{$candidate}});
 					print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
 					print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
@@ -441,6 +441,12 @@ candidate_file:
 									my $link_done = 0;
 
 									my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+
+									# If the candidate has more links than incumbent, replace incumbent with candidate.
+									# If the incumbent has more links than candidate, replace candidate with incumbent.
+									# If the link counts are equal, we saw incumbent first, so keep the incumbent.
+									# "We saw incumbent first" is significant because we explicitly sort the inodes.
+									# Thank Johannes Niess for this idea.
 									if ($candidate_nlink > $incumbent_nlink) {
 										$from_file = $candidate_file;
 										$to_file = $incumbent_file;
-- 
2.30.2