From 95742bfa9d5a1f33ce77689f0b2fb93bdf888e85 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Wed, 6 Jan 2010 11:10:04 -0500 Subject: [PATCH] dupemerge: maybe improve seek performance by sorting perl hashes Thank Johannes Niess for this idea. To improve seek performance, choose inodes for linking in a fixed order. This will mean that two directories with multiple identical files will end up with links to the copies with lower inode numbers. This is an improvement over the previous result, which was that both directories would end up with randomly chosen files from both directories. The sort order isn't strictly numeric; however, it's hopefully close enough. As a crude heuristic, we assume that inode numbers approximate file position on disk, and file names approximate typical usage patterns. Previously we used the perl hash semantics, which are mostly random and might change depending on the numbers of files considered. --- faster-dupemerge | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/faster-dupemerge b/faster-dupemerge index 6371fd0..6223ab0 100755 --- a/faster-dupemerge +++ b/faster-dupemerge @@ -305,14 +305,14 @@ sub merge_files { } print STDERR "Merging...\n" if $debug; - foreach my $candidate (@candidate_list) { + foreach my $candidate (sort @candidate_list) { print STDERR "\tDigesting candidate $candidate\n" if $debug; my $ok = 0; my $digest; hash_file: - foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) { + foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) { print STDERR "\t\tDigesting file $filename\n" if $debug; if ((-l $filename) || ! -f _) { warn "Bogon file " . tick_quote($filename); @@ -342,8 +342,8 @@ hash_file: link_start: until ($finished) { - my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}}); - my @candidate_names = keys(%{$inode_to_file_name{$candidate}}); + my @incumbent_names = sort keys(%{$inode_to_file_name{$incumbent}}); + my @candidate_names = sort keys(%{$inode_to_file_name{$candidate}}); print STDERR "\t\tLinks to $incumbent:", join("\n\t\t\t", '', @incumbent_names), "\n" if $debug; print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug; @@ -441,6 +441,12 @@ candidate_file: my $link_done = 0; my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink); + + # If the candidate has more links than incumbent, replace incumbent with candidate. + # If the incumbent has more links than candidate, replace candidate with incumbent. + # If the link counts are equal, we saw incumbent first, so keep the incumbent. + # "We saw incumbent first" is significant because we explicitly sort the inodes. + # Thank Johannes Niess for this idea. if ($candidate_nlink > $incumbent_nlink) { $from_file = $candidate_file; $to_file = $incumbent_file; -- 2.30.2