dupemerge: maybe improve seek performance by sorting perl hashes
authorZygo Blaxell <zblaxell@faye.furryterror.org>
Wed, 6 Jan 2010 16:10:04 +0000 (11:10 -0500)
committerZygo Blaxell <zblaxell@waya.furryterror.org>
Sat, 9 Jan 2010 02:17:48 +0000 (21:17 -0500)
Thank Johannes Niess <Linux@johannes-niess.de> for this idea.

To improve seek performance, choose inodes for linking in a fixed order.
This will mean that two directories with multiple identical files will
end up with links to the copies with lower inode numbers.  This is an
improvement over the previous result, which was that both directories
would end up with randomly chosen files from both directories.

The sort order isn't strictly numeric; however, it's hopefully close
enough.

As a crude heuristic, we assume that inode numbers approximate file
position on disk, and file names approximate typical usage patterns.
Previously we used the perl hash semantics, which are mostly random
and might change depending on the numbers of files considered.

faster-dupemerge

index 6371fd037747c419cde6efce2c2a33e8b6788d6a..6223ab0adc2d7cacb4dee7e7c4f4e71a2aa3a1d4 100755 (executable)
@@ -305,14 +305,14 @@ sub merge_files {
        }
 
        print STDERR "Merging...\n" if $debug;
        }
 
        print STDERR "Merging...\n" if $debug;
-       foreach my $candidate (@candidate_list) {
+       foreach my $candidate (sort @candidate_list) {
                print STDERR "\tDigesting candidate $candidate\n" if $debug;
                my $ok = 0;
                my $digest;
 
 hash_file:
 
                print STDERR "\tDigesting candidate $candidate\n" if $debug;
                my $ok = 0;
                my $digest;
 
 hash_file:
 
-               foreach my $filename (keys(%{$inode_to_file_name{$candidate}})) {
+               foreach my $filename (sort keys(%{$inode_to_file_name{$candidate}})) {
                        print STDERR "\t\tDigesting file $filename\n" if $debug;
                        if ((-l $filename) || ! -f _) {
                                warn "Bogon file " . tick_quote($filename);
                        print STDERR "\t\tDigesting file $filename\n" if $debug;
                        if ((-l $filename) || ! -f _) {
                                warn "Bogon file " . tick_quote($filename);
@@ -342,8 +342,8 @@ hash_file:
 link_start:
 
                                until ($finished) {
 link_start:
 
                                until ($finished) {
-                                       my @incumbent_names = keys(%{$inode_to_file_name{$incumbent}});
-                                       my @candidate_names = keys(%{$inode_to_file_name{$candidate}});
+                                       my @incumbent_names = sort keys(%{$inode_to_file_name{$incumbent}});
+                                       my @candidate_names = sort keys(%{$inode_to_file_name{$candidate}});
                                        print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                        print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
                                        print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
                                        print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
@@ -441,6 +441,12 @@ candidate_file:
                                                                        my $link_done = 0;
 
                                                                        my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
                                                                        my $link_done = 0;
 
                                                                        my ($from_file, $to_file, $from_inode, $to_inode, $from_nlink, $to_nlink);
+
+                                                                       # If the candidate has more links than incumbent, replace incumbent with candidate.
+                                                                       # If the incumbent has more links than candidate, replace candidate with incumbent.
+                                                                       # If the link counts are equal, we saw incumbent first, so keep the incumbent.
+                                                                       # "We saw incumbent first" is significant because we explicitly sort the inodes.
+                                                                       # Thank Johannes Niess for this idea.
                                                                        if ($candidate_nlink > $incumbent_nlink) {
                                                                                $from_file = $candidate_file;
                                                                                $to_file = $incumbent_file;
                                                                        if ($candidate_nlink > $incumbent_nlink) {
                                                                                $from_file = $candidate_file;
                                                                                $to_file = $incumbent_file;