From: Zygo Blaxell Date: Thu, 13 May 2010 20:07:52 +0000 (-0400) Subject: dm6: finish WIP X-Git-Tag: dm6-0.20100514~5^2~14 X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;ds=sidebyside;h=fd37519dbb05cea1e1f8d20fbc944f9380f71f70;hp=8104c13303bb6b502c35ace2d956919367e51760;p=dupemerge dm6: finish WIP --- diff --git a/dm6 b/dm6 index c5ce5d4..b7a9cdf 100755 --- a/dm6 +++ b/dm6 @@ -6,6 +6,7 @@ use Fcntl qw(:DEFAULT :flock); use File::Compare; use File::Path; use File::Temp; +use File::stat; # Copyright (C) 2010 Zygo Blaxell @@ -45,6 +46,7 @@ USAGE sub link_files { my ($from, $to) = (@_); + print STDERR "link '$from' '$to' ..."; my $inode_dir = $to; my $inode_base = $to; $inode_dir =~ s:[^/]*$::o; @@ -56,26 +58,106 @@ sub link_files { unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up die "rename: $tmp_to -> $from: $saved_bang"; } + print STDERR "\n"; } my $link_dir = shift @ARGV; (-d $link_dir) or usage; +sub slash_prefix { + my ($file) = @_; + my $prefix = substr($file, 0, 3); + my $suffix = substr($file, 3); + $prefix =~ s:(.):$1/:osg; + chop($prefix); + return ($prefix, $suffix); +} + +sub prepare_parents { + my ($link_dir, $file) = @_; + my ($prefix, $suffix) = slash_prefix($file); + my $parent = "$link_dir/$prefix"; + mkpath($parent, { verbose => 1 }); + die "mkpath: $parent: $!" unless -d $parent; + return "$parent/$prefix/$suffix"; +} + +# ext3 cannot handle more than 32000 links to a file. Leave some headroom. +my $link_count_max = 31990; + $/ = "\0"; while (<STDIN>) { my $file = $_; eval { - chomp $file; - print STDERR "digest($file) = "; - my $digest = digest($file); - $digest =~ y:/:_:; - print STDERR "$digest\n"; - $digest =~ s:^(.)(.)(.):$1/$2/$3:osg; - my ($parent) = ($digest =~ m:^(.*/):osg); - $parent = "$link_dir/$parent"; - mkpath($parent, { verbose => 1 }); - die "mkpath: $parent: $!" 
unless -d $parent; - link_files($file, "$link_dir/$digest"); + for (1) { + chomp $file; + + # Get file stat data + my $st = lstat($file); + die "lstat: $file: $!" unless $st; + + # Oops? + next unless -f _; + + # Skip the file if it has far too many links already + next if ($st->nlink > $link_count_max); + + # Check link to inode + my $inode_link = prepare_parents("$link_dir/inode", $st->ino); + my $inode_st = lstat($inode_link); + my $update_links; + if ($inode_st) { + my $inode_dev = $inode_st->dev; + my $inode_ino = $inode_st->ino; + my $file_dev = $st->dev; + my $file_ino = $st->ino; + if ($inode_ino != $file_ino || $inode_dev != $file_dev) { + warn "inode link '$inode_link' is wrong (inode $inode_ino should be $file_ino)" if $inode_ino != $file_ino; + warn "inode link '$inode_link' is wrong (dev $inode_dev should be $file_dev)" if $inode_dev != $file_dev; + $update_links = 1; + } + } else { + $update_links = 1; + } + + # If neither criteria for updating link is met, leave it as-is + next unless $update_links; + + # Compute digest + print STDERR "digest($file) = "; + my $digest = digest($file); + + # Base64 uses /, we prefer _ + $digest =~ y:/:_:; + + print STDERR "$digest\n"; + + # Check link to digest + my $digest_link = prepare_parents("$link_dir/digest", $digest); + my $digest_st = lstat($digest_link); + if ($digest_st) { + my $digest_nlink = $digest_st->nlink; + if ($digest_nlink > 31990) { + print STDERR "Removing '$digest_link' with $digest_nlink links\n"; + unlink($digest_link) or die "unlink: $digest_link: $!"; + undef $digest_st; + } + } + + # If digest link exists, link it to file + if ($digest_st) { + print STDERR "cmp '$digest_link' '$file' ..."; + die "NOT identical!" if compare($digest_link, $file); + print STDERR "\n"; + link_files($digest_link, $file); + } else { + link_files($file, $digest_link); + } + + # A link to the inode indicates we are done, so do it last + link_files($file, $inode_link); + + } }; warn "$file: $@" if $@; }