X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?p=dupemerge;a=blobdiff_plain;f=dm6;h=ff3a9eb2b57244eed3ac9c8d071ebf58c5afcbf9;hp=b8cd25f1bf2319b3aaa3425cac05e712728fe1e1;hb=0c34a3856f509e8b615afc0fed52b7399fee21df;hpb=6757a5a253dcdd4023338a430304e088af10bc68

diff --git a/dm6 b/dm6
index b8cd25f..ff3a9eb 100755
--- a/dm6
+++ b/dm6
@@ -9,7 +9,7 @@ use File::Temp;
 use File::stat;
 use MIME::Base64;
 
-# Copyright (C) 2010 Zygo Blaxell
+# Copyright (C) 2010 Zygo Blaxell
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -41,7 +41,7 @@ sub usage {
 Usage: $0 link-dir
 Hashes a NUL-separated list of files on stdin into link-dir.
 
-Version: 0.20100514
+Version: 0.20101024
 USAGE
 }
 
@@ -58,12 +58,15 @@ sub link_files {
 	print STDERR "\bL";
 	link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
 	print STDERR "\bR";
-	unless (rename($tmp_to, $to)) {
-		my $saved_bang = $!;
-		print STDERR "\bU";
-		unlink($tmp_to) or warn "unlink: $tmp_to: $!";	# Try, possibly in vain, to clean up
-		die "rename: $tmp_to -> $from: $saved_bang";
-	}
+	my $saved_bang;
+	$saved_bang = $! unless rename($tmp_to, $to);
+
+	# If $to exists and is a hardlink to $tmp_to (or $from),
+	# rename returns success but $tmp_to still exists.
+	print STDERR "\bU";
+	unlink($tmp_to) or warn "unlink: $tmp_to: $!" if -e $tmp_to;
+
+	die "rename: $tmp_to -> $from: $saved_bang" if $saved_bang;
 	print STDERR "\b";
 }
 
@@ -104,22 +107,18 @@ sub prepare_parents {
 	return "$parent/$suffix";
 }
 
-sub name_quad {
+sub name_ino {
 	my ($int64) = @_;
 	my $packed = pack('Q>', $int64);
 	$packed =~ s/^\0+//os;
 	my $base64_packed = encode_base64($packed, '');
+	$base64_packed =~ y:/:_:;
 	# Don't strip off the trailing padding since it makes the string
 	# so short we end up just putting it back on again.
 	# $base64_packed =~ s/=+$//os;
 	return $base64_packed;
 }
 
-sub name_ino {
-	my ($ino) = @_;
-	return name_quad($ino) . 'I';
-}
-
 # ext3 cannot handle more than 32000 links to a file.  Leave some headroom.
 # Arguably this should be configurable, but the losses are miniscule and
 # the coding for option support is not.
@@ -169,13 +168,12 @@ while (<STDIN>) {
 	# Compute digest
 	print STDERR 'd';
 	my $digest = digest($file);
-	print STDERR "\b";
 
 	# Base64 uses /, we prefer _
 	$digest =~ y:/:_:;
 
 	# Check link to digest
-	my $digest_link = prepare_parents($link_dir, "${digest}D");
+	my $digest_link = prepare_parents($link_dir, $digest);
 	print STDERR 'D';
 	my $digest_st = lstat($digest_link);
 	if ($digest_st) {
@@ -218,13 +216,44 @@ while (<STDIN>) {
 }
 
 # Garbage collection
-print STDERR "\nGarbage collection in '$link_dir'...";
+print STDERR "\nGarbage collection in '$link_dir'...\n";
 chdir($link_dir) || die "chdir: $link_dir: $!";
-print STDERR "\nRemoving files with link count < 3...";
-system("find . -type f -links -3 -print0 | xargs -0rt rm -f") and die "system: exit status $?";
-print STDERR "\nRemoving empty directories...";
+
+my ($last_inode) = '';
+my @last_links;
+
+sub handle_gc_file {
+	my ($line) = @_;
+	my ($inode, $link) = ($line =~ /^(\S+) (.+)\0$/os);
+	$inode ||= '';
+	if ($inode ne $last_inode) {
+		my ($dev, $ino, $links) = ($last_inode =~ /^(\d+):(\d+):(\d+)$/os);
+		if (defined($links)) {
+			if ($links && $links == @last_links) {
+				print STDERR "rm -f @last_links\n";
+				for my $unlink (@last_links) {
+					unlink($unlink) or warn "unlink: $unlink: $!";
+				}
+			}
+		} else {
+			warn "Could not parse '$last_inode' in '$line'" unless $last_inode eq '';
+		}
+		@last_links = ();
+	}
+	$last_inode = $inode;
+	push(@last_links, $link);
+}
+
+print STDERR "Removing files contained entirely in '$link_dir'...\n";
+open(FIND, "find . -type f -printf '%D:%i:%n %p\\0' | sort -z --compress-program=gzip |") or die "open: find: $!";
+while (<FIND>) {
+	handle_gc_file($_);
+}
+handle_gc_file('');
+
+print STDERR "Removing empty directories...\n";
 system("find . -type d -empty -print0 | xargs -0rt rmdir -p --ignore-fail-on-non-empty") and die "system: exit status $?";
 
-print STDERR "\nDone.\n";
+print STDERR "Done.\n";
 
 exit(0);
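
For reference, the usage message above says dm6 hashes a NUL-separated list of files from stdin into link-dir. A minimal invocation might look like the sketch below; the paths are illustrative assumptions, and link-dir must sit on the same filesystem as the input files, since the script deduplicates by hardlinking.

    # Sketch only: /srv/data is a hypothetical tree to deduplicate, and
    # /srv/links is a hypothetical link-dir on the same filesystem.
    find /srv/data -type f -print0 | ./dm6 /srv/links

The garbage-collection pass added in this change replaces the old "remove files with link count < 3" sweep: it lists every file under link-dir as a NUL-terminated "device:inode:linkcount path" record, sorts the records so that all names for one inode are adjacent, and unlinks a group only when the number of names found under link-dir equals the inode's link count, i.e. when no path outside link-dir still refers to that data. A rough way to inspect those records by hand, assuming the same hypothetical link-dir:

    # Sketch only: show the first few GC records in human-readable form.
    # The script itself pipes this listing through sort -z and parses each record.
    cd /srv/links && find . -type f -printf '%D:%i:%n %p\0' | sort -z | tr '\0' '\n' | head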