#!/usr/bin/perl
use warnings;
use strict;
-use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
+use Digest::SHA qw(sha1 sha1_hex sha1_base64);
use Fcntl qw(:DEFAULT :flock);
use File::Compare;
use File::Path;
use File::Temp;
use File::stat;
+use MIME::Base64;
-# Copyright (C) 2010 Zygo Blaxell <dm5@mailtoo.hungrycats.org>
+# Copyright (C) 2010-2012 Zygo Blaxell <dupemerge@mailtoo.hungrycats.org>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
sub digest {
my ($filename) = (@_);
die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
- my $ctx = Digest::SHA1->new;
+ my $ctx = Digest::SHA->new;
sysopen(FILE, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
binmode(FILE); # FIXME: Necessary? Probably harmless...
$ctx->addfile(\*FILE);
die <<USAGE;
Usage: $0 link-dir
Hashes a NUL-separated list of files on stdin into link-dir.
+
+Version: 0.20120914
USAGE
}
sub link_files {
my ($from, $to) = (@_);
- print STDERR "link '$from' '$to' ...";
+ print STDERR 'T';
my $inode_dir = $to;
my $inode_base = $to;
$inode_dir =~ s:[^/]*$::o;
$inode_base =~ s:^.*/::os;
my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
+ print STDERR "\bL";
link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
- unless (rename($tmp_to, $to)) {
- my $saved_bang = $!;
- unlink($tmp_to) or warn "unlink: $tmp_to: $!"; # Try, possibly in vain, to clean up
- die "rename: $tmp_to -> $from: $saved_bang";
- }
- print STDERR "\n";
+ print STDERR "\bR";
+ my $saved_bang;
+ $saved_bang = $! unless rename($tmp_to, $to);
+
+ # If $to exists and is a hardlink to $tmp_to (or $from),
+ # rename returns success but $tmp_to still exists.
+ print STDERR "\bU";
+ unlink($tmp_to) or warn "unlink: $tmp_to: $!" if -e $tmp_to;
+
+ die "rename: $tmp_to -> $from: $saved_bang" if $saved_bang;
+ print STDERR "\b";
}
my $link_dir = shift @ARGV;
-(-d $link_dir) or usage;
+usage unless $link_dir;
+
+my $prefix_length = 3;
sub slash_prefix {
my ($file) = @_;
- my $prefix = substr($file, 0, 3);
- my $suffix = substr($file, 3);
+ $file .= '_' x (length($file) + 1 - $prefix_length) if length($file) + 1 < $prefix_length;
+ my $prefix = substr($file, 0, $prefix_length);
+ my $suffix = substr($file, $prefix_length);
$prefix =~ s:(.):$1/:osg;
chop($prefix);
return ($prefix, $suffix);
}
+sub mkdir_p {
+ my ($dir) = @_;
+ return if -d $dir;
+ $dir =~ s:/+$::os;
+ my $parent;
+ ($parent = $dir) =~ s:[^/]+$::os;
+ if ($parent ne $dir) {
+ mkdir_p($parent);
+ print STDERR 'm';
+ mkdir($dir) or die "mkdir: $dir: $!";
+ }
+ die "mkdir: $dir: $!" unless -d $dir;
+}
+
sub prepare_parents {
my ($link_dir, $file) = @_;
my ($prefix, $suffix) = slash_prefix($file);
my $parent = "$link_dir/$prefix";
- mkpath($parent, { verbose => 1 });
- die "mkpath: $parent: $!" unless -d $parent;
- return "$parent/$prefix/$suffix";
+ mkdir_p($parent);
+ return "$parent/$suffix";
+}
+
+sub name_ino {
+ my ($int64) = @_;
+ my $packed = pack('Q>', $int64);
+ $packed =~ s/^\0+//os;
+ my $base64_packed = encode_base64($packed, '');
+ $base64_packed =~ y:/:_:;
+ # Don't strip off the trailing padding since it makes the string
+ # so short we end up just putting it back on again.
+ # $base64_packed =~ s/=+$//os;
+ return $base64_packed;
}
# ext3 cannot handle more than 32000 links to a file. Leave some headroom.
+# Arguably this should be configurable, but the losses are miniscule and
+# the coding for option support is not.
my $link_count_max = 31990;
$/ = "\0";
chomp $file;
# Get file stat data
+ print STDERR '.';
my $st = lstat($file);
die "lstat: $file: $!" unless $st;
next if ($st->nlink > $link_count_max);
# Check link to inode
- my $inode_link = prepare_parents("$link_dir/inode", $st->ino);
+ my $inode_link = prepare_parents($link_dir, name_ino($st->ino));
+ print STDERR 'I';
my $inode_st = lstat($inode_link);
my $update_links;
if ($inode_st) {
} else {
$update_links = 1;
}
+ print STDERR "\b";
# If neither criteria for updating link is met, leave it as-is
next unless $update_links;
# Compute digest
- print STDERR "digest($file) = ";
+ print STDERR 'd';
my $digest = digest($file);
# Base64 uses /, we prefer _
$digest =~ y:/:_:;
- print STDERR "$digest\n";
-
# Check link to digest
- my $digest_link = prepare_parents("$link_dir/digest", $digest);
+ my $digest_link = prepare_parents($link_dir, $digest);
+ print STDERR 'D';
my $digest_st = lstat($digest_link);
if ($digest_st) {
my $digest_nlink = $digest_st->nlink;
if ($digest_nlink > 31990) {
- print STDERR "Removing '$digest_link' with $digest_nlink links\n";
+ print STDERR 'u';
unlink($digest_link) or die "unlink: $digest_link: $!";
undef $digest_st;
}
}
+ print STDERR "\b";
+
+ # Which file are we keeping?
+ my $keep_ino;
# If digest link exists, link it to file
if ($digest_st) {
- print STDERR "cmp '$digest_link' '$file' ...";
+ print STDERR 'c';
die "NOT identical!" if compare($digest_link, $file);
- print STDERR "\n";
+
+ # Old, replace input with old file
+ print STDERR '-';
link_files($digest_link, $file);
+ $keep_ino = $digest_st->ino;
} else {
+ # New, add input to digest
+ print STDERR '+';
link_files($file, $digest_link);
+ $keep_ino = $st->ino;
}
# A link to the inode indicates we are done, so do it last
- link_files($file, $inode_link);
+ $inode_link = prepare_parents($link_dir, name_ino($keep_ino));
+ print STDERR ' ';
+ link_files($digest_link, $inode_link);
}
};
warn "$file: $@" if $@;
}
+# Garbage collection
+print STDERR "\nGarbage collection in '$link_dir'...\n";
+chdir($link_dir) || die "chdir: $link_dir: $!";
+
+my ($last_inode) = '';
+my @last_links;
+
+sub handle_gc_file {
+ my ($line) = @_;
+ my ($inode, $link) = ($line =~ /^(\S+) (.+)\0$/os);
+ $inode ||= '';
+ if ($inode ne $last_inode) {
+ my ($dev, $ino, $links) = ($last_inode =~ /^(\d+):(\d+):(\d+)$/os);
+ if (defined($links)) {
+ if ($links && $links == @last_links) {
+ print STDERR "rm -f @last_links\n";
+ for my $unlink (@last_links) {
+ unlink($unlink) or warn "unlink: $unlink: $!";
+ }
+ }
+ } else {
+ warn "Could not parse '$last_inode' in '$line'" unless $last_inode eq '';
+ }
+ @last_links = ();
+ }
+ $last_inode = $inode;
+ push(@last_links, $link);
+}
+
+print STDERR "Removing files contained entirely in '$link_dir'...\n";
+open(FIND, "find . -type f -printf '%D:%i:%n %p\\0' | sort -z --compress-program=gzip |") or die "open: find: $!";
+while (<FIND>) {
+ handle_gc_file($_);
+}
+handle_gc_file('');
+
+print STDERR "Removing empty directories...\n";
+system("find . -type d -empty -print0 | xargs -0rt rmdir -p --ignore-fail-on-non-empty") and die "system: exit status $?";
+print STDERR "Done.\n";
+
exit(0);
__END__