X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=faster-dupemerge;h=0ae1032f2192b4fe4d7556f8b5be4ffd9ce671a8;hb=51db2c8ce235245d7eb2c02e337d9d18b94c9813;hp=6508fc0da801e8d66cb8476cbfb0bd5ce25ce373;hpb=92a22d87fd9fb90d125cea123e093fa900883c84;p=dupemerge

diff --git a/faster-dupemerge b/faster-dupemerge
index 6508fc0..0ae1032 100755
--- a/faster-dupemerge
+++ b/faster-dupemerge
@@ -1,14 +1,22 @@
 #!/usr/bin/perl -w
+# $Id$
+
+# Copyright (C) 2002-2003 by Zygo Blaxell <zblaxell@hungrycats.org>
+# Use, modification, and distribution permitted
+# under the terms of the GNU GPL.
+
 use strict;
 use Fcntl qw(:DEFAULT :flock);
 use File::Compare;
 use File::Temp;
 
-my $links_input = 0;
-my $files_input = 0;
-my $bogons_input = 0;
-my $files_hashed = 0;
+my $input_links = 0;
+my $input_files = 0;
+my $input_bogons = 0;
+my $hash_bytes = 0;
+my $hash_files = 0;
 my $hash_errors = 0;
+my $compare_bytes = 0;
 my $compare_count = 0;
 my $compare_errors = 0;
 my $compare_differences = 0;
@@ -17,10 +25,10 @@ my $merges_attempted = 0;
 my $hard_links = 0;
 my $link_errors = 0;
 my $link_retries = 0;
-my $bytes_recovered = 0;
-my $files_recovered = 0;
-my $files_lost = 0;
-my $bytes_lost = 0;
+my $recovered_bytes = 0;
+my $recovered_files = 0;
+my $lost_files = 0;
+my $lost_bytes = 0;
 my $surprises = 0;
 
 eval '
@@ -31,7 +39,7 @@ if ($@) {
 	warn "Digest::SHA1: $@\nUsing external md5sum program to generate hashes.\nPlease install Digest::SHA1 (libdigest-sha1-perl)";
 
 	eval <<'DIGEST';
-		sub digest {
+		sub really_digest {
 			my ($filename) = (@_);
 			my $fv = open(MD5SUM, "-|");	
 			die "fork: $!" unless defined($fv);
@@ -51,7 +59,7 @@ if ($@) {
 DIGEST
 } else {
 	eval <<'DIGEST';
-		sub digest {
+		sub really_digest {
 			my ($filename) = (@_);
 			die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
 			my $ctx = Digest::SHA1->new;
@@ -67,15 +75,30 @@ DIGEST
 my $collapse_access = 0;
 my $collapse_timestamp = 0;
 my $collapse_zero = 0;
-my $trust_hashes = 0;
+my $skip_compares = 0;
+my $skip_hashes = 0;
 my $verbose = 0;
 my $debug = 0;
+my $dry_run = 0;
+my $humane = 0;
 my @extra_find_opts = ();
 my @extra_sort_opts = ();
 my $lock_file;
 my $lock_rm = 0;
 my $lock_obtained = 0;
 
+# Return the content hash of $filename and count it in $hash_bytes and
+# $hash_files.  Under --skip-hash every file gets the same placeholder.
+sub digest {
+	my ($filename) = (@_);
+	return "SKIPPING HASHES" if $skip_hashes;
+	my $digest = really_digest($filename);	# may die; callers wrap this in eval
+	# -s yields undef if the file can no longer be stat'ed; count that as 0.
+	$hash_bytes += (-s $filename) || 0;
+	$hash_files++;
+	return $digest;
+}
+
 my @directories;
 
 sub usage {
@@ -99,8 +122,12 @@ hard links).
         --debug         show all steps in duplication discovery process
                         (implies --verbose)
 
+	--dry-run	do not lock files or make changes to filesystem
+
         --find          pass next options (up to --) to find command
 
+	--humane	human-readable statistics (e.g. 1 048 576)
+
         --lock FILE     exit immediately (status 10) if unable to obtain a 
                         flock(LOCK_EX|LOCK_NB) on FILE
 
@@ -110,7 +137,11 @@ hard links).
 
         --timestamps    mtime may be different for identical files
 
-        --trust         skip byte-by-byte file comparisons
+	--skip-compare	skip byte-by-byte file comparisons
+
+	--skip-hash	skip calculation of hash function on files
+
+        --trust         old name for --skip-compare
                         (trust the hash function)
 
         --verbose       report files as they are considered
@@ -127,8 +158,10 @@ while ($#ARGV >= 0) {
 		$collapse_timestamp = 1;
 	} elsif ($arg eq '--zeros') {
 		$collapse_zero = 1;
-	} elsif ($arg eq '--trust') {
-		$trust_hashes = 1;
+	} elsif ($arg eq '--trust' || $arg eq '--skip-compare') {
+		$skip_compares = 1;
+	} elsif ($arg eq '--skip-hash') {
+		$skip_hashes = 1;
 	} elsif ($arg eq '--verbose') {
 		$verbose = 1;
 	} elsif ($arg eq '--lock-rm') {
@@ -141,6 +174,10 @@ while ($#ARGV >= 0) {
 		}
 	} elsif ($arg eq '--debug') {
 		$debug = $verbose = 1;
+	} elsif ($arg eq '--dry-run') {
+		$dry_run = 1;
+	} elsif ($arg eq '--humane') {
+		$humane = 1;
 	} elsif ($arg eq '--find') {
 		while ($#ARGV >= 0) {
 			my $extra_arg = shift(@ARGV);
@@ -161,7 +198,13 @@ while ($#ARGV >= 0) {
 	}
 }
 
-if (defined($lock_file)) {
+if ($skip_hashes && $skip_compares) {
+	die "Cannot skip both hashes and compares.\n";
+}
+
+@directories or usage;
+
+if (defined($lock_file) && !$dry_run) {
 	sysopen(LOCK_FILE, $lock_file, O_CREAT|O_RDONLY, 0666) or die "open: $lock_file: $!";
 	flock(LOCK_FILE, LOCK_EX|LOCK_NB) or die "flock: $lock_file: LOCK_EX|LOCK_NB: $!";
 	print STDERR "Locked '$lock_file' in LOCK_EX mode.\n" if $verbose;
@@ -169,7 +212,7 @@ if (defined($lock_file)) {
 }
 
 END {
-	if ($lock_obtained) {
+	if ($lock_obtained && !$dry_run) {
 		print STDERR "Removing '$lock_file'.\n" if $verbose;
 		unlink($lock_file) or warn "unlink: $lock_file: $!";
 	}
@@ -177,7 +220,7 @@ END {
 
 sub tick_quote {
 	my ($text) = (@_);
-	$text =~ s/'/'\''/go;
+	$text =~ s/'/'\\''/go;	# each ' becomes '\'' (close quote, escaped quote, reopen)
 	return "'$text'";
 }
 
@@ -213,14 +256,18 @@ my %inode_to_file_name = ();
 # Link files
 sub link_files {
 	my ($from, $to) = (@_);
+
+	my $quoted_from = tick_quote($from);
+	my $quoted_to = tick_quote($to);
+	print STDERR "ln -f $quoted_from $quoted_to\n";
+
+	return if $dry_run;
+
 	my $inode_dir = $to;
 	my $inode_base = $to;
 	$inode_dir =~ s:[^/]*$::o;
 	$inode_base =~ s:^.*/::os;
 	my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
-	my $quoted_from = tick_quote($from);
-	my $quoted_to = tick_quote($to);
-	print STDERR "ln -f $quoted_from $quoted_to\n";
 	print STDERR "\tlink: $from -> $tmp_to\n" if $debug;
 	link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
 	print STDERR "\trename: $tmp_to -> $to\n" if $debug;
@@ -240,7 +287,7 @@ sub merge_files {
 	my %stop_loop;
 
 	my @candidate_list = keys(%inode_to_file_name);
-	$files_input += @candidate_list;
+	$input_files += @candidate_list;
 	if (@candidate_list < 2) {
 		print STDERR "Merging...only one candidate to merge..." if $debug;
 		$trivially_unique++;
@@ -262,13 +309,14 @@ hash_file:
 				$surprises++;
 				next;
 			}
-			eval { $digest = digest($filename); };
+			eval { 
+				$digest = digest($filename); 
+			};
 			if ($@) {
 				warn "Digest($filename)(#$candidate) failed: $@";
 				$hash_errors++;
 			} else {
 				$ok = 1;
-				$files_hashed++;
 				last hash_file;
 			}
 		}
@@ -289,7 +337,7 @@ link_start:
 					print STDERR "\t\tLinks to $incumbent:",   join("\n\t\t\t", '', @incumbent_names),   "\n" if $debug;
 					print STDERR "\t\tLinks to $candidate:", join("\n\t\t\t", '', @candidate_names), "\n" if $debug;
 
-	incumbent_file:
+incumbent_file:
 
 					foreach my $incumbent_file (@incumbent_names) {
 						my ($incumbent_dev,$incumbent_ino,$incumbent_mode,$incumbent_nlink,$incumbent_uid,$incumbent_gid,$incumbent_rdev,$incumbent_size,$incumbent_atime,$incumbent_mtime,$incumbent_ctime,$incumbent_blksize,$incumbent_blocks) = lstat($incumbent_file);
@@ -307,10 +355,10 @@ link_start:
 							next incumbent_file;
 						}
 
-	candidate_file:
-
 						my $at_least_one_link_done = 0;
 
+candidate_file:
+
 						foreach my $candidate_file (@candidate_names) {
 							my ($candidate_dev,$candidate_ino,$candidate_mode,$candidate_nlink,$candidate_uid,$candidate_gid,$candidate_rdev,$candidate_size,$candidate_atime,$candidate_mtime,$candidate_ctime,$candidate_blksize,$candidate_blocks) = lstat($candidate_file);
 							print STDERR "\t\t\tCANDIDATE dev=$candidate_dev ino=$candidate_ino mode=$candidate_mode nlink=$candidate_nlink uid=$candidate_uid gid=$candidate_gid rdev=$candidate_rdev size=$candidate_size atime=$candidate_atime mtime=$candidate_mtime ctime=$candidate_ctime blksize=$candidate_blksize blocks=$candidate_blocks _=$candidate_file\n" if $debug;
@@ -336,21 +384,26 @@ link_start:
 							my $identical;
 
 							eval {
-								if ($trust_hashes) {
-									print STDERR "\t\t\t\tTrusting hashes!\n" if $debug;
+								if ($skip_compares) {
+									print STDERR "\t\t\t\tSkipping compare!\n" if $debug;
 									$identical = 1;
 								} else {
 									my $quoted_incumbent_file = tick_quote($incumbent_file);
 									my $quoted_candidate_file = tick_quote($candidate_file);
-									print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n";
+									print STDERR "cmp $quoted_incumbent_file $quoted_candidate_file\n" if $debug;
 									if (compare($incumbent_file, $candidate_file)) {
 										$compare_differences++;
 										$identical = 0;
-										print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n"
+										# It is significant for two non-identical files to have identical SHA1 or MD5 hashes.
+										# Some kind of I/O error is more likely, so this message cannot be turned off.
+										# On the other hand, if we're skipping hashes, _all_ files will have the same hash,
+										# so the warning in that case is quite silly.  Hmmm.
+										print STDERR "$quoted_incumbent_file and $quoted_candidate_file have same hash but do not compare equal!\n" unless $skip_hashes;
 									} else {
 										$identical = 1;
 									}
 									$compare_count++;
+									$compare_bytes += $incumbent_size;
 								}
 							};
 							if ($@) {
@@ -424,14 +477,18 @@ link_start:
 									# My random number generator chooses the incumbent's size.
 
 									if ($link_done) {
+										# The merged name is dropped from its old inode's list in every mode.  In a dry run the
+										# filesystem doesn't change, so our notion of the surviving inode should not change either.
 										delete $inode_to_file_name{$to_inode}->{$to_file};
-										$inode_to_file_name{$from_inode}->{$to_file} = undef;
-										$hash_to_inode{$digest} = $from_inode;
+										unless ($dry_run) {
+											$inode_to_file_name{$from_inode}->{$to_file} = undef;
+											$hash_to_inode{$digest} = $from_inode;
+										}
 
 										$hard_links++;
 										if ($to_nlink == 1) {
-											$files_recovered++;
-											$bytes_recovered += $incumbent_size;
+											$recovered_files++;
+											$recovered_bytes += $incumbent_size;
 										}
 
 										# FIXME:  Now we're really confused for some reason.
@@ -444,8 +501,8 @@ link_start:
 										# tried all possible ways to hardlink the file out of existence first;
 										# however, that is complex and only benefits a silly statistic.
 										if ($to_nlink == 1 || $from_nlink == 1) {
-											$files_lost++;
-											$bytes_lost += $incumbent_size;
+											$lost_files++;
+											$lost_bytes += $incumbent_size;
 										}
 									}
 								}
@@ -478,11 +535,11 @@ while (<FIND>) {
 
 	unless (! (-l $name) && (-f _)) {
 		warn "Bogon file " . tick_quote($name);
-		$bogons_input++;
+		$input_bogons++;
 		next;
 	}
 
-	$links_input++;
+	$input_links++;
 	merge_files if $weak_key ne $current_key;
 	$current_key = $weak_key;
 
@@ -493,25 +550,53 @@ while (<FIND>) {
 
 merge_files;
 
-print STDERR <<STATS;
-links_input             $links_input
-files_input ........... $files_input
-bogons_input            $bogons_input
-merges_attempted ...... $merges_attempted
-trivially_unique        $trivially_unique
-files_hashed .......... $files_hashed
-hash_errors             $hash_errors
-surprises ............. $surprises
+my $stats_blob = <<STATS;
+compare_bytes           $compare_bytes
 compare_count           $compare_count
-compare_differences ... $compare_differences
+compare_differences     $compare_differences
 compare_errors          $compare_errors
-hard_links ............ $hard_links
+hard_links              $hard_links
+hash_bytes              $hash_bytes
+hash_errors             $hash_errors
+hash_files              $hash_files
+input_bogons            $input_bogons
+input_files             $input_files
+input_links             $input_links
 link_errors             $link_errors
-link_retries .......... $link_retries
-bytes_recovered         $bytes_recovered
-files_recovered ....... $files_recovered
-bytes_lost              $bytes_lost
-files_lost ............ $files_lost
+link_retries            $link_retries
+lost_bytes              $lost_bytes
+lost_files              $lost_files
+merges_attempted        $merges_attempted
+recovered_bytes         $recovered_bytes
+recovered_files         $recovered_files
+surprises               $surprises
+trivially_unique        $trivially_unique
 STATS
 
+if ($humane) {
+	# --humane: regroup every number in the report into space-separated
+	# groups of three digits, right-aligned to the widest grouped number.
+	my $max_num_len = 0;
+	sub measure_numbers {	# record the grouped width of $num if it is the widest so far
+		my ($num) = @_;
+		my $len = length($num) + int((length($num) - 1) / 3);
+		$max_num_len = $len if $len > $max_num_len;
+	}
+
+	(my $dummy = $stats_blob) =~ s/\d+/measure_numbers($&)/geos;
+
+	sub space_numbers {	# return $num grouped in threes and left-padded to $max_num_len
+		my ($num) = @_;
+		# Split before each trailing run of three digits; (?!\d) keeps every group intact.
+		1 while $num =~ s/(\d)(\d\d\d)(?!\d)/$1 $2/os;
+		return ' ' x ($max_num_len - length($num)) . $num;
+	}
+
+	$stats_blob =~ s/\d+/space_numbers($&)/geos;
+}
+
+$stats_blob =~ s/([^\n]*\n[^\n]*? )(\s+)( [^\n]*\n)/$1 . ('.' x length($2)) . $3/oemg;
+
+print STDERR $stats_blob;
+
 exit(0);