4 use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
5 use Fcntl qw(:DEFAULT :flock);
12 # Copyright (C) 2010 Zygo Blaxell <dupemerge@mailtoo.hungrycats.org>
14 # This program is free software; you can redistribute it and/or modify
15 # it under the terms of the GNU General Public License as published by
16 # the Free Software Foundation; either version 2 of the License, or
17 # (at your option) any later version.
19 # This program is distributed in the hope that it will be useful,
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 # GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License
25 # along with this program; if not, write to the Free Software
26 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# Body of the digest routine (its `sub` line is outside this excerpt).
# Takes one argument, $filename, and returns the Base64-encoded SHA-1 of
# the file's contents.  Dies if $filename is a symlink or is not a plain
# file, or if the file cannot be opened or closed.
29 my ($filename) = (@_);
# -l performs an lstat; the special `_` handle reuses that result for -f.
30 die "'$filename' is not a plain file" if (-l $filename) || ! (-f _);
31 my $ctx = Digest::SHA1->new;
# Use a lexical handle instead of the package-global bareword FILE, so the
# handle is private to this call.
32 sysopen(my $fh, $filename, O_RDONLY|O_NONBLOCK) or die "open: $filename: $!";
33 binmode($fh); # FIXME: Necessary? Probably harmless...
34 $ctx->addfile($fh);
35 close($fh) or die "close: $filename: $!";
36 return $ctx->b64digest;
42 Hashes a NUL-separated list of files on stdin into link-dir.
# Body of the hard-link helper (called elsewhere in this file as
# link_files($from, $to)).  Its `sub` line and the declarations of
# $inode_dir, $inode_base and $saved_bang are outside this excerpt.
# Links $from to a temporary name and then renames that name onto $to.
# Dies if the link or the rename fails; a failed cleanup only warns.
50 my ($from, $to) = (@_);
# Keep only the directory part (including its trailing '/') ...
# NOTE(review): $inode_dir and $inode_base are presumably copies of $to -- confirm.
55 $inode_dir =~ s:[^/]*$::o;
# ... and only the final path component.
56 $inode_base =~ s:^.*/::os;
# Temporary name in that directory, prefixed with ".<base>." so that it is
# a dot-file (the garbage-collection pass removes files matching ".*").
57 my $tmp_to = File::Temp::tempnam($inode_dir, ".$inode_base.");
59 link($from, $tmp_to) or die "link: $from -> $tmp_to: $!";
# Remember the rename error but clean up the temporary link before dying.
62 $saved_bang = $! unless rename($tmp_to, $to);
64 # If $to exists and is a hardlink to $tmp_to (or $from),
65 # rename returns success but $tmp_to still exists.
67 unlink($tmp_to) or warn "unlink: $tmp_to: $!" if -e $tmp_to;
# The rename destination is $to, so report $to (the message used to name $from).
69 die "rename: $tmp_to -> $to: $saved_bang" if $saved_bang;
# First command-line argument: the link directory that the paths built
# below are rooted in.
73 my $link_dir = shift @ARGV;
# `usage` is defined outside this excerpt; it is invoked when the argument
# is missing or false -- presumably it prints help and exits (TODO confirm).
74 usage unless $link_dir;
# Number of leading characters of a link name that are turned into nested
# single-character directory levels when the name is split.
76 my $prefix_length = 3;
# Body of the name-splitting routine (called elsewhere in this file as
# slash_prefix($file); its `sub` line and the declaration of $file are
# outside this excerpt).
# Returns a two-element list ($prefix, $suffix): $prefix is the first
# $prefix_length characters of $file, each followed by '/', and $suffix is
# the remainder.  With $prefix_length = 3, "abcdef" gives ("a/b/c/", "def").
#
# Pad short names with '_' so that both a full prefix and a non-empty
# suffix exist.  The previous repeat count, length($file) + 1 -
# $prefix_length, was negative whenever its guard was true, and `x` with a
# negative count yields the empty string, so no padding ever happened.
80 $file .= '_' x ($prefix_length + 1 - length($file)) if length($file) < $prefix_length + 1;
81 my $prefix = substr($file, 0, $prefix_length);
82 my $suffix = substr($file, $prefix_length);
# Insert a '/' after every prefix character (/s lets '.' match newline too).
83 $prefix =~ s:(.):$1/:osg;
85 return ($prefix, $suffix);
# Fragment of a directory-creating routine; its `sub` line, the body of the
# `if` below and the closing braces are outside this excerpt.
# $parent is $dir with its final run of non-'/' characters removed, i.e.
# the path up to and including the last '/'.
93 ($parent = $dir) =~ s:[^/]+$::os;
# Only act on the parent when stripping actually changed the path.
# NOTE(review): the missing lines presumably create $parent first -- confirm.
94 if ($parent ne $dir) {
97 mkdir($dir) or die "mkdir: $dir: $!";
# Final check that $dir really is a directory.
99 die "mkdir: $dir: $!" unless -d $dir;
# Build the full path of a link inside the link directory.
#   $link_dir - root of the link directory
#   $file     - link name to be split by slash_prefix()
# Returns "$parent/$suffix", where $parent is "$link_dir/$prefix".
# NOTE(review): the original line between computing $parent and the return
# is not visible here; going by the sub's name it presumably creates
# $parent -- confirm.
102 sub prepare_parents {
103 my ($link_dir, $file) = @_;
104 my ($prefix, $suffix) = slash_prefix($file);
105 my $parent = "$link_dir/$prefix";
107 return "$parent/$suffix";
# Body of a routine that turns an integer into a short Base64 name; the
# `sub` line and the declaration of $int64 are outside this excerpt
# (presumably this is name_ino, which callers apply to inode numbers).
# 'Q>' packs an unsigned 64-bit value in big-endian byte order, so the NUL
# bytes stripped on the next line are the high-order zero bytes.
112 my $packed = pack('Q>', $int64);
113 $packed =~ s/^\0+//os;
# The empty second argument is the line ending to append (none).
# NOTE(review): assumes encode_base64 is MIME::Base64's -- confirm import.
114 my $base64_packed = encode_base64($packed, '');
# Replace every '/' with '_' in the encoded name.
115 $base64_packed =~ y:/:_:;
116 # Don't strip off the trailing padding since it makes the string
117 # so short we end up just putting it back on again.
118 # $base64_packed =~ s/=+$//os;
119 return $base64_packed;
122 # ext3 cannot handle more than 32000 links to a file. Leave some headroom.
123 # Arguably this should be configurable, but the losses are minuscule and
124 # the coding for option support is not.
125 my $link_count_max = 31990;
# Per-file processing, part 1: look up the input file and compare it with
# the link stored under its inode-derived name.  Several original lines of
# this loop body (including closing braces) are outside this excerpt.
# NOTE(review): lstat() is used as returning an object with nlink/ino/dev
# accessors, which suggests File::stat is in effect -- confirm.
136 my $st = lstat($file);
137 die "lstat: $file: $!" unless $st;
142 # Skip the file if it has far too many links already
143 next if ($st->nlink > $link_count_max);
145 # Check link to inode
146 my $inode_link = prepare_parents($link_dir, name_ino($st->ino));
148 my $inode_st = lstat($inode_link);
151 my $inode_dev = $inode_st->dev;
152 my $inode_ino = $inode_st->ino;
153 my $file_dev = $st->dev;
154 my $file_ino = $st->ino;
# The existing inode link must refer to the same inode on the same device
# as the input file; each mismatch gets its own warning.
155 if ($inode_ino != $file_ino || $inode_dev != $file_dev) {
156 warn "inode link '$inode_link' is wrong (inode $inode_ino should be $file_ino)" if $inode_ino != $file_ino;
157 warn "inode link '$inode_link' is wrong (dev $inode_dev should be $file_dev)" if $inode_dev != $file_dev;
165 # If neither criterion for updating the link is met, leave it as-is
166 next unless $update_links;
# Per-file processing, part 2: hash the file, reconcile it with the link
# stored under its digest, then record the surviving inode.  Several
# original lines of this loop body (branch headers, closing braces and the
# enclosing eval) are outside this excerpt.
170 my $digest = digest($file);
172 # Base64 uses /, we prefer _
175 # Check link to digest
176 my $digest_link = prepare_parents($link_dir, $digest);
178 my $digest_st = lstat($digest_link);
180 my $digest_nlink = $digest_st->nlink;
# Use the shared limit rather than repeating the literal 31990, so this
# test cannot drift from the link-count check applied to the input file.
181 if ($digest_nlink > $link_count_max) {
183 unlink($digest_link) or die "unlink: $digest_link: $!";
189 # Which file are we keeping?
192 # If digest link exists, link it to file
# NOTE(review): compare() is presumably File::Compare's, which returns a
# true value when the files differ or cannot be compared -- confirm.
195 die "NOT identical!" if compare($digest_link, $file);
197 # Old, replace input with old file
199 link_files($digest_link, $file);
200 $keep_ino = $digest_st->ino;
202 # New, add input to digest
204 link_files($file, $digest_link);
205 $keep_ino = $st->ino;
208 # A link to the inode indicates we are done, so do it last
209 $inode_link = prepare_parents($link_dir, name_ino($keep_ino));
211 link_files($digest_link, $inode_link);
# Report a per-file failure as a warning.
215 warn "$file: $@" if $@;
# Garbage-collection pass over the link directory.  Whatever control
# structure encloses these statements is outside this excerpt.
219 print STDERR "\nGarbage collection in '$link_dir'...";
# The find commands below use relative paths, so run them from $link_dir.
220 chdir($link_dir) || die "chdir: $link_dir: $!";
221 print STDERR "\nRemoving files with link count < 3 and temporary links...";
# `-links -3` selects files with fewer than three links; `-name ".*"`
# selects dot-files, which is how the temporary link names are formed.
# system() returns non-zero on failure, hence `and die`.
222 system('find . -type f \( -links -3 -o -name ".*" \) -print0 | xargs -0rt rm -f') and die "system: exit status $?";
223 print STDERR "\nRemoving empty directories...";
# rmdir -p also removes parent directories that become empty.
224 system("find . -type d -empty -print0 | xargs -0rt rmdir -p --ignore-fail-on-non-empty") and die "system: exit status $?";
225 print STDERR "\nDone.\n";
231 #################################################################################
232 # GNU GENERAL PUBLIC LICENSE #
233 # Version 2, June 1991 #
235 # Copyright (C) 1989, 1991 Free Software Foundation, Inc. #
236 # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #
237 # Everyone is permitted to copy and distribute verbatim copies #
238 # of this license document, but changing it is not allowed. #
242 # The licenses for most software are designed to take away your #
243 # freedom to share and change it. By contrast, the GNU General Public #
244 # License is intended to guarantee your freedom to share and change free #
245 # software--to make sure the software is free for all its users. This #
246 # General Public License applies to most of the Free Software #
247 # Foundation's software and to any other program whose authors commit to #
248 # using it. (Some other Free Software Foundation software is covered by #
249 # the GNU Library General Public License instead.) You can apply it to #
250 # your programs, too. #
252 # When we speak of free software, we are referring to freedom, not #
253 # price. Our General Public Licenses are designed to make sure that you #
254 # have the freedom to distribute copies of free software (and charge for #
255 # this service if you wish), that you receive source code or can get it #
256 # if you want it, that you can change the software or use pieces of it #
257 # in new free programs; and that you know you can do these things. #
259 # To protect your rights, we need to make restrictions that forbid #
260 # anyone to deny you these rights or to ask you to surrender the rights. #
261 # These restrictions translate to certain responsibilities for you if you #
262 # distribute copies of the software, or if you modify it. #
264 # For example, if you distribute copies of such a program, whether #
265 # gratis or for a fee, you must give the recipients all the rights that #
266 # you have. You must make sure that they, too, receive or can get the #
267 # source code. And you must show them these terms so they know their #
270 # We protect your rights with two steps: (1) copyright the software, and #
271 # (2) offer you this license which gives you legal permission to copy, #
272 # distribute and/or modify the software. #
274 # Also, for each author's protection and ours, we want to make certain #
275 # that everyone understands that there is no warranty for this free #
276 # software. If the software is modified by someone else and passed on, we #
277 # want its recipients to know that what they have is not the original, so #
278 # that any problems introduced by others will not reflect on the original #
279 # authors' reputations. #
281 # Finally, any free program is threatened constantly by software #
282 # patents. We wish to avoid the danger that redistributors of a free #
283 # program will individually obtain patent licenses, in effect making the #
284 # program proprietary. To prevent this, we have made it clear that any #
285 # patent must be licensed for everyone's free use or not licensed at all. #
287 # The precise terms and conditions for copying, distribution and #
288 # modification follow. #
290 # GNU GENERAL PUBLIC LICENSE #
291 # TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION #
293 # 0. This License applies to any program or other work which contains #
294 # a notice placed by the copyright holder saying it may be distributed #
295 # under the terms of this General Public License. The "Program", below, #
296 # refers to any such program or work, and a "work based on the Program" #
297 # means either the Program or any derivative work under copyright law: #
298 # that is to say, a work containing the Program or a portion of it, #
299 # either verbatim or with modifications and/or translated into another #
300 # language. (Hereinafter, translation is included without limitation in #
301 # the term "modification".) Each licensee is addressed as "you". #
303 # Activities other than copying, distribution and modification are not #
304 # covered by this License; they are outside its scope. The act of #
305 # running the Program is not restricted, and the output from the Program #
306 # is covered only if its contents constitute a work based on the #
307 # Program (independent of having been made by running the Program). #
308 # Whether that is true depends on what the Program does. #
310 # 1. You may copy and distribute verbatim copies of the Program's #
311 # source code as you receive it, in any medium, provided that you #
312 # conspicuously and appropriately publish on each copy an appropriate #
313 # copyright notice and disclaimer of warranty; keep intact all the #
314 # notices that refer to this License and to the absence of any warranty; #
315 # and give any other recipients of the Program a copy of this License #
316 # along with the Program. #
318 # You may charge a fee for the physical act of transferring a copy, and #
319 # you may at your option offer warranty protection in exchange for a fee. #
321 # 2. You may modify your copy or copies of the Program or any portion #
322 # of it, thus forming a work based on the Program, and copy and #
323 # distribute such modifications or work under the terms of Section 1 #
324 # above, provided that you also meet all of these conditions: #
326 # a) You must cause the modified files to carry prominent notices #
327 # stating that you changed the files and the date of any change. #
329 # b) You must cause any work that you distribute or publish, that in #
330 # whole or in part contains or is derived from the Program or any #
331 # part thereof, to be licensed as a whole at no charge to all third #
332 # parties under the terms of this License. #
334 # c) If the modified program normally reads commands interactively #
335 # when run, you must cause it, when started running for such #
336 # interactive use in the most ordinary way, to print or display an #
337 # announcement including an appropriate copyright notice and a #
338 # notice that there is no warranty (or else, saying that you provide #
339 # a warranty) and that users may redistribute the program under #
340 # these conditions, and telling the user how to view a copy of this #
341 # License. (Exception: if the Program itself is interactive but #
342 # does not normally print such an announcement, your work based on #
343 # the Program is not required to print an announcement.) #
345 # These requirements apply to the modified work as a whole. If #
346 # identifiable sections of that work are not derived from the Program, #
347 # and can be reasonably considered independent and separate works in #
348 # themselves, then this License, and its terms, do not apply to those #
349 # sections when you distribute them as separate works. But when you #
350 # distribute the same sections as part of a whole which is a work based #
351 # on the Program, the distribution of the whole must be on the terms of #
352 # this License, whose permissions for other licensees extend to the #
353 # entire whole, and thus to each and every part regardless of who wrote it. #
355 # Thus, it is not the intent of this section to claim rights or contest #
356 # your rights to work written entirely by you; rather, the intent is to #
357 # exercise the right to control the distribution of derivative or #
358 # collective works based on the Program. #
360 # In addition, mere aggregation of another work not based on the Program #
361 # with the Program (or with a work based on the Program) on a volume of #
362 # a storage or distribution medium does not bring the other work under #
363 # the scope of this License. #
365 # 3. You may copy and distribute the Program (or a work based on it, #
366 # under Section 2) in object code or executable form under the terms of #
367 # Sections 1 and 2 above provided that you also do one of the following: #
369 # a) Accompany it with the complete corresponding machine-readable #
370 # source code, which must be distributed under the terms of Sections #
371 # 1 and 2 above on a medium customarily used for software interchange; or, #
373 # b) Accompany it with a written offer, valid for at least three #
374 # years, to give any third party, for a charge no more than your #
375 # cost of physically performing source distribution, a complete #
376 # machine-readable copy of the corresponding source code, to be #
377 # distributed under the terms of Sections 1 and 2 above on a medium #
378 # customarily used for software interchange; or, #
380 # c) Accompany it with the information you received as to the offer #
381 # to distribute corresponding source code. (This alternative is #
382 # allowed only for noncommercial distribution and only if you #
383 # received the program in object code or executable form with such #
384 # an offer, in accord with Subsection b above.) #
386 # The source code for a work means the preferred form of the work for #
387 # making modifications to it. For an executable work, complete source #
388 # code means all the source code for all modules it contains, plus any #
389 # associated interface definition files, plus the scripts used to #
390 # control compilation and installation of the executable. However, as a #
391 # special exception, the source code distributed need not include #
392 # anything that is normally distributed (in either source or binary #
393 # form) with the major components (compiler, kernel, and so on) of the #
394 # operating system on which the executable runs, unless that component #
395 # itself accompanies the executable. #
397 # If distribution of executable or object code is made by offering #
398 # access to copy from a designated place, then offering equivalent #
399 # access to copy the source code from the same place counts as #
400 # distribution of the source code, even though third parties are not #
401 # compelled to copy the source along with the object code. #
403 # 4. You may not copy, modify, sublicense, or distribute the Program #
404 # except as expressly provided under this License. Any attempt #
405 # otherwise to copy, modify, sublicense or distribute the Program is #
406 # void, and will automatically terminate your rights under this License. #
407 # However, parties who have received copies, or rights, from you under #
408 # this License will not have their licenses terminated so long as such #
409 # parties remain in full compliance. #
411 # 5. You are not required to accept this License, since you have not #
412 # signed it. However, nothing else grants you permission to modify or #
413 # distribute the Program or its derivative works. These actions are #
414 # prohibited by law if you do not accept this License. Therefore, by #
415 # modifying or distributing the Program (or any work based on the #
416 # Program), you indicate your acceptance of this License to do so, and #
417 # all its terms and conditions for copying, distributing or modifying #
418 # the Program or works based on it. #
420 # 6. Each time you redistribute the Program (or any work based on the #
421 # Program), the recipient automatically receives a license from the #
422 # original licensor to copy, distribute or modify the Program subject to #
423 # these terms and conditions. You may not impose any further #
424 # restrictions on the recipients' exercise of the rights granted herein. #
425 # You are not responsible for enforcing compliance by third parties to #
428 # 7. If, as a consequence of a court judgment or allegation of patent #
429 # infringement or for any other reason (not limited to patent issues), #
430 # conditions are imposed on you (whether by court order, agreement or #
431 # otherwise) that contradict the conditions of this License, they do not #
432 # excuse you from the conditions of this License. If you cannot #
433 # distribute so as to satisfy simultaneously your obligations under this #
434 # License and any other pertinent obligations, then as a consequence you #
435 # may not distribute the Program at all. For example, if a patent #
436 # license would not permit royalty-free redistribution of the Program by #
437 # all those who receive copies directly or indirectly through you, then #
438 # the only way you could satisfy both it and this License would be to #
439 # refrain entirely from distribution of the Program. #
441 # If any portion of this section is held invalid or unenforceable under #
442 # any particular circumstance, the balance of the section is intended to #
443 # apply and the section as a whole is intended to apply in other #
446 # It is not the purpose of this section to induce you to infringe any #
447 # patents or other property right claims or to contest validity of any #
448 # such claims; this section has the sole purpose of protecting the #
449 # integrity of the free software distribution system, which is #
450 # implemented by public license practices. Many people have made #
451 # generous contributions to the wide range of software distributed #
452 # through that system in reliance on consistent application of that #
453 # system; it is up to the author/donor to decide if he or she is willing #
454 # to distribute software through any other system and a licensee cannot #
455 # impose that choice. #
457 # This section is intended to make thoroughly clear what is believed to #
458 # be a consequence of the rest of this License. #
460 # 8. If the distribution and/or use of the Program is restricted in #
461 # certain countries either by patents or by copyrighted interfaces, the #
462 # original copyright holder who places the Program under this License #
463 # may add an explicit geographical distribution limitation excluding #
464 # those countries, so that distribution is permitted only in or among #
465 # countries not thus excluded. In such case, this License incorporates #
466 # the limitation as if written in the body of this License. #
468 # 9. The Free Software Foundation may publish revised and/or new versions #
469 # of the General Public License from time to time. Such new versions will #
470 # be similar in spirit to the present version, but may differ in detail to #
471 # address new problems or concerns. #
473 # Each version is given a distinguishing version number. If the Program #
474 # specifies a version number of this License which applies to it and "any #
475 # later version", you have the option of following the terms and conditions #
476 # either of that version or of any later version published by the Free #
477 # Software Foundation. If the Program does not specify a version number of #
478 # this License, you may choose any version ever published by the Free Software #
481 # 10. If you wish to incorporate parts of the Program into other free #
482 # programs whose distribution conditions are different, write to the author #
483 # to ask for permission. For software which is copyrighted by the Free #
484 # Software Foundation, write to the Free Software Foundation; we sometimes #
485 # make exceptions for this. Our decision will be guided by the two goals #
486 # of preserving the free status of all derivatives of our free software and #
487 # of promoting the sharing and reuse of software generally. #
491 # 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY #
492 # FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN #
493 # OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES #
494 # PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED #
495 # OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF #
496 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS #
497 # TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE #
498 # PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, #
499 # REPAIR OR CORRECTION. #
501 # 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING #
502 # WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR #
503 # REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, #
504 # INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING #
505 # OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED #
506 # TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY #
507 # YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER #
508 # PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE #
509 # POSSIBILITY OF SUCH DAMAGES. #
511 # END OF TERMS AND CONDITIONS #
513 # How to Apply These Terms to Your New Programs #
515 # If you develop a new program, and you want it to be of the greatest #
516 # possible use to the public, the best way to achieve this is to make it #
517 # free software which everyone can redistribute and change under these terms. #
519 # To do so, attach the following notices to the program. It is safest #
520 # to attach them to the start of each source file to most effectively #
521 # convey the exclusion of warranty; and each file should have at least #
522 # the "copyright" line and a pointer to where the full notice is found. #
524 # <one line to give the program's name and a brief idea of what it does.> #
525 # Copyright (C) <year> <name of author> #
527 # This program is free software; you can redistribute it and/or modify #
528 # it under the terms of the GNU General Public License as published by #
529 # the Free Software Foundation; either version 2 of the License, or #
530 # (at your option) any later version. #
532 # This program is distributed in the hope that it will be useful, #
533 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
534 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
535 # GNU General Public License for more details. #
537 # You should have received a copy of the GNU General Public License #
538 # along with this program; if not, write to the Free Software #
539 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #
542 # Also add information on how to contact you by electronic and paper mail. #
544 # If the program is interactive, make it output a short notice like this #
545 # when it starts in an interactive mode: #
547 # Gnomovision version 69, Copyright (C) year name of author #
548 # Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. #
549 # This is free software, and you are welcome to redistribute it #
550 # under certain conditions; type `show c' for details. #
552 # The hypothetical commands `show w' and `show c' should show the appropriate #
553 # parts of the General Public License. Of course, the commands you use may #
554 # be called something other than `show w' and `show c'; they could even be #
555 # mouse-clicks or menu items--whatever suits your program. #
557 # You should also get your employer (if you work as a programmer) or your #
558 # school, if any, to sign a "copyright disclaimer" for the program, if #
559 # necessary. Here is a sample; alter the names: #
561 # Yoyodyne, Inc., hereby disclaims all copyright interest in the program #
562 # `Gnomovision' (which makes passes at compilers) written by James Hacker. #
564 # <signature of Ty Coon>, 1 April 1989 #
565 # Ty Coon, President of Vice #
567 # This General Public License does not permit incorporating your program into #
568 # proprietary programs. If your program is a subroutine library, you may #
569 # consider it more useful to permit linking proprietary applications with the #
570 # library. If this is what you want to do, use the GNU Library General #
571 # Public License instead of this License. #
572 #################################################################################