git.hungrycats.org Git - xscreensaver/blob - hacks/webcollage

   1 #!/usr/bin/perl -w
   2 #
   3 # webcollage, Copyright (c) 1999-2012 by Jamie Zawinski <jwz@jwz.org>
   4 # This program decorates the screen with random images from the web.
   5 # One satisfied customer described it as "a nonstop pop culture brainbath."
   6 #
   7 # Permission to use, copy, modify, distribute, and sell this software and its
   8 # documentation for any purpose is hereby granted without fee, provided that
   9 # the above copyright notice appear in all copies and that both that
  10 # copyright notice and this permission notice appear in supporting
  11 # documentation.  No representations are made about the suitability of this
  12 # software for any purpose.  It is provided "as is" without express or
  13 # implied warranty.
  14
  15
  16 # To run this as a display mode with xscreensaver, add this to `programs':
  17 #
  18 #     webcollage -root
  19 #     webcollage -root -filter 'vidwhacker -stdin -stdout'
  20 #
  21 #
  22 # You can see this in action at http://www.jwz.org/webcollage/ --
  23 # it auto-reloads about once a minute.  To make a page similar to
  24 # that on your own system, do this:
  25 #
  26 #     webcollage -size '800x600' -imagemap $HOME/www/webcollage/index
  27 #
  28 #
  29 # If you have the "driftnet" program installed, webcollage can display a
  30 # collage of images sniffed off your local ethernet, instead of pulled out
  31 # of search engines: in that way, your screensaver can display the images
  32 # that your co-workers are downloading!
  33 #
  34 # Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/
  35 # Use it like so:
  36 #
  37 #     webcollage -root -driftnet
  38 #
  39 # Driftnet is the Unix implementation of the MacOS "EtherPEG" program.
  40
  41
  42 require 5;
  43 use strict;
  44
  45 # We can't "use diagnostics" here, because that library malfunctions if
  46 # you signal and catch alarms: it says "Uncaught exception from user code"
  47 # and exits, even though I damned well AM catching it!
  48 #use diagnostics;
  49
  50
  51 use Socket;
  52 require Time::Local;
  53 require POSIX;
  54 use Fcntl ':flock'; # import LOCK_* constants
  55 use POSIX qw(strftime);
  56
  57 use bytes;  # Larry can take Unicode and shove it up his ass sideways.
  58             # Perl 5.8.0 causes us to start getting incomprehensible
  59             # errors about UTF-8 all over the place without this.
  60
  61
  62 my $progname = $0; $progname =~ s@.*/@@g;
  63 my $version = q{ $Revision: 1.158 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
  64 my $copyright = "WebCollage $version, Copyright (c) 1999-2011" .
  65     " Jamie Zawinski <jwz\@jwz.org>\n" .
  66     "            http://www.jwz.org/webcollage/\n";
  67
  68
  69
  70 my @search_methods = ( 24, "googlephotos",  \&pick_from_google_image_photos,
  71                        13, "googleimgs",    \&pick_from_google_images,
  72                        13, "googlenums",    \&pick_from_google_image_numbers,
  73                        16, "flickr_recent", \&pick_from_flickr_recent,
  74                        13, "flickr_random", \&pick_from_flickr_random,
  75                        10, "twitpic",       \&pick_from_twitpic_images,
  76                         8, "livejournal",   \&pick_from_livejournal_images,
  77                         3, "yahoorand",     \&pick_from_yahoo_random_link,
  78
  79                      # This one doesn't work very well: too many non-img links.
  80                         0, "twitter",       \&pick_from_twitter_images,
  81
  82                      # This is a cute way to search for a certain webcams.
  83                      # Not included in default methods, since these images
  84                      # aren't terribly interesting by themselves.
  85                      # See also "SurveillanceSaver".
  86                      #
  87                         0, "securitycam",   \&pick_from_security_camera,
  88
  89                      # Nonfunctional as of June 2011.
  90                      #  0, "altavista",     \&pick_from_alta_vista_random_link,
  91
  92                      # In Apr 2002, Google asked me to stop searching them.
  93                      # I asked them to add a "random link" url.  They said
  94                      # "that would be easy, we'll think about it" and then
  95                      # never wrote back.  Booo Google!  Booooo!  So, screw
  96                      # those turkeys, I've turned Google searching back on.
  97                      # I'm sure they can take it.  (Jan 2005.)
  98
  99                      # Jan 2005: Yahoo fucked up their search form so that
 100                      # it's no longer possible to do "or" searches on news
 101                      # images, so we rarely get any hits there any more.
 102                      #
 103                      #  0, "yahoonews",     \&pick_from_yahoo_news_text,
 104
 105                      # Dec 2004: the ircimages guy's server can't take the
 106                      # heat, so he started banning the webcollage user agent.
 107                      # I tried to convince him to add a lighter-weight page to
 108                      # support webcollage better, but he doesn't care.
 109                      #
 110                      #  0, "ircimages",     \&pick_from_ircimages,
 111
 112                      # Dec 2002: Alta Vista has a new "random link" URL now.
 113                      # They added it specifically to better support webcollage!
 114                      # That was super cool of them.  This is how we used to do
 115                      # it, before:
 116                      #
 117                      #  0, "avimages",      \&pick_from_alta_vista_images,
 118                      #  0, "avtext",        \&pick_from_alta_vista_text,
 119
 120                      # This broke in 2004.  Eh, Lycos sucks anyway.
 121                      #
 122                      #  0, "lycos",         \&pick_from_lycos_text,
 123
 124                      # This broke in 2003, I think.  I suspect Hotbot is
 125                      # actually the same search engine data as Lycos.
 126                      #
 127                      #  0, "hotbot",        \&pick_from_hotbot_text,
 128                       );
 129
 130 # programs we can use to write to the root window (tried in ascending order.)
 131 #
 132 my @root_displayers = (
 133   "xscreensaver-getimage -root -file",
 134   "chbg       -once -xscreensaver -max_size 100",
 135   "xv         -root -quit -viewonly +noresetroot -quick24 -rmode 5" .
 136   "           -rfg black -rbg black",
 137   "xli        -quiet -onroot -center -border black",
 138   "xloadimage -quiet -onroot -center -border black",
 139
 140 # this lame program wasn't built with vroot.h:
 141 # "xsri       -scale -keep-aspect -center-horizontal -center-vertical",
 142 );
 143
 144
 145 # Some sites need cookies to work properly.   These are they.
 146 #
 147 my %cookies = (
 148   "www.altavista.com"  =>  "AV_ALL=1",   # request uncensored searches
 149   "web.altavista.com"  =>  "AV_ALL=1",
 150
 151                                          # log in as "cipherpunk"
 152   "www.nytimes.com"    =>  'NYT-S=18cHMIlJOn2Y1bu5xvEG3Ufuk6E1oJ.' .
 153                            'FMxWaQV0igaB5Yi/Q/guDnLeoL.pe7i1oakSb' .
 154                            '/VqfdUdb2Uo27Vzt1jmPn3cpYRlTw9',
 155
 156   "ircimages.com"      =>  'disclaimer=1',
 157 );
 158
 159
 160 # If this is set, it's a helper program to use for pasting images together:
 161 # this is a lot faster and more efficient than using PPM pipelines, which is
 162 # what we do if this program doesn't exist.  (We check for "webcollage-helper"
 163 # on $PATH at startup, and set this variable appropriately.)
 164 #
 165 my $webcollage_helper = undef;
 166
 167
 168 # If we have the webcollage-helper program, then it will paste the images
 169 # together with transparency!  0.0 is invisible, 1.0 is totally opaque.
 170 #
 171 my $opacity = 0.85;
 172
 173
 174 # Some sites have  managed to poison the search engines.  These are they.
 175 # (We auto-detect sites that have poisoned the search engines via excessive
 176 # keywords or dictionary words,  but these are ones that slip through
 177 # anyway.)
 178 #
 179 # This can contain full host names, or 2 or 3 component domains.
 180 #
 181 my %poisoners = (
 182   "die.net"                 => 1,  # 'l33t h4ck3r d00dz.
 183   "genforum.genealogy.com"  => 1,  # Cluttering avtext with human names.
 184   "rootsweb.com"            => 1,  # Cluttering avtext with human names.
 185   "akamai.net"              => 1,  # Lots of sites have their images on Akamai.
 186   "akamaitech.net"          => 1,  # But those are pretty much all banners.
 187                                    # Since Akamai is super-expensive, let's
 188                                    # go out on a limb and assume that all of
 189                                    # their customers are rich-and-boring.
 190   "bartleby.com"            => 1,  # Dictionary, cluttering avtext.
 191   "encyclopedia.com"        => 1,  # Dictionary, cluttering avtext.
 192   "onlinedictionary.datasegment.com" => 1,  # Dictionary, cluttering avtext.
 193   "hotlinkpics.com"         => 1,  # Porn site that has poisoned avimages
 194                                    # (I don't see how they did it, though!)
 195   "alwayshotels.com"        => 1,  # Poisoned Lycos pretty heavily.
 196   "nextag.com"              => 1,  # Poisoned Alta Vista real good.
 197   "ghettodriveby.com"       => 1,  # Poisoned Google Images.
 198   "crosswordsolver.org"     => 1,  # Poisoned Google Images.
 199   "xona.com"                => 1,  # Poisoned Google Images.
 200   "freepatentsonline.com"   => 1,  # Poisoned Google Images.
 201   "herbdatanz.com"          => 1,  # Poisoned Google Images.
 202 );
 203
 204
 205 # When verbosity is turned on, we warn about sites that we seem to be hitting
 206 # a lot: usually this means some new poisoner has made it into the search
 207 # engines.  But sometimes, the warning is just because that site has a lot
 208 # of stuff on it.  So these are the sites that are immune to the "frequent
 209 # site" diagnostic message.
 210 #
 211 my %warningless_sites = (
 212   "home.earthlink.net"      => 1,
 213   "www.angelfire.com"       => 1,
 214   "members.aol.com"         => 1,
 215   "img.photobucket.com"     => 1,
 216   "pics.livejournal.com"    => 1,
 217   "tinypic.com"             => 1,
 218   "flickr.com"              => 1,
 219   "pbase.com"               => 1,
 220   "blogger.com"             => 1,
 221   "multiply.com"            => 1,
 222   "wikimedia.org"           => 1,
 223   "twitpic.com"             => 1,
 224   "amazonaws.com"           => 1,
 225   "blogspot.com"            => 1,
 226   "photoshelter.com"        => 1,
 227   "myspacecdn.com"          => 1,
 228   "feedburner.com"          => 1,
 229   "wikia.com"               => 1,
 230   "ljplus.ru"               => 1,
 231   "yandex.ru"               => 1,
 232   "imgur.com"               => 1,
 233   "yfrog.com"               => 1,
 234
 235   "yimg.com"                => 1,  # This is where dailynews.yahoo.com stores
 236   "eimg.com"                => 1,  # its images, so pick_from_yahoo_news_text()
 237                                    # hits this every time.
 238
 239   "images.quizfarm.com"     => 1,  # damn those LJ quizzes...
 240   "images.quizilla.com"     => 1,
 241   "images.quizdiva.net"     => 1,
 242
 243   "driftnet"                => 1,  # builtin...
 244   "local-directory"         => 1,  # builtin...
 245 );
 246
 247
 248 # For decoding HTML-encoded character entities to URLs.
 249 #
 250 my %entity_table = (
 251    "apos"   => '\'',
 252    "quot"   => '"', "amp"    => '&', "lt"     => '<', "gt"     => '>',
 253    "nbsp"   => ' ', "iexcl"  => '¡', "cent"   => '¢', "pound"  => '£',
 254    "curren" => '¤', "yen"    => '¥', "brvbar" => '¦', "sect"   => '§',
 255    "uml"    => '¨', "copy"   => '©', "ordf"   => 'ª', "laquo"  => '«',
 256    "not"    => '¬', "shy"    => '', "reg"    => '®', "macr"   => '¯',
 257    "deg"    => '°', "plusmn" => '±', "sup2"   => '²', "sup3"   => '³',
 258    "acute"  => '´', "micro"  => 'µ', "para"   => '¶', "middot" => '·',
 259    "cedil"  => '¸', "sup1"   => '¹', "ordm"   => 'º', "raquo"  => '»',
 260    "frac14" => '¼', "frac12" => '½', "frac34" => '¾', "iquest" => '¿',
 261    "Agrave" => 'À', "Aacute" => 'Á', "Acirc"  => 'Â', "Atilde" => 'Ã',
 262    "Auml"   => 'Ä', "Aring"  => 'Å', "AElig"  => 'Æ', "Ccedil" => 'Ç',
 263    "Egrave" => 'È', "Eacute" => 'É', "Ecirc"  => 'Ê', "Euml"   => 'Ë',
 264    "Igrave" => 'Ì', "Iacute" => 'Í', "Icirc"  => 'Î', "Iuml"   => 'Ï',
 265    "ETH"    => 'Ð', "Ntilde" => 'Ñ', "Ograve" => 'Ò', "Oacute" => 'Ó',
 266    "Ocirc"  => 'Ô', "Otilde" => 'Õ', "Ouml"   => 'Ö', "times"  => '×',
 267    "Oslash" => 'Ø', "Ugrave" => 'Ù', "Uacute" => 'Ú', "Ucirc"  => 'Û',
 268    "Uuml"   => 'Ü', "Yacute" => 'Ý', "THORN"  => 'Þ', "szlig"  => 'ß',
 269    "agrave" => 'à', "aacute" => 'á', "acirc"  => 'â', "atilde" => 'ã',
 270    "auml"   => 'ä', "aring"  => 'å', "aelig"  => 'æ', "ccedil" => 'ç',
 271    "egrave" => 'è', "eacute" => 'é', "ecirc"  => 'ê', "euml"   => 'ë',
 272    "igrave" => 'ì', "iacute" => 'í', "icirc"  => 'î', "iuml"   => 'ï',
 273    "eth"    => 'ð', "ntilde" => 'ñ', "ograve" => 'ò', "oacute" => 'ó',
 274    "ocirc"  => 'ô', "otilde" => 'õ', "ouml"   => 'ö', "divide" => '÷',
 275    "oslash" => 'ø', "ugrave" => 'ù', "uacute" => 'ú', "ucirc"  => 'û',
 276    "uuml"   => 'ü', "yacute" => 'ý', "thorn"  => 'þ', "yuml"   => 'ÿ',
 277
 278    # HTML 4 entities that do not have 1:1 Latin1 mappings.
 279    "bull"  => "*",   "hellip"=> "...",  "prime" => "'",  "Prime" => "\"",
 280    "frasl" => "/",   "trade" => "[tm]", "larr"  => "<-", "rarr"  => "->",
 281    "harr"  => "<->", "lArr"  => "<=",   "rArr"  => "=>", "hArr"  => "<=>",
 282    "empty" => "Ø",   "minus" => "-",    "lowast"=> "*",  "sim"   => "~",
 283    "cong"  => "=~",  "asymp" => "~",    "ne"    => "!=", "equiv" => "==",
 284    "le"    => "<=",  "ge"    => ">=",   "lang"  => "<",  "rang"  => ">",
 285    "loz"   => "<>",  "OElig" => "OE",   "oelig" => "oe", "Yuml"  => "Y",
 286    "circ"  => "^",   "tilde" => "~",    "ensp"  => " ",  "emsp"  => " ",
 287    "thinsp"=> " ",   "ndash" => "-",    "mdash" => "--", "lsquo" => "`",
 288    "rsquo" => "'",   "sbquo" => "'",    "ldquo" => "\"", "rdquo" => "\"",
 289    "bdquo" => "\"",  "lsaquo"=> "<",    "rsaquo"=> ">",
 290 );
 291
 292
 293 ##############################################################################
 294 #
 295 # Various global flags set by command line parameters, or computed
 296 #
 297 ##############################################################################
 298
 299
 300 my $current_state = "???";      # for diagnostics
 301 my $load_method;
 302 my $last_search;
 303 my $image_succeeded = -1;
 304 my $suppress_audit = 0;
 305
 306 my $verbose_imgmap = 0;         # print out rectangles and URLs only (stdout)
 307 my $verbose_warnings = 0;       # print out warnings when things go wrong
 308 my $verbose_load = 0;           # diagnostics about loading of URLs
 309 my $verbose_filter = 0;         # diagnostics about page selection/rejection
 310 my $verbose_net = 0;            # diagnostics about network I/O
 311 my $verbose_pbm = 0;            # diagnostics about PBM pipelines
 312 my $verbose_http = 0;           # diagnostics about all HTTP activity
 313 my $verbose_exec = 0;           # diagnostics about executing programs
 314
 315 my $report_performance_interval = 60 * 15;  # print some stats every 15 minutes
 316
 317 my $http_proxy = undef;
 318 my $http_timeout = 20;
 319 my $cvt_timeout = 10;
 320
 321 my $min_width = 50;
 322 my $min_height = 50;
 323 my $min_ratio = 1/5;
 324
 325 my $min_gif_area = (120 * 120);
 326
 327
 328 my $no_output_p = 0;
 329 my $urls_only_p = 0;
 330 my $cocoa_p = 0;
 331 my $imagemap_base = undef;
 332
 333 my @pids_to_kill = ();  # forked pids we should kill when we exit, if any.
 334
 335 my $driftnet_magic = 'driftnet';
 336 my $driftnet_dir = undef;
 337 my $default_driftnet_cmd = "driftnet -a -m 100";
 338
 339 my $local_magic = 'local-directory';
 340 my $local_dir = undef;
 341
 342 my $wordlist;
 343
 344 my %rejected_urls;
 345 my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch",
 346                       "arrhenius", "arteriole", "blanket", "brainchild",
 347                       "burdensome", "carnival", "cherub", "chord", "clever",
 348                       "dedicate", "dilogarithm", "dolan", "dryden",
 349                       "eggplant");
 350
 351
 352 ##############################################################################
 353 #
 354 # Retrieving URLs
 355 #
 356 ##############################################################################
 357
 358 # returns three values: the HTTP response line; the document headers;
 359 # and the document body.
 360 #
 361 sub get_document_1($$$) {
 362   my ($url, $referer, $timeout) = @_;
 363
 364   if (!defined($timeout)) { $timeout = $http_timeout; }
 365   if ($timeout > $http_timeout) { $timeout = $http_timeout; }
 366
 367   if ($timeout <= 0) {
 368     LOG (($verbose_net || $verbose_load), "timed out for $url");
 369     return ();
 370   }
 371
 372   LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : ""));
 373
 374   if (! ($url =~ m@^http://@i)) {
 375     LOG ($verbose_net, "not an HTTP URL: $url");
 376     return ();
 377   }
 378
 379   my ($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4);
 380   $path = "" unless $path;
 381
 382   if (!$url_proto || !$serverstring) {
 383     LOG (($verbose_net || $verbose_load), "unparsable URL: $url");
 384     return ();
 385   }
 386
 387   my ($them,$port) = split(/:/, $serverstring);
 388   $port = 80 unless $port;
 389
 390   my $them2 = $them;
 391   my $port2 = $port;
 392   if ($http_proxy) {
 393     $serverstring = $http_proxy if $http_proxy;
 394     $serverstring =~ s@^[a-z]+://@@;
 395     ($them2,$port2) = split(/:/, $serverstring);
 396     $port2 = 80 unless $port2;
 397   }
 398
 399   my ($remote, $iaddr, $paddr, $proto, $line);
 400   $remote = $them2;
 401   if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') }
 402   if (!$port2) {
 403     LOG (($verbose_net || $verbose_load), "unrecognised port in $url");
 404     return ();
 405   }
 406   $iaddr   = inet_aton($remote);
 407   if (!$iaddr) {
 408     LOG (($verbose_net || $verbose_load), "host not found: $remote");
 409     return ();
 410   }
 411   $paddr   = sockaddr_in($port2, $iaddr);
 412
 413
 414   my $head = "";
 415   my $body = "";
 416
 417   @_ =
 418     eval {
 419       local $SIG{ALRM} = sub {
 420         LOG (($verbose_net || $verbose_load), "timed out ($timeout) for $url");
 421         die "alarm\n";
 422       };
 423       alarm $timeout;
 424
 425       $proto   = getprotobyname('tcp');
 426       if (!socket(S, PF_INET, SOCK_STREAM, $proto)) {
 427         LOG (($verbose_net || $verbose_load), "socket: $!");
 428         return ();
 429       }
 430       if (!connect(S, $paddr)) {
 431         LOG (($verbose_net || $verbose_load), "connect($serverstring): $!");
 432         return ();
 433       }
 434
 435       select(S); $| = 1; select(STDOUT);
 436
 437       my $cookie = $cookies{$them};
 438
 439       my $user_agent = "$progname/$version";
 440
 441       if ($url =~ m@^http://www\.altavista\.com/@ ||
 442           $url =~ m@^http://random\.yahoo\.com/@ ||
 443           $url =~ m@^http://images\.google\.com/@ ||
 444           $url =~ m@^http://www\.google\.com/@) {
 445         # block this, you turkeys.
 446         $user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.7)" .
 447           " Gecko/20070914 Firefox/2.0.0.7";
 448
 449         # 28-Jun-2007: Google Images now emits the entire page in JS if
 450         # you claim to be Gecko.  They also still block "webcollage".
 451         # They serve non-JS for unrecognised agents, so let's try this...
 452         $user_agent = "NoJavascriptPlease/1.0"
 453           if ($url =~ m@^http://[a-z]+\.google\.com/@);
 454       }
 455
 456       my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" .
 457                  "Host: $them\r\n" .
 458                  "User-Agent: $user_agent\r\n";
 459       if ($referer) {
 460         $hdrs .= "Referer: $referer\r\n";
 461       }
 462       if ($cookie) {
 463         my @cc = split(/\r?\n/, $cookie);
 464         $hdrs .= "Cookie: " . join('; ', @cc) . "\r\n";
 465       }
 466       $hdrs .= "\r\n";
 467
 468       foreach (split('\r?\n', $hdrs)) {
 469         LOG ($verbose_http, "  ==> $_");
 470       }
 471       print S $hdrs;
 472       my $http = <S> || "";
 473
 474       # Kludge: the Yahoo Random Link is now returning as its first
 475       # line "Status: 301" instead of "HTTP/1.0 301 Found".  Fix it...
 476       #
 477       $http =~ s@^Status:\s+(\d+)\b@HTTP/1.0 $1@i;
 478
 479       $_  = $http;
 480       s/[\r\n]+$//s;
 481       LOG ($verbose_http, "  <== $_");
 482
 483       while (<S>) {
 484         $head .= $_;
 485         s/[\r\n]+$//s;
 486         last if m@^$@;
 487         LOG ($verbose_http, "  <== $_");
 488
 489         if (m@^Set-cookie:\s*([^;\r\n]+)@i) {
 490           set_cookie($them, $1)
 491         }
 492       }
 493
 494       my $lines = 0;
 495       while (<S>) {
 496         $body .= $_;
 497         $lines++;
 498       }
 499
 500       LOG ($verbose_http,
 501            "  <== [ body ]: $lines lines, " . length($body) . " bytes");
 502
 503       close S;
 504
 505       if (!$http) {
 506         LOG (($verbose_net || $verbose_load), "null response: $url");
 507         return ();
 508       }
 509
 510       $SIG{ALRM} = 'DEFAULT';  # seem to be suffering a race?
 511       return ( $http, $head, $body );
 512     };
 513   die if ($@ && $@ ne "alarm\n");       # propagate errors
 514
 515   if ($@ && $@ ne "alarm\n") {
 516     print STDERR blurb() . "DIE " . join(" ", $@) . "\n";
 517     die;
 518   }
 519
 520   if ($@) {
 521     # timed out
 522     $head = undef;
 523     $body = undef;
 524     $suppress_audit = 1;
 525     return ();
 526   } else {
 527     # didn't
 528     alarm 0;
 529     return @_;
 530   }
 531 }
 532
 533
 534 # returns two values: the document headers; and the document body.
 535 # if the given URL did a redirect, returns the redirected-to document.
 536 #
 537 sub get_document($$;$) {
 538   my ($url, $referer, $timeout) = @_;
 539   my $start = time;
 540
 541   if (defined($referer) && $referer eq $driftnet_magic) {
 542     return get_driftnet_file ($url);
 543   }
 544
 545   if (defined($referer) && $referer eq $local_magic) {
 546     return get_local_file ($url);
 547   }
 548
 549   my $orig_url = $url;
 550   my $loop_count = 0;
 551   my $max_loop_count = 4;
 552
 553   do {
 554     if (defined($timeout) && $timeout <= 0) {
 555       LOG (($verbose_net || $verbose_load), "timed out for $url");
 556       $suppress_audit = 1;
 557       return ();
 558     }
 559
 560     my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout);
 561
 562     if (defined ($timeout)) {
 563       my $now = time;
 564       my $elapsed = $now - $start;
 565       $timeout -= $elapsed;
 566       $start = $now;
 567     }
 568
 569     return () unless $http; # error message already printed
 570
 571     $http =~ s/[\r\n]+$//s;
 572
 573     if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) {
 574       $_ = $head;
 575
 576       my ( $location ) = m@^location:[ \t]*(.*)$@im;
 577       if ( $location ) {
 578         $location =~ s/[\r\n]$//;
 579
 580         LOG ($verbose_net, "redirect from $url to $location");
 581         $referer = $url;
 582         $url = $location;
 583
 584         if ($url =~ m@^/@) {
 585           $referer =~ m@^(http://[^/]+)@i;
 586           $url = $1 . $url;
 587         } elsif (! ($url =~ m@^[a-z]+:@i)) {
 588           $_ = $referer;
 589           s@[^/]+$@@g if m@^http://[^/]+/@i;
 590           $_ .= "/" if m@^http://[^/]+$@i;
 591           $url = $_ . $url;
 592         }
 593
 594       } else {
 595         LOG ($verbose_net, "no Location with \"$http\"");
 596         return ( $url, $body );
 597       }
 598
 599       if ($loop_count++ > $max_loop_count) {
 600         LOG ($verbose_net,
 601              "too many redirects ($max_loop_count) from $orig_url");
 602         $body = undef;
 603         return ();
 604       }
 605
 606     } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) {
 607
 608       LOG (($verbose_net || $verbose_load), "failed: $1 ($url)");
 609
 610       # http errors -- return nothing.
 611       $body = undef;
 612       return ();
 613
 614     } elsif (!$body) {
 615
 616       LOG (($verbose_net || $verbose_load), "document contains no data: $url");
 617       return ();
 618
 619     } else {
 620
 621       # ok!
 622       return ( $url, $body );
 623     }
 624
 625   } while (1);
 626 }
 627
 628 # If we already have a cookie defined for this site, and the site is trying
 629 # to overwrite that very same cookie, let it do so.  This is because nytimes
 630 # expires its cookies - it lets you upgrade to a new cookie without logging
 631 # in again, but you have to present the old cookie to get the new cookie.
 632 # So, by doing this, the built-in cypherpunks cookie will never go "stale".
 633 #
 634 sub set_cookie($$) {
 635   my ($host, $cookie) = @_;
 636   my $oc = $cookies{$host};
 637   return unless $oc;
 638   $_ = $oc;
 639   my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@;
 640   $_ = $cookie;
 641   my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@;
 642
 643   if ($oc_name eq $nc_name &&
 644       $oc_value ne $nc_value) {
 645     $cookies{$host} = $cookie;
 646     LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie");
 647   }
 648 }
 649
 650
 651 ############################################################################
 652 #
 653 # Extracting image URLs from HTML
 654 #
 655 ############################################################################
 656
 657 # given a URL and the body text at that URL, selects and returns a random
 658 # image from it.  returns () if no suitable images found.
 659 #
 660 sub pick_image_from_body($$) {
 661   my ($url, $body) = @_;
 662
 663   my $base = $url;
 664   $_ = $url;
 665
 666   # if there's at least one slash after the host, take off the last
 667   # pathname component
 668   if ( m@^http://[^/]+/@io ) {
 669     $base =~ s@[^/]+$@@go;
 670   }
 671
 672   # if there are no slashes after the host at all, put one on the end.
 673   if ( m@^http://[^/]+$@io ) {
 674     $base .= "/";
 675   }
 676
 677   $_ = $body;
 678
 679   # strip out newlines, compress whitespace
 680   s/[\r\n\t ]+/ /go;
 681
 682   # nuke comments
 683   s/<!--.*?-->//go;
 684
 685
 686   # There are certain web sites that list huge numbers of dictionary
 687   # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
 688   # Porn sites tend not to be reputable!)
 689   #
 690   # I do not want webcollage to filter on content: I want it to select
 691   # randomly from the set of images on the web.  All the logic here for
 692   # rejecting some images is really a set of heuristics for rejecting
 693   # images that are not really images: for rejecting *text* that is in
 694   # GIF/JPEG/PNG form.  I don't want text, I want pictures, and I want
 695   # the content of the pictures to be randomly selected from among all
 696   # the available content.
 697   #
 698   # So, filtering out "dirty" pictures by looking for "dirty" keywords
 699   # would be wrong: dirty pictures exist, like it or not, so webcollage
 700   # should be able to select them.
 701   #
 702   # However, picking a random URL is a hard thing to do.  The mechanism I'm
 703   # using is to search for a selection of random words.  This is not
 704   # perfect, but works ok most of the time.  The way it breaks down is when
 705   # some URLs get precedence because their pages list *every word* as
 706   # related -- those URLs come up more often than others.
 707   #
 708   # So, after we've retrieved a URL, if it has too many keywords, reject
 709   # it.  We reject it not on the basis of what those keywords are, but on
 710   # the basis that by having so many, the page has gotten an unfair
 711   # advantage against our randomizer.
 712   #
 713   my $trip_count = 0;
 714   foreach my $trip (@tripwire_words) {
 715     $trip_count++ if m/$trip/i;
 716   }
 717
 718   if ($trip_count >= $#tripwire_words - 2) {
 719     LOG (($verbose_filter || $verbose_load),
 720          "there is probably a dictionary in \"$url\": rejecting.");
 721     $rejected_urls{$url} = -1;
 722     $body = undef;
 723     $_ = undef;
 724     return ();
 725   }
 726
 727
 728   my @urls;
 729   my %unique_urls;
 730
 731   foreach (split(/ *</)) {
 732     if ( m/^meta.*["']keywords["']/i ) {
 733
 734       # Likewise, reject any web pages that have a KEYWORDS meta tag
 735       # that is too long.
 736       #
 737       my $L = length($_);
 738       if ($L > 1000) {
 739         LOG (($verbose_filter || $verbose_load),
 740              "excessive keywords ($L bytes) in $url: rejecting.");
 741         $rejected_urls{$url} = $L;
 742         $body = undef;
 743         $_ = undef;
 744         return ();
 745       } else {
 746         LOG ($verbose_filter, "  keywords ($L bytes) in $url (ok)");
 747       }
 748
 749     } elsif (m/^ (IMG|A) \b .* (SRC|HREF) \s* = \s* ["']? (.*?) [ "'<>] /six ||
 750              m/^ (LINK|META) \b .* (REL|PROPERTY) \s* = \s*
 751                  ["']? (image_src|og:image) ["']? /six) {
 752
 753       my $was_inline = (lc($1) eq 'img');
 754       my $was_meta   = (lc($1) eq 'link' || lc($1) eq 'meta');
 755       my $link = $3;
 756
 757       # For <link rel="image_src" href="...">
 758       # and <meta property="og:image" content="...">
 759       #
 760       if ($was_meta) {
 761         next unless (m/ (HREF|CONTENT) \s* = \s* ["']? (.*?) [ "'<>] /six);
 762         $link = $2;
 763       }
 764
 765       my ( $width )  = m/width ?=[ \"]*(\d+)/oi;
 766       my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
 767       $_ = $link;
 768
 769       if ( m@^/@o ) {
 770         my $site;
 771         ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio;
 772         $_ = "$site$link";
 773       } elsif ( ! m@^[^/:?]+:@ ) {
 774         $_ = "$base$link";
 775         s@/\./@/@g;
 776         1 while (s@/[^/]+/\.\./@/@g);
 777       }
 778
 779       # skip non-http
 780       if ( ! m@^http://@io ) {
 781         next;
 782       }
 783
 784       # skip non-image
 785       if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@io ) {
 786         next;
 787       }
 788
 789       # skip really short or really narrow images
 790       if ( $width && $width < $min_width) {
 791         if (!$height) { $height = "?"; }
 792         LOG ($verbose_filter, "  skip narrow image $_ (${width}x$height)");
 793         next;
 794       }
 795
 796       if ( $height && $height < $min_height) {
 797         if (!$width) { $width = "?"; }
 798         LOG ($verbose_filter, "  skip short image $_ (${width}x$height)");
 799         next;
 800       }
 801
 802       # skip images with ratios that make them look like banners.
 803       if ($min_ratio && $width && $height &&
 804           ($width * $min_ratio ) > $height) {
 805         if (!$height) { $height = "?"; }
 806         LOG ($verbose_filter, "  skip bad ratio $_ (${width}x$height)");
 807         next;
 808       }
 809
 810       # skip GIFs with a small number of pixels -- those usually suck.
 811       if ($width && $height &&
 812           m/\.gif$/io &&
 813           ($width * $height) < $min_gif_area) {
 814         LOG ($verbose_filter, "  skip small GIF $_ (${width}x$height)");
 815         next;
 816       }
 817
 818       # skip images with a URL that indicates a Yahoo thumbnail.
 819       if (m@\.yimg\.com/.*/t/@) {
 820         if (!$width)  { $width  = "?"; }
 821         if (!$height) { $height = "?"; }
 822         LOG ($verbose_filter, "  skip yahoo thumb $_ (${width}x$height)");
 823         next;
 824       }
 825
 826       my $url = $_;
 827
 828       if ($unique_urls{$url}) {
 829         LOG ($verbose_filter, "  skip duplicate image $_");
 830         next;
 831       }
 832
 833       LOG ($verbose_filter,
 834            "  image $url" .
 835            ($width && $height ? " (${width}x${height})" : "") .
 836            ($was_meta ? " (meta)" : $was_inline ? " (inline)" : ""));
 837
 838
 839       my $weight = 1;
 840
 841       if ($was_meta) {
 842         $weight = 20;    # meta tag images are far preferable to inline images.
 843       } else {
 844         if ($url !~ m@[.](gif|png)$@io ) {
 845           $weight += 2;  # JPEGs are preferable to GIFs and PNGs.
 846         }
 847         if (! $was_inline) {
 848           $weight += 4;  # pointers to images are preferable to inlined images.
 849         }
 850       }
 851
 852       $unique_urls{$url}++;
 853       for (my $i = 0; $i < $weight; $i++) {
 854         $urls[++$#urls] = $url;
 855       }
 856     }
 857   }
 858
 859   my $fsp = ($body =~ m@<frameset@i);
 860
 861   $_ = undef;
 862   $body = undef;
 863
 864   @urls = depoison (@urls);
 865
 866   if ( $#urls < 0 ) {
 867     LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
 868     return ();
 869   }
 870
 871   # pick a random element of the table
 872   my $i = int(rand($#urls+1));
 873   $url = $urls[$i];
 874
 875   LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
 876
 877   return $url;
 878 }
 879
 880
 881 # Given a URL and the RSS feed from that URL, pick a random image from
 882 # the feed.  This is a lot simpler than extracting images out of a page:
 883 # we already know we have reasonable images, so we just pick one.
 884 # Returns: the real URL of the page (preferably not the RSS version),
 885 # and the image.
 886
 887 sub pick_image_from_rss($$) {
 888   my ( $url, $body ) = @_;
 889   my @suitable = ($body =~ m/<enclosure url="(.*?)"/g);
 890
 891   my ($base) = ($body =~ m@<link>([^<>]+)</link>@i);
 892   $base = $url unless $base;
 893
 894   # pick a random element of the table
 895   if (@suitable) {
 896     my $i = int(rand(scalar @suitable));
 897     my $url = $suitable[$i];
 898     LOG ($verbose_load, "picked image " .($i+1) . "/" .
 899                         ($#suitable+1) . ": $url");
 900     return ($base, $url);
 901   }
 902   return;
 903 }
 904
 905 \f
 906 ############################################################################
 907 #
 908 # Subroutines for getting pages and images out of search engines
 909 #
 910 ############################################################################
 911
 912
 913 sub pick_dictionary() {
 914   my @dicts = ("/usr/dict/words",
 915                "/usr/share/dict/words",
 916                "/usr/share/lib/dict/words",
 917                "/usr/share/dict/cracklib-small",
 918                "/usr/share/dict/cracklib-words"
 919                );
 920   foreach my $f (@dicts) {
 921     if (-f $f) {
 922       $wordlist = $f;
 923       last;
 924     }
 925   }
 926   error ("$dicts[0] does not exist") unless defined($wordlist);
 927 }
 928
 929 # returns a random word from the dictionary
 930 #
 931 sub random_word() {
 932
 933   return undef unless open (my $in, '<', $wordlist);
 934
 935   my $size = (stat($in))[7];
 936   my $word = undef;
 937   my $count = 0;
 938
 939   while (1) {
 940     error ("looping ($count) while reading $wordlist")
 941       if (++$count > 100);
 942
 943     my $pos = int (rand ($size));
 944     if (seek ($in, $pos, 0)) {
 945       $word = <$in>;   # toss partial line
 946       $word = <$in>;   # keep next line
 947     }
 948
 949     next unless ($word);
 950     next if ($word =~ m/^[-\']/);
 951
 952     $word = lc($word);
 953     $word =~ s/^.*-//s;
 954     $word =~ s/^[^a-z]+//s;
 955     $word =~ s/[^a-z]+$//s;
 956     $word =~ s/\'s$//s;
 957     $word =~ s/ys$/y/s;
 958     $word =~ s/ally$//s;
 959     $word =~ s/ly$//s;
 960     $word =~ s/ies$/y/s;
 961     $word =~ s/ally$/al/s;
 962     $word =~ s/izes$/ize/s;
 963     $word =~ s/esses$/ess/s;
 964     $word =~ s/(.{5})ing$/$1/s;
 965
 966     next if (length ($word) > 14);
 967     last if ($word);
 968   }
 969
 970   close ($in);
 971
 972   if ( $word =~ s/\s/\+/gs ) {  # convert intra-word spaces to "+".
 973     $word = "\%22$word\%22";    # And put quotes (%22) around it.
 974   }
 975
 976   return $word;
 977 }
 978
 979
 980 sub random_words($) {
 981   my ($sep) = @_;
 982   return (random_word() . $sep .
 983           random_word() . $sep .
 984           random_word() . $sep .
 985           random_word() . $sep .
 986           random_word());
 987 }
 988
 989
 990 sub url_quote($) {
 991   my ($s) = @_;
 992   $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
 993   return $s;
 994 }
 995
 996 sub url_unquote($) {
 997   my ($s) = @_;
 998   $s =~ s/[+]/ /g;
 999   $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
1000   return $s;
1001 }
1002
# Returns a copy of the string with the characters & < > " replaced by
# the entities &amp; &lt; &gt; &quot; respectively.
#
sub html_quote($) {
  my ($s) = @_;
  my %entity_for = ('&'  => '&amp;',
                    '<'  => '&lt;',
                    '>'  => '&gt;',
                    '"'  => '&quot;');
  # One pass over the string, so the "&" of a freshly inserted entity
  # is never quoted a second time.
  $s =~ s/([&<>\"])/$entity_for{$1}/g;
  return $s;
}
1011
# Returns a copy of the string with HTML character references decoded:
#  - named entities (e.g., &apos;) are looked up in %entity_table, and are
#    left as-is if the table has no (true) value for that name;
#  - decimal numeric references (e.g., &#39;) become chr() of the number.
#
# Both kinds are decoded in a single pass over the string, so the output
# of one replacement is never decoded again: "&amp;#39;" yields "&#39;",
# not "'".
#
sub html_unquote($) {
  my ($s) = @_;
  $s =~ s{ ( & (?: ([a-z]+) | \# (\d+) ) ; ) }
         { defined($2) ? ($entity_table{$2} || $1) : chr($3) }gexi;
  return $s;
}
1018
1019
# Loads the given URL (a search on some search engine) and returns:
# - the total number of hits the search engine claimed it had, as a string
#   with thousands separators ("?" if no known phrasing was recognised);
# - a list of the HREF values found in the <A> tags of the returned page.
# Note that this list contains all kinds of internal search engine
# junk URLs too -- caller must prune them.
#
# Returns () if the page could not be loaded, if loading it used up the
# whole timeout, or if no links were found on it.
#
# $timeout may be undef; $words is used only for logging.
#
# All work is done on lexical copies: the global $_ is not modified, so a
# caller that is iterating with $_ keeps its value, and the page body is
# not left behind in a global after we return.
#
sub pick_from_search_engine($$$) {
  my ( $timeout, $search_url, $words ) = @_;

  (my $pretty_words = $words) =~ s/%20/ /g;

  print STDERR "\n\n" if ($verbose_load);

  LOG ($verbose_load, "words: $pretty_words");
  LOG ($verbose_load, "URL: $search_url");

  $last_search = $search_url;   # for warnings

  my $start = time;
  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
  if (defined ($timeout)) {
    # The document may have arrived, but too late to be of any use.
    $timeout -= (time - $start);
    if ($timeout <= 0) {
      $body = undef;
      LOG (($verbose_net || $verbose_load),
           "timed out (late) for $search_url");
      $suppress_audit = 1;
      return ();
    }
  }

  return () if (! $body);


  my @subpages;

  # Try each of the phrasings that the various engines have used to
  # report their hit count.
  my $search_count = "?";
  if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
    $search_count = $3;
  } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*)&nbsp;@i) {
    $search_count = $1;
  } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
    $search_count = $1;
  } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
    $search_count = $1;
  } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
    $search_count = $1;
  } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
    $search_count = $1;
  } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
    $search_count = $1;
  } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
    $search_count = $1;
  } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
    $search_count = $1;    # lycos
  } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
    $search_count = $1;                          # hotbot
  } elsif ($body =~ m@no photos were found containing@i) { # avimages
    $search_count = "0";
  } elsif ($body =~ m@found no document matching@i) { # avtext
    $search_count = "0";
  }
  # Insert thousands separators into a bare run of digits.
  1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);

#  if ($search_count eq "?" || $search_count eq "0") {
#    my $file = "/tmp/wc.html";
#    open (my $out, '>', $file) || error ("writing $file: $!");
#    print $out $body;
#    close $out;
#    print STDERR  blurb() . "###### wrote $file\n";
#  }


  my $length = length($body);
  my $href_count = 0;

  # Collapse all whitespace, then put each <A ...> tag at the start of
  # its own line so the page can be scanned one anchor at a time.
  my $html = $body;
  $html =~ s/[\r\n\t ]+/ /g;
  $html =~ s/(<A )/\n$1/gi;

  foreach my $chunk (split(/\n/, $html)) {
    $href_count++;
    my ($u) = ($chunk =~ m@<A\s.*\bHREF\s*=\s*([^>]+)>@i);
    next unless $u;

    if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
    elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; }  # or token

    if ( $rejected_urls{$u} ) {
      LOG ($verbose_filter, "  pre-rejecting candidate: $u");
      next;
    }

    LOG ($verbose_http, "    HREF: $u");

    push @subpages, $u;
  }

  if ( $#subpages < 0 ) {
    LOG ($verbose_filter,
         "found nothing on $base ($length bytes, $href_count links).");
    return ();
  }

  LOG ($verbose_filter, ($#subpages+1) . " links on $search_url");

  return ($search_count, @subpages);
}
1131
1132
# Given a list of URLs, returns the ones that are worth keeping, in their
# original order.  A URL is dropped when:
#  - it does not begin with "http://" followed by a host name; or
#  - %poisoners has a true entry for its host name, or for the last three
#    or the last two dot-separated components of its host name.
#
sub depoison(@) {
  my (@urls) = @_;
  my @clean = ();

  foreach my $u (@urls) {
    my ($host) = ($u =~ m@^http://([^/: \t\r\n]+)@i);
    next unless defined($host);

    # The full host name, then its trailing 3-part and 2-part domains.
    my @names = ($host);
    push @names, $1 if ($host =~ m@([^.]+\.[^.]+\.[^.]+)$@);
    push @names, $1 if ($host =~ m@([^.]+\.[^.]+)$@);

    if (grep { $poisoners{$_} } @names) {
      LOG (($verbose_filter), "  rejecting poisoner: $u");
      next;
    }

    push @clean, $u;
  }
  return @clean;
}
1160
1161
# Given a list of page URLs: discards the poisoners, picks one of the
# remaining pages at random, loads it (with $base as the referring URL
# handed to get_document), and picks a random image from it.
#
# $total_hit_count and $unfiltered_link_count are used only for logging.
#
# Returns: (URL of the page loaded, URL of the image chosen),
# or () if there were no candidates, the page could not be loaded,
# or no image was found on it.
#
sub pick_image_from_pages($$$$@) {
  my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;

  $total_hit_count = "?" unless defined($total_hit_count);

  @pages = depoison (@pages);
  my $n_pages = scalar @pages;

  LOG ($verbose_load,
       "$n_pages candidates of $unfiltered_link_count links" .
       " ($total_hit_count total)");

  return () unless $n_pages;

  my $page = $pages[int(rand($n_pages))];

  LOG ($verbose_load, "picked page $page");

  $suppress_audit = 1;

  my ( $page_url, $page_body ) = get_document ($page, $base, $timeout);
  return () unless ($page_url && $page_body);

  my $img = pick_image_from_body ($page_url, $page_body);
  $page_body = undef;

  return ($img ? ($page_url, $img) : ());
}
1201
1202 \f
1203 ############################################################################
1204 #
1205 # Pick images from random pages returned by the Yahoo Random Link
1206 #
1207 ############################################################################
1208
# yahoorand
my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";


# Loads the Yahoo Random Link URL, which hands back some page; then
# picks a random image on that page.
# Returns two URLs: the page containing the image, and the image.
# Returns () if nothing found this time.
#
sub pick_from_yahoo_random_link($) {
  my ($timeout) = @_;

  print STDERR "\n\n" if ($verbose_load);
  LOG ($verbose_load, "URL: $yahoo_random_link");

  $last_search = $yahoo_random_link;   # for warnings
  $suppress_audit = 1;

  my ( $page, $html ) = get_document ($yahoo_random_link, undef, $timeout);
  return unless ($page && $html);

  LOG ($verbose_load, "redirected to: $page");

  my $img = pick_image_from_body ($page, $html);
  $html = undef;

  return () unless $img;
  return ($page, $img);
}
1244
1245 \f
1246 ############################################################################
1247 #
1248 # Pick images from random pages returned by the Alta Vista Random Link
1249 # Note: this seems to have gotten a *lot* less random lately (2007).
1250 #
1251 ############################################################################
1252
# altavista
my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";


# Loads the Alta Vista Random Link URL, which hands back some page; then
# picks a random image on that page.
# Returns two URLs: the page containing the image, and the image.
# Returns () if nothing found this time.
#
sub pick_from_alta_vista_random_link($) {
  my ($timeout) = @_;

  print STDERR "\n\n" if ($verbose_load);
  LOG ($verbose_load, "URL: $alta_vista_random_link");

  $last_search = $alta_vista_random_link;   # for warnings
  $suppress_audit = 1;

  my ( $page, $html ) = get_document ($alta_vista_random_link,
                                      undef, $timeout);
  return unless ($page && $html);

  LOG ($verbose_load, "redirected to: $page");

  my $img = pick_image_from_body ($page, $html);
  $html = undef;

  return () unless $img;
  return ($page, $img);
}
1289
1290 \f
1291 ############################################################################
1292 #
1293 # Pick images by feeding random words into Alta Vista Image Search
1294 #
1295 ############################################################################
1296
1297
my $alta_vista_images_url = "http://www.altavista.com/image/results" .
                            "?ipht=1" .       # photos
                            "&igrph=1" .      # graphics
                            "&iclr=1" .       # color
                            "&ibw=1" .        # b&w
                            "&micat=1" .      # no partner sites
                            "&sc=on" .        # "site collapse"
                            "&q=";

# avimages
#
# Searches Alta Vista Images for one random word, on a random result page
# from 1 to 9; keeps the result links that point off-site; then loads one
# of those pages at random and picks an image from it.
# Returns whatever pick_image_from_pages returns.
#
sub pick_from_alta_vista_images($) {
  my ($timeout) = @_;

  my $words = random_word();
  my $page = (int(rand(9)) + 1);

  my $search_url = $alta_vista_images_url . $words;
  $search_url .= ("&pgno=" . $page .               # page number
                  "&stq=" . (($page-1) * 12))      # first hit result on page
    if ($page > 1);

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  # Sites whose links are never interesting candidates.
  my @unwanted = (
    qr@[/.]altavista\.com\b@i,      # skip altavista builtins
    qr@[/.]yahoo\.com\b@i,          # yahoo and av in cahoots?
    qr@[/.]doubleclick\.net\b@i,    # ad network
    qr@[/.]clicktomarket\.com\b@i,  # ad network
    qr@[/.]viewimages\.com\b@i,     # stacked deck
    qr@[/.]gettyimages\.com\b@i,
  );

  my @candidates = ();
  foreach my $u (@subpages) {

    # avimages is encoding their URLs now.
    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
    $u = url_unquote($u);

    next unless ($u =~ m@^http://@i);    #  skip non-HTTP or relative URLs
    next if (grep { $u =~ $_ } @unwanted);

    LOG ($verbose_filter, "  candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count,
                                scalar(@subpages), $timeout, @candidates);
}
1346
1347
1348 \f
1349 ############################################################################
1350 #
1351 # Pick images from Aptix security cameras
1352 # Cribbed liberally from google image search code.
1353 # By Jason Sullivan <jasonsul@us.ibm.com>
1354 #
1355 ############################################################################
1356
my $aptix_images_url = ("http://www.google.com/search" .
                        "?q=inurl:%22jpg/image.jpg%3Fr%3D%22");

# securitycam
#
# Runs a fixed Google search for URLs that look like camera still images,
# on a random result page, and picks one of the matching links at random.
# Returns two URLs, (referer, image), which are the same URL here;
# or () if nothing suitable was found.
#
sub pick_from_security_camera($) {
  my ($timeout) = @_;

  my $num = 20;                                 # 20 images per page
  my $page = (int(rand(9)) + 1);

  my $search_url = $aptix_images_url;
  $search_url .= ("&start=" . ($page * $num) .  # page number
                  "&num="   . $num)             # images per page
    if ($page > 1);

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, '');

  my @candidates = ();
  foreach my $link (@subpages) {
    next if     ($link =~ m@[/.]google\.com\b@i);   # skip google builtins (most links)
    next unless ($link =~ m@jpg/image.jpg\?r=@i);   # All pics contain this

    LOG ($verbose_filter, "  candidate: $link");
    push @candidates, $link;
  }

  @candidates = depoison (@candidates);
  return () unless @candidates;

  my $i = int(rand(scalar @candidates));
  my $img = $candidates[$i];
  my $ref = $img;     # each camera image serves as its own referer

  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
  return ($ref, $img);
}
1396
1397 \f
1398 ############################################################################
1399 #
1400 # Pick images by feeding random words into Google Image Search.
1401 # By Charles Gales <gales@us.ibm.com>
1402 #
1403 ############################################################################
1404
1405
my $google_images_url =     "http://images.google.com/images" .
                            "?site=images" .  # photos
                            "&btnG=Search" .  # graphics
                            "&safe=off" .     # no screening
                            "&imgsafe=off" .
                            "&q=";

# googleimgs
#
# Searches Google Images for $words (one random dictionary word if $words
# is not given), on a random result page, and picks one of the image
# results at random.  $max_page is accepted but not used.
# Returns two URLs: the page the image was found on, and the image;
# or () if nothing suitable was found.
#
sub pick_from_google_images($;$$) {
  my ($timeout, $words, $max_page) = @_;

  $words = random_word()      # only one word for Google
    unless defined($words);

  my $num = 20;     # 20 images per page
  my $page = (int(rand(9)) + 1);

  my $search_url = $google_images_url . $words;
  $search_url .= ("&start=" . ($page * $num) .  # page number
                  "&num="   . $num)             # images per page
    if ($page > 1);

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  my %referers;
  foreach my $link (@subpages) {
    next unless ($link =~ m@imgres\?imgurl@i);    #  All pics start with this
    next if     ($link =~ m@[/.]google\.com\b@i); # skip google builtins

    # Result links carry both the image URL and the URL of the page
    # it appears on, as query parameters.
    my $u = html_unquote($link);
    next unless ($u =~ m@^/imgres\?imgurl=(.*?)&imgrefurl=(.*?)\&@);
    my ($img, $ref) = ($1, $2);

    $img = "http://$img" unless ($img =~ m/^http:/i);

    $ref = url_decode($ref);
    $img = url_decode($img);

    LOG ($verbose_filter, "  candidate: $ref");
    push @candidates, $img;
    $referers{$img} = $ref;
  }

  @candidates = depoison (@candidates);
  return () unless @candidates;

  my $i = int(rand(scalar @candidates));
  my $img = $candidates[$i];
  my $ref = $referers{$img};

  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
  return ($ref, $img);
}
1463
1464
1465 \f
1466 ############################################################################
1467 #
1468 # Pick images by feeding random numbers into Google Image Search.
1469 # By jwz, suggested by Ian O'Donnell.
1470 #
1471 ############################################################################
1472
1473
# googlenums
#
# Picks a random number below 9999 and feeds it to Google Image Search as
# the search term.  About 30% of the time the number is zero-padded to
# four digits.  Returns whatever pick_from_google_images returns.
#
sub pick_from_google_image_numbers($) {
  my ($timeout) = @_;

  my $max = 9999;
  my $n = int(rand($max));

  my $query = (rand() < 0.3
               ? sprintf("%04d", $n)
               : "$n");

  return pick_from_google_images ($timeout, $query);
}
1486
1487
1488 \f
1489 ############################################################################
1490 #
1491 # Pick images by feeding random digital camera file names into
1492 # Google Image Search.
1493 # By jwz, inspired by the excellent Random Personal Picture Finder
1494 # at http://www.diddly.com/random/
1495 #
1496 ############################################################################
1497
# Each element is a generator: a sub that takes no arguments and returns
# one randomly-numbered file name in the style of some digital camera.
#
my @photomakers = (
  #
  # Common digital camera file name formats, as described at
  # http://www.diddly.com/random/about.html
  #
  sub { sprintf ("dcp%05d.jpg",  int(rand(4000))); },   # Kodak
  sub { sprintf ("dsc%05d.jpg",  int(rand(4000))); },   # Nikon
  sub { sprintf ("dscn%04d.jpg", int(rand(4000))); },   # Nikon
  sub { sprintf ("mvc-%03d.jpg", int(rand(999)));  },   # Sony Mavica
  sub { sprintf ("mvc%05d.jpg",  int(rand(9999))); },   # Sony Mavica
  sub { sprintf ("P101%04d.jpg", int(rand(9999))); },   # Olympus w/ date=101
  sub { sprintf ("P%x%02d%04d.jpg",                     # Olympus
                 int(rand(0xC))+1,    # month as one hex digit: 1-9, a-c
                 int(rand(31))+1,     # day of month: 01-31
                 int(rand(9999))); },
  sub { sprintf ("IMG_%03d.jpg",  int(rand(999))); },   # ?
  sub { sprintf ("IMAG%04d.jpg",  int(rand(9999))); },  # RCA and Samsung
  sub { my $n = int(rand(9999));                        # Canon
          sprintf ("1%02d-%04d.jpg", int($n/100), $n); },
  sub { my $n = int(rand(9999));                        # Canon
          sprintf ("1%02d-%04d_IMG.jpg",
                   int($n/100), $n); },
  sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); },   # Canon
  sub { sprintf ("dscf%04d.jpg", int(rand(9999))); },   # Fuji Finepix
  sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); },   # Toshiba PDR
  sub { sprintf ("IM%06d.jpg", int(rand(9999))); },     # HP Photosmart
  sub { sprintf ("EX%06d.jpg", int(rand(9999))); },     # HP Photosmart
#  sub { my $n = int(rand(3));                          # Kodak DC-40,50,120
#        sprintf ("DC%04d%s.jpg", int(rand(9999)),
#                 $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); },
  sub { sprintf ("pict%04d.jpg", int(rand(9999))); },   # Minolta Dimage
  sub { sprintf ("P%07d.jpg", int(rand(9999))); },      # Kodak DC290
#  sub { sprintf ("%02d%02d%04d.jpg",                   # Casio QV3000, QV4000
#                 int(rand(12))+1, int(rand(31))+1,
#                 int(rand(999))); },
#  sub { sprintf ("%02d%x%02d%04d.jpg",                 # Casio QV7000
#                 int(rand(6)), # year
#                 int(rand(12))+1, int(rand(31))+1,
#                 int(rand(999))); },
  sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); },   # Pentax Optio S
  sub { sprintf ("PANA%04d.jpg", int(rand(9999))); },   # Panasonic vid still
  sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); },   # HP Photosmart
  sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); },   # ?
 );
1541
1542
# googlephotos
#
# Generates a random digital-camera file name with one of the @photomakers
# generators, and searches Google Images for it (restricted to JPEGs).
# Returns whatever pick_from_google_images returns.
#
sub pick_from_google_image_photos($) {
  my ($timeout) = @_;

  my $maker = $photomakers[int(rand(scalar @photomakers))];
  my $file  = $maker->();
  my $words = $file . "%20filetype:jpg";

  return pick_from_google_images ($timeout, $words);
}
1554
1555
1556 \f
1557 ############################################################################
1558 #
1559 # Pick images by feeding random words into Alta Vista Text Search
1560 #
1561 ############################################################################
1562
1563
my $alta_vista_url = "http://www.altavista.com/web/results" .
                     "?pg=aq" .
                     "&aqmode=s" .
                     "&filetype=html" .
                     "&sc=on" .        # "site collapse"
                     "&nbq=50" .
                     "&aqo=";

# avtext
#
# Searches Alta Vista's text search for five random words, on a random
# result page from 1 to 9; keeps the result links that point off-site; then
# loads one of those pages at random and picks an image from it.
# Returns whatever pick_image_from_pages returns.
#
sub pick_from_alta_vista_text($) {
  my ($timeout) = @_;

  my $words = random_words('%20');
  my $page = (int(rand(9)) + 1);

  my $search_url = $alta_vista_url . $words;
  $search_url .= ("&pgno=" . $page .
                  "&stq=" . (($page-1) * 10))
    if ($page > 1);

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Alta Vista routes result clicks through a redirector on their own
    # site (while using onMouseOver to make it look like they don't).
    # The real destination is embedded, URL-encoded, after a "**" in the
    # link -- which makes it easy for us to identify search results.
    #
    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
    $u = url_unquote($u);

    next unless ($u =~ m@^http://@i);    #  skip non-HTTP or relative URLs
    next if (   $u =~ m@[/.]altavista\.com\b@i   # skip altavista builtins
             || $u =~ m@[/.]yahoo\.com\b@i);     # yahoo and av in cahoots?

    LOG ($verbose_filter, "  candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count,
                                scalar(@subpages), $timeout, @candidates);
}
1610
1611
1612 \f
1613 ############################################################################
1614 #
1615 # Pick images by feeding random words into Hotbot
1616 #
1617 ############################################################################
1618
my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" .
                        "?ca=w" .
                        "&descriptiontype=0" .
                        "&imagetoggle=1" .
                        "&matchmode=any" .
                        "&nummod=2" .
                        "&recordcount=50" .
                        "&sitegroup=1" .
                        "&stem=1" .
                        "&cobrand=undefined" .
                        "&query=");

# Searches Hotbot for one random word, starting at a random result offset;
# keeps the result links that point off-site; then loads one of those
# pages at random and picks an image from it.
# Returns whatever pick_image_from_pages returns.
#
sub pick_from_hotbot_text($) {
  my ($timeout) = @_;

  $last_search = $hotbot_search_url;   # for warnings

  # lycos seems to always give us back dictionaries and word lists if
  # we search for more than one word...
  #
  my $words = random_word();

  my $start = int(rand(8)) * 10 + 1;
  my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  # Hotbot's own sites and those of its partners.
  my @builtins = (qr@[/.]hotbot\.com\b@i,
                  qr@[/.]lycos\.com\b@i,
                  qr@[/.]inktomi\.com\b@i);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Hotbot plays redirection games too
    # (not any more?)
#    next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
#    $u = url_decode($1);

    next unless ($u =~ m@^http://@i);    #  skip non-HTTP or relative URLs
    next if (grep { $u =~ $_ } @builtins);   # skip hotbot builtins

    LOG ($verbose_filter, "  candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count,
                                scalar(@subpages), $timeout, @candidates);
}
1667
1668
1669 \f
1670 ############################################################################
1671 #
1672 # Pick images by feeding random words into Lycos
1673 #
1674 ############################################################################
1675
my $lycos_search_url = "http://search.lycos.com/default.asp" .
                       "?lpv=1" .
                       "&loc=searchhp" .
                       "&tab=web" .
                       "&query=";

# Searches Lycos for one random word, starting at a random result offset;
# keeps the result links that point off-site; then loads one of those
# pages at random and picks an image from it.
# Returns whatever pick_image_from_pages returns.
#
sub pick_from_lycos_text($) {
  my ($timeout) = @_;

  $last_search = $lycos_search_url;   # for warnings

  # lycos seems to always give us back dictionaries and word lists if
  # we search for more than one word...
  #
  my $words = random_word();

  my $start = int(rand(8)) * 10 + 1;
  my $search_url = $lycos_search_url . $words . "&first=$start&page=more";

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  # Lycos's own sites and those of its partners.
  my @builtins = (qr@[/.]hotbot\.com\b@i,
                  qr@[/.]lycos\.com\b@i,
                  qr@[/.]terralycos\.com\b@i,
                  qr@[/.]inktomi\.com\b@i);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Lycos plays redirection games.
    # (not any more?)
#    next unless ($u =~ m@^http://click.lycos.com/director.asp
#                         .*
#                         \btarget=([^&]+)
#                         .*
#                        @x);
#    $u = url_decode($1);

    next unless ($u =~ m@^http://@i);    #  skip non-HTTP or relative URLs
    next if (grep { $u =~ $_ } @builtins);   # skip lycos builtins

    LOG ($verbose_filter, "  candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count,
                                scalar(@subpages), $timeout, @candidates);
}
1724
1725
1726 \f
1727 ############################################################################
1728 #
1729 # Pick images by feeding random words into news.yahoo.com
1730 #
1731 ############################################################################
1732
my $yahoo_news_url = "http://news.search.yahoo.com/search/news" .
                     "?c=news_photos" .
                     "&p=";

# yahoonews
#
# Searches Yahoo's news photos for one random word; keeps only the result
# links that point at stories on Yahoo's news site; then loads one of those
# pages at random and picks an image from it.
# Returns whatever pick_image_from_pages returns.
#
sub pick_from_yahoo_news_text($) {
  my ($timeout) = @_;

  $last_search = $yahoo_news_url;   # for warnings

  my $words = random_word();
  my $search_url = $yahoo_news_url . $words;

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $link (@subpages) {
    my $u = $link;

    # de-redirectize the URLs
    $u =~ s@^http://rds\.yahoo\.com/.*-http%3A@http:@s;

    # only accept URLs on Yahoo's news site
    my $on_news_site = ($u =~ m@^http://dailynews\.yahoo\.com/@i ||
                        $u =~ m@^http://story\.news\.yahoo\.com/@i);
    next if (!$on_news_site || $u !~ m@&u=/@);

    LOG ($verbose_filter, "  candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count,
                                scalar(@subpages), $timeout, @candidates);
}
1767
1768
1769 \f
1770 ############################################################################
1771 #
1772 # Pick images from LiveJournal's list of recently-posted images.
1773 #
1774 ############################################################################
1775
my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml";

# With most of our image sources, we get a random page and then select
# from the images on it.  However, in the case of LiveJournal, the page
# of images tends to update slowly; so we'll remember the last N entries
# on it and randomly select from those, to get a wider variety each time.

my $lj_cache_size = 1000;
my @lj_cache = (); # fifo, for ordering by age
my %lj_cache = (); # hash, for detecting dups

# livejournal
#
# Fetches LiveJournal's latest-images page, adds every not-yet-cached
# <recent-image> entry to the cache, then removes one randomly chosen
# entry from the cache and returns it as (page URL, image URL).
#
# A failed fetch is not fatal: as in the Twitpic and Twitter pickers, we
# still serve an entry from whatever the cache already holds.  Returns ()
# only when the cache is empty.
#
sub pick_from_livejournal_images($) {
  my ($timeout) = @_;

  $last_search = $livejournal_img_url;   # for warnings

  my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);

  # Update the cache.

  if ($body) {
    # Put each <recent-image> element on a line of its own.
    $body =~ s/\n/ /gs;
    $body =~ s/(<recent-image)\b/\n$1/gsi;

    foreach my $entry (split (/\n/, $body)) {
      next unless ($entry =~ m/^<recent-image\b/);
      next unless ($entry =~ m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
      my $img = html_unquote ($1);

      next if ($lj_cache{$img}); # already have it

      next unless ($entry =~ m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
      my $page = html_unquote ($1);
      my @pair = ($img, $page);
      LOG ($verbose_filter, "  candidate: $img");
      push @lj_cache, \@pair;
      $lj_cache{$img} = \@pair;
    }
  }

  # Pull from the cache.

  my $n = scalar (@lj_cache);
  return () if ($n == 0);

  my $i = int(rand($n));

  # Remove the chosen entry from both @lj_cache and %lj_cache.
  my ($picked) = splice (@lj_cache, $i, 1);
  my ($img, $page) = @$picked;
  delete $lj_cache{$img};

  # Keep the size of the cache under the limit by nuking older entries
  #
  while (scalar (@lj_cache) > $lj_cache_size) {
    my $oldest = shift @lj_cache;
    delete $lj_cache{$oldest->[0]};
  }

  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");

  return ($page, $img);
}
1838
1839 \f
1840 ############################################################################
1841 #
1842 # Pick images from ircimages.com (images that have been in the /topic of
1843 # various IRC channels.)
1844 #
1845 ############################################################################
1846
my $ircimages_url = "http://ircimages.com/";

# ircimages
#
# Loads a randomly numbered page of ircimages.com, collects every absolute
# http: link on it that ends in an image extension and does not point back
# at searchirc.com or ircimages.com, and returns one of them at random.
# The image URL is returned as both the page and the image.
# Returns () if the page could not be loaded or had no usable links.
#
sub pick_from_ircimages($) {
  my ($timeout) = @_;

  $last_search = $ircimages_url;   # for warnings

  my $n = int(rand(2900));
  my $search_url = $ircimages_url . "page-$n";

  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
  return () unless $body;

  my @candidates = ();

  # Put each <A> tag on a line of its own.
  $body =~ s/\n/ /gs;
  $body =~ s/(<A)\b/\n$1/gsi;

  foreach my $chunk (split (/\n/, $body)) {

    my ($u) = ($chunk =~ m@<A\s.*\bHREF\s*=\s*([^>]+)>@i);
    next unless $u;

    if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
    elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; }  # or token

    next unless ($u =~ m/^http:/i);

    # Skip the site's own links.  The alternation bar must not be
    # backslashed: in Perl "\|" is a literal pipe character, which made
    # this filter a no-op.
    next if ($u =~ m@^http://(searchirc\.com|ircimages\.com)@i);

    next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);

    LOG ($verbose_http, "    HREF: $u");
    push @candidates, $u;
  }

  my $count = scalar (@candidates);
  LOG ($verbose_filter, "$count links on $search_url");

  return () if ($count == 0);

  my $i = int(rand($count));
  my $img = $candidates[$i];

  LOG ($verbose_load, "picked image " .($i+1) . "/" . $count .
       ": $img");

  $search_url = $img;  # hmm...
  return ($search_url, $img);
}
1895
1896 \f
1897 ############################################################################
1898 #
1899 # Pick images from Twitpic's list of recently-posted images.
1900 #
1901 ############################################################################
1902
my $twitpic_img_url = "http://twitpic.com/public_timeline/feed.rss";

# With most of our image sources, we get a random page and then select
# from the images on it.  However, in the case of Twitpic, the page
# of images tends to update slowly; so we'll remember the last N entries
# on it and randomly select from those, to get a wider variety each time.

my $twitpic_cache_size = 1000;
my @twitpic_cache = (); # fifo, for ordering by age
my %twitpic_cache = (); # hash, for detecting dups

# twitpic
#
# Refreshes the cache of Twitpic page URLs from the public RSS feed, takes
# one cached page at random (removing it from the cache), loads that page,
# and returns (page URL, image URL) for the first <img> on it that is not
# under /js/ or /images/.  Returns () if the cache is empty or the page has
# no such image.
#
sub pick_from_twitpic_images($) {
  my ($timeout) = @_;

  $last_search = $twitpic_img_url;   # for warnings

  my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout);

  # Fold any feed items we have not seen yet into the cache.

  if ($body) {
    $body =~ s/\n/ /gs;
    $body =~ s/(<item)\b/\n$1/gsi;

    # Everything before the first <item> is feed preamble; drop it.
    my ($preamble, @items) = split (/\n/, $body);

    foreach my $item (@items) {
      my ($link) = ($item =~ m@<link>([^<>]*)</link>@si);
      next unless defined ($link);

      my $page = html_unquote ($link);
      $page =~ s@/$@@s;
      $page .= '/full';

      next if ($twitpic_cache{$page}); # already have it

      LOG ($verbose_filter, "  candidate: $page");
      push @twitpic_cache, $page;
      $twitpic_cache{$page} = $page;
    }
  }

  # Take one entry out of the cache.

  my $n = scalar (@twitpic_cache);
  return () if ($n == 0);

  my $i = int(rand($n));
  my $page = splice (@twitpic_cache, $i, 1);
  delete $twitpic_cache{$page};

  # Expire the oldest entries once the cache outgrows its limit.
  #
  while (scalar (@twitpic_cache) > $twitpic_cache_size) {
    my $oldest = shift @twitpic_cache;
    delete $twitpic_cache{$oldest};
  }

  # Load the chosen page and look for its picture.

  ( $base, $body ) = get_document ($page, undef, $timeout);
  $body = '' unless defined($body);

  my $img = undef;
  foreach my $chunk (split (/<img\s+/, $body)) {
    my ($src) = ($chunk =~ m/\bsrc=[\"\'](.*?)[\"\']/si);
    next unless $src;
    next if ($chunk =~ m@/js/@s || $chunk =~ m@/images/@s);

    $img = $src;

    $img = "http:$img" if ($img =~ m@^//@s);  # Oh come on

    # Sometimes these images are hosted on twitpic, sometimes on Amazon.
    if ($img =~ m@^/@) {
      $base =~ s@^(https?://[^/]+)/.*@$1@s;
      $img = $base . $img;
    }
    last;
  }

  if (!$img) {
    LOG ($verbose_load, "no matching images on $page\n");
    return ();
  }

  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");

  return ($page, $img);
}
1997
1998 \f
1999 ############################################################################
2000 #
2001 # Pick images from Twitter's list of recently-posted updates.
2002 #
2003 ############################################################################
2004
2005 # With most of our image sources, we get a random page and then select
2006 # from the images on it.  However, in the case of Twitter, the page
2007 # of images only updates once a minute; so we'll remember the last N entries
2008 # on it and randomly select from those, to get a wider variety each time.
2009
my $twitter_img_url = "http://api.twitter.com/1/statuses/" .
                      "public_timeline.json" .
                      "?include_entities=true" .
                      "&include_rts=true" .
                      "&count=200";

my $twitter_cache_size = 1000;

my @twitter_cache = (); # fifo, for ordering by age
my %twitter_cache = (); # hash, for detecting dups


# twitter
#
# Refreshes the cache from Twitter's public timeline (JSON), then removes
# one randomly chosen entry from the cache and returns it as
# (page URL, image URL).  Returns () if the cache is empty.
# Sets $suppress_audit before returning an entry.
#
sub pick_from_twitter_images($) {
  my ($timeout) = @_;

  $last_search = $twitter_img_url;   # for warnings

  my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout);

  # Update the cache.

  if ($body) {
    $body =~ s/[\r\n]+/ /gs;

    # Parsing JSON is a pain in the ass.  So we halfass it as usual.

    # Strip the outermost array brackets.  This needs /g, otherwise only
    # the leading "[" is removed.
    $body =~ s/^\s*\[|\]\s*$//gs;

    # Inside each nested [...] turn "}," into "} ", so that the split
    # below only breaks between top-level objects.  The work is done on a
    # lexical copy so the global $_ is left alone.
    my $protect = sub {
      my ($inner) = @_;
      $inner =~ s@\},@\} @gs;
      return $inner;
    };
    $body =~ s/(\[.*?\])/$protect->($1)/gse;

    # Braces are escaped: newer perls deprecate or reject a bare "{" here.
    foreach my $item (split (/\},\{/, $body)) {
      my ($name) = ($item =~ m@"screen_name":"([^\"]+)"@si);
      my ($img)  = ($item =~ m@"media_url":"([^\"]+)"@si);
      my ($page) = ($item =~ m@"display_url":"([^\"]+)"@si);
      next unless ($name && $img && $page);

      # Un-escape the JSON strings and make sure both are absolute URLs.
      foreach my $u ($img, $page) {
        $u =~ s/\\//gs;
        $u = "http://$u" unless ($u =~ m/^http/si);
      }

      next if ($twitter_cache{$page}); # already have it

      LOG ($verbose_filter, "  candidate: $page - $img");
      push @twitter_cache, $page;
      $twitter_cache{$page} = $img;
    }
  }

  # Pull from the cache.

  my $n = scalar (@twitter_cache);
  return () if ($n == 0);

  my $i = int(rand($n));

  # Remove the chosen entry from both @twitter_cache and %twitter_cache.
  my $page = splice (@twitter_cache, $i, 1);
  my $url  = delete $twitter_cache{$page};

  # Keep the size of the cache under the limit by nuking older entries
  #
  while (scalar (@twitter_cache) > $twitter_cache_size) {
    my $oldest = shift @twitter_cache;
    delete $twitter_cache{$oldest};
  }

  LOG ($verbose_load, "picked page $url");

  $suppress_audit = 1;

  return ($page, $url);
}
2084
2085 \f
2086 ############################################################################
2087 #
2088 # Pick images from Flickr's page of recently-posted photos.
2089 #
2090 ############################################################################
2091
my $flickr_img_url = "http://www.flickr.com/photos/";

# Like LiveJournal, the Flickr page of images tends to update slowly,
# so remember the last N entries on it and randomly select from those.

# I know that Flickr has an API (http://www.flickr.com/services/api/)
# but it was easy enough to scrape the HTML, so I didn't bother exploring.

my $flickr_cache_size = 1000;
my @flickr_cache = (); # fifo, for ordering by age
my %flickr_cache = (); # hash, for detecting dups


# flickr_recent
#
# Scrapes one randomly chosen page of Flickr's recent-photos listing, adds
# every not-yet-cached thumbnail link to the cache, then removes one
# randomly chosen cache entry and returns it as (page URL, image URL).
#
# A failed fetch is not fatal: as in the Twitpic and Twitter pickers, we
# still serve an entry from whatever the cache already holds.  Returns ()
# only when the cache is empty.
#
sub pick_from_flickr_recent($) {
  my ($timeout) = @_;

  my $start = 16 * int(rand(100));

  $last_search = $flickr_img_url;   # for warnings
  $last_search .= "?start=$start" if ($start > 0);

  my ( $base, $body ) = get_document ($last_search, undef, $timeout);

  # Update the cache.

  if ($body) {
    # Put each <a> tag on a line of its own.
    $body =~ s/[\r\n]/ /gs;
    $body =~ s/(<a)\b/\n$1/gsi;

    my $site = "http://www.flickr.com/";

    foreach my $chunk (split (/\n/, $body)) {
      my ($page, $thumb) =
        ($chunk =~ m@<A \s [^<>]* \b HREF=\"([^<>\"]+)\" [^<>]* > \s*
                     <IMG \s [^<>]* \b SRC=\"([^<>\"]+)\" @xsi);
      next unless defined ($thumb);
      $page = html_unquote ($page);
      $thumb = html_unquote ($thumb);

      # Only thumbnails served from Flickr's static farm are photos.
      next unless ($thumb =~ m@^http://farm\d*\.static\.?flickr\.com/@);

      $page =~ s@^/@$site@;     # make site-relative page links absolute

      my $img = $thumb;
      $img =~ s/_[a-z](\.[a-z\d]+)$/$1/si;  # take off "thumb" suffix

      next if ($flickr_cache{$img}); # already have it

      my @pair = ($img, $page, $start);
      LOG ($verbose_filter, "  candidate: $img");
      push @flickr_cache, \@pair;
      $flickr_cache{$img} = \@pair;
    }
  }

  # Pull from the cache.

  my $n = scalar (@flickr_cache);
  return () if ($n == 0);

  my $i = int(rand($n));

  # Remove the chosen entry from both @flickr_cache and %flickr_cache.
  my ($picked) = splice (@flickr_cache, $i, 1);
  my ($img, $page) = @$picked;
  delete $flickr_cache{$img};

  # Keep the size of the cache under the limit by nuking older entries
  #
  while (scalar (@flickr_cache) > $flickr_cache_size) {
    my $oldest = shift @flickr_cache;
    delete $flickr_cache{$oldest->[0]};
  }

  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");

  return ($page, $img);
}
2172
2173 \f
2174 ############################################################################
2175 #
2176 # Pick images from a random RSS feed on Flickr.
2177 #
2178 ############################################################################
2179
my $flickr_rss_base = ("http://www.flickr.com/services/feeds/photos_public.gne".
                       "?format=rss_200_enc&tagmode=any&tags=");

# Picks a random RSS feed; picks a random image from that feed;
# returns 2 URLs: the page containing the image, and the image.
# Returns () if the feed could not be loaded or no image was picked.
# Sets $suppress_audit.
# Mostly by Joe Mcmahon <mcmahon@yahoo-inc.com>
#
# flickr_random
sub pick_from_flickr_random($) {
  my $timeout = shift;

  my $words = random_words(',');
  my $rss = $flickr_rss_base . $words;
  $last_search = $rss;

  # Space-separated copy of the words, for the log only.  This is a
  # lexical rather than $_, so the caller's $_ is not overwritten.
  my $shown = $words;
  $shown =~ s/,/ /g;

  print STDERR "\n\n" if ($verbose_load);
  LOG ($verbose_load, "words: $shown");
  LOG ($verbose_load, "URL: $last_search");

  $suppress_audit = 1;

  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
  return () if (!$base || !$body);

  my $img;
  ($base, $img) = pick_image_from_rss ($base, $body);
  $body = undef;
  return () unless defined ($img);

  LOG ($verbose_load, "redirected to: $base");
  return ($base, $img);
}
2218
2219 \f
2220 ############################################################################
2221 #
2222 # Pick images by waiting for driftnet to populate a temp dir with files.
2223 # Requires driftnet version 0.1.5 or later.
2224 # (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet
2225 # for images being downloaded by others.)
2226 # Driftnet/webcollage integration by jwz.
2227 #
2228 ############################################################################
2229
2230 # driftnet
# Polls $driftnet_dir until a non-dotfile shows up in it or the timeout
# (default $http_timeout) expires.  Returns ($driftnet_magic, full path of
# the file), or () on timeout.  Exits via error() if the directory is unset
# or unreadable.
#
sub pick_from_driftnet($) {
  my ($timeout) = @_;

  my $id = $driftnet_magic;
  my $dir = $driftnet_dir;
  my $start = time;

  error ("\$driftnet_dir unset?") unless ($dir);
  $dir =~ s@/+$@@;

  error ("$dir unreadable") unless (-d "$dir/.");

  $timeout = $http_timeout unless ($timeout);
  $last_search = $id;

  while (time < $start + $timeout) {
    # The handle needs its own name: calling it $dir shadowed the path,
    # so the file returned below was "GLOB(0x...)/name".
    opendir (my $dh, $dir) || error ("$dir: $!");
    while (defined (my $file = readdir ($dh))) {
      next if ($file =~ m/^\./);
      $file = "$dir/$file";
      closedir ($dh);
      LOG ($verbose_load, "picked file $file ($id)");
      return ($id, $file);
    }
    closedir ($dh);

    # Nothing there yet: pause briefly instead of spinning on opendir.
    select (undef, undef, undef, 0.25);
  }
  LOG (($verbose_net || $verbose_load), "timed out for $id");
  return ();
}
2261
2262
# Reads the whole of the given file, which must lie under $driftnet_dir,
# deletes it, and returns ($driftnet_magic, file contents).
# Any failure exits via error().
#
sub get_driftnet_file($) {
  my ($file) = @_;

  if (! $driftnet_dir) {
    error ("\$driftnet_dir unset?");
  }

  my $id = $driftnet_magic;
  if ($file !~ m@^\Q$driftnet_dir@o) {
    error ("$id: $file not in $driftnet_dir?");
  }

  my $in;
  open ($in, '<', $file) || error ("$id: $file: $!");

  local $/ = undef;  # read entire file
  my $contents = '';
  $contents = <$in>;

  close ($in)    || error ("$id: $file: $!");
  unlink ($file) || error ("$id: $file: rm: $!");

  return ($id, $contents);
}
2280
2281
# Creates a private scratch directory (stored in $driftnet_dir), then forks
# a child that runs the given driftnet command pointed at that directory.
# The child's pid is added to @pids_to_kill.  If $cmd contains no
# whitespace it is taken to be just the executable, and the arguments of
# $default_driftnet_cmd are appended.  Exits via error() if the directory
# cannot be made, the fork fails, or the child is gone after one second.
#
sub spawn_driftnet($) {
  my ($cmd) = @_;

  # make a directory to use.  TMPDIR is what the rest of this file honors;
  # TEMPDIR is still accepted for compatibility.
  my $tmp = $ENV{TMPDIR} || $ENV{TEMPDIR} || "/tmp";
  while (1) {
    # $tmp is passed as an argument so a "%" in it can't corrupt the format.
    $driftnet_dir = sprintf ("%s/driftcollage-%08x", $tmp, rand(0xffffffff));
    LOG ($verbose_exec, "mkdir $driftnet_dir");
    last if mkdir ($driftnet_dir, 0700);
    # Only a name collision is worth retrying; anything else (read-only
    # or missing temp directory, ...) would loop forever.
    error ("mkdir $driftnet_dir: $!") unless ($!{EEXIST});
  }

  if (! ($cmd =~ m/\s/)) {
    # if the command didn't have any arguments in it, then it must be just
    # a pointer to the executable.  Append the default args to it.
    my $dargs = $default_driftnet_cmd;
    $dargs =~ s/^[^\s]+//;
    $cmd .= $dargs;
  }

  # point the driftnet command at our newly-minted private directory.
  #
  $cmd .= " -d $driftnet_dir";
  $cmd .= ">/dev/null" unless ($verbose_exec);

  my $pid = fork();

  # fork returns undef on failure, never a negative number.
  error ("fork: $!") unless (defined ($pid));

  if ($pid) {
    # parent fork
    push @pids_to_kill, $pid;
    LOG ($verbose_exec, "forked for \"$cmd\"");
  } else {
    # child fork: run the command, and never fall back into the parent's
    # code path once it has finished.  nontrapping_system() returns 0 on
    # success, so a non-zero value is the failure case.
    my $rc = nontrapping_system ($cmd);
    error ("exec: \"$cmd\" failed ($rc)") if ($rc != 0);
    exit (0);
  }

  # wait a bit, then make sure the process actually started up.
  #
  sleep (1);
  error ("pid $pid failed to start \"$cmd\"")
    unless (1 == kill (0, $pid));
}
2323
2324 # local-directory
# Asks the external "xscreensaver-getimage-file" program to choose a file
# from $local_dir.  Returns ($local_magic, path of the file), or () if the
# program produced no output.  Exits via error() if $local_dir is unset or
# is not a directory.
#
sub pick_from_local_dir($) {
  my ($timeout) = @_;

  my $id = $local_magic;
  $last_search = $id;

  my $dir = $local_dir;
  error ("\$local_dir unset?") unless ($dir);
  $dir =~ s@/+$@@;

  error ("$dir unreadable") unless (-d "$dir/.");

  # Run the helper without going through a shell, so that quotes, "$" or
  # backticks in the directory name are passed through literally.
  my @cmd = ("xscreensaver-getimage-file");
  push @cmd, "-v" if ($verbose_exec);
  push @cmd, $dir;

  my $pick = '';
  if (open (my $pipe, '-|', @cmd)) {
    local $/ = undef;  # read all of the output
    my $out = <$pipe>;
    close ($pipe);
    $pick = $out if (defined ($out));
  }
  $pick =~ s/\s+$//s;

  # No output means the helper failed or found nothing; don't hand back
  # the bare directory as if it were a file.
  if ($pick eq '') {
    LOG (($verbose_exec || $verbose_load), "$cmd[0] picked nothing in $dir");
    return ();
  }

  $pick = "$dir/$pick" unless ($pick =~ m@^/@s);       # relative path

  LOG ($verbose_load, "picked file $pick ($id)");
  return ($id, $pick);
}
2345
2346
# Reads the whole of the given file, which must lie under $local_dir, and
# returns ($local_magic, file contents).  Unlike the driftnet variant the
# file is left in place.  Any failure exits via error().
#
sub get_local_file($) {
  my ($file) = @_;

  if (! $local_dir) {
    error ("\$local_dir unset?");
  }

  my $id = $local_magic;
  if ($file !~ m@^\Q$local_dir@o) {
    error ("$id: $file not in $local_dir?");
  }

  my $in;
  open ($in, '<', $file) || error ("$id: $file: $!");

  local $/ = undef;  # read entire file
  my $contents = <$in>;

  close ($in) || error ("$id: $file: $!");
  return ($id, $contents);
}
2362
2363
2364 \f
2365 ############################################################################
2366 #
2367 # Pick a random image in a random way
2368 #
2369 ############################################################################
2370
2371
2372 # Picks a random image on a random page, and returns two URLs:
2373 # the page containing the image, and the image.
2374 # Returns () if nothing found this time.
2375 #
2376
# Rolls a number in 0..99 and walks @search_methods, a flat list of
# (percentage, name, function) triples, choosing the first method whose
# cumulative percentage exceeds the roll.  The percentages must total 100,
# otherwise this exits via error().  Records the attempt, then returns
# whatever the chosen function returns for $timeout.
#
sub pick_image(;$) {
  my ($timeout) = @_;

  $current_state = "select";
  $load_method = "none";

  my $roll = int(rand(100));
  my $chosen = undef;
  my $sum = 0;

  for (my $k = 0; $k < scalar (@search_methods); $k += 3) {
    my ($pct, $name, $tfn) = @search_methods[$k .. $k+2];
    $sum += $pct;

    # Keep walking after a choice is made: the full total is checked below.
    if (!defined($chosen) && $sum > $roll) {
      $chosen = $tfn;
      $current_state = $name;
      $load_method = $current_state;
    }
  }

  error ("internal error: \@search_methods totals to $sum%!")
    if ($sum != 100);

  record_attempt ($current_state);
  return $chosen->($timeout);
}
2407
2408
2409 \f
2410 ############################################################################
2411 #
2412 # Statistics and logging
2413 #
2414 ############################################################################
2415
# Returns the current local time as "HH:MM:SS: " (with the trailing colon
# and space), for use as a log-line prefix.
#
sub timestr() {
  my @now = localtime;
  return strftime ("%H:%M:%S: ", @now);
}
2419
# Returns the standard message prefix: program name, timestamp, and the
# current state, each followed by ": ".
#
sub blurb() {
  return join ('', "$progname: ", timestr(), "$current_state: ");
}
2423
# Prints the message to stderr, prefixed with blurb(), and exits the
# program with status 1.  Does not return.
#
sub error($) {
  my $msg = shift;
  my $line = blurb() . "$msg\n";
  print STDERR $line;
  exit (1);
}
2429
# Prints the current call stack to stderr, one "file#line, sub" entry per
# frame, starting with the frame that called our caller.
#
sub stacktrace() {
  print STDERR "$progname: stack trace:\n";
  for (my $depth = 1; ; $depth++) {
    my ($package, $filename, $line, $subroutine) = caller($depth);
    last if (!defined($package));
    $filename =~ s@^.*/@@;      # basename only
    print STDERR "  $filename#$line, $subroutine\n";
  }
}
2440
2441
# Every LOG() line since the last clearlog()/showlog(), newline-separated.
my $lastlog = "";

# Throws away the accumulated log text.
#
sub clearlog() {
  return ($lastlog = "");
}
2447
# Dumps the accumulated log to stderr, one line at a time, each prefixed
# with "<progname>: DEBUG: ", and then empties it.
#
sub showlog() {
  my $head = "$progname: DEBUG: ";
  my @entries = split (/\n/, $lastlog);
  foreach my $entry (@entries) {
    print STDERR $head . $entry . "\n";
  }
  $lastlog = "";
}
2455
# Appends a timestamped, state-tagged line to the in-memory log.  If the
# first argument is true, the same line is also printed to stderr, prefixed
# with the program name.
#
sub LOG($$) {
  my ($echo, $text) = @_;
  my $entry = timestr() . "$current_state: " . "$text\n";
  $lastlog .= $entry;
  print STDERR "$progname: $entry" if $echo;
}
2462
2463
# Per-method counters, keyed by method name.
my (%stats_attempts, %stats_successes, %stats_elapsed);

my $last_state = undef;

# Marks the start of an attempt with the named method.  If the previous
# attempt never succeeded it is first recorded as a failure.  Then the log
# is cleared, the periodic performance report is given a chance to run,
# the timer for this method is started, and the per-attempt flags
# ($image_succeeded, $suppress_audit) are reset.
#
sub record_attempt($) {
  my ($name) = @_;

  record_failure($last_state)
    if ($last_state && !($image_succeeded > 0));
  $last_state = $name;

  clearlog();
  report_performance();

  start_timer($name);
  ($image_succeeded, $suppress_audit) = (0, 0);
}
2484
# Counts a success for the named method, stops its timer, remembers the
# URL via save_recent_url() (with $current_state temporarily set to the
# method name), flags the current attempt as succeeded and clears the log.
#
sub record_success($$$) {
  my ($name, $url, $base) = @_;

  $stats_successes{$name} = ($stats_successes{$name} || 0) + 1;

  stop_timer ($name, 1);

  my $saved_state = $current_state;
  $current_state = $name;
  save_recent_url ($url, $base);
  $current_state = $saved_state;

  $image_succeeded = 1;
  clearlog();
}
2501
2502
# Records that the attempt with the named method failed.  Does nothing if
# the attempt already succeeded.  Stops the method's timer; and, when
# $verbose_load is on and $verbose_exec is off, dumps the full audit log to
# stderr (or just a one-line notice if $suppress_audit is set).
#
sub record_failure($) {
  my ($name) = @_;

  return if $image_succeeded;

  stop_timer ($name, 0);
  if ($verbose_load && !$verbose_exec) {

    if ($suppress_audit) {
      print STDERR "$progname: " . timestr() . "(audit log suppressed)\n";
      return;
    }

    # Report under the "DEBUG" state, then put the real state back.
    my $saved_state = $current_state;
    $current_state = "DEBUG";

    my $top_rule    = "#" x 78;
    my $bottom_rule = "-" x 78;

    print STDERR "\n\n\n";
    print STDERR $top_rule . "\n";
    print STDERR blurb() . "failed to get an image.  Full audit log:\n";
    print STDERR "\n";
    showlog();
    print STDERR $bottom_rule . "\n";
    print STDERR "\n\n";

    $current_state = $saved_state;
  }
  $image_succeeded = 0;
}
2532
2533
2534
# Returns the success rate of the named method as a whole-number
# percentage string such as "42%"; "0%" if it has never been attempted.
#
sub stats_of($) {
  my ($name) = @_;
  my $wins  = $stats_successes{$name} || 0;
  my $tries = $stats_attempts{$name}  || 0;
  my $pct = ($tries ? int($wins * 100 / $tries) : "0");
  return "$pct%";
}
2543
2544
# Time at which the attempt currently being timed began.
my $current_start_time = 0;

# Notes the current time as the start of an attempt, counts one more
# attempt for the named method, and makes sure the method has an
# elapsed-time entry.
#
sub start_timer($) {
  my ($name) = @_;
  $current_start_time = time;

  $stats_attempts{$name} = ($stats_attempts{$name} || 0) + 1;
  $stats_elapsed{$name} = 0 unless (defined($stats_elapsed{$name}));
}
2560
# Adds the seconds elapsed since the last start_timer() to the named
# method's running total.  The second argument is accepted but not used.
#
sub stop_timer($$) {
  my ($name, $success) = @_;
  my $elapsed = time - $current_start_time;
  $stats_elapsed{$name} += $elapsed;
}
2565
2566
my $last_report_time = 0;

# When $verbose_warnings is on, prints a per-method table of success rate
# and average seconds per attempt to stderr, at most once every
# $report_performance_interval seconds.  The first time the interval check
# passes only the timestamp is recorded; nothing is printed.
#
sub report_performance() {

  return unless $verbose_warnings;

  my $now = time;
  if ($now < $last_report_time + $report_performance_interval) {
    return;
  }
  my $previous = $last_report_time;
  $last_report_time = $now;

  return if ($previous == 0);

  my $blurb = "$progname: " . timestr();

  print STDERR "\n";
  print STDERR "${blurb}Current standings:\n";

  foreach my $name (sort keys (%stats_attempts)) {
    my $tries = $stats_attempts{$name};
    my $wins  = $stats_successes{$name} || 0;
    my $pct   = int($wins * 100 / $tries);
    my $secs_per_try = $stats_elapsed{$name} / $tries;
    printf STDERR ("$blurb %-14s %4s (%d/%d);" .
                   "       \t %.1f secs/link\n",
                   "$name:", "$pct%", $wins, $tries, $secs_per_try);
  }
}
2595
2596
2597
my $max_recent_images = 400;
my $max_recent_sites  = 20;
my @recent_images = ();
my @recent_sites = ();

# Remembers the given image URL and its host in the recent-images and
# recent-sites lists, and (only when $verbose_warnings is on) prints a
# warning to stderr if the same image, or the same site, was already seen
# recently.  Sites listed in %warningless_sites, by full host name or by
# their last three or two domain components, never get the site warning.
#
sub save_recent_url($$) {
  my ($url, $base) = @_;

  return unless ($verbose_warnings);

  # Match against $url directly: assigning it to $_ would overwrite the
  # caller's $_ (and, inside a foreach, the element it is aliased to).
  my ($site) = ($url =~ m@^http://([^ \t\n\r/:]+)@);
  return unless defined ($site);

  # NOTE(review): driftnet/local "URLs" look like plain file paths, in
  # which case the http:// match above has already returned and this
  # branch is never reached -- confirm before relying on it.
  if ($base eq $driftnet_magic || $base eq $local_magic) {
    $site = $base;
    @recent_images = ();
  }

  my $done = 0;
  foreach my $seen (@recent_images) {
    if ($seen eq $url) {
      print STDERR blurb() . "WARNING: recently-duplicated image: $url" .
        " (on $base via $last_search)\n";
      $done = 1;
      last;
    }
  }

  # suppress "duplicate site" warning via %warningless_sites.
  #
  if ($warningless_sites{$site}) {
    $done = 1;
  } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
           $warningless_sites{$1}) {
    $done = 1;
  } elsif ($site =~ m@([^.]+\.[^.]+)$@ &&
           $warningless_sites{$1}) {
    $done = 1;
  }

  if (!$done) {
    foreach my $seen (@recent_sites) {
      if ($seen eq $site) {
        print STDERR blurb() . "WARNING: recently-duplicated site: $site" .
        " ($url on $base via $last_search)\n";
        last;
      }
    }
  }

  # Both lists are bounded FIFOs.
  push @recent_images, $url;
  push @recent_sites,  $site;
  shift @recent_images if ($#recent_images >= $max_recent_images);
  shift @recent_sites  if ($#recent_sites  >= $max_recent_sites);
}
2654
2655
2656 \f
2657 ##############################################################################
2658 #
2659 # other utilities
2660 #
2661 ##############################################################################
2662
2663 # Does %-decoding.
2664 #
# Returns a copy of the argument with "+" turned into a space and each
# %XX hex escape turned into the corresponding byte.
#
sub url_decode($) {
  # Work on a lexical copy.  Assigning to the global $_ would overwrite
  # the caller's $_, and inside "foreach (@list)" that means overwriting
  # the list element itself.
  my ($str) = @_;
  $str =~ tr/+/ /;
  $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
  return $str;
}
2671
2672
2673 # Given the raw body of a GIF document, returns the dimensions of the image.
2674 #
# Returns (width, height) read from the header of a GIF87a or GIF89a
# document, or () if the data does not start with a GIF signature or is
# too short to hold the dimensions.
#
sub gif_size($) {
  my ($body) = @_;

  # 6 signature bytes, then width and height as 16-bit little-endian.
  return () unless (defined ($body) && length ($body) >= 10);

  # The class is [79]: the old "[7,9]" also accepted a literal comma.
  return () unless (substr ($body, 0, 6) =~ m/^GIF8[79]a$/);

  my ($w, $h) = unpack ("v2", substr ($body, 6, 4));
  return ($w, $h);
}
2685
2686 # Given the raw body of a JPEG document, returns the dimensions of the image.
2687 #
# Returns (width, height) read from the first SOFn segment of a JPEG
# document, or () if the data is not a JPEG, is truncated, or has no SOFn
# segment before the start-of-scan marker.
#
sub jpeg_size($) {
  my ($body) = @_;
  return () unless (defined ($body));

  my $L = length($body);

  # A JPEG starts with the SOI marker, FF D8.
  return () unless ($L >= 2 && substr ($body, 0, 2) eq "\xFF\xD8");
  my $i = 2;

  my $ch = "0";
  while (ord($ch) != 0xDA && $i < $L) {
    # Find next marker, beginning with 0xFF.
    while (ord($ch) != 0xFF) {
      return () if ($L <= $i);
      $ch = substr($body, $i, 1); $i++;
    }
    # markers can be padded with any number of 0xFF.
    while (ord($ch) == 0xFF) {
      return () if ($L <= $i);
      $ch = substr($body, $i, 1); $i++;
    }

    # $ch contains the value of the marker.
    my $marker = ord($ch);

    if (($marker >= 0xC0) &&
        ($marker <= 0xCF) &&
        ($marker != 0xC4) &&      # DHT
        ($marker != 0xC8) &&      # JPG (reserved), not a frame header
        ($marker != 0xCC)) {      # DAC
      # it's a SOFn marker: skip the 2-byte length and 1-byte precision,
      # then read height and width as 16-bit big-endian.
      $i += 3;
      return () if ($L < $i + 4);   # all four bytes must be present
      my ($h, $w) = unpack ("n2", substr ($body, $i, 4));
      return ($w, $h);

    } else {
      # We must skip variables, since FFs in variable names aren't
      # valid JPEG markers.
      return () if ($L < $i + 2);   # both length bytes must be present
      my $length = unpack ("n", substr ($body, $i, 2)); $i += 2;
      return () if ($length < 2);
      $i += $length-2;
    }
  }
  return ();
}
2736
2737 # Given the raw body of a PNG document, returns the dimensions of the image.
2738 #
# Returns (width, height) read from the IHDR chunk of a PNG document, or
# () if the data does not start with a PNG signature, is shorter than 24
# bytes, or does not have "IHDR" at offset 12.
#
sub png_size($) {
  my ($body) = @_;

  if ($body !~ m/^\211PNG\r/) { return (); }
  if (length ($body) < 24)    { return (); }

  # Bytes 12..23: chunk type, then width and height as 32-bit big-endian.
  my $header = substr ($body, 12, 12);
  if ($header !~ /^IHDR/) { return (); }

  my ($type, $width, $height) = unpack ("a4N2", $header);
  return ($width, $height);
}
2748
2749
2750 # Given the raw body of a GIF, JPEG, or PNG document, returns the dimensions
2751 # of the image.
2752 #
# Tries the data as a GIF, then as a JPEG, and returns the first
# (width, height) in which both values are non-zero.  If neither yields
# one, returns whatever png_size() makes of the data.
#
sub image_size($) {
  my ($body) = @_;

  foreach my $probe (\&gif_size, \&jpeg_size) {
    my ($w, $h) = $probe->($body);
    return ($w, $h) if ($w && $h);
  }

  return png_size ($body);
}
2761
2762
2763 # returns the full path of the named program, or undef.
2764 #
# Searches the directories of $PATH, in order, for an executable file of
# the given name.  Returns its full path, or undef if there is none (or if
# $PATH is not set).
#
sub which($) {
  my ($prog) = @_;
  return undef unless (defined ($ENV{PATH}));

  foreach my $dir (split (/:/, $ENV{PATH})) {
    my $path = "$dir/$prog";
    # -x alone is also true for searchable directories.
    return $path if (-x $path && ! -d $path);
  }
  return undef;
}
2774
2775
2776 # Like rand(), but chooses numbers with a bell curve distribution.
# Like rand(), but the result is the sum of three rand() calls over a
# third of the range, so values cluster around the middle.  The range
# defaults to 1.0.
#
sub bellrand(;$) {
  # A lexical, not the global $_, so the caller's $_ is left untouched.
  my ($range) = @_;
  $range = 1.0 unless defined($range);
  $range /= 3.0;
  return (rand($range) + rand($range) + rand($range));
}
2783
2784
# Runs x_cleanup(), announces the exit if $verbose_warnings is on, and
# sends SIGTERM to every pid in @pids_to_kill.
#
sub exit_cleanup() {
  x_cleanup();

  if ($verbose_warnings) {
    print STDERR "$progname: exiting\n";
  }

  if (@pids_to_kill) {
    my $victims = join(' ', @pids_to_kill);
    print STDERR blurb() . "killing: " . $victims . "\n";
    kill ('TERM', @pids_to_kill);
  }
}
2793
# Reports which signal was caught (or just "exiting." when called with
# undef) if $verbose_exec or $verbose_warnings is on, then exits with
# status 1.  Does not return.
#
sub signal_cleanup($) {
  my ($sig) = @_;

  if ($verbose_exec || $verbose_warnings) {
    my $what = (defined($sig) ? "caught signal $sig." : "exiting.");
    print STDERR blurb() . $what . "\n";
  }

  exit 1;
}
2803
2804
2805
2806 ##############################################################################
2807 #
2808 # Generating a list of urls only
2809 #
2810 ##############################################################################
2811
# Loops forever, printing one "image-URL page-URL" line to stdout for each
# image that pick_image() finds.  Spaces in either URL are written as %20.
# Never returns.
#
sub url_only_output() {
  while (1) {
    my ($base, $img) = pick_image ();
    next unless ($img);

    foreach my $u ($base, $img) {
      $u =~ s/ /%20/g;
    }
    print "$img $base\n";
  }
}
2822
2823 ##############################################################################
2824 #
2825 # Running as an xscreensaver module, or as a web page imagemap
2826 #
2827 ##############################################################################
2828
# Scratch files: the collage as accumulated so far, plus two work files.
# Each lives in $TMPDIR (or /tmp if that is unset or empty) and has a
# random 8-digit hex number in its name.
#
my ($image_ppm, $image_tmp1, $image_tmp2) =
  map { sprintf ($_,
                 ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp"),
                 rand(0xFFFFFFFF)) }
      ("%s/webcollage-%08x.ppm",
       "%s/webcollage-1-%08x.ppm",
       "%s/webcollage-2-%08x.ppm");

# Set from the -filter, -filter2 and -background command-line options.
my ($filter_cmd, $post_filter_cmd, $background);

# State used by update_imagemap(): the <AREA> tags emitted so far (newest
# first), and the names of the temp files that the html and jpg are
# written to before being renamed into place.
my @imagemap_areas = ();
my ($imagemap_html_tmp, $imagemap_jpg_tmp);


# size of the image being generated.
my ($img_width, $img_height);

# Seconds to sleep between images; set by -delay.
my $delay = 2;
2852
# Deletes the scratch image files, and also the imagemap temp files if
# update_imagemap() has chosen names for them.
#
sub x_cleanup() {
  unlink ($image_ppm, $image_tmp1, $image_tmp2);

  if (defined ($imagemap_html_tmp)) {
    unlink ($imagemap_html_tmp, $imagemap_jpg_tmp);
  }
}
2858
2859
# Like system, but prints status about exit codes, and kills this process
# with whatever signal killed the sub-process, if any.
#
# Takes the same arguments as system().  Returns 0 if the command exited
# normally with status 0; the command's exit status if it exited with a
# non-zero status; the signal number if it was killed by a signal (though
# in that case we normally die of the same signal before returning); or
# -1 if the command could not be run at all.
#
sub nontrapping_system(@) {
  # Work on a copy of the arguments.  @_ aliases the caller's variables, so
  # if the caller passed $_ (or anything else we touch) an in-place edit
  # here would change the command that actually gets executed.
  my @cmd = @_;

  $! = 0;

  # For the log message only, abbreviate any double-quoted strings.
  my $printable = join (" ", @cmd);
  $printable =~ s/\"[^\"]+\"/\"...\"/g;

  LOG ($verbose_exec, "executing \"$printable\"");

  my $rc = system @cmd;

  if ($rc == -1) {
    # system() could not start the command: there is no wait status to
    # decode, and no signal for us to die of.
    LOG ($verbose_exec, "subproc failed to execute: $!");
  } elsif ($rc == 0) {
    LOG ($verbose_exec, "subproc exited normally.");
  } elsif (($rc & 0xff) == 0) {
    # Low byte clear: a normal exit, with the status in the high byte.
    $rc >>= 8;
    LOG ($verbose_exec, "subproc exited with status $rc.");
  } else {
    # Low byte set: killed by a signal; bit 0x80 flags a core dump.
    if ($rc & 0x80) {
      LOG ($verbose_exec, "subproc dumped core.");
      $rc &= ~0x80;
    }
    LOG ($verbose_exec, "subproc died with signal $rc.");
    # die that way ourselves.
    kill $rc, $$;
  }

  return $rc;
}
2890
2891
# Converts an image to PNM.  The type of the image (GIF, JPEG, or PNG) is
# determined by sniffing $body, and the data is then piped through giftopnm,
# djpeg, or pngtopnm, with the converter's output written to the file
# $output.  $url is used only in log messages.
#
# Returns (width, height), as reported by the size sniffer, if the
# conversion produced output; otherwise returns the empty list.
#
sub image_to_pnm($$$) {
  my ($url, $body, $output) = @_;
  my ($cmd, $w, $h);

  # Try each sniffer in turn: the first one that reports a size decides
  # which converter program gets used.
  #
  foreach my $fmt ([ \&gif_size,  "giftopnm" ],
                   [ \&jpeg_size, "djpeg"    ],
                   [ \&png_size,  "pngtopnm" ]) {
    my ($sizer, $converter) = @$fmt;
    my @dims = $sizer->($body);
    next unless (@dims);
    ($w, $h) = @dims;
    $cmd = $converter;
    last;
  }

  if (!defined ($cmd)) {
    my $htmlish = ($body =~ m@<(base|html|head|body|script|table|a href)\b@i);
    LOG (($verbose_pbm || $verbose_load),
         "not a GIF, JPG, or PNG" .
         ($htmlish ? " (looks like HTML)" : "") .
         ": $url");
    $suppress_audit = 1;
    return ();
  }

  # yes, the "exec" really is necessary.  if we don't do this, the process
  # doesn't die properly.
  my $shell_cmd = "exec $cmd";

  # Unless we're being verbose, discard the converter's stderr:
  #
  # We get a "giftopnm: got a 'Application Extension' extension"
  # warning any time it's an animgif.
  #
  # Note that "giftopnm: EOF / read error on image data" is not
  # always a fatal error -- sometimes the image looks fine anyway.
  #
  $shell_cmd .= " 2>/dev/null" unless ($verbose_pbm);

  # There exist corrupted GIF and JPEG files that can make giftopnm and
  # djpeg lose their minds and go into a loop.  So this gives those programs
  # a small timeout -- if they don't complete in time, kill them.
  #
  my $pid;
  my @result = eval {
    my $timed_out;

    local $SIG{ALRM}  = sub {
      LOG ($verbose_pbm,
           "timed out ($cvt_timeout) for $cmd on \"$url\" in pid $pid");
      kill ('TERM', $pid) if ($pid);
      $timed_out = 1;
      $body = undef;
    };

    $pid = open (my $pipe, "| $shell_cmd > $output");
    if (!$pid) {
      print STDERR blurb() . "$cmd failed: $!\n";
      return ();              # this leaves the eval, not the sub.
    }

    $timed_out = 0;
    alarm $cvt_timeout;
    print $pipe $body;
    $body = undef;
    close $pipe;

    LOG ($verbose_exec, "awaiting $pid");
    waitpid ($pid, 0);
    LOG ($verbose_exec, "$pid completed");

    # A converter that failed (or was killed) leaves little or nothing
    # in the output file.
    my $size = (stat($output))[7];
    $size = -1 unless defined($size);
    if ($size < 5) {
      LOG ($verbose_pbm, "$cmd on ${w}x$h \"$url\" failed ($size bytes)");
      return ();
    }

    LOG ($verbose_pbm, "created ${w}x$h $output ($cmd)");
    return ($w, $h);
  };

  die if ($@ && $@ ne "alarm\n");       # propagate errors

  if ($@) {
    # timed out
    $body = undef;
    return ();
  }

  # didn't
  alarm 0;
  $body = undef;
  return @result;
}
2985
2986
# Same as the "ppmmake" command: creates a solid-colored PPM.
# Does not understand the rgb.txt color names except "black" and "white".
#
#   $outfile   the file to write (a raw "P6" PPM with a maxval of 255);
#   $bgcolor   "RRGGBB" or "RGB" hex digits with an optional leading "#",
#              or the names "black" or "white", in any case;
#   $w, $h     the size of the image, in pixels.
#
# Calls error() if the color is unparsable or the file can't be written.
#
sub ppmmake($$$$) {
  my ($outfile, $bgcolor, $w, $h) = @_;

  my ($red, $green, $blue);
  if ($bgcolor =~ m/^\#?([\dA-F][\dA-F])([\dA-F][\dA-F])([\dA-F][\dA-F])$/i) {
    ($red, $green, $blue) = (hex($1), hex($2), hex($3));
  } elsif ($bgcolor =~ m/^\#?([\dA-F])([\dA-F])([\dA-F])$/i) {
    # "#RGB" is shorthand for "#RRGGBB": each digit is doubled, which is the
    # same as multiplying its value by 0x11 (so "F" means 0xFF, not 0x0F.)
    ($red, $green, $blue) = (hex($1) * 0x11, hex($2) * 0x11, hex($3) * 0x11);
  } elsif ($bgcolor =~ m/^black$/i) {
    ($red, $green, $blue) = (0, 0, 0);
  } elsif ($bgcolor =~ m/^white$/i) {
    ($red, $green, $blue) = (0xFF, 0xFF, 0xFF);
  } else {
    error ("unparsable color name: $bgcolor");
  }

  my $pixel = pack('CCC', $red, $green, $blue);
  my $bits = "P6\n$w $h\n255\n" . ($pixel x ($w * $h));

  open (my $out, '>', $outfile) || error ("$outfile: $!");
  binmode ($out);                       # the pixel data is binary.
  (print $out $bits)            || error ("$outfile: $!");
  close ($out)                  || error ("$outfile: $!");
}
3012
3013
# Chooses the command that will be used to put an image on the screen.
# Returns the first entry of @root_displayers whose program (the first
# word of the entry) is executable in some directory on $PATH; in Cocoa
# mode, returns a fixed "echo" command instead.  Calls error() if none of
# the programs can be found.
#
sub pick_root_displayer() {

  if ($cocoa_p) {
    # see "xscreensaver/hacks/webcollage-cocoa.m"
    return "echo COCOA LOAD ";
  }

  # An unset $PATH just means that there are no directories to search.
  my @dirs = split (/:/, (defined ($ENV{PATH}) ? $ENV{PATH} : ""));
  my @names = ();

  foreach my $cmd (@root_displayers) {
    # The program name is everything up to the first space.  Match against
    # $cmd directly rather than by way of $_, so that the caller's $_ is
    # left alone.
    my ($name) = ($cmd =~ m/^([^ ]+)/);
    push @names, "\"$name\"";
    LOG ($verbose_exec, "looking for $name...");
    foreach my $dir (@dirs) {
      LOG ($verbose_exec, "  checking $dir/$name");
      return $cmd if (-x "$dir/$name");
    }
  }

  # Only say "or" when there is more than one name to list; and don't
  # index off the end of an empty list.
  $names[$#names] = "or " . $names[$#names] if (@names > 1);
  error ("none of: " . join (", ", @names) . " were found on \$PATH.");
}
3036
3037
# The command used to display an image file on the window; chosen by
# pick_root_displayer() when x_or_pbm_output() starts, unless output is
# disabled.
my $ppm_to_root_window_cmd;
3039
3040
# The driver for the image-generating modes.  Checks that the external
# programs we need are installed; works out the size of the image to
# generate (from -size, or else by asking xwininfo or xdpyinfo); creates the
# base image (a solid color, optionally with a background image pasted in
# the middle of it); and then loops forever, picking an image, loading it,
# and pasting it into the collage.  Does not return.
#
# $window_id is the id of the X window to draw on, or undef.
#
sub x_or_pbm_output($) {
  my ($window_id) = @_;

  # Check for our helper program, to see whether we need to use PPM pipelines.
  #
  my $helper = "webcollage-helper";
  if (defined ($webcollage_helper) || which ($helper)) {
    $webcollage_helper = $helper unless (defined($webcollage_helper));
    LOG ($verbose_pbm, "found \"$webcollage_helper\"");
    $webcollage_helper .= " -v";
  } else {
    LOG (($verbose_pbm || $verbose_load), "no $helper program");
  }

  if ($cocoa_p && !defined ($webcollage_helper)) {
    error ("webcollage-helper not found in Cocoa-mode!");
  }


  # make sure the various programs we execute exist, right up front.
  #
  my @progs = ();

  if (!defined($webcollage_helper)) {
    # Only need these others if we don't have the helper.
    @progs = (@progs,
              "giftopnm", "pngtopnm", "djpeg",
              "pnmpaste", "pnmscale", "pnmcut");
  }

  foreach my $prog (@progs) {
    which ($prog) || error ("$prog not found on \$PATH.");
  }

  # find a root-window displayer program.
  #
  if (!$no_output_p) {
    $ppm_to_root_window_cmd = pick_root_displayer();
  }

  if (defined ($window_id)) {
    error ("-window-id only works if xscreensaver-getimage is installed")
      unless ($ppm_to_root_window_cmd =~ m/^xscreensaver-getimage\b/);

    error ("unparsable window id: $window_id")
      unless ($window_id =~ m/^\d+$|^0x[\da-f]+$/i);

    # Make the displayer draw on our window instead of on the root window.
    $ppm_to_root_window_cmd =~ s/--?root\b/$window_id/ ||
      error ("unable to munge displayer: $ppm_to_root_window_cmd");
  }

  if (!$img_width || !$img_height) {

    if (!defined ($window_id) &&
        defined ($ENV{XSCREENSAVER_WINDOW})) {
      $window_id = $ENV{XSCREENSAVER_WINDOW};
    }

    if (!defined ($window_id)) {
      # No window: use the size of the whole screen.
      my $prog = "xdpyinfo";
      which ($prog) || error ("$prog not found on \$PATH.");
      my $info = `$prog`;
      ($img_width, $img_height) = ($info =~ m/dimensions: *(\d+)x(\d+) /);
      if (!defined($img_height)) {
        error ("xdpyinfo failed.");
      }
    } else {  # we have a window id
      my $prog = "xwininfo";
      which ($prog) || error ("$prog not found on \$PATH.");
      my $info = `$prog -id $window_id`;
      ($img_width, $img_height) =
        ($info =~ m/^\s*Width:\s*(\d+)\n\s*Height:\s*(\d+)\n/m);

      if (!defined($img_height)) {
        error ("xwininfo failed.");
      }
    }
  }

  my $bgcolor = "#000000";
  my $bgimage = undef;

  if ($background) {
    if ($background =~ m/^\#[0-9a-f]+$/i) {
      $bgcolor = $background;

    } elsif (-r $background) {
      $bgimage = $background;

    } elsif ($background !~ m@^[-a-z0-9 ]+$@i) {
      # Note: this must be "!~".  "! $background =~ ..." negates $background
      # before matching, since "!" binds more tightly than "=~", and so
      # would never be true here.
      error ("not a color or readable file: $background");

    } else {
      # default to assuming it's a color
      $bgcolor = $background;
    }
  }

  # Create the solid-colored base image.
  #
  LOG ($verbose_pbm, "creating base image: ${img_width}x${img_height}");
  ppmmake ($image_ppm, $bgcolor, $img_width, $img_height);

  # Paste the default background image in the middle of it.
  #
  if ($bgimage) {
    open (my $imgf, '<', $bgimage) || error ("couldn't open $bgimage: $!");
    my $body = do { local $/ = undef; <$imgf> };   # read entire file
    close ($imgf);

    # Work out what kind of image it is, how big it is, and which program
    # (if any) has to go on the front of the pipeline to turn it into a PNM.
    #
    my $cmd;
    my @dims;
    if ((@dims = gif_size ($body))) {
      $cmd = "giftopnm |";

    } elsif ((@dims = jpeg_size ($body))) {
      $cmd = "djpeg |";

    } elsif ((@dims = png_size ($body))) {
      $cmd = "pngtopnm |";

    } elsif ($body =~ m/^P\d\n(\d+) (\d+)\n/) {
      # Already a PNM: the size is in its header.
      @dims = ($1, $2);
      $cmd = "";

    } else {
      error ("$bgimage is not a GIF, JPEG, PNG, or PPM.");
    }

    my ($iw, $ih) = @dims;
    my $x = int (($img_width  - $iw) / 2);
    my $y = int (($img_height - $ih) / 2);
    LOG ($verbose_pbm,
         "pasting $bgimage (${iw}x$ih) into base image at $x,$y");

    $cmd .= "pnmpaste - $x $y $image_ppm > $image_tmp1";
    open (my $pipe, "| $cmd") || error ("running $cmd: $!");
    print $pipe $body;
    $body = undef;
    close ($pipe);
    LOG ($verbose_exec, "subproc exited normally.");
    rename ($image_tmp1, $image_ppm) ||
      error ("renaming $image_tmp1 to $image_ppm: $!");
  }

  clearlog();

  while (1) {
    my ($base, $img) = pick_image();
    my $source = $current_state;    # as left behind by pick_image().
    $current_state = "loadimage";
    if ($img) {
      my ($headers, $body) = get_document ($img, $base);
      if ($body) {
        paste_image ($base, $img, $body, $source);
        $body = undef;
      }
    }
    $current_state = "idle";
    $load_method = "none";

    unlink $image_tmp1, $image_tmp2;
    sleep $delay;
  }
}
3211
# Pastes one downloaded image into the collage: optionally filters it,
# scales it down and crops it, pastes it at a random position in
# $image_ppm, and then (unless output is disabled) displays the result
# and/or updates the imagemap.
#
#   $base     the URL of the page that the image came from;
#   $img      the URL of the image;
#   $body     the image data;
#   $source   the value that $current_state had once the image had been
#             picked; it is used for the statistics in the "image:" line.
#
# Returns 1 if the image was pasted, and a false value otherwise.
#
sub paste_image($$$$) {
  my ($base, $img, $body, $source) = @_;

  $current_state = "paste";

  $suppress_audit = 0;

  LOG ($verbose_pbm, "got $img (" . length($body) . ")");

  my ($iw, $ih);

  # If we are using the webcollage-helper, then we do not need to convert this
  # image to a PPM.  But, if we're using a filter command, we still must, since
  # that's what the filters expect (webcollage-helper can read PPMs, so that's
  # fine.)
  #
  if (defined ($webcollage_helper) &&
      !defined ($filter_cmd)) {

    ($iw, $ih) = image_size ($body);
    if (!$iw || !$ih) {
      # Match on a word boundary after the tag name, as image_to_pnm() does:
      # requiring ">" right there would miss any tag that has attributes.
      LOG (($verbose_pbm || $verbose_load),
           "not a GIF, JPG, or PNG" .
           (($body =~ m@<(base|html|head|body|script|table|a href)\b@i)
            ? " (looks like HTML)" : "") .
           ": $img");
      $suppress_audit = 1;
      $body = undef;
      return 0;
    }

    open (my $out, '>', $image_tmp1) || error ("writing $image_tmp1: $!");
    (print $out $body) || error ("writing $image_tmp1: $!");
    close ($out) || error ("writing $image_tmp1: $!");

  } else {
    ($iw, $ih) = image_to_pnm ($img, $body, $image_tmp1);
    $body = undef;
    if (!$iw || !$ih) {
      LOG ($verbose_pbm, "unable to make PBM from $img");
      return 0;
    }
  }

  record_success ($load_method, $img, $base);


  my $ow = $iw;  # used only for error messages
  my $oh = $ih;

  # don't just tack this onto the front of the pipeline -- we want it to
  # be able to change the size of the input image.
  #
  if ($filter_cmd) {
    LOG ($verbose_pbm, "running $filter_cmd");

    my $rc = nontrapping_system ("($filter_cmd) < $image_tmp1 >$image_tmp2");
    if ($rc != 0) {
      LOG(($verbose_pbm || $verbose_load), "failed command: \"$filter_cmd\"");
      LOG(($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)");
      return;
    }

    # If the filtered image can't be moved into place, give up on this
    # image rather than going on to paste the unfiltered one.
    if (! rename ($image_tmp2, $image_tmp1)) {
      LOG (($verbose_pbm || $verbose_load),
           "renaming $image_tmp2 to $image_tmp1: $!");
      return 0;
    }

    # re-get the width/height in case the filter resized it.
    # They are on the second line of the PPM header.
    open (my $imgf, '<', $image_tmp1) || return 0;
    my $magic = <$imgf>;
    my $dims  = <$imgf>;
    close ($imgf);
    ($iw, $ih) = (defined ($dims) ? ($dims =~ m/^(\d+) (\d+)$/) : ());
    return 0 unless ($iw && $ih);
  }

  my $target_w = $img_width;   # max rectangle into which the image must fit
  my $target_h = $img_height;

  my $cmd = "";
  my $scale = 1.0;


  # Usually scale the image to fit on the screen -- but sometimes scale it
  # to fit on half or a quarter of the screen.  (We do this by reducing the
  # size of the target rectangle.)  Note that the image is not merely scaled
  # to fit; we instead cut the image in half repeatedly until it fits in the
  # target rectangle -- that gives a wider distribution of sizes.
  #
  if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } # reduce target rect
  if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; }

  if ($iw > $target_w || $ih > $target_h) {
    while ($iw > $target_w ||
           $ih > $target_h) {
      $iw = int($iw / 2);
      $ih = int($ih / 2);
      $scale /= 2;
    }
    if ($iw <= 10 || $ih <= 10) {
      LOG ($verbose_pbm, "scaling to ${iw}x$ih would have been bogus.");
      return 0;
    }

    LOG ($verbose_pbm, "scaling to ${iw}x$ih ($scale)");

    $cmd .= " | pnmscale -xsize $iw -ysize $ih";
  }


  my $src = $image_tmp1;

  my $crop_x = 0;     # the sub-rectangle of the image
  my $crop_y = 0;     # that we will actually paste.
  my $crop_w = $iw;
  my $crop_h = $ih;

  # The chance that we will randomly crop out a section of an image starts
  # out fairly low, but goes up for images that are very large, or images
  # that have ratios that make them look like banners (we try to avoid
  # banner images entirely, but they slip through when the IMG tags didn't
  # have WIDTH and HEIGHT specified.)
  #
  my $crop_chance = 0.2;
  if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) {
    $crop_chance += 0.2;
  }
  if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) {
    $crop_chance += 0.2;
  }
  if ($min_ratio && ($iw * $min_ratio) > $ih) {
    $crop_chance += 0.7;
  }

  if ($crop_chance > 0.1) {
    LOG ($verbose_pbm, "crop chance: $crop_chance");
  }

  if (rand() < $crop_chance) {

    my $full_w = $crop_w;    # the size before any random cropping.
    my $full_h = $crop_h;

    if ($crop_w > $min_width) {
      # if it's a banner, select the width linearly.
      # otherwise, select a bell.
      my $r = (($min_ratio && ($iw * $min_ratio) > $ih)
               ? rand()
               : bellrand());
      $crop_w = $min_width + int ($r * ($crop_w - $min_width));
      $crop_x = int (rand() * ($full_w - $crop_w));
    }
    if ($crop_h > $min_height) {
      # height always selects as a bell.
      $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height));
      $crop_y = int (rand() * ($full_h - $crop_h));
    }

    if ($crop_x != 0   || $crop_y != 0 ||
        $crop_w != $iw || $crop_h != $ih) {
      LOG ($verbose_pbm,
           "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
    }
  }

  # Where the image should logically land -- this might be negative.
  #
  my $x = int((rand() * ($img_width  + $crop_w/2)) - $crop_w*3/4);
  my $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4);

  # if we have chosen to paste the image outside of the rectangle of the
  # screen, then we need to crop it.
  #
  if ($x < 0 ||
      $y < 0 ||
      $x + $crop_w > $img_width ||
      $y + $crop_h > $img_height) {

    LOG ($verbose_pbm,
         "cropping for effective paste of ${crop_w}x$crop_h \@ $x,$y");

    # Off the top or left: drop the part that hangs over the edge.
    if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; }
    if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; }

    # Off the bottom or right: shorten it to end inside the image.
    if ($x + $crop_w >= $img_width)  { $crop_w = $img_width  - $x - 1; }
    if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; }
  }

  # If any cropping needs to happen, add pnmcut.
  #
  if ($crop_x != 0   || $crop_y != 0 ||
      $crop_w != $iw || $crop_h != $ih) {
    $iw = $crop_w;
    $ih = $crop_h;
    $cmd .= " | pnmcut $crop_x $crop_y $iw $ih";
    LOG ($verbose_pbm, "cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
  }

  LOG ($verbose_pbm, "pasting ${iw}x$ih \@ $x,$y in $image_ppm");

  $cmd .= " | pnmpaste - $x $y $image_ppm";

  $cmd =~ s@^ *\| *@@;     # lose the leading "|"

  # The command line that actually gets run.  This is a variable of our
  # own rather than $_, so that nothing we call can change it under us.
  my $shell_cmd;

  if (defined ($webcollage_helper)) {
    # The helper does the scaling, cropping and pasting itself, and
    # updates $image_ppm in place.
    $cmd = "$webcollage_helper $image_tmp1 $image_ppm " .
                              "$scale $opacity " .
                              "$crop_x $crop_y $x $y " .
                              "$iw $ih";
    $shell_cmd = $cmd;

  } else {
    # use a PPM pipeline
    $shell_cmd = "($cmd)";
    $shell_cmd .= " < $image_tmp1 > $image_tmp2";
  }

  if ($verbose_pbm) {
    # Prefix each line of the subprocess's output with our blurb.
    $shell_cmd = "($shell_cmd) 2>&1 | sed s'/^/" . blurb() . "/'";
  } else {
    $shell_cmd .= " 2> /dev/null";
  }

  my $rc = nontrapping_system ($shell_cmd);

  if (defined ($webcollage_helper) && -z $image_ppm) {
    LOG (1, "failed command: \"$cmd\"");
    print STDERR "\naudit log:\n\n\n";
    print STDERR ("#" x 78) . "\n";
    print STDERR blurb() . "$image_ppm has zero size\n";
    showlog();
    print STDERR "\n\n";
    exit (1);
  }

  if ($rc != 0) {
    LOG (($verbose_pbm || $verbose_load), "failed command: \"$cmd\"");
    LOG (($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)");
    return;
  }

  if (!defined ($webcollage_helper)) {
    rename ($image_tmp2, $image_ppm) || return;
  }

  my $target = "$image_ppm";

  # don't just tack this onto the end of the pipeline -- we don't want it
  # to end up in $image_ppm, because we don't want the results to be
  # cumulative.
  #
  if ($post_filter_cmd) {

    my $cmd;

    $target = $image_tmp1;
    if (!defined ($webcollage_helper)) {
      $cmd = "($post_filter_cmd) < $image_ppm > $target";
    } else {
      # Blah, my scripts need the JPEG data, but some other folks need
      # the PPM data -- what to do?  Ignore the problem, that's what!
#     $cmd = "djpeg < $image_ppm | ($post_filter_cmd) > $target";
      $cmd = "($post_filter_cmd) < $image_ppm > $target";
    }

    $rc = nontrapping_system ($cmd);
    if ($rc != 0) {
      LOG ($verbose_pbm, "filter failed: \"$post_filter_cmd\"\n");
      return;
    }
  }

  if (!$no_output_p) {
    # If the file can't be examined at all, treat it as being too small.
    my $tsize = (stat($target))[7];
    $tsize = -1 unless defined ($tsize);
    if ($tsize > 200) {
      $cmd = "$ppm_to_root_window_cmd $target";

      # xv seems to hate being killed.  it tends to forget to clean
      # up after itself, and leaves windows around and colors allocated.
      # I had this same problem with vidwhacker, and I'm not entirely
      # sure what I did to fix it.  But, let's try this: launch xv
      # in the background, so that killing this process doesn't kill it.
      # it will die of its own accord soon enough.  So this means we
      # start pumping bits to the root window in parallel with starting
      # the next network retrieval, which is probably a better thing
      # to do anyway.
      #
      $cmd .= " &" unless ($cocoa_p);

      $rc = nontrapping_system ($cmd);

      if ($rc != 0) {
        LOG (($verbose_pbm || $verbose_load), "display failed: \"$cmd\"");
        return;
      }

    } else {
      LOG ($verbose_pbm, "$target size is $tsize");
    }
  }

  $source .= "-" . stats_of($source);
  print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n"
    if ($verbose_imgmap);
  if ($imagemap_base) {
    update_imagemap ($base, $x, $y, $iw, $ih,
                     $image_ppm, $img_width, $img_height);
  }

  clearlog();

  return 1;
}
3522
3523
# Regenerates the imagemap web page: "$imagemap_base.jpg", a copy of the
# current collage, and "$imagemap_base.html", which holds a <MAP> with one
# <AREA> for each of the most recently pasted images.  Both are written to
# temp files first and then renamed into place.
#
#   $url                          the page that the new image links to;
#   $x, $y, $w, $h                where the new image was pasted;
#   $image_ppm                    the file holding the collage;
#   $image_width, $image_height   the size of the collage.
#
# Calls error() if any of the files can't be written.
#
sub update_imagemap($$$$$$$$) {
  my ($url, $x, $y, $w, $h, $image_ppm, $image_width, $image_height) = @_;

  $current_state = "imagemap";

  my $max_areas = 200;

  $url = html_quote ($url);
  my $x2 = $x + $w;
  my $y2 = $y + $h;
  my $area = "<AREA SHAPE=RECT COORDS=\"$x,$y,$x2,$y2\" HREF=\"$url\">";
  unshift @imagemap_areas, $area;       # put one on the front
  if ($#imagemap_areas >= $max_areas) {
    pop @imagemap_areas;                # take one off the back.
  }

  LOG ($verbose_pbm, "area: $x,$y,$x2,$y2 (${w}x$h)");

  # The map is named after the last component of the base file name.
  my $map_name = $imagemap_base;
  $map_name =~ s@^.*/@@;
  $map_name = 'collage' if ($map_name eq '');

  my $imagemap_html = $imagemap_base . ".html";
  my $imagemap_jpg  = $imagemap_base . ".jpg";

  # The temp file names are chosen once, and then re-used on every call.
  if (!defined ($imagemap_html_tmp)) {
    $imagemap_html_tmp = $imagemap_html . sprintf (".%08x", rand(0xffffffff));
    $imagemap_jpg_tmp  = $imagemap_jpg  . sprintf (".%08x", rand(0xffffffff));
  }

  # Read the imagemap html file (if any) to get a template.
  #
  my $template_html = '';
  {
    if (open (my $in, '<', $imagemap_html)) {
      local $/ = undef;  # read entire file
      $template_html = <$in>;
      close $in;
      LOG ($verbose_pbm, "read template $imagemap_html");
    }

    if ($template_html =~ m/^\s*$/s) {
      $template_html = ("<MAP NAME=\"$map_name\"></MAP>\n" .
                        "<IMG SRC=\"$imagemap_base.jpg\"" .
                        " USEMAP=\"$map_name\">\n");
      LOG ($verbose_pbm, "created dummy template");
    }
  }

  # Write the jpg to a tmp file
  #
  {
    my $cmd;
    if (defined ($webcollage_helper)) {
      $cmd = "cp -p $image_ppm $imagemap_jpg_tmp";
    } else {
      $cmd = "cjpeg < $image_ppm > $imagemap_jpg_tmp";
    }
    my $rc = nontrapping_system ($cmd);
    if ($rc != 0) {
      error ("imagemap jpeg failed: \"$cmd\"\n");
    }
  }

  # Write the html to a tmp file
  #
  {
    my $body = $template_html;
    my $areas = join ("\n\t", @imagemap_areas);
    my $map = ("<MAP NAME=\"$map_name\">\n\t$areas\n</MAP>");
    my $img = ("<IMG SRC=\"$imagemap_base.jpg\" " .
               "BORDER=0 " .
               "WIDTH=$image_width HEIGHT=$image_height " .
               "USEMAP=\"#$map_name\">");

    # Replace the template's first <MAP>, and its first <IMG> that has a
    # USEMAP attribute, with the freshly generated ones.
    $body =~ s@(<MAP\s+NAME=\"[^\"]*\"\s*>).*?(</MAP>)@$map@is;
    $body =~ s@<IMG\b[^<>]*\bUSEMAP\b[^<>]*>@$img@is;

    # if there are magic webcollage spans in the html, update those too.
    # (If the new jpg can't be examined, they are left as they were.)
    #
    my @st = stat ($imagemap_jpg_tmp);
    if (@st) {
      my $date = strftime("%d-%b-%Y %l:%M:%S %p %Z", localtime($st[9]));
      my $size = int(($st[7] / 1024) + 0.5) . "K";
      $body =~ s@(<SPAN\s+CLASS=\"webcollage_date\">).*?(</SPAN>)@$1$date$2@si;
      $body =~ s@(<SPAN\s+CLASS=\"webcollage_size\">).*?(</SPAN>)@$1$size$2@si;
    }

    open (my $out, '>', $imagemap_html_tmp) || error ("$imagemap_html_tmp: $!");
    (print $out $body)                      || error ("$imagemap_html_tmp: $!");
    close ($out)                            || error ("$imagemap_html_tmp: $!");
    LOG ($verbose_pbm, "wrote $imagemap_html_tmp");
  }

  # Rename the two tmp files to the real files
  #
  rename ($imagemap_html_tmp, $imagemap_html) ||
    error ("renaming $imagemap_html_tmp to $imagemap_html: $!");
  LOG ($verbose_pbm, "wrote $imagemap_html");
  rename ($imagemap_jpg_tmp,  $imagemap_jpg) ||
    error ("renaming $imagemap_jpg_tmp to $imagemap_jpg: $!");
  LOG ($verbose_pbm, "wrote $imagemap_jpg");
}
3626
3627
# Figure out what the proxy server should be, either from environment
# variables or by parsing the output of the (MacOS) program "scutil",
# which tells us what the system-wide proxy settings are.
#
# The result is left in $http_proxy, as "host" or "host:port".  A value
# that is already set (e.g., from the command line) takes precedence over
# the environment, and the environment takes precedence over scutil.
#
sub set_proxy() {

  if (! $http_proxy) {
    # historical suckage: the environment variable name is lower case.
    $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
  }

  if (defined ($http_proxy)) {
    if ($http_proxy && $http_proxy =~ m@^http://([^/]*)/?$@ ) {
      # historical suckage: allow "http://host:port" as well as "host:port".
      $http_proxy = $1;
    }

  } else {
    my $proxy_data = `scutil --proxy 2>/dev/null`;
    $proxy_data = "" unless defined ($proxy_data);   # couldn't run it at all.

    my ($server) = ($proxy_data =~ m/\bHTTPProxy\s*:\s*([^\s]+)/s);
    my ($port)   = ($proxy_data =~ m/\bHTTPPort\s*:\s*([^\s]+)/s);
    my ($enable) = ($proxy_data =~ m/\bHTTPEnable\s*:\s*([^\s]+)/s);
    # Note: this ignores the "ExceptionsList".

    # A proxy that is configured but switched off still has its host and
    # port listed, alongside "HTTPEnable : 0": don't use it in that case.
    # If there is no HTTPEnable line, go by the host alone, as before.
    $server = undef if (defined ($enable) && !$enable);

    if ($server) {
      $http_proxy = $server;
      $http_proxy .= ":$port" if $port;
    }
  }

  if ($http_proxy) {
    LOG ($verbose_net, "proxy server: $http_proxy");
  }
}
3660
3661
# Installs our signal handlers: signal_cleanup() for each of the signals
# listed below, and SIGPIPE set to be ignored.
#
sub init_signals() {

  foreach my $name (qw(HUP INT QUIT ABRT KILL TERM)) {
    $SIG{$name} = \&signal_cleanup;
  }

  # Need this so that if giftopnm dies, we don't die.
  $SIG{PIPE} = 'IGNORE';
}
3674
# Have exit_cleanup() run as the program exits.
END {
  exit_cleanup();
}
3676
3677
3678 sub main() {
3679   $| = 1;
3680   srand(time ^ $$);
3681
3682   my $verbose = 0;
3683   my $dict;
3684   my $driftnet_cmd = 0;
3685
3686   $current_state = "init";
3687   $load_method = "none";
3688
3689   my $root_p = 0;
3690   my $window_id = undef;
3691
3692   while ($_ = $ARGV[0]) {
3693     shift @ARGV;
3694     if ($_ eq "-display" ||
3695         $_ eq "-displ" ||
3696         $_ eq "-disp" ||
3697         $_ eq "-dis" ||
3698         $_ eq "-dpy" ||
3699         $_ eq "-d") {
3700       $ENV{DISPLAY} = shift @ARGV;
3701     } elsif ($_ eq "-root") {
3702       $root_p = 1;
3703     } elsif ($_ eq "-window-id" || $_ eq "--window-id") {
3704       $window_id = shift @ARGV;
3705       $root_p = 1;
3706     } elsif ($_ eq "-no-output") {
3707       $no_output_p = 1;
3708     } elsif ($_ eq "-urls-only") {
3709       $urls_only_p = 1;
3710       $no_output_p = 1;
3711     } elsif ($_ eq "-cocoa") {
3712       $cocoa_p = 1;
3713     } elsif ($_ eq "-imagemap") {
3714       $imagemap_base = shift @ARGV;
3715       $no_output_p = 1;
3716     } elsif ($_ eq "-verbose") {
3717       $verbose++;
3718     } elsif (m/^-v+$/) {
3719       $verbose += length($_)-1;
3720     } elsif ($_ eq "-delay") {
3721       $delay = shift @ARGV;
3722     } elsif ($_ eq "-timeout") {
3723       $http_timeout = shift @ARGV;
3724     } elsif ($_ eq "-filter") {
3725       $filter_cmd = shift @ARGV;
3726     } elsif ($_ eq "-filter2") {
3727       $post_filter_cmd = shift @ARGV;
3728     } elsif ($_ eq "-background" || $_ eq "-bg") {
3729       $background = shift @ARGV;
3730     } elsif ($_ eq "-size") {
3731       $_ = shift @ARGV;
3732       if (m@^(\d+)x(\d+)$@) {
3733         $img_width = $1;
3734         $img_height = $2;
3735       } else {
3736         error "argument to \"-size\" must be of the form \"640x400\"";
3737       }
3738     } elsif ($_ eq "-proxy" || $_ eq "-http-proxy") {
3739       $http_proxy = shift @ARGV;
3740     } elsif ($_ eq "-dictionary" || $_ eq "-dict") {
3741       $dict = shift @ARGV;
3742     } elsif ($_ eq "-opacity") {
3743       $opacity = shift @ARGV;
3744       error ("opacity must be between 0.0 and 1.0")
3745         if ($opacity <= 0 || $opacity > 1);
3746     } elsif ($_ eq "-driftnet" || $_ eq "--driftnet") {
3747       @search_methods = ( 100, "driftnet", \&pick_from_driftnet );
3748       if (! ($ARGV[0] =~ m/^-/)) {
3749         $driftnet_cmd = shift @ARGV;
3750       } else {
3751         $driftnet_cmd = $default_driftnet_cmd;
3752       }
3753     } elsif ($_ eq "-directory" || $_ eq "--directory") {
3754       @search_methods = ( 100, "local", \&pick_from_local_dir );
3755       if (! ($ARGV[0] =~ m/^-/)) {
3756         $local_dir = shift @ARGV;
3757       } else {
3758         error ("local directory path must be set")
3759       }
3760     } elsif ($_ eq "-fps") {
3761       # -fps only works on MacOS, via "webcollage-cocoa.m".
3762       # Ignore it if passed to this script in an X11 context.
3763     } elsif ($_ eq "-debug" || $_ eq "--debug") {
3764       my $which = shift @ARGV;
3765       my @rest = @search_methods;
3766       my $ok = 0;
3767       while (@rest) {
3768         my $pct  = shift @rest;
3769         my $name = shift @rest;
3770         my $tfn  = shift @rest;
3771
3772         if ($name eq $which) {
3773           @search_methods = (100, $name, $tfn);
3774           $ok = 1;
3775           last;
3776         }
3777       }
3778       error "no such search method as \"$which\"" unless ($ok);
3779       LOG (1, "DEBUG: using only \"$which\"");
3780
3781     } else {
3782       print STDERR "$copyright\nusage: $progname " .
3783               "[-root] [-display dpy] [-verbose] [-debug which]\n" .
3784         "\t\t  [-timeout secs] [-delay secs] [-size WxH]\n" .
3785         "\t\t  [-no-output] [-urls-only] [-imagemap filename]\n" .
3786         "\t\t  [-background color] [-opacity f]\n" .
3787         "\t\t  [-filter cmd] [-filter2 cmd]\n" .
3788         "\t\t  [-dictionary dictionary-file] [-http-proxy host[:port]]\n" .
3789         "\t\t  [-driftnet [driftnet-program-and-args]]\n" .
3790         "\t\t  [-directory local-image-directory]\n" .
3791         "\n";
3792       exit 1;
3793     }
3794   }
3795
3796   if (!$root_p && !$no_output_p && !$cocoa_p) {
3797     print STDERR $copyright;
3798     error "the -root argument is mandatory (for now.)";
3799   }
3800
3801   if (!$no_output_p && !$cocoa_p && !$ENV{DISPLAY}) {
3802     error "\$DISPLAY is not set.";
3803   }
3804
3805
3806   if ($verbose == 1) {
3807     $verbose_imgmap   = 1;
3808     $verbose_warnings = 1;
3809
3810   } elsif ($verbose == 2) {
3811     $verbose_imgmap   = 1;
3812     $verbose_warnings = 1;
3813     $verbose_load     = 1;
3814
3815   } elsif ($verbose == 3) {
3816     $verbose_imgmap   = 1;
3817     $verbose_warnings = 1;
3818     $verbose_load     = 1;
3819     $verbose_filter   = 1;
3820
3821   } elsif ($verbose == 4) {
3822     $verbose_imgmap   = 1;
3823     $verbose_warnings = 1;
3824     $verbose_load     = 1;
3825     $verbose_filter   = 1;
3826     $verbose_net      = 1;
3827
3828   } elsif ($verbose == 5) {
3829     $verbose_imgmap   = 1;
3830     $verbose_warnings = 1;
3831     $verbose_load     = 1;
3832     $verbose_filter   = 1;
3833     $verbose_net      = 1;
3834     $verbose_pbm      = 1;
3835
3836   } elsif ($verbose == 6) {
3837     $verbose_imgmap   = 1;
3838     $verbose_warnings = 1;
3839     $verbose_load     = 1;
3840     $verbose_filter   = 1;
3841     $verbose_net      = 1;
3842     $verbose_pbm      = 1;
3843     $verbose_http     = 1;
3844
3845   } elsif ($verbose >= 7) {
3846     $verbose_imgmap   = 1;
3847     $verbose_warnings = 1;
3848     $verbose_load     = 1;
3849     $verbose_filter   = 1;
3850     $verbose_net      = 1;
3851     $verbose_pbm      = 1;
3852     $verbose_http     = 1;
3853     $verbose_exec     = 1;
3854   }
3855
3856   if ($dict) {
3857     error ("$dict does not exist") unless (-f $dict);
3858     $wordlist = $dict;
3859   } else {
3860     pick_dictionary();
3861   }
3862
3863   if ($imagemap_base && !($img_width && $img_height)) {
3864     error ("-size WxH is required with -imagemap");
3865   }
3866
3867   if (defined ($local_dir)) {
3868     $_ = "xscreensaver-getimage-file";
3869     which ($_) || error "$_ not found on \$PATH.";
3870   }
3871
3872   init_signals();
3873   set_proxy();
3874
3875   spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd);
3876
3877   if ($urls_only_p) {
3878     url_only_output ();
3879   } else {
3880     x_or_pbm_output ($window_id);
3881   }
3882 }
3883
3884 main();      # Script entry point: invoke the top-level main() routine.
3885 exit (0);    # Exit with status 0 once main() has returned.