git.hungrycats.org Git - xscreensaver/blob - hacks/webcollage

   1 #!/usr/bin/perl -w
   2 #
   3 # webcollage, Copyright © 1999-2017 by Jamie Zawinski <jwz@jwz.org>
   4 # This program decorates the screen with random images from the web.
   5 # One satisfied customer described it as "a nonstop pop culture brainbath."
   6 #
   7 # Permission to use, copy, modify, distribute, and sell this software and its
   8 # documentation for any purpose is hereby granted without fee, provided that
   9 # the above copyright notice appear in all copies and that both that
  10 # copyright notice and this permission notice appear in supporting
  11 # documentation.  No representations are made about the suitability of this
  12 # software for any purpose.  It is provided "as is" without express or
  13 # implied warranty.
  14
  15
  16 # To run this as a display mode with xscreensaver, add this to `programs':
  17 #
  18 #     webcollage --root
  19 #     webcollage --root --filter 'vidwhacker --stdin --stdout'
  20 #
  21 #
  22 # You can see this in action at https://www.jwz.org/webcollage/ --
  23 # it auto-reloads about once a minute.  To make a page similar to
  24 # that on your own system, do this:
  25 #
  26 #     webcollage --size '800x600' --imagemap $HOME/www/webcollage/index
  27 #
  28 #
  29 # Requires that either the provided "webcollage-helper" program or
  30 # ImageMagick's "convert" be available on $PATH.
  31 #
  32 #
  33 # If you have the "driftnet" program installed, webcollage can display a
  34 # collage of images sniffed off your local ethernet, instead of pulled out
  35 # of search engines: in that way, your screensaver can display the images
  36 # that your co-workers are downloading!
  37 #
  38 # Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/
  39 # Use it like so:
  40 #
  41 #     webcollage --root --driftnet
  42 #
  43 # Driftnet is the Unix implementation of the MacOS "EtherPEG" program.
  44
  45
  46 require 5;
  47 use strict;
  48
  49 # We can't "use diagnostics" here, because that library malfunctions if
  50 # you signal and catch alarms: it says "Uncaught exception from user code"
  51 # and exits, even though I damned well AM catching it!
  52 #use diagnostics;
  53
  54
  55 require Time::Local;
  56 require POSIX;
  57 use Fcntl ':flock'; # import LOCK_* constants
  58 use POSIX qw(strftime);
  59 use LWP::UserAgent;
  60
  61
  62 my $progname = $0; $progname =~ s@.*/@@g;
  63 my ($version) = ('$Revision: 1.181 $' =~ m/\s(\d[.\d]+)\s/s);
  64 my $copyright = "WebCollage $version, Copyright (c) 1999-2017" .
  65     " Jamie Zawinski <jwz\@jwz.org>\n" .
  66     "                  https://www.jwz.org/webcollage/\n";
  67
  68
  69
  70 my @search_methods = (
  71                       # Google is rate-limiting us now, so this works ok from
  72                       # a short-running screen saver, but not as a batch job.
  73                       # I haven't found a workaround.
  74                       #
  75                         5, "googlephotos",  \&pick_from_google_image_photos,
  76                         3, "googleimgs",    \&pick_from_google_images,
  77                         3, "googlenums",    \&pick_from_google_image_numbers,
  78
  79                       # So let's try Bing instead. No rate limiting yet!
  80                       #
  81                        13, "bingphotos",    \&pick_from_bing_image_photos,
  82                        11, "bingimgs",      \&pick_from_bing_images,
  83                        10, "bingnums",      \&pick_from_bing_image_numbers,
  84
  85                        20, "flickr_recent", \&pick_from_flickr_recent,
  86                        15, "flickr_random", \&pick_from_flickr_random,
  87                         6, "livejournal",   \&pick_from_livejournal_images,
  88
  89                        11, "imgur",         \&pick_from_imgur,
  90
  91                      # Tumblr doesn't have an "or" search, so this isn't great.
  92                         3, "tumblr",        \&pick_from_tumblr,
  93
  94                      # I ran out of usable access tokens, May 2017
  95                      #  0, "instagram",     \&pick_from_instagram,
  96
  97                      # No longer exists, as of Apr 2014
  98                      #  0, "yahoorand",     \&pick_from_yahoo_random_link,
  99
 100                      # Twitter destroyed their whole API in 2013.
 101                      #  0, "twitpic",       \&pick_from_twitpic_images,
 102                      #  0, "twitter",       \&pick_from_twitter_images,
 103
 104                      # This is a cute way to search for a certain webcams.
 105                      # Not included in default methods, since these images
 106                      # aren't terribly interesting by themselves.
 107                      # See also "SurveillanceSaver".
 108                      #
 109                         0, "securitycam",   \&pick_from_security_camera,
 110
 111                      # Nonfunctional as of June 2011.
 112                      #  0, "altavista",     \&pick_from_alta_vista_random_link,
 113
 114                      # In Apr 2002, Google asked me to stop searching them.
 115                      # I asked them to add a "random link" url.  They said
 116                      # "that would be easy, we'll think about it" and then
 117                      # never wrote back.  Booo Google!  Booooo!  So, screw
 118                      # those turkeys, I've turned Google searching back on.
 119                      # I'm sure they can take it.  (Jan 2005.)
 120
 121                      # Jan 2005: Yahoo fucked up their search form so that
 122                      # it's no longer possible to do "or" searches on news
 123                      # images, so we rarely get any hits there any more.
 124                      #
 125                      #  0, "yahoonews",     \&pick_from_yahoo_news_text,
 126
 127                      # Dec 2004: the ircimages guy's server can't take the
 128                      # heat, so he started banning the webcollage user agent.
 129                      # I tried to convince him to add a lighter-weight page to
 130                      # support webcollage better, but he doesn't care.
 131                      #
 132                      #  0, "ircimages",     \&pick_from_ircimages,
 133
 134                      # Dec 2002: Alta Vista has a new "random link" URL now.
 135                      # They added it specifically to better support webcollage!
 136                      # That was super cool of them.  This is how we used to do
 137                      # it, before:
 138                      #
 139                      #  0, "avimages",      \&pick_from_alta_vista_images,
 140                      #  0, "avtext",        \&pick_from_alta_vista_text,
 141
 142                      # This broke in 2004.  Eh, Lycos sucks anyway.
 143                      #
 144                      #  0, "lycos",         \&pick_from_lycos_text,
 145
 146                      # This broke in 2003, I think.  I suspect Hotbot is
 147                      # actually the same search engine data as Lycos.
 148                      #
 149                      #  0, "hotbot",        \&pick_from_hotbot_text,
 150                       );
 151
 152 # programs we can use to write to the root window (tried in ascending order.)
 153 #
 154 my @root_displayers = (
 155   "xscreensaver-getimage -root -file",
 156   "chbg       -once -xscreensaver -max_size 100",
 157   "xv         -root -quit -viewonly +noresetroot -quick24 -rmode 5" .
 158   "           -rfg black -rbg black",
 159   "xli        -quiet -onroot -center -border black",
 160   "xloadimage -quiet -onroot -center -border black",
 161
 162 # this lame program wasn't built with vroot.h:
 163 # "xsri       -scale -keep-aspect -center-horizontal -center-vertical",
 164 );
 165
 166
 167 # Some sites need cookies to work properly.   These are they.
 168 #
 169 my %cookies = (
 170   "www.altavista.com"  =>  "AV_ALL=1",   # request uncensored searches
 171   "web.altavista.com"  =>  "AV_ALL=1",
 172   "ircimages.com"      =>  'disclaimer=1',
 173 );
 174
 175
 176 # If this is set, it's a helper program to use for pasting images together:
 177 # this is somewhat faster than using ImageMagick.
 178 #
 179 my $webcollage_helper = undef;
 180 my $convert_cmd = 'convert';
 181
 182 my $opacity = 0.85;  # Opacity when pasting images together.
 183
 184
 185 # Some sites have  managed to poison the search engines.  These are they.
 186 # (We auto-detect sites that have poisoned the search engines via excessive
 187 # keywords or dictionary words,  but these are ones that slip through
 188 # anyway.)
 189 #
 190 # This can contain full host names, or 2 or 3 component domains.
 191 #
 192 my %poisoners = (
 193   "die.net"                 => 1,  # 'l33t h4ck3r d00dz.
 194   "genforum.genealogy.com"  => 1,  # Cluttering avtext with human names.
 195   "rootsweb.com"            => 1,  # Cluttering avtext with human names.
 196   "akamai.net"              => 1,  # Lots of sites have their images on Akamai.
 197   "akamaitech.net"          => 1,  # But those are pretty much all banners.
 198                                    # Since Akamai is super-expensive, let's
 199                                    # go out on a limb and assume that all of
 200                                    # their customers are rich-and-boring.
 201   "bartleby.com"            => 1,  # Dictionary, cluttering avtext.
 202   "encyclopedia.com"        => 1,  # Dictionary, cluttering avtext.
 203   "onlinedictionary.datasegment.com" => 1,  # Dictionary, cluttering avtext.
 204   "hotlinkpics.com"         => 1,  # Porn site that has poisoned avimages
 205                                    # (I don't see how they did it, though!)
 206   "alwayshotels.com"        => 1,  # Poisoned Lycos pretty heavily.
 207   "nextag.com"              => 1,  # Poisoned Alta Vista real good.
 208   "ghettodriveby.com"       => 1,  # Poisoned Google Images.
 209   "crosswordsolver.org"     => 1,  # Poisoned Google Images.
 210   "xona.com"                => 1,  # Poisoned Google Images.
 211   "freepatentsonline.com"   => 1,  # Poisoned Google Images.
 212   "herbdatanz.com"          => 1,  # Poisoned Google Images.
 213 );
 214
 215
 216 # When verbosity is turned on, we warn about sites that we seem to be hitting
 217 # a lot: usually this means some new poisoner has made it into the search
 218 # engines.  But sometimes, the warning is just because that site has a lot
 219 # of stuff on it.  So these are the sites that are immune to the "frequent
 220 # site" diagnostic message.
 221 #
 222 my %warningless_sites = (
 223   "home.earthlink.net"      => 1,
 224   "www.angelfire.com"       => 1,
 225   "members.aol.com"         => 1,
 226   "img.photobucket.com"     => 1,
 227   "pics.livejournal.com"    => 1,
 228   "tinypic.com"             => 1,
 229   "flickr.com"              => 1,
 230   "staticflickr.com"        => 1,
 231   "pbase.com"               => 1,
 232   "blogger.com"             => 1,
 233   "multiply.com"            => 1,
 234   "wikimedia.org"           => 1,
 235   "twitpic.com"             => 1,
 236   "amazonaws.com"           => 1,
 237   "blogspot.com"            => 1,
 238   "photoshelter.com"        => 1,
 239   "myspacecdn.com"          => 1,
 240   "feedburner.com"          => 1,
 241   "wikia.com"               => 1,
 242   "ljplus.ru"               => 1,
 243   "yandex.ru"               => 1,
 244   "imgur.com"               => 1,
 245   "tumblr.com"              => 1,
 246   "yfrog.com"               => 1,
 247   "cdninstagram.com"        => 1,
 248   "gstatic.com"             => 1,
 249
 250   "yimg.com"                => 1,  # This is where dailynews.yahoo.com stores
 251   "eimg.com"                => 1,  # its images, so pick_from_yahoo_news_text()
 252                                    # hits this every time.
 253
 254   "images.quizfarm.com"     => 1,  # damn those LJ quizzes...
 255   "images.quizilla.com"     => 1,
 256   "images.quizdiva.net"     => 1,
 257
 258   "driftnet"                => 1,  # builtin...
 259   "local-directory"         => 1,  # builtin...
 260 );
 261
 262
 263 # For decoding HTML-encoded character entities to URLs.
 264 # In This Modern World, probably we should use HTML::Entities instead.
 265 #
 266 my %entity_table = (
 267    "apos"   => '\'',
 268    "quot"   => '"',    "amp"    => '&',    "lt"     => '<',
 269    "gt"     => '>',    "nbsp"   => ' ',    "iexcl"  => '',
 270    "cent"   => "\xA2", "pound"  => "\xA3", "curren" => "\xA4",
 271    "yen"    => "\xA5", "brvbar" => "\xA6", "sect"   => "\xA7",
 272    "uml"    => "\xA8", "copy"   => "\xA9", "ordf"   => "\xAA",
 273    "laquo"  => "\xAB", "not"    => "\xAC", "shy"    => "\xAD",
 274    "reg"    => "\xAE", "macr"   => "\xAF", "deg"    => "\xB0",
 275    "plusmn" => "\xB1", "sup2"   => "\xB2", "sup3"   => "\xB3",
 276    "acute"  => "\xB4", "micro"  => "\xB5", "para"   => "\xB6",
 277    "middot" => "\xB7", "cedil"  => "\xB8", "sup1"   => "\xB9",
 278    "ordm"   => "\xBA", "raquo"  => "\xBB", "frac14" => "\xBC",
 279    "frac12" => "\xBD", "frac34" => "\xBE", "iquest" => "\xBF",
 280    "Agrave" => "\xC0", "Aacute" => "\xC1", "Acirc"  => "\xC2",
 281    "Atilde" => "\xC3", "Auml"   => "\xC4", "Aring"  => "\xC5",
 282    "AElig"  => "\xC6", "Ccedil" => "\xC7", "Egrave" => "\xC8",
 283    "Eacute" => "\xC9", "Ecirc"  => "\xCA", "Euml"   => "\xCB",
 284    "Igrave" => "\xCC", "Iacute" => "\xCD", "Icirc"  => "\xCE",
 285    "Iuml"   => "\xCF", "ETH"    => "\xD0", "Ntilde" => "\xD1",
 286    "Ograve" => "\xD2", "Oacute" => "\xD3", "Ocirc"  => "\xD4",
 287    "Otilde" => "\xD5", "Ouml"   => "\xD6", "times"  => "\xD7",
 288    "Oslash" => "\xD8", "Ugrave" => "\xD9", "Uacute" => "\xDA",
 289    "Ucirc"  => "\xDB", "Uuml"   => "\xDC", "Yacute" => "\xDD",
 290    "THORN"  => "\xDE", "szlig"  => "\xDF", "agrave" => "\xE0",
 291    "aacute" => "\xE1", "acirc"  => "\xE2", "atilde" => "\xE3",
 292    "auml"   => "\xE4", "aring"  => "\xE5", "aelig"  => "\xE6",
 293    "ccedil" => "\xE7", "egrave" => "\xE8", "eacute" => "\xE9",
 294    "ecirc"  => "\xEA", "euml"   => "\xEB", "igrave" => "\xEC",
 295    "iacute" => "\xED", "icirc"  => "\xEE", "iuml"   => "\xEF",
 296    "eth"    => "\xF0", "ntilde" => "\xF1", "ograve" => "\xF2",
 297    "oacute" => "\xF3", "ocirc"  => "\xF4", "otilde" => "\xF5",
 298    "ouml"   => "\xF6", "divide" => "\xF7", "oslash" => "\xF8",
 299    "ugrave" => "\xF9", "uacute" => "\xFA", "ucirc"  => "\xFB",
 300    "uuml"   => "\xFC", "yacute" => "\xFD", "thorn"  => "\xFE",
 301    "yuml"   => "\xFF",
 302
 303    # HTML 4 entities that do not have 1:1 Latin1 mappings.
 304    "bull"  => "*",    "hellip"=> "...",  "prime" => "'",  "Prime" => "\"",
 305    "frasl" => "/",    "trade" => "[tm]", "larr"  => "<-", "rarr"  => "->",
 306    "harr"  => "<->",  "lArr"  => "<=",   "rArr"  => "=>", "hArr"  => "<=>",
 307    "empty" => "\xD8", "minus" => "-",    "lowast"=> "*",  "sim"   => "~",
 308    "cong"  => "=~",   "asymp" => "~",    "ne"    => "!=", "equiv" => "==",
 309    "le"    => "<=",   "ge"    => ">=",   "lang"  => "<",  "rang"  => ">",
 310    "loz"   => "<>",   "OElig" => "OE",   "oelig" => "oe", "Yuml"  => "Y",
 311    "circ"  => "^",    "tilde" => "~",    "ensp"  => " ",  "emsp"  => " ",
 312    "thinsp"=> " ",    "ndash" => "-",    "mdash" => "--", "lsquo" => "`",
 313    "rsquo" => "'",    "sbquo" => "'",    "ldquo" => "\"", "rdquo" => "\"",
 314    "bdquo" => "\"",   "lsaquo"=> "<",    "rsaquo"=> ">",
 315 );
 316
 317
 318 ##############################################################################
 319 #
 320 # Various global flags set by command line parameters, or computed
 321 #
 322 ##############################################################################
 323
 324
 325 my $current_state = "???";      # for diagnostics
 326 my $load_method;
 327 my $last_search;
 328 my $image_succeeded = -1;
 329 my $suppress_audit = 0;
 330
 331 my $verbose_imgmap = 0;         # print out rectangles and URLs only (stdout)
 332 my $verbose_warnings = 0;       # print out warnings when things go wrong
 333 my $verbose_load = 0;           # diagnostics about loading of URLs
 334 my $verbose_filter = 0;         # diagnostics about page selection/rejection
 335 my $verbose_net = 0;            # diagnostics about network I/O
 336 my $verbose_decode = 0;         # diagnostics about img conversion pipelines
 337 my $verbose_http = 0;           # diagnostics about all HTTP activity
 338 my $verbose_exec = 0;           # diagnostics about executing programs
 339
 340 my $report_performance_interval = 60 * 15;  # print some stats every 15 minutes
 341
 342 my $http_proxy = undef;
 343 my $http_timeout = 20;
 344 my $cvt_timeout = 10;
 345
 346 my $min_width = 50;
 347 my $min_height = 50;
 348 my $min_ratio = 1/5;
 349
 350 my $min_gif_area = (120 * 120);
 351
 352
 353 my $no_output_p = 0;
 354 my $urls_only_p = 0;
 355 my $cocoa_p = 0;
 356 my $imagemap_base = undef;
 357
 358 my @pids_to_kill = ();  # forked pids we should kill when we exit, if any.
 359
 360 my $driftnet_magic = 'driftnet';
 361 my $driftnet_dir = undef;
 362 my $default_driftnet_cmd = "driftnet -a -m 100";
 363
 364 my $local_magic = 'local-directory';
 365 my $local_dir = undef;
 366
 367 my $wordlist;
 368
 369 my %rejected_urls;
 370 my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch",
 371                       "arrhenius", "arteriole", "blanket", "brainchild",
 372                       "burdensome", "carnival", "cherub", "chord", "clever",
 373                       "dedicate", "dilogarithm", "dolan", "dryden",
 374                       "eggplant");
 375
 376
 377 ##############################################################################
 378 #
 379 # Retrieving URLs
 380 #
 381 ##############################################################################
 382
 383 # returns three values: the HTTP response line; the document headers;
 384 # and the document body.
 385 #
 386 sub get_document_1($$$) {
 387   my ($url, $referer, $timeout) = @_;
 388
 389   if (!defined($timeout)) { $timeout = $http_timeout; }
 390   if ($timeout > $http_timeout) { $timeout = $http_timeout; }
 391
 392   my $user_agent = "$progname/$version";
 393
 394   if ($url =~ m@^https?://www\.altavista\.com/@s ||
 395       $url =~ m@^https?://random\.yahoo\.com/@s  ||
 396       $url =~ m@^https?://[^./]+\.google\.com/@s ||
 397       $url =~ m@^https?://www\.livejournal\.com/@s) {
 398     # block this, you turkeys.
 399     $user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.7)' .
 400                   ' Gecko/20070914 Firefox/2.0.0.7';
 401   }
 402
 403   my $ua = LWP::UserAgent->new ( agent => $user_agent,
 404                                  keep_alive => 0,
 405                                  env_proxy => 0,
 406                                );
 407   $ua->proxy ('http', $http_proxy) if $http_proxy;
 408   $ua->default_header ('Referer' => $referer) if $referer;
 409   $ua->default_header ('Accept' => '*/*');
 410   $ua->timeout($timeout) if $timeout;
 411
 412   if (0) {
 413     $ua->add_handler ("request_send",
 414                       sub($$$) {
 415                         my ($req, $ua, $h) = @_;
 416                         print "\n>>[[\n"; $req->dump; print "\n]]\n";
 417                         return;
 418                       });
 419     $ua->add_handler ("response_data",
 420                       sub($$$$) {
 421                         my ($req, $ua, $h, $data) = @_;
 422                         #print "\n<<[[\n"; print $data; print "\n]]\n";
 423                         return 1;
 424                       });
 425     $ua->add_handler ("request_done",
 426                       sub($$$) {
 427                         my ($req, $ua, $h) = @_;
 428                         print "\n<<[[\n"; $req->dump; print "\n]]\n";
 429                         return;
 430                       });
 431   }
 432
 433   if ($verbose_http) {
 434     LOG (1, "  ==> GET $url");
 435     LOG (1, "  ==> User-Agent: $user_agent");
 436     LOG (1, "  ==> Referer: $referer") if $referer;
 437   }
 438
 439   my $res = $ua->get ($url);
 440
 441   my $http = ($res ? $res->status_line : '') || '';
 442   my $head = ($res ? $res->headers()   : '') || '';
 443   $head = $head->as_string() if $head;
 444   my $body = ($res && $res->is_success ? $res->decoded_content : '') || '';
 445
 446   LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : ""));
 447
 448   $head =~ s/\r\n/\n/gs;
 449   $head =~ s/\r/\n/gs;
 450   if ($verbose_http) {
 451     foreach (split (/\n/, $head)) {
 452       LOG ($verbose_http, "  <== $_");
 453     }
 454   }
 455
 456   my @L = split(/\r\n|\r|\n/, $body);
 457   my $lines = @L;
 458   LOG ($verbose_http,
 459        "  <== [ body ]: $lines lines, " . length($body) . " bytes");
 460
 461   if (!$http) {
 462     LOG (($verbose_net || $verbose_load), "null response: $url");
 463     return ();
 464   }
 465
 466   return ( $http, $head, $body );
 467 }
 468
 469
 470 # returns two values: the document headers; and the document body.
 471 # if the given URL did a redirect, returns the redirected-to document.
 472 #
 473 sub get_document($$;$) {
 474   my ($url, $referer, $timeout) = @_;
 475   my $start = time;
 476
 477   if (defined($referer) && $referer eq $driftnet_magic) {
 478     return get_driftnet_file ($url);
 479   }
 480
 481   if (defined($referer) && $referer eq $local_magic) {
 482     return get_local_file ($url);
 483   }
 484
 485   my $orig_url = $url;
 486   my $loop_count = 0;
 487   my $max_loop_count = 4;
 488
 489   do {
 490     if (defined($timeout) && $timeout <= 0) {
 491       LOG (($verbose_net || $verbose_load), "timed out for $url");
 492       $suppress_audit = 1;
 493       return ();
 494     }
 495
 496     my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout);
 497
 498     if (defined ($timeout)) {
 499       my $now = time;
 500       my $elapsed = $now - $start;
 501       $timeout -= $elapsed;
 502       $start = $now;
 503     }
 504
 505     return () unless $http; # error message already printed
 506
 507     $http =~ s/[\r\n]+$//s;
 508
 509     if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) {
 510       $_ = $head;
 511
 512       my ( $location ) = m@^location:[ \t]*(.*)$@im;
 513       if ( $location ) {
 514         $location =~ s/[\r\n]$//;
 515
 516         LOG ($verbose_net, "redirect from $url to $location");
 517         $referer = $url;
 518         $url = $location;
 519
 520         if ($url =~ m@^/@) {
 521           $referer =~ m@^(https?://[^/]+)@i;
 522           $url = $1 . $url;
 523         } elsif (! ($url =~ m@^[a-z]+:@i)) {
 524           $_ = $referer;
 525           s@[^/]+$@@g if m@^https?://[^/]+/@i;
 526           $_ .= "/" if m@^https?://[^/]+$@i;
 527           $url = $_ . $url;
 528         }
 529
 530       } else {
 531         LOG ($verbose_net, "no Location with \"$http\"");
 532         return ( $url, $body );
 533       }
 534
 535       if ($loop_count++ > $max_loop_count) {
 536         LOG ($verbose_net,
 537              "too many redirects ($max_loop_count) from $orig_url");
 538         $body = undef;
 539         return ();
 540       }
 541
 542     } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) {
 543
 544       LOG (($verbose_net || $verbose_load), "failed: $1 ($url)");
 545
 546       # http errors -- return nothing.
 547       $body = undef;
 548       return ();
 549
 550     } elsif (!$body) {
 551
 552       LOG (($verbose_net || $verbose_load), "document contains no data: $url");
 553       return ();
 554
 555     } else {
 556
 557       # ok!
 558       return ( $url, $body );
 559     }
 560
 561   } while (1);
 562 }
 563
 564 # If we already have a cookie defined for this site, and the site is trying
 565 # to overwrite that very same cookie, let it do so.  This is because nytimes
 566 # expires its cookies - it lets you upgrade to a new cookie without logging
 567 # in again, but you have to present the old cookie to get the new cookie.
 568 # So, by doing this, the built-in cypherpunks cookie will never go "stale".
 569 #
 570 sub set_cookie($$) {
 571   my ($host, $cookie) = @_;
 572   my $oc = $cookies{$host};
 573   return unless $oc;
 574   $_ = $oc;
 575   my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@;
 576   $_ = $cookie;
 577   my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@;
 578
 579   if ($oc_name eq $nc_name &&
 580       $oc_value ne $nc_value) {
 581     $cookies{$host} = $cookie;
 582     LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie");
 583   }
 584 }
 585
 586
 587 ############################################################################
 588 #
 589 # Extracting image URLs from HTML
 590 #
 591 ############################################################################
 592
 593 # given a URL and the body text at that URL, selects and returns a random
 594 # image from it.  returns () if no suitable images found.
 595 #
 596 sub pick_image_from_body($$) {
 597   my ($url, $body) = @_;
 598
 599   my $base = $url;
 600   $_ = $url;
 601
 602   # if there's at least one slash after the host, take off the last
 603   # pathname component
 604   if ( m@^https?://[^/]+/@io ) {
 605     $base =~ s@[^/]+$@@go;
 606   }
 607
 608   # if there are no slashes after the host at all, put one on the end.
 609   if ( m@^https?://[^/]+$@io ) {
 610     $base .= "/";
 611   }
 612
 613   $_ = $body;
 614
 615   # strip out newlines, compress whitespace
 616   s/[\r\n\t ]+/ /go;
 617
 618   # nuke comments
 619   s/<!--.*?-->//go;
 620
 621
 622   # There are certain web sites that list huge numbers of dictionary
 623   # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
 624   # Porn sites tend not to be reputable!)
 625   #
 626   # I do not want webcollage to filter on content: I want it to select
 627   # randomly from the set of images on the web.  All the logic here for
 628   # rejecting some images is really a set of heuristics for rejecting
 629   # images that are not really images: for rejecting *text* that is in
 630   # GIF/JPEG/PNG form.  I don't want text, I want pictures, and I want
 631   # the content of the pictures to be randomly selected from among all
 632   # the available content.
 633   #
 634   # So, filtering out "dirty" pictures by looking for "dirty" keywords
 635   # would be wrong: dirty pictures exist, like it or not, so webcollage
 636   # should be able to select them.
 637   #
 638   # However, picking a random URL is a hard thing to do.  The mechanism I'm
 639   # using is to search for a selection of random words.  This is not
 640   # perfect, but works ok most of the time.  The way it breaks down is when
 641   # some URLs get precedence because their pages list *every word* as
 642   # related -- those URLs come up more often than others.
 643   #
 644   # So, after we've retrieved a URL, if it has too many keywords, reject
 645   # it.  We reject it not on the basis of what those keywords are, but on
 646   # the basis that by having so many, the page has gotten an unfair
 647   # advantage against our randomizer.
 648   #
 649   my $trip_count = 0;
 650   foreach my $trip (@tripwire_words) {
 651     $trip_count++ if m/$trip/i;
 652   }
 653
 654   if ($trip_count >= $#tripwire_words - 2) {
 655     LOG (($verbose_filter || $verbose_load),
 656          "there is probably a dictionary in \"$url\": rejecting.");
 657     $rejected_urls{$url} = -1;
 658     $body = undef;
 659     $_ = undef;
 660     return ();
 661   }
 662
 663
 664   my @urls;
 665   my %unique_urls;
 666
 667   foreach (split(/ *</)) {
 668     if ( m/^meta.*["']keywords["']/i ) {
 669
 670       # Likewise, reject any web pages that have a KEYWORDS meta tag
 671       # that is too long.
 672       #
 673       my $L = length($_);
 674       if ($L > 1000) {
 675         LOG (($verbose_filter || $verbose_load),
 676              "excessive keywords ($L bytes) in $url: rejecting.");
 677         $rejected_urls{$url} = $L;
 678         $body = undef;
 679         $_ = undef;
 680         return ();
 681       } else {
 682         LOG ($verbose_filter, "  keywords ($L bytes) in $url (ok)");
 683       }
 684
 685     } elsif (m/^ (IMG|A) \b .* (SRC|HREF) \s* = \s* ["']? (.*?) [ "'<>] /six ||
 686              m/^ (LINK|META) \b .* (REL|PROPERTY) \s* = \s*
 687                  ["']? (image_src|og:image) ["']? /six) {
 688
 689       my $was_inline = (lc($1) eq 'img');
 690       my $was_meta   = (lc($1) eq 'link' || lc($1) eq 'meta');
 691       my $link = $3;
 692
 693       # For <link rel="image_src" href="...">
 694       # and <meta property="og:image" content="...">
 695       #
 696       if ($was_meta) {
 697         next unless (m/ (HREF|CONTENT) \s* = \s* ["']? (.*?) [ "'<>] /six);
 698         $link = $2;
 699       }
 700
 701       my ( $width )  = m/width ?=[ \"]*(\d+)/oi;
 702       my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
 703       $_ = $link;
 704
 705       if ( m@^/@o ) {
 706         my $site;
 707         ( $site = $base ) =~ s@^(https?://[^/]*).*@$1@gio;
 708         $_ = "$site$link";
 709       } elsif ( ! m@^[^/:?]+:@ ) {
 710         $_ = "$base$link";
 711         s@/\./@/@g;
 712         1 while (s@/[^/]+/\.\./@/@g);
 713       }
 714
 715       # skip non-http
 716       if ( ! m@^https?://@io ) {
 717         next;
 718       }
 719
 720       # skip non-image
 721       if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@io ) {
 722         next;
 723       }
 724
 725       # skip really short or really narrow images
 726       if ( $width && $width < $min_width) {
 727         if (!$height) { $height = "?"; }
 728         LOG ($verbose_filter, "  skip narrow image $_ (${width}x$height)");
 729         next;
 730       }
 731
 732       if ( $height && $height < $min_height) {
 733         if (!$width) { $width = "?"; }
 734         LOG ($verbose_filter, "  skip short image $_ (${width}x$height)");
 735         next;
 736       }
 737
 738       # skip images with ratios that make them look like banners.
 739       if ($min_ratio && $width && $height &&
 740           ($width * $min_ratio ) > $height) {
 741         if (!$height) { $height = "?"; }
 742         LOG ($verbose_filter, "  skip bad ratio $_ (${width}x$height)");
 743         next;
 744       }
 745
 746       # skip GIFs with a small number of pixels -- those usually suck.
 747       if ($width && $height &&
 748           m/\.gif$/io &&
 749           ($width * $height) < $min_gif_area) {
 750         LOG ($verbose_filter, "  skip small GIF $_ (${width}x$height)");
 751         next;
 752       }
 753
 754       # skip images with a URL that indicates a Yahoo thumbnail.
 755       if (m@\.yimg\.com/.*/t/@) {
 756         if (!$width)  { $width  = "?"; }
 757         if (!$height) { $height = "?"; }
 758         LOG ($verbose_filter, "  skip yahoo thumb $_ (${width}x$height)");
 759         next;
 760       }
 761
 762       my $url = $_;
 763
 764       if ($unique_urls{$url}) {
 765         LOG ($verbose_filter, "  skip duplicate image $_");
 766         next;
 767       }
 768
 769       LOG ($verbose_filter,
 770            "  image $url" .
 771            ($width && $height ? " (${width}x${height})" : "") .
 772            ($was_meta ? " (meta)" : $was_inline ? " (inline)" : ""));
 773
 774
 775       my $weight = 1;
 776
 777       if ($was_meta) {
 778         $weight = 20;    # meta tag images are far preferable to inline images.
 779       } else {
 780         if ($url !~ m@[.](gif|png)$@io ) {
 781           $weight += 2;  # JPEGs are preferable to GIFs and PNGs.
 782         }
 783         if (! $was_inline) {
 784           $weight += 4;  # pointers to images are preferable to inlined images.
 785         }
 786       }
 787
 788       $unique_urls{$url}++;
 789       for (my $i = 0; $i < $weight; $i++) {
 790         $urls[++$#urls] = $url;
 791       }
 792     }
 793   }
 794
 795   my $fsp = ($body =~ m@<frameset@i);
 796
 797   $_ = undef;
 798   $body = undef;
 799
 800   @urls = depoison (@urls);
 801
 802   if ( $#urls < 0 ) {
 803     LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
 804     return ();
 805   }
 806
 807   # pick a random element of the table
 808   my $i = int(rand($#urls+1));
 809   $url = $urls[$i];
 810
 811   LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
 812
 813   return $url;
 814 }
 815
 816
 817 # Given a URL and the RSS feed from that URL, pick a random image from
 818 # the feed.  This is a lot simpler than extracting images out of a page:
 819 # we already know we have reasonable images, so we just pick one.
 820 # Returns: the real URL of the page (preferably not the RSS version),
 821 # and the image.
 822
 823 sub pick_image_from_rss($$) {
 824   my ($url, $body) = @_;
 825
 826   my ($base) = ($body =~ m@<link>([^<>]+)</link>@si);   # root link
 827
 828   my @items = ($body =~ m@<item\b[^<>]*>(.*?)</item>@gsi);
 829   return unless @items;
 830
 831   my $n = @items;
 832   my $i = int(rand($n));
 833   my $item = $items[$i];
 834
 835   $base = $1 if ($item =~ m@<link>([^<>]+)</link>@si);  # item link
 836   $base = $url unless $base;
 837
 838   ($url) = ($item =~ m/<enclosure\b[^<>]*\burl="(.*?)"/si);
 839   return unless $url;
 840
 841   LOG ($verbose_load, "picked image $i/$n: $url");
 842   return ($base, $url);
 843 }
 844
 845 \f
 846 ############################################################################
 847 #
 848 # Subroutines for getting pages and images out of search engines
 849 #
 850 ############################################################################
 851
 852
 853 sub pick_dictionary() {
 854   my @dicts = ("/usr/dict/words",
 855                "/usr/share/dict/words",
 856                "/usr/share/lib/dict/words",
 857                "/usr/share/dict/cracklib-small",
 858                "/usr/share/dict/cracklib-words"
 859                );
 860   foreach my $f (@dicts) {
 861     if (-f $f) {
 862       $wordlist = $f;
 863       last;
 864     }
 865   }
 866   error ("$dicts[0] does not exist") unless defined($wordlist);
 867 }
 868
 869 # returns a random word from the dictionary
 870 #
 871 sub random_word() {
 872
 873   return undef unless open (my $in, '<', $wordlist);
 874
 875   my $size = (stat($in))[7];
 876   my $word = undef;
 877   my $count = 0;
 878
 879   while (1) {
 880     error ("looping ($count) while reading $wordlist")
 881       if (++$count > 100);
 882
 883     my $pos = int (rand ($size));
 884     if (seek ($in, $pos, 0)) {
 885       $word = <$in>;   # toss partial line
 886       $word = <$in>;   # keep next line
 887     }
 888
 889     next unless ($word);
 890     next if ($word =~ m/^[-\']/);
 891
 892     $word = lc($word);
 893     $word =~ s/^.*-//s;
 894     $word =~ s/^[^a-z]+//s;
 895     $word =~ s/[^a-z]+$//s;
 896     $word =~ s/\'s$//s;
 897     $word =~ s/ys$/y/s;
 898     $word =~ s/ally$//s;
 899     $word =~ s/ly$//s;
 900     $word =~ s/ies$/y/s;
 901     $word =~ s/ally$/al/s;
 902     $word =~ s/izes$/ize/s;
 903     $word =~ s/esses$/ess/s;
 904     $word =~ s/(.{5})ing$/$1/s;
 905
 906     next if (length ($word) > 14);
 907     last if ($word);
 908   }
 909
 910   close ($in);
 911
 912   if ( $word =~ s/\s/\+/gs ) {  # convert intra-word spaces to "+".
 913     $word = "\%22$word\%22";    # And put quotes (%22) around it.
 914   }
 915
 916   return $word;
 917 }
 918
 919
 920 sub random_words($) {
 921   my ($sep) = @_;
 922   return (random_word() . $sep .
 923           random_word() . $sep .
 924           random_word() . $sep .
 925           random_word() . $sep .
 926           random_word());
 927 }
 928
 929
 930 sub url_quote($) {
 931   my ($s) = @_;
 932   $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
 933   return $s;
 934 }
 935
 936 sub url_unquote($) {
 937   my ($s) = @_;
 938   $s =~ s/[+]/ /g;
 939   $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
 940   return $s;
 941 }
 942
 943 sub html_quote($) {
 944   my ($s) = @_;
 945   $s =~ s/&/&amp;/gi;
 946   $s =~ s/</&lt;/gi;
 947   $s =~ s/>/&gt;/gi;
 948   $s =~ s/\"/&quot;/gi;
 949   return $s;
 950 }
 951
 952 sub html_unquote($) {
 953   my ($s) = @_;
 954   $s =~ s/(&([a-z]+);)/{ $entity_table{$2} || $1; }/gexi;  # e.g., &apos;
 955   $s =~ s/(&\#(\d+);)/{ chr($2) }/gexi;                    # e.g., &#39;
 956   return $s;
 957 }
 958
 959
 960 # Loads the given URL (a search on some search engine) and returns:
 961 # - the total number of hits the search engine claimed it had;
 962 # - a list of URLs from the page that the search engine returned;
 963 # Note that this list contains all kinds of internal search engine
 964 # junk URLs too -- caller must prune them.
 965 #
 966 sub pick_from_search_engine($$$) {
 967   my ( $timeout, $search_url, $words ) = @_;
 968
 969   $_ = $words;
 970   s/%20/ /g;
 971
 972   print STDERR "\n\n" if ($verbose_load);
 973
 974   LOG ($verbose_load, "words: $_");
 975   LOG ($verbose_load, "URL: $search_url");
 976
 977   $last_search = $search_url;   # for warnings
 978
 979   my $start = time;
 980   my ( $base, $body ) = get_document ($search_url, undef, $timeout);
 981   if (defined ($timeout)) {
 982     $timeout -= (time - $start);
 983     if ($timeout <= 0) {
 984       $body = undef;
 985       LOG (($verbose_net || $verbose_load),
 986            "timed out (late) for $search_url");
 987       $suppress_audit = 1;
 988       return ();
 989     }
 990   }
 991
 992   return () if (! $body);
 993
 994
 995   my @subpages;
 996
 997   my $search_count = "?";
 998   if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
 999     $search_count = $3;
1000   } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*)&nbsp;@i) {
1001     $search_count = $1;
1002   } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
1003     $search_count = $1;
1004   } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
1005     $search_count = $1;
1006   } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
1007     $search_count = $1;
1008   } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
1009     $search_count = $1;
1010   } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
1011     $search_count = $1;
1012   } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
1013     $search_count = $1;
1014   } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
1015     $search_count = $1;    # lycos
1016   } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
1017     $search_count = $1;                          # hotbot
1018   } elsif ($body =~ m@no photos were found containing@i) { # avimages
1019     $search_count = "0";
1020   } elsif ($body =~ m@found no document matching@i) { # avtext
1021     $search_count = "0";
1022   }
1023   1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);
1024
1025 #  if ($search_count eq "?" || $search_count eq "0") {
1026 #    my $file = "/tmp/wc.html";
1027 #    open (my $out, '>', $file) || error ("writing $file: $!");
1028 #    print $out $body;
1029 #    close $out;
1030 #    print STDERR  blurb() . "###### wrote $file\n";
1031 #  }
1032
1033
1034   my $length = length($body);
1035   my $href_count = 0;
1036
1037   $_ = $body;
1038
1039   s/[\r\n\t ]+/ /g;
1040
1041
1042   s/(<A )/\n$1/gi;
1043   foreach (split(/\n/)) {
1044     $href_count++;
1045     my ($u) = m@<A\s.*?\bHREF\s*=\s*([\"\'][^\"\'<>]+)@i;
1046     next unless $u;
1047     my ($u2) = m@<IMG\s.*\bSRC\s*=\s*[\"\']([^\"\'<>]+)@i;
1048
1049     if (m/\bm="\{(.*?)\}"/s) {          # Bing info is inside JSON crud
1050       my $json = html_unquote($1);
1051       my ($href) = ($json =~ m/\b(?:surl|purl)\"?:\s*"(.*?)"/s);
1052       my ($img)  = ($json =~ m/\b(?:imgurl|murl)\"?:\s*"(.*?)"/s);
1053       $u = "$img\t$href" if ($img && $href);
1054
1055     } elsif ($u2 && $u2 =~ m@://[^/]*\.gstatic\.com/@s) { $u = $u2;
1056                                                           $u =~ s/^\"|\"$//s;
1057
1058     } elsif ($u =~ m/^\"([^\"]*)\"/) { $u = $1   # quoted string
1059     } elsif ($u =~ m/^([^\s]*)\s/) { $u = $1;    # or token
1060     }
1061
1062     if ( $rejected_urls{$u} ) {
1063       LOG ($verbose_filter, "  pre-rejecting candidate: $u");
1064       next;
1065     }
1066
1067     LOG ($verbose_http, "    HREF: $u");
1068
1069     $subpages[++$#subpages] = $u;
1070   }
1071
1072   if ( $#subpages < 0 ) {
1073     LOG ($verbose_filter,
1074          "found nothing on $base ($length bytes, $href_count links).");
1075     return ();
1076   }
1077
1078   LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url");
1079
1080   return ($search_count, @subpages);
1081 }
1082
1083
1084 sub depoison(@) {
1085   my (@urls) = @_;
1086   my @urls2 = ();
1087   foreach (@urls) {
1088     my ($h) = m@^https?://([^/: \t\r\n]+)@i;
1089
1090     next unless defined($h);
1091
1092     if ($poisoners{$h}) {
1093       LOG (($verbose_filter), "  rejecting poisoner: $_");
1094       next;
1095     }
1096     if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
1097         $poisoners{$1}) {
1098       LOG (($verbose_filter), "  rejecting poisoner: $_");
1099       next;
1100     }
1101     if ($h =~ m@([^.]+\.[^.]+)$@ &&
1102         $poisoners{$1}) {
1103       LOG (($verbose_filter), "  rejecting poisoner: $_");
1104       next;
1105     }
1106
1107     push @urls2, $_;
1108   }
1109   return @urls2;
1110 }
1111
1112
1113 # given a list of URLs, picks one at random; loads it; and returns a
1114 # random image from it.
1115 # returns the url of the page loaded; the url of the image chosen.
1116 #
1117 sub pick_image_from_pages($$$$@) {
1118   my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;
1119
1120   $total_hit_count = "?" unless defined($total_hit_count);
1121
1122   @pages = depoison (@pages);
1123   LOG ($verbose_load,
1124        "" . ($#pages+1) . " candidates of $unfiltered_link_count links" .
1125        " ($total_hit_count total)");
1126
1127   return () if ($#pages < 0);
1128
1129   my $i = int(rand($#pages+1));
1130   my $page = $pages[$i];
1131
1132   LOG ($verbose_load, "picked page $page");
1133
1134   $suppress_audit = 1;
1135
1136   my ( $base2, $body2 ) = get_document ($page, $base, $timeout);
1137
1138   if (!$base2 || !$body2) {
1139     $body2 = undef;
1140     return ();
1141   }
1142
1143   my $img = pick_image_from_body ($base2, $body2);
1144   $body2 = undef;
1145
1146   if ($img) {
1147     return ($base2, $img);
1148   } else {
1149     return ();
1150   }
1151 }
1152
1153 \f
1154 #############################################################################
1155 ##
1156 ## Pick images from random pages returned by the Yahoo Random Link
1157 ##
1158 #############################################################################
1159 #
1160 ## yahoorand
1161 #my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";
1162 #
1163 #
1164 # Picks a random page; picks a random image on that page;
1165 # returns two URLs: the page containing the image, and the image.
1166 # Returns () if nothing found this time.
1167 #
1168 #sub pick_from_yahoo_random_link($) {
1169 #  my ($timeout) = @_;
1170 #
1171 #  print STDERR "\n\n" if ($verbose_load);
1172 #  LOG ($verbose_load, "URL: $yahoo_random_link");
1173 #
1174 #  $last_search = $yahoo_random_link;   # for warnings
1175 #
1176 #  $suppress_audit = 1;
1177 #
1178 #  my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
1179 #  if (!$base || !$body) {
1180 #    $body = undef;
1181 #    return;
1182 #  }
1183 #
1184 #  LOG ($verbose_load, "redirected to: $base");
1185 #
1186 #  my $img = pick_image_from_body ($base, $body);
1187 #  $body = undef;
1188 #
1189 #  if ($img) {
1190 #    return ($base, $img);
1191 #  } else {
1192 #    return ();
1193 #  }
1194 #}
1195
1196 \f
1197 ############################################################################
1198 #
1199 # Pick images from random pages returned by the Alta Vista Random Link
1200 # Note: this seems to have gotten a *lot* less random lately (2007).
1201 #
1202 ############################################################################
1203
1204 # altavista
1205 my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";
1206
1207
1208 # Picks a random page; picks a random image on that page;
1209 # returns two URLs: the page containing the image, and the image.
1210 # Returns () if nothing found this time.
1211 #
1212 sub pick_from_alta_vista_random_link($) {
1213   my ($timeout) = @_;
1214
1215   print STDERR "\n\n" if ($verbose_load);
1216   LOG ($verbose_load, "URL: $alta_vista_random_link");
1217
1218   $last_search = $alta_vista_random_link;   # for warnings
1219
1220   $suppress_audit = 1;
1221
1222   my ( $base, $body ) = get_document ($alta_vista_random_link,
1223                                       undef, $timeout);
1224   if (!$base || !$body) {
1225     $body = undef;
1226     return;
1227   }
1228
1229   LOG ($verbose_load, "redirected to: $base");
1230
1231   my $img = pick_image_from_body ($base, $body);
1232   $body = undef;
1233
1234   if ($img) {
1235     return ($base, $img);
1236   } else {
1237     return ();
1238   }
1239 }
1240
1241 \f
1242 ############################################################################
1243 #
1244 # Pick images by feeding random words into Alta Vista Image Search
1245 #
1246 ############################################################################
1247
1248
1249 my $alta_vista_images_url = "http://www.altavista.com/image/results" .
1250                             "?ipht=1" .       # photos
1251                             "&igrph=1" .      # graphics
1252                             "&iclr=1" .       # color
1253                             "&ibw=1" .        # b&w
1254                             "&micat=1" .      # no partner sites
1255                             "&sc=on" .        # "site collapse"
1256                             "&q=";
1257
1258 # avimages
1259 sub pick_from_alta_vista_images($) {
1260   my ($timeout) = @_;
1261
1262   my $words = random_word();
1263   my $page = (int(rand(9)) + 1);
1264   my $search_url = $alta_vista_images_url . $words;
1265
1266   if ($page > 1) {
1267     $search_url .= "&pgno=" . $page;            # page number
1268     $search_url .= "&stq=" . (($page-1) * 12);  # first hit result on page
1269   }
1270
1271   my ($search_hit_count, @subpages) =
1272     pick_from_search_engine ($timeout, $search_url, $words);
1273
1274   my @candidates = ();
1275   foreach my $u (@subpages) {
1276
1277     # avimages is encoding their URLs now.
1278     next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
1279     $u = url_unquote($u);
1280
1281     next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1282     next if ($u =~ m@[/.]altavista\.com\b@i);     # skip altavista builtins
1283     next if ($u =~ m@[/.]yahoo\.com\b@i);         # yahoo and av in cahoots?
1284     next if ($u =~ m@[/.]doubleclick\.net\b@i);   # you cretins
1285     next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins
1286
1287     next if ($u =~ m@[/.]viewimages\.com\b@i);    # stacked deck
1288     next if ($u =~ m@[/.]gettyimages\.com\b@i);
1289
1290     LOG ($verbose_filter, "  candidate: $u");
1291     push @candidates, $u;
1292   }
1293
1294   return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1295                                 $timeout, @candidates);
1296 }
1297
1298
1299 \f
1300 ############################################################################
1301 #
1302 # Pick images from Aptix security cameras
1303 # Cribbed liberally from google image search code.
1304 # By Jason Sullivan <jasonsul@us.ibm.com>
1305 #
1306 ############################################################################
1307
1308 my $aptix_images_url = ("http://www.google.com/search" .
1309                         "?q=inurl:%22jpg/image.jpg%3Fr%3D%22");
1310
1311 # securitycam
1312 sub pick_from_security_camera($) {
1313   my ($timeout) = @_;
1314
1315   my $page = (int(rand(9)) + 1);
1316   my $num = 20;                                 # 20 images per page
1317   my $search_url = $aptix_images_url;
1318
1319   if ($page > 1) {
1320     $search_url .= "&start=" . $page*$num;      # page number
1321     $search_url .= "&num="   . $num;            #images per page
1322   }
1323
1324   my ($search_hit_count, @subpages) =
1325     pick_from_search_engine ($timeout, $search_url, '');
1326
1327   my @candidates = ();
1328   my %referers;
1329   foreach my $u (@subpages) {
1330     next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins (most links)
1331     next unless ($u =~ m@jpg/image.jpg\?r=@i);    #  All pics contain this
1332
1333     LOG ($verbose_filter, "  candidate: $u");
1334     push @candidates, $u;
1335     $referers{$u} = $u;
1336     }
1337
1338   @candidates = depoison (@candidates);
1339   return () if ($#candidates < 0);
1340   my $i = int(rand($#candidates+1));
1341   my $img = $candidates[$i];
1342   my $ref = $referers{$img};
1343
1344   LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
1345   return ($ref, $img);
1346 }
1347
1348 \f
1349 ############################################################################
1350 #
1351 # Pick images by feeding random words into Google Image Search.
1352 # By Charles Gales <gales@us.ibm.com>
1353 #
1354 ############################################################################
1355
1356 my $google_images_url = 'https://www.google.com/search' .
1357                         '?source=lnms&tbm=isch&tbs=isz:l&q=';
1358
1359 # googleimgs
1360 sub pick_from_google_images($;$$) {
1361   my ($timeout, $words, $max_page) = @_;
1362
1363   if (!defined($words)) {
1364     $words = random_word();   # only one word for Google
1365   }
1366
1367   my $off = int(rand(40));
1368   my $search_url = $google_images_url . $words . "&start=" . $off;
1369
1370   my ($search_hit_count, @subpages) =
1371     pick_from_search_engine ($timeout, $search_url, $words);
1372
1373   my @candidates = ();
1374   foreach my $u (@subpages) {
1375     $u = html_unquote($u);
1376     # next if ($u =~ m@^https?://[^.]*\.(google|youtube)\.com/@s);
1377     next unless ($u =~ m@^https?://[^/]*\.gstatic\.com@s);
1378     LOG ($verbose_filter, "  candidate: $u");
1379     push @candidates, $u;
1380   }
1381
1382   @candidates = depoison (@candidates);
1383   return () if ($#candidates < 0);
1384   my $i = int(rand($#candidates+1));
1385   my $img = $candidates[$i];
1386
1387   LOG ($verbose_load, "picked image " . ($i+1) . ": $img");
1388   return ($img, $img);
1389 }
1390
1391
1392 \f
1393 ############################################################################
1394 #
1395 # Pick images by feeding random numbers into Google Image Search.
1396 # By jwz, suggested by Ian O'Donnell.
1397 #
1398 ############################################################################
1399
1400
1401 # googlenums
1402 sub pick_from_google_image_numbers($) {
1403   my ($timeout) = @_;
1404
1405   my $max = 9999;
1406   my $number = int(rand($max));
1407
1408   $number = sprintf("%04d", $number)
1409     if (rand() < 0.3);
1410
1411   pick_from_google_images ($timeout, "$number");
1412 }
1413
1414
1415 \f
1416 ############################################################################
1417 #
1418 # Pick images by feeding random digital camera file names into
1419 # Google Image Search.
1420 # By jwz, inspired by the excellent Random Personal Picture Finder
1421 # at http://www.diddly.com/random/
1422 # May 2017: Commented out a bunch of formats that have fallen out of favor.
1423 #
1424 ############################################################################
1425
1426 my @photomakers = (
1427   #
1428   # Common digital camera file name formats, as described at
1429   # http://www.diddly.com/random/about.html
1430   #
1431 # sub { sprintf ("dcp%05d.jpg",  int(rand(4000))); },   # Kodak
1432   sub { sprintf ("dsc%05d.jpg",  int(rand(4000))); },   # Nikon
1433   sub { sprintf ("dscn%04d.jpg", int(rand(4000))); },   # Nikon
1434 # sub { sprintf ("mvc-%03d.jpg", int(rand(999)));  },   # Sony Mavica
1435 # sub { sprintf ("mvc%05d.jpg",  int(rand(9999))); },   # Sony Mavica
1436 # sub { sprintf ("P101%04d.jpg", int(rand(9999))); },   # Olympus w/ date=101
1437 # sub { sprintf ("P%x%02d%04d.jpg",                     # Olympus
1438 #                int(rand(0xC)), int(rand(30))+1,
1439 #                rand(9999)); },
1440   sub { sprintf ("IMG_%03d.jpg",  int(rand(999))); },   # ?
1441 # sub { sprintf ("IMAG%04d.jpg",  int(rand(9999))); },  # RCA and Samsung
1442 # sub { my $n = int(rand(9999));                        # Canon
1443 #         sprintf ("1%02d-%04d.jpg", int($n/100), $n); },
1444 # sub { my $n = int(rand(9999));                        # Canon
1445 #         sprintf ("1%02d-%04d_IMG.jpg",
1446 #                  int($n/100), $n); },
1447   sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); },   # Canon
1448   sub { sprintf ("dscf%04d.jpg", int(rand(9999))); },   # Fuji Finepix
1449 # sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); },   # Toshiba PDR
1450 # sub { sprintf ("IM%06d.jpg", int(rand(9999))); },     # HP Photosmart
1451 # sub { sprintf ("EX%06d.jpg", int(rand(9999))); },     # HP Photosmart
1452 #  sub { my $n = int(rand(3));                          # Kodak DC-40,50,120
1453 #        sprintf ("DC%04d%s.jpg", int(rand(9999)),
1454 #                 $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); },
1455   sub { sprintf ("pict%04d.jpg", int(rand(9999))); },   # Minolta Dimage
1456 # sub { sprintf ("P%07d.jpg", int(rand(9999))); },      # Kodak DC290
1457 #  sub { sprintf ("%02d%02d%04d.jpg",                   # Casio QV3000, QV4000
1458 #                 int(rand(12))+1, int(rand(31))+1,
1459 #                 int(rand(999))); },
1460 #  sub { sprintf ("%02d%x%02d%04d.jpg",                 # Casio QV7000
1461 #                 int(rand(6)), # year
1462 #                 int(rand(12))+1, int(rand(31))+1,
1463 #                 int(rand(999))); },
1464   sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); },   # Pentax Optio S
1465 # sub { sprintf ("PANA%04d.jpg", int(rand(9999))); },   # Panasonic vid still
1466   sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); },   # HP Photosmart
1467 # sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); },   # ?
1468  );
1469
1470
1471 # googlephotos
1472 sub pick_from_google_image_photos($) {
1473   my ($timeout) = @_;
1474
1475   my $i = int(rand($#photomakers + 1));
1476   my $fn = $photomakers[$i];
1477   my $file = &$fn;
1478   #$file .= "%20filetype:jpg";
1479
1480   pick_from_google_images ($timeout, $file);
1481 }
1482
1483 \f
1484 ############################################################################
1485 #
1486 # Pick images by feeding random words into Google Image Search.
1487 # By the way: fuck Microsoft.
1488 #
1489 ############################################################################
1490
1491 my $bing_images_url =   "http://www.bing.com/images/async?q=";
1492
1493
1494 # bingimgs
1495 sub pick_from_bing_images($;$$) {
1496   my ($timeout, $words, $max_page) = @_;
1497
1498   if (!defined($words)) {
1499     $words = random_word();   # only one word for Bing
1500   }
1501
1502   my $off = int(rand(300));
1503   my $search_url = $bing_images_url . $words . "&first=" . $off;
1504
1505   my ($search_hit_count, @subpages) =
1506     pick_from_search_engine ($timeout, $search_url, $words);
1507
1508   my @candidates = ();
1509   my %referers;
1510   foreach my $u (@subpages) {
1511     my ($img, $ref) = ($u =~ m/^(.*?)\t(.*)$/s);
1512     next unless $img;
1513     LOG ($verbose_filter, "  candidate: $ref");
1514     push @candidates, $img;
1515     $referers{$img} = $ref;
1516   }
1517
1518   @candidates = depoison (@candidates);
1519   return () if ($#candidates < 0);
1520   my $i = int(rand($#candidates+1));
1521   my $img = $candidates[$i];
1522   my $ref = $referers{$img};
1523
1524   LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
1525   return ($ref, $img);
1526 }
1527
1528
1529
1530 \f
1531 ############################################################################
1532 #
1533 # Pick images by feeding random numbers into Bing Image Search.
1534 #
1535 ############################################################################
1536
1537 # bingnums
1538 sub pick_from_bing_image_numbers($) {
1539   my ($timeout) = @_;
1540
1541   my $max = 9999;
1542   my $number = int(rand($max));
1543
1544   $number = sprintf("%04d", $number)
1545     if (rand() < 0.3);
1546
1547   pick_from_bing_images ($timeout, "$number");
1548 }
1549
1550 \f
1551 ############################################################################
1552 #
1553 # Pick images by feeding random numbers into Bing Image Search.
1554 #
1555 ############################################################################
1556
1557 # bingphotos
1558 sub pick_from_bing_image_photos($) {
1559   my ($timeout) = @_;
1560
1561   my $i = int(rand($#photomakers + 1));
1562   my $fn = $photomakers[$i];
1563   my $file = &$fn;
1564
1565   pick_from_bing_images ($timeout, $file);
1566 }
1567
1568 \f
1569 ############################################################################
1570 #
1571 # Pick images by feeding random words into Alta Vista Text Search
1572 #
1573 ############################################################################
1574
1575
1576 my $alta_vista_url = "http://www.altavista.com/web/results" .
1577                      "?pg=aq" .
1578                      "&aqmode=s" .
1579                      "&filetype=html" .
1580                      "&sc=on" .        # "site collapse"
1581                      "&nbq=50" .
1582                      "&aqo=";
1583
1584 # avtext
1585 sub pick_from_alta_vista_text($) {
1586   my ($timeout) = @_;
1587
1588   my $words = random_words('%20');
1589   my $page = (int(rand(9)) + 1);
1590   my $search_url = $alta_vista_url . $words;
1591
1592   if ($page > 1) {
1593     $search_url .= "&pgno=" . $page;
1594     $search_url .= "&stq=" . (($page-1) * 10);
1595   }
1596
1597   my ($search_hit_count, @subpages) =
1598     pick_from_search_engine ($timeout, $search_url, $words);
1599
1600   my @candidates = ();
1601   foreach my $u (@subpages) {
1602
1603     # Those altavista fuckers are playing really nasty redirection games
1604     # these days: the filter your clicks through their site, but use
1605     # onMouseOver to make it look like they're not!  Well, it makes it
1606     # easier for us to identify search results...
1607     #
1608     next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
1609     $u = url_unquote($u);
1610
1611     next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1612     next if ($u =~ m@[/.]altavista\.com\b@i);     # skip altavista builtins
1613     next if ($u =~ m@[/.]yahoo\.com\b@i);         # yahoo and av in cahoots?
1614
1615     LOG ($verbose_filter, "  candidate: $u");
1616     push @candidates, $u;
1617   }
1618
1619   return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1620                                 $timeout, @candidates);
1621 }
1622
1623
1624 \f
1625 ############################################################################
1626 #
1627 # Pick images by feeding random words into Hotbot
1628 #
1629 ############################################################################
1630
1631 my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" .
1632                         "?ca=w" .
1633                         "&descriptiontype=0" .
1634                         "&imagetoggle=1" .
1635                         "&matchmode=any" .
1636                         "&nummod=2" .
1637                         "&recordcount=50" .
1638                         "&sitegroup=1" .
1639                         "&stem=1" .
1640                         "&cobrand=undefined" .
1641                         "&query=");
1642
1643 sub pick_from_hotbot_text($) {
1644   my ($timeout) = @_;
1645
1646   $last_search = $hotbot_search_url;   # for warnings
1647
1648   # lycos seems to always give us back dictionaries and word lists if
1649   # we search for more than one word...
1650   #
1651   my $words = random_word();
1652
1653   my $start = int(rand(8)) * 10 + 1;
1654   my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";
1655
1656   my ($search_hit_count, @subpages) =
1657     pick_from_search_engine ($timeout, $search_url, $words);
1658
1659   my @candidates = ();
1660   foreach my $u (@subpages) {
1661
1662     # Hotbot plays redirection games too
1663     # (not any more?)
1664 #    next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
1665 #    $u = url_decode($1);
1666
1667     next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1668     next if ($u =~ m@[/.]hotbot\.com\b@i);     # skip hotbot builtins
1669     next if ($u =~ m@[/.]lycos\.com\b@i);      # skip hotbot builtins
1670     next if ($u =~ m@[/.]inktomi\.com\b@i);    # skip hotbot builtins
1671
1672     LOG ($verbose_filter, "  candidate: $u");
1673     push @candidates, $u;
1674   }
1675
1676   return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1677                                 $timeout, @candidates);
1678 }
1679
1680
1681 \f
1682 ############################################################################
1683 #
1684 # Pick images by feeding random words into Lycos
1685 #
1686 ############################################################################
1687
1688 my $lycos_search_url = "http://search.lycos.com/default.asp" .
1689                        "?lpv=1" .
1690                        "&loc=searchhp" .
1691                        "&tab=web" .
1692                        "&query=";
1693
1694 sub pick_from_lycos_text($) {
1695   my ($timeout) = @_;
1696
1697   $last_search = $lycos_search_url;   # for warnings
1698
1699   # lycos seems to always give us back dictionaries and word lists if
1700   # we search for more than one word...
1701   #
1702   my $words = random_word();
1703
1704   my $start = int(rand(8)) * 10 + 1;
1705   my $search_url = $lycos_search_url . $words . "&first=$start&page=more";
1706
1707   my ($search_hit_count, @subpages) =
1708     pick_from_search_engine ($timeout, $search_url, $words);
1709
1710   my @candidates = ();
1711   foreach my $u (@subpages) {
1712
1713     # Lycos plays redirection games.
1714     # (not any more?)
1715 #    next unless ($u =~ m@^https?://click.lycos.com/director.asp
1716 #                         .*
1717 #                         \btarget=([^&]+)
1718 #                         .*
1719 #                        @x);
1720 #    $u = url_decode($1);
1721
1722     next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1723     next if ($u =~ m@[/.]hotbot\.com\b@i);     # skip lycos builtins
1724     next if ($u =~ m@[/.]lycos\.com\b@i);      # skip lycos builtins
1725     next if ($u =~ m@[/.]terralycos\.com\b@i); # skip lycos builtins
1726     next if ($u =~ m@[/.]inktomi\.com\b@i);    # skip lycos builtins
1727
1728
1729     LOG ($verbose_filter, "  candidate: $u");
1730     push @candidates, $u;
1731   }
1732
1733   return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1734                                 $timeout, @candidates);
1735 }
1736
1737
1738 \f
1739 ############################################################################
1740 #
1741 # Pick images by feeding random words into news.yahoo.com
1742 #
1743 ############################################################################
1744
1745 my $yahoo_news_url = "http://news.search.yahoo.com/search/news" .
1746                      "?c=news_photos" .
1747                      "&p=";
1748
1749 # yahoonews
1750 sub pick_from_yahoo_news_text($) {
1751   my ($timeout) = @_;
1752
1753   $last_search = $yahoo_news_url;   # for warnings
1754
1755   my $words = random_word();
1756   my $search_url = $yahoo_news_url . $words;
1757
1758   my ($search_hit_count, @subpages) =
1759     pick_from_search_engine ($timeout, $search_url, $words);
1760
1761   my @candidates = ();
1762   foreach my $u (@subpages) {
1763
1764     # de-redirectize the URLs
1765     $u =~ s@^https?://rds\.yahoo\.com/.*-http%3A@http:@s;
1766
1767     # only accept URLs on Yahoo's news site
1768     next unless ($u =~ m@^https?://dailynews\.yahoo\.com/@i ||
1769                  $u =~ m@^https?://story\.news\.yahoo\.com/@i);
1770     next unless ($u =~ m@&u=/@);
1771
1772     LOG ($verbose_filter, "  candidate: $u");
1773     push @candidates, $u;
1774   }
1775
1776   return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1777                                 $timeout, @candidates);
1778 }
1779
1780
1781 \f
1782 ############################################################################
1783 #
1784 # Pick images from LiveJournal's list of recently-posted images.
1785 #
1786 ############################################################################
1787
1788 my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml";
1789
1790 # With most of our image sources, we get a random page and then select
1791 # from the images on it.  However, in the case of LiveJournal, the page
1792 # of images tends to update slowly; so we'll remember the last N entries
1793 # on it and randomly select from those, to get a wider variety each time.
1794
1795 my $lj_cache_size = 1000;
1796 my @lj_cache = (); # fifo, for ordering by age
1797 my %lj_cache = (); # hash, for detecting dups
1798
1799 # livejournal
1800 sub pick_from_livejournal_images($) {
1801   my ($timeout) = @_;
1802
1803   $last_search = $livejournal_img_url;   # for warnings
1804
1805   my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);
1806
1807   # Often the document comes back empty. If so, just use the cache.
1808   # return () unless $body;
1809   $body = '' unless defined($body);
1810
1811   $body =~ s/\n/ /gs;
1812   $body =~ s/(<recent-image)\b/\n$1/gsi;
1813
1814   foreach (split (/\n/, $body)) {
1815     next unless (m/^<recent-image\b/);
1816     next unless (m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
1817     my $img = html_unquote ($1);
1818
1819     next if ($lj_cache{$img}); # already have it
1820
1821     next unless (m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
1822     my $page = html_unquote ($1);
1823     my @pair = ($img, $page);
1824     LOG ($verbose_filter, "  candidate: $img");
1825     push @lj_cache, \@pair;
1826     $lj_cache{$img} = \@pair;
1827   }
1828
1829   return () if ($#lj_cache == -1);
1830
1831   my $n = $#lj_cache+1;
1832   my $i = int(rand($n));
1833   my ($img, $page) = @{$lj_cache[$i]};
1834
1835   # delete this one from @lj_cache and from %lj_cache.
1836   #
1837   @lj_cache = ( @lj_cache[0 .. $i-1],
1838                 @lj_cache[$i+1 .. $#lj_cache] );
1839   delete $lj_cache{$img};
1840
1841   # Keep the size of the cache under the limit by nuking older entries
1842   #
1843   while ($#lj_cache >= $lj_cache_size) {
1844     my $pairP = shift @lj_cache;
1845     my $img = $pairP->[0];
1846     delete $lj_cache{$img};
1847   }
1848
1849   LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
1850
1851   return ($page, $img);
1852 }
1853
1854 \f
1855 ############################################################################
1856 #
1857 # Pick images from ircimages.com (images that have been in the /topic of
1858 # various IRC channels.)
1859 #
1860 ############################################################################
1861
1862 my $ircimages_url = "http://ircimages.com/";
1863
1864 # ircimages
1865 sub pick_from_ircimages($) {
1866   my ($timeout) = @_;
1867
1868   $last_search = $ircimages_url;   # for warnings
1869
1870   my $n = int(rand(2900));
1871   my $search_url = $ircimages_url . "page-$n";
1872
1873   my ( $base, $body ) = get_document ($search_url, undef, $timeout);
1874   return () unless $body;
1875
1876   my @candidates = ();
1877
1878   $body =~ s/\n/ /gs;
1879   $body =~ s/(<A)\b/\n$1/gsi;
1880
1881   foreach (split (/\n/, $body)) {
1882
1883     my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
1884     next unless $u;
1885
1886     if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
1887     elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; }  # or token
1888
1889     next unless ($u =~ m/^https?:/i);
1890     next if ($u =~ m@^https?://(searchirc\.com\|ircimages\.com)@i);
1891     next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);
1892
1893     LOG ($verbose_http, "    HREF: $u");
1894     push @candidates, $u;
1895   }
1896
1897   LOG ($verbose_filter, "" . $#candidates+1 . " links on $search_url");
1898
1899   return () if ($#candidates == -1);
1900
1901   my $i = int(rand($#candidates+1));
1902   my $img = $candidates[$i];
1903
1904   LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#candidates+1) .
1905        ": $img");
1906
1907   $search_url = $img;  # hmm...
1908   return ($search_url, $img);
1909 }
1910
1911 \f
1912 ############################################################################
1913 #
1914 # Pick images from Twitpic's list of recently-posted images.
1915 #
1916 ############################################################################
1917
1918 my $twitpic_img_url = "http://twitpic.com/public_timeline/feed.rss";
1919
1920 # With most of our image sources, we get a random page and then select
1921 # from the images on it.  However, in the case of Twitpic, the page
1922 # of images tends to update slowly; so we'll remember the last N entries
1923 # on it and randomly select from those, to get a wider variety each time.
1924
1925 my $twitpic_cache_size = 1000;
1926 my @twitpic_cache = (); # fifo, for ordering by age
1927 my %twitpic_cache = (); # hash, for detecting dups
1928
1929 # twitpic
1930 sub pick_from_twitpic_images($) {
1931   my ($timeout) = @_;
1932
1933   $last_search = $twitpic_img_url;   # for warnings
1934
1935   my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout);
1936
1937   # Update the cache.
1938
1939   if ($body) {
1940     $body =~ s/\n/ /gs;
1941     $body =~ s/(<item)\b/\n$1/gsi;
1942
1943     my @items = split (/\n/, $body);
1944     shift @items;
1945     foreach (@items) {
1946       next unless (m@<link>([^<>]*)</link>@si);
1947       my $page = html_unquote ($1);
1948
1949       $page =~ s@/$@@s;
1950       $page .= '/full';
1951
1952       next if ($twitpic_cache{$page}); # already have it
1953
1954       LOG ($verbose_filter, "  candidate: $page");
1955       push @twitpic_cache, $page;
1956       $twitpic_cache{$page} = $page;
1957     }
1958   }
1959
1960   # Pull from the cache.
1961
1962   return () if ($#twitpic_cache == -1);
1963
1964   my $n = $#twitpic_cache+1;
1965   my $i = int(rand($n));
1966   my $page = $twitpic_cache[$i];
1967
1968   # delete this one from @twitpic_cache and from %twitpic_cache.
1969   #
1970   @twitpic_cache = ( @twitpic_cache[0 .. $i-1],
1971                      @twitpic_cache[$i+1 .. $#twitpic_cache] );
1972   delete $twitpic_cache{$page};
1973
1974   # Keep the size of the cache under the limit by nuking older entries
1975   #
1976   while ($#twitpic_cache >= $twitpic_cache_size) {
1977     my $page = shift @twitpic_cache;
1978     delete $twitpic_cache{$page};
1979   }
1980
1981   ( $base, $body ) = get_document ($page, undef, $timeout);
1982   my $img = undef;
1983   $body = '' unless defined($body);
1984
1985   foreach (split (/<img\s+/, $body)) {
1986     my ($src) = m/\bsrc=[\"\'](.*?)[\"\']/si;
1987     next unless $src;
1988     next if m@/js/@s;
1989     next if m@/images/@s;
1990
1991     $img = $src;
1992
1993     $img = "http:$img" if ($img =~ m@^//@s);  # Oh come on
1994
1995     # Sometimes these images are hosted on twitpic, sometimes on Amazon.
1996     if ($img =~ m@^/@) {
1997       $base =~ s@^(https?://[^/]+)/.*@$1@s;
1998       $img = $base . $img;
1999     }
2000     last;
2001   }
2002
2003   if (!$img) {
2004     LOG ($verbose_load, "no matching images on $page\n");
2005     return ();
2006   }
2007
2008   LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2009
2010   return ($page, $img);
2011 }
2012
2013 \f
2014 ############################################################################
2015 #
2016 # Pick images from Twitter's list of recently-posted updates.
2017 #
2018 ############################################################################
2019
2020 # With most of our image sources, we get a random page and then select
2021 # from the images on it.  However, in the case of Twitter, the page
2022 # of images only updates once a minute; so we'll remember the last N entries
2023 # on it and randomly select from those, to get a wider variety each time.
2024
2025 my $twitter_img_url = "http://api.twitter.com/1/statuses/" .
2026                       "public_timeline.json" .
2027                       "?include_entities=true" .
2028                       "&include_rts=true" .
2029                       "&count=200";
2030
2031 my $twitter_cache_size = 1000;
2032
2033 my @twitter_cache = (); # fifo, for ordering by age
2034 my %twitter_cache = (); # hash, for detecting dups
2035
2036
2037 # twitter
2038 sub pick_from_twitter_images($) {
2039   my ($timeout) = @_;
2040
2041   $last_search = $twitter_img_url;   # for warnings
2042
2043   my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout);
2044   # Update the cache.
2045
2046   if ($body) {
2047     $body =~ s/[\r\n]+/ /gs;
2048
2049     # Parsing JSON is a pain in the ass.  So we halfass it as usual.
2050     $body =~ s/^\[|\]$//s;
2051     $body =~ s/(\[.*?\])/{ $_ = $1; s@\},@\} @gs; $_; }/gsexi;
2052     my @items = split (/\},\{/, $body);
2053     foreach (@items) {
2054       my ($name) = m@"screen_name":"([^\"]+)"@si;
2055       my ($img)  = m@"media_url":"([^\"]+)"@si;
2056       my ($page) = m@"display_url":"([^\"]+)"@si;
2057       next unless ($name && $img && $page);
2058       foreach ($img, $page) {
2059         s/\\//gs;
2060         $_ = "http://$_" unless (m/^http/si);
2061       }
2062
2063       next if ($twitter_cache{$page}); # already have it
2064
2065       LOG ($verbose_filter, "  candidate: $page - $img");
2066       push @twitter_cache, $page;
2067       $twitter_cache{$page} = $img;
2068     }
2069   }
2070
2071   # Pull from the cache.
2072
2073   return () if ($#twitter_cache == -1);
2074
2075   my $n = $#twitter_cache+1;
2076   my $i = int(rand($n));
2077   my $page = $twitter_cache[$i];
2078   my $url  = $twitter_cache{$page};
2079
2080   # delete this one from @twitter_cache and from %twitter_cache.
2081   #
2082   @twitter_cache = ( @twitter_cache[0 .. $i-1],
2083                      @twitter_cache[$i+1 .. $#twitter_cache] );
2084   delete $twitter_cache{$page};
2085
2086   # Keep the size of the cache under the limit by nuking older entries
2087   #
2088   while ($#twitter_cache >= $twitter_cache_size) {
2089     my $page = shift @twitter_cache;
2090     delete $twitter_cache{$page};
2091   }
2092
2093   LOG ($verbose_load, "picked page $url");
2094
2095   $suppress_audit = 1;
2096
2097   return ($page, $url);
2098 }
2099
2100 \f
2101 ############################################################################
2102 #
2103 # Pick images from Flickr's page of recently-posted photos.
2104 #
2105 ############################################################################
2106
2107 my $flickr_img_url = "http://www.flickr.com/explore/";
2108
2109 # Like LiveJournal, the Flickr page of images tends to update slowly,
2110 # so remember the last N entries on it and randomly select from those.
2111
2112 # I know that Flickr has an API (http://www.flickr.com/services/api/)
2113 # but it was easy enough to scrape the HTML, so I didn't bother exploring.
2114
2115 my $flickr_cache_size = 1000;
2116 my @flickr_cache = (); # fifo, for ordering by age
2117 my %flickr_cache = (); # hash, for detecting dups
2118
2119
2120 # flickr_recent
2121 sub pick_from_flickr_recent($) {
2122   my ($timeout) = @_;
2123
2124   my $start = 16 * int(rand(100));
2125
2126   $last_search = $flickr_img_url;   # for warnings
2127   $last_search .= "?start=$start" if ($start > 0);
2128
2129   my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2130
2131   # If the document comes back empty. just use the cache.
2132   # return () unless $body;
2133   $body = '' unless defined($body);
2134
2135   my $count = 0;
2136   my $count2 = 0;
2137
2138   if ($body =~ m@{ *"_data": \[ ( .*? \} ) \]@six) {
2139     $body = $1;
2140   } else {
2141     LOG ($verbose_load, "flickr unparsable: $last_search");
2142     return ();
2143   }
2144
2145   $body =~ s/[\r\n]/ /gs;
2146   $body =~ s/(\},) *(\{)/$1\n$2/gs;     # "_flickrModelRegistry"
2147
2148   foreach my $chunk (split (/\n/, $body)) {
2149     my ($img) = ($chunk =~ m@"displayUrl": *"(.*?)"@six);
2150     next unless defined ($img);
2151     $img =~ s/\\//gs;
2152     $img = "//" unless ($img =~ m@^/@s);
2153     $img = "http:$img" unless ($img =~ m/^http/s);
2154
2155     my ($user) = ($chunk =~ m/"pathAlias": *"(.*?)"/si);
2156     next unless defined ($user);
2157
2158     my ($id) = ($img =~ m@/\d+/(\d+)_([\da-f]+)_@si);
2159     my ($page) = "https://www.flickr.com/photos/$user/$id/";
2160
2161     # $img =~ s/_[a-z](\.[a-z\d]+)$/$1/si;  # take off "thumb" suffix
2162
2163     $count++;
2164     next if ($flickr_cache{$img}); # already have it
2165
2166     my @pair = ($img, $page, $start);
2167     LOG ($verbose_filter, "  candidate: $img");
2168     push @flickr_cache, \@pair;
2169     $flickr_cache{$img} = \@pair;
2170     $count2++;
2171   }
2172
2173   return () if ($#flickr_cache == -1);
2174
2175   my $n = $#flickr_cache+1;
2176   my $i = int(rand($n));
2177   my ($img, $page) = @{$flickr_cache[$i]};
2178
2179   # delete this one from @flickr_cache and from %flickr_cache.
2180   #
2181   @flickr_cache = ( @flickr_cache[0 .. $i-1],
2182                     @flickr_cache[$i+1 .. $#flickr_cache] );
2183   delete $flickr_cache{$img};
2184
2185   # Keep the size of the cache under the limit by nuking older entries
2186   #
2187   while ($#flickr_cache >= $flickr_cache_size) {
2188     my $pairP = shift @flickr_cache;
2189     my $img = $pairP->[0];
2190     delete $flickr_cache{$img};
2191   }
2192
2193   LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2194
2195   return ($page, $img);
2196 }
2197
2198 \f
2199 ############################################################################
2200 #
2201 # Pick images from a random RSS feed on Flickr.
2202 #
2203 ############################################################################
2204
2205 my $flickr_rss_base = ("http://www.flickr.com/services/feeds/" .
2206                        "photos_public.gne" .
2207                        "?format=rss_200_enc&tagmode=any&tags=");
2208
2209 # Picks a random RSS feed; picks a random image from that feed;
2210 # returns 2 URLs: the page containing the image, and the image.
2211 # Mostly by Joe Mcmahon <mcmahon@yahoo-inc.com>
2212 #
2213 # flickr_random
2214 sub pick_from_flickr_random($) {
2215   my $timeout = shift;
2216
2217   my $words = random_words(',');
2218   my $rss = $flickr_rss_base . $words;
2219   $last_search = $rss;
2220
2221   $_ = $words;
2222   s/,/ /g;
2223
2224   print STDERR "\n\n" if ($verbose_load);
2225   LOG ($verbose_load, "words: $_");
2226   LOG ($verbose_load, "URL: $last_search");
2227
2228   $suppress_audit = 1;
2229
2230   my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2231   if (!$base || !$body) {
2232     $body = undef;
2233     return;
2234   }
2235
2236   my $img;
2237   ($base, $img) = pick_image_from_rss ($base, $body);
2238   $body = undef;
2239   return () unless defined ($img);
2240
2241   LOG ($verbose_load, "redirected to: $base");
2242   return ($base, $img);
2243 }
2244
2245 \f
2246 ############################################################################
2247 #
2248 # Pick random images from Instagram.
2249 #
2250 ############################################################################
2251
2252 my $instagram_url_base = "https://api.instagram.com/v1/media/popular";
2253
2254 # instagram_random
2255 sub pick_from_instagram($) {
2256   my $timeout = shift;
2257
2258   # Liberated access tokens.
2259   # jsdo.it search for: instagram client_id
2260   # Google search for: instagram "&client_id=" site:jsfiddle.net
2261   my @tokens = (#'b59fbe4563944b6c88cced13495c0f49', # gramfeed.com
2262                 #'fa26679250df49c48a33fbcf30aae989', # instac.at
2263                 #'d9494686198d4dfeb954979a3e270e5e', # iconosquare.com
2264                 #'793ef48bb18e4197b61afce2d799b81c', # jsdo.it
2265                 #'67b8a3e0073449bba70600d0fc68e6cb', # jsdo.it
2266                 #'26a098e0df4d4b9ea8b4ce6c505b7742', # jsdo.it
2267                 #'2437cbcd906a4c10940f990d283d3cd5', # jsdo.it
2268                 #'191c7d7d5312464cbd92134f36ffdab5', # jsdo.it
2269                 #'acfec809437b4340b2c38f66503af774', # jsdo.it
2270                 #'e9f77604a3a24beba949c12d18130988', # jsdo.it
2271                 #'2cd7bcf68ae346529770073d311575b3', # jsdo.it
2272                 #'830c600fe8d742e2ab3f3b94f9bb22b7', # jsdo.it
2273                 #'55865a0397ad41e5997dd95ef4df8da1', # jsdo.it
2274                 #'192a5742f3644ea8bed1d25e439286a8', # jsdo.it
2275                 #'38ed1477e7a44595861b8842cdb8ba23', # jsdo.it
2276                 #'e52f79f645f54488ad0cc47f6f55ade6', # jsfiddle.net
2277                 );
2278
2279   my $tok = $tokens[int(rand($#tokens+1))];
2280   $last_search = $instagram_url_base . "?client_id=" . $tok;
2281
2282   print STDERR "\n\n" if ($verbose_load);
2283   LOG ($verbose_load, "URL: $last_search");
2284
2285   my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2286   if (!$base || !$body) {
2287     $body = undef;
2288     return;
2289   }
2290
2291   $body =~ s/("link")/\001$1/gs;
2292   my @chunks = split(/\001/, $body);
2293   shift @chunks;
2294   my @urls = ();
2295   foreach (@chunks) {
2296     s/\\//gs;
2297     my ($url) = m/"link":\s*"(.*?)"/s;
2298     my ($img) = m/"standard_resolution":\{"url":\s*"(.*?)"/s;
2299        ($img) = m/"url":\s*"(.*?)"/s unless $url;
2300     next unless ($url && $img);
2301     push @urls, [ $url, $img ];
2302   }
2303
2304   if ($#urls < 0) {
2305     LOG ($verbose_load, "no images on $last_search");
2306     return ();
2307   }
2308
2309   my $i = int(rand($#urls+1));
2310   my ($url, $img) = @{$urls[$i]};
2311
2312   LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
2313   return ($url, $img);
2314 }
2315
2316 \f
2317 ############################################################################
2318 #
2319 # Pick images from Imgur.
2320 #
2321 ############################################################################
2322
2323 my $imgur_base = 'http://imgur.com/search?qs=thumb&q_any=';
2324
2325 sub pick_from_imgur($) {
2326   my $timeout = shift;
2327
2328   my $words = random_words('%20');
2329   $last_search = $imgur_base . $words;
2330
2331   $_ = $words;
2332   s/%20/ /g;
2333
2334   print STDERR "\n\n" if ($verbose_load);
2335   LOG ($verbose_load, "words: $_");
2336   LOG ($verbose_load, "URL: $last_search");
2337
2338   $suppress_audit = 1;
2339
2340   my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2341   if (!$base || !$body) {
2342     $body = undef;
2343     return;
2344   }
2345
2346   my @imgs = ($body =~ m@\bHREF=[\"\']([^\'\"<>]*/gallery/[^\'\"<>]+)@gsi);
2347   return () unless @imgs;
2348
2349   my $n = @imgs;
2350   my $i = int(rand($n));
2351   my $page = $imgs[$i];
2352   $page =~ s/[?&].*$//s;
2353   $page = "http://imgur.com$page" if ($page =~ m@^/@s);
2354
2355   my ($id) = ($page =~ m@([^/?&]+)$@s);
2356   my $img = "http://i.imgur.com/$id.jpg";
2357
2358   LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2359
2360   return ($page, $img);
2361 }
2362
2363 \f
2364 ############################################################################
2365 #
2366 # Pick images from Tumblr.
2367 #
2368 ############################################################################
2369
2370 my $tumblr_base = 'https://www.tumblr.com/search/';
2371
2372 sub pick_from_tumblr($) {
2373   my $timeout = shift;
2374
2375   # Tumblr doesn't have an "or" search, which means our vocabulary is
2376   # a bit too extensive to work well...
2377
2378   my $words = random_word();
2379   $last_search = $tumblr_base . $words;
2380
2381   print STDERR "\n\n" if ($verbose_load);
2382   LOG ($verbose_load, "words: $words");
2383   LOG ($verbose_load, "URL: $last_search");
2384
2385   $suppress_audit = 1;
2386
2387   my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2388   if (!$base || !$body) {
2389     $body = undef;
2390     return;
2391   }
2392
2393   my @imgs0 = ($body =~ m@<IMG\b([^<>]*)>@gsi);
2394   return () unless @imgs0;
2395   my @imgs;
2396   foreach my $img (@imgs0) {
2397     my ($src)  = ($img =~ m@\bsrc=[\"\'](.*?)[\"\']@si);
2398     my ($href) = ($img =~ m@\bdata-pin-url=[\"\'](.*?)[\"\']@si);
2399     next unless ($src && $href);
2400     next if ($src =~ m/^data:/s);
2401     foreach ($src, $href) { $_ = "http://www.tumblr.com$_" if (m@^/@s); }
2402     push @imgs, [$href, $src];
2403   }
2404   return () unless @imgs;
2405
2406   my $n = @imgs;
2407   my $i = int(rand($n));
2408   my $page = $imgs[$i]->[0];
2409   my $img  = $imgs[$i]->[1];
2410
2411   LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2412
2413   return ($page, $img);
2414 }
2415
2416 \f
2417 ############################################################################
2418 #
2419 # Pick images by waiting for driftnet to populate a temp dir with files.
2420 # Requires driftnet version 0.1.5 or later.
2421 # (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet
2422 # for images being downloaded by others.)
2423 # Driftnet/webcollage integration by jwz.
2424 #
2425 ############################################################################
2426
2427 # driftnet
2428 sub pick_from_driftnet($) {
2429   my ($timeout) = @_;
2430
2431   my $id = $driftnet_magic;
2432   my $dir = $driftnet_dir;
2433   my $start = time;
2434   my $now;
2435
2436   error ("\$driftnet_dir unset?") unless ($dir);
2437   $dir =~ s@/+$@@;
2438
2439   error ("$dir unreadable") unless (-d "$dir/.");
2440
2441   $timeout = $http_timeout unless ($timeout);
2442   $last_search = $id;
2443
2444   while ($now = time, $now < $start + $timeout) {
2445     opendir (my $dir, $dir) || error ("$dir: $!");
2446     while (my $file = readdir($dir)) {
2447       next if ($file =~ m/^\./);
2448       $file = "$dir/$file";
2449       closedir ($dir);
2450       LOG ($verbose_load, "picked file $file ($id)");
2451       return ($id, $file);
2452     }
2453     closedir ($dir);
2454   }
2455   LOG (($verbose_net || $verbose_load), "timed out for $id");
2456   return ();
2457 }
2458
2459
2460 sub get_driftnet_file($) {
2461   my ($file) = @_;
2462
2463   error ("\$driftnet_dir unset?") unless ($driftnet_dir);
2464
2465   my $id = $driftnet_magic;
2466   error ("$id: $file not in $driftnet_dir?")
2467     unless ($file =~ m@^\Q$driftnet_dir@o);
2468
2469   open (my $in, '<', $file) || error ("$id: $file: $!");
2470   my $body = '';
2471   local $/ = undef;  # read entire file
2472   $body = <$in>;
2473   close ($in) || error ("$id: $file: $!");
2474   unlink ($file) || error ("$id: $file: rm: $!");
2475   return ($id, $body);
2476 }
2477
2478
2479 sub spawn_driftnet($) {
2480   my ($cmd) = @_;
2481
2482   # make a directory to use.
2483   while (1) {
2484     my $tmp = $ENV{TEMPDIR} || "/tmp";
2485     $driftnet_dir = sprintf ("$tmp/driftcollage-%08x", rand(0xffffffff));
2486     LOG ($verbose_exec, "mkdir $driftnet_dir");
2487     last if mkdir ($driftnet_dir, 0700);
2488   }
2489
2490   if (! ($cmd =~ m/\s/)) {
2491     # if the command didn't have any arguments in it, then it must be just
2492     # a pointer to the executable.  Append the default args to it.
2493     my $dargs = $default_driftnet_cmd;
2494     $dargs =~ s/^[^\s]+//;
2495     $cmd .= $dargs;
2496   }
2497
2498   # point the driftnet command at our newly-minted private directory.
2499   #
2500   $cmd .= " -d $driftnet_dir";
2501   $cmd .= ">/dev/null" unless ($verbose_exec);
2502
2503   my $pid = fork();
2504   if ($pid < 0) { error ("fork: $!\n"); }
2505   if ($pid) {
2506     # parent fork
2507     push @pids_to_kill, $pid;
2508     LOG ($verbose_exec, "forked for \"$cmd\"");
2509   } else {
2510     # child fork
2511     nontrapping_system ($cmd) || error ("exec: $!");
2512   }
2513
2514   # wait a bit, then make sure the process actually started up.
2515   #
2516   sleep (1);
2517   error ("pid $pid failed to start \"$cmd\"")
2518     unless (1 == kill (0, $pid));
2519 }
2520
2521 # local-directory
2522 sub pick_from_local_dir($) {
2523   my ($timeout) = @_;
2524
2525   my $id = $local_magic;
2526   $last_search = $id;
2527
2528   my $dir = $local_dir;
2529   error ("\$local_dir unset?") unless ($dir);
2530   $dir =~ s@/+$@@;
2531
2532   error ("$dir unreadable") unless (-d "$dir/.");
2533
2534   my $v = ($verbose_exec ? "-v" : "");
2535   my $pick = `xscreensaver-getimage-file $v "$dir"`;
2536   $pick =~ s/\s+$//s;
2537   $pick = "$dir/$pick" unless ($pick =~ m@^/@s);       # relative path
2538
2539   LOG ($verbose_load, "picked file $pick ($id)");
2540   return ($id, $pick);
2541 }
2542
2543
2544 sub get_local_file($) {
2545   my ($file) = @_;
2546
2547   error ("\$local_dir unset?") unless ($local_dir);
2548
2549   my $id = $local_magic;
2550   error ("$id: $file not in $local_dir?")
2551     unless ($file =~ m@^\Q$local_dir@o);
2552
2553   open (my $in, '<:raw', $file) || error ("$id: $file: $!");
2554   local $/ = undef;  # read entire file
2555   my $body = <$in>;
2556   close ($in) || error ("$id: $file: $!");
2557   return ($id, $body);
2558 }
2559
2560
2561 \f
2562 ############################################################################
2563 #
2564 # Pick a random image in a random way
2565 #
2566 ############################################################################
2567
2568
2569 # Picks a random image on a random page, and returns two URLs:
2570 # the page containing the image, and the image.
2571 # Returns () if nothing found this time.
2572 #
2573
2574 sub pick_image(;$) {
2575   my ($timeout) = @_;
2576
2577   $current_state = "select";
2578   $load_method = "none";
2579
2580   my $n = int(rand(100));
2581   my $fn = undef;
2582   my $total = 0;
2583   my @rest = @search_methods;
2584
2585   while (@rest) {
2586     my $pct  = shift @rest;
2587     my $name = shift @rest;
2588     my $tfn  = shift @rest;
2589     $total += $pct;
2590     if ($total > $n && !defined($fn)) {
2591       $fn = $tfn;
2592       $current_state = $name;
2593       $load_method = $current_state;
2594     }
2595   }
2596
2597   if ($total != 100) {
2598     error ("internal error: \@search_methods totals to $total%!");
2599   }
2600
2601   record_attempt ($current_state);
2602   return $fn->($timeout);
2603 }
2604
2605
2606 \f
2607 ############################################################################
2608 #
2609 # Statistics and logging
2610 #
2611 ############################################################################
2612
2613 sub timestr() {
2614   return strftime ("%H:%M:%S: ", localtime);
2615 }
2616
2617 sub blurb() {
2618   return "$progname: " . timestr() . "$current_state: ";
2619 }
2620
2621 sub error($) {
2622   my ($err) = @_;
2623   print STDERR blurb() . "$err\n";
2624   exit 1;
2625 }
2626
2627 sub stacktrace() {
2628   my $i = 1;
2629   print STDERR "$progname: stack trace:\n";
2630   while (1) {
2631     my ($package, $filename, $line, $subroutine) = caller($i++);
2632     last unless defined($package);
2633     $filename =~ s@^.*/@@;
2634     print STDERR "  $filename#$line, $subroutine\n";
2635   }
2636 }
2637
2638
2639 my $lastlog = "";
2640
2641 sub clearlog() {
2642   $lastlog = "";
2643 }
2644
2645 sub showlog() {
2646   my $head = "$progname: DEBUG: ";
2647   foreach (split (/\n/, $lastlog)) {
2648     print STDERR "$head$_\n";
2649   }
2650   $lastlog = "";
2651 }
2652
2653 sub LOG($$) {
2654   my ($print, $msg) = @_;
2655   my $blurb = timestr() . "$current_state: ";
2656   $lastlog .= "$blurb$msg\n";
2657   print STDERR "$progname: $blurb$msg\n" if $print;
2658 }
2659
2660
2661 my %stats_attempts;
2662 my %stats_successes;
2663 my %stats_elapsed;
2664
2665 my $last_state = undef;
2666 sub record_attempt($) {
2667   my ($name) = @_;
2668
2669   if ($last_state) {
2670     record_failure($last_state) unless ($image_succeeded > 0);
2671   }
2672   $last_state = $name;
2673
2674   clearlog();
2675   report_performance();
2676
2677   start_timer($name);
2678   $image_succeeded = 0;
2679   $suppress_audit = 0;
2680 }
2681
2682 sub record_success($$$) {
2683   my ($name, $url, $base) = @_;
2684   if (defined($stats_successes{$name})) {
2685     $stats_successes{$name}++;
2686   } else {
2687     $stats_successes{$name} = 1;
2688   }
2689
2690   stop_timer ($name, 1);
2691   my $o = $current_state;
2692   $current_state = $name;
2693   save_recent_url ($url, $base);
2694   $current_state = $o;
2695   $image_succeeded = 1;
2696   clearlog();
2697 }
2698
2699
2700 sub record_failure($) {
2701   my ($name) = @_;
2702
2703   return if $image_succeeded;
2704
2705   stop_timer ($name, 0);
2706   if ($verbose_load && !$verbose_exec) {
2707
2708     if ($suppress_audit) {
2709       print STDERR "$progname: " . timestr() . "(audit log suppressed)\n";
2710       return;
2711     }
2712
2713     my $o = $current_state;
2714     $current_state = "DEBUG";
2715
2716     my $line =  "#" x 78;
2717     print STDERR "\n\n\n";
2718     print STDERR ("#" x 78) . "\n";
2719     print STDERR blurb() . "failed to get an image.  Full audit log:\n";
2720     print STDERR "\n";
2721     showlog();
2722     print STDERR ("-" x 78) . "\n";
2723     print STDERR "\n\n";
2724
2725     $current_state = $o;
2726   }
2727   $image_succeeded = 0;
2728 }
2729
2730
2731
2732 sub stats_of($) {
2733   my ($name) = @_;
2734   my $i = $stats_successes{$name};
2735   my $j = $stats_attempts{$name};
2736   $i = 0 unless $i;
2737   $j = 0 unless $j;
2738   return "" . ($j ? int($i * 100 / $j) : "0") . "%";
2739 }
2740
2741
2742 my $current_start_time = 0;
2743
2744 sub start_timer($) {
2745   my ($name) = @_;
2746   $current_start_time = time;
2747
2748   if (defined($stats_attempts{$name})) {
2749     $stats_attempts{$name}++;
2750   } else {
2751     $stats_attempts{$name} = 1;
2752   }
2753   if (!defined($stats_elapsed{$name})) {
2754     $stats_elapsed{$name} = 0;
2755   }
2756 }
2757
2758 sub stop_timer($$) {
2759   my ($name, $success) = @_;
2760   $stats_elapsed{$name} += time - $current_start_time;
2761 }
2762
2763
2764 my $last_report_time = 0;
2765 sub report_performance() {
2766
2767   return unless $verbose_warnings;
2768
2769   my $now = time;
2770   return unless ($now >= $last_report_time + $report_performance_interval);
2771   my $ot = $last_report_time;
2772   $last_report_time = $now;
2773
2774   return if ($ot == 0);
2775
2776   my $blurb = "$progname: " . timestr();
2777
2778   print STDERR "\n";
2779   print STDERR "${blurb}Current standings:\n";
2780
2781   foreach my $name (sort keys (%stats_attempts)) {
2782     my $try = $stats_attempts{$name};
2783     my $suc = $stats_successes{$name} || 0;
2784     my $pct = int($suc * 100 / $try);
2785     my $secs = $stats_elapsed{$name};
2786     my $secs_link = $secs / $try;
2787     print STDERR sprintf ("$blurb %-14s %4s (%d/%d);" .
2788                           "       \t %.1f secs/link\n",
2789                           "$name:", "$pct%", $suc, $try, $secs_link);
2790   }
2791 }
2792
2793
2794
2795 my $max_recent_images = 400;
2796 my $max_recent_sites  = 20;
2797 my @recent_images = ();
2798 my @recent_sites = ();
2799
2800 sub save_recent_url($$) {
2801   my ($url, $base) = @_;
2802
2803   return unless ($verbose_warnings);
2804
2805   $_ = $url;
2806   my ($site) = m@^https?://([^ \t\n\r/:]+)@;
2807   return unless defined ($site);
2808
2809   if ($base eq $driftnet_magic || $base eq $local_magic) {
2810     $site = $base;
2811     @recent_images = ();
2812   }
2813
2814   my $done = 0;
2815   foreach (@recent_images) {
2816     if ($_ eq $url) {
2817       print STDERR blurb() . "WARNING: recently-duplicated image: $url" .
2818         " (on $base via $last_search)\n";
2819       $done = 1;
2820       last;
2821     }
2822   }
2823
2824   # suppress "duplicate site" warning via %warningless_sites.
2825   #
2826   if ($warningless_sites{$site}) {
2827     $done = 1;
2828   } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
2829            $warningless_sites{$1}) {
2830     $done = 1;
2831   } elsif ($site =~ m@([^.]+\.[^.]+)$@ &&
2832            $warningless_sites{$1}) {
2833     $done = 1;
2834   }
2835
2836   if (!$done) {
2837     foreach (@recent_sites) {
2838       if ($_ eq $site) {
2839         print STDERR blurb() . "WARNING: recently-duplicated site: $site" .
2840         " ($url on $base via $last_search)\n";
2841         last;
2842       }
2843     }
2844   }
2845
2846   push @recent_images, $url;
2847   push @recent_sites,  $site;
2848   shift @recent_images if ($#recent_images >= $max_recent_images);
2849   shift @recent_sites  if ($#recent_sites  >= $max_recent_sites);
2850 }
2851
2852
2853 \f
2854 ##############################################################################
2855 #
2856 # other utilities
2857 #
2858 ##############################################################################
2859
2860 # Does %-decoding.
2861 #
2862 sub url_decode($) {
2863   ($_) = @_;
2864   tr/+/ /;
2865   s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
2866   return $_;
2867 }
2868
2869
2870 # Given the raw body of a GIF document, returns the dimensions of the image.
2871 #
2872 sub gif_size($) {
2873   my ($body) = @_;
2874   my $type = substr($body, 0, 6);
2875   my $s;
2876   return () unless ($type =~ /GIF8[7,9]a/);
2877   $s = substr ($body, 6, 10);
2878   my ($a,$b,$c,$d) = unpack ("C"x4, $s);
2879   return () unless defined ($d);
2880   return (($b<<8|$a), ($d<<8|$c));
2881 }
2882
2883 # Given the raw body of a JPEG document, returns the dimensions of the image.
2884 #
2885 sub jpeg_size($) {
2886   my ($body) = @_;
2887   my $i = 0;
2888   my $L = length($body);
2889
2890   my $c1 = substr($body, $i, 1); $i++;
2891   my $c2 = substr($body, $i, 1); $i++;
2892   return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8);
2893
2894   my $ch = "0";
2895   while (ord($ch) != 0xDA && $i < $L) {
2896     # Find next marker, beginning with 0xFF.
2897     while (ord($ch) != 0xFF) {
2898       return () if (length($body) <= $i);
2899       $ch = substr($body, $i, 1); $i++;
2900     }
2901     # markers can be padded with any number of 0xFF.
2902     while (ord($ch) == 0xFF) {
2903       return () if (length($body) <= $i);
2904       $ch = substr($body, $i, 1); $i++;
2905     }
2906
2907     # $ch contains the value of the marker.
2908     my $marker = ord($ch);
2909
2910     if (($marker >= 0xC0) &&
2911         ($marker <= 0xCF) &&
2912         ($marker != 0xC4) &&
2913         ($marker != 0xCC)) {  # it's a SOFn marker
2914       $i += 3;
2915       return () if (length($body) <= $i);
2916       my $s = substr($body, $i, 4); $i += 4;
2917       my ($a,$b,$c,$d) = unpack("C"x4, $s);
2918       return (($c<<8|$d), ($a<<8|$b));
2919
2920     } else {
2921       # We must skip variables, since FFs in variable names aren't
2922       # valid JPEG markers.
2923       return () if (length($body) <= $i);
2924       my $s = substr($body, $i, 2); $i += 2;
2925       my ($c1, $c2) = unpack ("C"x2, $s);
2926       my $length = ($c1 << 8) | $c2;
2927       return () if ($length < 2);
2928       $i += $length-2;
2929     }
2930   }
2931   return ();
2932 }
2933
2934 # Given the raw body of a PNG document, returns the dimensions of the image.
2935 #
2936 sub png_size($) {
2937   my ($body) = @_;
2938   return () unless ($body =~ m/^\211PNG\r/);
2939   my ($bits) = ($body =~ m/^.{12}(.{12})/s);
2940   return () unless defined ($bits);
2941   return () unless ($bits =~ /^IHDR/);
2942   my ($ign, $w, $h) = unpack("a4N2", $bits);
2943   return ($w, $h);
2944 }
2945
2946
2947 # Given the raw body of a PNM document, returns the dimensions of the image.
2948 #
2949 sub pnm_size($) {
2950   my ($body) = @_;
2951   return () unless ($body =~ m/^P[1-6]\r?\n(\d+) +(\d+)\r?\n/s);
2952   return ($1, $2);
2953 }
2954
2955
2956 # Given the raw body of a GIF, JPEG, or PNG document, returns the dimensions
2957 # of the image.
2958 #
2959 sub image_size($) {
2960   my ($body) = @_;
2961   my ($w, $h) = gif_size ($body);
2962   if ($w && $h) { return ($w, $h); }
2963   ($w, $h) = jpeg_size ($body);
2964   if ($w && $h) { return ($w, $h); }
2965   return png_size ($body);
2966   if ($w && $h) { return ($w, $h); }
2967   return pnm_size ($body);
2968 }
2969
2970
2971 # returns the full path of the named program, or undef.
2972 #
2973 sub which($) {
2974   my ($prog) = @_;
2975   foreach (split (/:/, $ENV{PATH})) {
2976     my $path = "$_/$prog";
2977     if (-x $path) {
2978       return $path;
2979     }
2980   }
2981   return undef;
2982 }
2983
2984
2985 # Like rand(), but chooses numbers with a bell curve distribution.
2986 sub bellrand(;$) {
2987   ($_) = @_;
2988   $_ = 1.0 unless defined($_);
2989   $_ /= 3.0;
2990   return (rand($_) + rand($_) + rand($_));
2991 }
2992
2993
2994 sub exit_cleanup() {
2995   x_cleanup();
2996   print STDERR "$progname: exiting\n" if ($verbose_warnings);
2997   if (@pids_to_kill) {
2998     print STDERR blurb() . "killing: " . join(' ', @pids_to_kill) . "\n";
2999     kill ('TERM', @pids_to_kill);
3000   }
3001 }
3002
3003 sub signal_cleanup($) {
3004   my ($sig) = @_;
3005   print STDERR blurb() . (defined($sig)
3006                           ? "caught signal $sig."
3007                           : "exiting.")
3008                        . "\n"
3009     if ($verbose_exec || $verbose_warnings);
3010   exit 1;
3011 }
3012
3013
3014
3015 ##############################################################################
3016 #
3017 # Generating a list of urls only
3018 #
3019 ##############################################################################
3020
3021 sub url_only_output() {
3022   do {
3023     my ($base, $img) = pick_image;
3024     if ($img) {
3025       $base =~ s/ /%20/g;
3026       $img  =~ s/ /%20/g;
3027       print "$img $base\n";
3028     }
3029   } while (1);
3030 }
3031
3032 ##############################################################################
3033 #
3034 # Running as an xscreensaver module, or as a web page imagemap
3035 #
3036 ##############################################################################
3037
3038 my ($image_png, $image_tmp1, $image_tmp2);
3039 {
3040   my $seed = rand(0xFFFFFFFF);
3041   $image_png = sprintf ("%s/webcollage-%08x",
3042                         ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp"),
3043                         $seed);
3044   $image_tmp1 = $image_png . '-1.png';
3045   $image_tmp2 = $image_png . '-2.png';
3046   $image_png .= '.png';
3047 }
3048
3049
3050 my $filter_cmd = undef;
3051 my $post_filter_cmd = undef;
3052 my $background = undef;
3053
3054 my @imagemap_areas = ();
3055 my $imagemap_html_tmp = undef;
3056 my $imagemap_jpg_tmp = undef;
3057
3058
3059 my $img_width;            # size of the image being generated.
3060 my $img_height;
3061
3062 my $delay = 2;
3063
3064 sub x_cleanup() {
3065   unlink $image_png, $image_tmp1, $image_tmp2;
3066   unlink $imagemap_html_tmp, $imagemap_jpg_tmp
3067     if (defined ($imagemap_html_tmp));
3068 }
3069
3070
3071 # Like system, but prints status about exit codes, and kills this process
3072 # with whatever signal killed the sub-process, if any.
3073 #
3074 sub nontrapping_system(@) {
3075   $! = 0;
3076
3077   $_ = join(" ", @_);
3078   s/\"[^\"]+\"/\"...\"/g;
3079
3080   LOG ($verbose_exec, "executing \"$_\"");
3081
3082   my $rc = system @_;
3083
3084   if ($rc == 0) {
3085     LOG ($verbose_exec, "subproc exited normally.");
3086   } elsif (($rc & 0xff) == 0) {
3087     $rc >>= 8;
3088     LOG ($verbose_exec, "subproc exited with status $rc.");
3089   } else {
3090     if ($rc & 0x80) {
3091       LOG ($verbose_exec, "subproc dumped core.");
3092       $rc &= ~0x80;
3093     }
3094     LOG ($verbose_exec, "subproc died with signal $rc.");
3095     # die that way ourselves.
3096     kill $rc, $$;
3097   }
3098
3099   return $rc;
3100 }
3101
3102
3103 # Creates a solid-colored PNG.
3104 #
3105 sub pngmake($$$$) {
3106   my ($outfile, $bgcolor, $w, $h) = @_;
3107
3108   my @cmd;
3109   if ($webcollage_helper) {
3110     @cmd = ($webcollage_helper, $bgcolor, $w, $h, $outfile);
3111   } else {
3112     @cmd = ($convert_cmd, '-size', "${w}x${h}", "xc:$bgcolor", $outfile);
3113   }
3114
3115   my $rc = nontrapping_system (@cmd);
3116   if ($rc != 0) {
3117     LOG(0, "failed to create $bgcolor image: \"$outfile\"");
3118     exit(1);
3119   }
3120 }
3121
3122
3123 sub pick_root_displayer() {
3124   my @names = ();
3125
3126   if ($cocoa_p) {
3127     # see "xscreensaver/hacks/webcollage-cocoa.m"
3128     return "echo COCOA LOAD ";
3129   }
3130
3131   foreach my $cmd (@root_displayers) {
3132     $_ = $cmd;
3133     my ($name) = m/^([^ ]+)/;
3134     push @names, "\"$name\"";
3135     LOG ($verbose_exec, "looking for $name...");
3136     foreach my $dir (split (/:/, $ENV{PATH})) {
3137       LOG ($verbose_exec, "  checking $dir/$name");
3138       return $cmd if (-x "$dir/$name");
3139     }
3140   }
3141
3142   $names[$#names] = "or " . $names[$#names];
3143   error "none of: " . join (", ", @names) . " were found on \$PATH.";
3144 }
3145
3146
3147 my $png_to_root_window_cmd = undef;
3148
3149
3150 sub x_or_image_output($) {
3151   my ($window_id) = @_;
3152
3153   # Adjust the PATH for OS X 10.10.
3154   #
3155   $_ = $0;
3156   s:/[^/]*$::;
3157   s/([^a-zA-Z0-9._\-+\/])/\\$1/g;
3158   $ENV{PATH} = "$_:$ENV{PATH}";
3159
3160   # Check for our helper program, to see whether we need to use imagemagick.
3161   #
3162   $_ = "webcollage-helper";
3163
3164   if (! defined ($webcollage_helper)) {
3165 #    $webcollage_helper = which ($_);
3166   }
3167
3168   if (defined ($webcollage_helper)) {
3169     LOG ($verbose_decode, "found \"$webcollage_helper\"");
3170   } else {
3171     LOG (($verbose_decode || $verbose_load), "no $_ program");
3172   }
3173
3174   if ($cocoa_p && !defined ($webcollage_helper)) {
3175     error ("webcollage-helper not found in Cocoa-mode!");
3176   }
3177
3178   if (!$cocoa_p && defined ($webcollage_helper)) {
3179     foreach ($image_png, $image_tmp1, $image_tmp2) {
3180       s/\.png$/.jpg/si;
3181     }
3182   }
3183
3184   # make sure the various programs we execute exist, right up front.
3185   #
3186   my @progs = ();
3187
3188   push @progs, $convert_cmd unless defined($webcollage_helper);
3189
3190   foreach (@progs) {
3191     which ($_) || error "$_ not found on \$PATH.";
3192   }
3193
3194   # find a root-window displayer program.
3195   #
3196   if (!$no_output_p) {
3197     $png_to_root_window_cmd = pick_root_displayer();
3198   }
3199
3200   if (defined ($window_id)) {
3201     error ("-window-id only works if xscreensaver-getimage is installed")
3202       unless ($png_to_root_window_cmd =~ m/^xscreensaver-getimage\b/);
3203
3204     error ("unparsable window id: $window_id")
3205       unless ($window_id =~ m/^\d+$|^0x[\da-f]+$/i);
3206     $png_to_root_window_cmd =~ s/--?root\b/$window_id/ ||
3207       error ("unable to munge displayer: $png_to_root_window_cmd");
3208   }
3209
3210   if (!$img_width || !$img_height) {
3211
3212     if (!defined ($window_id) &&
3213         defined ($ENV{XSCREENSAVER_WINDOW})) {
3214       $window_id = $ENV{XSCREENSAVER_WINDOW};
3215     }
3216
3217     if (!defined ($window_id)) {
3218       $_ = "xdpyinfo";
3219       which ($_) || error "$_ not found on \$PATH.";
3220       $_ = `$_`;
3221       ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /;
3222       if (!defined($img_height)) {
3223         error "xdpyinfo failed.";
3224       }
3225     } else {  # we have a window id
3226       $_ = "xwininfo";
3227       which ($_) || error "$_ not found on \$PATH.";
3228       $_ .= " -id $window_id";
3229       $_ = `$_`;
3230       ($img_width, $img_height) = m/^\s*Width:\s*(\d+)\n\s*Height:\s*(\d+)\n/m;
3231
3232       if (!defined($img_height)) {
3233         error "xwininfo failed.";
3234       }
3235     }
3236   }
3237
3238   my $bgcolor = "#000000";
3239   my $bgimage = undef;
3240
3241   if ($background) {
3242     if ($background =~ m/^\#[0-9a-f]+$/i) {
3243       $bgcolor = $background;
3244
3245     } elsif (-r $background) {
3246       $bgimage = $background;
3247
3248     } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) {
3249       error "not a color or readable file: $background";
3250
3251     } else {
3252       # default to assuming it's a color
3253       $bgcolor = $background;
3254     }
3255   }
3256
3257   # Create the sold-colored base image.
3258   #
3259   LOG ($verbose_decode, "creating base image: ${img_width}x${img_height}");
3260   $_ = pngmake ($image_png, $bgcolor, $img_width, $img_height);
3261
3262   # Paste the default background image in the middle of it.
3263   #
3264   if ($bgimage) {
3265     open (my $in, '<:raw', $bgimage) || error ("$bgimage: $!");
3266     local $/ = undef;  # read entire file
3267     my $body = <$in>;
3268     close ($in) || error ("$bgimage: $!");
3269     paste_image ('init', $image_png, $body, 'init', 1);
3270   }
3271
3272   clearlog();
3273
3274   while (1) {
3275     my ($base, $img) = pick_image();
3276     my $source = $current_state;
3277     $current_state = "loadimage";
3278     if ($img) {
3279       my ($headers, $body) = get_document ($img, $base);
3280       if ($body) {
3281         paste_image ($base, $img, $body, $source);
3282         $body = undef;
3283       }
3284     }
3285     $current_state = "idle";
3286     $load_method = "none";
3287
3288     unlink $image_tmp1, $image_tmp2;
3289     sleep $delay;
3290   }
3291 }
3292
3293 sub paste_image($$$$;$) {
3294   my ($base, $img, $body, $source, $init_p) = @_;
3295
3296   $current_state = "paste";
3297
3298   $suppress_audit = 0;
3299
3300   LOG ($verbose_decode, "got $img (" . length($body) . ")");
3301
3302   my ($iw, $ih) = image_size ($body);
3303   if (!$iw || !$ih) {
3304     LOG (($verbose_decode || $verbose_load),
3305          "not a GIF, JPG, or PNG" .
3306          (($body =~ m@<(base|html|head|body|script|table|a href)>@i)
3307           ? " (looks like HTML)" : "") .
3308          ": $img");
3309     $suppress_audit = 1;
3310     $body = undef;
3311     return 0;
3312   }
3313
3314   if ($iw <= 0 || $ih <= 0 || $iw > 9999 || $ih > 9999) {
3315     LOG (($verbose_decode || $verbose_load),
3316          "ludicrous image dimensions: $iw x $ih (" . length($body) .
3317          "): $img");
3318     $body = undef;
3319     return 0;
3320   }
3321
3322   open (my $out, '>:raw', $image_tmp1) || error ("writing $image_tmp1: $!");
3323   (print $out $body) || error ("writing $image_tmp1: $!");
3324   close ($out) || error ("writing $image_tmp1: $!");
3325
3326   record_success ($load_method, $img, $base);
3327
3328   my $ow = $iw;  # used only for error messages
3329   my $oh = $ih;
3330
3331   # don't just tack this onto the front of the pipeline -- we want it to
3332   # be able to change the size of the input image.
3333   #
3334   if ($filter_cmd && !$init_p) {
3335     LOG ($verbose_decode, "running $filter_cmd");
3336
3337     # #### Historically, $filter_cmd read and write PPM files.
3338     #      This is doing PNG or JPEG now.  I'll bet nobody uses this.
3339
3340     my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2";
3341     if ($rc != 0) {
3342       LOG(($verbose_decode || $verbose_load),
3343           "failed command: \"$filter_cmd\"");
3344       LOG(($verbose_decode || $verbose_load),
3345           "failed URL: \"$img\" (${ow}x$oh)");
3346       return;
3347     }
3348     rename ($image_tmp2, $image_tmp1);
3349
3350     # re-get the width/height in case the filter resized it.
3351     open (my $imgf, '<:raw', $image_tmp1) || return 0;
3352     my $b = '';
3353     sysread ($imgf, $b, 10240);
3354     close $imgf;
3355     ($iw, $ih) = image_size ($b);
3356     return 0 unless ($iw && $ih);
3357   }
3358
3359   my $target_w = $img_width;   # max rectangle into which the image must fit
3360   my $target_h = $img_height;
3361
3362   my $scale = 1.0;
3363
3364   my $crop_x = 0;     # the sub-rectangle of the image
3365   my $crop_y = 0;     # that we will actually paste.
3366   my $crop_w = $iw;
3367   my $crop_h = $ih;
3368   my $x = 0;
3369   my $y = 0;
3370
3371   if (!$init_p) {
3372
3373     # Usually scale the image to fit on the screen -- but sometimes scale it
3374     # to fit on half or a quarter of the screen.  (We do this by reducing the
3375     # size of the target rectangle.)  Note that the image is not merely scaled
3376     # to fit; we instead cut the image in half repeatedly until it fits in the
3377     # target rectangle -- that gives a wider distribution of sizes.
3378     #
3379     if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } # reduce target rect
3380     if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; }
3381
3382     if ($iw > $target_w || $ih > $target_h) {
3383       while ($iw > $target_w ||
3384              $ih > $target_h) {
3385         $iw = int($iw / 2);
3386         $ih = int($ih / 2);
3387         $scale /= 2;
3388       }
3389       if ($iw <= 10 || $ih <= 10) {
3390         LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih" .
3391              " would have been bogus.");
3392         return 0;
3393       }
3394
3395       $crop_w = $iw;
3396       $crop_h = $ih;
3397
3398       LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih ($scale)");
3399     }
3400
3401
3402     my $src = $image_tmp1;
3403
3404     # The chance that we will randomly crop out a section of an image starts
3405     # out fairly low, but goes up for images that are very large, or images
3406     # that have ratios that make them look like banners (we try to avoid
3407     # banner images entirely, but they slip through when the IMG tags didn't
3408     # have WIDTH and HEIGHT specified.)
3409     #
3410     my $crop_chance = 0.2;
3411     if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) {
3412       $crop_chance += 0.2;
3413     }
3414     if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) {
3415       $crop_chance += 0.2;
3416     }
3417     if ($min_ratio && ($iw * $min_ratio) > $ih) {
3418       $crop_chance += 0.7;
3419     }
3420
3421     if ($crop_chance > 0.1) {
3422       LOG ($verbose_decode, "crop chance: $crop_chance");
3423     }
3424
3425     if (rand() < $crop_chance) {
3426
3427       my $ow = $crop_w;
3428       my $oh = $crop_h;
3429
3430       if ($crop_w > $min_width) {
3431         # if it's a banner, select the width linearly.
3432         # otherwise, select a bell.
3433         my $r = (($min_ratio && ($iw * $min_ratio) > $ih)
3434                  ? rand()
3435                  : bellrand());
3436         $crop_w = $min_width + int ($r * ($crop_w - $min_width));
3437         $crop_x = int (rand() * ($ow - $crop_w));
3438       }
3439       if ($crop_h > $min_height) {
3440         # height always selects as a bell.
3441         $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height));
3442         $crop_y = int (rand() * ($oh - $crop_h));
3443       }
3444
3445       # Clip it to the actual post-scaling image size.
3446       if ($crop_x + $crop_w > $iw) { $crop_w = $iw - $crop_x; }
3447       if ($crop_y + $crop_h > $ih) { $crop_h = $ih - $crop_y; }
3448       if ($crop_x < 0) { $crop_w += $crop_x; $crop_x = 0; }
3449       if ($crop_y < 0) { $crop_h += $crop_y; $crop_y = 0; }
3450
3451       if ($crop_x != 0   || $crop_y != 0 ||
3452           $crop_w != $iw || $crop_h != $ih) {
3453         LOG ($verbose_decode,
3454              "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
3455       }
3456     }
3457
3458     # Where the image should logically land -- this might be negative.
3459     #
3460     $x = int((rand() * ($img_width  + $crop_w/2)) - $crop_w*3/4);
3461     $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4);
3462
3463     # if we have chosen to paste the image outside of the rectangle of the
3464     # screen, then we need to crop it.
3465     #
3466     if ($x < 0 ||
3467         $y < 0 ||
3468         $x + $crop_w > $img_width ||
3469         $y + $crop_h > $img_height) {
3470
3471       LOG ($verbose_decode,
3472            "cropping for effective paste of ${crop_w}x${crop_h} \@ $x,$y");
3473
3474       if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; }
3475       if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; }
3476
3477       if ($x + $crop_w >= $img_width)  { $crop_w = $img_width  - $x - 1; }
3478       if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; }
3479     }
3480
3481     # If any cropping needs to happen, add pnmcut.
3482     #
3483     if ($crop_x != 0   || $crop_y != 0 ||
3484         $crop_w != $iw || $crop_h != $ih) {
3485       $iw = $crop_w;
3486       $ih = $crop_h;
3487       LOG ($verbose_decode, "cropping to ${crop_w}x$crop_h \@ " .
3488            "$crop_x,$crop_y");
3489     }
3490
3491     LOG ($verbose_decode, "pasting ${iw}x$ih \@ $x,$y in $image_png");
3492   }
3493
3494   my @cmd;
3495   if (defined ($webcollage_helper)) {
3496     @cmd = ($webcollage_helper,
3497             $image_tmp1, $image_png,
3498             $scale, $opacity,
3499             $crop_x, $crop_y, $x, $y,
3500             $iw, $ih);
3501   } else {
3502     @cmd = ($convert_cmd,
3503             $image_png,
3504             '(',
3505             $image_tmp1 . '[0]',
3506             '-scale', sprintf("%.2f%%", 100 * $scale),
3507             '-crop', "${iw}x${ih}+${crop_x}+${crop_y}",
3508             '-geometry', "+${x}+${y}",
3509
3510             ($init_p ? () :
3511              (
3512               # Blurry edges with rounded corners
3513               '-alpha', 'set',
3514               '-virtual-pixel', 'transparent',
3515               '-channel', 'A',
3516               '-blur', '0x12',
3517               '-level', '50%,100%',
3518
3519               # Overall transparency
3520               '-evaluate', 'multiply', $opacity,
3521
3522               '+channel',
3523              )),
3524             ')',
3525             '-composite',
3526             '+repage',
3527             '-strip',
3528             $image_png);
3529   }
3530
3531   #### $verbose_decode should mean 2>/dev/null
3532
3533   my $rc = nontrapping_system (@cmd);
3534
3535   if (-z $image_png) {
3536     LOG (1, "failed command: \"@cmd\"");
3537     print STDERR "\naudit log:\n\n\n";
3538     print STDERR ("#" x 78) . "\n";
3539     print STDERR blurb() . "$image_png has zero size\n";
3540     showlog();
3541     print STDERR "\n\n";
3542     exit (1);
3543   }
3544
3545   if ($rc != 0) {
3546     LOG (($verbose_decode || $verbose_load), "failed command: \"@cmd\"");
3547     LOG (($verbose_decode || $verbose_load),
3548          "failed URL: \"$img\" (${ow}x$oh)");
3549     return;
3550   }
3551
3552   my $target = "$image_png";
3553
3554   # don't just tack this onto the end of the pipeline -- we don't want it
3555   # to end up in $image_png, because we don't want the results to be
3556   # cumulative.
3557   #
3558   if ($post_filter_cmd) {
3559
3560     # #### Historically, $post_filter_cmd read and write PPM files.
3561     #      This is doing PNG or JPEG now.  I'll bet nobody uses this.
3562
3563     $target = $image_tmp1;
3564     my $cmd = "($post_filter_cmd) < $image_png > $target";
3565     $rc = nontrapping_system ($cmd);
3566     if ($rc != 0) {
3567       LOG ($verbose_decode, "filter failed: \"$post_filter_cmd\"\n");
3568       return;
3569     }
3570   }
3571
3572   if (!$no_output_p) {
3573     my $tsize = (stat($target))[7];
3574     if ($tsize > 200) {
3575       my $cmd = "$png_to_root_window_cmd $target";
3576
3577       # xv seems to hate being killed.  it tends to forget to clean
3578       # up after itself, and leaves windows around and colors allocated.
3579       # I had this same problem with vidwhacker, and I'm not entirely
3580       # sure what I did to fix it.  But, let's try this: launch xv
3581       # in the background, so that killing this process doesn't kill it.
3582       # it will die of its own accord soon enough.  So this means we
3583       # start pumping bits to the root window in parallel with starting
3584       # the next network retrieval, which is probably a better thing
3585       # to do anyway.
3586       #
3587       $cmd .= " &" unless ($cocoa_p);
3588
3589       $rc = nontrapping_system ($cmd);
3590
3591       if ($rc != 0) {
3592         LOG (($verbose_decode || $verbose_load), "display failed: \"$cmd\"");
3593         return;
3594       }
3595
3596     } else {
3597       LOG ($verbose_decode, "$target size is $tsize");
3598     }
3599   }
3600
3601   if (defined($source)) {
3602     $source .= "-" . stats_of($source);
3603     print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n"
3604       if ($verbose_imgmap);
3605     if ($imagemap_base) {
3606       update_imagemap ($base, $x, $y, $iw, $ih,
3607                        $image_png, $img_width, $img_height);
3608     }
3609   }
3610
3611   clearlog();
3612
3613   return 1;
3614 }
3615
3616
3617 sub update_imagemap($$$$$$$$) {
3618   my ($url, $x, $y, $w, $h, $image_png, $image_width, $image_height) = @_;
3619
3620   $current_state = "imagemap";
3621
3622   my $max_areas = 200;
3623
3624   $url = html_quote ($url);
3625   push @imagemap_areas, [$x, $y, $w, $h, $url];
3626   shift @imagemap_areas if (@imagemap_areas > $max_areas);
3627   LOG ($verbose_decode, "area: $x,$y,$w,$h");
3628
3629   my $map_name = $imagemap_base;
3630   $map_name =~ s@^.*/@@;
3631   $map_name = 'collage' if ($map_name eq '');
3632
3633   my $imagemap_html = $imagemap_base . ".html";
3634   my $imagemap_jpg  = $imagemap_base . ".jpg";
3635   my $imagemap_jpg2 = $imagemap_jpg;
3636   $imagemap_jpg2 =~ s@^.*/@@gs;
3637
3638   if (!defined ($imagemap_html_tmp)) {
3639     $imagemap_html_tmp = $imagemap_html . sprintf (".%08x", rand(0xffffffff));
3640     $imagemap_jpg_tmp  = $imagemap_jpg  . sprintf (".%08x", rand(0xffffffff));
3641   }
3642
3643   # Read the imagemap html file (if any) to get a template.
3644   #
3645   my $template_html = '';
3646   {
3647     if (open (my $in, '<', $imagemap_html)) {
3648       local $/ = undef;  # read entire file
3649       $template_html = <$in>;
3650       close $in;
3651       LOG ($verbose_decode, "read template $imagemap_html");
3652     }
3653
3654     if (! ($template_html =~ m/\.webcollage_box\b/s)) {  # missing or old
3655       $template_html =
3656        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
3657           "http://www.w3.org/TR/html4/loose.dtd">
3658 <HTML>
3659  <HEAD>
3660   <BASE TARGET="_new">
3661   <meta HTTP-EQUIV="Refresh" content="60" />
3662   <TITLE>WebCollage</TITLE>
3663   <STYLE TYPE="text/css">
3664    <!--
3665     body { color: #FFF; max-width: 100%; }
3666     .webcollage_date, .webcollage_size {
3667       display: block; margin-top: 4px; font-size: 7pt; color: #888;
3668     }
3669     .webcollage_date { float: left; }
3670     .webcollage_size { float: right; }
3671
3672     .webcollage_frame {
3673       overflow: hidden;
3674       position: relative;
3675       padding-bottom: 56.25%;
3676       padding-bottom: 75%; /* 4:3 aspect ratio */
3677       border: 1px solid #888;
3678       background: #000;
3679     }
3680
3681     .webcollage_box {
3682        position: absolute; top: 0; left: 0;
3683        border: 0; margin: 0; padding: 0;
3684        width:  100%;
3685        height: 100%;
3686     }
3687
3688     .webcollage_box > img { width: 100%; height: 100%; border: 0; }
3689
3690     .webcollage_frame > a {
3691       display: block;
3692       position: absolute;
3693       border-radius: 16px;
3694     }
3695
3696     .webcollage_frame > a:hover {
3697       background: rgba(1,1,1,.25);
3698     }
3699    -->
3700   </STYLE>
3701  </HEAD>
3702  <BODY>
3703   <DIV CLASS="webcollage_frame">
3704   </DIV>
3705  </BODY>
3706 </HTML>
3707 ';
3708
3709       LOG ($verbose_decode, "created dummy template");
3710     }
3711   }
3712
3713   # Write the jpg to a tmp file
3714   #
3715   {
3716     my @cmd;
3717     if (defined($webcollage_helper)) {
3718       @cmd = ('cp', '-p', $image_png, $imagemap_jpg_tmp);
3719     } else {
3720       @cmd = ($convert_cmd, $image_png, 'jpeg:' . $imagemap_jpg_tmp);
3721     }
3722     my $rc = nontrapping_system (@cmd);
3723     if ($rc != 0) {
3724       error ("imagemap jpeg failed: \"@cmd\"\n");
3725     }
3726   }
3727
3728   # Write the html to a tmp file
3729   #
3730   {
3731     my $body = $template_html;
3732     my $img = ("   <DIV CLASS=\"webcollage_box\">" .
3733                     "<IMG SRC=\"$imagemap_jpg2\">" .
3734                    "</DIV>\n");
3735     foreach my $a (@imagemap_areas) {
3736       my ($x, $y, $w, $h, $u) = @$a;
3737       $x /= $img_width  / 100;
3738       $y /= $img_height / 100;
3739       $w /= $img_width  / 100;
3740       $h /= $img_height / 100;
3741       foreach ($x, $y, $w, $h) { $_ = sprintf("%.1f%%", $_); }
3742       $u = html_quote($u);
3743       $img .= ("   <A HREF=\"$u\" STYLE=\"" .
3744                "left:$x;top:$y;width:$w;height:$h\"/>\n");
3745     }
3746
3747     $img = ("<DIV CLASS=\"webcollage_frame\">\n" .
3748             $img .
3749             "  </DIV>\n");
3750     $body =~ s@<DIV \s+ CLASS=[\"\']webcollage_frame[\"\']>
3751                 .*? </DIV> .*? </DIV>@$img@sex ||
3752       error ("$imagemap_html_tmp: unable to splice image");
3753
3754
3755     # if there are magic webcollage spans in the html, update those too.
3756     #
3757     {
3758       my @st = stat ($imagemap_jpg_tmp);
3759       my $date = strftime("%d-%b-%Y %l:%M:%S %p %Z", localtime($st[9]));
3760       my $size = int(($st[7] / 1024) + 0.5) . "K";
3761       $body =~ s@(<SPAN\s+CLASS=\"webcollage_date\">).*?(</SPAN>)@$1$date$2@si;
3762       $body =~ s@(<SPAN\s+CLASS=\"webcollage_size\">).*?(</SPAN>)@$1$size$2@si;
3763     }
3764
3765     open (my $out, '>', $imagemap_html_tmp) || error("$imagemap_html_tmp: $!");
3766     (print $out $body)                      || error("$imagemap_html_tmp: $!");
3767     close ($out)                            || error("$imagemap_html_tmp: $!");
3768     LOG ($verbose_decode, "wrote $imagemap_html_tmp");
3769   }
3770
3771   # Rename the two tmp files to the real files
3772   #
3773   rename ($imagemap_html_tmp, $imagemap_html) ||
3774     error "renaming $imagemap_html_tmp to $imagemap_html";
3775   LOG ($verbose_decode, "wrote $imagemap_html");
3776
3777   if ($imagemap_jpg_tmp ne $image_png) {
3778     rename ($imagemap_jpg_tmp,  $imagemap_jpg) ||
3779       error "renaming $imagemap_jpg_tmp to $imagemap_jpg";
3780     LOG ($verbose_decode, "wrote $imagemap_jpg");
3781   }
3782 }
3783
3784
3785 # Figure out what the proxy server should be, either from environment
3786 # variables or by parsing the output of the (MacOS) program "scutil",
3787 # which tells us what the system-wide proxy settings are.
3788 #
3789 sub set_proxy() {
3790
3791   if (! defined($http_proxy)) {
3792     # historical suckage: the environment variable name is lower case.
3793     $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
3794   }
3795
3796   if (defined ($http_proxy)) {
3797     if ($http_proxy && $http_proxy =~ m@^https?://([^/]*)/?$@ ) {
3798       # historical suckage: allow "http://host:port" as well as "host:port".
3799       $http_proxy = $1;
3800     }
3801
3802   } else {
3803     my $proxy_data = `scutil --proxy 2>/dev/null`;
3804     my ($server) = ($proxy_data =~ m/\bHTTPProxy\s*:\s*([^\s]+)/s);
3805     my ($port)   = ($proxy_data =~ m/\bHTTPPort\s*:\s*([^\s]+)/s);
3806     # Note: this ignores the "ExceptionsList".
3807     if ($server) {
3808       $http_proxy = $server;
3809       $http_proxy .= ":$port" if $port;
3810     }
3811   }
3812
3813   delete $ENV{http_proxy};
3814   delete $ENV{HTTP_PROXY};
3815   delete $ENV{https_proxy};
3816   delete $ENV{HTTPS_PROXY};
3817   delete $ENV{PERL_LWP_ENV_PROXY};
3818
3819   if ($http_proxy) {
3820     $http_proxy = 'http://' . $http_proxy;
3821     LOG ($verbose_net, "proxy server: $http_proxy");
3822   } else {
3823     $http_proxy = undef;  # for --proxy ''
3824   }
3825 }
3826
3827
3828 sub init_signals() {
3829
3830   $SIG{HUP}  = \&signal_cleanup;
3831   $SIG{INT}  = \&signal_cleanup;
3832   $SIG{QUIT} = \&signal_cleanup;
3833   $SIG{ABRT} = \&signal_cleanup;
3834   $SIG{KILL} = \&signal_cleanup;
3835   $SIG{TERM} = \&signal_cleanup;
3836
3837   # Need this so that if subprocess filters die, we don't die.
3838   $SIG{PIPE} = 'IGNORE';
3839 }
3840
3841 END { exit_cleanup(); }
3842
3843
3844 sub main() {
3845   $| = 1;
3846   srand(time ^ $$);
3847
3848   my $verbose = 0;
3849   my $dict;
3850   my $driftnet_cmd = 0;
3851
3852   $current_state = "init";
3853   $load_method = "none";
3854
3855   my $root_p = 0;
3856   my $window_id = undef;
3857
3858   while ($#ARGV >= 0) {
3859     $_ = shift @ARGV;
3860     if (m/^--?d(i(s(p(l(a(y)?)?)?)?)?)?$/s) {
3861       $ENV{DISPLAY} = shift @ARGV;
3862     } elsif (m/^--?root$/s) {
3863       $root_p = 1;
3864     } elsif (m/^--?window-id$/s) {
3865       $window_id = shift @ARGV;
3866       $root_p = 1;
3867     } elsif (m/^--?no-output$/s) {
3868       $no_output_p = 1;
3869     } elsif (m/^--?urls(-only)?$/s) {
3870       $urls_only_p = 1;
3871       $no_output_p = 1;
3872     } elsif (m/^--?cocoa$/s) {
3873       $cocoa_p = 1;
3874     } elsif (m/^--?imagemap$/s) {
3875       $imagemap_base = shift @ARGV;
3876       $no_output_p = 1;
3877     } elsif (m/^--?verbose$/s) {
3878       $verbose++;
3879     } elsif (m/^-v+$/) {
3880       $verbose += length($_)-1;
3881     } elsif (m/^--?delay$/s) {
3882       $delay = shift @ARGV;
3883     } elsif (m/^--?timeout$/s) {
3884       $http_timeout = shift @ARGV;
3885     } elsif (m/^--?filter$/s) {
3886       $filter_cmd = shift @ARGV;
3887     } elsif (m/^--?filter2$/s) {
3888       $post_filter_cmd = shift @ARGV;
3889     } elsif (m/^--?(background|bg)$/s) {
3890       $background = shift @ARGV;
3891     } elsif (m/^--?size$/s) {
3892       $_ = shift @ARGV;
3893       if (m@^(\d+)x(\d+)$@) {
3894         $img_width = $1;
3895         $img_height = $2;
3896       } else {
3897         error "argument to \"--size\" must be of the form \"640x400\"";
3898       }
3899     } elsif (m/^--?(http-)?proxy$/s) {
3900       $http_proxy = shift @ARGV;
3901     } elsif (m/^--?dict(ionary)?$/s) {
3902       $dict = shift @ARGV;
3903     } elsif (m/^--?opacity$/s) {
3904       $opacity = shift @ARGV;
3905       error ("opacity must be between 0.0 and 1.0")
3906         if ($opacity <= 0 || $opacity > 1);
3907     } elsif (m/^--?driftnet$/s) {
3908       @search_methods = ( 100, "driftnet", \&pick_from_driftnet );
3909       if (! ($ARGV[0] =~ m/^-/)) {
3910         $driftnet_cmd = shift @ARGV;
3911       } else {
3912         $driftnet_cmd = $default_driftnet_cmd;
3913       }
3914     } elsif (m/^--?dir(ectory)?$/s) {
3915       @search_methods = ( 100, "local", \&pick_from_local_dir );
3916       if (! ($ARGV[0] =~ m/^-/)) {
3917         $local_dir = shift @ARGV;
3918       } else {
3919         error ("local directory path must be set")
3920       }
3921     } elsif (m/^--?fps$/s) {
3922       # -fps only works on MacOS, via "webcollage-cocoa.m".
3923       # Ignore it if passed to this script in an X11 context.
3924     } elsif (m/^--?debug$/s) {
3925       my $which = shift @ARGV;
3926       my @rest = @search_methods;
3927       my $ok = 0;
3928       while (@rest) {
3929         my $pct  = shift @rest;
3930         my $name = shift @rest;
3931         my $tfn  = shift @rest;
3932
3933         if ($name eq $which) {
3934           @search_methods = (100, $name, $tfn);
3935           $ok = 1;
3936           last;
3937         }
3938       }
3939       error "no such search method as \"$which\"" unless ($ok);
3940       LOG (1, "DEBUG: using only \"$which\"");
3941       $report_performance_interval = 30;
3942
3943     } else {
3944       print STDERR "unknown option: $_\n\n";
3945       print STDERR "$copyright\nusage: $progname " .
3946               "[--root] [--display dpy] [--verbose] [--debug which]\n" .
3947         "\t\t  [--timeout secs] [--delay secs] [--size WxH]\n" .
3948         "\t\t  [--no-output] [--urls-only] [--imagemap filename]\n" .
3949         "\t\t  [--background color] [--opacity f]\n" .
3950         "\t\t  [--filter cmd] [--filter2 cmd]\n" .
3951         "\t\t  [--dictionary dictionary-file] [--http-proxy host[:port]]\n" .
3952         "\t\t  [--driftnet [driftnet-program-and-args]]\n" .
3953         "\t\t  [--directory local-image-directory]\n" .
3954         "\n";
3955       exit 1;
3956     }
3957   }
3958
3959   if (!$root_p && !$no_output_p && !$cocoa_p) {
3960     print STDERR $copyright;
3961     error "the --root argument is mandatory (for now.)";
3962   }
3963
3964   if (!$no_output_p && !$cocoa_p && !$ENV{DISPLAY}) {
3965     error "\$DISPLAY is not set.";
3966   }
3967
3968
3969   if ($verbose == 1) {
3970     $verbose_imgmap   = 1;
3971     $verbose_warnings = 1;
3972
3973   } elsif ($verbose == 2) {
3974     $verbose_imgmap   = 1;
3975     $verbose_warnings = 1;
3976     $verbose_load     = 1;
3977
3978   } elsif ($verbose == 3) {
3979     $verbose_imgmap   = 1;
3980     $verbose_warnings = 1;
3981     $verbose_load     = 1;
3982     $verbose_filter   = 1;
3983
3984   } elsif ($verbose == 4) {
3985     $verbose_imgmap   = 1;
3986     $verbose_warnings = 1;
3987     $verbose_load     = 1;
3988     $verbose_filter   = 1;
3989     $verbose_net      = 1;
3990
3991   } elsif ($verbose == 5) {
3992     $verbose_imgmap   = 1;
3993     $verbose_warnings = 1;
3994     $verbose_load     = 1;
3995     $verbose_filter   = 1;
3996     $verbose_net      = 1;
3997     $verbose_decode   = 1;
3998
3999   } elsif ($verbose == 6) {
4000     $verbose_imgmap   = 1;
4001     $verbose_warnings = 1;
4002     $verbose_load     = 1;
4003     $verbose_filter   = 1;
4004     $verbose_net      = 1;
4005     $verbose_decode   = 1;
4006     $verbose_http     = 1;
4007
4008   } elsif ($verbose >= 7) {
4009     $verbose_imgmap   = 1;
4010     $verbose_warnings = 1;
4011     $verbose_load     = 1;
4012     $verbose_filter   = 1;
4013     $verbose_net      = 1;
4014     $verbose_decode   = 1;
4015     $verbose_http     = 1;
4016     $verbose_exec     = 1;
4017   }
4018
4019   if ($dict) {
4020     error ("$dict does not exist") unless (-f $dict);
4021     $wordlist = $dict;
4022   } else {
4023     pick_dictionary();
4024   }
4025
4026   if ($imagemap_base && !($img_width && $img_height)) {
4027     error ("--size WxH is required with --imagemap");
4028   }
4029
4030   if (defined ($local_dir)) {
4031     $_ = "xscreensaver-getimage-file";
4032     which ($_) || error "$_ not found on \$PATH.";
4033   }
4034
4035   init_signals();
4036   set_proxy();
4037
4038   spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd);
4039
4040   if ($urls_only_p) {
4041     url_only_output ();
4042   } else {
4043     x_or_image_output ($window_id);
4044   }
4045 }
4046
4047 main();
4048 exit (0);