+ # log in as "cipherpunk"
+ "www.nytimes.com" => 'NYT-S=18cHMIlJOn2Y1bu5xvEG3Ufuk6E1oJ.' .
+ 'FMxWaQV0igaB5Yi/Q/guDnLeoL.pe7i1oakSb' .
+ '/VqfdUdb2Uo27Vzt1jmPn3cpYRlTw9',
+
+ "ircimages.com" => 'disclaimer=1',
+);
+
+
+# Path of the optional image-pasting helper program; it stays undefined
+# when there is none.  Pasting through the helper is a lot faster and more
+# efficient than the PPM pipelines we fall back on without it.  (At startup
+# we look for "webcollage-helper" on $PATH and set this accordingly.)
+#
+my $webcollage_helper;
+
+
+# Opacity used when the webcollage-helper program pastes images together
+# with transparency: 0.0 is invisible, 1.0 is totally opaque.
+#
+my $opacity = 0.85;
+
+
+# Sites that have managed to poison the search engines.  Most poisoners
+# are auto-detected (through their excessive keywords or dictionary
+# words); the ones hard-coded here are those that slip through that
+# detection anyway.
+#
+# Keys may be full host names, or 2 or 3 component domains.
+#
+my %poisoners = (
+  'die.net'                          => 1,  # 'l33t h4ck3r d00dz.
+  'genforum.genealogy.com'           => 1,  # Clutters avtext with human names.
+  'rootsweb.com'                     => 1,  # Clutters avtext with human names.
+  'akamai.net'                       => 1,  # Many sites keep images on Akamai,
+  'akamaitech.net'                   => 1,  # but those are nearly all banners.
+                                            # Akamai being super-expensive, we
+                                            # go out on a limb and assume all
+                                            # its customers are rich-and-boring.
+  'bartleby.com'                     => 1,  # Dictionary; clutters avtext.
+  'encyclopedia.com'                 => 1,  # Dictionary; clutters avtext.
+  'onlinedictionary.datasegment.com' => 1,  # Dictionary; clutters avtext.
+  'hotlinkpics.com'                  => 1,  # Porn site that poisoned avimages
+                                            # (no idea how they did it, though!)
+  'alwayshotels.com'                 => 1,  # Poisoned Lycos pretty heavily.
+  'nextag.com'                       => 1,  # Poisoned Alta Vista real good.
+  'ghettodriveby.com'                => 1,  # Poisoned Google Images.
+  'crosswordsolver.org'              => 1,  # Poisoned Google Images.
+  'xona.com'                         => 1,  # Poisoned Google Images.
+  'freepatentsonline.com'            => 1,  # Poisoned Google Images.
+  'herbdatanz.com'                   => 1,  # Poisoned Google Images.
+);
+
+
+# With verbosity turned on, we warn about any site we seem to hit a lot,
+# since that usually means a new poisoner has crept into the search
+# engines.  Sometimes, though, a site simply has a lot of stuff on it;
+# the sites listed here are the ones immune to that "frequent site"
+# diagnostic message.
+#
+my %warningless_sites = (
+  'home.earthlink.net'   => 1,
+  'www.angelfire.com'    => 1,
+  'members.aol.com'      => 1,
+  'img.photobucket.com'  => 1,
+  'pics.livejournal.com' => 1,
+  'tinypic.com'          => 1,
+  'flickr.com'           => 1,
+  'staticflickr.com'     => 1,
+  'pbase.com'            => 1,
+  'blogger.com'          => 1,
+  'multiply.com'         => 1,
+  'wikimedia.org'        => 1,
+  'twitpic.com'          => 1,
+  'amazonaws.com'        => 1,
+  'blogspot.com'         => 1,
+  'photoshelter.com'     => 1,
+  'myspacecdn.com'       => 1,
+  'feedburner.com'       => 1,
+  'wikia.com'            => 1,
+  'ljplus.ru'            => 1,
+  'yandex.ru'            => 1,
+  'imgur.com'            => 1,
+  'yfrog.com'            => 1,
+  'cdninstagram.com'     => 1,
+
+  'yimg.com'             => 1,  # dailynews.yahoo.com stores its images
+  'eimg.com'             => 1,  # here, so pick_from_yahoo_news_text()
+                                # hits this every time.
+
+  'images.quizfarm.com'  => 1,  # damn those LJ quizzes...
+  'images.quizilla.com'  => 1,
+  'images.quizdiva.net'  => 1,
+
+  'driftnet'             => 1,  # builtin...
+  'local-directory'      => 1,  # builtin...
+);
+
+
+# Table for decoding HTML-encoded character entities to URLs: each bare
+# entity name maps to a Latin1 character or a plain-text approximation.
+my %entity_table = (
+   "apos"   => '\'',
+   "quot"   => '"',    "amp"    => '&',    "lt"     => '<',
+   "gt"     => '>',    "nbsp"   => ' ',    "iexcl"  => "\xA1",
+   "cent"   => "\xA2", "pound"  => "\xA3", "curren" => "\xA4",
+   "yen"    => "\xA5", "brvbar" => "\xA6", "sect"   => "\xA7",
+   "uml"    => "\xA8", "copy"   => "\xA9", "ordf"   => "\xAA",
+   "laquo"  => "\xAB", "not"    => "\xAC", "shy"    => "\xAD",
+   "reg"    => "\xAE", "macr"   => "\xAF", "deg"    => "\xB0",
+   "plusmn" => "\xB1", "sup2"   => "\xB2", "sup3"   => "\xB3",
+   "acute"  => "\xB4", "micro"  => "\xB5", "para"   => "\xB6",
+   "middot" => "\xB7", "cedil"  => "\xB8", "sup1"   => "\xB9",
+   "ordm"   => "\xBA", "raquo"  => "\xBB", "frac14" => "\xBC",
+   "frac12" => "\xBD", "frac34" => "\xBE", "iquest" => "\xBF",
+   "Agrave" => "\xC0", "Aacute" => "\xC1", "Acirc"  => "\xC2",
+   "Atilde" => "\xC3", "Auml"   => "\xC4", "Aring"  => "\xC5",
+   "AElig"  => "\xC6", "Ccedil" => "\xC7", "Egrave" => "\xC8",
+   "Eacute" => "\xC9", "Ecirc"  => "\xCA", "Euml"   => "\xCB",
+   "Igrave" => "\xCC", "Iacute" => "\xCD", "Icirc"  => "\xCE",
+   "Iuml"   => "\xCF", "ETH"    => "\xD0", "Ntilde" => "\xD1",
+   "Ograve" => "\xD2", "Oacute" => "\xD3", "Ocirc"  => "\xD4",
+   "Otilde" => "\xD5", "Ouml"   => "\xD6", "times"  => "\xD7",
+   "Oslash" => "\xD8", "Ugrave" => "\xD9", "Uacute" => "\xDA",
+   "Ucirc"  => "\xDB", "Uuml"   => "\xDC", "Yacute" => "\xDD",
+   "THORN"  => "\xDE", "szlig"  => "\xDF", "agrave" => "\xE0",
+   "aacute" => "\xE1", "acirc"  => "\xE2", "atilde" => "\xE3",
+   "auml"   => "\xE4", "aring"  => "\xE5", "aelig"  => "\xE6",
+   "ccedil" => "\xE7", "egrave" => "\xE8", "eacute" => "\xE9",
+   "ecirc"  => "\xEA", "euml"   => "\xEB", "igrave" => "\xEC",
+   "iacute" => "\xED", "icirc"  => "\xEE", "iuml"   => "\xEF",
+   "eth"    => "\xF0", "ntilde" => "\xF1", "ograve" => "\xF2",
+   "oacute" => "\xF3", "ocirc"  => "\xF4", "otilde" => "\xF5",
+   "ouml"   => "\xF6", "divide" => "\xF7", "oslash" => "\xF8",
+   "ugrave" => "\xF9", "uacute" => "\xFA", "ucirc"  => "\xFB",
+   "uuml"   => "\xFC", "yacute" => "\xFD", "thorn"  => "\xFE",
+   "yuml"   => "\xFF",
+
+   # HTML 4 entities with no 1:1 Latin1 mapping; these get approximations.
+   "bull"  => "*",    "hellip"=> "...",  "prime" => "'",  "Prime" => "\"",
+   "frasl" => "/",    "trade" => "[tm]", "larr"  => "<-", "rarr"  => "->",
+   "harr"  => "<->",  "lArr"  => "<=",   "rArr"  => "=>", "hArr"  => "<=>",
+   "empty" => "\xD8", "minus" => "-",    "lowast"=> "*",  "sim"   => "~",
+   "cong"  => "=~",   "asymp" => "~",    "ne"    => "!=", "equiv" => "==",
+   "le"    => "<=",   "ge"    => ">=",   "lang"  => "<",  "rang"  => ">",
+   "loz"   => "<>",   "OElig" => "OE",   "oelig" => "oe", "Yuml"  => "Y",
+   "circ"  => "^",    "tilde" => "~",    "ensp"  => " ",  "emsp"  => " ",
+   "thinsp"=> " ",    "ndash" => "-",    "mdash" => "--", "lsquo" => "`",
+   "rsquo" => "'",    "sbquo" => "'",    "ldquo" => "\"", "rdquo" => "\"",
+   "bdquo" => "\"",   "lsaquo"=> "<",    "rsaquo"=> ">",
+);
+
+
+##############################################################################
+#
+# Various global flags set by command line parameters, or computed
+#
+##############################################################################
+
+
+# Globals; every verbose_* switch below starts out disabled (0).
+my ($load_method, $last_search);   # both begin undefined
+my $current_state   = '???';       # for diagnostics
+my $image_succeeded = -1;
+my $suppress_audit  = 0;
+
+my $verbose_imgmap   = 0;  # emit only rectangles and URLs (on stdout)
+my $verbose_warnings = 0;  # emit warnings when things go wrong
+my $verbose_load     = 0;  # diagnostics: loading of URLs
+my $verbose_filter   = 0;  # diagnostics: page selection/rejection
+my $verbose_net      = 0;  # diagnostics: network I/O
+my $verbose_pbm      = 0;  # diagnostics: PBM pipelines
+my $verbose_http     = 0;  # diagnostics: all HTTP activity
+my $verbose_exec     = 0;  # diagnostics: executing programs