+# Extracting image URLs from HTML
+#
+############################################################################
+
+# given a URL and the body text at that URL, selects and returns a random
+# image from it. returns () if no suitable images found.
+#
# given a URL and the body text at that URL, selects and returns a random
# image from it.  returns () if no suitable images found.
#
# Side effects: records hopeless page URLs in %rejected_urls; uses $_
# freely as scratch (URL, then body, then each tag, then each link).
#
sub pick_image_from_body {
  my ( $url, $body ) = @_;

  my $base = $url;
  $_ = $url;

  # if there's at least one slash after the host, take off the last
  # pathname component
  if ( m@^http://[^/]+/@io ) {
    $base =~ s@[^/]+$@@go;
  }

  # if there are no slashes after the host at all, put one on the end.
  if ( m@^http://[^/]+$@io ) {
    $base .= "/";
  }

  $_ = $body;

  # strip out newlines, compress whitespace
  s/[\r\n\t ]+/ /go;

  # nuke comments
  s/<!--.*?-->//go;


  # There are certain web sites that list huge numbers of dictionary
  # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
  # Porn sites tend not to be reputable!)
  #
  # I do not want webcollage to filter on content: I want it to select
  # randomly from the set of images on the web.  All the logic here for
  # rejecting some images is really a set of heuristics for rejecting
  # images that are not really images: for rejecting *text* that is in
  # GIF/JPEG/PNG form.  I don't want text, I want pictures, and I want
  # the content of the pictures to be randomly selected from among all
  # the available content.
  #
  # So, filtering out "dirty" pictures by looking for "dirty" keywords
  # would be wrong: dirty pictures exist, like it or not, so webcollage
  # should be able to select them.
  #
  # However, picking a random URL is a hard thing to do.  The mechanism I'm
  # using is to search for a selection of random words.  This is not
  # perfect, but works ok most of the time.  The way it breaks down is when
  # some URLs get precedence because their pages list *every word* as
  # related -- those URLs come up more often than others.
  #
  # So, after we've retrieved a URL, if it has too many keywords, reject
  # it.  We reject it not on the basis of what those keywords are, but on
  # the basis that by having so many, the page has gotten an unfair
  # advantage against our randomizer.
  #
  my $trip_count = 0;
  foreach my $trip (@tripwire_words) {
    $trip_count++ if m/$trip/i;
  }

  # NOTE(review): the threshold is relative to the list size -- this
  # trips when all but at least two of the tripwire words matched.
  if ($trip_count >= $#tripwire_words - 2) {
    LOG (($verbose_filter || $verbose_load),
         "there is probably a dictionary in \"$url\": rejecting.");
    $rejected_urls{$url} = -1;
    $body = undef;
    $_ = undef;
    return ();
  }


  my @urls;
  my %unique_urls;

  # Split the page at tag openers and inspect each tag in turn.
  foreach (split(/ *</)) {
    if ( m/^meta /i ) {

      # Likewise, reject any web pages that have a KEYWORDS meta tag
      # that is too long.
      #
      if (m/name ?= ?\"?keywords\"?/i &&
          m/content ?= ?\"([^\"]+)\"/) {
        my $L = length($1);
        if ($L > 1000) {
          LOG (($verbose_filter || $verbose_load),
               "excessive keywords ($L bytes) in $url: rejecting.");
          $rejected_urls{$url} = $L;
          $body = undef;
          $_ = undef;
          return ();
        } else {
          LOG ($verbose_filter, " keywords ($L bytes) in $url (ok)");
        }
      }

    } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) {

      # <IMG SRC=...> is an inlined image; <A HREF=...> is a link to one.
      my $was_inline = (! ( "$1" eq "a" || "$1" eq "A" ));
      my $link = $3;
      my ( $width ) = m/width ?=[ \"]*(\d+)/oi;
      my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
      $_ = $link;

      # Resolve the link against $base: host-relative first, then
      # document-relative (folding "." and ".." components).
      if ( m@^/@o ) {
        my $site;
        ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio;
        $_ = "$site$link";
      } elsif ( ! m@^[^/:?]+:@ ) {
        $_ = "$base$link";
        s@/\./@/@g;
        1 while (s@/[^/]+/\.\./@/@g);
      }

      # skip non-http
      if ( ! m@^http://@io ) {
        next;
      }

      # skip non-image
      if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@io ) {
        next;
      }

      # skip really short or really narrow images
      if ( $width && $width < $min_width) {
        if (!$height) { $height = "?"; }
        LOG ($verbose_filter, " skip narrow image $_ (${width}x$height)");
        next;
      }

      if ( $height && $height < $min_height) {
        if (!$width) { $width = "?"; }
        LOG ($verbose_filter, " skip short image $_ (${width}x$height)");
        next;
      }

      # skip images with ratios that make them look like banners.
      # NOTE(review): $height is always true inside this branch, so the
      # "?" fallback below is dead code (harmless).
      if ($min_ratio && $width && $height &&
          ($width * $min_ratio ) > $height) {
        if (!$height) { $height = "?"; }
        LOG ($verbose_filter, " skip bad ratio $_ (${width}x$height)");
        next;
      }

      # skip GIFs with a small number of pixels -- those usually suck.
      if ($width && $height &&
          m/\.gif$/io &&
          ($width * $height) < $min_gif_area) {
        LOG ($verbose_filter, " skip small GIF $_ (${width}x$height)");
        next;
      }

      # skip images with a URL that indicates a Yahoo thumbnail.
      if (m@\.yimg\.com/.*/t/@) {
        if (!$width) { $width = "?"; }
        if (!$height) { $height = "?"; }
        LOG ($verbose_filter, " skip yahoo thumb $_ (${width}x$height)");
        next;
      }

      my $url = $_;

      if ($unique_urls{$url}) {
        LOG ($verbose_filter, " skip duplicate image $_");
        next;
      }

      LOG ($verbose_filter,
           " image $url" .
           ($width && $height ? " (${width}x${height})" : "") .
           ($was_inline ? " (inline)" : ""));

      # "$urls[++$#urls] = ..." is push; entries pushed more than once
      # below weight the random selection toward preferred images.
      $urls[++$#urls] = $url;
      $unique_urls{$url}++;

      # JPEGs are preferable to GIFs and PNGs.
      $_ = $url;
      if ( ! m@[.](gif|png)$@io ) {
        $urls[++$#urls] = $url;
      }

      # pointers to images are preferable to inlined images.
      if ( ! $was_inline ) {
        $urls[++$#urls] = $url;
        $urls[++$#urls] = $url;
      }
    }
  }

  # remember whether the page was a frameset, for the log message below.
  my $fsp = ($body =~ m@<frameset@i);

  $_ = undef;
  $body = undef;

  @urls = depoison (@urls);

  if ( $#urls < 0 ) {
    LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
    return ();
  }

  # pick a random element of the table
  my $i = int(rand($#urls+1));
  $url = $urls[$i];

  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");

  return $url;
}
+
+
+\f
+############################################################################
+#
+# Subroutines for getting pages and images out of search engines
+#
+############################################################################
+
+
# Locate a system word list and remember its path in the global
# $wordlist.  Signals a fatal error when none of the well-known
# locations exist.
#
sub pick_dictionary {
  my @dicts = ("/usr/dict/words",
               "/usr/share/dict/words",
               "/usr/share/lib/dict/words");
  # Take the first candidate that exists as a plain file.
  my ($found) = grep { -f $_ } @dicts;
  $wordlist = $found if defined ($found);
  error ("$dicts[0] does not exist") unless defined ($wordlist);
}
+
# returns a random word from the dictionary
#
# Reads the file named by the global $wordlist; returns undef when the
# file cannot be opened.  Signals a fatal error if it cannot settle on
# a word within 100 attempts.
#
sub random_word {

  local *IN;
  if (! open (IN, "<$wordlist")) {
    return undef;
  }

  # Pick a random byte offset and take the next complete line.  This is
  # cheap, though it slightly favors words that follow long words.
  my $size = (stat(IN))[7];
  my $word = undef;
  my $count = 0;

  while (1) {
    error ("looping ($count) while reading $wordlist")
      if (++$count > 100);

    my $pos = int (rand ($size));
    if (seek (IN, $pos, 0)) {
      $word = <IN>; # toss partial line
      $word = <IN>; # keep next line
    }

    next unless ($word);
    next if ($word =~ m/^[-\']/);

    # Normalize: lower-case, strip non-letters and possessives, and
    # crudely remove plural/suffix forms so search hits are broader.
    # (Order matters: e.g. "ally" is tried both before and after "ly".)
    $word = lc($word);
    $word =~ s/^.*-//s;
    $word =~ s/^[^a-z]+//s;
    $word =~ s/[^a-z]+$//s;
    $word =~ s/\'s$//s;
    $word =~ s/ys$/y/s;
    $word =~ s/ally$//s;
    $word =~ s/ly$//s;
    $word =~ s/ies$/y/s;
    $word =~ s/ally$/al/s;
    $word =~ s/izes$/ize/s;
    $word =~ s/esses$/ess/s;
    $word =~ s/(.{5})ing$/$1/s;

    next if (length ($word) > 14);
    last if ($word);
  }

  close (IN);

  if ( $word =~ s/\s/\+/gs ) { # convert intra-word spaces to "+".
    $word = "\%22$word\%22"; # And put quotes (%22) around it.
  }

  return $word;
}
+
+
# Returns five random dictionary words joined by URL-encoded spaces,
# or by "OR" separators when $or_p is true -- ready for pasting into
# a search-engine query string.
#
sub random_words {
  my ($or_p) = @_;
  my $sep = ($or_p ? "%20OR%20" : "%20");
  return join ($sep, map { random_word() } (1 .. 5));
}
+
+
# %XX-encode every character of $s that is not URL-safe
# (letters, digits, and "-._@/" plus CR/LF pass through untouched).
#
sub url_quote {
  my ($s) = @_;
  $s = join ('',
             map { m/[-a-zA-Z0-9.\@\/_\r\n]/
                     ? $_
                     : sprintf ("%%%02X", ord ($_)) }
             split (//, $s));
  return $s;
}
+
# Decode a URL-encoded string: "+" becomes a space, and %XX hex
# escapes (either case) become the corresponding character.
#
sub url_unquote {
  my ($s) = @_;
  $s =~ tr/+/ /;                               # '+' means space
  $s =~ s/%([a-f0-9]{2})/chr (hex ($1))/gie;   # %XX hex escapes
  return $s;
}
+
# Escape the four HTML-special characters so $s can be embedded in
# HTML markup.  The substitutions as previously written were no-ops
# (the entity names had been lost to entity-decoding of the source);
# restored here.  "&" must be escaped first so the ampersands that
# the later substitutions introduce are not re-escaped.
#
sub html_quote {
  my ($s) = @_;
  $s =~ s/&/&amp;/gi;
  $s =~ s/</&lt;/gi;
  $s =~ s/>/&gt;/gi;
  $s =~ s/\"/&quot;/gi;
  return $s;
}
+
# Reverse of html_quote: decode the four common HTML entities.
# The substitutions as previously written were garbled no-ops (entity
# names lost to entity-decoding of the source); restored here.
# "&amp;" is decoded last so that e.g. "&amp;lt;" does not collapse
# all the way down to "<".
#
sub html_unquote {
  my ($s) = @_;
  $s =~ s/&lt;/</gi; # far from exhaustive...
  $s =~ s/&gt;/>/gi;
  $s =~ s/&quot;/\"/gi;
  $s =~ s/&amp;/&/gi;
  return $s;
}
+
+
# Loads the given URL (a search on some search engine) and returns:
# - the total number of hits the search engine claimed it had;
# - a list of URLs from the page that the search engine returned;
# Note that this list contains all kinds of internal search engine
# junk URLs too -- caller must prune them.
#
sub pick_from_search_engine {
  my ( $timeout, $search_url, $words ) = @_;

  $_ = $words;
  s/%20/ /g;

  print STDERR "\n\n" if ($verbose_load);

  LOG ($verbose_load, "words: $_");
  LOG ($verbose_load, "URL: $search_url");

  $last_search = $search_url; # for warnings

  # Charge the time spent downloading against the caller's timeout.
  my $start = time;
  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
  if (defined ($timeout)) {
    $timeout -= (time - $start);
    if ($timeout <= 0) {
      $body = undef;
      LOG (($verbose_net || $verbose_load),
           "timed out (late) for $search_url");
      $suppress_audit = 1;
      return ();
    }
  }

  return () if (! $body);


  my @subpages;

  # Scrape the claimed hit count out of the page; each branch below
  # matches one search engine's result-page phrasing.
  my $search_count = "?";
  if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
    $search_count = $3;
  } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*) @i) {
    $search_count = $1;
  } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
    $search_count = $1;
  } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
    $search_count = $1;
  } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
    $search_count = $1;
  } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
    $search_count = $1;
  } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
    $search_count = $1;
  } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
    $search_count = $1;
  } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
    $search_count = $1; # lycos
  } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
    $search_count = $1; # hotbot
  } elsif ($body =~ m@no photos were found containing@i) { # avimages
    $search_count = "0";
  } elsif ($body =~ m@found no document matching@i) { # avtext
    $search_count = "0";
  }
  # Insert thousands-separator commas into a bare digit string.
  1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);

# if ($search_count eq "?" || $search_count eq "0") {
# local *OUT;
# my $file = "/tmp/wc.html";
# open(OUT, ">$file") || error ("writing $file: $!");
# print OUT $body;
# close OUT;
# print STDERR blurb() . "###### wrote $file\n";
# }


  my $length = length($body);
  my $href_count = 0;

  $_ = $body;

  s/[\r\n\t ]+/ /g;

  # Put each <A ...> tag at the start of its own line, then pull the
  # HREF out of each.
  s/(<A )/\n$1/gi;
  foreach (split(/\n/)) {
    $href_count++;
    my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
    next unless $u;

    if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string
    elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token

    # Skip URLs already rejected on an earlier pass.
    if ( $rejected_urls{$u} ) {
      LOG ($verbose_filter, " pre-rejecting candidate: $u");
      next;
    }

    LOG ($verbose_http, " HREF: $u");

    $subpages[++$#subpages] = $u;   # push
  }

  if ( $#subpages < 0 ) {
    LOG ($verbose_filter,
         "found nothing on $base ($length bytes, $href_count links).");
    return ();
  }

  # NOTE(review): "." and "+" have equal precedence (left-assoc), so
  # this evaluates as (("" . $#subpages) + 1) . " links..." -- the
  # count comes out right, but the expression is fragile.
  LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url");

  return ($search_count, @subpages);
}
+
+
# Filter a list of URLs, dropping any whose host (or whose trailing
# two- or three-component domain) appears in the global %poisoners
# table.  Non-http URLs are dropped too.  Returns the survivors.
#
sub depoison {
  my (@urls) = @_;
  my @survivors = ();

 URL:
  foreach my $u (@urls) {
    my ($host) = ($u =~ m@^http://([^/: \t\r\n]+)@i);
    next URL unless defined ($host);

    # Try the full host name, then its last three components, then
    # its last two, against the poisoner table.
    foreach my $key ($host,
                     ($host =~ m@([^.]+\.[^.]+\.[^.]+)$@),
                     ($host =~ m@([^.]+\.[^.]+)$@)) {
      next unless defined ($key);
      if ($poisoners{$key}) {
        LOG (($verbose_filter), " rejecting poisoner: $u");
        next URL;
      }
    }

    push @survivors, $u;
  }
  return @survivors;
}
+
+
# given a list of URLs, picks one at random; loads it; and returns a
# random image from it.
# returns the url of the page loaded; the url of the image chosen.
# Returns () when no page survives filtering or the page yields no image.
#
sub pick_image_from_pages {
  my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;

  $total_hit_count = "?" unless defined($total_hit_count);

  # Drop known-poisoned hosts before choosing.
  @pages = depoison (@pages);
  LOG ($verbose_load,
       "" . ($#pages+1) . " candidates of $unfiltered_link_count links" .
       " ($total_hit_count total)");

  return () if ($#pages < 0);

  my $i = int(rand($#pages+1));
  my $page = $pages[$i];

  LOG ($verbose_load, "picked page $page");

  $suppress_audit = 1;

  my ( $base2, $body2 ) = get_document ($page, $base, $timeout);

  if (!$base2 || !$body2) {
    $body2 = undef;
    return ();
  }

  my $img = pick_image_from_body ($base2, $body2);
  $body2 = undef;   # free the (possibly large) page body

  if ($img) {
    return ($base2, $img);
  } else {
    return ();
  }
}
+
+\f
+############################################################################
+#
+# Pick images from random pages returned by the Yahoo Random Link
+#
+############################################################################
+
# yahoorand
my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";


# Picks a random page; picks a random image on that page;
# returns two URLs: the page containing the image, and the image.
# Returns () if nothing found this time.
#
sub pick_from_yahoo_random_link {
  my ( $timeout ) = @_;

  print STDERR "\n\n" if ($verbose_load);
  LOG ($verbose_load, "URL: $yahoo_random_link");

  $last_search = $yahoo_random_link; # for warnings

  $suppress_audit = 1;

  my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
  if (!$base || !$body) {
    $body = undef;
    # bare "return;" is the empty list to our list-context callers.
    return;
  }

  # $base is the post-redirect URL of the randomly chosen page.
  LOG ($verbose_load, "redirected to: $base");

  my $img = pick_image_from_body ($base, $body);
  $body = undef;   # free the page body

  if ($img) {
    return ($base, $img);
  } else {
    return ();
  }
}
+
+\f
+############################################################################
+#
+# Pick images from random pages returned by the Alta Vista Random Link
+#
+############################################################################
+
# altavista
my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";


# Picks a random page; picks a random image on that page;
# returns two URLs: the page containing the image, and the image.
# Returns () if nothing found this time.
#
sub pick_from_alta_vista_random_link {
  my ( $timeout ) = @_;

  print STDERR "\n\n" if ($verbose_load);
  LOG ($verbose_load, "URL: $alta_vista_random_link");

  $last_search = $alta_vista_random_link; # for warnings

  $suppress_audit = 1;

  my ( $base, $body ) = get_document ($alta_vista_random_link,
                                      undef, $timeout);
  if (!$base || !$body) {
    $body = undef;
    # bare "return;" is the empty list to our list-context callers.
    return;
  }

  # $base is the post-redirect URL of the randomly chosen page.
  LOG ($verbose_load, "redirected to: $base");

  my $img = pick_image_from_body ($base, $body);
  $body = undef;   # free the page body

  if ($img) {
    return ($base, $img);
  } else {
    return ();
  }
}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Image Search
+#
+############################################################################
+
+
my $alta_vista_images_url = "http://www.altavista.com/image/results" .
                            "?ipht=1" .      # photos
                            "&igrph=1" .     # graphics
                            "&iclr=1" .      # color
                            "&ibw=1" .       # b&w
                            "&micat=1" .     # no partner sites
                            "&sc=on" .       # "site collapse"
                            "&q=";

# avimages
#
# Feeds a random word into Alta Vista Image Search (jumping to a random
# result page 1..9) and picks an image from one of the result pages.
#
sub pick_from_alta_vista_images {
  my ( $timeout ) = @_;

  my $words = random_word();
  my $page = (int(rand(9)) + 1);
  my $search_url = $alta_vista_images_url . $words;

  if ($page > 1) {
    $search_url .= "&pgno=" . $page; # page number
    $search_url .= "&stq=" . (($page-1) * 12); # first hit result on page
  }

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # avimages is encoding their URLs now.
    # Keep only redirector links ("...**http%3a..."), unwrapped.
    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
    $u = url_unquote($u);

    next unless ($u =~ m@^http://@i); # skip non-HTTP or relative URLs
    next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins
    next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots?
    next if ($u =~ m@[/.]doubleclick\.net\b@i); # you cretins
    next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins

    next if ($u =~ m@[/.]viewimages\.com\b@i); # stacked deck
    next if ($u =~ m@[/.]gettyimages\.com\b@i);

    LOG ($verbose_filter, " candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
                                $timeout, @candidates);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Google Image Search.
+# By Charles Gales <gales@us.ibm.com>
+#
+############################################################################
+
+
my $google_images_url = "http://images.google.com/images" .
                        "?site=images" . # photos
                        "&btnG=Search" . # graphics
                        "&safe=off" .    # no screening
                        "&imgsafe=off" .
                        "&q=";

# googleimgs
#
# Feeds $words (or one random word if undef) into Google Image Search
# and returns (referring page, image URL).  $max_page is accepted but
# not used here.
#
sub pick_from_google_images {
  my ( $timeout, $words, $max_page ) = @_;

  if (!defined($words)) {
    $words = random_word; # only one word for Google
  }

  my $page = (int(rand(9)) + 1);
  my $num = 20; # 20 images per page
  my $search_url = $google_images_url . $words;

  if ($page > 1) {
    $search_url .= "&start=" . $page*$num; # page number
    $search_url .= "&num=" . $num; #images per page
  }

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  my %referers;
  foreach my $u (@subpages) {
    next unless ($u =~ m@imgres\?imgurl@i); # All pics start with this
    next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins

    # "/imgres?imgurl=IMG&imgrefurl=REF&..." -- remember which page
    # each image came from so we can return the pair together.
    if ($u =~ m@^/imgres\?imgurl=(.*?)\&imgrefurl=(.*?)\&@) {
      my $ref = $2;
      my $img = $1;
      $img = "http://$img" unless ($img =~ m/^http:/i);

      LOG ($verbose_filter, " candidate: $ref");
      push @candidates, $img;
      $referers{$img} = $ref;
    }
  }

  @candidates = depoison (@candidates);
  return () if ($#candidates < 0);
  my $i = int(rand($#candidates+1));
  my $img = $candidates[$i];
  my $ref = $referers{$img};

  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
  return ($ref, $img);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random numbers into Google Image Search.
+# By jwz, suggested by Ian O'Donnell.
+#
+############################################################################
+
+
# googlenums
#
# Picks an image by searching Google Images for a random number in
# 0..9998; about a third of the time the number is zero-padded to four
# digits.
#
sub pick_from_google_image_numbers {
  my ( $timeout ) = @_;

  my $number = int (rand (9999));
  $number = sprintf ("%04d", $number) if (rand() < 0.3);

  pick_from_google_images ($timeout, "$number");
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random digital camera file names into
+# Google Image Search.
+# By jwz, inspired by the excellent Random Personal Picture Finder
+# at http://www.diddly.com/random/
+#
+############################################################################
+
# Table of code refs, one per camera file-name scheme: each returns a
# random plausible factory-default image file name for that camera.
#
my @photomakers = (
    #
    # Common digital camera file name formats, as described at
    # http://www.diddly.com/random/about.html
    #
    sub { sprintf ("dcp%05d.jpg", int(rand(4000))); },   # Kodak
    sub { sprintf ("dsc%05d.jpg", int(rand(4000))); },   # Nikon
    sub { sprintf ("dscn%04d.jpg", int(rand(4000))); },  # Nikon
    sub { sprintf ("mvc-%03d.jpg", int(rand(999))); },   # Sony Mavica
    sub { sprintf ("mvc%05d.jpg", int(rand(9999))); },   # Sony Mavica
    sub { sprintf ("P101%04d.jpg", int(rand(9999))); },  # Olympus w/ date=101
    sub { sprintf ("P%x%02d%04d.jpg",                    # Olympus
                   int(rand(0xC)), int(rand(30))+1,
                   rand(9999)); },
    sub { sprintf ("IMG_%03d.jpg", int(rand(999))); },   # ?
    sub { sprintf ("IMAG%04d.jpg", int(rand(9999))); },  # RCA and Samsung
    sub { my $n = int(rand(9999));                       # Canon
          sprintf ("1%02d-%04d.jpg", int($n/100), $n); },
    sub { my $n = int(rand(9999));                       # Canon
          sprintf ("1%02d-%04d_IMG.jpg",
                   int($n/100), $n); },
    sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); },  # Canon
    sub { sprintf ("dscf%04d.jpg", int(rand(9999))); },  # Fuji Finepix
    sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); },  # Toshiba PDR
    sub { sprintf ("IM%06d.jpg", int(rand(9999))); },    # HP Photosmart
    sub { sprintf ("EX%06d.jpg", int(rand(9999))); },    # HP Photosmart
#   sub { my $n = int(rand(3));                          # Kodak DC-40,50,120
#         sprintf ("DC%04d%s.jpg", int(rand(9999)),
#                  $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); },
    sub { sprintf ("pict%04d.jpg", int(rand(9999))); },  # Minolta Dimage
    sub { sprintf ("P%07d.jpg", int(rand(9999))); },     # Kodak DC290
#   sub { sprintf ("%02d%02d%04d.jpg",                   # Casio QV3000, QV4000
#                  int(rand(12))+1, int(rand(31))+1,
#                  int(rand(999))); },
#   sub { sprintf ("%02d%x%02d%04d.jpg",                 # Casio QV7000
#                  int(rand(6)),                         # year
#                  int(rand(12))+1, int(rand(31))+1,
#                  int(rand(999))); },
    sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); },  # Pentax Optio S
    sub { sprintf ("PANA%04d.jpg", int(rand(9999))); },  # Panasonic vid still
    sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); },  # HP Photosmart
    sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); },  # ?
  );
+
+
# googlephotos
#
# Picks one of the @photomakers generators at random, constructs a
# plausible digital-camera file name, and searches Google Images for
# JPEGs by that name.
#
sub pick_from_google_image_photos {
  my ( $timeout ) = @_;

  my $i = int(rand($#photomakers + 1));
  my $fn = $photomakers[$i];
  my $file = &$fn;

  # Fixed: was "my $words .= ...", appending to a freshly declared
  # (undef) variable; plain assignment is what was meant.
  my $words = $file . "%20filetype:jpg";

  pick_from_google_images ($timeout, $words);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Text Search
+#
+############################################################################
+
+
my $alta_vista_url = "http://www.altavista.com/web/results" .
                     "?pg=aq" .
                     "&aqmode=s" .
                     "&filetype=html" .
                     "&sc=on" .       # "site collapse"
                     "&nbq=50" .
                     "&aqo=";

# avtext
#
# Feeds five random words into Alta Vista text search (jumping to a
# random result page 1..9) and picks an image from a result page.
#
sub pick_from_alta_vista_text {
  my ( $timeout ) = @_;

  my $words = random_words(0);
  my $page = (int(rand(9)) + 1);
  my $search_url = $alta_vista_url . $words;

  if ($page > 1) {
    $search_url .= "&pgno=" . $page;
    $search_url .= "&stq=" . (($page-1) * 10);
  }

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Those altavista fuckers are playing really nasty redirection games
    # these days: the filter your clicks through their site, but use
    # onMouseOver to make it look like they're not! Well, it makes it
    # easier for us to identify search results...
    #
    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
    $u = url_unquote($u);

    next unless ($u =~ m@^http://@i); # skip non-HTTP or relative URLs
    next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins
    next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots?

    LOG ($verbose_filter, " candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
                                $timeout, @candidates);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Hotbot
+#
+############################################################################
+
my $hotbot_search_url = ("http://hotbot.lycos.com/default.asp" .
                         "?ca=w" .
                         "&descriptiontype=0" .
                         "&imagetoggle=1" .
                         "&matchmode=any" .
                         "&nummod=2" .
                         "&recordcount=50" .
                         "&sitegroup=1" .
                         "&stem=1" .
                         "&cobrand=undefined" .
                         "&query=");

# Feeds one random word into Hotbot text search (starting at a random
# result offset) and picks an image from one of the result pages.
#
sub pick_from_hotbot_text {
  my ( $timeout ) = @_;

  $last_search = $hotbot_search_url; # for warnings

  # lycos seems to always give us back dictionaries and word lists if
  # we search for more than one word...
  #
  my $words = random_word();

  my $start = int(rand(8)) * 10 + 1;
  my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Hotbot plays redirection games too
    # (not any more?)
#   next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
#   $u = url_decode($1);

    next unless ($u =~ m@^http://@i); # skip non-HTTP or relative URLs
    next if ($u =~ m@[/.]hotbot\.com\b@i); # skip hotbot builtins
    next if ($u =~ m@[/.]lycos\.com\b@i); # skip hotbot builtins
    next if ($u =~ m@[/.]inktomi\.com\b@i); # skip hotbot builtins

    LOG ($verbose_filter, " candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
                                $timeout, @candidates);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Lycos
+#
+############################################################################
+
my $lycos_search_url = "http://search.lycos.com/default.asp" .
                       "?lpv=1" .
                       "&loc=searchhp" .
                       "&tab=web" .
                       "&query=";

# Feeds one random word into Lycos web search (starting at a random
# result offset) and picks an image from one of the result pages.
#
sub pick_from_lycos_text {
  my ( $timeout ) = @_;

  $last_search = $lycos_search_url; # for warnings

  # lycos seems to always give us back dictionaries and word lists if
  # we search for more than one word...
  #
  my $words = random_word();

  my $start = int(rand(8)) * 10 + 1;
  my $search_url = $lycos_search_url . $words . "&first=$start&page=more";

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # Lycos plays redirection games.
    # (not any more?)
#   next unless ($u =~ m@^http://click.lycos.com/director.asp
#                        .*
#                        \btarget=([^&]+)
#                        .*
#                       @x);
#   $u = url_decode($1);

    next unless ($u =~ m@^http://@i); # skip non-HTTP or relative URLs
    next if ($u =~ m@[/.]hotbot\.com\b@i); # skip lycos builtins
    next if ($u =~ m@[/.]lycos\.com\b@i); # skip lycos builtins
    next if ($u =~ m@[/.]terralycos\.com\b@i); # skip lycos builtins
    next if ($u =~ m@[/.]inktomi\.com\b@i); # skip lycos builtins


    LOG ($verbose_filter, " candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
                                $timeout, @candidates);
}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into news.yahoo.com
+#
+############################################################################
+
my $yahoo_news_url = "http://news.search.yahoo.com/search/news" .
                     "?c=news_photos" .
                     "&p=";

# yahoonews
#
# Feeds one random word into Yahoo News photo search and picks an
# image from one of the Yahoo-hosted story pages.
#
sub pick_from_yahoo_news_text {
  my ( $timeout ) = @_;

  $last_search = $yahoo_news_url; # for warnings

  my $words = random_word();
  my $search_url = $yahoo_news_url . $words;

  my ($search_hit_count, @subpages) =
    pick_from_search_engine ($timeout, $search_url, $words);

  my @candidates = ();
  foreach my $u (@subpages) {

    # de-redirectize the URLs
    $u =~ s@^http://rds\.yahoo\.com/.*-http%3A@http:@s;

    # only accept URLs on Yahoo's news site
    next unless ($u =~ m@^http://dailynews\.yahoo\.com/@i ||
                 $u =~ m@^http://story\.news\.yahoo\.com/@i);
    next unless ($u =~ m@&u=/@);

    LOG ($verbose_filter, " candidate: $u");
    push @candidates, $u;
  }

  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
                                $timeout, @candidates);
}
+
+
+\f
+############################################################################
+#
+# Pick images from LiveJournal's list of recently-posted images.
+#
+############################################################################
+
my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml";

# With most of our image sources, we get a random page and then select
# from the images on it. However, in the case of LiveJournal, the page
# of images tends to update slowly; so we'll remember the last N entries
# on it and randomly select from those, to get a wider variety each time.

my $lj_cache_size = 1000;
my @lj_cache = (); # fifo, for ordering by age
my %lj_cache = (); # hash, for detecting dups

# livejournal
#
# Scrapes LiveJournal's recent-images feed into the cache above, then
# picks (and removes) a random cached [image, page] pair.  Returns
# (page URL, image URL), or () when the cache stays empty.
#
sub pick_from_livejournal_images {
  my ( $timeout ) = @_;

  $last_search = $livejournal_img_url; # for warnings

  my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);
  return () unless $body;

  # Put one <recent-image ...> element per line, then parse each line.
  $body =~ s/\n/ /gs;
  $body =~ s/(<recent-image)\b/\n$1/gsi;

  foreach (split (/\n/, $body)) {
    next unless (m/^<recent-image\b/);
    next unless (m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
    my $img = html_unquote ($1);

    next if ($lj_cache{$img}); # already have it

    next unless (m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
    my $page = html_unquote ($1);
    my @pair = ($img, $page);
    LOG ($verbose_filter, " candidate: $img");
    push @lj_cache, \@pair;
    $lj_cache{$img} = \@pair;
  }

  return () if ($#lj_cache == -1);   # cache still empty: nothing found

  my $n = $#lj_cache+1;
  my $i = int(rand($n));
  my ($img, $page) = @{$lj_cache[$i]};

  # delete this one from @lj_cache and from %lj_cache.
  #
  @lj_cache = ( @lj_cache[0 .. $i-1],
                @lj_cache[$i+1 .. $#lj_cache] );
  delete $lj_cache{$img};

  # Keep the size of the cache under the limit by nuking older entries
  # (oldest entries are at the front of the fifo).
  #
  while ($#lj_cache >= $lj_cache_size) {
    my $pairP = shift @lj_cache;
    my $img = $pairP->[0];
    delete $lj_cache{$img};
  }

  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");

  return ($page, $img);
}
+
+\f
+############################################################################
+#
+# Pick images from ircimages.com (images that have been in the /topic of
+# various IRC channels.)
+#
+############################################################################
+
+my $ircimages_url = "http://ircimages.com/";
+
+# ircimages
+sub pick_from_ircimages {
+ my ( $timeout ) = @_;
+
+ $last_search = $ircimages_url; # for warnings
+
+ my $n = int(rand(2900));
+ my $search_url = $ircimages_url . "page-$n";
+
+ my ( $base, $body ) = get_document ($search_url, undef, $timeout);
+ return () unless $body;
+
+ my @candidates = ();
+
+ $body =~ s/\n/ /gs;
+ $body =~ s/(<A)\b/\n$1/gsi;
+
+ foreach (split (/\n/, $body)) {
+
+ my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
+ next unless $u;
+
+ if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string
+ elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token
+
+ next unless ($u =~ m/^http:/i);
+ next if ($u =~ m@^http://(searchirc\.com\|ircimages\.com)@i);
+ next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);
+
+ LOG ($verbose_http, " HREF: $u");
+ push @candidates, $u;
+ }