+ my $fsp = ($body =~ m@<frameset@i);
+
+ $_ = undef;
+ $body = undef;
+
+ @urls = depoison (@urls);
+
+ if ( $#urls < 0 ) {
+ LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
+ return ();
+ }
+
+ # pick a random element of the table
+ my $i = int(rand($#urls+1));
+ $url = $urls[$i];
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
+
+ return $url;
+}
+
+
+# Given a URL and the RSS feed from that URL, pick a random image from
+# the feed. This is a lot simpler than extracting images out of a page:
+# we already know we have reasonable images, so we just pick one.
+# Returns: the real URL of the page (preferably not the RSS version),
+# and the image.
+
+sub pick_image_from_rss($$) {
+ my ($url, $body) = @_;
+
+ my ($base) = ($body =~ m@<link>([^<>]+)</link>@si); # root link
+
+ my @items = ($body =~ m@<item\b[^<>]*>(.*?)</item>@gsi);
+ return unless @items;
+
+ my $n = @items;
+ my $i = int(rand($n));
+ my $item = $items[$i];
+
+ $base = $1 if ($item =~ m@<link>([^<>]+)</link>@si); # item link
+ $base = $url unless $base;
+
+ ($url) = ($item =~ m/<enclosure\b[^<>]*\burl="(.*?)"/si);
+ return unless $url;
+
+ LOG ($verbose_load, "picked image $i/$n: $url");
+ return ($base, $url);
+}
+
+\f
+############################################################################
+#
+# Subroutines for getting pages and images out of search engines
+#
+############################################################################
+
+
+sub pick_dictionary() {
+ my @dicts = ("/usr/dict/words",
+ "/usr/share/dict/words",
+ "/usr/share/lib/dict/words",
+ "/usr/share/dict/cracklib-small",
+ "/usr/share/dict/cracklib-words"
+ );
+ foreach my $f (@dicts) {
+ if (-f $f) {
+ $wordlist = $f;
+ last;
+ }
+ }
+ error ("$dicts[0] does not exist") unless defined($wordlist);
+}
+
+# returns a random word from the dictionary
+#
+sub random_word() {
+
+ return undef unless open (my $in, '<', $wordlist);
+
+ my $size = (stat($in))[7];
+ my $word = undef;
+ my $count = 0;
+
+ while (1) {
+ error ("looping ($count) while reading $wordlist")
+ if (++$count > 100);
+
+ my $pos = int (rand ($size));
+ if (seek ($in, $pos, 0)) {
+ $word = <$in>; # toss partial line
+ $word = <$in>; # keep next line
+ }
+
+ next unless ($word);
+ next if ($word =~ m/^[-\']/);
+
+ $word = lc($word);
+ $word =~ s/^.*-//s;
+ $word =~ s/^[^a-z]+//s;
+ $word =~ s/[^a-z]+$//s;
+ $word =~ s/\'s$//s;
+ $word =~ s/ys$/y/s;
+ $word =~ s/ally$//s;
+ $word =~ s/ly$//s;
+ $word =~ s/ies$/y/s;
+ $word =~ s/ally$/al/s;
+ $word =~ s/izes$/ize/s;
+ $word =~ s/esses$/ess/s;
+ $word =~ s/(.{5})ing$/$1/s;
+
+ next if (length ($word) > 14);
+ last if ($word);
+ }
+
+ close ($in);
+
+ if ( $word =~ s/\s/\+/gs ) { # convert intra-word spaces to "+".
+ $word = "\%22$word\%22"; # And put quotes (%22) around it.
+ }
+
+ return $word;
+}
+
+
+sub random_words($) {
+ my ($sep) = @_;
+ return (random_word() . $sep .
+ random_word() . $sep .
+ random_word() . $sep .
+ random_word() . $sep .
+ random_word());
+}
+
+
+sub url_quote($) {
+ my ($s) = @_;
+ $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
+ return $s;
+}
+
+sub url_unquote($) {
+ my ($s) = @_;
+ $s =~ s/[+]/ /g;
+ $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
+ return $s;
+}
+
+sub html_quote($) {
+ my ($s) = @_;
+ $s =~ s/&/&/gi;
+ $s =~ s/</</gi;
+ $s =~ s/>/>/gi;
+ $s =~ s/\"/"/gi;
+ return $s;
+}
+
+sub html_unquote($) {
+ my ($s) = @_;
+ $s =~ s/(&([a-z]+);)/{ $entity_table{$2} || $1; }/gexi; # e.g., '
+ $s =~ s/(&\#(\d+);)/{ chr($2) }/gexi; # e.g., '
+ return $s;
+}
+
+
+# Loads the given URL (a search on some search engine) and returns:
+# - the total number of hits the search engine claimed it had;
+# - a list of URLs from the page that the search engine returned;
+# Note that this list contains all kinds of internal search engine
+# junk URLs too -- caller must prune them.
+#
+sub pick_from_search_engine($$$) {
+ my ( $timeout, $search_url, $words ) = @_;
+
+ $_ = $words;
+ s/%20/ /g;
+
+ print STDERR "\n\n" if ($verbose_load);
+
+ LOG ($verbose_load, "words: $_");
+ LOG ($verbose_load, "URL: $search_url");
+
+ $last_search = $search_url; # for warnings
+
+ my $start = time;
+ my ( $base, $body ) = get_document ($search_url, undef, $timeout);
+ if (defined ($timeout)) {
+ $timeout -= (time - $start);
+ if ($timeout <= 0) {
+ $body = undef;
+ LOG (($verbose_net || $verbose_load),
+ "timed out (late) for $search_url");
+ $suppress_audit = 1;
+ return ();
+ }
+ }
+
+ return () if (! $body);
+
+
+ my @subpages;
+
+ my $search_count = "?";
+ if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
+ $search_count = $3;
+ } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*) @i) {
+ $search_count = $1;
+ } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
+ $search_count = $1;
+ } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
+ $search_count = $1;
+ } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
+ $search_count = $1;
+ } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
+ $search_count = $1;
+ } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
+ $search_count = $1;
+ } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
+ $search_count = $1;
+ } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
+ $search_count = $1; # lycos
+ } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
+ $search_count = $1; # hotbot
+ } elsif ($body =~ m@no photos were found containing@i) { # avimages
+ $search_count = "0";
+ } elsif ($body =~ m@found no document matching@i) { # avtext
+ $search_count = "0";
+ }
+ 1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);
+
+# if ($search_count eq "?" || $search_count eq "0") {
+# my $file = "/tmp/wc.html";
+# open (my $out, '>', $file) || error ("writing $file: $!");
+# print $out $body;
+# close $out;
+# print STDERR blurb() . "###### wrote $file\n";
+# }
+
+
+ my $length = length($body);
+ my $href_count = 0;
+
+ $_ = $body;
+
+ s/[\r\n\t ]+/ /g;
+
+
+ s/(<A )/\n$1/gi;
+ foreach (split(/\n/)) {
+ $href_count++;
+ my ($u) = m@<A\s.*?\bHREF\s*=\s*([\"\'][^\"\'<>]+)@i;
+ next unless $u;
+ my ($u2) = m@<IMG\s.*\bSRC\s*=\s*[\"\']([^\"\'<>]+)@i;
+
+ if (m/\bm="\{(.*?)\}"/s) { # Bing info is inside JSON crud
+ my $json = html_unquote($1);
+ my ($href) = ($json =~ m/\b(?:surl|purl)\"?:\s*"(.*?)"/s);
+ my ($img) = ($json =~ m/\b(?:imgurl|murl)\"?:\s*"(.*?)"/s);
+ $u = "$img\t$href" if ($img && $href);
+
+ } elsif ($u2 && $u2 =~ m@://[^/]*\.gstatic\.com/@s) { $u = $u2;
+ $u =~ s/^\"|\"$//s;
+
+ } elsif ($u =~ m/^\"([^\"]*)\"/) { $u = $1 # quoted string
+ } elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; # or token
+ }
+
+ if ( $rejected_urls{$u} ) {
+ LOG ($verbose_filter, " pre-rejecting candidate: $u");
+ next;
+ }
+
+ LOG ($verbose_http, " HREF: $u");
+
+ $subpages[++$#subpages] = $u;
+ }
+
+ if ( $#subpages < 0 ) {
+ LOG ($verbose_filter,
+ "found nothing on $base ($length bytes, $href_count links).");
+ return ();
+ }
+
+ LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url");
+
+ return ($search_count, @subpages);
+}
+
+
+sub depoison(@) {
+ my (@urls) = @_;
+ my @urls2 = ();
+ foreach (@urls) {
+ my ($h) = m@^https?://([^/: \t\r\n]+)@i;
+
+ next unless defined($h);
+
+ if ($poisoners{$h}) {
+ LOG (($verbose_filter), " rejecting poisoner: $_");
+ next;
+ }
+ if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
+ $poisoners{$1}) {
+ LOG (($verbose_filter), " rejecting poisoner: $_");
+ next;
+ }
+ if ($h =~ m@([^.]+\.[^.]+)$@ &&
+ $poisoners{$1}) {
+ LOG (($verbose_filter), " rejecting poisoner: $_");
+ next;
+ }
+
+ push @urls2, $_;
+ }
+ return @urls2;
+}
+
+
+# given a list of URLs, picks one at random; loads it; and returns a
+# random image from it.
+# returns the url of the page loaded; the url of the image chosen.
+#
+sub pick_image_from_pages($$$$@) {
+ my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;
+
+ $total_hit_count = "?" unless defined($total_hit_count);
+
+ @pages = depoison (@pages);
+ LOG ($verbose_load,
+ "" . ($#pages+1) . " candidates of $unfiltered_link_count links" .
+ " ($total_hit_count total)");
+
+ return () if ($#pages < 0);
+
+ my $i = int(rand($#pages+1));
+ my $page = $pages[$i];
+
+ LOG ($verbose_load, "picked page $page");
+
+ $suppress_audit = 1;
+
+ my ( $base2, $body2 ) = get_document ($page, $base, $timeout);
+
+ if (!$base2 || !$body2) {
+ $body2 = undef;
+ return ();
+ }
+
+ my $img = pick_image_from_body ($base2, $body2);
+ $body2 = undef;
+
+ if ($img) {
+ return ($base2, $img);
+ } else {
+ return ();
+ }
+}
+
+\f
+#############################################################################
+##
+## Pick images from random pages returned by the Yahoo Random Link
+##
+#############################################################################
+#
+## yahoorand
+#my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";
+#
+#
+# Picks a random page; picks a random image on that page;
+# returns two URLs: the page containing the image, and the image.
+# Returns () if nothing found this time.
+#
+#sub pick_from_yahoo_random_link($) {
+# my ($timeout) = @_;
+#
+# print STDERR "\n\n" if ($verbose_load);
+# LOG ($verbose_load, "URL: $yahoo_random_link");
+#
+# $last_search = $yahoo_random_link; # for warnings
+#
+# $suppress_audit = 1;
+#
+# my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
+# if (!$base || !$body) {
+# $body = undef;
+# return;
+# }
+#
+# LOG ($verbose_load, "redirected to: $base");
+#
+# my $img = pick_image_from_body ($base, $body);
+# $body = undef;
+#
+# if ($img) {
+# return ($base, $img);
+# } else {
+# return ();
+# }
+#}
+
+\f
+############################################################################
+#
+# Pick images from random pages returned by the Alta Vista Random Link
+# Note: this seems to have gotten a *lot* less random lately (2007).
+#
+############################################################################
+
+# altavista
+my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";
+
+
+# Picks a random page; picks a random image on that page;
+# returns two URLs: the page containing the image, and the image.
+# Returns () if nothing found this time.
+#
+sub pick_from_alta_vista_random_link($) {
+ my ($timeout) = @_;
+
+ print STDERR "\n\n" if ($verbose_load);
+ LOG ($verbose_load, "URL: $alta_vista_random_link");
+
+ $last_search = $alta_vista_random_link; # for warnings
+
+ $suppress_audit = 1;
+
+ my ( $base, $body ) = get_document ($alta_vista_random_link,
+ undef, $timeout);
+ if (!$base || !$body) {
+ $body = undef;
+ return;
+ }
+
+ LOG ($verbose_load, "redirected to: $base");
+
+ my $img = pick_image_from_body ($base, $body);
+ $body = undef;
+
+ if ($img) {
+ return ($base, $img);
+ } else {
+ return ();
+ }
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Image Search
+#
+############################################################################
+
+
+my $alta_vista_images_url = "http://www.altavista.com/image/results" .
+ "?ipht=1" . # photos
+ "&igrph=1" . # graphics
+ "&iclr=1" . # color
+ "&ibw=1" . # b&w
+ "&micat=1" . # no partner sites
+ "&sc=on" . # "site collapse"
+ "&q=";
+
+# avimages
+sub pick_from_alta_vista_images($) {
+ my ($timeout) = @_;
+
+ my $words = random_word();
+ my $page = (int(rand(9)) + 1);
+ my $search_url = $alta_vista_images_url . $words;
+
+ if ($page > 1) {
+ $search_url .= "&pgno=" . $page; # page number
+ $search_url .= "&stq=" . (($page-1) * 12); # first hit result on page
+ }
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+
+ # avimages is encoding their URLs now.
+ next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
+ $u = url_unquote($u);
+
+ next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs
+ next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins
+ next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots?
+ next if ($u =~ m@[/.]doubleclick\.net\b@i); # you cretins
+ next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins
+
+ next if ($u =~ m@[/.]viewimages\.com\b@i); # stacked deck
+ next if ($u =~ m@[/.]gettyimages\.com\b@i);
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+ $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images from Aptix security cameras
+# Cribbed liberally from google image search code.
+# By Jason Sullivan <jasonsul@us.ibm.com>
+#
+############################################################################
+
+my $aptix_images_url = ("http://www.google.com/search" .
+ "?q=inurl:%22jpg/image.jpg%3Fr%3D%22");
+
+# securitycam
+sub pick_from_security_camera($) {
+ my ($timeout) = @_;
+
+ my $page = (int(rand(9)) + 1);
+ my $num = 20; # 20 images per page
+ my $search_url = $aptix_images_url;
+
+ if ($page > 1) {
+ $search_url .= "&start=" . $page*$num; # page number
+ $search_url .= "&num=" . $num; #images per page
+ }
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, '');
+
+ my @candidates = ();
+ my %referers;
+ foreach my $u (@subpages) {
+ next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins (most links)
+ next unless ($u =~ m@jpg/image.jpg\?r=@i); # All pics contain this
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ $referers{$u} = $u;
+ }
+
+ @candidates = depoison (@candidates);
+ return () if ($#candidates < 0);
+ my $i = int(rand($#candidates+1));
+ my $img = $candidates[$i];
+ my $ref = $referers{$img};
+
+ LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
+ return ($ref, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Google Image Search.
+# By Charles Gales <gales@us.ibm.com>
+#
+############################################################################
+
+my $google_images_url = 'https://www.google.com/search' .
+ '?source=lnms&tbm=isch&tbs=isz:l&q=';
+
+# googleimgs
+sub pick_from_google_images($;$$) {
+ my ($timeout, $words, $max_page) = @_;
+
+ if (!defined($words)) {
+ $words = random_word(); # only one word for Google
+ }
+
+ my $off = int(rand(40));
+ my $search_url = $google_images_url . $words . "&start=" . $off;
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+ $u = html_unquote($u);
+ # next if ($u =~ m@^https?://[^.]*\.(google|youtube)\.com/@s);
+ next unless ($u =~ m@^https?://[^/]*\.gstatic\.com@s);
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ @candidates = depoison (@candidates);
+ return () if ($#candidates < 0);
+ my $i = int(rand($#candidates+1));
+ my $img = $candidates[$i];
+
+ LOG ($verbose_load, "picked image " . ($i+1) . ": $img");
+ return ($img, $img);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random numbers into Google Image Search.
+# By jwz, suggested by Ian O'Donnell.
+#
+############################################################################
+
+
+# googlenums
+sub pick_from_google_image_numbers($) {
+ my ($timeout) = @_;
+
+ my $max = 9999;
+ my $number = int(rand($max));
+
+ $number = sprintf("%04d", $number)
+ if (rand() < 0.3);
+
+ pick_from_google_images ($timeout, "$number");
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random digital camera file names into
+# Google Image Search.
+# By jwz, inspired by the excellent Random Personal Picture Finder
+# at http://www.diddly.com/random/
+# May 2017: Commented out a bunch of formats that have fallen out of favor.
+#
+############################################################################
+
+my @photomakers = (
+ #
+ # Common digital camera file name formats, as described at
+ # http://www.diddly.com/random/about.html
+ #
+# sub { sprintf ("dcp%05d.jpg", int(rand(4000))); }, # Kodak
+ sub { sprintf ("dsc%05d.jpg", int(rand(4000))); }, # Nikon
+ sub { sprintf ("dscn%04d.jpg", int(rand(4000))); }, # Nikon
+# sub { sprintf ("mvc-%03d.jpg", int(rand(999))); }, # Sony Mavica
+# sub { sprintf ("mvc%05d.jpg", int(rand(9999))); }, # Sony Mavica
+# sub { sprintf ("P101%04d.jpg", int(rand(9999))); }, # Olympus w/ date=101
+# sub { sprintf ("P%x%02d%04d.jpg", # Olympus
+# int(rand(0xC)), int(rand(30))+1,
+# rand(9999)); },
+ sub { sprintf ("IMG_%03d.jpg", int(rand(999))); }, # ?
+# sub { sprintf ("IMAG%04d.jpg", int(rand(9999))); }, # RCA and Samsung
+# sub { my $n = int(rand(9999)); # Canon
+# sprintf ("1%02d-%04d.jpg", int($n/100), $n); },
+# sub { my $n = int(rand(9999)); # Canon
+# sprintf ("1%02d-%04d_IMG.jpg",
+# int($n/100), $n); },
+ sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); }, # Canon
+ sub { sprintf ("dscf%04d.jpg", int(rand(9999))); }, # Fuji Finepix
+# sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); }, # Toshiba PDR
+# sub { sprintf ("IM%06d.jpg", int(rand(9999))); }, # HP Photosmart
+# sub { sprintf ("EX%06d.jpg", int(rand(9999))); }, # HP Photosmart
+# sub { my $n = int(rand(3)); # Kodak DC-40,50,120
+# sprintf ("DC%04d%s.jpg", int(rand(9999)),
+# $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); },
+ sub { sprintf ("pict%04d.jpg", int(rand(9999))); }, # Minolta Dimage
+# sub { sprintf ("P%07d.jpg", int(rand(9999))); }, # Kodak DC290
+# sub { sprintf ("%02d%02d%04d.jpg", # Casio QV3000, QV4000
+# int(rand(12))+1, int(rand(31))+1,
+# int(rand(999))); },
+# sub { sprintf ("%02d%x%02d%04d.jpg", # Casio QV7000
+# int(rand(6)), # year
+# int(rand(12))+1, int(rand(31))+1,
+# int(rand(999))); },
+ sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); }, # Pentax Optio S
+# sub { sprintf ("PANA%04d.jpg", int(rand(9999))); }, # Panasonic vid still
+ sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); }, # HP Photosmart
+# sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); }, # ?
+ );
+
+
+# googlephotos
+sub pick_from_google_image_photos($) {
+ my ($timeout) = @_;
+
+ my $i = int(rand($#photomakers + 1));
+ my $fn = $photomakers[$i];
+ my $file = &$fn;
+ #$file .= "%20filetype:jpg";
+
+ pick_from_google_images ($timeout, $file);
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Google Image Search.
+# By the way: fuck Microsoft.
+#
+############################################################################
+
+my $bing_images_url = "http://www.bing.com/images/async?q=";
+
+
+# bingimgs
+sub pick_from_bing_images($;$$) {
+ my ($timeout, $words, $max_page) = @_;
+
+ if (!defined($words)) {
+ $words = random_word(); # only one word for Bing
+ }
+
+ my $off = int(rand(300));
+ my $search_url = $bing_images_url . $words . "&first=" . $off;
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ my %referers;
+ foreach my $u (@subpages) {
+ my ($img, $ref) = ($u =~ m/^(.*?)\t(.*)$/s);
+ next unless $img;
+ LOG ($verbose_filter, " candidate: $ref");
+ push @candidates, $img;
+ $referers{$img} = $ref;
+ }
+
+ @candidates = depoison (@candidates);
+ return () if ($#candidates < 0);
+ my $i = int(rand($#candidates+1));
+ my $img = $candidates[$i];
+ my $ref = $referers{$img};
+
+ LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
+ return ($ref, $img);
+}
+
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random numbers into Bing Image Search.
+#
+############################################################################
+
+# bingnums
+sub pick_from_bing_image_numbers($) {
+ my ($timeout) = @_;
+
+ my $max = 9999;
+ my $number = int(rand($max));
+
+ $number = sprintf("%04d", $number)
+ if (rand() < 0.3);
+
+ pick_from_bing_images ($timeout, "$number");
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random numbers into Bing Image Search.
+#
+############################################################################
+
+# bingphotos
+sub pick_from_bing_image_photos($) {
+ my ($timeout) = @_;
+
+ my $i = int(rand($#photomakers + 1));
+ my $fn = $photomakers[$i];
+ my $file = &$fn;
+
+ pick_from_bing_images ($timeout, $file);
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Text Search
+#
+############################################################################
+
+
+my $alta_vista_url = "http://www.altavista.com/web/results" .
+ "?pg=aq" .
+ "&aqmode=s" .
+ "&filetype=html" .
+ "&sc=on" . # "site collapse"
+ "&nbq=50" .
+ "&aqo=";
+
+# avtext
+sub pick_from_alta_vista_text($) {
+ my ($timeout) = @_;
+
+ my $words = random_words('%20');
+ my $page = (int(rand(9)) + 1);
+ my $search_url = $alta_vista_url . $words;
+
+ if ($page > 1) {
+ $search_url .= "&pgno=" . $page;
+ $search_url .= "&stq=" . (($page-1) * 10);
+ }
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+
+ # Those altavista fuckers are playing really nasty redirection games
+ # these days: the filter your clicks through their site, but use
+ # onMouseOver to make it look like they're not! Well, it makes it
+ # easier for us to identify search results...
+ #
+ next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
+ $u = url_unquote($u);
+
+ next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs
+ next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins
+ next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots?
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+ $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Hotbot
+#
+############################################################################
+
+my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" .
+ "?ca=w" .
+ "&descriptiontype=0" .
+ "&imagetoggle=1" .
+ "&matchmode=any" .
+ "&nummod=2" .
+ "&recordcount=50" .
+ "&sitegroup=1" .
+ "&stem=1" .
+ "&cobrand=undefined" .
+ "&query=");
+
+sub pick_from_hotbot_text($) {
+ my ($timeout) = @_;
+
+ $last_search = $hotbot_search_url; # for warnings
+
+ # lycos seems to always give us back dictionaries and word lists if
+ # we search for more than one word...
+ #
+ my $words = random_word();
+
+ my $start = int(rand(8)) * 10 + 1;
+ my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+
+ # Hotbot plays redirection games too
+ # (not any more?)
+# next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
+# $u = url_decode($1);
+
+ next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs
+ next if ($u =~ m@[/.]hotbot\.com\b@i); # skip hotbot builtins
+ next if ($u =~ m@[/.]lycos\.com\b@i); # skip hotbot builtins
+ next if ($u =~ m@[/.]inktomi\.com\b@i); # skip hotbot builtins
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+ $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Lycos
+#
+############################################################################
+
+my $lycos_search_url = "http://search.lycos.com/default.asp" .
+ "?lpv=1" .
+ "&loc=searchhp" .
+ "&tab=web" .
+ "&query=";
+
+sub pick_from_lycos_text($) {
+ my ($timeout) = @_;
+
+ $last_search = $lycos_search_url; # for warnings
+
+ # lycos seems to always give us back dictionaries and word lists if
+ # we search for more than one word...
+ #
+ my $words = random_word();
+
+ my $start = int(rand(8)) * 10 + 1;
+ my $search_url = $lycos_search_url . $words . "&first=$start&page=more";
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+
+ # Lycos plays redirection games.
+ # (not any more?)
+# next unless ($u =~ m@^https?://click.lycos.com/director.asp
+# .*
+# \btarget=([^&]+)
+# .*
+# @x);
+# $u = url_decode($1);
+
+ next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs
+ next if ($u =~ m@[/.]hotbot\.com\b@i); # skip lycos builtins
+ next if ($u =~ m@[/.]lycos\.com\b@i); # skip lycos builtins
+ next if ($u =~ m@[/.]terralycos\.com\b@i); # skip lycos builtins
+ next if ($u =~ m@[/.]inktomi\.com\b@i); # skip lycos builtins
+
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+ $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into news.yahoo.com
+#
+############################################################################
+
+my $yahoo_news_url = "http://news.search.yahoo.com/search/news" .
+ "?c=news_photos" .
+ "&p=";
+
+# yahoonews
+sub pick_from_yahoo_news_text($) {
+ my ($timeout) = @_;
+
+ $last_search = $yahoo_news_url; # for warnings
+
+ my $words = random_word();
+ my $search_url = $yahoo_news_url . $words;
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, $words);
+
+ my @candidates = ();
+ foreach my $u (@subpages) {
+
+ # de-redirectize the URLs
+ $u =~ s@^https?://rds\.yahoo\.com/.*-http%3A@http:@s;
+
+ # only accept URLs on Yahoo's news site
+ next unless ($u =~ m@^https?://dailynews\.yahoo\.com/@i ||
+ $u =~ m@^https?://story\.news\.yahoo\.com/@i);
+ next unless ($u =~ m@&u=/@);
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ }
+
+ return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+ $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images from LiveJournal's list of recently-posted images.
+#
+############################################################################
+
+my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml";
+
+# With most of our image sources, we get a random page and then select
+# from the images on it. However, in the case of LiveJournal, the page
+# of images tends to update slowly; so we'll remember the last N entries
+# on it and randomly select from those, to get a wider variety each time.
+
+my $lj_cache_size = 1000;
+my @lj_cache = (); # fifo, for ordering by age
+my %lj_cache = (); # hash, for detecting dups
+
+# livejournal
+sub pick_from_livejournal_images($) {
+ my ($timeout) = @_;
+
+ $last_search = $livejournal_img_url; # for warnings
+
+ my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);
+
+ # Often the document comes back empty. If so, just use the cache.
+ # return () unless $body;
+ $body = '' unless defined($body);
+
+ $body =~ s/\n/ /gs;
+ $body =~ s/(<recent-image)\b/\n$1/gsi;
+
+ foreach (split (/\n/, $body)) {
+ next unless (m/^<recent-image\b/);
+ next unless (m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
+ my $img = html_unquote ($1);
+
+ next if ($lj_cache{$img}); # already have it
+
+ next unless (m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
+ my $page = html_unquote ($1);
+ my @pair = ($img, $page);
+ LOG ($verbose_filter, " candidate: $img");
+ push @lj_cache, \@pair;
+ $lj_cache{$img} = \@pair;
+ }
+
+ return () if ($#lj_cache == -1);
+
+ my $n = $#lj_cache+1;
+ my $i = int(rand($n));
+ my ($img, $page) = @{$lj_cache[$i]};
+
+ # delete this one from @lj_cache and from %lj_cache.
+ #
+ @lj_cache = ( @lj_cache[0 .. $i-1],
+ @lj_cache[$i+1 .. $#lj_cache] );
+ delete $lj_cache{$img};
+
+ # Keep the size of the cache under the limit by nuking older entries
+ #
+ while ($#lj_cache >= $lj_cache_size) {
+ my $pairP = shift @lj_cache;
+ my $img = $pairP->[0];
+ delete $lj_cache{$img};
+ }
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+ return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from ircimages.com (images that have been in the /topic of
+# various IRC channels.)
+#
+############################################################################
+
+my $ircimages_url = "http://ircimages.com/";
+
+# ircimages
+sub pick_from_ircimages($) {
+ my ($timeout) = @_;
+
+ $last_search = $ircimages_url; # for warnings
+
+ my $n = int(rand(2900));
+ my $search_url = $ircimages_url . "page-$n";
+
+ my ( $base, $body ) = get_document ($search_url, undef, $timeout);
+ return () unless $body;
+
+ my @candidates = ();
+
+ $body =~ s/\n/ /gs;
+ $body =~ s/(<A)\b/\n$1/gsi;
+
+ foreach (split (/\n/, $body)) {
+
+ my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
+ next unless $u;
+
+ if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string
+ elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token
+
+ next unless ($u =~ m/^https?:/i);
+ next if ($u =~ m@^https?://(searchirc\.com\|ircimages\.com)@i);
+ next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);
+
+ LOG ($verbose_http, " HREF: $u");
+ push @candidates, $u;
+ }
+
+ LOG ($verbose_filter, "" . $#candidates+1 . " links on $search_url");
+
+ return () if ($#candidates == -1);
+
+ my $i = int(rand($#candidates+1));
+ my $img = $candidates[$i];
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#candidates+1) .
+ ": $img");
+
+ $search_url = $img; # hmm...
+ return ($search_url, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Twitpic's list of recently-posted images.
+#
+############################################################################
+
+my $twitpic_img_url = "http://twitpic.com/public_timeline/feed.rss";
+
+# With most of our image sources, we get a random page and then select
+# from the images on it. However, in the case of Twitpic, the page
+# of images tends to update slowly; so we'll remember the last N entries
+# on it and randomly select from those, to get a wider variety each time.
+
+my $twitpic_cache_size = 1000;
+my @twitpic_cache = (); # fifo, for ordering by age
+my %twitpic_cache = (); # hash, for detecting dups
+
+# twitpic
+sub pick_from_twitpic_images($) {
+ my ($timeout) = @_;
+
+ $last_search = $twitpic_img_url; # for warnings
+
+ my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout);
+
+ # Update the cache.
+
+ if ($body) {
+ $body =~ s/\n/ /gs;
+ $body =~ s/(<item)\b/\n$1/gsi;
+
+ my @items = split (/\n/, $body);
+ shift @items;
+ foreach (@items) {
+ next unless (m@<link>([^<>]*)</link>@si);
+ my $page = html_unquote ($1);
+
+ $page =~ s@/$@@s;
+ $page .= '/full';
+
+ next if ($twitpic_cache{$page}); # already have it
+
+ LOG ($verbose_filter, " candidate: $page");
+ push @twitpic_cache, $page;
+ $twitpic_cache{$page} = $page;
+ }
+ }
+
+ # Pull from the cache.
+
+ return () if ($#twitpic_cache == -1);
+
+ my $n = $#twitpic_cache+1;
+ my $i = int(rand($n));
+ my $page = $twitpic_cache[$i];
+
+ # delete this one from @twitpic_cache and from %twitpic_cache.
+ #
+ @twitpic_cache = ( @twitpic_cache[0 .. $i-1],
+ @twitpic_cache[$i+1 .. $#twitpic_cache] );
+ delete $twitpic_cache{$page};
+
+ # Keep the size of the cache under the limit by nuking older entries
+ #
+ while ($#twitpic_cache >= $twitpic_cache_size) {
+ my $page = shift @twitpic_cache;
+ delete $twitpic_cache{$page};
+ }
+
+ ( $base, $body ) = get_document ($page, undef, $timeout);
+ my $img = undef;
+ $body = '' unless defined($body);
+
+ foreach (split (/<img\s+/, $body)) {
+ my ($src) = m/\bsrc=[\"\'](.*?)[\"\']/si;
+ next unless $src;
+ next if m@/js/@s;
+ next if m@/images/@s;
+
+ $img = $src;
+
+ $img = "http:$img" if ($img =~ m@^//@s); # Oh come on
+
+ # Sometimes these images are hosted on twitpic, sometimes on Amazon.
+ if ($img =~ m@^/@) {
+ $base =~ s@^(https?://[^/]+)/.*@$1@s;
+ $img = $base . $img;
+ }
+ last;
+ }
+
+ if (!$img) {
+ LOG ($verbose_load, "no matching images on $page\n");
+ return ();
+ }
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+ return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Twitter's list of recently-posted updates.
+#
+############################################################################
+
+# With most of our image sources, we get a random page and then select
+# from the images on it. However, in the case of Twitter, the page
+# of images only updates once a minute; so we'll remember the last N entries
+# on it and randomly select from those, to get a wider variety each time.
+
+my $twitter_img_url = "http://api.twitter.com/1/statuses/" .
+ "public_timeline.json" .
+ "?include_entities=true" .
+ "&include_rts=true" .
+ "&count=200";
+
+my $twitter_cache_size = 1000;
+
+my @twitter_cache = (); # fifo, for ordering by age
+my %twitter_cache = (); # hash, for detecting dups
+
+
+# twitter
+sub pick_from_twitter_images($) {
+ my ($timeout) = @_;
+
+ $last_search = $twitter_img_url; # for warnings
+
+ my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout);
+ # Update the cache.
+
+ if ($body) {
+ $body =~ s/[\r\n]+/ /gs;
+
+ # Parsing JSON is a pain in the ass. So we halfass it as usual.
+ $body =~ s/^\[|\]$//s;
+ $body =~ s/(\[.*?\])/{ $_ = $1; s@\},@\} @gs; $_; }/gsexi;
+ my @items = split (/\},\{/, $body);
+ foreach (@items) {
+ my ($name) = m@"screen_name":"([^\"]+)"@si;
+ my ($img) = m@"media_url":"([^\"]+)"@si;
+ my ($page) = m@"display_url":"([^\"]+)"@si;
+ next unless ($name && $img && $page);
+ foreach ($img, $page) {
+ s/\\//gs;
+ $_ = "http://$_" unless (m/^http/si);
+ }
+
+ next if ($twitter_cache{$page}); # already have it
+
+ LOG ($verbose_filter, " candidate: $page - $img");
+ push @twitter_cache, $page;
+ $twitter_cache{$page} = $img;
+ }
+ }
+
+ # Pull from the cache.
+
+ return () if ($#twitter_cache == -1);
+
+ my $n = $#twitter_cache+1;
+ my $i = int(rand($n));
+ my $page = $twitter_cache[$i];
+ my $url = $twitter_cache{$page};
+
+ # delete this one from @twitter_cache and from %twitter_cache.
+ #
+ @twitter_cache = ( @twitter_cache[0 .. $i-1],
+ @twitter_cache[$i+1 .. $#twitter_cache] );
+ delete $twitter_cache{$page};
+
+ # Keep the size of the cache under the limit by nuking older entries
+ #
+ while ($#twitter_cache >= $twitter_cache_size) {
+ my $page = shift @twitter_cache;
+ delete $twitter_cache{$page};
+ }
+
+ LOG ($verbose_load, "picked page $url");
+
+ $suppress_audit = 1;
+
+ return ($page, $url);
+}
+
+\f
+############################################################################
+#
+# Pick images from Flickr's page of recently-posted photos.
+#
+############################################################################
+
+my $flickr_img_url = "http://www.flickr.com/explore/";
+
+# Like LiveJournal, the Flickr page of images tends to update slowly,
+# so remember the last N entries on it and randomly select from those.
+
+# I know that Flickr has an API (http://www.flickr.com/services/api/)
+# but it was easy enough to scrape the HTML, so I didn't bother exploring.
+
+my $flickr_cache_size = 1000;
+my @flickr_cache = (); # fifo, for ordering by age
+my %flickr_cache = (); # hash, for detecting dups
+
+
+# flickr_recent
+sub pick_from_flickr_recent($) {
+ my ($timeout) = @_;
+
+ my $start = 16 * int(rand(100));
+
+ $last_search = $flickr_img_url; # for warnings
+ $last_search .= "?start=$start" if ($start > 0);
+
+ my ( $base, $body ) = get_document ($last_search, undef, $timeout);
+
+ # If the document comes back empty. just use the cache.
+ # return () unless $body;
+ $body = '' unless defined($body);
+
+ my $count = 0;
+ my $count2 = 0;
+
+ if ($body =~ m@{ *"_data": \[ ( .*? \} ) \]@six) {
+ $body = $1;
+ } else {
+ LOG ($verbose_load, "flickr unparsable: $last_search");
+ return ();
+ }
+
+ $body =~ s/[\r\n]/ /gs;
+ $body =~ s/(\},) *(\{)/$1\n$2/gs; # "_flickrModelRegistry"
+
+ foreach my $chunk (split (/\n/, $body)) {
+ my ($img) = ($chunk =~ m@"displayUrl": *"(.*?)"@six);
+ next unless defined ($img);
+ $img =~ s/\\//gs;
+ $img = "//" unless ($img =~ m@^/@s);
+ $img = "http:$img" unless ($img =~ m/^http/s);
+
+ my ($user) = ($chunk =~ m/"pathAlias": *"(.*?)"/si);
+ next unless defined ($user);
+
+ my ($id) = ($img =~ m@/\d+/(\d+)_([\da-f]+)_@si);
+ my ($page) = "https://www.flickr.com/photos/$user/$id/";
+
+ # $img =~ s/_[a-z](\.[a-z\d]+)$/$1/si; # take off "thumb" suffix
+
+ $count++;
+ next if ($flickr_cache{$img}); # already have it
+
+ my @pair = ($img, $page, $start);
+ LOG ($verbose_filter, " candidate: $img");
+ push @flickr_cache, \@pair;
+ $flickr_cache{$img} = \@pair;
+ $count2++;
+ }
+
+ return () if ($#flickr_cache == -1);
+
+ my $n = $#flickr_cache+1;
+ my $i = int(rand($n));
+ my ($img, $page) = @{$flickr_cache[$i]};
+
+ # delete this one from @flickr_cache and from %flickr_cache.
+ #
+ @flickr_cache = ( @flickr_cache[0 .. $i-1],
+ @flickr_cache[$i+1 .. $#flickr_cache] );
+ delete $flickr_cache{$img};
+
+ # Keep the size of the cache under the limit by nuking older entries
+ #
+ while ($#flickr_cache >= $flickr_cache_size) {
+ my $pairP = shift @flickr_cache;
+ my $img = $pairP->[0];
+ delete $flickr_cache{$img};
+ }
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+ return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from a random RSS feed on Flickr.
+#
+############################################################################
+
+my $flickr_rss_base = ("http://www.flickr.com/services/feeds/" .
+ "photos_public.gne" .
+ "?format=rss_200_enc&tagmode=any&tags=");
+
+# Picks a random RSS feed; picks a random image from that feed;
+# returns 2 URLs: the page containing the image, and the image.
+# Mostly by Joe Mcmahon <mcmahon@yahoo-inc.com>
+#
+# flickr_random
+sub pick_from_flickr_random($) {
+ my $timeout = shift;
+
+ my $words = random_words(',');
+ my $rss = $flickr_rss_base . $words;
+ $last_search = $rss;
+
+ $_ = $words;
+ s/,/ /g;
+
+ print STDERR "\n\n" if ($verbose_load);
+ LOG ($verbose_load, "words: $_");
+ LOG ($verbose_load, "URL: $last_search");
+
+ $suppress_audit = 1;
+
+ my ( $base, $body ) = get_document ($last_search, undef, $timeout);
+ if (!$base || !$body) {
+ $body = undef;
+ return;
+ }
+
+ my $img;
+ ($base, $img) = pick_image_from_rss ($base, $body);
+ $body = undef;
+ return () unless defined ($img);
+
+ LOG ($verbose_load, "redirected to: $base");
+ return ($base, $img);
+}
+
+\f
+############################################################################
+#
+# Pick random images from Instagram.
+#
+############################################################################
+
+my $instagram_url_base = "https://api.instagram.com/v1/media/popular";
+
+# instagram_random
+sub pick_from_instagram($) {
+ my $timeout = shift;
+
+ # Liberated access tokens.
+ # jsdo.it search for: instagram client_id
+ # Google search for: instagram "&client_id=" site:jsfiddle.net
+ my @tokens = (#'b59fbe4563944b6c88cced13495c0f49', # gramfeed.com
+ #'fa26679250df49c48a33fbcf30aae989', # instac.at
+ #'d9494686198d4dfeb954979a3e270e5e', # iconosquare.com
+ #'793ef48bb18e4197b61afce2d799b81c', # jsdo.it
+ #'67b8a3e0073449bba70600d0fc68e6cb', # jsdo.it
+ #'26a098e0df4d4b9ea8b4ce6c505b7742', # jsdo.it
+ #'2437cbcd906a4c10940f990d283d3cd5', # jsdo.it
+ #'191c7d7d5312464cbd92134f36ffdab5', # jsdo.it
+ #'acfec809437b4340b2c38f66503af774', # jsdo.it
+ #'e9f77604a3a24beba949c12d18130988', # jsdo.it
+ #'2cd7bcf68ae346529770073d311575b3', # jsdo.it
+ #'830c600fe8d742e2ab3f3b94f9bb22b7', # jsdo.it
+ #'55865a0397ad41e5997dd95ef4df8da1', # jsdo.it
+ #'192a5742f3644ea8bed1d25e439286a8', # jsdo.it
+ #'38ed1477e7a44595861b8842cdb8ba23', # jsdo.it
+ #'e52f79f645f54488ad0cc47f6f55ade6', # jsfiddle.net
+ );
+
+ my $tok = $tokens[int(rand($#tokens+1))];
+ $last_search = $instagram_url_base . "?client_id=" . $tok;
+
+ print STDERR "\n\n" if ($verbose_load);
+ LOG ($verbose_load, "URL: $last_search");
+
+ my ( $base, $body ) = get_document ($last_search, undef, $timeout);
+ if (!$base || !$body) {
+ $body = undef;
+ return;
+ }
+
+ $body =~ s/("link")/\001$1/gs;
+ my @chunks = split(/\001/, $body);
+ shift @chunks;
+ my @urls = ();
+ foreach (@chunks) {
+ s/\\//gs;
+ my ($url) = m/"link":\s*"(.*?)"/s;
+ my ($img) = m/"standard_resolution":\{"url":\s*"(.*?)"/s;
+ ($img) = m/"url":\s*"(.*?)"/s unless $url;
+ next unless ($url && $img);
+ push @urls, [ $url, $img ];
+ }
+
+ if ($#urls < 0) {
+ LOG ($verbose_load, "no images on $last_search");
+ return ();
+ }
+
+ my $i = int(rand($#urls+1));
+ my ($url, $img) = @{$urls[$i]};
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
+ return ($url, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Imgur.
+#
+############################################################################
+
+my $imgur_base = 'http://imgur.com/search?qs=thumb&q_any=';
+
+sub pick_from_imgur($) {
+ my $timeout = shift;
+
+ my $words = random_words('%20');
+ $last_search = $imgur_base . $words;
+
+ $_ = $words;
+ s/%20/ /g;
+
+ print STDERR "\n\n" if ($verbose_load);
+ LOG ($verbose_load, "words: $_");
+ LOG ($verbose_load, "URL: $last_search");
+
+ $suppress_audit = 1;
+
+ my ( $base, $body ) = get_document ($last_search, undef, $timeout);
+ if (!$base || !$body) {
+ $body = undef;
+ return;
+ }
+
+ my @imgs = ($body =~ m@\bHREF=[\"\']([^\'\"<>]*/gallery/[^\'\"<>]+)@gsi);
+ return () unless @imgs;
+
+ my $n = @imgs;
+ my $i = int(rand($n));
+ my $page = $imgs[$i];
+ $page =~ s/[?&].*$//s;
+ $page = "http://imgur.com$page" if ($page =~ m@^/@s);
+
+ my ($id) = ($page =~ m@([^/?&]+)$@s);
+ my $img = "http://i.imgur.com/$id.jpg";
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+ return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Tumblr.
+#
+############################################################################
+
+my $tumblr_base = 'https://www.tumblr.com/search/';
+
+sub pick_from_tumblr($) {
+ my $timeout = shift;
+
+ # Tumblr doesn't have an "or" search, which means our vocabulary is
+ # a bit too extensive to work well...
+
+ my $words = random_word();
+ $last_search = $tumblr_base . $words;
+
+ print STDERR "\n\n" if ($verbose_load);
+ LOG ($verbose_load, "words: $words");
+ LOG ($verbose_load, "URL: $last_search");
+
+ $suppress_audit = 1;