+
+\f
+############################################################################
+#
+# Pick images by feeding random numbers into Bing Image Search.
+#
+############################################################################
+
+# bingnums
+#
+# Searches Bing Images for a random integer below 9999.  Roughly 30% of
+# the time the number is zero-padded to four digits before searching.
+# $timeout is passed through to pick_from_bing_images(), whose result is
+# returned unchanged.
+#
+sub pick_from_bing_image_numbers($) {
+  my ($timeout) = @_;
+
+  my $limit = 9999;
+  my $query = int(rand($limit));
+
+  if (rand() < 0.3) {
+    $query = sprintf("%04d", $query);
+  }
+
+  return pick_from_bing_images ($timeout, "$query");
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random photo file names into Bing Image Search.
+#
+############################################################################
+
+# bingphotos
+#
+# Chooses one generator at random from @photomakers, runs it, and searches
+# Bing Images for the string it produced.  $timeout is passed through to
+# pick_from_bing_images(), whose result is returned unchanged.
+#
+sub pick_from_bing_image_photos($) {
+  my ($timeout) = @_;
+
+  my $maker = $photomakers[int(rand($#photomakers + 1))];
+
+  # Deliberately the old "&$code;" call form: the generator is invoked
+  # with our own @_ rather than with an empty argument list.
+  my $query = &$maker;
+
+  return pick_from_bing_images ($timeout, $query);
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Text Search
+#
+############################################################################
+
+
+# Base URL of an Alta Vista advanced text search; the search words are
+# appended directly after the trailing "aqo=".
+my $alta_vista_url = "http://www.altavista.com/web/results?" .
+  join ('&',
+        'pg=aq',
+        'aqmode=s',
+        'filetype=html',
+        'sc=on',             # "site collapse"
+        'nbq=50',
+        'aqo=');
+
+# avtext
+#
+# Runs an Alta Vista text search for some random words, on a randomly
+# chosen result page (1 through 9), filters the links that come back, and
+# hands the survivors to pick_image_from_pages().  Returns whatever
+# pick_image_from_pages() returns.
+#
+sub pick_from_alta_vista_text($) {
+  my ($timeout) = @_;
+
+  my $words = random_words('%20');
+  my $page  = 1 + int(rand(9));
+
+  my $search_url = $alta_vista_url . $words;
+  $search_url .= "&pgno=" . $page . "&stq=" . (($page-1) * 10)
+    if ($page > 1);
+
+  my ($search_hit_count, @subpages) =
+    pick_from_search_engine ($timeout, $search_url, $words);
+
+  my @candidates = ();
+  foreach my $link (@subpages) {
+
+    # Alta Vista plays redirection games: they filter your clicks through
+    # their own site, but use onMouseOver to make it look like they
+    # don't.  That at least makes real search results easy to spot: only
+    # links carrying an encoded "**http%3a..." target are kept, and the
+    # link is reduced to that target.
+    #
+    $link =~ s/^.*\*\*(http%3a.*$)/$1/gsi  or next;
+    $link = url_unquote($link);
+
+    next if ($link !~ m@^https?://@i);            # non-HTTP or relative URL
+    next if ($link =~ m@[/.]altavista\.com\b@i);  # altavista builtins
+    next if ($link =~ m@[/.]yahoo\.com\b@i);      # yahoo and av in cahoots?
+
+    LOG ($verbose_filter, " candidate: $link");
+    push @candidates, $link;
+  }
+
+  return pick_image_from_pages ($search_url, $search_hit_count,
+                                scalar(@subpages),
+                                $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Hotbot
+#
+############################################################################
+
+# Base URL of a Hotbot text search; the search word is appended directly
+# after the trailing "query=".
+my $hotbot_search_url = "http://hotbot.lycos.com/default.asp?" .
+  join ('&',
+        'ca=w',
+        'descriptiontype=0',
+        'imagetoggle=1',
+        'matchmode=any',
+        'nummod=2',
+        'recordcount=50',
+        'sitegroup=1',
+        'stem=1',
+        'cobrand=undefined',
+        'query=');
+
+# Runs a Hotbot search for one random word, starting at a random result
+# offset (1, 11, ... 71), filters the links that come back, and hands the
+# survivors to pick_image_from_pages().  Returns whatever
+# pick_image_from_pages() returns.
+#
+sub pick_from_hotbot_text($) {
+  my ($timeout) = @_;
+
+  $last_search = $hotbot_search_url;   # for warnings
+
+  # Only one word: lycos seems to always give us back dictionaries and
+  # word lists if we search for more than one.
+  #
+  my $words = random_word();
+
+  my $start      = 1 + 10 * int(rand(8));
+  my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";
+
+  my ($search_hit_count, @subpages) =
+    pick_from_search_engine ($timeout, $search_url, $words);
+
+  my @candidates = ();
+  foreach my $link (@subpages) {
+
+    # Hotbot plays redirection games too
+    # (not any more?)
+#    next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
+#    $u = url_decode($1);
+
+    next if ($link !~ m@^https?://@i);          # non-HTTP or relative URL
+    next if ($link =~ m@[/.]hotbot\.com\b@i);   # hotbot builtins
+    next if ($link =~ m@[/.]lycos\.com\b@i);    # hotbot builtins
+    next if ($link =~ m@[/.]inktomi\.com\b@i);  # hotbot builtins
+
+    LOG ($verbose_filter, " candidate: $link");
+    push @candidates, $link;
+  }
+
+  return pick_image_from_pages ($search_url, $search_hit_count,
+                                scalar(@subpages),
+                                $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Lycos
+#
+############################################################################
+
+# Base URL of a Lycos web search; the search word is appended directly
+# after the trailing "query=".
+my $lycos_search_url = "http://search.lycos.com/default.asp?" .
+  join ('&',
+        'lpv=1',
+        'loc=searchhp',
+        'tab=web',
+        'query=');
+
+# Runs a Lycos search for one random word, starting at a random result
+# offset (1, 11, ... 71), filters the links that come back, and hands the
+# survivors to pick_image_from_pages().  Returns whatever
+# pick_image_from_pages() returns.
+#
+sub pick_from_lycos_text($) {
+  my ($timeout) = @_;
+
+  $last_search = $lycos_search_url;   # for warnings
+
+  # Only one word: lycos seems to always give us back dictionaries and
+  # word lists if we search for more than one.
+  #
+  my $words = random_word();
+
+  my $start      = 1 + 10 * int(rand(8));
+  my $search_url = $lycos_search_url . $words . "&first=$start&page=more";
+
+  my ($search_hit_count, @subpages) =
+    pick_from_search_engine ($timeout, $search_url, $words);
+
+  my @candidates = ();
+  foreach my $link (@subpages) {
+
+    # Lycos plays redirection games.
+    # (not any more?)
+#    next unless ($u =~ m@^https?://click.lycos.com/director.asp
+#                         .*
+#                         \btarget=([^&]+)
+#                         .*
+#                        @x);
+#    $u = url_decode($1);
+
+    next if ($link !~ m@^https?://@i);             # non-HTTP or relative URL
+    next if ($link =~ m@[/.]hotbot\.com\b@i);      # lycos builtins
+    next if ($link =~ m@[/.]lycos\.com\b@i);       # lycos builtins
+    next if ($link =~ m@[/.]terralycos\.com\b@i);  # lycos builtins
+    next if ($link =~ m@[/.]inktomi\.com\b@i);     # lycos builtins
+
+    LOG ($verbose_filter, " candidate: $link");
+    push @candidates, $link;
+  }
+
+  return pick_image_from_pages ($search_url, $search_hit_count,
+                                scalar(@subpages),
+                                $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into news.yahoo.com
+#
+############################################################################
+
+# Base URL of a Yahoo news-photo search; the search word is appended
+# directly after the trailing "p=".
+my $yahoo_news_url = "http://news.search.yahoo.com/search/news?" .
+  join ('&',
+        'c=news_photos',
+        'p=');
+
+# yahoonews
+#
+# Runs a Yahoo news-photo search for one random word, keeps only the
+# result links that point at Yahoo's own news story pages, and hands them
+# to pick_image_from_pages().  Returns whatever pick_image_from_pages()
+# returns.
+#
+sub pick_from_yahoo_news_text($) {
+  my ($timeout) = @_;
+
+  $last_search = $yahoo_news_url;   # for warnings
+
+  my $words      = random_word();
+  my $search_url = $yahoo_news_url . $words;
+
+  my ($search_hit_count, @subpages) =
+    pick_from_search_engine ($timeout, $search_url, $words);
+
+  my @candidates = ();
+  foreach my $link (@subpages) {
+
+    # Undo the rds.yahoo.com redirector, leaving the target URL.
+    $link =~ s@^https?://rds\.yahoo\.com/.*-http%3A@http:@s;
+
+    # Accept nothing but URLs on Yahoo's news site...
+    next if ($link !~ m@^https?://dailynews\.yahoo\.com/@i &&
+             $link !~ m@^https?://story\.news\.yahoo\.com/@i);
+
+    # ...and of those, only the ones carrying a "&u=/" parameter.
+    next if ($link !~ m@&u=/@);
+
+    LOG ($verbose_filter, " candidate: $link");
+    push @candidates, $link;
+  }
+
+  return pick_image_from_pages ($search_url, $search_hit_count,
+                                scalar(@subpages),
+                                $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images from LiveJournal's list of recently-posted images.
+#
+############################################################################
+
+my $livejournal_img_url = 'http://www.livejournal.com/stats/latest-img.bml';
+
+# Most image sources work by fetching a random page and choosing among
+# the images on it.  LiveJournal's page of recent images changes slowly,
+# though, so instead the last N entries seen on it are remembered and the
+# pick is made from those, which gives more variety from run to run.
+
+my $lj_cache_size = 1000;
+my @lj_cache;           # fifo, for ordering by age
+my %lj_cache;           # hash, for detecting dups
+
+# livejournal
+#
+# Loads LiveJournal's "latest images" document, adds any entries not seen
+# before to the cache (@lj_cache / %lj_cache), then removes one cached
+# entry at random and returns it.
+#
+# Returns ($page, $img), or the empty list when the cache is empty.
+#
+sub pick_from_livejournal_images($) {
+  my ($timeout) = @_;
+
+  $last_search = $livejournal_img_url;   # for warnings
+
+  my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);
+
+  # The document often comes back empty.  That is not an error: we simply
+  # fall through and serve from the cache.
+  $body = '' unless defined($body);
+
+  # Re-flow the document so each <recent-image> element starts a line.
+  $body =~ s/\n/ /gs;
+  $body =~ s/(<recent-image)\b/\n$1/gsi;
+
+  foreach my $entry (split (/\n/, $body)) {
+    next if ($entry !~ m/^<recent-image\b/);
+
+    next if ($entry !~ m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
+    my $img = html_unquote ($1);
+
+    next if ($lj_cache{$img});   # already have it
+
+    next if ($entry !~ m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
+    my $page = html_unquote ($1);
+
+    my $pair = [ $img, $page ];
+    LOG ($verbose_filter, " candidate: $img");
+    push @lj_cache, $pair;
+    $lj_cache{$img} = $pair;
+  }
+
+  my $n = scalar(@lj_cache);
+  return () if ($n == 0);
+
+  my $i = int(rand($n));
+  my ($img, $page) = @{$lj_cache[$i]};
+
+  # Remove the chosen entry from both the fifo and the dup-detection hash.
+  #
+  splice (@lj_cache, $i, 1);
+  delete $lj_cache{$img};
+
+  # Enforce the size limit by dropping the oldest entries first.
+  #
+  while (scalar(@lj_cache) > $lj_cache_size) {
+    my $oldest = shift @lj_cache;
+    delete $lj_cache{$oldest->[0]};
+  }
+
+  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+  return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from ircimages.com (images that have been in the /topic of
+# various IRC channels.)
+#
+############################################################################
+
+# Site root; pick_from_ircimages() appends "page-N" to this.
+my $ircimages_url = 'http://ircimages.com/';
+
+# ircimages
+#
+# Fetches a randomly-numbered page ("page-N", N below 2900) from
+# ircimages.com, collects the links on it that look like image files,
+# and picks one of them at random.
+#
+# Returns ($url, $img), where both values are the chosen image URL, or
+# the empty list if the page could not be loaded or had no usable links.
+#
+sub pick_from_ircimages($) {
+  my ($timeout) = @_;
+
+  $last_search = $ircimages_url;   # for warnings
+
+  my $n = int(rand(2900));
+  my $search_url = $ircimages_url . "page-$n";
+
+  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
+  return () unless $body;
+
+  my @candidates = ();
+
+  # Re-flow the document so each <A> tag starts a line.
+  $body =~ s/\n/ /gs;
+  $body =~ s/(<A)\b/\n$1/gsi;
+
+  foreach (split (/\n/, $body)) {
+
+    my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
+    next unless $u;
+
+    if    ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
+    elsif ($u =~ m/^([^\s]*)\s/)   { $u = $1; }   # or token
+
+    next unless ($u =~ m/^https?:/i);
+
+    # Skip links back to the index sites themselves.  The alternation has
+    # to be a bare "|": written as "\|" it only matches a literal pipe
+    # character, which meant this filter could never fire.
+    next if ($u =~ m@^https?://(?:searchirc\.com|ircimages\.com)@i);
+
+    next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);
+
+    LOG ($verbose_http, " HREF: $u");
+    push @candidates, $u;
+  }
+
+  LOG ($verbose_filter, scalar(@candidates) . " links on $search_url");
+
+  return () if ($#candidates == -1);
+
+  my $i = int(rand($#candidates+1));
+  my $img = $candidates[$i];
+
+  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#candidates+1) .
+       ": $img");
+
+  # There is no separate referring page for these, so the image URL is
+  # reported as its own page.
+  $search_url = $img;
+  return ($search_url, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Twitpic's list of recently-posted images.
+#
+############################################################################
+
+my $twitpic_img_url = 'http://twitpic.com/public_timeline/feed.rss';
+
+# Most image sources work by fetching a random page and choosing among
+# the images on it.  Twitpic's feed of recent images changes slowly,
+# though, so instead the last N entries seen on it are remembered and the
+# pick is made from those, which gives more variety from run to run.
+
+my $twitpic_cache_size = 1000;
+my @twitpic_cache;      # fifo, for ordering by age
+my %twitpic_cache;      # hash, for detecting dups
+
+# twitpic
+#
+# Loads Twitpic's public-timeline RSS feed, adds any item pages not seen
+# before to the cache (@twitpic_cache / %twitpic_cache), removes one
+# cached page at random, fetches it, and returns the first suitable
+# <img> found on it.
+#
+# Returns ($page, $img), or the empty list when the cache is empty or the
+# chosen page has no matching image.
+#
+sub pick_from_twitpic_images($) {
+  my ($timeout) = @_;
+
+  $last_search = $twitpic_img_url;   # for warnings
+
+  my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout);
+
+  # Update the cache.
+
+  if ($body) {
+    # Re-flow the feed so each <item> starts a line.
+    $body =~ s/\n/ /gs;
+    $body =~ s/(<item)\b/\n$1/gsi;
+
+    my @items = split (/\n/, $body);
+    shift @items;       # everything before the first <item>
+
+    foreach my $item (@items) {
+      next if ($item !~ m@<link>([^<>]*)</link>@si);
+      my $link = html_unquote ($1);
+
+      # Point at the "/full" variant of the item's page.
+      $link =~ s@/$@@s;
+      $link .= '/full';
+
+      next if ($twitpic_cache{$link});   # already have it
+
+      LOG ($verbose_filter, " candidate: $link");
+      push @twitpic_cache, $link;
+      $twitpic_cache{$link} = $link;
+    }
+  }
+
+  # Pull from the cache.
+
+  my $n = scalar(@twitpic_cache);
+  return () if ($n == 0);
+
+  my $i    = int(rand($n));
+  my $page = $twitpic_cache[$i];
+
+  # Remove the chosen entry from both the fifo and the dup-detection hash.
+  #
+  splice (@twitpic_cache, $i, 1);
+  delete $twitpic_cache{$page};
+
+  # Enforce the size limit by dropping the oldest entries first.
+  #
+  while (scalar(@twitpic_cache) > $twitpic_cache_size) {
+    my $oldest = shift @twitpic_cache;
+    delete $twitpic_cache{$oldest};
+  }
+
+  ( $base, $body ) = get_document ($page, undef, $timeout);
+  $body = '' unless defined($body);
+
+  my $img = undef;
+  foreach my $tag (split (/<img\s+/, $body)) {
+    my ($src) = ($tag =~ m/\bsrc=[\"\'](.*?)[\"\']/si);
+    next unless $src;
+    next if ($tag =~ m@/js/@s);
+    next if ($tag =~ m@/images/@s);
+
+    $img = $src;
+
+    # Protocol-relative URL: supply the scheme.
+    $img = "http:$img" if ($img =~ m@^//@s);
+
+    # Sometimes these images are hosted on twitpic, sometimes on Amazon.
+    # A site-relative path is resolved against the host of the page.
+    if ($img =~ m@^/@) {
+      $base =~ s@^(https?://[^/]+)/.*@$1@s;
+      $img = $base . $img;
+    }
+    last;
+  }
+
+  unless ($img) {
+    LOG ($verbose_load, "no matching images on $page\n");
+    return ();
+  }
+
+  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
+
+  return ($page, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images from Twitter's list of recently-posted updates.
+#
+############################################################################
+
+# Most image sources work by fetching a random page and choosing among
+# the images on it.  Twitter's public timeline only updates once a
+# minute, though, so instead the last N entries seen on it are remembered
+# and the pick is made from those, which gives more variety each time.
+
+my $twitter_img_url = "http://api.twitter.com/1/statuses/public_timeline.json?" .
+  join ('&',
+        'include_entities=true',
+        'include_rts=true',
+        'count=200');
+
+my $twitter_cache_size = 1000;
+
+my @twitter_cache;      # fifo, for ordering by age
+my %twitter_cache;      # hash, for detecting dups
+
+
+# twitter
+#
+# Loads Twitter's public timeline (JSON), adds any status with attached
+# media that has not been seen before to the cache (@twitter_cache holds
+# the page URLs in arrival order, %twitter_cache maps page URL to image
+# URL), then removes one cached entry at random and returns it.
+#
+# Sets $suppress_audit to 1 before returning a pick.
+#
+# Returns ($page, $url), where $url is the image URL, or the empty list
+# when the cache is empty.
+#
+sub pick_from_twitter_images($) {
+  my ($timeout) = @_;
+
+  $last_search = $twitter_img_url;   # for warnings
+
+  my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout);
+  # Update the cache.
+
+  if ($body) {
+    $body =~ s/[\r\n]+/ /gs;
+
+    # Parsing JSON is a pain in the ass.  So we halfass it as usual.
+    #
+    # Strip the outer array brackets.  This needs /g: without it the
+    # substitution stops after the leading "[" and the trailing "]" is
+    # never removed.
+    $body =~ s/^\[|\]$//gs;
+
+    # Inside every nested [...] array, turn "}," into "} " so that the
+    # split below only breaks between top-level objects.  Note that the
+    # replacement code assigns to the global $_.
+    $body =~ s/(\[.*?\])/{ $_ = $1; s@\},@\} @gs; $_; }/gsexi;
+
+    my @items = split (/\},\{/, $body);
+    foreach (@items) {
+      my ($name) = m@"screen_name":"([^\"]+)"@si;
+      my ($img)  = m@"media_url":"([^\"]+)"@si;
+      my ($page) = m@"display_url":"([^\"]+)"@si;
+      next unless ($name && $img && $page);
+
+      # Drop the JSON backslash escapes, and supply a scheme if the URL
+      # came without one.
+      foreach ($img, $page) {
+        s/\\//gs;
+        $_ = "http://$_" unless (m/^http/si);
+      }
+
+      next if ($twitter_cache{$page});   # already have it
+
+      LOG ($verbose_filter, " candidate: $page - $img");
+      push @twitter_cache, $page;
+      $twitter_cache{$page} = $img;
+    }
+  }
+
+  # Pull from the cache.
+
+  return () if ($#twitter_cache == -1);
+
+  my $n = $#twitter_cache+1;
+  my $i = int(rand($n));
+  my $page = $twitter_cache[$i];
+  my $url  = $twitter_cache{$page};
+
+  # delete this one from @twitter_cache and from %twitter_cache.
+  #
+  @twitter_cache = ( @twitter_cache[0 .. $i-1],
+                     @twitter_cache[$i+1 .. $#twitter_cache] );
+  delete $twitter_cache{$page};
+
+  # Keep the size of the cache under the limit by nuking older entries
+  #
+  while ($#twitter_cache >= $twitter_cache_size) {
+    my $page = shift @twitter_cache;
+    delete $twitter_cache{$page};
+  }
+
+  LOG ($verbose_load, "picked page $url");
+
+  $suppress_audit = 1;
+
+  return ($page, $url);
+}
+