+ next unless ($word);
+ next if ($word =~ m/^[-\']/);
+
+ $word = lc($word);
+ $word =~ s/^.*-//s;
+ $word =~ s/^[^a-z]+//s;
+ $word =~ s/[^a-z]+$//s;
+ $word =~ s/\'s$//s;
+ $word =~ s/ys$/y/s;
+ $word =~ s/ally$//s;
+ $word =~ s/ly$//s;
+ $word =~ s/ies$/y/s;
+ $word =~ s/ally$/al/s;
+ $word =~ s/izes$/ize/s;
+ $word =~ s/esses$/ess/s;
+ $word =~ s/(.{5})ing$/$1/s;
+
+ next if (length ($word) > 14);
+ last if ($word);
+ }
+
+ close ($in);
+
+ if ( $word =~ s/\s/\+/gs ) { # convert intra-word spaces to "+".
+ $word = "\%22$word\%22"; # And put quotes (%22) around it.
+ }
+
+ return $word;
+}
+
+
+# Returns five words, each obtained from a separate call to random_word(),
+# joined into one string with the caller-supplied separator between them.
+#
+sub random_words($) {
+  my ($sep) = @_;
+  # scalar() keeps each call in scalar context, as string concatenation would.
+  my @picked = map { scalar (random_word()) } (1 .. 5);
+  return join ($sep, @picked);
+}
+
+
+# Percent-encodes a string for inclusion in a URL.
+# Every character other than ASCII letters, digits, "-", ".", "@", "/",
+# "_", CR and LF is replaced by "%" followed by its character code as
+# (at least) two upper-case hexadecimal digits.  Returns the encoded copy;
+# the argument itself is not modified.
+#
+sub url_quote($) {
+  my $text = $_[0];
+  $text =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
+  return $text;
+}
+
+# Reverses URL (form) encoding: every "+" becomes a space, and every "%"
+# followed by exactly two hexadecimal digits (either case) becomes the
+# character with that code.  Returns the decoded copy.
+#
+# A "%" that is not followed by two genuine hex digits is left untouched.
+# (Matching any two alphanumerics would hand non-hex text such as "%zz" to
+# hex(), which yields 0 and silently turns the text into a NUL byte.)
+#
+sub url_unquote($) {
+  my ($s) = @_;
+  $s =~ s/[+]/ /g;
+  $s =~ s/%([0-9a-f]{2})/chr(hex($1))/ige;
+  return $s;
+}
+
+# Escapes the four characters that are special in HTML text and attribute
+# values -- ampersand, less-than, greater-than and double quote -- by
+# replacing each with its named character reference (amp, lt, gt, quot).
+# Returns the escaped copy; the argument itself is not modified.
+#
+# The reference is assembled as "&" . name . ";" from a lookup table, and all
+# four characters are handled in a single pass, so an ampersand produced by
+# one replacement can never be escaped a second time.
+#
+sub html_quote($) {
+  my ($s) = @_;
+  my %name_for = ( '&' => 'amp',
+                   '<' => 'lt',
+                   '>' => 'gt',
+                   '"' => 'quot' );
+  $s =~ s/([&<>\"])/&$name_for{$1};/g;
+  return $s;
+}
+
+# Decodes HTML character references in a string and returns the decoded copy.
+# Three forms are recognized, each starting with an ampersand and ending with
+# a semicolon:
+#   - a name made of letters: replaced by its %entity_table value; a name
+#     that has no (true) entry in the table is left exactly as written;
+#   - "#" plus decimal digits: replaced by the character with that code;
+#   - "#x" plus one to six hex digits: replaced by the character with that
+#     code.
+#
+# Everything is decoded in ONE pass.  Decoding names first and numbers in a
+# second pass would decode twice: an escaped ampersand followed by "#39;"
+# would first become a numeric reference and then an apostrophe.
+# NOTE(review): that scenario assumes %entity_table maps "amp" to an
+# ampersand; the table is defined outside this block.
+#
+sub html_unquote($) {
+  my ($s) = @_;
+  $s =~ s{ ( & (?: \# (\d+)
+               |   \# x ([0-9a-f]{1,6})
+               |   ([a-z]+)
+               ) ; )
+         }{
+           defined ($2) ? chr ($2)
+         : defined ($3) ? chr (hex ($3))
+         :                ($entity_table{$4} || $1)
+         }gexi;
+  return $s;
+}
+
+
+# Loads the given URL (a search on some search engine) and returns a list:
+#  - first, the total number of hits the search engine claimed it had
+#    (a string with "," thousands separators, or "?" if no hit count was
+#    recognized on the page);
+#  - then, every link target found on the page that the search engine
+#    returned.  Note that this list contains all kinds of internal search
+#    engine junk URLs too -- caller must prune them.
+# Returns the empty list if the document could not be loaded, if loading
+# used up the whole timeout, or if no links were found.
+#
+# Arguments:
+#   $timeout     seconds allowed for the load, or undef for no limit check;
+#   $search_url  the URL to load;
+#   $words       the search terms; used here only for logging.
+#
+# Side effects: sets $last_search; sets $suppress_audit on a late timeout.
+# Only lexical variables are used for scratch text: the global $_ is never
+# assigned to, so a caller that is iterating with $_ is not disturbed.
+#
+sub pick_from_search_engine($$$) {
+  my ( $timeout, $search_url, $words ) = @_;
+
+  # Human-readable copy of the search terms, for the log only.
+  my $shown_words = $words;
+  $shown_words =~ s/%20/ /g;
+
+  print STDERR "\n\n" if ($verbose_load);
+
+  LOG ($verbose_load, "words: $shown_words");
+  LOG ($verbose_load, "URL: $search_url");
+
+  $last_search = $search_url;   # for warnings
+
+  my $start = time;
+  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
+  if (defined ($timeout)) {
+    # Even if the load succeeded, discard it when it took too long.
+    $timeout -= (time - $start);
+    if ($timeout <= 0) {
+      $body = undef;
+      LOG (($verbose_net || $verbose_load),
+           "timed out (late) for $search_url");
+      $suppress_audit = 1;
+      return ();
+    }
+  }
+
+  return () if (! $body);
+
+
+  my @subpages;
+
+  if ($body =~ m/^\{\"/s) {   # Google AJAX JSON response.
+
+    # Rewrite each JSON result as an HTML anchor, so that the generic
+    # link scanner below can handle it like any other result page.
+    my @chunks = split (/"GsearchResultClass"/, $body);
+    shift @chunks;            # text before the first result
+    my $body2 = '';
+    my $n = 1;
+    foreach my $chunk (@chunks) {
+      my ($img) = ($chunk =~ m/"unescapedUrl":"(.*?)"/si);
+      my ($url) = ($chunk =~ m/"originalContextUrl":"(.*?)"/si);
+      next unless ($img && $url);
+      $url = ("/imgres" .
+              "?imgurl="    . url_quote($img) .
+              "&imgrefurl=" . url_quote($url) .
+              "&...");
+      $body2 .= "<A HREF=\"" . html_quote($url) . "\">$n</A>\n";
+      $n++;
+    }
+    $body = $body2 if $body2;
+  }
+
+  # Try each known phrasing of "N results found" in turn.
+  my $search_count = "?";
+  if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
+    $search_count = $3;
+  } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*) @i) {
+    $search_count = $1;
+  } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
+    $search_count = $1;
+  } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
+    $search_count = $1;
+  } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
+    $search_count = $1;
+  } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
+    $search_count = $1;
+  } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
+    $search_count = $1;
+  } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
+    $search_count = $1;
+  } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
+    $search_count = $1; # lycos
+  } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
+    $search_count = $1; # hotbot
+  } elsif ($body =~ m@no photos were found containing@i) { # avimages
+    $search_count = "0";
+  } elsif ($body =~ m@found no document matching@i) { # avtext
+    $search_count = "0";
+  }
+  # Insert thousands separators into a bare run of digits.
+  1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);
+
+#  if ($search_count eq "?" || $search_count eq "0") {
+#    my $file = "/tmp/wc.html";
+#    open (my $out, '>', $file) || error ("writing $file: $!");
+#    print $out $body;
+#    close $out;
+#    print STDERR  blurb() . "###### wrote $file\n";
+#  }
+
+
+  my $length = length($body);
+  my $href_count = 0;
+
+  # Collapse all whitespace (so no newlines remain), then start a new line
+  # in front of every "<A " so that each line holds at most one anchor.
+  my $flat = $body;
+  $flat =~ s/[\r\n\t ]+/ /g;
+  $flat =~ s/(<A )/\n$1/gi;
+
+  foreach my $line (split (/\n/, $flat)) {
+    $href_count++;
+    my ($u) = ($line =~ m@<A\s.*\bHREF\s*=\s*([^>]+)>@i);
+    next unless $u;
+
+    # The braces are escaped: an unescaped literal "{" in a pattern has been
+    # deprecated since Perl 5.22.
+    if ($line =~ m/\bm="\{(.*?)\}"/s) {   # Bing info is inside JSON crud
+      my $json = html_unquote($1);
+      my ($href) = ($json =~ m/\bsurl:"(.*?)"/s);
+      my ($img)  = ($json =~ m/\bimgurl:"(.*?)"/s);
+      $u = "$img\t$href" if ($img && $href);
+
+    } elsif ($u =~ m/^\"([^\"]*)\"/) {    # quoted string
+      $u = $1;
+    } elsif ($u =~ m/^([^\s]*)\s/) {      # or token
+      $u = $1;
+    }
+
+    if ( $rejected_urls{$u} ) {
+      LOG ($verbose_filter, " pre-rejecting candidate: $u");
+      next;
+    }
+
+    LOG ($verbose_http, " HREF: $u");
+
+    push @subpages, $u;
+  }
+
+  unless (@subpages) {
+    LOG ($verbose_filter,
+         "found nothing on $base ($length bytes, $href_count links).");
+    return ();
+  }
+
+  LOG ($verbose_filter, scalar (@subpages) . " links on $search_url");
+
+  return ($search_count, @subpages);
+}
+
+
+# Filters a list of URLs, returning (in the original order) only those that
+# start with "http://" and whose host is not listed in %poisoners.
+# A host counts as poisoned if the whole host name, its last three
+# dot-separated labels, or its last two labels is a key of %poisoners with
+# a true value.  URLs from which no host can be extracted are dropped.
+#
+sub depoison(@) {
+  my (@urls) = @_;
+  my @clean = ();
+
+ URL:
+  foreach my $url (@urls) {
+    my ($host) = ($url =~ m@^http://([^/: \t\r\n]+)@i);
+    next URL unless defined($host);
+
+    # Keys to look up, most specific first.
+    my @keys = ($host);
+    push @keys, $1 if ($host =~ m@([^.]+\.[^.]+\.[^.]+)$@);
+    push @keys, $1 if ($host =~ m@([^.]+\.[^.]+)$@);
+
+    foreach my $key (@keys) {
+      next unless ($poisoners{$key});
+      LOG (($verbose_filter), " rejecting poisoner: $url");
+      next URL;
+    }
+
+    push @clean, $url;
+  }
+
+  return @clean;
+}
+
+
+# Given a list of page URLs: drops the ones rejected by depoison(), picks
+# one of the survivors at random, loads it with get_document(), and asks
+# pick_image_from_body() for an image on it.
+#
+# Returns two URLs -- the page that was loaded, and the image chosen -- or
+# the empty list if no candidate survived, the page could not be loaded, or
+# no image was picked.
+#
+# $base is passed through to get_document() along with the page URL;
+# $total_hit_count and $unfiltered_link_count are used only in the log line.
+# Sets $suppress_audit before loading the page.
+#
+sub pick_image_from_pages($$$$@) {
+  my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;
+
+  $total_hit_count = "?" unless defined($total_hit_count);
+
+  @pages = depoison (@pages);
+  my $candidates = scalar (@pages);
+  LOG ($verbose_load,
+       "$candidates candidates of $unfiltered_link_count links" .
+       " ($total_hit_count total)");
+
+  return () unless ($candidates > 0);
+
+  my $page = $pages[int (rand ($candidates))];
+
+  LOG ($verbose_load, "picked page $page");
+
+  $suppress_audit = 1;
+
+  my ( $base2, $body2 ) = get_document ($page, $base, $timeout);
+  return () unless ($base2 && $body2);
+
+  my $img = pick_image_from_body ($base2, $body2);
+  return () unless ($img);
+
+  return ($base2, $img);
+}
+
+\f
+#############################################################################
+##
+## Pick images from random pages returned by the Yahoo Random Link
+##
+#############################################################################
+#
+## yahoorand
+#my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";
+#
+#
+# Picks a random page; picks a random image on that page;
+# returns two URLs: the page containing the image, and the image.
+# Returns () if nothing found this time.
+#
+#sub pick_from_yahoo_random_link($) {
+# my ($timeout) = @_;
+#
+# print STDERR "\n\n" if ($verbose_load);
+# LOG ($verbose_load, "URL: $yahoo_random_link");
+#
+# $last_search = $yahoo_random_link; # for warnings
+#
+# $suppress_audit = 1;
+#
+# my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
+# if (!$base || !$body) {
+# $body = undef;
+# return;
+# }
+#
+# LOG ($verbose_load, "redirected to: $base");
+#
+# my $img = pick_image_from_body ($base, $body);
+# $body = undef;
+#
+# if ($img) {
+# return ($base, $img);
+# } else {
+# return ();
+# }
+#}
+
+\f
+############################################################################
+#
+# Pick images from random pages returned by the Alta Vista Random Link
+# Note: this seems to have gotten a *lot* less random lately (2007).
+#
+############################################################################
+
+# altavista
+my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";
+
+
+# Loads the Alta Vista random-link URL, then asks pick_image_from_body()
+# for an image on whatever page came back.
+# Returns two URLs: the page containing the image, and the image.
+# Returns () if the load failed or no image was picked this time.
+# Sets $last_search and $suppress_audit.
+#
+sub pick_from_alta_vista_random_link($) {
+  my ($timeout) = @_;
+
+  print STDERR "\n\n" if ($verbose_load);
+  LOG ($verbose_load, "URL: $alta_vista_random_link");
+
+  $last_search = $alta_vista_random_link;   # for warnings
+  $suppress_audit = 1;
+
+  my ( $base, $body ) = get_document ($alta_vista_random_link,
+                                      undef, $timeout);
+  return () unless ($base && $body);
+
+  LOG ($verbose_load, "redirected to: $base");
+
+  my $img = pick_image_from_body ($base, $body);
+  return () unless ($img);
+
+  return ($base, $img);
+}
+
+\f
+############################################################################
+#
+# Pick images by feeding random words into Alta Vista Image Search
+#
+############################################################################
+
+
+# Base URL of an Alta Vista image search; the search terms are appended
+# directly after the trailing "&q=".
+my $alta_vista_images_url = "http://www.altavista.com/image/results" .
+                            "?ipht=1"  .   # photos
+                            "&igrph=1" .   # graphics
+                            "&iclr=1"  .   # color
+                            "&ibw=1"   .   # b&w
+                            "&micat=1" .   # no partner sites
+                            "&sc=on"   .   # "site collapse"
+                            "&q=";
+
+# avimages
+#
+# Searches Alta Vista Images for one random word, on a randomly chosen
+# result page (1 through 9), keeps the result links that decode to external
+# http:// URLs, and hands those to pick_image_from_pages().
+# Returns whatever pick_image_from_pages() returns.
+#
+sub pick_from_alta_vista_images($) {
+  my ($timeout) = @_;
+
+  my $words      = random_word();
+  my $page       = 1 + int (rand (9));
+  my $search_url = $alta_vista_images_url . $words;
+
+  # Page 1 needs no extra parameters.
+  $search_url .= ("&pgno=" . $page .                # page number
+                  "&stq="  . (($page-1) * 12))      # first hit result on page
+    if ($page > 1);
+
+  my ($search_hit_count, @subpages) =
+    pick_from_search_engine ($timeout, $search_url, $words);
+
+  # Hosts whose links are never wanted as candidates.
+  my @unwanted = (
+    qr@[/.]altavista\.com\b@i,       # skip altavista builtins
+    qr@[/.]yahoo\.com\b@i,           # yahoo and av in cahoots?
+    qr@[/.]doubleclick\.net\b@i,     # you cretins
+    qr@[/.]clicktomarket\.com\b@i,   # more cretins
+    qr@[/.]viewimages\.com\b@i,      # stacked deck
+    qr@[/.]gettyimages\.com\b@i,
+  );
+
+  my @candidates = ();
+  foreach my $u (@subpages) {
+
+    # avimages is encoding their URLs now: the real target is the
+    # percent-encoded URL that follows "**".
+    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
+    $u = url_unquote($u);
+
+    next unless ($u =~ m@^http://@i);   # skip non-HTTP or relative URLs
+    next if (grep { $u =~ $_ } @unwanted);
+
+    LOG ($verbose_filter, " candidate: $u");
+    push @candidates, $u;
+  }
+
+  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
+                                $timeout, @candidates);
+}
+
+
+\f
+############################################################################
+#
+# Pick images from Aptix security cameras
+# Cribbed liberally from google image search code.
+# By Jason Sullivan <jasonsul@us.ibm.com>
+#
+############################################################################
+
+my $aptix_images_url = ("http://www.google.com/search" . # Google web search...
+ "?q=inurl:%22jpg/image.jpg%3Fr%3D%22"); # ...for inurl:"jpg/image.jpg?r=" (%22 is a double quote, %3F is "?", %3D is "=")
+
+# securitycam
+sub pick_from_security_camera($) {
+ my ($timeout) = @_;
+
+ my $page = (int(rand(9)) + 1);
+ my $num = 20; # 20 images per page
+ my $search_url = $aptix_images_url;
+
+ if ($page > 1) {
+ $search_url .= "&start=" . $page*$num; # page number
+ $search_url .= "&num=" . $num; #images per page
+ }
+
+ my ($search_hit_count, @subpages) =
+ pick_from_search_engine ($timeout, $search_url, '');
+
+ my @candidates = ();
+ my %referers;
+ foreach my $u (@subpages) {
+ next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins (most links)
+ next unless ($u =~ m@jpg/image.jpg\?r=@i); # All pics contain this
+
+ LOG ($verbose_filter, " candidate: $u");
+ push @candidates, $u;
+ $referers{$u} = $u;