+ my ( $url, $body ) = @_;
+
+ my $base = $url;
+ $_ = $url;
+
+ # if there's at least one slash after the host, take off the last
+ # pathname component
+ if ( m@^http://[^/]+/@io ) {
+ $base =~ s@[^/]+$@@go;
+ }
+
+ # if there are no slashes after the host at all, put one on the end.
+ if ( m@^http://[^/]+$@io ) {
+ $base .= "/";
+ }
+
+ $_ = $body;
+
+ # strip out newlines, compress whitespace
+ s/[\r\n\t ]+/ /go;
+
+ # nuke comments
+ s/<!--.*?-->//go;
+
+
+ # There are certain web sites that list huge numbers of dictionary
+ # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
+ # Porn sites tend not to be reputable!)
+ #
+ # I do not want webcollage to filter on content: I want it to select
+ # randomly from the set of images on the web. All the logic here for
+ # rejecting some images is really a set of heuristics for rejecting
+ # images that are not really images: for rejecting *text* that is in
+ # GIF/JPEG form. I don't want text, I want pictures, and I want the
+ # content of the pictures to be randomly selected from among all the
+ # available content.
+ #
+ # So, filtering out "dirty" pictures by looking for "dirty" keywords
+ # would be wrong: dirty pictures exist, like it or not, so webcollage
+ # should be able to select them.
+ #
+ # However, picking a random URL is a hard thing to do. The mechanism I'm
+ # using is to search for a selection of random words. This is not
+ # perfect, but works ok most of the time. The way it breaks down is when
+ # some URLs get precedence because their pages list *every word* as
+ # related -- those URLs come up more often than others.
+ #
+ # So, after we've retrieved a URL, if it has too many keywords, reject
+ # it. We reject it not on the basis of what those keywords are, but on
+ # the basis that by having so many, the page has gotten an unfair
+ # advantage against our randomizer.
+ #
+ my $trip_count = 0;
+ foreach my $trip (@tripwire_words) {
+ $trip_count++ if m/$trip/i;
+ }
+
+ if ($trip_count >= $#tripwire_words - 2) {
+ LOG (($verbose_filter || $verbose_load),
+ "there is probably a dictionary in \"$url\": rejecting.");
+ $rejected_urls{$url} = -1;
+ $body = undef;
+ $_ = undef;
+ return ();
+ }
+
+
+ my @urls;
+ my %unique_urls;
+
+ foreach (split(/ *</)) {
+ if ( m/^meta /i ) {
+
+ # Likewise, reject any web pages that have a KEYWORDS meta tag
+ # that is too long.
+ #
+ if (m/name ?= ?\"?keywords\"?/i &&
+ m/content ?= ?\"([^\"]+)\"/) {
+ my $L = length($1);
+ if ($L > 1000) {
+ LOG (($verbose_filter || $verbose_load),
+ "excessive keywords ($L bytes) in $url: rejecting.");
+ $rejected_urls{$url} = $L;
+ $body = undef;
+ $_ = undef;
+ return ();
+ } else {
+ LOG ($verbose_filter, " keywords ($L bytes) in $url (ok)");
+ }
+ }
+
+ } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) {
+
+ my $was_inline = ( "$1" eq "a" || "$1" eq "A" );
+ my $link = $3;
+ my ( $width ) = m/width ?=[ \"]*(\d+)/oi;
+ my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
+ $_ = $link;
+
+ if ( m@^/@o ) {
+ my $site;
+ ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio;
+ $_ = "$site$link";
+ } elsif ( ! m@^[^/:?]+:@ ) {
+ $_ = "$base$link";
+ s@/\./@/@g;
+ 1 while (s@/[^/]+/\.\./@/@g);
+ }
+
+ # skip non-http
+ if ( ! m@^http://@io ) {
+ next;
+ }
+
+ # skip non-image
+ if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg)$@io ) {
+ next;
+ }
+
+ # skip really short or really narrow images
+ if ( $width && $width < $min_width) {
+ if (!$height) { $height = "?"; }
+ LOG ($verbose_filter, " skip narrow image $_ (${width}x$height)");
+ next;
+ }
+
+ if ( $height && $height < $min_height) {
+ if (!$width) { $width = "?"; }
+ LOG ($verbose_filter, " skip short image $_ (${width}x$height)");
+ next;
+ }
+
+ # skip images with ratios that make them look like banners.
+ if ($min_ratio && $width && $height &&
+ ($width * $min_ratio ) > $height) {
+ if (!$height) { $height = "?"; }
+ LOG ($verbose_filter, " skip bad ratio $_ (${width}x$height)");
+ next;
+ }
+
+ # skip GIFs with a small number of pixels -- those usually suck.
+ if ($width && $height &&
+ m/\.gif$/io &&
+ ($width * $height) < $min_gif_area) {
+ LOG ($verbose_filter, " skip small GIF $_ (${width}x$height)");
+ next;
+ }
+
+
+ my $url = $_;
+
+ if ($unique_urls{$url}) {
+ LOG ($verbose_filter, " skip duplicate image $_");
+ next;
+ }
+
+ LOG ($verbose_filter,
+ " image $url" .
+ ($width && $height ? " (${width}x${height})" : "") .
+ ($was_inline ? " (inline)" : ""));
+
+ $urls[++$#urls] = $url;
+ $unique_urls{$url}++;
+
+ # jpegs are preferable to gifs.
+ $_ = $url;
+ if ( ! m@[.]gif$@io ) {
+ $urls[++$#urls] = $url;
+ }
+
+ # pointers to images are preferable to inlined images.
+ if ( ! $was_inline ) {
+ $urls[++$#urls] = $url;
+ $urls[++$#urls] = $url;
+ }
+ }
+ }
+
+ my $fsp = ($body =~ m@<frameset@i);
+
+ $_ = undef;
+ $body = undef;
+
+ @urls = depoison (@urls);
+
+ if ( $#urls < 0 ) {
+ LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
+ return ();
+ }
+
+ # pick a random element of the table
+ my $i = int(rand($#urls+1));
+ $url = $urls[$i];
+
+ LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
+
+ return $url;
+}
+
+
+\f
+############################################################################
+#
+# Subroutines for getting pages and images out of search engines
+#
+############################################################################
+
+
# Locates a system word list and records it in the global $wordlist.
# Tries a fixed set of conventional dictionary locations and keeps the
# first one that exists; dies (via error()) if none are present.
#
sub pick_dictionary {
  my @dicts = ("/usr/dict/words",
               "/usr/share/dict/words",
               "/usr/share/lib/dict/words");
  foreach my $f (@dicts) {
    if (-f $f) {
      $wordlist = $f;
      last;
    }
  }
  # Report every path we tried: the failing condition is that *none*
  # of the candidates exist, not just the first.
  error ("no word list found (tried: @dicts)") unless defined ($wordlist);
}
+
# Returns a random word from the dictionary file named by the global
# $wordlist, lightly stemmed and lower-cased for use as a search term.
# Multi-word entries have their spaces converted to "+" and are wrapped
# in quotes (%22).  Returns 0 if the file cannot be read or yields no
# word.
#
sub random_word {
  my $word = 0;

  # Three-arg open with a lexical handle (the old two-arg bareword
  # `open (IN, "<$wordlist")` form is unsafe and leaks a global handle).
  if (open (my $in, '<', $wordlist)) {
    my $size = (stat ($in))[7];
    my $pos = rand $size;

    # Seek to a random byte offset; discard the (probably partial)
    # line we landed in and keep the next complete one.
    if (seek ($in, $pos, 0)) {
      $word = <$in>;   # toss partial line
      $word = <$in>;   # keep next line
    }

    # If we fell off the end of the file, wrap around to the first line.
    if (!$word) {
      seek ($in, 0, 0);
      $word = <$in>;
    }
    close ($in);
  }

  return 0 if (!$word);

  # Trim surrounding whitespace.
  $word =~ s/^[ \t\n\r]+//;
  $word =~ s/[ \t\n\r]+$//;

  # Crude suffix-stripping to reduce plurals/adverbs to a base form.
  # The order matters: e.g. the second "ally" rule can still fire on
  # words that only end in "ally" after the "ly" rule has run.
  $word =~ s/ys$/y/;
  $word =~ s/ally$//;
  $word =~ s/ly$//;
  $word =~ s/ies$/y/;
  $word =~ s/ally$/al/;
  $word =~ s/izes$/ize/;
  $word =~ tr/A-Z/a-z/;

  if ( $word =~ s/[ \t\n\r]/\+/g ) { # convert intra-word spaces to "+".
    $word = "\%22$word\%22";         # And put quotes (%22) around it.
  }

  return $word;
}
+
# Returns five random dictionary words joined into one query string.
# If $or_p is true, the separator is "%20OR%20" so that search engines
# treat the words as alternatives; otherwise it is "%20" (a space).
#
sub random_words {
  my ($or_p) = @_;
  my $sep = ($or_p ? "%20OR%20" : "%20");
  return join ($sep, map { random_word() } (1 .. 5));
}
+
+
# Percent-encodes a string for use in a URL.  Every character outside
# the safe set [-a-zA-Z0-9.@/_] is replaced by its %XX form; CR and LF
# are deliberately left unencoded.
#
sub url_quote {
  my ($url) = @_;
  my $out = '';
  foreach my $ch (split (//, $url)) {
    $out .= ($ch =~ m{[-a-zA-Z0-9.\@/_\r\n]}
             ? $ch
             : sprintf ("%%%02X", ord ($ch)));
  }
  return $out;
}
+
# Decodes URL "+" characters and %XX escapes in a string.
#
# Bug fix: the old escape pattern was m/%([a-z0-9]{2})/i, which matches
# *any* letter a-z, so a malformed escape such as "%zz" was decoded to
# chr(hex("zz")) == chr(0) (a NUL byte, with a warning).  Only genuine
# hexadecimal digits now count as an escape; malformed sequences pass
# through unchanged.
#
sub url_unquote {
  my ($url) = @_;
  $url =~ s/[+]/ /g;
  $url =~ s/%([0-9a-f]{2})/chr (hex ($1))/ige;
  return $url;
}
+