+# given a URL and the body text at that URL, selects and returns a random
+# image from it. returns () if no suitable images found.
+#
+sub pick_image_from_body {
+ my ( $url, $body ) = @_;
+
+ my $base = $url;
+ $_ = $url;
+
+ # if there's at least one slash after the host, take off the last
+ # pathname component
+ if ( m@^http://[^/]+/@io ) {
+ $base =~ s@[^/]+$@@go;
+ }
+
+ # if there are no slashes after the host at all, put one on the end.
+ if ( m@^http://[^/]+$@io ) {
+ $base .= "/";
+ }
+
+ $_ = $body;
+
+ # strip out newlines, compress whitespace
+ s/[\r\n\t ]+/ /go;
+
+ # nuke comments
+ s/<!--.*?-->//go;
+
+
+ # There are certain web sites that list huge numbers of dictionary
+ # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
+ # Porn sites tend not to be reputable!)
+ #
+ # I do not want webcollage to filter on content: I want it to select
+ # randomly from the set of images on the web. All the logic here for
+ # rejecting some images is really a set of heuristics for rejecting
+ # images that are not really images: for rejecting *text* that is in
+ # GIF/JPEG form. I don't want text, I want pictures, and I want the
+ # content of the pictures to be randomly selected from among all the
+ # available content.
+ #
+ # So, filtering out "dirty" pictures by looking for "dirty" keywords
+ # would be wrong: dirty pictures exist, like it or not, so webcollage
+ # should be able to select them.
+ #
+ # However, picking a random URL is a hard thing to do. The mechanism I'm
+ # using is to search for a selection of random words. This is not
+ # perfect, but works ok most of the time. The way it breaks down is when
+ # some URLs get precedence because their pages list *every word* as
+ # related -- those URLs come up more often than others.
+ #
+ # So, after we've retrieved a URL, if it has too many keywords, reject
+ # it. We reject it not on the basis of what those keywords are, but on
+ # the basis that by having so many, the page has gotten an unfair
+ # advantage against our randomizer.
+ #
+ my $trip_count = 0;
+ foreach my $trip (@tripwire_words) {
+ $trip_count++ if m/$trip/i;
+ }
+
+ if ($trip_count >= $#tripwire_words - 2) {
+ LOG (($verbose_filter || $verbose_load),
+ "there is probably a dictionary in \"$url\": rejecting.");
+ $rejected_urls{$url} = -1;
+ $body = undef;
+ $_ = undef;
+ return ();
+ }
+
+
+ my @urls;
+ my %unique_urls;
+
+ foreach (split(/ *</)) {
+ if ( m/^meta /i ) {
+
+ # Likewise, reject any web pages that have a KEYWORDS meta tag
+ # that is too long.
+ #
+ if (m/name ?= ?\"?keywords\"?/i &&
+ m/content ?= ?\"([^\"]+)\"/) {
+ my $L = length($1);
+ if ($L > 1000) {
+ LOG (($verbose_filter || $verbose_load),
+ "excessive keywords ($L bytes) in $url: rejecting.");
+ $rejected_urls{$url} = $L;
+ $body = undef;
+ $_ = undef;
+ return ();
+ } else {
+ LOG ($verbose_filter, " keywords ($L bytes) in $url (ok)");
+ }
+ }
+
+ } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) {
+
+ my $was_inline = ( "$1" eq "a" || "$1" eq "A" );
+ my $link = $3;
+ my ( $width ) = m/width ?=[ \"]*(\d+)/oi;
+ my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
+ $_ = $link;
+
+ if ( m@^/@o ) {
+ my $site;
+ ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio;
+ $_ = "$site$link";
+ } elsif ( ! m@^[^/:?]+:@ ) {
+ $_ = "$base$link";
+ s@/\./@/@g;
+ while (s@/\.\./@/@g) {
+ }
+ }
+
+ # skip non-http
+ if ( ! m@^http://@io ) {
+ next;
+ }
+
+ # skip non-image
+ if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg)$@io ) {
+ next;
+ }
+
+ # skip really short or really narrow images
+ if ( $width && $width < $min_width) {
+ if (!$height) { $height = "?"; }
+ LOG ($verbose_filter, " skip narrow image $_ (${width}x$height)");
+ next;
+ }
+
+ if ( $height && $height < $min_height) {
+ if (!$width) { $width = "?"; }
+ LOG ($verbose_filter, " skip short image $_ (${width}x$height)");
+ next;
+ }
+
+ # skip images with ratios that make them look like banners.
+ if ($min_ratio && $width && $height &&
+ ($width * $min_ratio ) > $height) {
+ if (!$height) { $height = "?"; }
+ LOG ($verbose_filter, " skip bad ratio $_ (${width}x$height)");
+ next;
+ }
+
+ # skip GIFs with a small number of pixels -- those usually suck.
+ if ($width && $height &&
+ m/\.gif$/io &&
+ ($width * $height) < $min_gif_area) {
+ LOG ($verbose_filter, " skip small GIF $_ (${width}x$height)");
+ next;
+ }
+
+
+ my $url = $_;
+
+ if ($unique_urls{$url}) {
+ LOG ($verbose_filter, " skip duplicate image $_");
+ next;
+ }
+
+ LOG ($verbose_filter,
+ " image $url" .
+ ($width && $height ? " (${width}x${height})" : "") .
+ ($was_inline ? " (inline)" : ""));
+
+ $urls[++$#urls] = $url;
+ $unique_urls{$url}++;
+
+ # jpegs are preferable to gifs.
+ $_ = $url;
+ if ( ! m@[.]gif$@io ) {
+ $urls[++$#urls] = $url;
+ }
+
+ # pointers to images are preferable to inlined images.
+ if ( ! $was_inline ) {
+ $urls[++$#urls] = $url;
+ $urls[++$#urls] = $url;
+ }
+ }
+ }
+
+ my $fsp = ($body =~ m@<frameset@i);
+
+ $_ = undef;
+ $body = undef;
+
+ @urls = depoison (@urls);
+
+ if ( $#urls < 0 ) {
+ LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
+ return ();
+ }