WebCollage

X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?p=xscreensaver;a=blobdiff_plain;f=hacks%2Fwebcollage;h=9f767edaabb6abb61ea93d1e6b27db06312d6740;hp=49768c978262200eeb7bbc1883029a47814c4db6;hb=af290bcdf2d1c61efc8aaaff653745c900cbe98c;hpb=df7adbee81405e2849728a24b498ad2117784b1f diff --git a/hacks/webcollage b/hacks/webcollage index 49768c97..9f767eda 100755 --- a/hacks/webcollage +++ b/hacks/webcollage @@ -16,34 +16,10 @@ # # default-n: webcollage -root \n\ # default-n: webcollage -root -filter 'vidwhacker -stdin -stdout' \n\ -# -# To run this as a CGI program on a web site, do this (these instructions -# work with Apache 1.3 or newer): -# -# 1: Place this program in your document directory, named "webcollage". -# The name shouldn't end in .cgi or .html, since this CGI behaves like -# a directory. -# 2: Make it world-readable and world-executable. -# 3: Create a ".htaccess" file in the same directory containing these lines: -# -# SetHandler cgi-script -# -# 4: Create these files in the same directory, world-writable, zero-length: -# collage.ppm -# collage.tmp -# collage.jpg -# collage.pending -# collage.map -# -# Now the CGI is ready to go. - -my $copyright = "WebCollage, Copyright (c) 1999" . - " Jamie Zawinski \n" . - " http://www.jwz.org/xscreensaver/\n"; -my $argv0 = $0; -my $progname = $argv0; $progname =~ s@.*/@@g; -my $version = q{ $Revision: 1.7 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; +require 5; +#use diagnostics; +use strict; use Socket; require Time::Local; @@ -51,41 +27,46 @@ require POSIX; use Fcntl ':flock'; # import LOCK_* constants -# CGI Parameters - -my $data_dir = ""; # if you want the following files to go in - # some directory below ".", name it here. - -my $image_ppm = "${data_dir}collage.ppm"; # names of the various data files. -my $image_tmp = "${data_dir}collage.tmp"; -my $image_jpg = "${data_dir}collage.jpg"; -my $pending_file = "${data_dir}collage.pending"; -my $map_file = "${data_dir}collage.map"; - -my $url_generation_time = 60; # total time to spend getting URLs. -my $image_retrieval_time = 60; # maximum time to spend loading all images. -my $max_map_entries = 100; # how many lines to save in $map_file. -my $pastes_per_load = 3; # how many images to try and paste each time. - -my $max_age = 5 * 60; # minutes before it is considered stale. -my $scale = 1.0; # client-side image expansion. - -my $img_width = 800; # size of the image being generated. -my $img_height = 600; - -my @all_files = ($image_ppm, $image_tmp, $image_jpg, $pending_file, $map_file); -my $script_date; +my $version = q{ $Revision: 1.32 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; +my $copyright = "WebCollage $version, Copyright (c) 1999" . + " Jamie Zawinski \n" . + " http://www.jwz.org/xscreensaver/\n"; -# Other Parameters +my $argv0 = $0; +my $progname = $argv0; $progname =~ s@.*/@@g; my $random_redirector = "http://random.yahoo.com/bin/ryl"; -my $image_randomizer_a = "http://image.altavista.com/"; -my $image_randomizer = $image_randomizer_a . "cgi-bin/avncgi" . - "?do=3&verb=no&oshape=n&oorder=" . - "&ophoto=1&oart=1&ocolor=1&obw=1" . - "&stype=simage&oprem=0&query="; - +my $image_randomizer_1 = "http://image.altavista.com/cgi-bin/avncgi" . + "?do=3" . + "&verb=n" . + "&oshape=n" . + "&oorder=" . + "&ophoto=1&oart=1&ocolor=1&obw=1" . + "&stype=simage" . + "&oprem=0" . + "&query="; +my $image_randomizer_2 = "http://www.hotbot.com/?clickSrc=search" . + "&submit=SEARCH&SM=SC&LG=any" . + "&AM0=MC&AT0=words&AW0=" . + "&AM1=MN&AT1=words&AW1=" . + "&savenummod=2&date=within" . + "&DV=0&DR=newer&DM=1&DD=1&DY=99&FVI=1&FS=&RD=RG" . + "&RG=all&Domain=&PS=A&PD=&STEM=1&DC=50&DE=0&_v=2" . + "&OPs=MDRTP&NUMMOD=2" . + "&MT="; +my $image_randomizer_3 = "http://www.altavista.com/cgi-bin/query?pg=q" . + "&text=yes&kl=XX&stype=stext&q="; + +my $image_ppm = ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp") . "/webcollage." . $$; +my $image_tmp1 = $image_ppm . "-1"; +my $image_tmp2 = $image_ppm . "-2"; + +my $img_width; # size of the image being generated. +my $img_height; + +my $http_proxy = undef; my $http_timeout = 30; +my $cvt_timeout = 10; my $ppm_to_root_window_cmd = "xv -root -rmode 5 -viewonly" . " +noresetroot %%PPM%% -quit"; my $filter_cmd = undef; @@ -100,13 +81,22 @@ my $wordlist = "/usr/dict/words"; if (!-r $wordlist) { $wordlist = "/usr/share/lib/dict/words"; # irix } +die "$wordlist doesn't exist!\n" unless (-r $wordlist); my $min_width = 50; my $min_height = 50; my $min_ratio = 1/5; -my $DEBUG = 0; +my $verbose = 0; + +my %rejected_urls; +my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch", + "arrhenius", "arteriole", "blanket", "brainchild", + "burdensome", "carnival", "cherub", "chord", "clever", + "dedicate", "dilogarithm", "dolan", "dryden", + "eggplant"); + @@ -123,51 +113,76 @@ sub get_document_1 { my ( $url, $referer, $timeout ) = @_; if (!defined($timeout)) { $timeout = $http_timeout; } - if ($timeout <= 0) { return undef; } + if ($timeout <= 0) { return (); } if ($timeout > $http_timeout) { $timeout = $http_timeout; } - if ( $DEBUG > 3 ) { - print STDERR "get_document_1 $url " . + if ( $verbose > 3 ) { + print STDERR "$progname: get_document_1 $url " . ($referer ? $referer : "") . "\n"; } my($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4); if (! ($url_proto && $url_proto =~ m/^http:$/i)) { - if ($DEBUG) { print STDERR "not an HTTP URL: $url\n"; } - return undef; + if ($verbose) { print STDERR "$progname: not an HTTP URL: $url\n"; } + return (); } + + $path = "" unless $path; + my($them,$port) = split(/:/, $serverstring); $port = 80 unless $port; - my $size=""; + + my $them2 = $them; + my $port2 = $port; + if ($http_proxy) { + $serverstring = $http_proxy if $http_proxy; + ($them2,$port2) = split(/:/, $serverstring); + $port2 = 80 unless $port2; + } my ($remote, $iaddr, $paddr, $proto, $line); - $remote = $them; - if ($port =~ /\D/) { $port = getservbyname($port, 'tcp') } - return unless $port; - $iaddr = inet_aton($remote) || return; - $paddr = sockaddr_in($port, $iaddr); + $remote = $them2; + if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') } + return unless $port2; + $iaddr = inet_aton($remote) || return; + $paddr = sockaddr_in($port2, $iaddr); + @_ = eval { local $SIG{ALRM} = sub { - if ($DEBUG > 0) { - print STDERR "timed out ($timeout) for $url\n"; + if ($verbose > 0) { + print STDERR "$progname: timed out ($timeout) for $url\n"; } - die "alarm\n" }; + die "alarm\n" + }; alarm $timeout; $proto = getprotobyname('tcp'); - socket(S, PF_INET, SOCK_STREAM, $proto) || return; - connect(S, $paddr) || return; + if (!socket(S, PF_INET, SOCK_STREAM, $proto)) { + print STDERR "$progname: socket: $!\n" if ($verbose); + return; + } + if (!connect(S, $paddr)) { + print STDERR "$progname: connect($serverstring): $!\n" + if ($verbose); + return; + } select(S); $| = 1; select(STDOUT); - print S ("GET /$path HTTP/1.0\n" . + my $cookie; + if ($remote =~ m/\baltavista\.com$/i) { + # kludge to tell the various altavista sites to be uncensored. + $cookie = "AV_ALL=1"; + } + + print S ("GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\n" . "Host: $them\n" . "User-Agent: $progname/$version\n" . ($referer ? "Referer: $referer\n" : "") . + ($cookie ? "Cookie: $cookie\n" : "") . "\n"); - my $http = ; my $head = ""; @@ -182,12 +197,16 @@ sub get_document_1 { close S; + if ( $verbose > 3 ) { + print STDERR "$progname: ==> $http\n"; + } + return ( $http, $head, $body ); }; die if ($@ && $@ ne "alarm\n"); # propagate errors if ($@) { # timed out - return undef; + return (); } else { # didn't alarm 0; @@ -203,8 +222,12 @@ sub get_document { my ( $url, $referer, $timeout ) = @_; my $start = time; + my $orig_url = $url; + my $loop_count = 0; + my $max_loop_count = 4; + do { - if (defined($timeout) && $timeout <= 0) { return undef; } + if (defined($timeout) && $timeout <= 0) { return (); } my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout); @@ -215,25 +238,46 @@ sub get_document { $start = $now; } - return undef if ( ! $body ); + return () if ( ! $body ); if ( $http =~ m@HTTP/[0-9.]+ 30[23]@ ) { $_ = $head; my ( $location ) = m@^location:[ \t]*(.*)$@im; if ( $location ) { + $location =~ s/[\r\n]$//; - if ( $DEBUG > 3 ) { - print STDERR "redirect from $url to $location\n"; + if ( $verbose > 3 ) { + print STDERR "$progname: redirect from " . + "$url to $location\n"; } $referer = $url; $url = $location; + + if ($url =~ m@^/@) { + $referer =~ m@^(http://[^/]+)@i; + $url = $1 . $url; + } elsif (! ($url =~ m@^[a-z]+:@i)) { + $_ = $referer; + s@[^/]+$@@g if m@^http://[^/]+/@i; + $_ .= "/" if m@^http://[^/]+$@i; + $url = $_ . $url; + } + } else { return ( $url, $body ); } + if ($loop_count++ > $max_loop_count) { + if ( $verbose > 1 ) { + print STDERR "$progname: too many redirects " . + "($max_loop_count) from $orig_url\n"; + } + return (); + } + } elsif ( $http =~ m@HTTP/[0-9.]+ [4-9][0-9][0-9]@ ) { # http errors -- return nothing. - return undef; + return (); } else { @@ -245,17 +289,18 @@ sub get_document { # given a URL and the body text at that URL, selects and returns a random -# image from it. returns undef if no suitable images found. +# image from it. returns () if no suitable images found. # sub pick_image_from_body { - my ( $base, $body ) = @_; + my ( $url, $body ) = @_; - $_ = $base; + my $base = $url; + $_ = $url; # if there's at least one slash after the host, take off the last # pathname component if ( m@^http://[^/]+/@io ) { - ( $base = $base ) =~ s@[^/]+$@@go; + $base =~ s@[^/]+$@@go; } # if there are no slashes after the host at all, put one on the end. @@ -263,8 +308,8 @@ sub pick_image_from_body { $base .= "/"; } - if ( $DEBUG > 3 ) { - print STDERR "base is $base\n"; + if ( $verbose > 3 ) { + print STDERR "$progname: base is $base\n"; } @@ -276,16 +321,79 @@ sub pick_image_from_body { # nuke comments s///go; + + # There are certain web sites that list huge numbers of dictionary + # words in their bodies or in their tags (surprise! + # Porn sites tend not to be reputable!) + # + # I do not want webcollage to filter on content: I want it to select + # randomly from the set of images on the web. All the logic here for + # rejecting some images is really a set of heuristics for rejecting + # images that are not really images: for rejecting *text* that is in + # GIF/JPEG form. I don't want text, I want pictures, and I want the + # content of the pictures to be randomly selected from among all the + # available content. + # + # So, filtering out "dirty" pictures by looking for "dirty" keywords + # would be wrong: dirty pictures exist, like it or not, so webcollage + # should be able to select them. + # + # However, picking a random URL is a hard thing to do. The mechanism I'm + # using is to search for a selection of random words. This is not + # perfect, but works ok most of the time. The way it breaks down is when + # some URLs get precedence because their pages list *every word* as + # related -- those URLs come up more often than others. + # + # So, after we've retrieved a URL, if it has too many keywords, reject + # it. We reject it not on the basis of what those keywords are, but on + # the basis that by having so many, the page has gotten an unfair + # advantage against our randomizer. + # + my $trip_count = 0; + foreach my $trip (@tripwire_words) { + $trip_count++ if m/$trip/i; + } + if ($trip_count >= $#tripwire_words - 2) { + if ($verbose > 1) { + print STDERR "$progname: there is probably a dictionary in" . + " \"$url\": rejecting.\n"; + } + $rejected_urls{$url} = -1; + return (); + } + + my @urls; my %unique_urls; foreach (split(/ *\"]/io ) { + if ( m/^meta /i ) { + + # Likewise, reject any web pages that have a KEYWORDS meta tag + # that is too long. + # + if (m/name ?= ?\"?keywords\"?/i && + m/content ?= ?\"([^\"]+)\"/) { + my $L = length($1); + if ($L > 1000) { + if ($verbose > 1) { + print STDERR "$progname: keywords of" . + " length $L in $url: rejecting.\n"; + } + $rejected_urls{$url} = $L; + return (); + } elsif ( $verbose > 2 ) { + print STDERR "$progname: keywords of length $L" . + " in $url (ok.)\n"; + } + } + + } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) { my $was_inline = ( "$1" eq "a" || "$1" eq "A" ); my $link = $3; - my ( $width ) = m/width ?=[ \"]*([0-9]+)/oi; - my ( $height ) = m/height ?=[ \"]*([0-9]+)/oi; + my ( $width ) = m/width ?=[ \"]*(\d+)/oi; + my ( $height ) = m/height ?=[ \"]*(\d+)/oi; $_ = $link; if ( m@^/@o ) { @@ -309,25 +417,21 @@ sub pick_image_from_body { next; } -# # skip GIF? -# if ( m@[.](gif)@io ) { -## if ( $DEBUG > 2 ) { print STDERR "skip GIF $_\n"; } -# next; -# } - # skip really short or really narrow images if ( $width && $width < $min_width) { - if ( $DEBUG > 2 ) { + if ( $verbose > 2 ) { if (!$height) { $height = "?"; } - print STDERR "skip narrow image $_ ($width x $height)\n"; + print STDERR "$progname: skip narrow image " . + "$_ (${width}x$height)\n"; } next; } if ( $height && $height < $min_height) { - if ( $DEBUG > 2 ) { + if ( $verbose > 2 ) { if (!$width) { $width = "?"; } - print STDERR "skip short image $_ ($width x $height)\n"; + print STDERR "$progname: skip short image " . + "$_ (${width}x$height)\n"; } next; } @@ -335,9 +439,10 @@ sub pick_image_from_body { # skip images with ratios that make them look like banners. if ( $min_ratio && $width && $height && ($width * $min_ratio ) > $height ) { - if ( $DEBUG > 2 ) { + if ( $verbose > 2 ) { if (!$height) { $height = "?"; } - print STDERR "skip bad ratio $_ ($width x $height)\n"; + print STDERR "$progname: skip bad ratio " . + "$_ (${width}x$height)\n"; } next; } @@ -345,12 +450,14 @@ sub pick_image_from_body { my $url = $_; if ( $unique_urls{$url} ) { - if ( $DEBUG > 2 ) { print STDERR "skip duplicate image $_\n"; } + if ( $verbose > 2 ) { + print STDERR "$progname: skip duplicate image $_\n"; + } next; } - if ( $DEBUG > 2 ) { - print STDERR "got $url" . + if ( $verbose > 2 ) { + print STDERR "$progname: got $url" . ($width && $height ? " (${width}x${height})" : "") . ($was_inline ? " (inline)" : "") . "\n"; } @@ -373,20 +480,20 @@ sub pick_image_from_body { } if ( $#urls == 0 ) { - if ( $DEBUG > 2 ) { - print STDERR "no images on $base\n"; + if ( $verbose > 2 ) { + print STDERR "$progname: no images on $base\n"; } - return undef; + return (); } - return undef if ( $#urls < 1 ); + return () if ( $#urls < 1 ); # pick a random element of the table my $i = ((rand() * 99999) % $#urls); - my $url = $urls[$i]; + $url = $urls[$i]; - if ( $DEBUG > 2 ) { - print STDERR "picked $url\n"; + if ( $verbose > 2 ) { + print STDERR "$progname: picked $url\n"; } return $url; @@ -395,13 +502,13 @@ sub pick_image_from_body { # Using the URL-randomizer, picks a random image on a random page, and # returns two URLs: the page containing the image, and the image. -# Returns undef if nothing found this time. +# Returns () if nothing found this time. # sub pick_from_url_randomizer { my ( $timeout ) = @_; - if ( $DEBUG > 3 ) { - print STDERR "\n\npicking from $random_redirector...\n\n"; + if ( $verbose > 3 ) { + print STDERR "\n\n$progname: picking from $random_redirector...\n\n"; } my ( $base, $body ) = get_document ($random_redirector, undef, $timeout); @@ -410,9 +517,9 @@ sub pick_from_url_randomizer { my $img = pick_image_from_body ($base, $body); if ($img) { - return ($base, $img); + return ($base, $img, "yahoo"); } else { - return undef; + return (); } } @@ -449,10 +556,10 @@ sub random_word { # Using the image-randomizer, picks a random image on a random page, and # returns two URLs: the page containing the image, and the image. -# Returns undef if nothing found this time. +# Returns () if nothing found this time. # sub pick_from_image_randomizer { - my ( $timeout ) = @_; + my ( $timeout, $which ) = @_; my $words = random_word; $words .= "%20" . random_word; @@ -460,96 +567,195 @@ sub pick_from_image_randomizer { $words .= "%20" . random_word; $words .= "%20" . random_word; - my $search_url = $image_randomizer . $words; + my $search_url = ($which == 0 ? $image_randomizer_1 : + $which == 1 ? $image_randomizer_2 : + $image_randomizer_3) . + $words; + + # Pick a random search-result page instead of always taking the first. + # This assumes there are at least 10 pages... + if ($which == 0) { + $search_url .= "&pgno=" . (int(rand(9)) + 1); + } elsif ($which == 2) { + $search_url .= "&stq=" . (10 * (int(rand(9)) + 1)); + } - if ( $DEBUG > 3 ) { - $_ = $words; s/%20/ /g; print STDERR "search words: $_\n"; + if ( $verbose > 3 ) { + $_ = $words; s/%20/ /g; print STDERR "$progname: search words: $_\n"; } - if ( $DEBUG > 3 ) { - print STDERR "\n\npicking from $search_url\n"; + if ( $verbose > 3 ) { + print STDERR "\n\n$progname: picking from $search_url\n"; } my $start = time; my ( $base, $body ) = get_document ($search_url, undef, $timeout); if (defined ($timeout)) { $timeout -= (time - $start); - return undef if ($timeout <= 0); + return () if ($timeout <= 0); } - return undef if (! $body); + return () if (! $body); my @subpages; my $skipped = 0; + my $search_count = "?"; + if ($which == 0 && + $body =~ m@found (approximately |about )?()?(\d+)()? image@) { + $search_count = $3; + } elsif ($which == 1 && $body =~ m@((\d{1,3})(,\d{3})*) @i) { + $search_count = $1; + } elsif ($which == 2 && $body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) { + $search_count = $1; + } + 1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/); + + my $length = length($body); + my $href_count = 0; + $_ = $body; + s/[\r\n\t ]+/ /g; + + s/Result Pages:.*$//; # trim off page footer + s/(]+)>@i; + next unless $u; + if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string + elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token + + if ($which == 1) { + # Kludge to decode HotBot pages + next unless ($u =~ m@/director\.asp\?target=(http%3A[^&>]+)@); + $u = url_decode($1); + } - if ( m@]+)> 3 ) { - print STDERR "skipping corbis URL: $_\n"; - } - next; - } elsif ( $DEBUG > 3 ) { - print STDERR "sub-page: $1\n"; + if ($which == 0 && $u =~ m@[/.]corbis\.com/@) { + $skipped = 1; + if ( $verbose > 3 ) { + print STDERR "$progname: skipping corbis URL: $u\n"; } + next; - $subpages[++$#subpages] = $u; + } elsif ( $rejected_urls{$u} ) { + if ( $verbose > 3 ) { + my $L = $rejected_urls{$u}; + print STDERR "$progname: pre-rejecting sub-page: $u\n"; + } + next; + + } elsif ( $verbose > 3 ) { + print STDERR "$progname: sub-page: $u\n"; } + + $subpages[++$#subpages] = $u; } - if ( $#subpages <= 0 ) { - if (!$skipped) { - print STDERR "Found nothing on $base\n"; + if ( $#subpages < 0 ) { + if (!$skipped && $verbose > 1) { + print STDERR "$progname: found nothing on $base " . + "($length bytes, $href_count links).\n"; } - return undef; + return (); } # pick a random element of the table - my $i = ((rand() * 99999) % $#subpages); + my $i = ((rand() * 99999) % ($#subpages + 1)); my $subpage = $subpages[$i]; - if ( $DEBUG > 3 ) { - print STDERR "picked page $subpage\n"; + if ( $verbose > 3 ) { + print STDERR "$progname: picked page $subpage\n"; } my ( $base2, $body2 ) = get_document ($subpage, $base, $timeout); - return undef if (!$base2 || !body2); + return () if (!$base2 || !$body2); my $img = pick_image_from_body ($base2, $body2); if ($img) { - return ($base2, $img); + return ($base2, $img, + ($which == 0 ? "imagevista" : + $which == 1 ? "hotbot" : "altavista") . + "/$search_count"); } else { - return undef; + return (); } } # Picks a random image on a random page, and returns two URLs: # the page containing the image, and the image. -# Returns undef if nothing found this time. +# Returns () if nothing found this time. # Uses the url-randomizer 1 time in 5, else the image randomizer. # +my $total_0 = 0; +my $total_1 = 0; +my $total_2 = 0; +my $total_3 = 0; +my $count_0 = 0; +my $count_1 = 0; +my $count_2 = 0; +my $count_3 = 0; + sub pick_image { my ( $timeout ) = @_; - if (int(rand 5) == 0) { - return pick_from_url_randomizer ($timeout); + my $r = int(rand(100)); + my ($base, $img, $source, $total, $count); + + if ($r < 20) { + ($base, $img, $source) = pick_from_url_randomizer ($timeout); + $total = ++$total_0; + $count = ++$count_0 if $img; + + } elsif ($r < 60) { + ($base, $img, $source) = pick_from_image_randomizer ($timeout, 0); + $total = ++$total_1; + $count = ++$count_1 if $img; + +# } elsif ($r < 80) { +# # HotBot sucks: 98% of the time, it says "no pages match your +# # search", and then if I load the URL again by hand, it works. +# # I don't understand what's going wrong here, but we're not getting +# # any good data back from them, so forget it for now. +# +# ($base, $img, $source) = pick_from_image_randomizer ($timeout, 1); +# $total = ++$total_2; +# $count = ++$count_2 if $img; + } else { - return pick_from_image_randomizer ($timeout); + ($base, $img, $source) = pick_from_image_randomizer ($timeout, 2); + $total = ++$total_3; + $count = ++$count_3 if $img; + } + + if ($source && $total > 0) { + $source .= " " . int(($count / $total) * 100) . "%"; } + return ($base, $img, $source); +} + + +# Does %-decoding. +# +sub url_decode { + ($_) = @_; + tr/+/ /; + s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + return $_; } @@ -559,7 +765,7 @@ sub gif_size { my ($body) = @_; my $type = substr($body, 0, 6); my $s; - return undef unless ($type =~ /GIF8[7,9]a/); + return () unless ($type =~ /GIF8[7,9]a/); $s = substr ($body, 6, 10); my ($a,$b,$c,$d) = unpack ("C"x4, $s); return (($b<<8|$a), ($d<<8|$c)); @@ -572,9 +778,9 @@ sub jpeg_size { my $i = 0; my $L = length($body); - $c1 = substr($body, $i, 1); $i++; - $c2 = substr($body, $i, 1); $i++; - return undef unless (ord($c1) == 0xFF && ord($c2) == 0xD8); + my $c1 = substr($body, $i, 1); $i++; + my $c2 = substr($body, $i, 1); $i++; + return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8); my $ch = "0"; while (ord($ch) != 0xDA && $i < $L) { @@ -605,11 +811,11 @@ sub jpeg_size { my $s = substr($body, $i, 2); $i += 2; my ($c1, $c2) = unpack ("C"x2, $s); my $length = ($c1 << 8) | $c2; - return undef if ($length < 2); + return () if ($length < 2); $i += $length-2; } } - return undef; + return (); } # Given the raw body of a GIF or JPEG document, returns the dimensions of @@ -635,781 +841,169 @@ sub which { return undef; } -############################################################################## -# -# Running as a CGI -# -############################################################################## - -my $body_tag = "\n"; - -my $html_document = - ("" . - "\n" . - "\n" . - " WebCollage\n" . - "\n" . - "\n" . - $body_tag . - "\n" . - "
" . - "\n" . - " \n" . - " \n" . - " \n" . - " \n" . - " \n" . - " \n" . - " \n" . - "
\n" . - " WebCollage: \n" . - "
by\n" . - " Jamie Zawinski\n" . - " \n" . - "\n" . - "
Exterminate All Rational Thought.\n" . - " \n" . - "
This program creates collages out of random images\n" . - " found on the Web.\n" . - "
More images are being added to the\n" . - " collage now: please wait for the image below to load.\n" . - " This will take a minute or two, since it has to contact\n" . - " other web sites to retrieve the images before it can construct\n" . - " the collage. Once the image below is loaded, you can reload\n" . - " this page to do it again.\n" . - "
If you enjoy this, you might also enjoy\n" . - " DadaDodo.\n" . - " WebCollage also works as a screen saver, for those of you\n" . - " using Unix: it is included with the\n" . - " XScreenSaver\n" . - " package.
\n" . - "
\n" . - " \n" . - "
\n" . - " %%MAP%%\n" . - " \n" . - "
\n" . - "
\n" . - "
\n"); - - -my @time_fmt_days = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"); -my @time_fmt_months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); - -# Converts a time_t to a string acceptable to HTTP. -# -sub format_http_time { - my ($time) = @_; - my @t = gmtime($time); - my ($sec, $min, $hour, $mday, $mon, $year, $wday) = @t; - $year += 1900; - $wday = $time_fmt_days[$wday]; - $mon = $time_fmt_months[$mon]; - return sprintf("%s, %02d %s %d %02d:%02d:%02d GMT", - $wday, $mday, $mon, $year, $hour, $min, $sec); -} - - -# Parses exactly the time format that HTTP requires, no more, no less. -# -sub parse_http_time { +# Like rand(), but chooses numbers with a bell curve distribution. +sub bellrand { ($_) = @_; - - if (!m/^[SMTWF][a-z][a-z]+, (\d\d)[- ]([JFMAJSOND][a-z][a-z]+)[- ](\d\d\d?\d?)[- ](\d\d):(\d\d):(\d\d)( GMT)?$/o) { - return undef; - } - - my @moy = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); - @moy{@moy} = (1..12); - - my $t = Time::Local::timegm($6, $5, $4, $1, $moy{$2}-1, - ($3 < 100 ? $3 : $3-1900)); - return ($t < 0 ? undef : $t); -} - - -# Given a modification time, returns a time_t to use as the expiration time -# of both the HTML and the JPEG. -# -sub compute_expires_time { - my ($mod_time) = (@_); - my $now = time; - if ($mod_time < $now) { $mod_time = $now; } - return $mod_time + $max_age; + $_ = 1.0 unless defined($_); + $_ /= 3.0; + return (rand($_) + rand($_) + rand($_)); } -# Parse the If-Modified-Since header, and write a response if appropriate. -# If this returns 1, we're done. +############################################################################## # -sub do_ifmod { - # see http://vancouver-webpages.com/proxy/log-tail.pl and - # http://mnot.cbd.net.au/cache_docs/ for clues about how to - # do cacheing properly with CGI-generated documents. - my ($mod_time) = (@_); - if ($ENV{HTTP_IF_MODIFIED_SINCE}) { - my $ims = $ENV{HTTP_IF_MODIFIED_SINCE}; - $ims =~ s/;.*// ; # lose trailing "; length=3082" - $ims = parse_http_time($ims); - if ($ims && $mod_time <= $ims) { - print "Status: 304 Not Modified\n\n" ; - return 1; - } - } - return 0; -} - - -# Returns N urls of images (and the pages on which they were found.) -# Unless there is a significant surplus of URLs in the $pending_file, -# this will spend $url_generation_time seconds generating as many URLs -# as it can. The first N will be returned, and the rest will be left -# in the file. +# Generating a list of urls only # -sub get_image_urls { - my ($count) = @_; - - my @urls; - my $body = ""; - my $file_count = 0; - - local *PEND; - - # Open and lock the file (read/write.) - # rewind after locking, in case we had to wait for the lock. - # - open (PEND, "+<$pending_file") || die "couldn't open $pending_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: opened $pending_file\n"; } - - my $flock_wait = time; - flock (PEND, LOCK_EX) || die "couldn't lock $pending_file: $!"; - $flock_wait = (time - $flock_wait); - - seek (PEND, 0, 0) || die "couldn't rewind $pending_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: locked $pending_file\n"; } - - - # Take N URLs off the top, and leave the rest. - # - while () { - if (--$count >= 0) { - if ($DEBUG > 3) { print STDERR " < $_"; } - s/[\r\n]+$//; - $urls[++$#urls] = $_; - } else { - $body .= $_; - if ($DEBUG > 3) { print STDERR " - $_"; } - $file_count++; - } - } - - # rewind and overwrite the file - seek (PEND, 0, 0) || die "couldn't rewind $pending_file: $!"; - truncate (PEND, 0) || die "couldn't truncate $pending_file: $!"; - print PEND $body; - - - # If there are fewer than 3x as many URLs as we took left in the file, - # then generate as many URLs as we can in N seconds. Take what we - # need from that, and append the rest to the file. Note that we are - # still holding a lock on the file. - # - # Count the time spent waiting for flock as time spent gathering URLs. - # Because that means someone else was doing it. - # - $body = ""; - if ($file_count < $count * 3) { - my $timeout = $url_generation_time - $flock_wait; - my $start = time; - - while (1) { - last if ($timeout <= 0); - - if ($DEBUG > 2) { print STDERR "time remaining: $timeout\n"; } - my ($base, $img) = pick_image ($timeout); - - if ($img) { - $img =~ s/ /%20/g; - $base =~ s/ /%20/g; - $_ = "$img $base"; - if ($count-- >= 0) { - if ($DEBUG > 3) { print STDERR " << $img\n"; } - $urls[++$#urls] = $_; - } else { - if ($DEBUG > 3) { print STDERR " >> $img\n"; } - print PEND "$_\n"; # append to file - $file_count++; - } - } +############################################################################## - my $now = time; - my $elapsed = $now - $start; - $timeout -= $elapsed; - $start = $now; +sub url_only_output { + do { + my ($base, $img) = pick_image; + if ($img) { + $base =~ s/ /%20/g; + $img =~ s/ /%20/g; + print "$img $base\n"; } - } - - my $of = select(PEND); $| = 1; select($of); # flush output - print PEND ""; - - flock (PEND, LOCK_UN) || die "couldn't unlock $pending_file: $!"; - close (PEND) || die "couldn't close $pending_file: $!"; - - if ($DEBUG > 2) { - print STDERR "jpeg: closed $pending_file; $file_count urls in file;" . - " returning $#urls.\n"; - } - - return @urls; + } while (1); } +############################################################################## +# +# Running as an xscreensaver module +# +############################################################################## -sub cgi_reset_all_files { - foreach (@all_files) { - my $file = $_; - local *OUT; - open (OUT, "+<$file") || die "couldn't open $file: $!"; - flock (OUT, LOCK_EX) || die "couldn't lock $file: $!"; - truncate (OUT, 0) || die "couldn't truncate $file: $!"; - flock (OUT, LOCK_UN) || die "couldn't unlock $file: $!"; - close (OUT) || die "couldn't close $file: $!"; - } - - system "ppmmake '#000000' $img_width $img_height > $image_ppm" || - die "failed to create blank $image_ppm file: $!"; - system "cjpeg -progressive $image_ppm > $image_jpg" || - die "failed to create blank $image_jpg file: $!"; +sub x_cleanup { + my ($sig) = @_; + if ($verbose > 0) { print STDERR "$progname: caught signal $sig.\n"; } + unlink $image_ppm, $image_tmp1, $image_tmp2; + exit 1; } -# Given the URL of an image and the page on which it was found, this will -# load the image, and paste it at a random spot in $image_ppm and $img_jpg. -# It will also update $map_file to contain the appropriate referer, and -# will limit it to $max_map_entries lines. +# Like system, but prints status about exit codes, and kills this process +# with whatever signal killed the sub-process, if any. # -sub cgi_paste_image { - my ($img, $referer) = @_; - - my ( $base, $body ) = get_document ($img, $referer); - return if (!$base || !$body); - - my ($iw, $ih) = image_size ($body); - return if (!$iw || !$ih); - - if ($DEBUG > 2) { print STDERR "got $base ($iw x $ih)\n"; } - - my $cmd; - - if ($base =~ m/\.gif$/i) { - $cmd = "giftopnm"; - } else { - $cmd = "djpeg"; - } - - if ($iw > $img_width || $ih > $img_height) { - while ($iw > $img_width || $ih > $img_height) { - $iw = int($iw / 2); - $ih = int($ih / 2); - } - $cmd .= " | pnmscale -xysize $iw $ih"; - } - - my $x = int (rand() * ($img_width - $iw)); - my $y = int (rand() * ($img_height - $ih)); - - $cmd .= " | pnmpaste - $x $y $image_ppm"; - - - local *MAP; - local *PIPE_OUT; - - # Open and lock the map (read/write.) - # rewind after locking, in case we had to wait for the lock. - # This lock doubles as our lock on the image file itself. - # - open (MAP, "+<$map_file") || die "couldn't open $map_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: opened $map_file\n"; } - - flock (MAP, LOCK_EX) || die "couldn't lock $map_file: $!"; - seek (MAP, 0, 0) || die "couldn't rewind $map_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: locked $map_file\n"; } - - # Read in the first hundred lines of the map file. - # - my $map = ""; - my $count = 0; - while () { - last if ($count++ > $max_map_entries); - $map .= $_; +sub nontrapping_system { + $! = 0; + + if ($verbose > 1) { + $_ = join(" ", @_); + s/\"[^\"]+\"/\"...\"/g; + print STDERR "$progname: executing \"$_\"\n"; } - # Add this entry to the front of the map data. - # - $map = "$x $y $iw $ih $referer\n" . $map; + my $rc = system @_; - - # Ensure that the $image_ppm file exists and has a ppm in it. - # - my $ppm_size = $img_width * $img_height * 3 * 2; - my $s = (stat($image_ppm))[7]; - if ($s < $ppm_size) { - - if ( $DEBUG ) { - print STDERR "$image_ppm is $s bytes;" . - " should be at least $ppm_size\n"; - print STDERR "resetting everything."; - cgi_reset_all_files(); + if ($rc == 0) { + if ($verbose > 1) { + print STDERR "$progname: subproc exited normally.\n"; } - } - - # Paste the bits into the image. Note that the map file is still locked. - # - local *TMP; - open (TMP, ">$image_tmp") || die "couldn't open $image_tmp: $!"; - close (TMP); - - if (! $DEBUG ) { - $cmd = "( $cmd ) 2>/dev/null"; - } - - $cmd .= " > $image_tmp"; - if ($DEBUG > 2) { print STDERR "executing $cmd\n"; } - - if (open(PIPE_OUT, "| $cmd")) { - print PIPE_OUT $body; - close(PIPE_OUT); - - if ($DEBUG > 2) { system "ls -ldF $image_tmp >&2"; } - - my @tmp_stat = stat($image_tmp); - if (@tmp_stat && $tmp_stat[7] < 200) { -# unlink ($image_tmp) || die "couldn't unlink $image_tmp: $!"; - open (OUT, ">$image_tmp") || die "$image_tmp unwritable: $!"; - close (OUT); - if ($DEBUG > 2) { print STDERR "FAILED writing $image_ppm\n"; } - } else { -# rename ($image_tmp, $image_ppm) || -# die "couldn't rename $image_tmp to $image_ppm: $!"; - local *IN; - local *OUT; - open (IN, "+<$image_tmp") || die "$image_tmp unreadable: $!"; - open (OUT, ">$image_ppm") || die "$image_ppm unwritable: $!"; - while () { print OUT $_; } - truncate (IN, 0) || die "couldn't truncate $image_tmp: $!"; - close (IN); - close (OUT) || die "couldn't write $image_ppm: $!"; - if ($DEBUG > 2) { print STDERR "wrote $image_ppm\n"; } - - - # Now convert the PPM to a JPEG. - # - system "cjpeg -progressive $image_ppm > $image_tmp 2>/dev/null"; - - @tmp_stat = stat($image_tmp); - if (@tmp_stat && $tmp_stat[7] < 200) { -# unlink ($image_tmp) || die "couldn't unlink $image_tmp: $!"; - open (OUT, ">$image_tmp") || die "$image_tmp unwritable: $!"; - close (OUT); - if ($DEBUG > 2) { print STDERR "FAILED writing $image_jpg\n"; } - } else { -# rename ($image_tmp, $image_ppm) || -# die "couldn't rename $image_tmp to $image_ppm: $!"; - open (IN, "+<$image_tmp") || die "$image_tmp unreadable: $!"; - open (OUT, ">$image_jpg") || die "$image_jpg unwritable: $!"; - while () { print OUT $_; } - truncate (IN, 0) || die "couldn't truncate $image_tmp: $!"; - close (IN); - close (OUT) || die "couldn't write $image_jpg: $!"; - if ($DEBUG > 2) { print STDERR "wrote $image_jpg\n"; } + } elsif (($rc & 0xff) == 0) { + $rc >>= 8; + if ($verbose) { + print "$progname: subproc exited with status $rc.\n"; + } + } else { + if ($rc & 0x80) { + if ($verbose) { + print "$progname: subproc dumped core.\n"; } + $rc &= ~0x80; } - - # Overwrite the map data. - # - seek (MAP, 0, 0) || die "couldn't rewind $map_file: $!"; - truncate (MAP, 0) || die "couldn't truncate $map_file: $!"; - print MAP $map; - } - - my $of = select(MAP); $| = 1; select($of); # flush output - print MAP ""; - - flock (MAP, LOCK_UN) || die "couldn't unlock $map_file: $!"; - close (MAP) || die "couldn't close $map_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: closed $map_file\n"; } -} - - -sub cgi_generate_image { - - $SIG{PIPE} = 'IGNORE'; - - my @urls = get_image_urls ($pastes_per_load); - my $end_time = time + $image_retrieval_time; - - if ($DEBUG > 2) { - print STDERR "loading $#urls images\n"; - } - - foreach (@urls) { - my ($img, $referer) = m/^([^ ]+) ([^ ]+)/; - if ($img) { - cgi_paste_image ($img, $referer); + if ($verbose) { + print "$progname: subproc died with signal $rc.\n"; } - last if (time > $end_time); + # die that way ourselves. + kill $rc, $$; } -} - - -sub cgi_sanity_check { - my $error = undef; - foreach (@all_files) { - if (! -e $_) { $error = "$_ does not exist.\n"; } - elsif (! -r $_) { $error = "$_ is unreadable.\n"; } - elsif (! -w $_) { $error = "$_ is unwritable.\n"; } - last if ($error); - } - - return unless $error; - - print "Content-Type: text/html\n"; - print "\n\nError$body_tag
Error
"; - print POSIX::getcwd() . "/" . $error . "
\n"; - - $_ = join(", ", @all_files); - s/,([^,]*)$/, and$1/; - print "Each of the files: $_\n"; - print " must exist and be readable and writable by the httpd process\n"; - print "(which probably means they must be globally readable and\n"; - print "writable, since on most systems, CGI scripts run as the\n"; - print "user nobody.)\n
\n"; - - exit (0); + return $rc; } -# Write the encapsulating HTML document and associated HTTP headers. -# This is fast -- it just writes out the wrapper document corresponding -# to the data currently on disk. It is the loading of the sub-image -# that does the real work. +# Given the URL of a GIF or JPEG image, and the body of that image, writes a +# PPM to the given output file. Returns the width/height of the image if +# successful. # -sub cgi_emit_html_document { - - cgi_sanity_check; - - my $map_file_date; - my $doc = $html_document; - - my $w2 = int ($img_width * $scale); - my $h2 = int ($img_height * $scale); - $doc =~ s/%%WIDTH%%/$w2/g; - $doc =~ s/%%HEIGHT%%/$h2/g; - - local *MAP; - open (MAP, "<$map_file") || die "couldn't open $map_file: $!"; - if ($DEBUG > 2) { print STDERR "html: opened $map_file\n"; } - - flock (MAP, LOCK_SH) || die "couldn't lock $map_file: $!"; - seek (MAP, 0, 0) || die "couldn't rewind $map_file: $!"; - if ($DEBUG > 2) { print STDERR "html: locked $map_file\n"; } - - $map_file_date = (stat(MAP))[9]; - - my $map = "\n"; - while () { - my ($x, $y, $w, $h, $url) = - m/^([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+) (.*)$/; - if ($w && $h) { - $x = int($x * $scale); - $y = int($y * $scale); - $w = int($w * $scale); - $h = int($h * $scale); - - # protect against URLs that contain <, >, or ". - $url =~ s/([<>\"])/uc sprintf("%%%02X",ord($1))/eg; - - my $x2 = $x + $w; - my $y2 = $y + $h; - $map .= - "\n"; - } - } - $map .= ""; - flock (MAP, LOCK_UN) || die "couldn't unlock $map_file: $!"; - close (MAP) || die "couldn't close $map_file: $!"; - - if ($DEBUG > 2) { print STDERR "html: closed $map_file\n"; } - - $doc =~ s/%%MAP%%/$map/g; - - my $img_name = "current"; +sub image_to_pnm { + my ($url, $body, $output) = @_; + my ($cmd, $cmd2, $w, $h); - $doc =~ s@%%IMAGE%%@images/$img_name.jpg@g; - - - my $mod_time = $map_file_date; - if ($script_date > $mod_time) { $mod_time = $script_date; } - - if (do_ifmod($mod_time)) { - return; - } - - my $exp = compute_expires_time($mod_time); - - print "Content-Type: text/html\n"; - print "Content-Length: " . length($doc) . "\n"; - print "Last-Modified: " . format_http_time($mod_time) . "\n"; - - # This is a suggestion to consider the object invalid after the given - # date. This is sometimes ignored. - # - print "Expires: " . format_http_time($exp) . "\n"; - - # This may or may not cause a cacheing proxy to pass this stuff along. - # It's not standardized, but was historically used for... something. - print "Pragma: no-cache\n"; - - # This says the same thing as the Expires header, but it is a stronger - # assertion that we're serious and should be listened to. - # - my $age = $exp - time; - print "Cache-Control: max-age=$age, must-revalidate\n"; - - print "\n"; - print $doc; -} - - -# Write the interior JPEG document and associated HTTP headers. -# -sub cgi_emit_jpeg_document { - - my $image_data = ""; - my $jpg_file_date; - my $do_ims = 0; - - # The map file is the means by which we hold write-locks on the image - # file. So first obtain the lock on that file. - # - local *MAP; - open (MAP, "+<$map_file") || die "couldn't open $map_file: $!"; - - if ($DEBUG > 2) { print STDERR "jpeg: opened $map_file\n"; } - flock (MAP, LOCK_SH) || die "couldn't lock $map_file: $!"; - if ($DEBUG > 2) { print STDERR "jpeg: locked $map_file\n"; } - - # Now we have exclusive access to the image file. Read it. - # - local *IMG; - open (IMG, "<$image_jpg") || die "couldn't open $image_jpg: $!"; - - $jpg_file_date = (stat(IMG))[9]; - - if (do_ifmod($jpg_file_date)) { - $do_ims = 1; - if ($DEBUG > 2) { - my $ims = $ENV{HTTP_IF_MODIFIED_SINCE}; - $ims =~ s/;.*//; - print STDERR "not-modified-since " . - localtime(parse_http_time($ims)) . "\n"; - print STDERR "jpg date: " . localtime($jpg_file_date) . "\n"; - } + if ((@_ = gif_size ($body))) { + ($w, $h) = @_; + $cmd = "giftopnm"; + } elsif ((@_ = jpeg_size ($body))) { + ($w, $h) = @_; + $cmd = "djpeg"; + } else { + return (); } - if (!$do_ims) { - while () { $image_data .= $_; } + $cmd2 = "exec $cmd"; # yes, this really is necessary. if we don't + # do this, the process doesn't die properly. + if ($verbose == 0) { + $cmd2 .= " 2>/dev/null"; } - close (IMG) || die "couldn't close $image_jpg: $!"; - - # Now free the lock so that others can write to the file. - # - flock (MAP, LOCK_UN) || die "couldn't unlock $map_file: $!"; - close (MAP) || die "couldn't close $map_file: $!"; - if ($DEBUG > 2) { print STDERR "jpeg: closed $map_file\n"; } - - return if ($do_ims); - - # At this point, we have the image data we will be returning. - # However, don't return it yet -- first go off and generate the - # *next* image, then we can return *this* one. If we don't do it - # in this order, people will jump the gun hitting reload, and no - # image updates will happen. - # - my $type = "image/jpeg"; - my $mod_time = $jpg_file_date; - if ($script_date > $mod_time) { $mod_time = $script_date; } - - print "Last-Modified: " . format_http_time($mod_time) . "\n"; - print "Expires: " . format_http_time(compute_expires_time($mod_time)) - . "\n"; - print "Content-Type: $type\n"; - print "Content-Length: " . length($image_data) . "\n"; - print "\n"; - - # Now, before returning the image data, go catatonic for a minute - # while we load some URLs and make the next image. + # There exist corrupted GIF and JPEG files that can make giftopnm and + # djpeg lose their minds and go into a loop. So this gives those programs + # a small timeout -- if they don't complete in time, kill them. # - cgi_generate_image; - - # Done setting up for next time -- now finish loading. - # - print $image_data; - $image_data = undef; -} - - -# Write the source code of this script as a text/plain document. -# -sub cgi_emit_source_document { - my $mod_time = $script_date; - - if (do_ifmod($mod_time)) { - return; - } - - print "Content-Type: text/plain\n"; - print "Last-Modified: " . format_http_time($mod_time) . "\n"; - print "\n"; - open (IN, "<$argv0") || die "couldn't open $argv0: $!"; - while () { - print; - } - close (IN); -} - - -# Parse the various environment variables to decide how we were invoked, -# and then do something about it. -# -sub cgi_main { - - $DEBUG=4; - - $ENV{PATH} .= ":/usr/local/bin"; - - # make sure the various programs we execute exist, right up front. - foreach ("ppmmake", "cjpeg", "djpeg", "giftopnm", "pnmpaste", "pnmscale") { - if (!which ($_)) { - print "Content-Type: text/html\n"; - print "\n\nError$body_tag
Error
"; - print "The $_ program was not found on \$PATH. \n"; - - my $p = $ENV{PATH}; - $p =~ s/%/%25/g; $p =~ s/\&/%26/g; - $p =~ s//%3E/g; - $p =~ s/:/:/g; - print "\$PATH is: $p\n"; - exit (0); - } - } - - $script_date = (stat($argv0))[9]; - - print "Blat: Foop\n"; - - if ($ENV{REQUEST_METHOD} && - $ENV{REQUEST_METHOD} ne "GET" && - $ENV{REQUEST_METHOD} ne "HEAD" ) { - print "Content-Type: text/html\n"; - print "\n\nError$body_tag Error"; - $_ = $ENV{REQUEST_METHOD}; - print "bad request method: $_\n"; - exit (0); - - } elsif ( $ENV{QUERY_STRING} ) { - if ( $ENV{QUERY_STRING} eq "reset" ) { - cgi_reset_all_files; + my $pid; + @_ = eval { + my $timed_out; - print "Content-Type: text/html\n"; - print "\n\nCollage Reset"; - print "$body_tagCollage Reset \n"; - exit (0); + local $SIG{ALRM} = sub { + if ($verbose > 0) { + print STDERR "$progname: timed out ($cvt_timeout) for " . + "$cmd on \"$url\" in pid $pid\n"; + } + kill ('TERM', $pid) if ($pid); + $timed_out = 1; + }; + + if (($pid = open(PIPE, "| $cmd2 > $output"))) { + $timed_out = 0; + alarm $cvt_timeout; + print PIPE $body; + close PIPE; + + if ($verbose > 3) { print STDERR "$progname: awaiting $pid\n"; } + waitpid ($pid, 0); + if ($verbose > 3) { print STDERR "$progname: $pid completed\n"; } + + + my $size = (stat($output))[7]; + if ($size < 5) { + if ($verbose) { + print STDERR "$progname: $cmd on ${w}x$h \"$url\" failed" . + " ($size bytes)\n"; + } + return (); + } + if ($verbose > 1) { + print STDERR "$progname: created ${w}x$h $output ($cmd)\n"; + } + return ($w, $h); } else { - print "Content-Type: text/html\n"; - print "\n\nError$body_tag Error"; - $_ = $ENV{QUERY_STRING}; - print "malformed URL: $_\n"; - exit (0); + print STDERR "$progname: $cmd failed: $!\n"; + return (); } - - } elsif ( !$ENV{PATH_INFO} || $ENV{PATH_INFO} eq "" ) { - # don't allow /webcollage as a URL -- force it to be /webcollage/ - print "Status: 301 Moved Permanently\n"; - print "Location: http://" . - ($ENV{HTTP_HOST} ? $ENV{HTTP_HOST} : - $ENV{SERVER_NAME} ? $ENV{SERVER_NAME} : "???") . - ($ENV{REQUEST_URI} ? $ENV{REQUEST_URI} : "") . - "/\n\n"; - exit (0); - - } elsif ( $ENV{PATH_INFO} eq "/" ) { - cgi_emit_html_document; - - } elsif ( $ENV{PATH_INFO} =~ m@^/images/[^/]+\.jpg$@ ) { - cgi_emit_jpeg_document; - - } elsif ( $ENV{PATH_INFO} eq "/webcollage.pl" ) { - cgi_emit_source_document; - + }; + die if ($@ && $@ ne "alarm\n"); # propagate errors + if ($@) { + # timed out + return (); } else { - print "Content-Type: text/html\n"; - print "\n\nError$body_tagError"; - $_ = $ENV{PATH_INFO}; - print "malformed URL: $_\n"; - exit (0); + # didn't + alarm 0; + return @_; } } - -############################################################################## -# -# Generating a list of urls only -# -############################################################################## - -sub url_only_output { - $| = 1; - do { - my ($base, $img) = pick_image; - if ($img) { - $base =~ s/ /%20/g; - $img =~ s/ /%20/g; - print "$img $base\n"; - } - } while (1); -} - -############################################################################## -# -# Running as an xscreensaver module -# -############################################################################## - -my $image_tmp2; -my $image_tmp3; - -sub x_cleanup { - if ($DEBUG > 0) { print STDERR "caught signal\n"; } - unlink $image_ppm, $image_tmp, $image_tmp2, $image_tmp3; - exit 1; -} - - sub x_output { my $win_cmd = $ppm_to_root_window_cmd; @@ -1417,7 +1011,7 @@ sub x_output { # make sure the various programs we execute exist, right up front. foreach ("ppmmake", "giftopnm", "djpeg", "pnmpaste", "pnmscale", - $win_cmd) { + "pnmcut", $win_cmd) { which ($_) || die "$progname: $_ not found on \$PATH.\n"; } @@ -1435,7 +1029,7 @@ sub x_output { $_ = "xdpyinfo"; which ($_) || die "$progname: $_ not found on \$PATH.\n"; $_ = `$_`; - ($img_width, $img_height) = m/dimensions: *([0-9]+)x([0-9]+) /; + ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /; } my $bgcolor = "#000000"; @@ -1448,7 +1042,8 @@ sub x_output { $bgimage = $background; } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) { - print STDERR "not a color or readable file: $background\n"; + print STDERR "$progname: not a color or readable file: " . + "$background\n"; exit 1; } else { # default to assuming it's a color @@ -1459,182 +1054,324 @@ sub x_output { # Create the sold-colored base image. # $_ = "ppmmake '$bgcolor' $img_width $img_height"; - if ($DEBUG > 1) { - print STDERR "creating base image: $_\n"; + if ($verbose > 1) { + print STDERR "$progname: creating base image: $_\n"; } - system "$_ > $image_ppm"; + nontrapping_system "$_ > $image_ppm"; # Paste the default background image in the middle of it. # if ($bgimage) { my ($iw, $ih); - if (open(IMG, "<$bgimage")) { - $_ = ; - $_ = ; - ($iw, $ih) = m/^([0-9]+) ([0-9]+)$/; - close (IMG); + + my $body = ""; + local *IMG; + open(IMG, "<$bgimage") || die ("couldn't open $bgimage: $!\n"); + my $cmd; + while () { $body .= $_; } + close (IMG); + if ((@_ = gif_size ($body))) { + ($iw, $ih) = @_; + $cmd = "giftopnm |"; + } elsif ((@_ = jpeg_size ($body))) { + ($iw, $ih) = @_; + $cmd = "djpeg |"; + } elsif ($body =~ "^P\d\n(\d+) (\d+)\n") { + $iw = $1; + $ih = $2; + $cmd = ""; + } else { + die "$progname: $bgimage is not a GIF, JPEG, or PPM.\n"; } - my $x = int (($img_width - $iw) / 2); + + my $x = int (($img_width - $iw) / 2); my $y = int (($img_height - $ih) / 2); - if ($DEBUG > 1) { - print STDERR "pasting $bgimage into base image at $x, $y\n"; + if ($verbose > 1) { + print STDERR "$progname: pasting $bgimage (${iw}x$ih) into base ". + "image at $x,$y\n"; } - system "pnmpaste $bgimage $x $y $image_ppm > $image_tmp2" . - " && mv $image_tmp2 $image_ppm"; - } - - do { - my ($base, $img) = pick_image; + $cmd .= "pnmpaste - $x $y $image_ppm > $image_tmp1"; + open (IMG, "| $cmd") || die ("running $cmd: $!\n"); + print IMG $body; + close (IMG); + if ($verbose > 1) { + print STDERR "$progname: subproc exited normally.\n"; + } + rename ($image_tmp1, $image_ppm) || + die ("renaming $image_tmp1 to $image_ppm: $!\n"); + } - my ($headers, $body); + while (1) { + my ($base, $img, $source) = pick_image(); if ($img) { - ($headers, $body) = get_document ($img, $base); + my ($headers, $body) = get_document ($img, $base); + if ($body) { + handle_image ($base, $img, $body, $source); + } } + unlink $image_tmp1, $image_tmp2; + sleep $delay; + } +} - if ($body) { +sub handle_image { + my ($base, $img, $body, $source) = @_; - if ($DEBUG > 0) { - print STDERR "got $img (" . length($body) . ")\n"; - } + if ($verbose > 1) { + print STDERR "$progname: got $img (" . length($body) . ")\n"; + } - my $cmd; - if ($img =~ m/\.gif/i) { - $cmd = "giftopnm"; - } else { - $cmd = "djpeg"; - } + my ($iw, $ih) = image_to_pnm ($img, $body, $image_tmp1); + return 0 unless ($iw && $ih); - if ($DEBUG == 0) { - $cmd .= " 2>/dev/null"; - } + my $ow = $iw; # used only for error messages + my $oh = $ih; - if (open(PIPE, "| $cmd > $image_tmp")) { - print PIPE $body; - close PIPE; + # don't just tack this onto the front of the pipeline -- we want it to + # be able to change the size of the input image. + # + if ($filter_cmd) { + if ($verbose > 1) { + print STDERR "$progname: running $filter_cmd\n"; + } - if ($DEBUG > 1) { - print STDERR "created $image_tmp ($cmd)\n"; - } + my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2"; + if ($rc != 0) { + if ($verbose) { + print STDERR "$progname: failed command: \"$filter_cmd\"\n"; + print STDERR "$progname: failed url: \"$img\" (${ow}x$oh)\n"; } + return; + } + rename ($image_tmp2, $image_tmp1); + + # re-get the width/height in case the filter resized it. + local *IMG; + open(IMG, "<$image_tmp1") || return 0; + $_ = ; + $_ = ; + ($iw, $ih) = m/^(\d+) (\d+)$/; + close (IMG); + return 0 unless ($iw && $ih); + } - if (-s $image_tmp) { + my $target_w = $img_width; + my $target_h = $img_height; - if ($filter_cmd) { - if ($DEBUG > 1) { - print STDERR "running $filter_cmd\n"; - } - system "($filter_cmd) < $image_tmp > $image_tmp3" . - " && mv $image_tmp3 $image_tmp"; - } + my $cmd = ""; - my ($iw, $ih); - if (open(IMG, "<$image_tmp")) { - $_ = ; - $_ = ; - ($iw, $ih) = m/^([0-9]+) ([0-9]+)$/; - close (IMG); - } - if ($iw && $ih) { + # Usually scale the image to fit on the screen -- but sometimes scale it + # to fit on half or a quarter of the screen. Note that we don't merely + # scale it to fit, we instead cut it in half until it fits -- that should + # give a wider distribution of sizes. + # + if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } + if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } - if ($DEBUG > 1) { - print STDERR "image size is $iw x $ih\n"; - } + if ($iw > $target_w || $ih > $target_h) { + while ($iw > $target_w || + $ih > $target_h) { + $iw = int($iw / 2); + $ih = int($ih / 2); + } + if ($iw <= 10 || $ih <= 10) { + if ($verbose > 1) { + print STDERR "$progname: scaling to ${iw}x$ih would " . + "have been bogus.\n"; + } + return 0; + } - if ($iw > $img_width || $ih > $img_height) { - while ($iw > $img_width || - $ih > $img_height) { - $iw = int($iw / 2); - $ih = int($ih / 2); - } - if ($DEBUG > 1) { - print STDERR "scaling to $iw x $ih\n"; - } - system "pnmscale -xysize $iw $ih $image_tmp" . - " > $image_tmp2" . - " 2>/dev/null && mv $image_tmp2 $image_tmp"; - } + if ($verbose > 1) { + print STDERR "$progname: scaling to ${iw}x$ih\n"; + } - my $x = int (rand() * ($img_width - $iw)); - my $y = int (rand() * ($img_height - $ih)); + $cmd .= " | pnmscale -xsize $iw -ysize $ih"; + } - if ($DEBUG > 1) { - print STDERR "pasting at $x, $y in $image_ppm\n"; - } - system "pnmpaste $image_tmp $x $y $image_ppm" . - " > $image_tmp2" . - " && mv $image_tmp2 $image_ppm"; + my $src = $image_tmp1; + my $crop_x = 0; # the sub-rectangle of the image + my $crop_y = 0; # that we will actually paste. + my $crop_w = $iw; + my $crop_h = $ih; - my $target = $image_ppm; - if ($post_filter_cmd) { - if ($DEBUG > 1) { - print STDERR "running $post_filter_cmd\n"; - } - system "($post_filter_cmd) < $image_ppm > $image_tmp3"; - $target = $image_tmp3; - } + # The chance that we will randomly crop out a section of an image starts + # out fairly low, but goes up for images that are very large, or images + # that have ratios that make them look like banners (we try to avoid + # banner images entirely, but they slip through when the IMG tags didn't + # have WIDTH and HEIGHT specified.) + # + my $crop_chance = 0.2; + if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) { + $crop_chance += 0.2; + } + if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) { + $crop_chance += 0.2; + } + if ($min_ratio && ($iw * $min_ratio) > $ih) { + $crop_chance += 0.7; + } - if (!$no_output_p) { + if ($verbose > 2 && $crop_chance > 0.1) { + print STDERR "$progname: crop chance: $crop_chance\n"; + } - my $tsize = (stat($target))[7]; - if ($tsize > 200) { - $_ = $ppm_to_root_window_cmd; - s/%%PPM%%/$target/; + if (rand() < $crop_chance) { - if ($DEBUG > 1) { - print STDERR "running $_\n"; - } - system $_; + my $ow = $crop_w; + my $oh = $crop_h; - } elsif ($DEBUG > 1) { - print STDERR "$target size is $tsize\n"; - } - } - } - } - unlink $image_tmp, $image_tmp2, $image_tmp3; + if ($crop_w > $min_width) { + # if it's a banner, select the width linearly. + # otherwise, select a bell. + my $r = (($min_ratio && ($iw * $min_ratio) > $ih) + ? rand() + : bellrand()); + $crop_w = $min_width + int ($r * ($crop_w - $min_width)); + $crop_x = int (rand() * ($ow - $crop_w)); + } + if ($crop_h > $min_height) { + # height always selects as a bell. + $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height)); + $crop_y = int (rand() * ($oh - $crop_h)); } - sleep $delay; + if ($verbose > 1 && + ($crop_x != 0 || $crop_y != 0 || + $crop_w != $iw || $crop_h != $ih)) { + print STDERR "$progname: randomly cropping to " . + "${crop_w}x$crop_h \@ $crop_x,$crop_y\n"; + } + } - } while (1); -} + # Where the image should logically land -- this might be negative. + # + my $x = int((rand() * ($img_width + $crop_w/2)) - $crop_w*3/4); + my $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4); + # if we have chosen to paste the image outside of the rectangle of the + # screen, then we need to crop it. + # + if ($x < 0 || + $y < 0 || + $x + $crop_w > $img_width || + $y + $crop_h > $img_height) { + + if ($verbose > 1) { + print STDERR "$progname: cropping for effective paste of " . + "${crop_w}x$crop_h \@ $x,$y\n"; + } + + if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; } + if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; } -sub x_main { + if ($x + $crop_w >= $img_width) { $crop_w = $img_width - $x - 1; } + if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; } + } - # Unlike CGI, when running in X mode, the various tmp files should be - # in the /tmp directory and should have gensymed names. + # If any cropping needs to happen, add pnmcut. # - $image_ppm = ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp") . "/webcollage." . $$; - $image_tmp = $image_ppm . "-1"; - $image_tmp2 = $image_ppm . "-2"; - $image_tmp3 = $image_ppm . "-3"; + if ($crop_x != 0 || $crop_y != 0 || + $crop_w != $iw || $crop_h != $ih) { + $iw = $crop_w; + $ih = $crop_h; + $cmd .= " | pnmcut $crop_x $crop_y $iw $ih"; + if ($verbose > 1) { + print STDERR "$progname: cropping to ${crop_w}x$crop_h \@ " . + "$crop_x,$crop_y\n"; + } + } + + if ($verbose > 1) { + print STDERR "$progname: pasting ${iw}x$ih \@ $x,$y in $image_ppm\n"; + } + + $cmd .= " | pnmpaste - $x $y $image_ppm"; + + $cmd =~ s@^ *\| *@@; + my $rc = nontrapping_system "($cmd) < $image_tmp1 > $image_tmp2"; - # In X mode, these aren't used. Set them to undef to error if we try. + if ($rc != 0) { + if ($verbose) { + print STDERR "$progname: failed command: \"$cmd\"\n"; + print STDERR "$progname: failed url: \"$img\" (${ow}x$oh)\n"; + } + return; + } + + rename ($image_tmp2, $image_ppm) || return; + + my $target = "$image_ppm"; + + # don't just tack this onto the end of the pipeline -- we don't want it + # to end up in $image_ppm, because we don't want the results to be + # cumulative. # - $data_dir = undef; - $image_jpg = undef; - $pending_file = undef; - $map_file = undef; - $url_generation_time = undef; - $image_retrieval_time = undef; - $max_map_entries = undef; - $pastes_per_load = undef; - $max_age = undef; - $script_date = undef; - @all_files = undef; - - # In X mode, these come either from the command line, or from the X server. - $img_width = undef; - $img_height = undef; + if ($post_filter_cmd) { + $target = $image_tmp1; + $rc = nontrapping_system "($post_filter_cmd) < $image_ppm > $target"; + if ($rc != 0) { + if ($verbose) { + print STDERR "$progname: filter failed: " . + "\"$post_filter_cmd\"\n"; + } + return; + } + } + + if (!$no_output_p) { + my $tsize = (stat($target))[7]; + if ($tsize > 200) { + $cmd = $ppm_to_root_window_cmd; + $cmd =~ s/%%PPM%%/$target/; + + # xv seems to hate being killed. it tends to forget to clean + # up after itself, and leaves windows around and colors allocated. + # I had this same problem with vidwhacker, and I'm not entirely + # sure what I did to fix it. But, let's try this: launch xv + # in the background, so that killing this process doesn't kill it. + # it will die of its own accord soon enough. So this means we + # start pumping bits to the root window in parallel with starting + # the next network retrieval, which is probably a better thing + # to do anyway. + # + $cmd .= "&"; + + $rc = nontrapping_system ($cmd); + + if ($rc != 0) { + if ($verbose) { + print STDERR "$progname: display failed: \"$cmd\"\n"; + } + return; + } + + } elsif ($verbose > 1) { + print STDERR "$progname: $target size is $tsize\n"; + } + } + + if ($verbose > 0) { + print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n"; + } + + return 1; +} +sub main { + $| = 1; + srand(time ^ $$); + my $root_p = 0; + # historical suckage: the environment variable name is lower case. + $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY}; + while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "-display" || @@ -1652,9 +1389,9 @@ sub x_main { $urls_only_p = 1; $no_output_p = 1; } elsif ($_ eq "-verbose") { - $DEBUG++; + $verbose++; } elsif (m/^-v+$/) { - $DEBUG += length($_)-1; + $verbose += length($_)-1; } elsif ($_ eq "-delay") { $delay = shift @ARGV; } elsif ($_ eq "-timeout") { @@ -1667,20 +1404,31 @@ sub x_main { $background = shift @ARGV; } elsif ($_ eq "-size") { $_ = shift @ARGV; - if (m@^([0-9]+)x([0-9]+)$@) { + if (m@^(\d+)x(\d+)$@) { $img_width = $1; $img_height = $2; } else { die "$progname: argument to \"-size\" must be" . " of the form \"640x400\"\n"; } + } elsif ($_ eq "-proxy" || $_ eq "-http-proxy") { + $http_proxy = shift @ARGV; } else { die "$copyright\nusage: $progname [-root]" . " [-display dpy] [-root] [-verbose] [-timeout secs]\n" . - "\t\t [-delay secs] [-filter cmd] [-filter2 cmd]\n"; + "\t\t [-delay secs] [-filter cmd] [-filter2 cmd]\n" . + "\t\t [-http-proxy host[:port]]\n"; } } + if ($http_proxy && $http_proxy eq "") { + $http_proxy = undef; + } + if ($http_proxy && $http_proxy =~ m@^http://([^/]*)/?$@ ) { + # historical suckage: allow "http://host:port" as well as "host:port". + $http_proxy = $1; + } + if (!$root_p && !$no_output_p) { die "$copyright" . "$progname: the -root argument is manditory (for now.)\n"; @@ -1697,21 +1445,5 @@ sub x_main { } } - -############################################################################## -# -# Decide if we're in X or CGI mode, and dispatch. -# -############################################################################## - -sub main { - srand(time ^ $$); - if ( $progname =~ m/\.cgi$/i || $ENV{REQUEST_METHOD} ) { - cgi_main; - } else { - x_main; - } -} - main; exit (0);