2 # Copyright © 2003, 2004 Jamie Zawinski <jwz@jwz.org>
4 # Permission to use, copy, modify, distribute, and sell this software and its
5 # documentation for any purpose is hereby granted without fee, provided that
6 # the above copyright notice appear in all copies and that both that
7 # copyright notice and this permission notice appear in supporting
8 # documentation. No representations are made about the suitability of this
9 # software for any purpose. It is provided "as is" without express or
12 # Created: 30-Aug-2003.
14 # Spits out the text of the most recent public posts on livejournal.com.
15 # This works as the "-program" argument to phosphor, starwars, etc.
20 # We can't "use diagnostics" here, because that library malfunctions if
21 # you signal and catch alarms: it says "Uncaught exception from user code"
22 # and exits, even though I damned well AM catching it!
26 use Text::Wrap qw(wrap);
27 use bytes; # Larry can take Unicode and shove it up his ass sideways.
29 my $progname = $0; $progname =~ s@.*/@@g;
30 my $version = q{ $Revision: 1.10 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
34 my $url = "http://www.livejournal.com/stats/latest-rss.bml";
36 my $http_proxy = undef;
37 my $http_timeout = 30;
38 my $http_timeout2 = 5;
41 # Maps HTML character entities to the corresponding Latin1 characters.
44 "quot" => '"', "amp" => '&', "lt" => '<', "gt" => '>',
45 "nbsp" => ' ', "iexcl" => '¡', "cent" => '¢', "pound" => '£',
46 "curren" => '¤', "yen" => '¥', "brvbar" => '¦', "sect" => '§',
47 "uml" => '¨', "copy" => '©', "ordf" => 'ª', "laquo" => '«',
48 "not" => '¬', "shy" => '', "reg" => '®', "macr" => '¯',
49 "deg" => '°', "plusmn" => '±', "sup2" => '²', "sup3" => '³',
50 "acute" => '´', "micro" => 'µ', "para" => '¶', "middot" => '·',
51 "cedil" => '¸', "sup1" => '¹', "ordm" => 'º', "raquo" => '»',
52 "frac14" => '¼', "frac12" => '½', "frac34" => '¾', "iquest" => '¿',
53 "Agrave" => 'À', "Aacute" => 'Á', "Acirc" => 'Â', "Atilde" => 'Ã',
54 "Auml" => 'Ä', "Aring" => 'Å', "AElig" => 'Æ', "Ccedil" => 'Ç',
55 "Egrave" => 'È', "Eacute" => 'É', "Ecirc" => 'Ê', "Euml" => 'Ë',
56 "Igrave" => 'Ì', "Iacute" => 'Í', "Icirc" => 'Î', "Iuml" => 'Ï',
57 "ETH" => 'Ð', "Ntilde" => 'Ñ', "Ograve" => 'Ò', "Oacute" => 'Ó',
58 "Ocirc" => 'Ô', "Otilde" => 'Õ', "Ouml" => 'Ö', "times" => '×',
59 "Oslash" => 'Ø', "Ugrave" => 'Ù', "Uacute" => 'Ú', "Ucirc" => 'Û',
60 "Uuml" => 'Ü', "Yacute" => 'Ý', "THORN" => 'Þ', "szlig" => 'ß',
61 "agrave" => 'à', "aacute" => 'á', "acirc" => 'â', "atilde" => 'ã',
62 "auml" => 'ä', "aring" => 'å', "aelig" => 'æ', "ccedil" => 'ç',
63 "egrave" => 'è', "eacute" => 'é', "ecirc" => 'ê', "euml" => 'ë',
64 "igrave" => 'ì', "iacute" => 'í', "icirc" => 'î', "iuml" => 'ï',
65 "eth" => 'ð', "ntilde" => 'ñ', "ograve" => 'ò', "oacute" => 'ó',
66 "ocirc" => 'ô', "otilde" => 'õ', "ouml" => 'ö', "divide" => '÷',
67 "oslash" => 'ø', "ugrave" => 'ù', "uacute" => 'ú', "ucirc" => 'û',
68 "uuml" => 'ü', "yacute" => 'ý', "thorn" => 'þ', "yuml" => 'ÿ',
72 # Maps certain UTF8 characters (2 or 3 bytes) to the corresponding
75 my %unicode_latin1_table = (
76 "\xC2\xA1" => '¡', "\xC2\xA2" => '¢', "\xC2\xA3" => '£', "\xC2\xA4" => '¤',
77 "\xC2\xA5" => '¥', "\xC2\xA6" => '¦', "\xC2\xA7" => '§', "\xC2\xA8" => '¨',
78 "\xC2\xA9" => '©', "\xC2\xAA" => 'ª', "\xC2\xAB" => '«', "\xC2\xAC" => '¬',
79 "\xC2\xAD" => '', "\xC2\xAE" => '®', "\xC2\xAF" => '¯', "\xC2\xB0" => '°',
80 "\xC2\xB1" => '±', "\xC2\xB2" => '²', "\xC2\xB3" => '³', "\xC2\xB4" => '´',
81 "\xC2\xB5" => 'µ', "\xC2\xB6" => '¶', "\xC2\xB7" => '·', "\xC2\xB8" => '¸',
82 "\xC2\xB9" => '¹', "\xC2\xBA" => 'º', "\xC2\xBB" => '»', "\xC2\xBC" => '¼',
83 "\xC2\xBD" => '½', "\xC2\xBE" => '¾', "\xC2\xBF" => '¿', "\xC3\x80" => 'À',
84 "\xC3\x81" => 'Á', "\xC3\x82" => 'Â', "\xC3\x83" => 'Ã', "\xC3\x84" => 'Ä',
85 "\xC3\x85" => 'Å', "\xC3\x86" => 'Æ', "\xC3\x87" => 'Ç', "\xC3\x88" => 'È',
86 "\xC3\x89" => 'É', "\xC3\x8A" => 'Ê', "\xC3\x8B" => 'Ë', "\xC3\x8C" => 'Ì',
87 "\xC3\x8D" => 'Í', "\xC3\x8E" => 'Î', "\xC3\x8F" => 'Ï', "\xC3\x90" => 'Ð',
88 "\xC3\x91" => 'Ñ', "\xC3\x92" => 'Ò', "\xC3\x93" => 'Ó', "\xC3\x94" => 'Ô',
89 "\xC3\x95" => 'Õ', "\xC3\x96" => 'Ö', "\xC3\x97" => '×', "\xC3\x98" => 'Ø',
90 "\xC3\x99" => 'Ù', "\xC3\x9A" => 'Ú', "\xC3\x9B" => 'Û', "\xC3\x9C" => 'Ü',
91 "\xC3\x9D" => 'Ý', "\xC3\x9E" => 'Þ', "\xC3\x9F" => 'ß', "\xC3\xA0" => 'à',
92 "\xC3\xA1" => 'á', "\xC3\xA2" => 'â', "\xC3\xA3" => 'ã', "\xC3\xA4" => 'ä',
93 "\xC3\xA5" => 'å', "\xC3\xA6" => 'æ', "\xC3\xA7" => 'ç', "\xC3\xA8" => 'è',
94 "\xC3\xA9" => 'é', "\xC3\xAA" => 'ê', "\xC3\xAB" => 'ë', "\xC3\xAC" => 'ì',
95 "\xC3\xAD" => 'í', "\xC3\xAE" => 'î', "\xC3\xAF" => 'ï', "\xC3\xB0" => 'ð',
96 "\xC3\xB1" => 'ñ', "\xC3\xB2" => 'ò', "\xC3\xB3" => 'ó', "\xC3\xB4" => 'ô',
97 "\xC3\xB5" => 'õ', "\xC3\xB6" => 'ö', "\xC3\xB7" => '÷', "\xC3\xB8" => 'ø',
98 "\xC3\xB9" => 'ù', "\xC3\xBA" => 'ú', "\xC3\xBB" => 'û', "\xC3\xBC" => 'ü',
99 "\xC3\xBD" => 'ý', "\xC3\xBE" => 'þ', "\xC3\xBF" => 'ÿ',
101 "\xE2\x80\x93" => '--', "\xE2\x80\x94" => '--',
102 "\xE2\x80\x98" => '`', "\xE2\x80\x99" => '\'',
103 "\xE2\x80\x9C" => "``", "\xE2\x80\x9D" => "''",
104 "\xE2\x80\xA6" => '...',
108 # Convert any HTML entities to Latin1 characters.
112 $text =~ s/(&(\#)?([[:alpha:]\d]+);?)/
116 $c = chr($3); # the &#number is always decimal, right?
118 $c = $entity_table{$3};
120 # print STDERR "$progname: warning: unknown HTML character entity \"$1\"\n"
129 # Convert any Unicode characters to Latin1 if possible.
130 # Unconvertable bytes are left alone.
134 foreach my $key (keys (%unicode_latin1_table)) {
135 my $val = $unicode_latin1_table{$key};
136 $text =~ s/$key/$val/gs;
142 # returns three values: the HTTP response line; the document headers;
143 # and the document body.
148 my $timeout = $http_timeout;
149 my $timeout2 = $http_timeout2;
151 print STDERR "$progname: loading $url\n" if ($verbose);
153 if (! ($url =~ m@^http://@i)) {
154 error ("not an HTTP URL: $url");
157 my ($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4);
158 $path = "" unless $path;
160 my ($them,$port) = split(/:/, $serverstring);
161 $port = 80 unless $port;
166 $serverstring = $http_proxy if $http_proxy;
167 $serverstring =~ s@^[a-z]+://@@;
168 ($them2,$port2) = split(/:/, $serverstring);
169 $port2 = 80 unless $port2;
172 my ($remote, $iaddr, $paddr, $proto, $line);
174 if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') }
176 error ("unrecognised port in $url");
178 $iaddr = inet_aton($remote);
180 error ("host not found: $remote");
182 $paddr = sockaddr_in($port2, $iaddr);
190 local $SIG{ALRM} = sub {
192 print STDERR "$progname: timed out ($timeout) in headers for $url\n";
194 print STDERR "$progname: timed out ($timeout2) in body for $url\n";
200 $proto = getprotobyname('tcp');
201 if (!socket(S, PF_INET, SOCK_STREAM, $proto)) {
202 error ("socket: $!");
204 if (!connect(S, $paddr)) {
205 error ("connect($serverstring): $!");
208 select(S); $| = 1; select(STDOUT);
210 my $user_agent = "$progname/$version";
212 my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" .
214 "User-Agent: $user_agent\r\n";
218 foreach (split('\r?\n', $hdrs)) {
219 print STDERR " ==> $_\n";
224 my $http = <S> || "";
228 print STDERR " <== $_\n" if ($verbose > 1);
234 print STDERR " <== $_\n" if ($verbose > 1);
242 # we wait $timeout secs to get the first body line; after
243 # that, we time out if we haven't received a subsequent line
244 # in $timeout2 seconds.
249 print STDERR (" <== [ body ]: $lines lines, " .
250 length($body) . " bytes\n")
256 print STDERR "$progname: null response: $url\n" if ($verbose);
259 return ( $http, $head, $body );
261 die if ($@ && $@ ne "alarm\n"); # propagate errors
274 my ($images_p, $count, $cols) = @_;
276 $|=1; # unbuffer stdout
279 s@^[a-z]+:/+([^/?\#]+).*$@$1@;
282 print STDOUT "Contacting $host..." if ($verbose);
284 my ($http, $head, $body) = get_document ($url);
287 print STDOUT "$progname: no response from $host\n";
291 print STDOUT "\n\n" if ($verbose);
293 $body =~ s/(<item\b)/\001\001$1/gsi;
294 my @items = split (/\001\001/, $body);
296 # Let's skip forward in the stream by a random amount, so that if
297 # two copies of ljlatest are running at the same time (e.g., on a
298 # multi-headed machine), they get different text. (Put the items
299 # that we take off the front back on the back.)
302 my $n = int (rand ($#items - 5));
304 push @items, (shift @items);
310 next unless m/^<item\b/i;
311 last if (defined ($count) && $i >= $count);
313 my ($ig0, $title) = m@<(TITLE [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
314 my ($ig1, $body) = m@<(DESCRIPTION [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
315 my ($ig2, $url) = m@<(LINK [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
317 $_ = "$title\n\n$body";
319 s@<[^<>]*>@@gs; # lose all XML tags
320 $_ = de_unicoddle ($_); # convert UTF8 to Latin1
321 $_ = de_entify ($_); # convert entities to get HTML from XML
325 foreach (split (/\001\001/, $_)) {
326 next unless m/^(<img\b[^<>]+>)/i;
328 my ($src) = m/\bSRC \s*=\s*[\"\']?([^<>\"\'\s]+)/xsi;
330 next if ($src =~ m@^http://[^./]+\.livejournal\.com\b@); # builtins
332 my ($w) = m/\bWIDTH \s*=\s*[\"\']?(\d+)/xsi;
333 my ($h) = m/\bHEIGHT \s*=\s*[\"\']?(\d+)/xsi;
335 $_ = "<A HREF=\"$url\"><IMG SRC=\"$src\"";
336 $_ .= " WIDTH=$w" if ($w);
337 $_ .= " HEIGHT=$h" if ($h);
338 $_ .= " BORDER=1 HSPACE=4 VSPACE=4></A><BR>\n";
343 } else { # emit text/plain
345 s@</?(BR|TR|TD|LI|DIV)\b[^<>]*>@\n@gsi; # line break at BR, TD, DIV, etc
346 s@</?(P|UL|OL|BLOCKQUOTE)\b[^<>]*>@\n\n@gsi; # two line breaks
348 s@<lj\s+user=\"?([^<>\"]+)\"?[^<>]*>?@$1@gsi; # handle <LJ USER=>
349 s@</?[BI]>@*@gsi; # bold, italic => asterisks
351 s@<[^<>]*>?@@gs; # lose all other HTML tags
352 $_ = de_entify ($_); # convert entities in the html too
354 # elide any remaining non-Latin1 binary data...
355 s/([\177-\377]+(\s*[\177-\377]+)[^a-z\d]*)/«...» /g;
356 #s/([\177-\377]+(\s*[\177-\377]+)[^a-z\d]*)/«$1» /g;
360 s/[ \t]*$//gm; # lose whitespace at end of line
361 s@\n\n\n+@\n\n@gs; # compress blank lines
363 $Text::Wrap::columns = $cols;
364 $_ = wrap ("", " ", $_); # wrap the lines as a paragraph
366 s/[ \t]*$//gm; # lose whitespace at end of line again
367 s/^\s+//s; # de-indent first line
368 $_ .= "\n"; # blank line at very end
378 print STDERR "$progname: $_\n";
383 print STDERR "usage: $progname [--verbose] [--count N] [--columns N]" .
384 " [--images] [rss-url]\n";
392 while ($_ = $ARGV[0]) {
394 if ($_ eq "--verbose") { $verbose++; }
395 elsif (m/^-v+$/) { $verbose += length($_)-1; }
396 elsif ($_ eq "--count") { $count = 0 + shift @ARGV; }
397 elsif ($_ eq "--images") { $images_p = 1; }
398 elsif ($_ eq "--columns" ||
402 $cols = 0 + shift @ARGV; }
403 elsif (m/^-./) { usage; }
404 elsif (m@^http://@) { $url = $_; }
408 # historical suckage: the environment variable name is lower case.
409 $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
411 lj_latest ($images_p, $count, $cols);