#!/usr/bin/perl -w
-# Copyright © 2005-2014 Jamie Zawinski <jwz@jwz.org>
+# Copyright © 2005-2015 Jamie Zawinski <jwz@jwz.org>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# Only error out if we're actually loading a URL instead of local data.
BEGIN { eval 'use LWP::UserAgent;' }
+# Not sure how prevalent this is. Hope it's part of the default install.
+BEGIN { eval 'use HTML::Entities;' }
+
use Socket;
use POSIX qw(strftime);
use Text::Wrap qw(wrap);
use bytes;
my $progname = $0; $progname =~ s@.*/@@g;
-my ($version) = ('$Revision: 1.33 $' =~ m/\s(\d[.\d]+)\s/s);
+my ($version) = ('$Revision: 1.39 $' =~ m/\s(\d[.\d]+)\s/s);
my $verbose = 0;
my $http_proxy = undef;
my $text_literal = '';
my $text_file = '';
my $text_program = '';
-my $text_url = 'http://en.wikipedia.org/w/index.php?title=Special:NewPages&feed=rss';
+my $text_url = 'https://en.wikipedia.org/w/index.php?title=Special:NewPages&feed=rss';
# Default URL needs to be set and match what's in OSX/XScreenSaverView.m
my $wrap_columns = undef;
my $truncate_lines = undef;
+my $latin1_p = 0;
my $nyarlathotep_p = 0;
-# Maps HTML character entities to the corresponding Latin1 characters.
-#
-my %entity_table = (
- "quot" => '"', "amp" => '&', "lt" => '<',
- "gt" => '>', "nbsp" => " ", "iexcl" => "\xA1",
- "cent" => "\xA2", "pound" => "\xA3", "curren" => "\xA4",
- "yen" => "\xA5", "brvbar" => "\xA6", "sect" => "\xA7",
- "uml" => "\xA8", "copy" => "\xA9", "ordf" => "\xAA",
- "laquo" => "\xAB", "not" => "\xAC", "shy" => "\xAD",
- "reg" => "\xAE", "macr" => "\xAF", "deg" => "\xB0",
- "plusmn" => "\xB1", "sup2" => "\xB2", "sup3" => "\xB3",
- "acute" => "\xB4", "micro" => "\xB5", "para" => "\xB6",
- "middot" => "\xB7", "cedil" => "\xB8", "sup1" => "\xB9",
- "ordm" => "\xBA", "raquo" => "\xBB", "frac14" => "\xBC",
- "frac12" => "\xBD", "frac34" => "\xBE", "iquest" => "\xBF",
- "Agrave" => "\xC0", "Aacute" => "\xC1", "Acirc" => "\xC2",
- "Atilde" => "\xC3", "Auml" => "\xC4", "Aring" => "\xC5",
- "AElig" => "\xC6", "Ccedil" => "\xC7", "Egrave" => "\xC8",
- "Eacute" => "\xC9", "Ecirc" => "\xCA", "Euml" => "\xCB",
- "Igrave" => "\xCC", "Iacute" => "\xCD", "Icirc" => "\xCE",
- "Iuml" => "\xCF", "ETH" => "\xD0", "Ntilde" => "\xD1",
- "Ograve" => "\xD2", "Oacute" => "\xD3", "Ocirc" => "\xD4",
- "Otilde" => "\xD5", "Ouml" => "\xD6", "times" => "\xD7",
- "Oslash" => "\xD8", "Ugrave" => "\xD9", "Uacute" => "\xDA",
- "Ucirc" => "\xDB", "Uuml" => "\xDC", "Yacute" => "\xDD",
- "THORN" => "\xDE", "szlig" => "\xDF", "agrave" => "\xE0",
- "aacute" => "\xE1", "acirc" => "\xE2", "atilde" => "\xE3",
- "auml" => "\xE4", "aring" => "\xE5", "aelig" => "\xE6",
- "ccedil" => "\xE7", "egrave" => "\xE8", "eacute" => "\xE9",
- "ecirc" => "\xEA", "euml" => "\xEB", "igrave" => "\xEC",
- "iacute" => "\xED", "icirc" => "\xEE", "iuml" => "\xEF",
- "eth" => "\xF0", "ntilde" => "\xF1", "ograve" => "\xF2",
- "oacute" => "\xF3", "ocirc" => "\xF4", "otilde" => "\xF5",
- "ouml" => "\xF6", "divide" => "\xF7", "oslash" => "\xF8",
- "ugrave" => "\xF9", "uacute" => "\xFA", "ucirc" => "\xFB",
- "uuml" => "\xFC", "yacute" => "\xFD", "thorn" => "\xFE",
- "yuml" => "\xFF", "apos" => "\'",
-
- # HTML 4 entities that do not have 1:1 Latin1 mappings.
- "bull" => "*", "hellip"=> "...", "prime" => "'", "Prime" => "\"",
- "frasl" => "/", "trade" => "[tm]", "larr" => "<-", "rarr" => "->",
- "harr" => "<->", "lArr" => "<=", "rArr" => "=>", "hArr" => "<=>",
- "empty" => "\xD8", "minus" => "-", "lowast"=> "*", "sim" => "~",
- "cong" => "=~", "asymp" => "~", "ne" => "!=", "equiv" => "==",
- "le" => "<=", "ge" => ">=", "lang" => "<", "rang" => ">",
- "loz" => "<>", "OElig" => "OE", "oelig" => "oe", "Yuml" => "Y",
- "circ" => "^", "tilde" => "~", "ensp" => " ", "emsp" => " ",
- "thinsp"=> " ", "ndash" => "-", "mdash" => "-", "lsquo" => "`",
- "rsquo" => "'", "sbquo" => "'", "ldquo" => "\"", "rdquo" => "\"",
- "bdquo" => "\"", "lsaquo"=> "<", "rsaquo"=> ">",
-);
-
-# Maps certain UTF8 characters (2 or 3 bytes) to the corresponding
-# Latin1 characters.
-#
-my %unicode_latin1_table = (
- "\xC2\xA1" => "\xA1", "\xC2\xA2" => "\xA2", "\xC2\xA3" => "\xA3",
- "\xC2\xA4" => "\xA4", "\xC2\xA5" => "\xA5", "\xC2\xA6" => "\xA6",
- "\xC2\xA7" => "\xA7", "\xC2\xA8" => "\xA8", "\xC2\xA9" => "\xA9",
- "\xC2\xAA" => "\xAA", "\xC2\xAB" => "\xAB", "\xC2\xAC" => "\xAC",
- "\xC2\xAD" => "\xAD", "\xC2\xAE" => "\xAE", "\xC2\xAF" => "\xAF",
- "\xC2\xB0" => "\xB0", "\xC2\xB1" => "\xB1", "\xC2\xB2" => "\xB2",
- "\xC2\xB3" => "\xB3", "\xC2\xB4" => "\xB4", "\xC2\xB5" => "\xB5",
- "\xC2\xB6" => "\xB6", "\xC2\xB7" => "\xB7", "\xC2\xB8" => "\xB8",
- "\xC2\xB9" => "\xB9", "\xC2\xBA" => "\xBA", "\xC2\xBB" => "\xBB",
- "\xC2\xBC" => "\xBC", "\xC2\xBD" => "\xBD", "\xC2\xBE" => "\xBE",
- "\xC2\xBF" => "\xBF", "\xC3\x80" => "\xC0", "\xC3\x81" => "\xC1",
- "\xC3\x82" => "\xC2", "\xC3\x83" => "\xC3", "\xC3\x84" => "\xC4",
- "\xC3\x85" => "\xC5", "\xC3\x86" => "\xC6", "\xC3\x87" => "\xC7",
- "\xC3\x88" => "\xC8", "\xC3\x89" => "\xC9", "\xC3\x8A" => "\xCA",
- "\xC3\x8B" => "\xCB", "\xC3\x8C" => "\xCC", "\xC3\x8D" => "\xCD",
- "\xC3\x8E" => "\xCE", "\xC3\x8F" => "\xCF", "\xC3\x90" => "\xD0",
- "\xC3\x91" => "\xD1", "\xC3\x92" => "\xD2", "\xC3\x93" => "\xD3",
- "\xC3\x94" => "\xD4", "\xC3\x95" => "\xD5", "\xC3\x96" => "\xD6",
- "\xC3\x97" => "\xD7", "\xC3\x98" => "\xD8", "\xC3\x99" => "\xD9",
- "\xC3\x9A" => "\xDA", "\xC3\x9B" => "\xDB", "\xC3\x9C" => "\xDC",
- "\xC3\x9D" => "\xDD", "\xC3\x9E" => "\xDE", "\xC3\x9F" => "\xDF",
- "\xC3\xA0" => "\xE0", "\xC3\xA1" => "\xE1", "\xC3\xA2" => "\xE2",
- "\xC3\xA3" => "\xE3", "\xC3\xA4" => "\xE4", "\xC3\xA5" => "\xE5",
- "\xC3\xA6" => "\xE6", "\xC3\xA7" => "\xE7", "\xC3\xA8" => "\xE8",
- "\xC3\xA9" => "\xE9", "\xC3\xAA" => "\xEA", "\xC3\xAB" => "\xEB",
- "\xC3\xAC" => "\xEC", "\xC3\xAD" => "\xED", "\xC3\xAE" => "\xEE",
- "\xC3\xAF" => "\xEF", "\xC3\xB0" => "\xF0", "\xC3\xB1" => "\xF1",
- "\xC3\xB2" => "\xF2", "\xC3\xB3" => "\xF3", "\xC3\xB4" => "\xF4",
- "\xC3\xB5" => "\xF5", "\xC3\xB6" => "\xF6", "\xC3\xB7" => "\xF7",
- "\xC3\xB8" => "\xF8", "\xC3\xB9" => "\xF9", "\xC3\xBA" => "\xFA",
- "\xC3\xBB" => "\xFB", "\xC3\xBC" => "\xFC", "\xC3\xBD" => "\xFD",
- "\xC3\xBE" => "\xFE", "\xC3\xBF" => "\xFF",
-
- "\xE2\x80\x93" => '--', "\xE2\x80\x94" => '--',
- "\xE2\x80\x98" => '`', "\xE2\x80\x99" => '\'',
- "\xE2\x80\x9C" => "``", "\xE2\x80\x9D" => "''",
- "\xE2\x80\xB2" => "'", "\xE2\x80\xA6" => '...',
- "\xE2\x86\x90" => ' ← ', "\xE2\x84\xA2" => '™',
- "\xE2\x80\xA2" => '•', "\xC2\xA0" => ' ', #
- );
-
-# Convert any HTML entities to Latin1 characters.
+# Convert any HTML entities to Unicode characters.
#
sub de_entify($) {
my ($text) = @_;
- $text =~ s/(&(\#)?([[:alpha:]\d]+);?)/
- {
- my $c = $3;
- if (! defined($2)) {
- $c = $entity_table{$c}; # for Á
- } else {
- if ($c =~ m@^x([\dA-F]+)$@si) { # for A
- $c = chr(hex($1));
- } elsif ($c =~ m@^\d+$@si) { # for A
- $c = chr($c);
- } else {
- $c = undef;
- }
- }
- ($c || "[$3]"); # for &unknown; => "[unknown]"
- }
- /gexi;
- return $text;
+
+ # Undefined input yields the empty string.  Text with no "&" in it
+ # cannot contain an entity, so it is returned untouched.
+ return '' unless defined($text);
+ return $text unless ($text =~ m/&/s);
+
+ # Convert any HTML entities to Unicode characters,
+ # if the HTML::Entities module is installed.
+ #
+ # $text is cleared before the call so that, if decode_entities() dies
+ # inside the eval (e.g. because the module never got loaded), $text is
+ # still undef afterward and the failure is detectable below.
+ eval {
+ my $t2 = $text;
+ $text = undef;
+ $text = HTML::Entities::decode_entities ($t2);
+ };
+ return $text if defined($text);
+
+ # If it's not installed, just complain instead of trying to halfass it.
+ # The complaint is written to STDOUT rather than STDERR (presumably so
+ # that it shows up wherever the program's text output is displayed),
+ # and then the whole program exits with status 1.
+ print STDOUT ("\n\tPerl is broken. Do this to repair it:\n" .
+ "\n\tsudo cpan HTML::Entities\n\n");
+ exit (1);
}
# Convert any Unicode characters to Latin1 if possible.
# Unconvertable bytes are left alone.
#
+# The argument is treated as a character string: it is first encoded to
+# UTF-8 bytes, and the result returned is a byte string.  Characters
+# U+00A0 through U+00FF come out as single Latin1 bytes, a handful of
+# common 3-byte punctuation sequences are replaced (see below), and any
+# other multi-byte sequence is passed through as UTF-8.
+#
-sub de_unicoddle($) {
+sub utf8_to_latin1($) {
my ($text) = @_;
- foreach my $key (sort { length($b) <=> length($a) }
- keys (%unicode_latin1_table)) {
- my $val = $unicode_latin1_table{$key};
- $text =~ s/$key/$val/gs;
- }
+
+ utf8::encode ($text); # Unpack Unicode back to multi-byte UTF-8.
+
+ # Maybe it would be better to handle this in the Unicode domain
+ # by doing things like s/\x{2018}/\"/g, but without decoding the
+ # string back to UTF-8 first, I'm at a loss as to how to have
+ # "à" print as "\340" instead of as "\303\240".
+
+ # 2-byte sequences.  UTF-8 continuation bytes are always in the range
+ # 0x80-0xBF, so the classes stop there:
+ #   C2 xx  =>  xx          (U+00A0 - U+00BF)
+ #   C3 xx  =>  xx | 0x40   (U+00C0 - U+00FF)
+ # C2 80 - C2 9F (the C1 control range) is deliberately left alone.
+ $text =~ s/ \xC2 ( [\xA0-\xBF] ) / $1 /gsex;
+ $text =~ s/ \xC3 ( [\x80-\xBF] ) / chr (ord($1) | 0x40) /gsex;
+
+ # Handles a few 3-byte sequences too.
+ # U+201C / U+201D (curly double quotes) become `` and '' respectively,
+ # matching what the old %unicode_latin1_table did.  The replacements
+ # for U+2022, U+2122 and U+2190 are the literals exactly as spelled in
+ # this source file.
+ $text =~ s/\xE2\x80\x93/--/gs;
+ $text =~ s/\xE2\x80\x94/--/gs;
+ $text =~ s/\xE2\x80\x98/`/gs;
+ $text =~ s/\xE2\x80\x99/'/gs;
+ $text =~ s/\xE2\x80\x9C/``/gs;
+ $text =~ s/\xE2\x80\x9D/''/gs;
+ $text =~ s/\xE2\x80\xA2/•/gs;
+ $text =~ s/\xE2\x80\xA6/.../gs;
+ $text =~ s/\xE2\x80\xB2/'/gs;
+ $text =~ s/\xE2\x84\xA2/™/gs;
+ $text =~ s/\xE2\x86\x90/ ← /gs;
+
return $text;
}
$text_mode = $v if defined ($v);
# The "textMode" pref is set to a number instead of a string because I
- # can't figure out the black magic to make Cocoa bindings work right.
+ # couldn't figure out the black magic to make Cocoa bindings work right.
+ #
+ # Update: as of 5.33, Cocoa writes strings instead of numbers, but
+ # pre-existing saved preferences might still have numbers in them.
#
if ($text_mode eq '0') { $text_mode = 'date'; }
elsif ($text_mode eq '1') { $text_mode = 'literal'; }
if ($text_mode eq 'literal') {
$text_literal = strftime ($text_literal, localtime);
+ $text_literal = utf8_to_latin1($text_literal) if ($latin1_p);
$text_literal =~ y/A-Za-z/N-ZA-Mn-za-m/ if ($nyarlathotep_p);
print STDOUT $text_literal;
print STDOUT "\n" unless ($text_literal =~ m/\n$/s);
} else {
# stream it by lines
while (<$in>) {
+ $_ = utf8_to_latin1($_) if ($latin1_p);
y/A-Za-z/N-ZA-Mn-za-m/ if ($nyarlathotep_p);
print STDOUT $_;
}
if (-f "/usr/sbin/system_profiler") { # "Mac OS X 10.4.5 (8H14)"
my $sp = # "iMac G5"
- `/usr/sbin/system_profiler SPSoftwareDataType SPHardwareDataType`;
+ `/usr/sbin/system_profiler SPSoftwareDataType SPHardwareDataType 2>/dev/null`;
+ # system_profiler on OS X 10.10 generates spurious error messages.
my ($v) = ($sp =~ m/^\s*System Version:\s*(.*)$/mi);
my ($s) = ($sp =~ m/^\s*(?:CPU|Processor) Speed:\s*(.*)$/mi);
my ($t) = ($sp =~ m/^\s*(?:Machine|Model) Name:\s*(.*)$/mi);
s@[\r\n]@ @gsi;
}
+ # This right here is the part where I doom us all to inhuman
+ # toil for the One whose Name cannot be expressed in the
+ # Basic Multilingual Plane. http://jwz.org/b/yhAT He comes.
+
s@<!--.*?-->@@gsi; # lose comments
s@<(STYLE|SCRIPT)\b[^<>]*>.*?</\1\s*>@@gsi; # lose css and js
#
if ($rss_p eq 'wiki') {
+ s@<!--.*?-->@@gsi; # lose HTML comments again
+
# Creation line is often truncated: screws up parsing with unbalanced {{.
- s@(: +<- +Created page) with [^\n]+@$1@s;
+ s@(: +[^a-zA-Z ]* *Created page) with [^\n]+@$1@s;
s@/\*.*?\*/@@si; # /* ... */
# Try to omit all tables, since they're impossible to read as text.
#
- 1 while (s/{{[^{}]*}}/ /gs); # {{ ... }}
- 1 while (s/{\|.*?\|}/\n\n/gs); # {| ... |}
+ 1 while (s/\{\{[^{}]*}}/ /gs); # {{ ... }}
+ 1 while (s/\{\|.*?\|\}/\n\n/gs); # {| ... |}
1 while (s/\|-.*?\|/ /gs); # |- ... | (table cell)
# Convert anchors to something more readable.
# Omit trailing headlines with no text after them (e.g. == Notes ==)
1 while (s/\n==+[ \t]*[^\n=]+[ \t]*==+\s*$/\n/s);
+
+ $_ = de_entify ($_); # convert HTML entities, again
}
- # elide any remaining non-Latin1 binary data...
- s/([^\000-\176]+(\s*[^\000-\176]+)[^a-z\d]*)/\xAB...\xBB /g;
-# s/([^\000-\176]+(\s*[^\000-\176]+)[^a-z\d]*)/\xAB$1\xBB /g;
+ # elide any remaining non-Latin1 binary data.
+ if ($latin1_p) {
+ utf8::encode ($_); # Unpack Unicode back to multi-byte UTF-8.
+ s/([^\000-\176]+(\s*[^\000-\176]+)[^a-z\d]*)/\xAB...\xBB /g;
+ }
$_ .= "\n";
s/^(([^\n]*\n){$truncate_lines}).*$/$1/s;
}
+ $_ = utf8_to_latin1($_) if ($latin1_p);
y/A-Za-z/N-ZA-Mn-za-m/ if ($nyarlathotep_p);
return $_;
$body = de_entify ($body); # convert entities to get HTML from XML
}
- $body = de_unicoddle ($body); # convert UTF8 to Latin1
return $body;
}
$body =~ s/^(([^\n]*\n){$truncate_lines}).*$/$1/s;
}
+ $body = utf8_to_latin1($body) if ($latin1_p);
$body =~ y/A-Za-z/N-ZA-Mn-za-m/ if ($nyarlathotep_p);
return $body;
}
$ct = 'text/plain';
}
+ utf8::decode ($body); # Pack multi-byte UTF-8 back into wide chars.
+
$ct = guess_content_type ($ct, $body);
if ($ct eq 'html') {
print STDERR "$progname: converting HTML...\n" if ($verbose > 2);
" --cols N Wrap lines at this column. Default 72.\n" .
"\n" .
" --lines N No more than N lines of output.\n" .
+ "\n" .
+ " --latin1 Emit Latin1 instead of UTF-8.\n" .
"\n");
exit 1;
}
elsif (m/^--?col(umn)?s?$/) { $wrap_columns = 0 + shift @ARGV; }
elsif (m/^--?lines?$/) { $truncate_lines = 0 + shift @ARGV; }
elsif (m/^--?cocoa$/) { $cocoa_id = shift @ARGV; }
+ elsif (m/^--?latin1$/) { $latin1_p++; }
elsif (m/^--?nyarlathotep$/) { $nyarlathotep_p++; }
elsif (m/^-./) { usage; }
else { usage; }