#!/usr/bin/perl -w
-# Copyright © 2005-2009 Jamie Zawinski <jwz@jwz.org>
+# Copyright © 2005-2010 Jamie Zawinski <jwz@jwz.org>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
use bytes;
my $progname = $0; $progname =~ s@.*/@@g;
-my $version = q{ $Revision: 1.18 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
+my $version = q{ $Revision: 1.21 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
my $verbose = 0;
my $http_proxy = undef;
my ($text) = @_;
$text =~ s/(&(\#)?([[:alpha:]\d]+);?)/
{
- my $c;
- if ($2) {
- $c = chr($3); # the &#number is always decimal, right?
+ my $c = $3;
+ if (! defined($2)) {
+ $c = $entity_table{$c}; # for &Aacute;
} else {
- $c = $entity_table{$3};
+ if ($c =~ m@^x([\dA-F]+)$@si) { # for &#x41;
+ $c = chr(hex($1));
+ } elsif ($c =~ m@^\d+$@si) { # for &#65;
+ $c = chr($c);
+ } else {
+ $c = undef;
+ }
}
-# print STDERR "$progname: warning: unknown HTML character entity \"$1\"\n"
-# unless $c;
- ($c ? $c : "[$3]");
+ ($c || "[$3]"); # for &unknown; => "[unknown]"
}
/gexi;
return $text;
sub get_url_1($;$) {
my ($url, $referer) = @_;
- if (! ($url =~ m@^(http|feed)://@i)) {
+ $url =~ s@^feed:@http:@si;
+ if (! ($url =~ m@^http://@i)) {
error ("not an HTTP URL: $url");
}
my ($body, $rss_p) = @_;
$_ = $body;
+ # In HTML, try to preserve newlines inside of PRE.
+ #
+ if (! $rss_p) {
+ s@(<PRE\b[^<>]*>\s*)(.*?)(</PRE)@{
+ my ($a, $b, $c) = ($1, $2, $3);
+ $b =~ s/[\r\n]/<BR>/gs;
+ $a . $b . $c;
+ }@gsexi;
+ }
+
if (! $rss_p) {
- # In HTML, unfold lines (this breaks PRE. Sue me.)
+ # In HTML, unfold lines.
# In RSS, assume \n means literal line break.
s@[\r\n]@ @gsi;
}