- $iaddr = inet_aton($remote);
- if (!$iaddr) {
- LOG (($verbose_net || $verbose_load), "host not found: $remote");
- return ();
- }
- $paddr = sockaddr_in($port2, $iaddr);
-
-
- my $head = "";
- my $body = "";
-
- @_ =
- eval {
- local $SIG{ALRM} = sub {
- LOG (($verbose_net || $verbose_load), "timed out ($timeout) for $url");
- die "alarm\n";
- };
- alarm $timeout;
-
- $proto = getprotobyname('tcp');
- if (!socket(S, PF_INET, SOCK_STREAM, $proto)) {
- LOG (($verbose_net || $verbose_load), "socket: $!");
- return ();
- }
- if (!connect(S, $paddr)) {
- LOG (($verbose_net || $verbose_load), "connect($serverstring): $!");
- return ();
- }
-
- select(S); $| = 1; select(STDOUT);
-
- my $cookie = $cookies{$them};
-
- my $user_agent = "$progname/$version";
-
- if ($url =~ m@^http://www\.altavista\.com/@ ||
- $url =~ m@^http://random\.yahoo\.com/@ ||
- $url =~ m@^http://images\.google\.com/@ ||
- $url =~ m@^http://www\.google\.com/@) {
- # block this, you turkeys.
- $user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.7)" .
- " Gecko/20070914 Firefox/2.0.0.7";
-
- # 28-Jun-2007: Google Images now emits the entire page in JS if
- # you claim to be Gecko. They also still block "webcollage".
- # They serve non-JS for unrecognised agents, so let's try this...
- $user_agent = "NoJavascriptPlease/1.0"
- if ($url =~ m@^http://[a-z]+\.google\.com/@);
- }
-
- my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" .
- "Host: $them\r\n" .
- "User-Agent: $user_agent\r\n";
- if ($referer) {
- $hdrs .= "Referer: $referer\r\n";
- }
- if ($cookie) {
- my @cc = split(/\r?\n/, $cookie);
- $hdrs .= "Cookie: " . join('; ', @cc) . "\r\n";
- }
- $hdrs .= "\r\n";
-
- foreach (split('\r?\n', $hdrs)) {
- LOG ($verbose_http, " ==> $_");
- }
- print S $hdrs;
- my $http = <S> || "";
-
- # Kludge: the Yahoo Random Link is now returning as its first
- # line "Status: 301" instead of "HTTP/1.0 301 Found". Fix it...
- #
- $http =~ s@^Status:\s+(\d+)\b@HTTP/1.0 $1@i;
-
- $_ = $http;
- s/[\r\n]+$//s;
- LOG ($verbose_http, " <== $_");
-
- while (<S>) {
- $head .= $_;
- s/[\r\n]+$//s;
- last if m@^$@;
- LOG ($verbose_http, " <== $_");
-
- if (m@^Set-cookie:\s*([^;\r\n]+)@i) {
- set_cookie($them, $1)
- }
- }
-
- my $lines = 0;
- while (<S>) {
- $body .= $_;
- $lines++;
- }
-
- LOG ($verbose_http,
- " <== [ body ]: $lines lines, " . length($body) . " bytes");
-
- close S;
-
- if (!$http) {
- LOG (($verbose_net || $verbose_load), "null response: $url");
- return ();
- }
-
- $SIG{ALRM} = 'DEFAULT'; # seem to be suffering a race?
- return ( $http, $head, $body );
- };
- die if ($@ && $@ ne "alarm\n"); # propagate errors
-
- if ($@ && $@ ne "alarm\n") {
- print STDERR blurb() . "DIE " . join(" ", $@) . "\n";
- die;
- }