use Socket;
use URI;
+our %BANNED;
our %TODO;
our %DONE;
our %CHILD;
my $check = 1;
my $ua = new LWP::UserAgent('keep_alive' => 3);
+ $ua->requests_redirectable([]);
for (;;) {
$0 = "[fetcher] idle";
my $url = <$s>;
next;
}
$0 = "[fetcher] requesting $url";
- print STDERR "Retrieving $url\n"
+ print(STDERR "Retrieving $url\n")
unless ($quiet > 1);
my $resp = $ua->get($url);
if ($check) {
$0 = "[fetcher] checking $url";
- # XXX if we got a redirect, we should blacklist the
- # original URL and suggest the target URL instead
- if ($resp->header('Content-Type') =~ m/^text\//) {
- my %urls = map { $_ => 1 } ($resp->content =~ m/$url_re/g);
- foreach (keys(%urls)) {
- $s->write("add $_\n");
+ if ($resp->is_redirect()) {
+ $s->write("ban $url\n");
+ $url = $resp->header('Location') //
+ $resp->header('Content-Location');
+ $s->write("add $url\n")
+ if $url;
+ } elsif ($resp->is_success()) {
+ if ($resp->header('Content-Type') =~ m/^text\//) {
+ my %urls = map { $_ => 1 } ($resp->content =~ m/$url_re/g);
+ foreach (keys(%urls)) {
+ $s->write("add $_\n");
+ }
}
+ } elsif ($resp->is_error()) {
+ # XXX should we ban these?
+ } else {
+ print(STDERR "Unsupported response type:",
+ $resp->status_line(), "\n");
}
}
select(undef, undef, undef, $delay)
++$BUSY;
}
+# Convert relative to absolute and add to blacklist
+sub ban_url($$) {
+ my ($child, $url) = @_;
+
+ die "child not busy\n"
+ unless $child->{'url'};
+ my $uri = URI->new_abs($1, $child->{'url'});
+ $url = $uri->canonical;
+ $BANNED{$url} = 1;
+ print(STDERR "Banned $url\n");
+}
+
# Convert relative to absolute, check if valid, and add to list
sub get_url($$) {
my ($child, $url) = @_;
unless $child->{'url'};
my $uri = URI->new_abs($1, $child->{'url'});
$url = $uri->canonical;
- if ($uri->scheme() ne 'http' ||
+ # XXX should cache child URI to avoid new() here
+ if ($BANNED{$url} || $uri->scheme() ne 'http' ||
$uri->host_port() ne URI->new($child->{'url'})->host_port()) {
- print STDERR "Rejected $url\n"
+ print(STDERR "Rejected $url\n")
unless ($quiet > 0);
return;
}
$mux->endloop();
} elsif ($line =~ m/^add (.*?)$/) {
get_url($child, $1);
+ } elsif ($line =~ m/^ban (.*?)$/) {
+ ban_url($child, $1);
} else {
die "can't grok [$line]\n";
}
}
%TODO = %DONE;
%DONE = ();
- print STDERR "Starting over...\n";
+ print(STDERR "Starting over...\n");
}
# done
sub usage() {
- print STDERR "usage: $0 [-cqr] [-d n] [-j n] URL ...\n";
+ print(STDERR "usage: $0 [-cqr] [-d n] [-j n] URL ...\n");
exit(1);
}