From 6365a5e9ae709daeb082985ea130c20d6744d248 Mon Sep 17 00:00:00 2001 From: des Date: Wed, 23 Jan 2008 13:45:48 +0000 Subject: [PATCH] Add -c and -r options: - If the former is specified, fetcher will go into a loop after having traversed the entire tree, and continuously re-fetch all known URLs. - The latter is not yet implented, but the idea is to assign a random probability to each URL based on an inverse-exponential (or similar) distribution, and re-fetch URLs at random according to this frequency. This will help simulate a "short head long tail" scenario. Some restructuring. Add a comment about a possible improvement which will help work around bugs in certain commonly used data sets (e.g. the Apache httpd manual) which can result in an infinite set of URLs (which in reality map to a fairly large but finite set of pages) git-svn-id: svn+ssh://projects.linpro.no/svn/varnish/trunk@2374 d4fa192b-c00b-0410-8231-f00ffab90ce4 --- varnish-tools/fetcher/fetcher.pl | 91 +++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/varnish-tools/fetcher/fetcher.pl b/varnish-tools/fetcher/fetcher.pl index 3e70828a..6b66db13 100755 --- a/varnish-tools/fetcher/fetcher.pl +++ b/varnish-tools/fetcher/fetcher.pl @@ -43,8 +43,14 @@ our %DONE; our %CHILD; our $BUSY; -our $jobs = 1; +our $continue = 0; our $delay = 0; +our $jobs = 1; +our $quiet = 0; +our $random = 0; + +our $url_re = + qr/\b(?:href|src)=[\'\"]([^\'\"\?\#]+)(?:[\?\#][^\'\"]*)?[\'\"]/io; sub new($$) { my ($this, $mux, $fh) = @_; @@ -57,27 +63,40 @@ sub new($$) { }; } +# Child sub run($$) { my ($self, $s) = @_; - my $ua = new LWP::UserAgent(); + my $check = 1; + my $ua = new LWP::UserAgent('keep_alive' => 3); for (;;) { $0 = "[fetcher] idle"; my $url = <$s>; exit(0) unless defined($url); chomp($url); - die "no more work\n" - if $url eq "done"; + if ($url eq "done") { + last; + } elsif ($url eq "check") { + $check = 1; + next; + } elsif ($url eq "no check") { + $check = 0; + next; + } $0 = "[fetcher] requesting $url"; - print STDERR "Retrieving $url\n"; + print STDERR "Retrieving $url\n" + unless ($quiet > 1); my $resp = $ua->get($url); - $0 = "[fetcher] checking $url"; - if ($resp->header('Content-Type') =~ m/^text\//) { - my %urls = map { $_ => 1 } - ($resp->content =~ m/\b(?:href|src)=[\'\"]([^\'\"\?\#]+)(?:[\?\#][^\'\"]*)?[\'\"]/g); - foreach (keys(%urls)) { - $s->write("add $_\n"); + if ($check) { + $0 = "[fetcher] checking $url"; + # XXX if we got a redirect, we should blacklist the + # original URL and suggest the target URL instead + if ($resp->header('Content-Type') =~ m/^text\//) { + my %urls = map { $_ => 1 } ($resp->content =~ m/$url_re/g); + foreach (keys(%urls)) { + $s->write("add $_\n"); + } } } select(undef, undef, undef, $delay) @@ -87,6 +106,16 @@ sub run($$) { } } +# Send a command for which we don't expect a response +sub send($) { + my ($child, $msg) = @_; + + die "child busy\n" + if $child->{'url'}; + $child->{'mux'}->write($child->{'fh'}, "$msg\n"); +} + +# Send a URL and mark the child as busy sub send_url($) { my ($child) = @_; @@ -102,6 +131,7 @@ sub send_url($) { ++$BUSY; } +# Convert relative to absolute, check if valid, and add to list sub get_url($$) { my ($child, $url) = @_; @@ -111,13 +141,15 @@ sub get_url($$) { $url = $uri->canonical; if ($uri->scheme() ne 'http' || $uri->host_port() ne URI->new($child->{'url'})->host_port()) { - print STDERR "Rejected $url\n"; + print STDERR "Rejected $url\n" + unless ($quiet > 0); return; } return if $TODO{$url} || $DONE{$url}; $TODO{$url} = 1; } +# Called when mux gets data from a client sub mux_input($$$$) { my ($child, $mux, $fh, $input) = @_; @@ -172,32 +204,53 @@ sub fetcher(@) { # main loop for (;;) { + for (;;) { + foreach my $child (values(%CHILD)) { + $child->send_url() + unless $child->{'url'}; + } + last unless $BUSY; + $mux->loop(); + } + last unless $continue; foreach my $child (values(%CHILD)) { - $child->send_url() - unless $child->{'url'}; + $child->send("no check"); } - last unless $BUSY; - $mux->loop(); + %TODO = %DONE; + %DONE = (); + print STDERR "Starting over...\n"; } # done foreach my $child (values(%CHILD)) { + $child->send("done"); $mux->close($$child{'fh'}); } } +sub refetch() { + + # Recycle valid URLs from initial run + %TODO = %DONE; +} + sub usage() { - print STDERR "usage: $0 [-d n] [-j n] URL ...\n"; + print STDERR "usage: $0 [-cqr] [-d n] [-j n] URL ...\n"; exit(1); } MAIN:{ - GetOptions("j|jobs=i" => \$jobs, - "d|delay=i" => \$delay) + GetOptions("c|continue" => \$continue, + "d|delay=i" => \$delay, + "j|jobs=i" => \$jobs, + "q|quiet+" => \$quiet, + "r|random" => \$random) or usage(); $jobs > 0 or usage(); + $random + and die "-r is not yet implemented\n"; @ARGV or usage(); fetcher(@ARGV); -- 2.39.5