From: Ralf Ertzinger
Date: Sat, 9 Jan 2010 17:13:53 +0000 (+0100)
Subject: - Alter GermanBashGrabber to use LWP::UserAgent
X-Git-Url: https://git.camperquake.de/gitweb.cgi?p=quotesite.git;a=commitdiff_plain;h=3f99f34c478f21862b13f5799a629a6e96c1d69f;ds=sidebyside

- Alter GermanBashGrabber to use LWP::UserAgent

This is a fun one. Some time back GermanBash (for reasons entirely
outside my imagination) started to deliver broken HTML to user agents
which identified as the Perl LWP library. Not to all those agents, mind.
Just some. For example, using the GET command line script (which is
based on LWP) would produce valid HTML, including a quote, but using
LWP::Simple::get from a script (on the same machine) would not.

I'm not too sure what to make of this. If this was meant as a
countermeasure of some kind against quotesite, I'd really, really like
to hear from the guys running GermanBash.
---

diff --git a/quotesite/GermanBashGrabber.pm b/quotesite/GermanBashGrabber.pm
index 919f015..e65b660 100644
--- a/quotesite/GermanBashGrabber.pm
+++ b/quotesite/GermanBashGrabber.pm
@@ -8,7 +8,7 @@ package GermanBashGrabber;
 use GrabberBase;
 @ISA = qw(GrabberBase);
 
-use LWP::Simple qw(!get);
+use LWP::UserAgent;
 use HTML::TokeParser;
 use Data::Dumper;
 
@@ -36,6 +36,7 @@ sub _parse {
     my $metadata = {};
     my $p;
     my $t;
+    my $ua = LWP::UserAgent->new('agent' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
 
     $url =~ m|$pattern|;
     $url = $1;
@@ -47,10 +48,12 @@ sub _parse {
     $metadata->{'CONTENT'} = undef;
 
     # Get the HTML file containing the quote
-    unless(defined($content = LWP::Simple::get(sprintf('http://german-bash.org/%s', $2)))) {
+    $content = $ua->get(sprintf('http://german-bash.org/%s', $2));
+    unless($content->is_success) {
         $self->error('Could not download quote');
         return undef;
     }
+    $content = $content->decoded_content();
 
     $p = HTML::TokeParser->new(\$content);