From 3f99f34c478f21862b13f5799a629a6e96c1d69f Mon Sep 17 00:00:00 2001 From: Ralf Ertzinger Date: Sat, 9 Jan 2010 18:13:53 +0100 Subject: [PATCH] - Alter GermanBashGrabber to use LWP::UserAgent This is a fun one. Some time back GermanBash (for reasons entirely outside my imagination) started to deliver broken HTML to user agents which identified as the perl LWP library. Not to all those agents, mind. Just some. For example, using the GET command line script (which is based on LWP) would produce valid HTML, including a quote, but using LWP::Simple::get from a script (on the same machine) would not. I'm not too sure what to make of this. If this was meant as a countermeasure of some kind against quotesite I'd really, really like to hear from the guys running GermanBash. --- quotesite/GermanBashGrabber.pm | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/quotesite/GermanBashGrabber.pm b/quotesite/GermanBashGrabber.pm index 919f015..e65b660 100644 --- a/quotesite/GermanBashGrabber.pm +++ b/quotesite/GermanBashGrabber.pm @@ -8,7 +8,7 @@ package GermanBashGrabber; use GrabberBase; @ISA = qw(GrabberBase); -use LWP::Simple qw(!get); +use LWP::UserAgent; use HTML::TokeParser; use Data::Dumper; @@ -36,6 +36,7 @@ sub _parse { my $metadata = {}; my $p; my $t; + my $ua = LWP::UserAgent->new('agent' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'); $url =~ m|$pattern|; $url = $1; @@ -47,10 +48,12 @@ sub _parse { $metadata->{'CONTENT'} = undef; # Get the HTML file containing the quote - unless(defined($content = LWP::Simple::get(sprintf('http://german-bash.org/%s', $2)))) { + $content = $ua->get(sprintf('http://german-bash.org/%s', $2)); + unless($content->is_success) { $self->error('Could not download quote'); return undef; } + $content = $content->decoded_content(); $p = HTML::TokeParser->new(\$content); -- 1.8.3.1