# (c) 2007 by Ralf Ertzinger
# licensed under GNU GPL v2
#
# Grabber for Amazon
#
# Provenance: this module was introduced as quotesite/AmazonGrabber.pm by the
# patch "Add Amazon grabber" (commit a5457541076766804405416fb653592932c688ec,
# Ralf Ertzinger, Thu, 15 Aug 2013 16:45:02 +0200).

package quotesite::AmazonGrabber;

use strict;
use warnings;

use quotesite::GrabberBase;
our @ISA = qw(quotesite::GrabberBase);

use LWP::Simple qw(!get);
use HTML::TokeParser;
use Data::Dumper;
use Encode;

# new()
#
# Constructor. Builds the object through the parent constructor, then
# registers this grabber's name ('amazon') and its single URL pattern.
#
# The pattern has two capture groups:
#   1 - the URL up to and including the item id
#   2 - the 10-character alphanumeric item id that follows "dp" or "gp"
#       (optionally followed by "/product")
# Accepted hosts are amazon.com, amazon.de, amazon.co.uk and amazon.fr, with
# any number of leading sub-domain labels.
#
# Returns the new object.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new();

    $self->{'NAME'}     = 'amazon';
    $self->{'PATTERNS'} = ['(https?://(?:[-a-zA-Z0-9_.]+\.)*amazon\.(?:com|de|co\.uk|fr)/.*[dg]p(?:/product)?/([[:alnum:]]{10}))'];

    bless($self, $class);

    # _prepare_parameters() is not defined in this file; presumably it is
    # inherited from quotesite::GrabberBase -- verify against the base class.
    $self->_prepare_parameters();

    return $self;
}

# _parse($url, $pattern)
#
# Downloads the page behind $url and extracts the text of the first <h1>
# element whose class attribute is exactly 'parseasinTitle'.
#
# Parameters:
#   $url     - the URL to inspect
#   $pattern - regular expression applied to $url; its first capture group
#              is the URL that gets fetched, its second the item id
#
# Returns a hash reference with the keys URL, ID, TYPE ('quote'), SOURCE
# (this grabber's NAME) and CONTENT (the trimmed heading text), or undef
# after reporting the problem through $self->error() when
#   - $url does not match $pattern,
#   - the page cannot be downloaded, or
#   - no matching <h1> is found.
#
# Failure is signalled with an explicit "return undef" (rather than a bare
# "return") so that callers evaluating the result in list context keep
# receiving exactly one element.
sub _parse {
    my $self    = shift;
    my $url     = shift;
    my $pattern = shift;

    # Take the captures from the match's own return value. After a failed
    # match $1/$2 still hold the values of the last *successful* match, so
    # reading them unconditionally could fetch an unrelated URL.
    my ($item_url, $item_id);
    if (defined($url) && defined($pattern)) {
        ($item_url, $item_id) = $url =~ m|$pattern|;
    }
    unless (defined($item_url) && defined($item_id)) {
        $self->error('URL does not match grabber pattern');
        return undef;
    }

    my $metadata = {
        'URL'     => $item_url,
        'ID'      => $item_id,
        'TYPE'    => 'quote',
        'SOURCE'  => $self->{'NAME'},
        'CONTENT' => undef,
    };

    # Get the HTML file containing the quote
    my $content = LWP::Simple::get($item_url);
    unless (defined($content)) {
        $self->error('Could not download quote');
        return undef;
    }

    $self->debug($content);

    my $parser = HTML::TokeParser->new(\$content);

    # Walk the <h1> start tags until the one carrying the product title.
    HEADING: while (my $tag = $parser->get_tag('h1')) {
        my $attr = $tag->[1];
        next HEADING
            unless defined($attr->{'class'})
                && $attr->{'class'} eq 'parseasinTitle';

        # NOTE(review): the heading text is treated as ISO-8859-1 and
        # re-encoded as UTF-8 -- confirm this matches what the site and
        # LWP::Simple::get() actually deliver.
        my $title = encode('utf8', decode('iso8859-1', $parser->get_text('/h1')));
        $title =~ s/^\s+//;
        $title =~ s/\s+$//;

        $metadata->{'CONTENT'} = $title;
        last HEADING;
    }

    unless (defined($metadata->{'CONTENT'})) {
        $self->error('Could not extract quote content');
        return undef;
    }

    return $metadata;
}

1;