From: Ralf Ertzinger Date: Mon, 19 Aug 2013 21:41:03 +0000 (+0200) Subject: Merge branch 'master' of ssh://git.camperquake.de:22003/quotesite X-Git-Url: https://git.camperquake.de/gitweb.cgi?p=quotesite.git;a=commitdiff_plain;h=87482e72603201424440b68d760b59e573b8eafd;hp=18eb17221c111a8442538c54d2527e8c06b25803 Merge branch 'master' of ssh://git.camperquake.de:22003/quotesite --- diff --git a/quotesite.pl b/quotesite.pl index 6e4728b..47fb872 100644 --- a/quotesite.pl +++ b/quotesite.pl @@ -74,7 +74,7 @@ my $quotesite_commands = { 'help' => sub { cmd_help(@_); - }. + }, 'enable' => sub { cmd_enable(@_); @@ -362,7 +362,7 @@ sub init_quotesite { _load_modules($plugindir); - unless (defined(@grabbers)) { + unless (@grabbers) { write_irssi('No grabbers found, can not proceed.'); return; } diff --git a/quotesite/AmazonGrabber.pm b/quotesite/AmazonGrabber.pm new file mode 100644 index 0000000..bc91cb5 --- /dev/null +++ b/quotesite/AmazonGrabber.pm @@ -0,0 +1,77 @@ +# (c) 2007 by Ralf Ertzinger +# licensed under GNU GPL v2 +# +# Grabber for Amazon + +package quotesite::AmazonGrabber; + +use quotesite::GrabberBase; +@ISA = qw(quotesite::GrabberBase); + +use LWP::Simple qw(!get); +use HTML::TokeParser; +use Data::Dumper; +use Encode; + +use strict; + +sub new { + my $class = shift; + my $self = $class->SUPER::new(); + + $self->{'NAME'} = 'amazon'; + $self->{'PATTERNS'} = ['(https?://(?:[-a-zA-Z0-9_.]+\.)*amazon\.(?:com|de|co\.uk|fr)/.*[dg]p(?:/product)?/([[:alnum:]]{10}))']; + + bless($self, $class); + $self->_prepare_parameters(); + + return $self; +} + +sub _parse { + my $self = shift; + my $url = shift; + my $pattern = shift; + my $content; + my $metadata = {}; + my $p; + my $t; + my $t2; + + $url =~ m|$pattern|; + $url = $1; + + $metadata->{'URL'} = $url; + $metadata->{'ID'} = $2; + $metadata->{'TYPE'} = 'quote'; + $metadata->{'SOURCE'} = $self->{'NAME'}; + $metadata->{'CONTENT'} = undef; + + # Get the HTML file containing the quote + unless(defined($content = LWP::Simple::get($url))) { + $self->error('Could not download quote'); + return undef; + } + + $self->debug($content); + + $p = HTML::TokeParser->new(\$content); + + OUTER: while ($t = $p->get_tag('h1')) { + if (exists($t->[1]->{'class'}) && ($t->[1]->{'class'} eq 'parseasinTitle')) { + $metadata->{'CONTENT'} = encode('utf8', decode('iso8859-1', $p->get_text('/h1'))); + $metadata->{'CONTENT'} =~ s/^\s*//; + $metadata->{'CONTENT'} =~ s/\s*$//; + last OUTER; + } + } + + unless(defined($metadata->{'CONTENT'})) { + $self->error('Could not extract quote content'); + return undef; + } + + return $metadata; +} + +1; diff --git a/quotesite/AppNetGrabber.pm b/quotesite/AppNetGrabber.pm new file mode 100644 index 0000000..d92edec --- /dev/null +++ b/quotesite/AppNetGrabber.pm @@ -0,0 +1,72 @@ +# (c) 2010 by Ralf Ertzinger +# licensed under GNU GPL v2 +# +# Grabber for app.net + +package quotesite::AppNetGrabber; + +use quotesite::GrabberBase; +@ISA = qw(quotesite::GrabberBase); + +use Data::Dumper; +use JSON; +use Encode; + +use strict; + +sub new { + my $class = shift; + my $self = $class->SUPER::new(); + + $self->{'NAME'} = 'app.net'; + $self->{'PATTERNS'} = ['(https?://alpha\.app\.net/[^/]+/post/(\d+))']; + + bless($self, $class); + $self->_prepare_parameters(); + + return $self; +} + +sub _parse { + my $self = shift; + my $url = shift; + my $pattern = shift; + my $content; + my $metadata = {}; + my $p = XML::Simple->new(); + my $t; + + $url =~ m|$pattern|; + $url = $1; + + $metadata->{'URL'} = $url; + $metadata->{'ID'} = $2; + $metadata->{'TYPE'} = 'quote'; + $metadata->{'SOURCE'} = $self->{'NAME'}; + $metadata->{'CONTENT'} = undef; + + # Get the JSON file containing the quote + unless(defined($content = $self->simple_get(sprintf('https://alpha-api.app.net/stream/0/posts/%s', $2)))) { + $self->error('Could not download quote'); + return undef; + } + + unless(defined($t = JSON->new->utf8->decode($content))) { + $self->error('Could not parse JSON metadata'); + return undef; + } + + $self->debug("JSON content: %s", Dumper($t)); + + $metadata->{'CONTENT'} = $t->{'data'}->{'text'}; + $metadata->{'ID'} = $t->{'data'}->{'user'}->{'username'}; + + unless(defined($metadata->{'CONTENT'})) { + $self->error('Could not extract quote content'); + return undef; + } + + return $metadata; +} + +1; diff --git a/quotesite/Base.pm b/quotesite/Base.pm index ccbb657..ebd65c3 100644 --- a/quotesite/Base.pm +++ b/quotesite/Base.pm @@ -4,6 +4,8 @@ package quotesite::Base; use strict; +use LWP::UserAgent; +use HTTP::Cookies; use Data::Dumper; sub new { @@ -170,4 +172,30 @@ sub setdebug { $self->{'_DEBUG'} = shift; } +sub ua { + my $self = shift; + my $ua; + + $ua = LWP::UserAgent->new( + 'agent' => 'Mozilla/5.0', + 'cookie_jar' => HTTP::Cookies->new, + 'timeout' => 15, + ); + + $self->{_CACHED_UA} = $ua; + + return $ua; +} + +sub simple_get { + my $self = shift; + my $url = shift; + my $ua = shift || $self->ua(); + my $r; + + $r = $ua->get($url); + return $r->decoded_content() if $r->is_success(); + return undef; +} + 1; diff --git a/quotesite/GermanBashGrabber.pm b/quotesite/GermanBashGrabber.pm index 747968c..361c43d 100644 --- a/quotesite/GermanBashGrabber.pm +++ b/quotesite/GermanBashGrabber.pm @@ -19,8 +19,8 @@ sub new { my $self = $class->SUPER::new(); $self->{'NAME'} = 'germanbash'; - $self->{'PATTERNS'} = ['(http://(?:[-a-zA-Z0-9_.]+\.)*german-bash\.org/(\d+))', - '(http://(?:[-a-zA-Z0-9_.]+\.)*german-bash\.org/action/show/id/(\d+))']; + $self->{'PATTERNS'} = ['(http://(?:[-a-zA-Z0-9_.]+\.)*german-bash\.(?:org|de)/(\d+))', + '(http://(?:[-a-zA-Z0-9_.]+\.)*german-bash\.(?:org|de)/action/show/id/(\d+))']; bless($self, $class); $self->_prepare_parameters(); diff --git a/quotesite/QdbGrabber.pm b/quotesite/QdbGrabber.pm index f9c7387..411be01 100644 --- a/quotesite/QdbGrabber.pm +++ b/quotesite/QdbGrabber.pm @@ -11,6 +11,7 @@ use quotesite::GrabberBase; use LWP::Simple qw(!get); use HTML::TokeParser; use Data::Dumper; +use Encode; use strict; @@ -56,7 +57,7 @@ sub _parse { OUTER: while ($t = $p->get_tag('span')) { if (exists($t->[1]->{'class'}) && ($t->[1]->{'class'} eq 'qt')) { - $metadata->{'CONTENT'} = $p->get_text('/span'); + $metadata->{'CONTENT'} = encode('utf8', decode('iso8859-1', $p->get_text('/span'))); last OUTER; } }