# (c) 2007 by Ralf Ertzinger
# licensed under GNU GPL v2
#
# Grabber for ibash.de

package iBashGrabber;

use strict;
use warnings;

use GrabberBase;
our @ISA = qw(GrabberBase);

use LWP::Simple qw(!get);
use HTML::TokeParser;
use Data::Dumper;
use Encode;

# new()
#
# Constructor. Builds the object through the parent constructor, then sets
# the grabber name and the list of URL patterns this grabber handles.
#
# Each pattern is expected to capture the full quote URL in group 1 and
# the numeric quote ID in group 2; _parse() relies on that layout.
#
# Returns the blessed object.
sub new {
    my $class = shift;
    my $self = $class->SUPER::new();

    $self->{'NAME'} = 'ibash.de';
    # The dot in the domain is escaped so that only "ibash.de" itself (and
    # its subdomains) match, not arbitrary hosts such as "ibashXde".
    $self->{'PATTERNS'} = ['(http://(?:[-a-zA-Z0-9_.]+\.)*ibash\.de/zitat_(\d+)\.html)'];

    bless($self, $class);
    # _prepare_parameters() is not defined in this file (presumably
    # inherited from GrabberBase).
    $self->_prepare_parameters();

    return $self;
}

# _parse($url, $pattern)
#
# Extracts the quote referenced by $url.
#
#   $url     - text containing the quote URL
#   $pattern - regular expression matched against $url; group 1 must
#              capture the quote URL, group 2 the quote ID
#
# Returns a hash reference with the keys URL, ID, TYPE, SOURCE and CONTENT
# on success. On failure, reports the problem through $self->error() and
# returns undef.
sub _parse {
    my $self = shift;
    my $url = shift;
    my $pattern = shift;

    # Bail out if the match fails: after a failed match $1/$2 still hold
    # the captures of some earlier successful match, so using them
    # unchecked would fetch and return an unrelated quote.
    unless (defined($url) && defined($pattern) && $url =~ m|$pattern|) {
        $self->error('URL does not match pattern');
        return undef;
    }

    # Copy the captures right away; any later regex would clobber them.
    my ($quote_url, $quote_id) = ($1, $2);

    unless (defined($quote_url) && defined($quote_id)) {
        $self->error('URL does not match pattern');
        return undef;
    }

    my $metadata = {
        'URL'     => $quote_url,
        'ID'      => $quote_id,
        'TYPE'    => 'quote',
        'SOURCE'  => $self->{'NAME'},
        'CONTENT' => undef,
    };

    # Get the HTML file containing the quote
    my $content = LWP::Simple::get(sprintf('http://www.ibash.de/zitat_%s.html', $quote_id));
    unless (defined($content)) {
        $self->error('Could not download quote');
        return undef;
    }

    my $parser = HTML::TokeParser->new(\$content);

    # Take the text of every <td class="quote"> cell. If several cells
    # match, the last one wins.
    while (my $tag = $parser->get_tag('td')) {
        my $attr = $tag->[1];
        if (exists($attr->{'class'}) && ($attr->{'class'} eq 'quote')) {
            $metadata->{'CONTENT'} = $parser->get_text('/td');
            # Convert in place from ISO-8859-1 to UTF-8.
            # NOTE(review): assumes the downloaded page is ISO-8859-1
            # encoded bytes -- confirm against the site.
            Encode::from_to($metadata->{'CONTENT'}, 'iso-8859-1', 'utf8');
        }
    }

    unless (defined($metadata->{'CONTENT'})) {
        $self->error('Could not extract quote content');
        return undef;
    }

    return $metadata;
}

1;