X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=videosite%2FCollegeHumorGrabber.pm;h=bbf54a688aac683fab225aca387df9641d98a041;hb=4f73c0b48954f5f4f30ee96779ecaa3fca847305;hp=c325ed359de56b04e8485b50280d04cd460ad667;hpb=35306939804820c033c8e25ab5aa5134f5dc5735;p=videosite.git diff --git a/videosite/CollegeHumorGrabber.pm b/videosite/CollegeHumorGrabber.pm index c325ed3..bbf54a6 100644 --- a/videosite/CollegeHumorGrabber.pm +++ b/videosite/CollegeHumorGrabber.pm @@ -1,8 +1,14 @@ -package CollegeHumorGrabber; +# Grabber for collegehumor.com +# +# (c) 2007 by Ralf Ertzinger +# licensed under GNU GPL v2 -use GrabberBase; -@ISA = qw(GrabberBase); +package videosite::CollegeHumorGrabber; +use videosite::GrabberBase; +@ISA = qw(videosite::GrabberBase); + +use videosite::HTMLHelper; use LWP::Simple qw(!get); use XML::Simple; use Data::Dumper; @@ -29,6 +35,7 @@ sub _parse { my $content; my $metadata = {}; my $p = XML::Simple->new(); + my @accum; my $t; $url =~ m|$pattern|; @@ -37,7 +44,7 @@ sub _parse { $metadata->{'URL'} = $url; $metadata->{'ID'} = $2; $metadata->{'TYPE'} = 'video'; - $metadata->{'SOURCE'} = 'collegehumor'; + $metadata->{'SOURCE'} = $self->{'NAME'}; $metadata->{'TITLE'} = undef; $metadata->{'DLURL'} = undef; @@ -53,7 +60,20 @@ sub _parse { } $metadata->{'DLURL'} = $t->{'video'}->{'file'}; - $metadata->{'TITLE'} = $t->{'video'}->{'caption'}; + + # The XML does not contain the full title of the video, for + # reasons possibly known to some jerk at CollegeHumor. + # So we'll have to parse the actual HTML, too. + $p = videosite::HTMLHelper->new(); + unless(defined($content = $p->load(sprintf('http://www.collegehumor.com/video:%s', $2)))) { + $self->error('Could not download HTML'); + return undef; + } + + $t = $p->findnodes('h1[@id="item_title"]'); + if (defined($t)) { + $metadata->{'TITLE'} = $t->{'_content'}->[0]; + } unless(defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) { $self->error('Could not extract download URL and title');