X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=videosite%2FCollegeHumorGrabber.pm;h=bbf54a688aac683fab225aca387df9641d98a041;hb=4f73c0b48954f5f4f30ee96779ecaa3fca847305;hp=2d9b37f5c57d29644e166b3f4d7e17cd3fffea8d;hpb=2044d5ff60e639a2eafbeed3a70224564e94c84c;p=videosite.git diff --git a/videosite/CollegeHumorGrabber.pm b/videosite/CollegeHumorGrabber.pm index 2d9b37f..bbf54a6 100644 --- a/videosite/CollegeHumorGrabber.pm +++ b/videosite/CollegeHumorGrabber.pm @@ -3,11 +3,12 @@ # (c) 2007 by Ralf Ertzinger # licensed under GNU GPL v2 -package CollegeHumorGrabber; +package videosite::CollegeHumorGrabber; -use GrabberBase; -@ISA = qw(GrabberBase); +use videosite::GrabberBase; +@ISA = qw(videosite::GrabberBase); +use videosite::HTMLHelper; use LWP::Simple qw(!get); use XML::Simple; use Data::Dumper; @@ -34,6 +35,7 @@ sub _parse { my $content; my $metadata = {}; my $p = XML::Simple->new(); + my @accum; my $t; $url =~ m|$pattern|; @@ -42,7 +44,7 @@ sub _parse { $metadata->{'URL'} = $url; $metadata->{'ID'} = $2; $metadata->{'TYPE'} = 'video'; - $metadata->{'SOURCE'} = 'collegehumor'; + $metadata->{'SOURCE'} = $self->{'NAME'}; $metadata->{'TITLE'} = undef; $metadata->{'DLURL'} = undef; @@ -58,7 +60,20 @@ sub _parse { } $metadata->{'DLURL'} = $t->{'video'}->{'file'}; - $metadata->{'TITLE'} = $t->{'video'}->{'caption'}; + + # The XML does not contain the full title of the video, for + # reasons possibly known to some jerk at CollegeHumor. + # So we'll have to parse the actual HTML, too. + $p = videosite::HTMLHelper->new(); + unless(defined($content = $p->load(sprintf('http://www.collegehumor.com/video:%s', $2)))) { + $self->error('Could not download HTML'); + return undef; + } + + $t = $p->findnodes('h1[@id="item_title"]'); + if (defined($t)) { + $metadata->{'TITLE'} = $t->{'_content'}->[0]; + } unless(defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) { $self->error('Could not extract download URL and title');