X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=videosite%2FCollegeHumorGrabber.pm;h=4388a312f38903887f07dfa29439d24bab874bfe;hb=400621043a9ee2217a07e2afe5cedf23a2491d69;hp=9fdb87e456e3c9f5fcc9b11a363920cee28a5e9a;hpb=d738e03f4f2d70a41eba8b77177826d1ff62f42b;p=videosite.git diff --git a/videosite/CollegeHumorGrabber.pm b/videosite/CollegeHumorGrabber.pm index 9fdb87e..4388a31 100644 --- a/videosite/CollegeHumorGrabber.pm +++ b/videosite/CollegeHumorGrabber.pm @@ -1,8 +1,14 @@ +# Grabber for collegehumor.com +# +# (c) 2007 by Ralf Ertzinger +# licensed under GNU GPL v2 + package CollegeHumorGrabber; use GrabberBase; @ISA = qw(GrabberBase); +use HTMLHelper; use LWP::Simple qw(!get); use XML::Simple; use Data::Dumper; @@ -29,6 +35,7 @@ sub _parse { my $content; my $metadata = {}; my $p = XML::Simple->new(); + my @accum; my $t; $url =~ m|$pattern|; @@ -36,7 +43,8 @@ sub _parse { $metadata->{'URL'} = $url; $metadata->{'ID'} = $2; - $metadata->{'TYPE'} = 'collegehumor'; + $metadata->{'TYPE'} = 'video'; + $metadata->{'SOURCE'} = $self->{'NAME'}; $metadata->{'TITLE'} = undef; $metadata->{'DLURL'} = undef; @@ -52,7 +60,20 @@ sub _parse { } $metadata->{'DLURL'} = $t->{'video'}->{'file'}; - $metadata->{'TITLE'} = $t->{'video'}->{'caption'}; + + # The XML does not contain the full title of the video, for + # reasons possibly known to some jerk at CollegeHumor. + # So we'll have to parse the actual HTML, too. + $p = HTMLHelper->new(); + unless(defined($content = $p->load(sprintf('http://www.collegehumor.com/video:%s', $2)))) { + $self->error('Could not download HTML'); + return undef; + } + + $t = $p->findnodes('meta[@name="title"]'); + if (defined($t)) { + $metadata->{'TITLE'} = $t->{'content'}; + } unless(defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) { $self->error('Could not extract download URL and title');