videosite/CollegeHumorGrabber.pm

   1 # Grabber for collegehumor.com
   2 #
   3 # (c) 2007 by Ralf Ertzinger <ralf@camperquake.de>
   4 # licensed under GNU GPL v2
   5
   6 package videosite::CollegeHumorGrabber;
   7
   8 use videosite::GrabberBase;
   9 @ISA = qw(videosite::GrabberBase);
  10
  11 use videosite::HTMLHelper;
  12 use LWP::Simple qw(!get);
  13 use XML::Simple;
  14 use Data::Dumper;
  15
  16 use strict;
  17
  # Constructor: builds a grabber object for collegehumor.com video pages.
  #
  # Object creation is delegated to the parent class constructor; this
  # method then sets the grabber NAME and the list of URL PATTERNS it
  # handles. The pattern captures the full video URL as $1 and the numeric
  # video ID as $2, which is the capture layout _parse() relies on.
  #
  # Returns the blessed grabber object.
  sub new {
      my $class = shift;
      my $self = $class->SUPER::new();

      $self->{'NAME'} = 'collegehumor';

      # The dots of the host name are escaped so that they only match a
      # literal '.', not an arbitrary character (which would let the
      # pattern accept look-alike hosts).
      $self->{'PATTERNS'} = ['(http://www\.collegehumor\.com/video:(\d+))'];

      bless($self, $class);

      # Not defined in this file; presumably inherited from
      # videosite::GrabberBase -- TODO confirm.
      $self->_prepare_parameters();

      return $self;
  }
  30
  # Extract the metadata for a single collegehumor.com video.
  #
  # Parameters:
  #   $url     - text containing the video URL
  #   $pattern - regular expression (as a string) that captures the full
  #              video URL as $1 and the video ID as $2
  #
  # Returns a hash reference with the keys URL, ID, TYPE, SOURCE, TITLE and
  # DLURL on success. On any failure an error is reported through
  # $self->error() and undef is returned.
  sub _parse {
      my $self = shift;
      my $url = shift;
      my $pattern = shift;

      # Copy the captures into lexicals immediately and bail out when the
      # URL does not match, instead of continuing with undefined or stale
      # $1/$2 values.
      my ($video_url, $id) = $url =~ m|$pattern|;
      unless (defined($video_url) && defined($id)) {
          $self->error('URL does not match the expected pattern');
          return undef;
      }

      my $metadata = {};
      $metadata->{'URL'} = $video_url;
      $metadata->{'ID'} = $id;
      $metadata->{'TYPE'} = 'video';
      $metadata->{'SOURCE'} = $self->{'NAME'};
      $metadata->{'TITLE'} = undef;
      $metadata->{'DLURL'} = undef;

      # Get the XML file containing the video metadata
      my $content = LWP::Simple::get(sprintf('http://www.collegehumor.com/moogaloop/video:%s', $id));
      unless (defined($content)) {
          $self->error('Could not download XML metadata');
          return undef;
      }

      # XMLin() dies on malformed XML rather than returning undef, so the
      # call is wrapped in eval to turn a parse failure into a reported error.
      my $xml = eval { XML::Simple->new()->XMLin($content) };
      unless (defined($xml)) {
          $self->error('Could not parse XML metadata');
          return undef;
      }

      # Only dereference when the document has the expected shape; otherwise
      # DLURL stays undef and the final check below reports the failure.
      if (ref($xml) eq 'HASH' && ref($xml->{'video'}) eq 'HASH') {
          $metadata->{'DLURL'} = $xml->{'video'}->{'file'};
      }

      # The XML does not contain the full title of the video, so the
      # actual HTML page has to be fetched and parsed as well.
      my $html = videosite::HTMLHelper->new();
      unless (defined($html->load(sprintf('http://www.collegehumor.com/video:%s', $id)))) {
          $self->error('Could not download HTML');
          return undef;
      }

      my $title_node = $html->findnodes('h1[@id="item_title"]');
      if (defined($title_node)) {
          $metadata->{'TITLE'} = $title_node->{'_content'}->[0];
      }

      unless (defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) {
          $self->error('Could not extract download URL and title');
          return undef;
      }

      return $metadata;
  }
  85
  86 1;