videosite/CollegeHumorGrabber.pm

   1 # Grabber for collegehumor.com
   2 #
   3 # (c) 2007 by Ralf Ertzinger <ralf@camperquake.de>
   4 # licensed under GNU GPL v2
   5
   6 package CollegeHumorGrabber;
   7
   8 use GrabberBase;
   9 @ISA = qw(GrabberBase);
  10
  11 use LWP::Simple qw(!get);
  12 use XML::Simple;
  13 use HTML::Parser;
  14 use Data::Dumper;
  15
  16 use strict;
  17
  # Constructor for the collegehumor.com grabber.
  #
  # Obtains the object from the parent class constructor, fills in the
  # grabber NAME and the list of URL PATTERNS it is responsible for, blesses
  # the object into the invoking class and finally calls
  # _prepare_parameters() on it (not defined in this file; presumably
  # inherited from GrabberBase).
  #
  # Returns the new object.
  sub new {
      my $class = shift;
      my $self = $class->SUPER::new();

      $self->{'NAME'} = 'collegehumor';

      # Capture group 1 is the complete video URL, capture group 2 the
      # numeric video ID. The dots of the host name are escaped so that they
      # match literal dots only instead of any character.
      $self->{'PATTERNS'} = ['(http://www\.collegehumor\.com/video:(\d+))'];

      bless($self, $class);
      $self->_prepare_parameters();

      return $self;
  }
  30
  # Collect the metadata of a single CollegeHumor video.
  #
  # Parameters:
  #   $url     - text containing the video URL
  #   $pattern - regular expression matching that URL; capture group 1 has to
  #              be the URL itself and capture group 2 the video ID
  #
  # Returns a hash reference with the keys URL, ID, TYPE, SOURCE, TITLE and
  # DLURL. On any failure the problem is reported through $self->error() and
  # undef is returned.
  sub _parse {
      my $self = shift;
      my $url = shift;
      my $pattern = shift;

      # Copy the captures into lexicals immediately and make sure the match
      # succeeded; $1/$2 would otherwise be undefined or left over from an
      # earlier match.
      my ($video_url, $video_id) = $url =~ m|$pattern|;
      unless (defined($video_url) && defined($video_id)) {
          $self->error('URL does not match the expected pattern');
          return undef;
      }

      my $metadata = {
          'URL'    => $video_url,
          'ID'     => $video_id,
          'TYPE'   => 'video',
          'SOURCE' => $self->{'NAME'},
          'TITLE'  => undef,
          'DLURL'  => undef,
      };

      # Get the XML file containing the video metadata
      my $xml = LWP::Simple::get(
          sprintf('http://www.collegehumor.com/moogaloop/video:%s', $video_id));
      unless (defined($xml)) {
          $self->error('Could not download XML metadata');
          return undef;
      }

      # XMLin() dies on malformed input instead of returning undef, so the
      # call is wrapped in eval to turn a parse failure into a regular error.
      my $tree = eval { XML::Simple->new()->XMLin($xml) };
      unless (defined($tree)) {
          $self->error('Could not parse XML metadata');
          return undef;
      }

      # Only follow video -> file if both levels really are hashes; with
      # strict refs anything else would be a fatal error. A missing entry
      # leaves DLURL undefined, which is reported at the end.
      if (ref($tree) eq 'HASH' && ref($tree->{'video'}) eq 'HASH') {
          $metadata->{'DLURL'} = $tree->{'video'}->{'file'};
      }

      # The XML does not contain the full title of the video, so the
      # regular HTML page has to be downloaded and parsed as well.
      my $html = LWP::Simple::get(
          sprintf('http://www.collegehumor.com/video:%s', $video_id));
      unless (defined($html)) {
          $self->error('Could not download HTML');
          return undef;
      }

      # Every reported start tag is pushed onto @meta_tags as
      # [tagname, attribute hash]; only <meta> tags are reported.
      my @meta_tags;
      my $parser = HTML::Parser->new(api_version => 3);
      $parser->handler(start => \@meta_tags, "tagname, attr");
      $parser->report_tags(qw(meta));
      $parser->utf8_mode(1);
      $parser->parse($html);
      $parser->eof();    # signal end of document so nothing stays buffered

      # Look for the title in the meta tags; the last matching tag wins
      foreach my $tag (@meta_tags) {
          my ($tagname, $attr) = @{$tag};
          next unless 'meta' eq $tagname;
          next unless exists($attr->{'name'}) and ('title' eq $attr->{'name'});
          $metadata->{'TITLE'} = $attr->{'content'};
      }

      unless (defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) {
          $self->error('Could not extract download URL and title');
          return undef;
      }

      return $metadata;
  }
  94
  95 1;