# (c) 2008 by Ralf Ertzinger
# licensed under GNU GPL v2
#
# Grabber for video.yahoo.com
#
# Provenance: added as videosite/YahooGrabber.pm by commit
# 5dec08312ac5618cb2b20a497f7b96142d8d48ca
# ("Add grabber for video.yahoo.com", Ralf Ertzinger, Sun, 27 Apr 2008).

package YahooGrabber;

use strict;
use warnings;

use GrabberBase;
our @ISA = qw(GrabberBase);

use LWP::Simple qw(!get);
use XML::Simple;
use HTML::Parser;
use Data::Dumper;

# sprintf template for the playlist XML of one video; %s is the video id
# captured from the watch URL.
my $PLAYLIST_URL_FORMAT =
    'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=%s';

# The playlist service answers without an XML declaration, so one is
# prepended before the data is handed to XML::Simple.
# NOTE(review): the literal was empty in the text this file was recovered
# from; a minimal declaration is used here -- confirm against the original.
my $XML_DECLARATION = '<?xml version="1.0"?>' . "\n";

# Constructor.
#
# Builds the object through the parent class constructor, then sets the
# grabber name ('yahoo') and the list of URL patterns it handles. The single
# pattern captures the whole watch URL as group 1 and the numeric video id
# as group 2. Finally calls _prepare_parameters(), which is not defined in
# this file (presumably inherited from GrabberBase).
#
# Returns the blessed object.
sub new {
    my $class = shift;
    my $self = $class->SUPER::new();

    $self->{'NAME'} = 'yahoo';
    $self->{'PATTERNS'} = ['(http://video\.yahoo\.com/watch/\d+/(\d+))'];

    bless($self, $class);
    $self->_prepare_parameters();

    return $self;
}

# Walk a nested hash structure along the given keys.
#
# Returns the hash reference found at the end of the path, or undef when a
# level is missing or is not a hash reference. Never autovivifies.
sub _hash_path {
    my ($node, @keys) = @_;

    foreach my $key (@keys) {
        return undef unless ref($node) eq 'HASH' && exists($node->{$key});
        $node = $node->{$key};
    }

    return (ref($node) eq 'HASH') ? $node : undef;
}

# Extract the metadata of one video.
#
# Parameters:
#   $url     - text containing the video URL
#   $pattern - regular expression with two capture groups: the full watch
#              URL and the video id (see PATTERNS in new())
#
# Returns a hash reference with the keys URL, ID, TYPE, SOURCE, TITLE and
# DLURL. On any failure the problem is reported through $self->error() and
# undef is returned.
#
# Performs two HTTP requests: one for the playlist XML (download URL) and
# one for the watch page itself (title).
sub _parse {
    my $self = shift;
    my $url = shift;
    my $pattern = shift;

    # An empty pattern would make perl silently reuse the last successful
    # regex, so it is rejected together with an undefined one.
    unless (defined($url) && defined($pattern) && length($pattern)) {
        $self->error('Could not extract video id from URL');
        return undef;
    }

    # Take the captures in list context and keep them in lexicals; $1/$2
    # would hold stale values after a failed match.
    my ($page_url, $video_id) = $url =~ m{$pattern};
    unless (defined($page_url) && defined($video_id)) {
        $self->error('Could not extract video id from URL');
        return undef;
    }

    my $metadata = {
        'URL'    => $page_url,
        'ID'     => $video_id,
        'TYPE'   => 'video',
        'SOURCE' => $self->{'NAME'},
        'TITLE'  => undef,
        'DLURL'  => undef,
    };

    # Get the XML file containing the video metadata
    my $xml = LWP::Simple::get(sprintf($PLAYLIST_URL_FORMAT, $video_id));
    unless (defined($xml)) {
        $self->error('Could not download XML metadata');
        return undef;
    }

    # There is no XML header in the data, which makes XML::Simple unhappy
    $xml = $XML_DECLARATION . $xml unless $xml =~ m{\A<\?xml\b};

    # XMLin() dies on malformed input instead of returning undef, so the
    # call is wrapped in eval to turn that into a regular error report.
    my $tree = eval { XML::Simple->new()->XMLin($xml, KeepRoot => 1) };
    unless (defined($tree)) {
        $self->error('Could not parse XML metadata');
        return undef;
    }

    # The download URL is the concatenation of two values of the STREAM
    # node. It is only set when both are present, so that the final check
    # below can detect a missing URL.
    my $stream = _hash_path($tree, 'DATA', 'SEQUENCE-ITEM', 'STREAM');
    if (defined($stream)
        && defined($stream->{'APP'})
        && defined($stream->{'FULLPATH'})) {
        $metadata->{'DLURL'} = $stream->{'APP'} . $stream->{'FULLPATH'};
    }

    # The XML does not contain the title of the video, so the actual HTML
    # of the watch page has to be downloaded and parsed, too.
    my $html = LWP::Simple::get($page_url);
    unless (defined($html)) {
        $self->error('Could not download HTML');
        return undef;
    }

    # Collect every <meta> start tag as a [tagname, attributes] pair.
    my @meta_tags;
    my $parser = HTML::Parser->new(api_version => 3);

    $parser->handler(start => \@meta_tags, "tagname, attr");
    $parser->report_tags(qw(meta));
    $parser->utf8_mode(1);
    $parser->parse($html);
    $parser->eof();    # signal end of document so buffered input is flushed

    # Look for the title in the meta tags
    foreach my $tag (@meta_tags) {
        my ($tagname, $attr) = @{$tag};

        next unless 'meta' eq $tagname;
        next unless defined($attr->{'name'}) && ('title' eq $attr->{'name'});
        next unless defined($attr->{'content'});

        $metadata->{'TITLE'} = $attr->{'content'};
        last;
    }

    unless (defined($metadata->{'DLURL'}) && defined($metadata->{'TITLE'})) {
        $self->error('Could not extract download URL and title');
        return undef;
    }

    return $metadata;
}

1;