X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=videosite%2FYouTubeGrabber.pm;h=5d328de82f6b10f23d70df62448161852f5bd15d;hb=c41cdd5161342c69724537a5094fb9e270f48c55;hp=2ed52a757fcd0d5a1c0ba143e15c86683ff2a2b9;hpb=7b6cd96a0ad9a3bed4f771df3cc3d541470427f8;p=videosite.git diff --git a/videosite/YouTubeGrabber.pm b/videosite/YouTubeGrabber.pm index 2ed52a7..5d328de 100644 --- a/videosite/YouTubeGrabber.pm +++ b/videosite/YouTubeGrabber.pm @@ -14,6 +14,8 @@ use videosite::GrabberBase; use LWP::UserAgent; use HTTP::Cookies; use HTML::TokeParser; +use HTML::Entities qw(decode_entities); +use Encode; use Data::Dumper; use videosite::JSArrayParser; @@ -24,8 +26,11 @@ sub new { my $self = $class->SUPER::new(); $self->{'NAME'} = 'youtube'; - $self->{'PATTERNS'} = ['(http://(?:[-a-zA-Z0-9_.]+\.)*youtube.(?:com|de|co.uk)/watch\?(?:.+=.+&)*v=([-a-zA-Z0-9_]+))', - '(http://(?:[-a-zA-Z0-9_.]+\.)*youtube.(?:com|de|co.uk)/v/([-a-zA-Z0-9_]+))']; + $self->{'PATTERNS'} = ['(http://(?:[-a-zA-Z0-9_.]+\.)*youtube\.(?:com|de|co.uk)/watch\?(?:.+=.+&)*v=([-a-zA-Z0-9_]+))', + '(http://(?:[-a-zA-Z0-9_.]+\.)*youtube\.(?:com|de|co.uk)/watch\#\!v=([-a-zA-Z0-9_]+))', + '(http://(?:[-a-zA-Z0-9_.]+\.)*youtube\.(?:com|de|co.uk)/v/([-a-zA-Z0-9_]+))', + '(http://(?:[-a-zA-Z0-9_.]+\.)*youtu\.be/([-a-zA-Z0-9_]+))', + '(http://(?:[-a-zA-Z0-9_.]+\.)*youtube\.(?:com|de|co.uk)/user/\w+\?.*/([-a-zA-Z0-9_]+))']; $self->{'_PARAMS'} = { 'QUALITY' => ['normal', 'Quality of the video to download.', { 'normal' => 'standard resolution flash video', @@ -56,10 +61,10 @@ sub _parse { my $videourl; my $quality = $self->_getval('QUALITY'); my %preflist = ( - 'hd' => [22, 35, 18, 34, 6, 5], - 'h264' => [18, 34, 22, 35, 6, 5], - 'high' => [34, 35, 18, 22, 6, 5], - 'normal' => [6, 5, 34, 35, 18, 22]); + 'hd' => [37, 22, 35, 18, 34, 6, 5], + 'h264' => [18, 34, 37, 22, 35, 6, 5], + 'high' => [34, 35, 18, 37, 22, 6, 5], + 'normal' => [6, 5, 34, 35, 18, 22, 37]); my $preflist; my $jsp; @@ -73,6 +78,8 @@ sub _parse { $metadata->{'TITLE'} = undef; $metadata->{'DLURL'} = undef; + $self->debug("Matched id %s from pattern %s", $2, $pattern); + $preflist = $preflist{$quality}; $self->debug("Quality: %s, preflist: [%s]", $quality, join(", ", @{$preflist})); @@ -100,17 +107,25 @@ sub _parse { if ('meta' eq $tag->[0]) { if ('title' eq $tag->[1]->{'name'}) { $metadata->{'TITLE'} = $tag->[1]->{'content'}; + # Convert HTML entities in the title. This is a bit convoluted. + $metadata->{'TITLE'} = encode("utf8", + decode_entities( + decode("utf8", $metadata->{'TITLE'}))); + $self->debug('Title found: %s', $metadata->{'TITLE'}); } } elsif ('script' eq $tag->[0]) { + my %urls; + $e = $p->get_text(); $self->debug("Found script: %s", $e); + if ($e =~ m|\x27SWF_ARGS\x27:\s+(.+),|) { - my %urls; my $args = $1; $self->debug("Found SWF_ARGS: %s", $args); $jsp = videosite::JSArrayParser->new(); + $self->debug("Using %s to parse", ref($jsp)); $r = $jsp->parse($args); unless(defined($r)) { @@ -125,6 +140,8 @@ sub _parse { $urls =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge; %urls = split(/[\|,]/, $urls); + $self->debug("Pagetype: old (SWF_ARGS), fmt_url_map"); + } elsif (exists($r->{'t'}) and ($r->{'t'} ne '')) { my $thash = $r->{'t'}; @@ -142,24 +159,37 @@ sub _parse { $_[0], $thash); } + $self->debug("Pagetype: 2009 (SWF_ARGS), t with fmt_map"); + } else { $urls{5} = sprintf('http://www.youtube.com/get_video?video_id=%s&t=%s', $metadata->{'ID'}, $thash); + $self->debug("Pagetype: 2009 (SWF_ARGS), t without fmt_map"); } } else { $self->error('Neither fmt_url_map nor t found in video information hash'); return undef; } - $self->debug("Found quality levels [%s]", join(", ", keys(%urls))); + } elsif ($e =~ m|var swfHTML = .*fmt_url_map=([^\&]+)\&|) { + my $urls = $1; + $self->debug("Video has fmt_url_map: %s", $urls); + + $urls =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge; + %urls = split(/[\|,]/, $urls); + $self->debug("Pagetype: 2010 (swfHTML), fmt_url_map"); + } + if (%urls) { foreach (keys(%urls)) { if ($_ == 35) { $self->debug('Found flv,h264,large: %s', $urls{$_}); } elsif ($_ == 34) { $self->debug('Found flv,h264: %s', $urls{$_}); } elsif ($_ == 22) { - $self->debug('Found mp4,h264,large: %s', $urls{$_}); + $self->debug('Found mp4,h264,720p: %s', $urls{$_}); + } elsif ($_ == 37) { + $self->debug('Found mp4,h264,1080p: %s', $urls{$_}); } elsif ($_ == 18) { $self->debug('Found mp4,h264: %s', $urls{$_}); } elsif ($_ == 5) { @@ -181,7 +211,7 @@ sub _parse { last SWF_ARGS; } } elsif ('div' eq $tag->[0]) { - if (exists($tag->[1]->{'class'}) and ('errorBox' eq $tag->[1]->{'class'})) { + if (exists($tag->[1]->{'class'}) and ('yt-alert-content' eq $tag->[1]->{'class'})) { $self->error("Could not get video data for youtube %s: %s", $metadata->{'ID'}, $p->get_trimmed_text()); return undef; @@ -232,6 +262,8 @@ sub __login { return undef; } + $self->debug("Got a cookie"); + $r = $ua->get($videourl); if ($r->base->as_string() =~ m,/verify_age,) { $self->debug("Looking for session token...");