From 7b6cd96a0ad9a3bed4f771df3cc3d541470427f8 Mon Sep 17 00:00:00 2001 From: Ralf Ertzinger Date: Sat, 14 Nov 2009 15:33:39 +0100 Subject: [PATCH] - New YouTube scheme, as some videos did not conform to the old scheme. This is getting pretty embarrassing, as I'm now down to having written a parser for textual representations of javascript hashes to get the information I need. This would be much easier if YouTube would use XML files just like everybody else. Thank you. --- videosite/JSArrayParser.pm | 55 ++++ videosite/YouTubeGrabber.pm | 54 +++- videosite/jsarray.pm | 632 ++++++++++++++++++++++++++++++++++++++++++++ videosite/jsarray.yp | 15 ++ 4 files changed, 750 insertions(+), 6 deletions(-) create mode 100644 videosite/JSArrayParser.pm create mode 100644 videosite/jsarray.pm create mode 100644 videosite/jsarray.yp diff --git a/videosite/JSArrayParser.pm b/videosite/JSArrayParser.pm new file mode 100644 index 0000000..20e6e0e --- /dev/null +++ b/videosite/JSArrayParser.pm @@ -0,0 +1,55 @@ +# +# A helper class for parsing textual JS hashes into perl +# hashes +# +# The parser is in jsarray.yp, to regenerate you'll need the Parse::Yapp +# package. 
Use 'yapp -m videosite::jsarray -s jsarray.yp' to regenerate +# + +package videosite::JSArrayParser; + +use Parse::Lex; +use videosite::jsarray; +use strict; + +my @tokens = ( + COLON => '[:]', + RIGHTC => '[\}]', + LEFTC => '[\{]', + QUOTE => '[\"]', + COMMA => '[,]', + ID => '[\w_%\.\+-]+' +); + +sub new { + my $class = shift; + my $self = { + '_PARSER' => videosite::jsarray->new(), + '_LEXER' => Parse::Lex->new(@tokens), + }; + + return bless($self, $class); +} + +sub parse { + my $self = shift; + my $s = shift; + my @result; + my $l = $self->{'_LEXER'}; + + $l->from($s); + @result = $self->{'_PARSER'}->YYParse( + yylex => sub { + my $tok = $l->next(); + return ('', undef) unless $tok; + return ('', undef) if $l->eoi(); + return ($tok->name(), $tok->text()); + }, + yyerror => sub { + $_[0]->YYAbort(); + }, + yydebug => 0x0); + return $result[0]?{@{$result[0]}}:undef; +} + +1; diff --git a/videosite/YouTubeGrabber.pm b/videosite/YouTubeGrabber.pm index 33ca467..2ed52a7 100644 --- a/videosite/YouTubeGrabber.pm +++ b/videosite/YouTubeGrabber.pm @@ -15,6 +15,7 @@ use LWP::UserAgent; use HTTP::Cookies; use HTML::TokeParser; use Data::Dumper; +use videosite::JSArrayParser; use strict; @@ -60,6 +61,7 @@ sub _parse { 'high' => [34, 35, 18, 22, 6, 5], 'normal' => [6, 5, 34, 35, 18, 22]); my $preflist; + my $jsp; $url =~ m|$pattern|; $url = $1; @@ -94,7 +96,7 @@ sub _parse { $p = HTML::TokeParser->new(\$content); - while ($tag = $p->get_tag('div', 'meta', 'script')) { + SWF_ARGS: while ($tag = $p->get_tag('div', 'meta', 'script')) { if ('meta' eq $tag->[0]) { if ('title' eq $tag->[1]->{'name'}) { $metadata->{'TITLE'} = $tag->[1]->{'content'}; @@ -103,13 +105,52 @@ sub _parse { } elsif ('script' eq $tag->[0]) { $e = $p->get_text(); $self->debug("Found script: %s", $e); - if ($e =~ m|\x22fmt_url_map\x22\s*:\s*\x22([^\x22]+)\x22|) { - $self->debug("Found fmt_url_map: %s", $1); - my $urls = $1; + if ($e =~ m|\x27SWF_ARGS\x27:\s+(.+),|) { my %urls; + my $args = $1; - $urls =~ 
s/%([[:xdigit:]]{2})/chr(hex($1))/ge; - %urls = split(/[\|,]/, $urls); + $self->debug("Found SWF_ARGS: %s", $args); + $jsp = videosite::JSArrayParser->new(); + $r = $jsp->parse($args); + + unless(defined($r)) { + $self->error("Found information hash, but could not parse"); + return undef; + } + + if (exists($r->{'fmt_url_map'}) and ($r->{'fmt_url_map'} ne '')) { + my $urls = $r->{'fmt_url_map'}; + + $self->debug("Video has fmt_url_map: %s", $urls); + + $urls =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge; + %urls = split(/[\|,]/, $urls); + } elsif (exists($r->{'t'}) and ($r->{'t'} ne '')) { + my $thash = $r->{'t'}; + + if (exists($r->{'fmt_map'}) && ($r->{'fmt_map'} ne '')) { + my $fmt = $r->{'fmt_map'}; + my @fmt; + + $self->debug('Video has fmt_map'); + $fmt =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge; + @fmt = split(/,/, $fmt); + foreach (@fmt) { + my ($fmtid) = split(/\//); # take the format id explicitly; do not rely on the deprecated implicit split into @_ + $urls{$fmtid} = sprintf('http://www.youtube.com/get_video?video_id=%s&fmt=%d&t=%s', + $metadata->{'ID'}, + $fmtid, + $thash); + } + } else { + $urls{5} = sprintf('http://www.youtube.com/get_video?video_id=%s&t=%s', + $metadata->{'ID'}, + $thash); + } + } else { + $self->error('Neither fmt_url_map nor t found in video information hash'); + return undef; + } $self->debug("Found quality levels [%s]", join(", ", keys(%urls))); foreach (keys(%urls)) { @@ -137,6 +178,7 @@ sub _parse { } $self->debug('URL found: %s', $metadata->{'DLURL'}); + last SWF_ARGS; } } elsif ('div' eq $tag->[0]) { if (exists($tag->[1]->{'class'}) and ('errorBox' eq $tag->[1]->{'class'})) { diff --git a/videosite/jsarray.pm b/videosite/jsarray.pm new file mode 100644 index 0000000..6f9075a --- /dev/null +++ b/videosite/jsarray.pm @@ -0,0 +1,632 @@ +#################################################################### +# +# This file was generated using Parse::Yapp version 1.05. +# +# Don't edit this file, use source file instead. +# +# ANY CHANGE MADE HERE WILL BE LOST ! 
+# +#################################################################### +package videosite::jsarray; +use vars qw ( @ISA ); +use strict; + +@ISA= qw ( Parse::Yapp::Driver ); +#Included Parse/Yapp/Driver.pm file---------------------------------------- +{ +# +# Module Parse::Yapp::Driver +# +# This module is part of the Parse::Yapp package available on your +# nearest CPAN +# +# Any use of this module in a standalone parser make the included +# text under the same copyright as the Parse::Yapp module itself. +# +# This notice should remain unchanged. +# +# (c) Copyright 1998-2001 Francois Desarmenien, all rights reserved. +# (see the pod text in Parse::Yapp module for use and distribution rights) +# + +package Parse::Yapp::Driver; + +require 5.004; + +use strict; + +use vars qw ( $VERSION $COMPATIBLE $FILENAME ); + +$VERSION = '1.05'; +$COMPATIBLE = '0.07'; +$FILENAME=__FILE__; + +use Carp; + +#Known parameters, all starting with YY (leading YY will be discarded) +my(%params)=(YYLEX => 'CODE', 'YYERROR' => 'CODE', YYVERSION => '', + YYRULES => 'ARRAY', YYSTATES => 'ARRAY', YYDEBUG => ''); +#Mandatory parameters +my(@params)=('LEX','RULES','STATES'); + +sub new { + my($class)=shift; + my($errst,$nberr,$token,$value,$check,$dotpos); + my($self)={ ERROR => \&_Error, + ERRST => \$errst, + NBERR => \$nberr, + TOKEN => \$token, + VALUE => \$value, + DOTPOS => \$dotpos, + STACK => [], + DEBUG => 0, + CHECK => \$check }; + + _CheckParams( [], \%params, \@_, $self ); + + exists($$self{VERSION}) + and $$self{VERSION} < $COMPATIBLE + and croak "Yapp driver version $VERSION ". + "incompatible with version $$self{VERSION}:\n". 
+ "Please recompile parser module."; + + ref($class) + and $class=ref($class); + + bless($self,$class); +} + +sub YYParse { + my($self)=shift; + my($retval); + + _CheckParams( \@params, \%params, \@_, $self ); + + if($$self{DEBUG}) { + _DBLoad(); + $retval = eval '$self->_DBParse()';#Do not create stab entry on compile + $@ and die $@; + } + else { + $retval = $self->_Parse(); + } + $retval +} + +sub YYData { + my($self)=shift; + + exists($$self{USER}) + or $$self{USER}={}; + + $$self{USER}; + +} + +sub YYErrok { + my($self)=shift; + + ${$$self{ERRST}}=0; + undef; +} + +sub YYNberr { + my($self)=shift; + + ${$$self{NBERR}}; +} + +sub YYRecovering { + my($self)=shift; + + ${$$self{ERRST}} != 0; +} + +sub YYAbort { + my($self)=shift; + + ${$$self{CHECK}}='ABORT'; + undef; +} + +sub YYAccept { + my($self)=shift; + + ${$$self{CHECK}}='ACCEPT'; + undef; +} + +sub YYError { + my($self)=shift; + + ${$$self{CHECK}}='ERROR'; + undef; +} + +sub YYSemval { + my($self)=shift; + my($index)= $_[0] - ${$$self{DOTPOS}} - 1; + + $index < 0 + and -$index <= @{$$self{STACK}} + and return $$self{STACK}[$index][1]; + + undef; #Invalid index +} + +sub YYCurtok { + my($self)=shift; + + @_ + and ${$$self{TOKEN}}=$_[0]; + ${$$self{TOKEN}}; +} + +sub YYCurval { + my($self)=shift; + + @_ + and ${$$self{VALUE}}=$_[0]; + ${$$self{VALUE}}; +} + +sub YYExpect { + my($self)=shift; + + keys %{$self->{STATES}[$self->{STACK}[-1][0]]{ACTIONS}} +} + +sub YYLexer { + my($self)=shift; + + $$self{LEX}; +} + + +################# +# Private stuff # +################# + + +sub _CheckParams { + my($mandatory,$checklist,$inarray,$outhash)=@_; + my($prm,$value); + my($prmlst)={}; + + while(($prm,$value)=splice(@$inarray,0,2)) { + $prm=uc($prm); + exists($$checklist{$prm}) + or croak("Unknow parameter '$prm'"); + ref($value) eq $$checklist{$prm} + or croak("Invalid value for parameter '$prm'"); + $prm=unpack('@2A*',$prm); + $$outhash{$prm}=$value; + } + for (@$mandatory) { + exists($$outhash{$_}) + or 
croak("Missing mandatory parameter '".lc($_)."'"); + } +} + +sub _Error { + print "Parse error.\n"; +} + +sub _DBLoad { + { + no strict 'refs'; + + exists(${__PACKAGE__.'::'}{_DBParse})#Already loaded ? + and return; + } + my($fname)=__FILE__; + my(@drv); + open(DRV,"<$fname") or die "Report this as a BUG: Cannot open $fname"; + while() { + /^\s*sub\s+_Parse\s*{\s*$/ .. /^\s*}\s*#\s*_Parse\s*$/ + and do { + s/^#DBG>//; + push(@drv,$_); + } + } + close(DRV); + + $drv[0]=~s/_P/_DBP/; + eval join('',@drv); +} + +#Note that for loading debugging version of the driver, +#this file will be parsed from 'sub _Parse' up to '}#_Parse' inclusive. +#So, DO NOT remove comment at end of sub !!! +sub _Parse { + my($self)=shift; + + my($rules,$states,$lex,$error) + = @$self{ 'RULES', 'STATES', 'LEX', 'ERROR' }; + my($errstatus,$nberror,$token,$value,$stack,$check,$dotpos) + = @$self{ 'ERRST', 'NBERR', 'TOKEN', 'VALUE', 'STACK', 'CHECK', 'DOTPOS' }; + +#DBG> my($debug)=$$self{DEBUG}; +#DBG> my($dbgerror)=0; + +#DBG> my($ShowCurToken) = sub { +#DBG> my($tok)='>'; +#DBG> for (split('',$$token)) { +#DBG> $tok.= (ord($_) < 32 or ord($_) > 126) +#DBG> ? sprintf('<%02X>',ord($_)) +#DBG> : $_; +#DBG> } +#DBG> $tok.='<'; +#DBG> }; + + $$errstatus=0; + $$nberror=0; + ($$token,$$value)=(undef,undef); + @$stack=( [ 0, undef ] ); + $$check=''; + + while(1) { + my($actions,$act,$stateno); + + $stateno=$$stack[-1][0]; + $actions=$$states[$stateno]; + +#DBG> print STDERR ('-' x 40),"\n"; +#DBG> $debug & 0x2 +#DBG> and print STDERR "In state $stateno:\n"; +#DBG> $debug & 0x08 +#DBG> and print STDERR "Stack:[". +#DBG> join(',',map { $$_[0] } @$stack). +#DBG> "]\n"; + + + if (exists($$actions{ACTIONS})) { + + defined($$token) + or do { + ($$token,$$value)=&$lex($self); +#DBG> $debug & 0x01 +#DBG> and print STDERR "Need token. Got ".&$ShowCurToken."\n"; + }; + + $act= exists($$actions{ACTIONS}{$$token}) + ? $$actions{ACTIONS}{$$token} + : exists($$actions{DEFAULT}) + ? 
$$actions{DEFAULT} + : undef; + } + else { + $act=$$actions{DEFAULT}; +#DBG> $debug & 0x01 +#DBG> and print STDERR "Don't need token.\n"; + } + + defined($act) + and do { + + $act > 0 + and do { #shift + +#DBG> $debug & 0x04 +#DBG> and print STDERR "Shift and go to state $act.\n"; + + $$errstatus + and do { + --$$errstatus; + +#DBG> $debug & 0x10 +#DBG> and $dbgerror +#DBG> and $$errstatus == 0 +#DBG> and do { +#DBG> print STDERR "**End of Error recovery.\n"; +#DBG> $dbgerror=0; +#DBG> }; + }; + + + push(@$stack,[ $act, $$value ]); + + $$token ne '' #Don't eat the eof + and $$token=$$value=undef; + next; + }; + + #reduce + my($lhs,$len,$code,@sempar,$semval); + ($lhs,$len,$code)=@{$$rules[-$act]}; + +#DBG> $debug & 0x04 +#DBG> and $act +#DBG> and print STDERR "Reduce using rule ".-$act." ($lhs,$len): "; + + $act + or $self->YYAccept(); + + $$dotpos=$len; + + unpack('A1',$lhs) eq '@' #In line rule + and do { + $lhs =~ /^\@[0-9]+\-([0-9]+)$/ + or die "In line rule name '$lhs' ill formed: ". + "report it as a BUG.\n"; + $$dotpos = $1; + }; + + @sempar = $$dotpos + ? map { $$_[1] } @$stack[ -$$dotpos .. -1 ] + : (); + + $semval = $code ? &$code( $self, @sempar ) + : @sempar ? 
$sempar[0] : undef; + + splice(@$stack,-$len,$len); + + $$check eq 'ACCEPT' + and do { + +#DBG> $debug & 0x04 +#DBG> and print STDERR "Accept.\n"; + + return($semval); + }; + + $$check eq 'ABORT' + and do { + +#DBG> $debug & 0x04 +#DBG> and print STDERR "Abort.\n"; + + return(undef); + + }; + +#DBG> $debug & 0x04 +#DBG> and print STDERR "Back to state $$stack[-1][0], then "; + + $$check eq 'ERROR' + or do { +#DBG> $debug & 0x04 +#DBG> and print STDERR +#DBG> "go to state $$states[$$stack[-1][0]]{GOTOS}{$lhs}.\n"; + +#DBG> $debug & 0x10 +#DBG> and $dbgerror +#DBG> and $$errstatus == 0 +#DBG> and do { +#DBG> print STDERR "**End of Error recovery.\n"; +#DBG> $dbgerror=0; +#DBG> }; + + push(@$stack, + [ $$states[$$stack[-1][0]]{GOTOS}{$lhs}, $semval ]); + $$check=''; + next; + }; + +#DBG> $debug & 0x04 +#DBG> and print STDERR "Forced Error recovery.\n"; + + $$check=''; + + }; + + #Error + $$errstatus + or do { + + $$errstatus = 1; + &$error($self); + $$errstatus # if 0, then YYErrok has been called + or next; # so continue parsing + +#DBG> $debug & 0x10 +#DBG> and do { +#DBG> print STDERR "**Entering Error recovery.\n"; +#DBG> ++$dbgerror; +#DBG> }; + + ++$$nberror; + + }; + + $$errstatus == 3 #The next token is not valid: discard it + and do { + $$token eq '' # End of input: no hope + and do { +#DBG> $debug & 0x10 +#DBG> and print STDERR "**At eof: aborting.\n"; + return(undef); + }; + +#DBG> $debug & 0x10 +#DBG> and print STDERR "**Dicard invalid token ".&$ShowCurToken.".\n"; + + $$token=$$value=undef; + }; + + $$errstatus=3; + + while( @$stack + and ( not exists($$states[$$stack[-1][0]]{ACTIONS}) + or not exists($$states[$$stack[-1][0]]{ACTIONS}{error}) + or $$states[$$stack[-1][0]]{ACTIONS}{error} <= 0)) { + +#DBG> $debug & 0x10 +#DBG> and print STDERR "**Pop state $$stack[-1][0].\n"; + + pop(@$stack); + } + + @$stack + or do { + +#DBG> $debug & 0x10 +#DBG> and print STDERR "**No state left on stack: aborting.\n"; + + return(undef); + }; + + #shift the error token 
+ +#DBG> $debug & 0x10 +#DBG> and print STDERR "**Shift \$error token and go to state ". +#DBG> $$states[$$stack[-1][0]]{ACTIONS}{error}. +#DBG> ".\n"; + + push(@$stack, [ $$states[$$stack[-1][0]]{ACTIONS}{error}, undef ]); + + } + + #never reached + croak("Error in driver logic. Please, report it as a BUG"); + +}#_Parse +#DO NOT remove comment + +1; + +} +#End of include-------------------------------------------------- + + + + +sub new { + my($class)=shift; + ref($class) + and $class=ref($class); + + my($self)=$class->SUPER::new( yyversion => '1.05', + yystates => +[ + {#State 0 + ACTIONS => { + 'LEFTC' => 2 + }, + GOTOS => { + 'array' => 1 + } + }, + {#State 1 + ACTIONS => { + '' => 3 + } + }, + {#State 2 + ACTIONS => { + 'QUOTE' => 7 + }, + GOTOS => { + 'quotestring' => 4, + 'kvpair' => 5, + 'kvlist' => 6 + } + }, + {#State 3 + DEFAULT => 0 + }, + {#State 4 + ACTIONS => { + 'COLON' => 8 + } + }, + {#State 5 + DEFAULT => -3 + }, + {#State 6 + ACTIONS => { + 'COMMA' => 9, + 'RIGHTC' => 10 + } + }, + {#State 7 + ACTIONS => { + 'ID' => 11, + 'QUOTE' => 12 + } + }, + {#State 8 + ACTIONS => { + 'QUOTE' => 7 + }, + GOTOS => { + 'quotestring' => 13 + } + }, + {#State 9 + ACTIONS => { + 'QUOTE' => 7 + }, + GOTOS => { + 'quotestring' => 4, + 'kvpair' => 14 + } + }, + {#State 10 + DEFAULT => -1 + }, + {#State 11 + ACTIONS => { + 'QUOTE' => 15 + } + }, + {#State 12 + DEFAULT => -5 + }, + {#State 13 + DEFAULT => -4 + }, + {#State 14 + DEFAULT => -2 + }, + {#State 15 + DEFAULT => -6 + } +], + yyrules => +[ + [#Rule 0 + '$start', 2, undef + ], + [#Rule 1 + 'array', 3, +sub +#line 3 "jsarray.yp" +{ return $_[2] } + ], + [#Rule 2 + 'kvlist', 3, +sub +#line 6 "jsarray.yp" +{ return [ @{$_[1]}, @{$_[3]} ] } + ], + [#Rule 3 + 'kvlist', 1, undef + ], + [#Rule 4 + 'kvpair', 3, +sub +#line 10 "jsarray.yp" +{ return [ $_[1], $_[3] ] } + ], + [#Rule 5 + 'quotestring', 2, +sub +#line 13 "jsarray.yp" +{ return "" } + ], + [#Rule 6 + 'quotestring', 3, +sub +#line 14 "jsarray.yp" +{ return 
$_[2] } + ] +], + @_); + bless($self,$class); +} + +#line 15 "jsarray.yp" + + +1; diff --git a/videosite/jsarray.yp b/videosite/jsarray.yp new file mode 100644 index 0000000..3b32aca --- /dev/null +++ b/videosite/jsarray.yp @@ -0,0 +1,15 @@ +%% +array: + LEFTC kvlist RIGHTC { return $_[2] }; + +kvlist: + kvlist COMMA kvpair { return [ @{$_[1]}, @{$_[3]} ] } | + kvpair; + +kvpair: + quotestring COLON quotestring { return [ $_[1], $_[3] ] }; + +quotestring: + QUOTE QUOTE { return "" } | + QUOTE ID QUOTE { return $_[2] }; +%% -- 1.8.3.1