- New YouTube scheme, as some videos did not conform to
authorRalf Ertzinger <sun@ryoko-darknet.camperquake.de>
Sat, 14 Nov 2009 14:33:39 +0000 (15:33 +0100)
committerRalf Ertzinger <sun@ryoko-darknet.camperquake.de>
Sat, 14 Nov 2009 14:33:39 +0000 (15:33 +0100)
  the old scheme. This is getting pretty embarrassing, as I'm
  now down to having written a parser for textual representations
  of javascript hashes to get the information I need. This
  would be much easier if YouTube would use XML files just like
  everybody else. Thank you.

videosite/JSArrayParser.pm [new file with mode: 0644]
videosite/YouTubeGrabber.pm
videosite/jsarray.pm [new file with mode: 0644]
videosite/jsarray.yp [new file with mode: 0644]

diff --git a/videosite/JSArrayParser.pm b/videosite/JSArrayParser.pm
new file mode 100644 (file)
index 0000000..20e6e0e
--- /dev/null
@@ -0,0 +1,55 @@
+#
+# A helper class for parsing textual JS hashes into perl 
+# hashes
+#
+# The parser is in jsarray.yp; to regenerate it you'll need the Parse::Yapp
+# package. Use 'yapp -m videosite::jsarray -s jsarray.yp' to regenerate.
+#
+
+package videosite::JSArrayParser;
+
+use Parse::Lex;
+use videosite::jsarray;
+use strict;
+
+my @tokens = (
+    COLON  => '[:]',
+    RIGHTC => '[\}]',
+    LEFTC => '[\{]',
+    QUOTE => '[\"]',
+    COMMA => '[,]',
+    ID =>    '[\w_%\.\+-]+'
+);
+
+# Constructor: pairs a generated videosite::jsarray parser with a
+# Parse::Lex lexer built from the @tokens table above.
+sub new {
+    my ($class) = @_;
+    my %state = (
+        '_PARSER' => videosite::jsarray->new(),
+        '_LEXER' => Parse::Lex->new(@tokens));
+    return bless(\%state, $class);
+}
+
+# Parse the textual JS hash in $text. Returns a hash reference built from
+# the parser's key/value list, or undef when the parser yields no result.
+sub parse {
+    my ($self, $text) = @_;
+    my $lexer = $self->{'_LEXER'};
+
+    # Token source for the Yapp driver; ('', undef) marks end of input.
+    my $next_token = sub {
+        my $token = $lexer->next();
+        return ('', undef) if (not $token) or $lexer->eoi();
+        return ($token->name(), $token->text());
+    };
+    # On a syntax error, ask the driver to abort the parse.
+    my $on_error = sub { $_[0]->YYAbort(); };
+
+    $lexer->from($text);
+    my @parsed = $self->{'_PARSER'}->YYParse(
+        yylex => $next_token, yyerror => $on_error, yydebug => 0x0);
+    return undef unless $parsed[0];
+    return { @{$parsed[0]} };
+}
+
+1;
index 33ca467..2ed52a7 100644 (file)
@@ -15,6 +15,7 @@ use LWP::UserAgent;
 use HTTP::Cookies;
 use HTML::TokeParser;
 use Data::Dumper;
+use videosite::JSArrayParser;
 
 use strict;
 
@@ -60,6 +61,7 @@ sub _parse {
         'high' => [34, 35, 18, 22, 6, 5],
         'normal' => [6, 5, 34, 35, 18, 22]);
     my $preflist;
+    my $jsp;
 
     $url =~ m|$pattern|;
     $url = $1;
@@ -94,7 +96,7 @@ sub _parse {
 
     $p = HTML::TokeParser->new(\$content);
 
-    while ($tag = $p->get_tag('div', 'meta', 'script')) {
+    SWF_ARGS: while ($tag = $p->get_tag('div', 'meta', 'script')) {
         if ('meta' eq $tag->[0]) {
             if ('title' eq $tag->[1]->{'name'}) {
                 $metadata->{'TITLE'} = $tag->[1]->{'content'};
@@ -103,13 +105,52 @@ sub _parse {
         } elsif ('script' eq $tag->[0]) {
             $e = $p->get_text();
             $self->debug("Found script: %s", $e);
-            if ($e =~ m|\x22fmt_url_map\x22\s*:\s*\x22([^\x22]+)\x22|) {
-                $self->debug("Found fmt_url_map: %s", $1);
-                my $urls = $1;
+            if ($e =~ m|\x27SWF_ARGS\x27:\s+(.+),|) {
                 my %urls;
+                my $args = $1;
 
-                $urls =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge;
-                %urls = split(/[\|,]/, $urls);
+                $self->debug("Found SWF_ARGS: %s", $args);
+                $jsp = videosite::JSArrayParser->new();
+                $r = $jsp->parse($args);
+
+                unless(defined($r)) {
+                    $self->error("Found information hash, but could not parse");
+                    return undef;
+                }
+
+                if (exists($r->{'fmt_url_map'}) and ($r->{'fmt_url_map'} ne '')) {
+                    my $urls =  $r->{'fmt_url_map'};
+
+                    $self->debug("Video has fmt_url_map: %s", $urls);
+
+                    $urls =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge;
+                    %urls = split(/[\|,]/, $urls);
+                } elsif (exists($r->{'t'}) and ($r->{'t'} ne '')) {
+                    my $thash = $r->{'t'};
+
+                    if (exists($r->{'fmt_map'}) && ($r->{'fmt_map'} ne '')) {
+                        my $fmt = $r->{'fmt_map'};
+                        my @fmt;
+
+                        $self->debug('Video has fmt_map');
+                        $fmt =~ s/%([[:xdigit:]]{2})/chr(hex($1))/ge;
+                        @fmt = split(/,/, $fmt);
+                        foreach my $entry (@fmt) {
+                            # Format id is the first '/'-separated field. Split into a
+                            # lexical: the implicit split into @_ is gone in perl >= 5.12.
+                            my ($fmt_id) = split(/\//, $entry);
+                            $urls{$fmt_id} = sprintf('http://www.youtube.com/get_video?video_id=%s&fmt=%d&t=%s',
+                                $metadata->{'ID'}, $fmt_id, $thash);
+                        }
+                    } else {
+                        $urls{5} = sprintf('http://www.youtube.com/get_video?video_id=%s&t=%s',
+                            $metadata->{'ID'},
+                            $thash);
+                    }
+                } else {
+                    $self->error('Neither fmt_url_map nor t found in video information hash');
+                    return undef;
+                }
                 $self->debug("Found quality levels [%s]", join(", ", keys(%urls)));
 
                 foreach (keys(%urls)) {
@@ -137,6 +178,7 @@ sub _parse {
                 }
 
                 $self->debug('URL found: %s', $metadata->{'DLURL'});
+                last SWF_ARGS;
             }
         } elsif ('div' eq $tag->[0]) {
             if (exists($tag->[1]->{'class'}) and ('errorBox' eq $tag->[1]->{'class'})) {
diff --git a/videosite/jsarray.pm b/videosite/jsarray.pm
new file mode 100644 (file)
index 0000000..6f9075a
--- /dev/null
@@ -0,0 +1,632 @@
+####################################################################
+#
+#    This file was generated using Parse::Yapp version 1.05.
+#
+#        Don't edit this file, use source file instead.
+#
+#             ANY CHANGE MADE HERE WILL BE LOST !
+#
+####################################################################
+package videosite::jsarray;
+use vars qw ( @ISA );
+use strict;
+
+@ISA= qw ( Parse::Yapp::Driver );
+#Included Parse/Yapp/Driver.pm file----------------------------------------
+{
+#
+# Module Parse::Yapp::Driver
+#
+# This module is part of the Parse::Yapp package available on your
+# nearest CPAN
+#
+# Any use of this module in a standalone parser make the included
+# text under the same copyright as the Parse::Yapp module itself.
+#
+# This notice should remain unchanged.
+#
+# (c) Copyright 1998-2001 Francois Desarmenien, all rights reserved.
+# (see the pod text in Parse::Yapp module for use and distribution rights)
+#
+
+package Parse::Yapp::Driver;
+
+require 5.004;
+
+use strict;
+
+use vars qw ( $VERSION $COMPATIBLE $FILENAME );
+
+$VERSION = '1.05';
+$COMPATIBLE = '0.07';
+$FILENAME=__FILE__;
+
+use Carp;
+
+#Known parameters, all starting with YY (leading YY will be discarded)
+my(%params)=(YYLEX => 'CODE', 'YYERROR' => 'CODE', YYVERSION => '',
+                        YYRULES => 'ARRAY', YYSTATES => 'ARRAY', YYDEBUG => '');
+#Mandatory parameters
+my(@params)=('LEX','RULES','STATES');
+
+sub new {
+    my($class)=shift;
+       my($errst,$nberr,$token,$value,$check,$dotpos);
+    my($self)={ ERROR => \&_Error,
+                               ERRST => \$errst,
+                NBERR => \$nberr,
+                               TOKEN => \$token,
+                               VALUE => \$value,
+                               DOTPOS => \$dotpos,
+                               STACK => [],
+                               DEBUG => 0,
+                               CHECK => \$check };
+
+       _CheckParams( [], \%params, \@_, $self );
+
+               exists($$self{VERSION})
+       and     $$self{VERSION} < $COMPATIBLE
+       and     croak "Yapp driver version $VERSION ".
+                         "incompatible with version $$self{VERSION}:\n".
+                         "Please recompile parser module.";
+
+        ref($class)
+    and $class=ref($class);
+
+    bless($self,$class);
+}
+
+sub YYParse {
+    my($self)=shift;
+    my($retval);
+
+       _CheckParams( \@params, \%params, \@_, $self );
+
+       if($$self{DEBUG}) {
+               _DBLoad();
+               $retval = eval '$self->_DBParse()';#Do not create stab entry on compile
+        $@ and die $@;
+       }
+       else {
+               $retval = $self->_Parse();
+       }
+    $retval
+}
+
+sub YYData {
+       my($self)=shift;
+
+               exists($$self{USER})
+       or      $$self{USER}={};
+
+       $$self{USER};
+       
+}
+
+sub YYErrok {
+       my($self)=shift;
+
+       ${$$self{ERRST}}=0;
+    undef;
+}
+
+sub YYNberr {
+       my($self)=shift;
+
+       ${$$self{NBERR}};
+}
+
+sub YYRecovering {
+       my($self)=shift;
+
+       ${$$self{ERRST}} != 0;
+}
+
+sub YYAbort {
+       my($self)=shift;
+
+       ${$$self{CHECK}}='ABORT';
+    undef;
+}
+
+sub YYAccept {
+       my($self)=shift;
+
+       ${$$self{CHECK}}='ACCEPT';
+    undef;
+}
+
+sub YYError {
+       my($self)=shift;
+
+       ${$$self{CHECK}}='ERROR';
+    undef;
+}
+
+sub YYSemval {
+       my($self)=shift;
+       my($index)= $_[0] - ${$$self{DOTPOS}} - 1;
+
+               $index < 0
+       and     -$index <= @{$$self{STACK}}
+       and     return $$self{STACK}[$index][1];
+
+       undef;  #Invalid index
+}
+
+sub YYCurtok {
+       my($self)=shift;
+
+        @_
+    and ${$$self{TOKEN}}=$_[0];
+    ${$$self{TOKEN}};
+}
+
+sub YYCurval {
+       my($self)=shift;
+
+        @_
+    and ${$$self{VALUE}}=$_[0];
+    ${$$self{VALUE}};
+}
+
+sub YYExpect {
+    my($self)=shift;
+
+    keys %{$self->{STATES}[$self->{STACK}[-1][0]]{ACTIONS}}
+}
+
+sub YYLexer {
+    my($self)=shift;
+
+       $$self{LEX};
+}
+
+
+#################
+# Private stuff #
+#################
+
+
+sub _CheckParams {
+       my($mandatory,$checklist,$inarray,$outhash)=@_;
+       my($prm,$value);
+       my($prmlst)={};
+
+       while(($prm,$value)=splice(@$inarray,0,2)) {
+        $prm=uc($prm);
+                       exists($$checklist{$prm})
+               or      croak("Unknow parameter '$prm'");
+                       ref($value) eq $$checklist{$prm}
+               or      croak("Invalid value for parameter '$prm'");
+        $prm=unpack('@2A*',$prm);
+               $$outhash{$prm}=$value;
+       }
+       for (@$mandatory) {
+                       exists($$outhash{$_})
+               or      croak("Missing mandatory parameter '".lc($_)."'");
+       }
+}
+
+sub _Error {
+       print "Parse error.\n";
+}
+
+sub _DBLoad {
+       {
+               no strict 'refs';
+
+                       exists(${__PACKAGE__.'::'}{_DBParse})#Already loaded ?
+               and     return;
+       }
+       my($fname)=__FILE__;
+       my(@drv);
+       open(DRV,"<$fname") or die "Report this as a BUG: Cannot open $fname";
+       while(<DRV>) {
+                       /^\s*sub\s+_Parse\s*{\s*$/ .. /^\s*}\s*#\s*_Parse\s*$/
+               and     do {
+                       s/^#DBG>//;
+                       push(@drv,$_);
+               }
+       }
+       close(DRV);
+
+       $drv[0]=~s/_P/_DBP/;
+       eval join('',@drv);
+}
+
+#Note that for loading debugging version of the driver,
+#this file will be parsed from 'sub _Parse' up to '}#_Parse' inclusive.
+#So, DO NOT remove comment at end of sub !!!
+sub _Parse {
+    my($self)=shift;
+
+       my($rules,$states,$lex,$error)
+     = @$self{ 'RULES', 'STATES', 'LEX', 'ERROR' };
+       my($errstatus,$nberror,$token,$value,$stack,$check,$dotpos)
+     = @$self{ 'ERRST', 'NBERR', 'TOKEN', 'VALUE', 'STACK', 'CHECK', 'DOTPOS' };
+
+#DBG>  my($debug)=$$self{DEBUG};
+#DBG>  my($dbgerror)=0;
+
+#DBG>  my($ShowCurToken) = sub {
+#DBG>          my($tok)='>';
+#DBG>          for (split('',$$token)) {
+#DBG>                  $tok.=          (ord($_) < 32 or ord($_) > 126)
+#DBG>                                  ?       sprintf('<%02X>',ord($_))
+#DBG>                                  :       $_;
+#DBG>          }
+#DBG>          $tok.='<';
+#DBG>  };
+
+       $$errstatus=0;
+       $$nberror=0;
+       ($$token,$$value)=(undef,undef);
+       @$stack=( [ 0, undef ] );
+       $$check='';
+
+    while(1) {
+        my($actions,$act,$stateno);
+
+        $stateno=$$stack[-1][0];
+        $actions=$$states[$stateno];
+
+#DBG>  print STDERR ('-' x 40),"\n";
+#DBG>          $debug & 0x2
+#DBG>  and     print STDERR "In state $stateno:\n";
+#DBG>          $debug & 0x08
+#DBG>  and     print STDERR "Stack:[".
+#DBG>                                   join(',',map { $$_[0] } @$stack).
+#DBG>                                   "]\n";
+
+
+        if  (exists($$actions{ACTIONS})) {
+
+                               defined($$token)
+            or do {
+                               ($$token,$$value)=&$lex($self);
+#DBG>                          $debug & 0x01
+#DBG>                  and     print STDERR "Need token. Got ".&$ShowCurToken."\n";
+                       };
+
+            $act=   exists($$actions{ACTIONS}{$$token})
+                    ?   $$actions{ACTIONS}{$$token}
+                    :   exists($$actions{DEFAULT})
+                        ?   $$actions{DEFAULT}
+                        :   undef;
+        }
+        else {
+            $act=$$actions{DEFAULT};
+#DBG>                  $debug & 0x01
+#DBG>          and     print STDERR "Don't need token.\n";
+        }
+
+            defined($act)
+        and do {
+
+                $act > 0
+            and do {        #shift
+
+#DBG>                          $debug & 0x04
+#DBG>                  and     print STDERR "Shift and go to state $act.\n";
+
+                                       $$errstatus
+                               and     do {
+                                       --$$errstatus;
+
+#DBG>                                  $debug & 0x10
+#DBG>                          and     $dbgerror
+#DBG>                          and     $$errstatus == 0
+#DBG>                          and     do {
+#DBG>                                  print STDERR "**End of Error recovery.\n";
+#DBG>                                  $dbgerror=0;
+#DBG>                          };
+                               };
+
+
+                push(@$stack,[ $act, $$value ]);
+
+                                       $$token ne ''   #Don't eat the eof
+                               and     $$token=$$value=undef;
+                next;
+            };
+
+            #reduce
+            my($lhs,$len,$code,@sempar,$semval);
+            ($lhs,$len,$code)=@{$$rules[-$act]};
+
+#DBG>                  $debug & 0x04
+#DBG>          and     $act
+#DBG>          and     print STDERR "Reduce using rule ".-$act." ($lhs,$len): ";
+
+                $act
+            or  $self->YYAccept();
+
+            $$dotpos=$len;
+
+                unpack('A1',$lhs) eq '@'    #In line rule
+            and do {
+                    $lhs =~ /^\@[0-9]+\-([0-9]+)$/
+                or  die "In line rule name '$lhs' ill formed: ".
+                        "report it as a BUG.\n";
+                $$dotpos = $1;
+            };
+
+            @sempar =       $$dotpos
+                        ?   map { $$_[1] } @$stack[ -$$dotpos .. -1 ]
+                        :   ();
+
+            $semval = $code ? &$code( $self, @sempar )
+                            : @sempar ? $sempar[0] : undef;
+
+            splice(@$stack,-$len,$len);
+
+                $$check eq 'ACCEPT'
+            and do {
+
+#DBG>                  $debug & 0x04
+#DBG>          and     print STDERR "Accept.\n";
+
+                               return($semval);
+                       };
+
+                $$check eq 'ABORT'
+            and        do {
+
+#DBG>                  $debug & 0x04
+#DBG>          and     print STDERR "Abort.\n";
+
+                               return(undef);
+
+                       };
+
+#DBG>                  $debug & 0x04
+#DBG>          and     print STDERR "Back to state $$stack[-1][0], then ";
+
+                $$check eq 'ERROR'
+            or  do {
+#DBG>                          $debug & 0x04
+#DBG>                  and     print STDERR 
+#DBG>                              "go to state $$states[$$stack[-1][0]]{GOTOS}{$lhs}.\n";
+
+#DBG>                          $debug & 0x10
+#DBG>                  and     $dbgerror
+#DBG>                  and     $$errstatus == 0
+#DBG>                  and     do {
+#DBG>                          print STDERR "**End of Error recovery.\n";
+#DBG>                          $dbgerror=0;
+#DBG>                  };
+
+                           push(@$stack,
+                     [ $$states[$$stack[-1][0]]{GOTOS}{$lhs}, $semval ]);
+                $$check='';
+                next;
+            };
+
+#DBG>                  $debug & 0x04
+#DBG>          and     print STDERR "Forced Error recovery.\n";
+
+            $$check='';
+
+        };
+
+        #Error
+            $$errstatus
+        or   do {
+
+            $$errstatus = 1;
+            &$error($self);
+                $$errstatus # if 0, then YYErrok has been called
+            or  next;       # so continue parsing
+
+#DBG>                  $debug & 0x10
+#DBG>          and     do {
+#DBG>                  print STDERR "**Entering Error recovery.\n";
+#DBG>                  ++$dbgerror;
+#DBG>          };
+
+            ++$$nberror;
+
+        };
+
+                       $$errstatus == 3        #The next token is not valid: discard it
+               and     do {
+                               $$token eq ''   # End of input: no hope
+                       and     do {
+#DBG>                          $debug & 0x10
+#DBG>                  and     print STDERR "**At eof: aborting.\n";
+                               return(undef);
+                       };
+
+#DBG>                  $debug & 0x10
+#DBG>          and     print STDERR "**Dicard invalid token ".&$ShowCurToken.".\n";
+
+                       $$token=$$value=undef;
+               };
+
+        $$errstatus=3;
+
+               while(    @$stack
+                         and (         not exists($$states[$$stack[-1][0]]{ACTIONS})
+                               or  not exists($$states[$$stack[-1][0]]{ACTIONS}{error})
+                                       or      $$states[$$stack[-1][0]]{ACTIONS}{error} <= 0)) {
+
+#DBG>                  $debug & 0x10
+#DBG>          and     print STDERR "**Pop state $$stack[-1][0].\n";
+
+                       pop(@$stack);
+               }
+
+                       @$stack
+               or      do {
+
+#DBG>                  $debug & 0x10
+#DBG>          and     print STDERR "**No state left on stack: aborting.\n";
+
+                       return(undef);
+               };
+
+               #shift the error token
+
+#DBG>                  $debug & 0x10
+#DBG>          and     print STDERR "**Shift \$error token and go to state ".
+#DBG>                                           $$states[$$stack[-1][0]]{ACTIONS}{error}.
+#DBG>                                           ".\n";
+
+               push(@$stack, [ $$states[$$stack[-1][0]]{ACTIONS}{error}, undef ]);
+
+    }
+
+    #never reached
+       croak("Error in driver logic. Please, report it as a BUG");
+
+}#_Parse
+#DO NOT remove comment
+
+1;
+
+}
+#End of include--------------------------------------------------
+
+
+
+
+sub new {
+        my($class)=shift;
+        ref($class)
+    and $class=ref($class);
+
+    my($self)=$class->SUPER::new( yyversion => '1.05',
+                                  yystates =>
+[
+       {#State 0
+               ACTIONS => {
+                       'LEFTC' => 2
+               },
+               GOTOS => {
+                       'array' => 1
+               }
+       },
+       {#State 1
+               ACTIONS => {
+                       '' => 3
+               }
+       },
+       {#State 2
+               ACTIONS => {
+                       'QUOTE' => 7
+               },
+               GOTOS => {
+                       'quotestring' => 4,
+                       'kvpair' => 5,
+                       'kvlist' => 6
+               }
+       },
+       {#State 3
+               DEFAULT => 0
+       },
+       {#State 4
+               ACTIONS => {
+                       'COLON' => 8
+               }
+       },
+       {#State 5
+               DEFAULT => -3
+       },
+       {#State 6
+               ACTIONS => {
+                       'COMMA' => 9,
+                       'RIGHTC' => 10
+               }
+       },
+       {#State 7
+               ACTIONS => {
+                       'ID' => 11,
+                       'QUOTE' => 12
+               }
+       },
+       {#State 8
+               ACTIONS => {
+                       'QUOTE' => 7
+               },
+               GOTOS => {
+                       'quotestring' => 13
+               }
+       },
+       {#State 9
+               ACTIONS => {
+                       'QUOTE' => 7
+               },
+               GOTOS => {
+                       'quotestring' => 4,
+                       'kvpair' => 14
+               }
+       },
+       {#State 10
+               DEFAULT => -1
+       },
+       {#State 11
+               ACTIONS => {
+                       'QUOTE' => 15
+               }
+       },
+       {#State 12
+               DEFAULT => -5
+       },
+       {#State 13
+               DEFAULT => -4
+       },
+       {#State 14
+               DEFAULT => -2
+       },
+       {#State 15
+               DEFAULT => -6
+       }
+],
+                                  yyrules  =>
+[
+       [#Rule 0
+                '$start', 2, undef
+       ],
+       [#Rule 1
+                'array', 3,
+sub
+#line 3 "jsarray.yp"
+{ return $_[2] }
+       ],
+       [#Rule 2
+                'kvlist', 3,
+sub
+#line 6 "jsarray.yp"
+{ return [ @{$_[1]}, @{$_[3]} ] }
+       ],
+       [#Rule 3
+                'kvlist', 1, undef
+       ],
+       [#Rule 4
+                'kvpair', 3,
+sub
+#line 10 "jsarray.yp"
+{ return [ $_[1], $_[3] ] }
+       ],
+       [#Rule 5
+                'quotestring', 2,
+sub
+#line 13 "jsarray.yp"
+{ return "" }
+       ],
+       [#Rule 6
+                'quotestring', 3,
+sub
+#line 14 "jsarray.yp"
+{ return $_[2] }
+       ]
+],
+                                  @_);
+    bless($self,$class);
+}
+
+#line 15 "jsarray.yp"
+
+
+1;
diff --git a/videosite/jsarray.yp b/videosite/jsarray.yp
new file mode 100644 (file)
index 0000000..3b32aca
--- /dev/null
@@ -0,0 +1,15 @@
+%%
+array:
+    LEFTC kvlist RIGHTC { return $_[2] };
+
+kvlist:
+    kvlist COMMA kvpair { return [ @{$_[1]}, @{$_[3]} ] } |
+    kvpair;
+
+kvpair:
+    quotestring COLON quotestring { return [ $_[1], $_[3] ] };
+
+quotestring:
+    QUOTE QUOTE { return "" } |
+    QUOTE ID QUOTE { return $_[2] };
+%%