- Decode HTML entities in YouTube titles
author Ralf Ertzinger <sun@ryoko-darknet.camperquake.de>
Mon, 21 Dec 2009 19:42:38 +0000 (20:42 +0100)
committer Ralf Ertzinger <sun@ryoko-darknet.camperquake.de>
Mon, 21 Dec 2009 19:42:38 +0000 (20:42 +0100)
videosite/YouTubeGrabber.pm

index 83cee59..edb5c99 100644 (file)
@@ -14,6 +14,8 @@ use videosite::GrabberBase;
 use LWP::UserAgent;
 use HTTP::Cookies;
 use HTML::TokeParser;
+use HTML::Entities qw(decode_entities);
+use Encode;
 use Data::Dumper;
 use videosite::JSArrayParser;
 
@@ -100,6 +102,11 @@ sub _parse {
         if ('meta' eq $tag->[0]) {
             if ('title' eq $tag->[1]->{'name'}) {
                 $metadata->{'TITLE'} = $tag->[1]->{'content'};
+                # Decode HTML entities in the title: UTF-8 bytes -> characters, decode entities, then back to UTF-8 bytes.
+                $metadata->{'TITLE'} = encode("utf8",
+                                         decode_entities(
+                                           decode("utf8", $metadata->{'TITLE'})));
+                    
                 $self->debug('Title found: %s', $metadata->{'TITLE'});
             }
         } elsif ('script' eq $tag->[0]) {