From c6b55a8d4817a0818a1923db72b0f953ab80c0d4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 7 Jul 2011 12:12:20 +0200 Subject: [PATCH] Full youtube video descriptions, including special characters (2.6+, with fallback for older Pythons) --- youtube-dl | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/youtube-dl b/youtube-dl index fbb0389be..a3522199f 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,7 +15,6 @@ import email.utils import gzip import htmlentitydefs import httplib -import json # TODO: json for 2.5 import locale import math import netrc @@ -24,20 +23,35 @@ import os.path import re import socket import string -import StringIO import subprocess import sys import time import urllib import urllib2 +import warnings import zlib +try: + import json +except ImportError: + warnings.warn('No JSON support (TODO: insert trivialjson here)') + +try: + import cStringIO as StringIO +except ImportError: + import StringIO + # parse_qs was moved from the cgi module to the urlparse module recently. try: from urlparse import parse_qs except ImportError: from cgi import parse_qs +try: + import lxml.etree +except ImportError: # Python < 2.6 + pass # Handled below + std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -1068,11 +1082,19 @@ class YoutubeIE(InfoExtractor): pass # description - video_description = 'No description available.' - if self._downloader.params.get('forcedescription', False): - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1) + try: + lxml.etree + except NameError: + video_description = u'No description available.' + if self._downloader.params.get('forcedescription', False): + warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.') + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') + else: + html_parser = lxml.etree.HTMLParser(encoding='utf-8') + vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) + video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1130,7 +1152,7 @@ class YoutubeIE(InfoExtractor): 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description.decode('utf-8'), + 'description': video_description, 'player_url': player_url, }) except UnavailableVideoError, err: