1
0
Fork 0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-11-25 19:52:11 +00:00

[blip.tv] Add support for subtitles (#2274)

This commit is contained in:
Philipp Hagemeister 2014-02-03 05:18:30 +01:00
parent 009a3408f5
commit b4bcffefa3
3 changed files with 114 additions and 75 deletions

View file

@ -10,6 +10,7 @@ from test.helper import FakeYDL, md5
from youtube_dl.extractor import ( from youtube_dl.extractor import (
BlipTVIE,
YoutubeIE, YoutubeIE,
DailymotionIE, DailymotionIE,
TEDIE, TEDIE,
@ -202,5 +203,25 @@ class TestTedSubtitles(BaseTestSubtitles):
for lang in langs: for lang in langs:
self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
class TestBlipTVSubtitles(BaseTestSubtitles):
url = 'http://blip.tv/a/a-6603250'
IE = BlipTVIE
def test_list_subtitles(self):
self.DL.expect_warning(u'Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self):
self.DL.expect_warning(u'Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['en']))
self.assertEqual(md5(subtitles['en']), '5b75c300af65fe4476dff79478bb93e4')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -6,6 +6,7 @@ import re
import socket import socket
from .common import InfoExtractor from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import ( from ..utils import (
compat_http_client, compat_http_client,
compat_str, compat_str,
@ -17,112 +18,124 @@ from ..utils import (
) )
class BlipTVIE(InfoExtractor): class BlipTVIE(SubtitlesInfoExtractor):
"""Information extractor for blip.tv""" """Information extractor for blip.tv"""
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(?P<presumptive_id>.+)$'
_TEST = { _TESTS = [{
'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
'file': '5779306.mov',
'md5': 'c6934ad0b6acf2bd920720ec888eb812', 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
'info_dict': { 'info_dict': {
'id': '5779306',
'ext': 'mov',
'upload_date': '20111205', 'upload_date': '20111205',
'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
'uploader': 'Comic Book Resources - CBR TV', 'uploader': 'Comic Book Resources - CBR TV',
'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
} }
} }, {
# https://github.com/rg3/youtube-dl/pull/2274
def report_direct_download(self, title): 'note': 'Video with subtitles',
"""Report information extraction.""" 'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
self.to_screen('%s: Direct download detected' % title) 'md5': '309f9d25b820b086ca163ffac8031806',
'info_dict': {
'id': '6586561',
'ext': 'mp4',
'uploader': 'Red vs. Blue',
'description': 'One-Zero-One',
'upload_date': '20130614',
'title': 'Red vs. Blue Season 11 Episode 1',
}
}]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: presumptive_id = mobj.group('presumptive_id')
raise ExtractorError('Invalid URL: %s' % url)
# See https://github.com/rg3/youtube-dl/issues/857 # See https://github.com/rg3/youtube-dl/issues/857
embed_mobj = re.search(r'^(?:https?://)?(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url) embed_mobj = re.match(r'https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url)
if embed_mobj: if embed_mobj:
info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1) info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1)
info_page = self._download_webpage(info_url, embed_mobj.group(1)) info_page = self._download_webpage(info_url, embed_mobj.group(1))
video_id = self._search_regex(r'data-episode-id="(\d+)', info_page, 'video_id') video_id = self._search_regex(
r'data-episode-id="([0-9]+)', info_page, 'video_id')
return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV') return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')
if '?' in url: cchar = '&' if '?' in url else '?'
cchar = '&'
else:
cchar = '?'
json_url = url + cchar + 'skin=json&version=2&no_wrap=1' json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
request = compat_urllib_request.Request(json_url) request = compat_urllib_request.Request(json_url)
request.add_header('User-Agent', 'iTunes/10.6.1') request.add_header('User-Agent', 'iTunes/10.6.1')
self.report_extraction(mobj.group(1)) json_data = self._download_json(request, video_id=presumptive_id)
urlh = self._request_webpage(request, None, False,
'unable to download video info webpage')
try: if 'Post' in json_data:
json_code_bytes = urlh.read() data = json_data['Post']
json_code = json_code_bytes.decode('utf-8') else:
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: data = json_data
raise ExtractorError('Unable to read video info webpage: %s' % compat_str(err))
try: video_id = compat_str(data['item_id'])
json_data = json.loads(json_code) upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
if 'Post' in json_data: subtitles = {}
data = json_data['Post'] formats = []
else: if 'additionalMedia' in data:
data = json_data for f in data['additionalMedia']:
if f.get('file_type_srt') == 1:
upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') LANGS = {
formats = [] 'english': 'en',
if 'additionalMedia' in data: }
for f in sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])): lang = f['role'].rpartition('-')[-1].strip().lower()
if not int(f['media_width']): # filter m3u8 langcode = LANGS.get(lang, lang)
continue subtitles[langcode] = f['url']
formats.append({ continue
'url': f['url'], if not int(f['media_width']): # filter m3u8
'format_id': f['role'], continue
'width': int(f['media_width']),
'height': int(f['media_height']),
})
else:
formats.append({ formats.append({
'url': data['media']['url'], 'url': f['url'],
'width': int(data['media']['width']), 'format_id': f['role'],
'height': int(data['media']['height']), 'width': int(f['media_width']),
'height': int(f['media_height']),
}) })
else:
formats.append({
'url': data['media']['url'],
'width': int(data['media']['width']),
'height': int(data['media']['height']),
})
self._sort_formats(formats)
self._sort_formats(formats) # subtitles
video_subtitles = self.extract_subtitles(video_id, subtitles)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': compat_str(data['item_id']), 'id': video_id,
'uploader': data['display_name'], 'uploader': data['display_name'],
'upload_date': upload_date, 'upload_date': upload_date,
'title': data['title'], 'title': data['title'],
'thumbnail': data['thumbnailUrl'], 'thumbnail': data['thumbnailUrl'],
'description': data['description'], 'description': data['description'],
'user_agent': 'iTunes/10.6.1', 'user_agent': 'iTunes/10.6.1',
'formats': formats, 'formats': formats,
} 'subtitles': video_subtitles,
except (ValueError, KeyError) as err: }
raise ExtractorError('Unable to parse video information: %s' % repr(err))
def _download_subtitle_url(self, sub_lang, url):
# For some weird reason, blip.tv serves a video instead of subtitles
# when we request with a common UA
req = compat_urllib_request.Request(url)
req.add_header('Youtubedl-user-agent', 'youtube-dl')
return self._download_webpage(req, None, note=False)
class BlipTVUserIE(InfoExtractor): class BlipTVUserIE(InfoExtractor):
"""Information Extractor for blip.tv users."""
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
_PAGE_SIZE = 12 _PAGE_SIZE = 12
IE_NAME = 'blip.tv:user' IE_NAME = 'blip.tv:user'
def _real_extract(self, url): def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
username = mobj.group(1) username = mobj.group(1)
page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
@ -131,7 +144,6 @@ class BlipTVUserIE(InfoExtractor):
mobj = re.search(r'data-users-id="([^"]+)"', page) mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1) page_base = page_base % mobj.group(1)
# Download video ids using BlipTV Ajax calls. Result size per # Download video ids using BlipTV Ajax calls. Result size per
# query is limited (currently to 12 videos) so we need to query # query is limited (currently to 12 videos) so we need to query
# page by page until there are no video ids - it means we got # page by page until there are no video ids - it means we got
@ -142,8 +154,8 @@ class BlipTVUserIE(InfoExtractor):
while True: while True:
url = page_base + "&page=" + str(pagenum) url = page_base + "&page=" + str(pagenum)
page = self._download_webpage(url, username, page = self._download_webpage(
'Downloading video ids from page %d' % pagenum) url, username, 'Downloading video ids from page %d' % pagenum)
# Extract video identifiers # Extract video identifiers
ids_in_page = [] ids_in_page = []
@ -167,4 +179,4 @@ class BlipTVUserIE(InfoExtractor):
urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
return [self.playlist_result(url_entries, playlist_title = username)] return [self.playlist_result(url_entries, playlist_title=username)]

View file

@ -62,24 +62,30 @@ class SubtitlesInfoExtractor(InfoExtractor):
subtitles[sub_lang] = subtitle subtitles[sub_lang] = subtitle
return subtitles return subtitles
def _download_subtitle_url(self, sub_lang, url):
return self._download_webpage(url, None, note=False)
def _request_subtitle_url(self, sub_lang, url): def _request_subtitle_url(self, sub_lang, url):
""" makes the http request for the subtitle """ """ makes the http request for the subtitle """
try: try:
sub = self._download_webpage(url, None, note=False) return self._download_subtitle_url(sub_lang, url)
except ExtractorError as err: except ExtractorError as err:
self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
return return
if not sub: if not sub:
self._downloader.report_warning(u'Did not fetch video subtitles') self._downloader.report_warning(u'Did not fetch video subtitles')
return return
return sub
def _get_available_subtitles(self, video_id, webpage): def _get_available_subtitles(self, video_id, webpage):
""" """
returns {sub_lang: url} or {} if not available returns {sub_lang: url} or {} if not available
Must be redefined by the subclasses Must be redefined by the subclasses
""" """
pass
# By default, allow implementations to simply pass in the result
assert isinstance(webpage, dict), \
'_get_available_subtitles not implemented'
return webpage
def _get_available_automatic_caption(self, video_id, webpage): def _get_available_automatic_caption(self, video_id, webpage):
""" """