From 759e8ce15b5e5a7d7a070d8b82344bab04f4986a Mon Sep 17 00:00:00 2001 From: wtpckl Date: Thu, 28 Jan 2021 12:06:52 +0100 Subject: [PATCH 1/5] [RoosterTeeth] Add subtitle support to extractor --- youtube_dl/extractor/roosterteeth.py | 69 ++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8883639b2..fead45a17 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -1,16 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( ExtractorError, int_or_none, str_or_none, urlencode_postdata, + parse_m3u8_attributes, ) @@ -86,9 +90,11 @@ class RoosterTeethIE(InfoExtractor): api_episode_url = self._EPISODE_BASE_URL + display_id try: - m3u8_url = self._download_json( - api_episode_url + '/videos', display_id, - 'Downloading video JSON metadata')['data'][0]['attributes']['url'] + video_json = self._download_json( + api_episode_url + '/videos', display_id)['data'][0] + m3u8_url = \ + video_json['attributes'].get('url') or \ + video_json['links'].get('master') except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: @@ -100,6 +106,9 @@ class RoosterTeethIE(InfoExtractor): m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) + subtitles = self._extract_m3u8_subtitles( + m3u8_url, display_id) + episode = self._download_json( api_episode_url, display_id, 'Downloading episode JSON metadata')['data'][0] @@ -133,5 +142,59 @@ class RoosterTeethIE(InfoExtractor): 'episode_id': str_or_none(episode.get('uuid')), 'formats': formats, 'channel_id': attributes.get('channel_id'), + 'subtitles': subtitles, 'duration': int_or_none(attributes.get('length')), } + + def _extract_m3u8_subtitles(self, m3u8_url, video_id): + res = self._download_webpage_handle( + m3u8_url, video_id, + note='Downloading subtitle information', + errnote='Failed to download subtitle information', + fatal=False, data=None, headers={}, query={}) + if res is False: + return None + + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() + + def format_url(url, base_url): + if re.match(r'^https?://', url): + return url + else: + return compat_urlparse.urljoin(base_url, url) + + subtitles = {} + + for line in m3u8_doc.splitlines(): + if not line.startswith("#EXT-X-MEDIA:"): + continue + media = parse_m3u8_attributes(line) + + media_type, media_url_raw, media_lang = ( + media.get('TYPE'), + media.get('URI'), + media.get('LANGUAGE'), + ) + if not (media_type in ('SUBTITLES',) and media_url_raw and media_lang): + continue + + media_url = format_url(media_url_raw, base_url=m3u8_url) + + res = self._download_webpage_handle( + media_url, video_id, + note='Downloading subtitle information ({})'.format(media_lang), + errnote='Failed to download subtitle information ({})'.format(media_lang), + fatal=False, data=None, headers={}, query={}) + if res is False: + continue + + m3u8_subtitle_doc, _ = res + for subtitle_line in m3u8_subtitle_doc.splitlines(): + if subtitle_line.startswith("#"): + continue + media_url = format_url(subtitle_line, base_url=media_url) + break + + subtitles[media_lang] = [{'url': media_url, }, ] + return subtitles if len(subtitles) > 0 else None From b4bc4527e48f8ca70078115e702a7b37f12f0924 Mon Sep 17 00:00:00 2001 From: wtpckl Date: Thu, 28 Jan 2021 17:09:54 +0100 Subject: [PATCH 2/5] [RoosterTeeth] Clean up code after adding subtitle support --- youtube_dl/extractor/roosterteeth.py | 40 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index fead45a17..3fc279d0f 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, - compat_urlparse, ) from ..utils import ( ExtractorError, @@ -15,6 +14,9 @@ from ..utils import ( str_or_none, urlencode_postdata, parse_m3u8_attributes, + try_get, + url_or_none, + urljoin, ) @@ -92,9 +94,11 @@ class RoosterTeethIE(InfoExtractor): try: video_json = self._download_json( api_episode_url + '/videos', display_id)['data'][0] - m3u8_url = \ - video_json['attributes'].get('url') or \ - video_json['links'].get('master') + m3u8_url = url_or_none(try_get( + video_json, [ + lambda j: j['attributes']['url'], + lambda j: j['links']['master']], + compat_str)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: @@ -102,12 +106,14 @@ class RoosterTeethIE(InfoExtractor): '%s is only available for FIRST members' % display_id) raise + if m3u8_url is None: + raise ExtractorError("Unable to find formats") + formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - subtitles = self._extract_m3u8_subtitles( - m3u8_url, display_id) + subtitles = self._extract_m3u8_subtitles(m3u8_url, display_id) episode = self._download_json( api_episode_url, display_id, @@ -158,28 +164,20 @@ class RoosterTeethIE(InfoExtractor): m3u8_doc, urlh = res m3u8_url = urlh.geturl() - def format_url(url, base_url): - if re.match(r'^https?://', url): - return url - else: - return compat_urlparse.urljoin(base_url, url) - subtitles = {} - for line in m3u8_doc.splitlines(): if not line.startswith("#EXT-X-MEDIA:"): continue media = parse_m3u8_attributes(line) media_type, media_url_raw, media_lang = ( - media.get('TYPE'), - media.get('URI'), - media.get('LANGUAGE'), - ) + media.get('TYPE'), media.get('URI'), media.get('LANGUAGE'),) if not (media_type in ('SUBTITLES',) and media_url_raw and media_lang): continue - media_url = format_url(media_url_raw, base_url=m3u8_url) + media_url = urljoin(m3u8_url, media_url_raw) + if not media_url: + continue res = self._download_webpage_handle( media_url, video_id, @@ -190,11 +188,13 @@ class RoosterTeethIE(InfoExtractor): continue m3u8_subtitle_doc, _ = res + subtitle_url = None for subtitle_line in m3u8_subtitle_doc.splitlines(): if subtitle_line.startswith("#"): continue - media_url = format_url(subtitle_line, base_url=media_url) + subtitle_url = urljoin(media_url, subtitle_line) break - subtitles[media_lang] = [{'url': media_url, }, ] + if subtitle_url: + subtitles[compat_str(media_lang)] = [{'url': subtitle_url, }, ] return subtitles if len(subtitles) > 0 else None From 0373c0fa89847387470249fc87d88dc78460f13d Mon Sep 17 00:00:00 2001 From: wtpckl Date: Thu, 28 Jan 2021 17:22:53 +0100 Subject: [PATCH 3/5] [RoosterTeeth] Add unittest for subtitle extractor --- test/test_subtitles.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 550e0ca00..41ec0fcaf 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -27,6 +27,7 @@ from youtube_dl.extractor import ( ThePlatformFeedIE, RTVEALaCartaIE, DemocracynowIE, + RoosterTeethIE, ) @@ -349,5 +350,21 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') +class TestRoosterTeethSubtitles(BaseTestSubtitles): + url = 'https://www.roosterteeth.com/watch/rwby-season-1-episode-1' + IE = RoosterTeethIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['pt', 'de', 'fr', 'es', 'en'])) + self.assertEqual(md5(subtitles['pt']), '96490f6884378403b4304b4355ddd028') + self.assertEqual(md5(subtitles['de']), 'a30fbfbc2574530457d12fcaf68b515c') + self.assertEqual(md5(subtitles['fr']), '64ff89d6a4dd8aa079f680d1cb799fde') + self.assertEqual(md5(subtitles['es']), '565a9b49173539ce5a3de9756bd3e3a2') + self.assertEqual(md5(subtitles['en']), '404252b16a423c3b89d3c8774445df65') + + if __name__ == '__main__': unittest.main() From 1ed79ce6a1c75d48001382b5291bc37da95fd28b Mon Sep 17 00:00:00 2001 From: wtpckl Date: Thu, 28 Jan 2021 17:44:37 +0100 Subject: [PATCH 4/5] [RoosterTeeth] flake8 --- youtube_dl/extractor/roosterteeth.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 3fc279d0f..5041d592b 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_HTTPError, From a1ef0a66f3167aaefbabc3013bbe881911c22796 Mon Sep 17 00:00:00 2001 From: wtpckl Date: Sat, 6 Feb 2021 11:31:44 +0100 Subject: [PATCH 5/5] [RoosterTeeth] Handle missing media URL at higher level --- youtube_dl/extractor/roosterteeth.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 5041d592b..e6c20b25f 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -104,14 +104,15 @@ class RoosterTeethIE(InfoExtractor): '%s is only available for FIRST members' % display_id) raise - if m3u8_url is None: - raise ExtractorError("Unable to find formats") + if m3u8_url: + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) - formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) - - subtitles = self._extract_m3u8_subtitles(m3u8_url, display_id) + subtitles = self._extract_m3u8_subtitles(m3u8_url, display_id) + else: + formats = [] + subtitles = None episode = self._download_json( api_episode_url, display_id,