From 484fe787379a20ec9ef19f69b9e07321359bbd1e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 23 Nov 2020 14:16:38 +0100 Subject: [PATCH 001/424] [pinterest] Add support for large collections(more than 25 pins) --- youtube_dl/extractor/pinterest.py | 87 ++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index 2bb4ca660..b249c9eda 100644 --- a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -16,12 +19,12 @@ from ..utils import ( class PinterestBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' - def _extract_resource(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r']+\bid=["\']initial-state["\'][^>]*>({.+?})', - webpage, 'application json'), - video_id)['resourceResponses'] + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] def _extract_video(self, data, extract_formats=True): video_id = data['id'] @@ -128,13 +131,16 @@ class PinterestIE(PinterestBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - data = self._extract_resource(webpage, video_id)[0]['response']['data'] + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] return self._extract_video(data) class PinterestCollectionIE(PinterestBaseIE): - _VALID_URL = r'%s/[^/]+/(?P[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/(?P[^/]+)/(?P[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', 'info_dict': { @@ -142,6 +148,14 @@ class PinterestCollectionIE(PinterestBaseIE): 'title': 'cool diys', }, 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', }] @classmethod @@ -150,27 +164,38 @@ class PinterestCollectionIE(PinterestBaseIE): PinterestCollectionIE, cls).suitable(url) def _real_extract(self, url): - collection_name = self._match_id(url) - webpage = self._download_webpage(url, collection_name) - resource = self._extract_resource(webpage, collection_name)[1] + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None entries = [] - for item in resource['response']['data']: - if not isinstance(item, dict) or item.get('type') != 'pin': - continue - video_id = item.get('id') - if video_id: - # Some pins may not be available anonymously via pin URL - # video = self._extract_video(item, extract_formats=False) - # video.update({ - # '_type': 'url_transparent', - # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, - # }) - # entries.append(video) - entries.append(self._extract_video(item)) - title = try_get( - resource, lambda x: x['options']['board_title'], compat_str) - collection_id = try_get( - resource, lambda x: x['options']['board_id'], - compat_str) or collection_name + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break return self.playlist_result( - entries, playlist_id=collection_id, playlist_title=title) + entries, playlist_id=board_id, playlist_title=board.get('name')) From 316b10855af61798760b30531a5dc34da8d9a69b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Nov 2020 22:19:25 +0700 Subject: [PATCH 002/424] [youtube:tab] Fix some weird typo (closes #27157) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2aad855c8..8f814a81c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3013,7 +3013,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s/l*["\'](.+?)["\']', webpage, + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( From 9d2c90354f9743214a0d02de6981fa748af11b37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Nov 2020 22:33:21 +0700 Subject: [PATCH 003/424] [youtube:favorites] Restore extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/youtube.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7ba4087fd..71178da7e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1491,6 +1491,7 @@ from .yourporn import YourPornIE from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, + YoutubeFavouritesIE, YoutubeHistoryIE, YoutubeTabIE, YoutubePlaylistIE, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8f814a81c..29c1d0413 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3149,6 +3149,25 @@ class YoutubeYtUserIE(InfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=user_id) +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' + _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }, { + 'url': ':ytfavorites', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', + ie=YoutubeTabIE.ie_key()) + + class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for From 323427281899cf3527e504ac7058f5b10aad5d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Nov 2020 22:34:27 +0700 Subject: [PATCH 004/424] [youtube:tab] PEP 8 --- youtube_dl/extractor/youtube.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 29c1d0413..abb1cad74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2621,11 +2621,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, }, - # TODO - # { - # 'url': 'https://www.youtube.com/TheYoungTurks/live', - # 'only_matching': True, - # } + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } ] def _extract_channel_id(self, webpage): From 191286265d8ad4fff834249d44e3d334a969b668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 00:10:25 +0700 Subject: [PATCH 005/424] [youtube:tab] Fix feeds extraction (closes #25695, closes #26452) --- test/test_all_urls.py | 7 +- youtube_dl/extractor/youtube.py | 161 ++++++++++++++++++-------------- 2 files changed, 97 insertions(+), 71 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56a08bed8..50c3466fa 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -61,9 +61,10 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) # def test_youtube_search_matching(self): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index abb1cad74..7324d8080 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,7 +33,6 @@ from ..utils import ( get_element_by_id, int_or_none, mimetype2ext, - orderedSet, parse_codecs, parse_duration, remove_quotes, @@ -2381,7 +2380,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?:\w+\.)? + (?: + youtube(?:kids)?\.com| + invidio\.us + )/ + (?: + (?:channel|c|user|feed)/| + (?:playlist|watch)\?.*?\blist= + ) + (?P[^/?\#&]+) + ''' IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2620,7 +2631,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, - }, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + # no longer available? + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + } # TODO # { # 'url': 'https://www.youtube.com/TheYoungTurks/live', @@ -2707,27 +2741,34 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - def _shelf_entries_trimmed(self, shelf_renderer): - renderer = try_get( - shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) - if not renderer: + def _shelf_entries_from_content(self, shelf_renderer): + content = shelf_renderer.get('content') + if not isinstance(content, dict): return - # TODO: add support for nested playlists so each shelf is processed - # as separate playlist - # TODO: this includes only first N items - for entry in self._grid_entries(renderer): - yield entry + renderer = content.get('gridRenderer') + if renderer: + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + renderer = content.get('horizontalListRenderer') + if renderer: + # TODO + pass def _shelf_entries(self, shelf_renderer): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str) shelf_url = urljoin('https://www.youtube.com', ep) - if not shelf_url: - return - title = try_get( - shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) - yield self.url_result(shelf_url, video_title=title) + if shelf_url: + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + # Shelf may not contain shelf URL, fallback to extraction from content + for entry in self._shelf_entries_from_content(shelf_renderer): + yield entry def _playlist_entries(self, video_list_renderer): for content in video_list_renderer['contents']: @@ -2832,8 +2873,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): } def _entries(self, tab, identity_token): + slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict) + if not slr_renderer: + return continuation = None - slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] for slr_content in slr_contents: if not isinstance(slr_content, dict): continue @@ -2876,6 +2920,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not continuation: continuation = self._extract_continuation(is_renderer) + if not continuation: + continuation = self._extract_continuation(slr_renderer) + headers = { 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20201112.04.01', @@ -2924,7 +2971,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('playlistVideoRenderer') + renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} for entry in self._playlist_entries(video_list_renderer): @@ -2969,6 +3016,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + playlist_id = title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3289,10 +3337,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE): """ -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME property. """ _LOGIN_REQUIRED = True @@ -3303,55 +3351,17 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - def _real_extract(self, url): - page = self._download_webpage( + return self.url_result( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + ie=YoutubeTabIE.ie_key()) class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' - + _VALID_URL = r':ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { 'url': ':ytwatchlater', 'only_matching': True, }] @@ -3363,23 +3373,38 @@ class YoutubeWatchLaterIE(InfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r':ytrec(?:ommended)?' _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }] class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r':ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' + _VALID_URL = r':ythistory' _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] class YoutubeTruncatedURLIE(InfoExtractor): From da4eaa15a275628a56ce0512d916fb879734969f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 00:20:42 +0700 Subject: [PATCH 006/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 5a08b3a66..d00c06195 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core ++ [extractor/common] Add generic support for akamai HTTP format extraction + +Extractors +* [youtube:tab] Fix feeds extraction (#25695, #26452) +* [youtube:favorites] Restore extractor +* [youtube:tab] Fix some weird typo (#27157) ++ [pinterest] Add support for large collections (more than 25 pins) ++ [franceinter] Extract thumbnail (#27153) ++ [box] Add support for box.com (#5949) ++ [nytimes] Add support for cooking.nytimes.com (#27112, #27143) +* [lbry] Relax URL regular expression (#27144) ++ [rumble] Add support for embed pages (#10785) ++ [skyit] Add support for multiple Sky Italia websites (#26629) ++ [pinterest] Add support for pinterest.com (#25747) + + version 2020.11.21.1 Core From d0512ac4c56191b6bdd0c0baf2f907990e3045b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 00:23:14 +0700 Subject: [PATCH 007/424] release 2020.11.24 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 14 ++++++++++++++ youtube_dl/version.py | 2 +- 8 files changed, 28 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2cbaee1c..73155d94e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21.1 + [debug] youtube-dl version 2020.11.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index e531e89c0..9e1921149 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 686878c56..40ce65e73 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 17b853716..b98f06c2d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21.1 + [debug] youtube-dl version 2020.11.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 109c47925..60bd5f337 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index d00c06195..3894ac2db 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.11.24 Core + [extractor/common] Add generic support for akamai HTTP format extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53dbc28e5..dea88b5c5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -59,6 +59,7 @@ - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** + - **arte.sky.it** - **ArteTV** - **ArteTVEmbed** - **ArteTVPlaylist** @@ -110,6 +111,7 @@ - **Bloomberg** - **BokeCC** - **BostonGlobe** + - **Box** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk - **BravoTV** @@ -157,6 +159,7 @@ - **Chilloutzone** - **chirbit** - **chirbit:profile** + - **cielotv.it** - **Cinchcast** - **Cinemax** - **CiscoLiveSearch** @@ -608,6 +611,7 @@ - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NYTimesCooking** - **NZZ** - **ocw.mit.edu** - **OdaTV** @@ -660,10 +664,13 @@ - **PicartoVod** - **Piksel** - **Pinkbike** + - **Pinterest** + - **PinterestCollection** - **Pladform** - **Platzi** - **PlatziCourse** - **play.fm** + - **player.sky.it** - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz @@ -755,6 +762,7 @@ - **RTVNH** - **RTVS** - **RUHD** + - **RumbleEmbed** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -792,6 +800,8 @@ - **Shared**: shared.sx - **ShowRoomLive** - **Sina** + - **sky.it** + - **skyacademy.it** - **SkylineWebcams** - **SkyNews** - **skynewsarabia:article** @@ -930,6 +940,7 @@ - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ + - **tv8.it** - **TVA** - **TVANouvelles** - **TVANouvellesArticle** @@ -1001,6 +1012,8 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search + - **video.sky.it** + - **video.sky.it:live** - **VideoDetective** - **videofy.me** - **videomore** @@ -1130,6 +1143,7 @@ - **YourPorn** - **YourUpload** - **youtube**: YouTube.com + - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:playlist**: YouTube.com playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d7e901521..e8a816d0d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.21.1' +__version__ = '2020.11.24' From 37258c644f76416b2a09de14c0e74da628534e2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 02:18:40 +0700 Subject: [PATCH 008/424] [cda] Fix extraction (closes #17803, closes #24458, closes #24518, closes #26381) --- youtube_dl/extractor/cda.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 0c3af23d5..d67900e62 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -5,10 +5,16 @@ import codecs import re from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, float_or_none, int_or_none, + merge_dicts, multipart_encode, parse_duration, random_birthday, @@ -107,8 +113,9 @@ class CDAIE(InfoExtractor): r'Odsłony:(?:\s| )*([0-9]+)', webpage, 'view_count', default=None) average_rating = self._search_regex( - r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', - webpage, 'rating', fatal=False, group='rating_value') + (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', + r']+\bclass=["\']rating["\'][^>]*>(?P[0-9.]+)'), webpage, 'rating', fatal=False, + group='rating_value') info_dict = { 'id': video_id, @@ -123,6 +130,24 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + # Source: https://www.cda.pl/js/player.js?t=1606154898 + def decrypt_file(a): + for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): + a = a.replace(p, '') + a = compat_urllib_parse_unquote(a) + b = [] + for c in a: + f = compat_ord(c) + b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) + a = ''.join(b) + a = a.replace('.cda.mp4', '') + for p in ('.2cda.pl', '.3cda.pl'): + a = a.replace(p, '.cda.pl') + if '/upstream' in a: + a = a.replace('/upstream', '.mp4/upstream') + return 'https://' + a + return 'https://' + a + '.mp4' + def extract_format(page, version): json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P.+?)\1', page, @@ -141,6 +166,8 @@ class CDAIE(InfoExtractor): video['file'] = codecs.decode(video['file'], 'rot_13') if video['file'].endswith('adc.mp4'): video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) f = { 'url': video['file'], } @@ -179,4 +206,6 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - return info_dict + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts(info_dict, info) From a86ce9d7a1091b86948feec0ba4a34fa431d7618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 02:59:49 +0700 Subject: [PATCH 009/424] [nrk] Fix extraction --- youtube_dl/extractor/nrk.py | 442 +++++++++++++++++++++--------------- 1 file changed, 257 insertions(+), 185 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, @@ -16,17 +17,269 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] - _api_host = None + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': '706f34cdf1322577589e369e522b50ef', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + + def _extract_from_playback(self, video_id): + manifest = self._download_json( + 'http://psapi.nrk.no/playback/manifest/%s' % video_id, + video_id, 'Downloading manifest JSON') + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + data = self._download_json( + 'http://psapi.nrk.no/playback/metadata/%s' % video_id, + video_id, 'Downloading metadata JSON') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_from_playback(video_id) + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' + _VALID_URL = r'''(?x) + https?:// + (?:tv|radio)\.nrk(?:super)?\.no/ + (?:serie(?:/[^/]+){1,2}|program)/ + (?![Ee]pisodes)%s + (?:/\d{2}-\d{2}-\d{4})? + (?:\#del=(?P\d+))? + ''' % _EPISODE_RE + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223, + 'age_limit': 6, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + 'skip': 'NoProgramRights', + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'info_dict': { + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }] + + _api_host = None + + def _extract_from_mediaelement(self, video_id): api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS for api_host in api_hosts: @@ -195,190 +448,9 @@ class NRKBaseIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/PS\*| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?#&]+) - ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }] - - -class NRKTVIE(NRKBaseIE): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - 'skip': 'NoProgramRights', - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514CA', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, - 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Video is geo restricted'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - }, - 'expected_warnings': ['Video is geo restricted'], - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317AA', - 'ext': 'mp4', - 'title': 'Anno 13:30', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13:30', - 'season_number': 3, - 'episode_number': 13, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317AA', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }] + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_mediaelement(video_id) class NRKTVEpisodeIE(InfoExtractor): From f4415faa4690cf71d61d90ad7e9a7f9980be9b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Nov 2020 04:16:29 +0700 Subject: [PATCH 010/424] [downloader/fragment] Set final file's mtime according to last fragment's Last-Modified header (closes #11718, closes #18384, closes #27138) --- youtube_dl/downloader/fragment.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 02f35459e..35c76feba 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ From 01c92973ddebe6429f5a03855e41c412889c96dc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 23 Nov 2020 22:44:34 +0100 Subject: [PATCH 011/424] [youtube] imporve music metadata and license extraction(closes #26013) --- youtube_dl/extractor/youtube.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7324d8080..1e3ff7d44 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2162,7 +2162,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Youtube Music Auto-generated description release_date = release_year = None if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P[^·]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?', video_description) + mobj = re.search(r'(?s)(?P[^·\n]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: if not track: track = mobj.group('track').strip() @@ -2179,6 +2179,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if release_year: release_year = int(release_year) + yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) + contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + for content in contents: + rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = try_get( + mrr, lambda x: x['title']['simpleText'], compat_str) + mrr_contents = try_get( + mrr, lambda x: x['contents'][0], dict) or {} + mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) + if not (mrr_title and mrr_contents_text): + continue + if mrr_title == 'License': + video_license = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + album = mrr_contents_text + elif mrr_title == 'Artist': + artist = mrr_contents_text + elif mrr_title == 'Song': + track = mrr_contents_text + m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', video_webpage) From e24ebeafd1c59336705bd31b9f1ff46676273e1d Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Tue, 24 Nov 2020 19:27:33 +0200 Subject: [PATCH 012/424] [medaltv] Add new extractor (#27149) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/medaltv.py | 138 +++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 youtube_dl/extractor/medaltv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 71178da7e..688437dc2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -606,6 +606,7 @@ from .markiza import ( from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py new file mode 100644 index 000000000..06f7b6e92 --- /dev/null +++ b/youtube_dl/extractor/medaltv.py @@ -0,0 +1,138 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get, + float_or_none, + int_or_none +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._search_regex( + r']*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*', + webpage, 'hydration data', default='{}') + parsed = self._parse_json(hydration_data, video_id) + + clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} + if not clip_info: + raise ExtractorError('Could not find video information.', + video_id=video_id) + + width = int_or_none(clip_info.get('sourceWidth')) + height = int_or_none(clip_info.get('sourceHeight')) + + aspect_ratio = (width / height) if(width and height) else (16 / 9) + + # ordered from lowest to highest resolution + heights = (144, 240, 360, 480, 720, 1080) + + formats = [] + thumbnails = [] + + for height in heights: + format_key = '{0}p'.format(height) + video_key = 'contentUrl{0}'.format(format_key) + thumbnail_key = 'thumbnail{0}'.format(format_key) + width = int(round(aspect_ratio * height)) + + # Second condition needed as sometimes medal says + # they have a format when in fact it is another format. + format_url = clip_info.get(video_key) + if(format_url and format_key in format_url): + formats.append({ + 'url': format_url, + 'format_id': format_key, + 'width': width, + 'height': height + }) + + thumbnail_url = clip_info.get(thumbnail_key) + if(thumbnail_url and format_key in thumbnail_url): + thumbnails.append({ + 'id': format_key, + 'url': thumbnail_url, + 'width': width, + 'height': height + }) + + # add source to formats + source_url = clip_info.get('contentUrl') + if(source_url): + formats.append({ + 'url': source_url, + 'format_id': 'source', + 'width': width, + 'height': height + }) + + error = clip_info.get('error') + if not formats and error: + if(error == 404): + raise ExtractorError('That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError('An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author_info = try_get(parsed, + lambda x: list(x['profiles'].values())[0], dict + ) or {} + author_id = author_info.get('id') + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': clip_info.get('contentTitle'), + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip_info.get('contentDescription'), + + 'uploader': author_info.get('displayName'), + 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + + 'duration': float_or_none(clip_info.get('videoLengthSeconds')), + 'view_count': int_or_none(clip_info.get('views')), + 'like_count': int_or_none(clip_info.get('likes')), + 'comment_count': int_or_none(clip_info.get('comments')) + } From 579d43951d7d60b5027ae224973b5f541b51c74a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Nov 2020 18:29:46 +0100 Subject: [PATCH 013/424] [medaltv] improve extraction --- youtube_dl/extractor/medaltv.py | 131 +++++++++++++++----------------- 1 file changed, 62 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 06f7b6e92..1603b55f6 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -1,13 +1,16 @@ # coding: utf-8 - from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, - try_get, float_or_none, - int_or_none + int_or_none, + str_or_none, + try_get, ) @@ -45,94 +48,84 @@ class MedalTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - hydration_data = self._search_regex( + hydration_data = self._parse_json(self._search_regex( r']*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*', - webpage, 'hydration data', default='{}') - parsed = self._parse_json(hydration_data, video_id) + webpage, 'hydration data', default='{}'), video_id) - clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} - if not clip_info: - raise ExtractorError('Could not find video information.', - video_id=video_id) + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) - width = int_or_none(clip_info.get('sourceWidth')) - height = int_or_none(clip_info.get('sourceHeight')) + title = clip['contentTitle'] - aspect_ratio = (width / height) if(width and height) else (16 / 9) + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) - # ordered from lowest to highest resolution - heights = (144, 240, 360, 480, 720, 1080) + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 - formats = [] - thumbnails = [] - - for height in heights: - format_key = '{0}p'.format(height) - video_key = 'contentUrl{0}'.format(format_key) - thumbnail_key = 'thumbnail{0}'.format(format_key) + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return width = int(round(aspect_ratio * height)) - - # Second condition needed as sometimes medal says - # they have a format when in fact it is another format. - format_url = clip_info.get(video_key) - if(format_url and format_key in format_url): - formats.append({ - 'url': format_url, - 'format_id': format_key, - 'width': width, - 'height': height - }) - - thumbnail_url = clip_info.get(thumbnail_key) - if(thumbnail_url and format_key in thumbnail_url): - thumbnails.append({ - 'id': format_key, - 'url': thumbnail_url, - 'width': width, - 'height': height - }) - - # add source to formats - source_url = clip_info.get('contentUrl') - if(source_url): - formats.append({ - 'url': source_url, - 'format_id': 'source', + container.append({ + 'url': item_url, + id_key: item_id, 'width': width, 'height': height }) - error = clip_info.get('error') + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') if not formats and error: - if(error == 404): - raise ExtractorError('That clip does not exist.', - expected=True, video_id=video_id) + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) else: - raise ExtractorError('An unknown error occurred ({0}).'.format(error), - video_id=video_id) + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) # Necessary because the id of the author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. - author_info = try_get(parsed, - lambda x: list(x['profiles'].values())[0], dict - ) or {} - author_id = author_info.get('id') + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None return { 'id': video_id, - 'title': clip_info.get('contentTitle'), + 'title': title, 'formats': formats, 'thumbnails': thumbnails, - 'description': clip_info.get('contentDescription'), - - 'uploader': author_info.get('displayName'), - 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), 'uploader_id': author_id, 'uploader_url': author_url, - - 'duration': float_or_none(clip_info.get('videoLengthSeconds')), - 'view_count': int_or_none(clip_info.get('views')), - 'like_count': int_or_none(clip_info.get('likes')), - 'comment_count': int_or_none(clip_info.get('comments')) + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), } From 6c35de4c6bc13c307215cd2f8977b7ac09de28fb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Nov 2020 22:49:04 +0100 Subject: [PATCH 014/424] [bbc] fix BBC News videos extraction --- youtube_dl/extractor/bbc.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 002c39c39..ca93b7be5 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1118,6 +1118,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From a7ea88537a8bc8843636f21eb9bd9fa04d5060c5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Nov 2020 22:54:08 +0100 Subject: [PATCH 015/424] [bbc] fix BBC Three clip extraction --- youtube_dl/extractor/bbc.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index ca93b7be5..54cbcdc8e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title From 97c5be383c78da84efdb6ea9334ee95b198a2f1d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Nov 2020 23:25:03 +0100 Subject: [PATCH 016/424] [viki] fix video API request(closes #27184) --- youtube_dl/extractor/viki.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a003b7af8..a311f21ef 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -20,6 +20,7 @@ from ..utils import ( parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, ) @@ -226,8 +227,10 @@ class VikiIE(VikiBaseIE): resp = self._download_json( 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', - headers={'x-viki-app-ver': '4.0.57'}) + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '4.0.57', + }) video = resp['video'] self._check_errors(video) From 836c810716cca64673b2cb85a2b489f2dd0a15ef Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 25 Nov 2020 11:26:26 +0100 Subject: [PATCH 017/424] [vlive] Add support for post URLs(closes #27122)(closes #27123) --- youtube_dl/extractor/vlive.py | 101 ++++++++++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index df1dc78dd..111b8c1c3 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -13,6 +13,8 @@ from ..utils import ( ExtractorError, int_or_none, merge_dicts, + str_or_none, + strip_or_none, try_get, urlencode_postdata, ) @@ -103,23 +105,23 @@ class VLiveIE(VLiveBaseIE): query = {'appId': self._APP_ID} if fields: query['fields'] = fields - return self._download_json( - 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, - 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], - headers={'Referer': 'https://www.vlive.tv/'}, query=query) - - def _real_extract(self, url): - video_id = self._match_id(url) - try: - post = self._call_api( - 'post/v1.0/officialVideoPost-%s', video_id, - 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: self.raise_login_required(json.loads(e.cause.read().decode())['message']) raise + def _real_extract(self, url): + video_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + video = post['officialVideo'] def get_common_fields(): @@ -170,6 +172,83 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unknown status ' + status) +class VLivePostIE(VLiveIE): + IE_NAME = 'vlive:post' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P\d-\d+)' + _TESTS = [{ + # uploadType = SOS + 'url': 'https://www.vlive.tv/post/1-20088044', + 'info_dict': { + 'id': '1-20088044', + 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) Ha...', + 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', + }, + 'playlist_count': 3, + }, { + # uploadType = V + 'url': 'https://www.vlive.tv/post/1-20087926', + 'info_dict': { + 'id': '1-20087926', + 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', + }, + 'playlist_count': 1, + }] + _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' + _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' + _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' + + def _real_extract(self, url): + post_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/post-%s', post_id, + 'attachments{video},officialVideo{videoSeq},plainBody,title') + + video_seq = str_or_none(try_get( + post, lambda x: x['officialVideo']['videoSeq'])) + if video_seq: + return self.url_result( + 'http://www.vlive.tv/video/' + video_seq, + VLiveIE.ie_key(), video_seq) + + title = post['title'] + entries = [] + for idx, video in enumerate(post['attachments']['video'].values()): + video_id = video.get('videoId') + if not video_id: + continue + upload_type = video.get('uploadType') + upload_info = video.get('uploadInfo') or {} + entry = None + if upload_type == 'SOS': + download = self._call_api( + self._SOS_TMPL, video_id)['videoUrl']['download'] + formats = [] + for f_id, f_url in download.items(): + formats.append({ + 'format_id': f_id, + 'url': f_url, + 'height': int_or_none(f_id[:-1]), + }) + self._sort_formats(formats) + entry = { + 'formats': formats, + 'id': video_id, + 'thumbnail': upload_info.get('imageUrl'), + } + elif upload_type == 'V': + vod_id = upload_info.get('videoId') + if not vod_id: + continue + inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] + entry = self._extract_video_info(video_id, vod_id, inkey) + if entry: + entry['title'] = '%s_part%s' % (title, idx) + entries.append(entry) + return self.playlist_result( + entries, post_id, title, strip_or_none(post.get('plainBody'))) + + class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P[0-9A-Z]+)' From 3a78198a96ffcb989d1302294f6a747c6147b418 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 25 Nov 2020 11:40:37 +0100 Subject: [PATCH 018/424] [vlive] improve extraction for geo-restricted videos --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vlive.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 688437dc2..f7757b4f4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1375,6 +1375,7 @@ from .vk import ( ) from .vlive import ( VLiveIE, + VLivePostIE, VLiveChannelIE, ) from .vodlocker import VodlockerIE diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 111b8c1c3..223709b1e 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -68,6 +68,10 @@ class VLiveIE(VLiveBaseIE): }, { 'url': 'https://www.vlive.tv/embed/1326', 'only_matching': True, + }, { + # works only with gcc=KR + 'url': 'https://www.vlive.tv/video/225019', + 'only_matching': True, }] def _real_initialize(self): @@ -102,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID} + query = {'appId': self._APP_ID, 'gcc': 'KR'} if fields: query['fields'] = fields try: From 686e898fde48de9981c170f21d631d15e6f419ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Nov 2020 02:58:48 +0700 Subject: [PATCH 019/424] [spreaker] Add extractor (closes #13480, closes #13877) --- youtube_dl/extractor/extractors.py | 6 + youtube_dl/extractor/spreaker.py | 176 +++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 youtube_dl/extractor/spreaker.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f7757b4f4..fd19f0f0a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1082,6 +1082,12 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py new file mode 100644 index 000000000..beee6670c --- /dev/null +++ b/youtube_dl/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P\d+)', + r'episode_id\s*:\s*(?P\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) From 4dc545553f977013ae92fcb10046749b9a0405ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Nov 2020 03:03:51 +0700 Subject: [PATCH 020/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 3894ac2db..9f47e6826 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core +* [downloader/fragment] Set final file's mtime according to last fragment's + Last-Modified header (#11718, #18384, #27138) + +Extractors ++ [spreaker] Add support for spreaker.com (#13480, #13877) +* [vlive] Improve extraction for geo-restricted videos ++ [vlive] Add support for post URLs (#27122, #27123) +* [viki] Fix video API request (#27184) +* [bbc] Fix BBC Three clip extraction +* [bbc] Fix BBC News videos extraction ++ [medaltv] Add support for medal.tv (#27149) +* [youtube] Imporve music metadata and license extraction (#26013) +* [nrk] Fix extraction +* [cda] Fix extraction (#17803, #24458, #24518, #26381) + + version 2020.11.24 Core From 9fe50837c3e8f6c40b7bed8bf7105a868a7a678f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Nov 2020 03:05:51 +0700 Subject: [PATCH 021/424] release 2020.11.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++++++ youtube_dl/version.py | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 73155d94e..98c075cb1 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.24** +- [ ] I've verified that I'm running youtube-dl version **2020.11.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.24 + [debug] youtube-dl version 2020.11.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9e1921149..12423c0e1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.24** +- [ ] I've verified that I'm running youtube-dl version **2020.11.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 40ce65e73..11522132a 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.24** +- [ ] I've verified that I'm running youtube-dl version **2020.11.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b98f06c2d..9be368760 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.24** +- [ ] I've verified that I'm running youtube-dl version **2020.11.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.24 + [debug] youtube-dl version 2020.11.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 60bd5f337..a6e1eabe6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.24** +- [ ] I've verified that I'm running youtube-dl version **2020.11.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 9f47e6826..b785e89ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.11.26 Core * [downloader/fragment] Set final file's mtime according to last fragment's diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dea88b5c5..8387dcce0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -471,6 +471,7 @@ - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA + - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** - **Medialaan** @@ -839,6 +840,10 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk @@ -1055,6 +1060,7 @@ - **vk:wallpost** - **vlive** - **vlive:channel** + - **vlive:post** - **Vodlocker** - **VODPl** - **VODPlatform** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e8a816d0d..d8a53b96e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.24' +__version__ = '2020.11.26' From 99de2f38d375f794ffdc4f5a110a5227310f3ab0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 25 Nov 2020 21:39:17 +0100 Subject: [PATCH 022/424] [spreaker] fix SpreakerShowIE test URL --- youtube_dl/extractor/spreaker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index beee6670c..6c7e40ae4 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -126,7 +126,7 @@ class SpreakerPageIE(InfoExtractor): class SpreakerShowIE(InfoExtractor): _VALID_URL = r'https?://api\.spreaker\.com/show/(?P\d+)' _TESTS = [{ - 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'url': 'https://api.spreaker.com/show/4652058', 'info_dict': { 'id': '4652058', }, From a3cf22e590f7057c50563f36c17b59993446dbdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Thu, 26 Nov 2020 12:55:06 +0100 Subject: [PATCH 023/424] [videa] Adapt to updates (#26301) closes #25973, closes #25650. --- youtube_dl/extractor/videa.py | 62 +++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index d0e34c819..a03614cc1 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ class VideaIE(InfoExtractor): r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'

([^<]+)

', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) - info = self._download_xml( + video_src_params_raw = self._search_regex(r']+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] + + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ class VideaIE(InfoExtractor): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, From f9f9699f2fa87ec54d59b1834cc56adb6147fec6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Nov 2020 12:56:49 +0100 Subject: [PATCH 024/424] [videa] improve extraction --- youtube_dl/extractor/videa.py | 91 +++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index a03614cc1..ab2c15cde 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -1,10 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import random +import re import string -import struct from .common import InfoExtractor from ..utils import ( @@ -12,13 +11,14 @@ from ..utils import ( int_or_none, mimetype2ext, parse_codecs, + update_url_query, xpath_element, xpath_text, ) from ..compat import ( compat_b64decode, compat_ord, - compat_parse_qs, + compat_struct_pack, ) @@ -28,7 +28,7 @@ class VideaIE(InfoExtractor): videa(?:kid)?\.hu/ (?: videok/(?:[^/]+/)*[^?#&]+-| - player\?.*?\bv=| + (?:videojs_)?player\?.*?\bv=| player/v/ ) (?P[^?#&]+) @@ -62,6 +62,7 @@ class VideaIE(InfoExtractor): 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, }] + _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' @staticmethod def _extract_urls(webpage): @@ -69,75 +70,84 @@ class VideaIE(InfoExtractor): r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] - def rc4(self, ciphertext, key): + @staticmethod + def rc4(cipher_text, key): res = b'' - keyLen = len(key) + key_len = len(key) S = list(range(256)) j = 0 for i in range(256): - j = (j + S[i] + ord(key[i % keyLen])) % 256 + j = (j + S[i] + ord(key[i % key_len])) % 256 S[i], S[j] = S[j], S[i] i = 0 j = 0 - for m in range(len(ciphertext)): + for m in range(len(cipher_text)): i = (i + 1) % 256 j = (j + S[i]) % 256 S[i], S[j] = S[j], S[i] k = S[(S[i] + S[j]) % 256] - res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - return res + return res.decode() def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=True) - error = self._search_regex(r'

([^<]+)

', webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) + query = {'v': video_id} + player_page = self._download_webpage( + 'https://videa.hu/player', video_id, query=query) - video_src_params_raw = self._search_regex(r']+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') - video_src_params = compat_parse_qs(video_src_params_raw) - player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) - nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') - random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) - static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + nonce = self._search_regex( + r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') l = nonce[:32] s = nonce[32:] result = '' for i in range(0, 32): - result += s[i - (static_secret.index(l[i]) - 31)] + result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] - video_src_params['_s'] = random_seed - video_src_params['_t'] = result[:16] - encryption_key_stem = result[16:] + random_seed + random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + query['_s'] = random_seed + query['_t'] = result[:16] - [b64_info, handle] = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, - query=video_src_params, fatal=True) + b64_info, handle = self._download_webpage_handle( + 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + if b64_info.startswith(' Date: Sat, 28 Nov 2020 15:02:31 +0700 Subject: [PATCH 025/424] [youtube] Improve yt initial player response extraction (closes #27216) --- youtube_dl/extractor/youtube.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1e3ff7d44..4616ac53d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -283,6 +283,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): } _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' def _call_api(self, ep, query, video_id): data = self._DEFAULT_API_DATA.copy() @@ -1068,7 +1069,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) + # with '};' inside yt initial data (see [1]) + # see [2] for an example with '};' inside ytInitialPlayerResponse + # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 + # 2. https://github.com/ytdl-org/youtube-dl/issues/27216 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', 'info_dict': { 'id': 'CHqg6qOn4no', @@ -1686,7 +1690,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info and not player_response: player_response = extract_player_response( self._search_regex( - r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage, + (r'%s\s*(?:var\s+meta| Date: Sat, 28 Nov 2020 15:32:13 -0400 Subject: [PATCH 026/424] [pornhub] Fix like and dislike count extraction (closes #27227) (#27234) --- youtube_dl/extractor/pornhub.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 529f3f711..a50b6260c 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -346,9 +346,9 @@ class PornHubIE(PornHubBaseIE): view_count = self._extract_count( r'([\d,\.]+) [Vv]iews', webpage, 'view') like_count = self._extract_count( - r'([\d,\.]+)', webpage, 'like') + r']+class="votesUp"[^>]*>([\d,\.]+)', webpage, 'like') dislike_count = self._extract_count( - r'([\d,\.]+)', webpage, 'dislike') + r']+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') From 9585b376db8cb0a3f0e7a866e5d64b351fb6ecf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Nov 2020 03:55:49 +0700 Subject: [PATCH 027/424] [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug (closes #14579, closes #22593) TODO: logging and verbosity needs major refactoring (refs #10894) --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 855a73157..fdd68f616 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1610,7 +1610,7 @@ class YoutubeDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1871,7 +1871,7 @@ class YoutubeDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: From 717d1d2d5acf88aba6e227870e15bbff0be89ff1 Mon Sep 17 00:00:00 2001 From: bopol Date: Sat, 28 Nov 2020 22:15:53 +0100 Subject: [PATCH 028/424] [ina] Add support for mobile URLs (#27229) --- youtube_dl/extractor/ina.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -12,7 +12,7 @@ from ..utils import ( class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url): From fb626c05867deab04425bad0c0b16b55473841a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Nov 2020 08:09:20 +0700 Subject: [PATCH 029/424] [tiktok] Fix extraction (closes #20809, closes #22838, closes #22850, closes #25987, closes #26281, closes #26411, closes #26639, closes #26776, closes #27237) --- youtube_dl/extractor/tiktok.py | 122 +++++++++++++++++---------------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 66088b9ab..ea1beb8af 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( compat_str, ExtractorError, + float_or_none, int_or_none, str_or_none, try_get, @@ -13,7 +14,7 @@ from ..utils import ( class TikTokBaseIE(InfoExtractor): - def _extract_aweme(self, data): + def _extract_video(self, data, video_id=None): video = data['video'] description = str_or_none(try_get(data, lambda x: x['desc'])) width = int_or_none(try_get(data, lambda x: video['width'])) @@ -21,43 +22,54 @@ class TikTokBaseIE(InfoExtractor): format_urls = set() formats = [] - for format_id in ( - 'play_addr_lowbr', 'play_addr', 'play_addr_h264', - 'download_addr'): - for format in try_get( - video, lambda x: x[format_id]['url_list'], list) or []: - format_url = url_or_none(format) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - }) + for format_id in ('download', 'play'): + format_url = url_or_none(video.get('%sAddr' % format_id)) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + 'http_headers': { + 'Referer': 'https://www.tiktok.com/', + } + }) self._sort_formats(formats) - thumbnail = url_or_none(try_get( - video, lambda x: x['cover']['url_list'][0], compat_str)) - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - timestamp = int_or_none(data.get('create_time')) - comment_count = int_or_none(data.get('comment_count')) or int_or_none( - try_get(data, lambda x: x['statistics']['comment_count'])) - repost_count = int_or_none(try_get( - data, lambda x: x['statistics']['share_count'])) + thumbnail = url_or_none(video.get('cover')) + duration = float_or_none(video.get('duration')) - aweme_id = data['aweme_id'] + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + uploader_id = try_get(data, lambda x: x['author']['id'], compat_str) + + timestamp = int_or_none(data.get('createTime')) + + def stats(key): + return int_or_none(try_get( + data, lambda x: x['stats']['%sCount' % key])) + + view_count = stats('play') + like_count = stats('digg') + comment_count = stats('comment') + repost_count = stats('share') + + aweme_id = data.get('id') or video_id return { 'id': aweme_id, 'title': uploader or aweme_id, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, + 'uploader_id': uploader_id, 'timestamp': timestamp, + 'view_count': view_count, + 'like_count': like_count, 'comment_count': comment_count, 'repost_count': repost_count, 'formats': formats, @@ -65,62 +77,56 @@ class TikTokBaseIE(InfoExtractor): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/v| - (?:www\.)?tiktok\.com/share/video - ) - /(?P\d+) - ''' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@[^/]+/video/(?P\d+)' _TESTS = [{ - 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': 'd584b572e92fcd48888051f238022420', + 'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213', + 'md5': '163ceff303bb52de60e6887fe399e6cd', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', 'title': 'Zureeal', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', - 'thumbnail': r're:^https?://.*~noop.image', + 'thumbnail': r're:^https?://.*', + 'duration': 15, 'uploader': 'Zureeal', + 'uploader_id': '188294915489964032', 'timestamp': 1538248586, 'upload_date': '20180929', + 'view_count': int, + 'like_count': int, 'comment_count': int, 'repost_count': int, } - }, { - 'url': 'https://www.tiktok.com/share/video/6606727368545406213', - 'only_matching': True, }] + def _real_initialize(self): + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://www.tiktok.com/', None, note='Setting up session') + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://m.tiktok.com/v/%s.html' % video_id, video_id) + webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) - return self._extract_aweme(data) + r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*\d+) - ''' + _VALID_URL = r'https://(?:www\.)?tiktok\.com/@(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'url': 'https://www.tiktok.com/@zureeal', 'info_dict': { 'id': '188294915489964032', }, 'playlist_mincount': 24, - }, { - 'url': 'https://www.tiktok.com/share/user/188294915489964032', - 'only_matching': True, }] + _WORKING = False + + @classmethod + def suitable(cls, url): + return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url) def _real_extract(self, url): user_id = self._match_id(url) @@ -130,7 +136,7 @@ class TikTokUserIE(TikTokBaseIE): entries = [] for aweme in data['aweme_list']: try: - entry = self._extract_aweme(aweme) + entry = self._extract_video(aweme) except ExtractorError: continue entry['extractor_key'] = TikTokIE.ie_key() From 43181707799fe89b477759a6a20ce0bb6a1e5d82 Mon Sep 17 00:00:00 2001 From: Michael Munch Date: Sun, 29 Nov 2020 07:44:36 +0100 Subject: [PATCH 030/424] [drtv] Extend _VALID_URL (#27243) --- youtube_dl/extractor/drtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 390e79f8c..c0036adb6 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor): https?:// (?: (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P[\da-z_-]+) ''' @@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, }] def _real_extract(self, url): From 16c822e91e6cbd433bd562c213538553574b2a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Nov 2020 13:49:12 +0700 Subject: [PATCH 031/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index b785e89ad..fc75cf7c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version + +Core +* [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug + (#14579, #22593) + +Extractors +* [drtv] Extend URL regular expression (#27243) +* [tiktok] Fix extraction (#20809, #22838, #22850, #25987, #26281, #26411, + #26639, #26776, #27237) ++ [ina] Add support for mobile URLs (#27229) +* [pornhub] Fix like and dislike count extraction (#27227, #27234) +* [youtube] Improve yt initial player response extraction (#27216) +* [videa] Fix extraction (#25650, #25973, #26301) + + version 2020.11.26 Core @@ -12,7 +28,7 @@ Extractors * [bbc] Fix BBC Three clip extraction * [bbc] Fix BBC News videos extraction + [medaltv] Add support for medal.tv (#27149) -* [youtube] Imporve music metadata and license extraction (#26013) +* [youtube] Improve music metadata and license extraction (#26013) * [nrk] Fix extraction * [cda] Fix extraction (#17803, #24458, #24518, #26381) From b449b73dccf6fdd676770ea83cff322348cdf778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Nov 2020 13:53:01 +0700 Subject: [PATCH 032/424] release 2020.11.29 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 98c075cb1..95b6f674f 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.26** +- [ ] I've verified that I'm running youtube-dl version **2020.11.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.26 + [debug] youtube-dl version 2020.11.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 12423c0e1..a3b53f5f3 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.26** +- [ ] I've verified that I'm running youtube-dl version **2020.11.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 11522132a..476fdc3b7 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.26** +- [ ] I've verified that I'm running youtube-dl version **2020.11.29** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 9be368760..2ae00f75e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.26** +- [ ] I've verified that I'm running youtube-dl version **2020.11.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.26 + [debug] youtube-dl version 2020.11.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a6e1eabe6..f9b7edb49 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.26** +- [ ] I've verified that I'm running youtube-dl version **2020.11.29** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index fc75cf7c2..966f9379f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.11.29 Core * [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8387dcce0..702930e71 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -912,7 +912,7 @@ - **ThisAV** - **ThisOldHouse** - **TikTok** - - **TikTokUser** + - **TikTokUser** (Currently broken) - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d8a53b96e..ebb8a200d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.26' +__version__ = '2020.11.29' From d6ce649f15cff2e259a0f38b176b97fec293f0ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Nov 2020 22:15:51 +0700 Subject: [PATCH 033/424] [yandexmusic:track] Fix extraction (closes #26449, closes #26669, closes #26747, closes #26748, closes #26762) --- youtube_dl/extractor/yandexmusic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 08d35e04c..5805caef2 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -109,8 +109,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'Downloading track location JSON', query={'format': 'json'}) key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() - storage = track['storageDir'].split('.') - f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1]) + f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') From 6da0e5e7a238a9a27260198da741a4014787dcae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 30 Nov 2020 00:25:06 +0700 Subject: [PATCH 034/424] [yandexmusic] Refactor and add support for artist's tracks and albums (closes #11887, closes #22284) --- youtube_dl/extractor/extractors.py | 2 + youtube_dl/extractor/yandexmusic.py | 280 ++++++++++++++++++++-------- 2 files changed, 200 insertions(+), 82 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fd19f0f0a..e5327d375 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1478,6 +1478,8 @@ from .yandexmusic import ( YandexMusicTrackIE, YandexMusicAlbumIE, YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, ) from .yandexvideo import YandexVideoIE from .yapfiles import YapFilesIE diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 5805caef2..324ac8bba 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,57 +46,69 @@ class YandexMusicBaseIE(InfoExtractor): self._handle_error(response) return response + def _call_api(self, ep, tld, url, item_id, note, query): + return self._download_json( + 'https://music.yandex.%s/handlers/%s.jsx' % (tld, ep), + item_id, note, + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query=query) + class YandexMusicTrackIE(YandexMusicBaseIE): IE_NAME = 'yandexmusic:track' IE_DESC = 'Яндекс.Музыка - Трек' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508/track/4878838', - 'md5': 'f496818aa2f60b6c0062980d2e00dc20', + 'md5': 'dec8b661f12027ceaba33318787fff76', 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', - 'filesize': 4628061, + 'title': 'md5:c63e19341fdbe84e43425a30bc777856', + 'filesize': int, 'duration': 193.04, - 'track': 'Gypsy Eyes 1', - 'album': 'Gypsy Soul', - 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio & Fabio Di Bari', + 'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff', + 'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a', + 'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200', + 'artist': 'md5:e6fd86621825f14dc0b25db3acd68160', 'release_year': 2009, }, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { # multiple disks 'url': 'http://music.yandex.ru/album/3840501/track/705105', - 'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e', + 'md5': '82a54e9e787301dd45aba093cf6e58c0', 'info_dict': { 'id': '705105', 'ext': 'mp3', - 'title': 'Hooverphonic - Sometimes', - 'filesize': 5743386, + 'title': 'md5:f86d4a9188279860a83000277024c1a6', + 'filesize': int, 'duration': 239.27, - 'track': 'Sometimes', - 'album': 'The Best of Hooverphonic', - 'album_artist': 'Hooverphonic', - 'artist': 'Hooverphonic', + 'track': 'md5:40f887f0666ba1aa10b835aca44807d1', + 'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873', + 'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', + 'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', 'release_year': 2016, 'genre': 'pop', 'disc_number': 2, 'track_number': 9, }, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - album_id, track_id = mobj.group('album_id'), mobj.group('id') + tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id') - track = self._download_json( - 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), - track_id, 'Downloading track JSON')['track'] + track = self._call_api( + 'track', tld, url, track_id, 'Downloading track JSON', + {'track': '%s:%s' % (track_id, album_id)})['track'] track_title = track['title'] download_data = self._download_json( @@ -179,42 +191,85 @@ class YandexMusicTrackIE(YandexMusicBaseIE): class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): + def _extract_tracks(self, source, item_id, url, tld): + tracks = source['tracks'] + track_ids = [compat_str(track_id) for track_id in source['trackIds']] + + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, + # missing tracks should be retrieved manually. + if len(tracks) < len(track_ids): + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON', { + 'entries': ','.join(missing_track_ids), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + + return tracks + def _build_playlist(self, tracks): - return [ - self.url_result( - 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) - for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] + entries = [] + for track in tracks: + track_id = track.get('id') or track.get('realId') + if not track_id: + continue + albums = track.get('albums') + if not albums or not isinstance(albums, list): + continue + album = albums[0] + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s/track/%s' % (album_id, track_id), + ie=YandexMusicTrackIE.ie_key(), video_id=track_id)) + return entries class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:album' IE_DESC = 'Яндекс.Музыка - Альбом' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/?(\?|$)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/album/(?P\d+)/?(\?|$)' _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508', 'info_dict': { 'id': '540508', - 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', + 'title': 'md5:7ed1c3567f28d14be9f61179116f5571', }, 'playlist_count': 50, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { 'url': 'https://music.yandex.ru/album/3840501', 'info_dict': { 'id': '3840501', - 'title': 'Hooverphonic - The Best of Hooverphonic (2016)', + 'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f', }, 'playlist_count': 33, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): - album_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + album_id = mobj.group('id') - album = self._download_json( - 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, - album_id, 'Downloading album JSON') + album = self._call_api( + 'album', tld, url, album_id, 'Downloading album JSON', + {'album': album_id}) entries = self._build_playlist([track for volume in album['volumes'] for track in volume]) @@ -235,21 +290,24 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', - 'title': 'Что слушают Enter Shikari', + 'title': 'md5:841559b3fe2b998eca88d0d2e22a3097', 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, - 'playlist_count': 6, - 'skip': 'Travis CI servers blocked by YandexMusic', + 'playlist_count': 5, + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { - # playlist exceeding the limit of 150 tracks shipped with webpage (see - # https://github.com/ytdl-org/youtube-dl/issues/6666) 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'only_matching': True, + }, { + # playlist exceeding the limit of 150 tracks (see + # https://github.com/ytdl-org/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364', 'info_dict': { - 'id': '1036', - 'title': 'Музыка 90-х', + 'id': '1364', + 'title': 'md5:b3b400f997d3f878a13ae0699653f7db', }, - 'playlist_mincount': 300, - 'skip': 'Travis CI servers blocked by YandexMusic', + 'playlist_mincount': 437, + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): @@ -258,16 +316,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): user = mobj.group('user') playlist_id = mobj.group('id') - playlist = self._download_json( - 'https://music.yandex.%s/handlers/playlist.jsx' % tld, - playlist_id, 'Downloading missing tracks JSON', - fatal=False, - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - 'X-Retpath-Y': url, - }, - query={ + playlist = self._call_api( + 'playlist', tld, url, playlist_id, 'Downloading playlist JSON', { 'owner': user, 'kinds': playlist_id, 'light': 'true', @@ -276,37 +326,103 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks = playlist['tracks'] - track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] - - # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, - # missing tracks should be retrieved manually. - if len(tracks) < len(track_ids): - present_track_ids = set([ - compat_str(track['id']) - for track in tracks if track.get('id')]) - missing_track_ids = [ - track_id for track_id in track_ids - if track_id not in present_track_ids] - missing_tracks = self._download_json( - 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, - playlist_id, 'Downloading missing tracks JSON', - fatal=False, - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }, - query={ - 'entries': ','.join(missing_track_ids), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) + tracks = self._extract_tracks(playlist, playlist_id, url, tld) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), playlist.get('title'), playlist.get('description')) + + +class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE): + def _call_artist(self, tld, url, artist_id): + return self._call_api( + 'artist', tld, url, artist_id, + 'Downloading artist %s JSON' % self._ARTIST_WHAT, { + 'artist': artist_id, + 'what': self._ARTIST_WHAT, + 'sort': self._ARTIST_SORT or '', + 'dir': '', + 'period': '', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + }) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + title = try_get(data, lambda x: x['artist']['name'], compat_str) + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:tracks' + IE_DESC = 'Яндекс.Музыка - Артист - Треки' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/artist/(?P\d+)/tracks' + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/tracks', + 'info_dict': { + 'id': '617526', + 'title': 'md5:131aef29d45fd5a965ca613e708c040b', + }, + 'playlist_count': 507, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = '' + _ARTIST_WHAT = 'tracks' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Треки') + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:albums' + IE_DESC = 'Яндекс.Музыка - Артист - Альбомы' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/artist/(?P\d+)/albums' + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/albums', + 'info_dict': { + 'id': '617526', + 'title': 'md5:55dc58d5c85699b7fb41ee926700236c', + }, + 'playlist_count': 8, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = 'year' + _ARTIST_WHAT = 'albums' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + entries = [] + for album in data['albums']: + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s' % album_id, + ie=YandexMusicAlbumIE.ie_key(), video_id=album_id)) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Альбомы') + return self.playlist_result(entries, artist_id, title) From cfeba5d17f5f621b129571213e7034c44c719e76 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 30 Nov 2020 09:45:44 +0100 Subject: [PATCH 035/424] [mediaset] add support for movie URLs(closes #27240) --- youtube_dl/extractor/mediaset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod From 59d63d8d4aa82c7b1472d5b6a0f9f4e9c2893212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Dec 2020 00:49:03 +0700 Subject: [PATCH 036/424] [youtube] Improve age-gated videos extraction (closes #27259) --- youtube_dl/extractor/youtube.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4616ac53d..6fdc379cd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -602,7 +602,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, - # Normal age-gate video (No vevo, embed allowed) + # Normal age-gate video (No vevo, embed allowed), available via embed page { 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { @@ -618,6 +618,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, + { + # Age-gated video only available with authentication (unavailable + # via embed page workaround) + 'url': 'XgnwCQzjau8', + 'only_matching': True, + }, # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator { @@ -1637,8 +1643,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video info video_info = {} embed_webpage = None - if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+' - or re.search(r'player-age-gate-content">', video_webpage) is not None): + + if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube From be19ae11fde9eb8cb28ff967ded15b457b5f80ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Ber=C3=A1nek?= Date: Mon, 30 Nov 2020 19:14:29 +0100 Subject: [PATCH 037/424] [cspan] Pass Referer header with format's video URL (#26032) (closes #25729) --- youtube_dl/extractor/cspan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 67d6df4b0..3356cc280 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -165,6 +165,8 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), From 3e4e338133463e57125ade82fe5dfd57bb42dc1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Dec 2020 01:53:12 +0700 Subject: [PATCH 038/424] [cspan] Extract info from jwplayer data (closes #3672, closes #3734, closes #10638, closes #13030, closes #18806, closes #23148, closes #24461, closes #26171, closes #26800, closes #27263) --- youtube_dl/extractor/cspan.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 3356cc280..766942146 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,8 @@ from ..utils import ( find_xpath_attr, get_element_by_class, int_or_none, + js_to_json, + merge_dicts, smuggle_url, unescapeHTML, ) @@ -98,6 +100,26 @@ class CSpanIE(InfoExtractor): bc_attr['data-bcid']) return self.url_result(smuggle_url(bc_url, {'source_url': url})) + def add_referer(formats): + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url + + # As of 01.12.2020 this path looks to cover all cases making the rest + # of the code unnecessary + jwsetup = self._parse_json( + self._search_regex( + r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if jwsetup: + info = self._parse_jwplayer_data( + jwsetup, video_id, require_title=False, m3u8_id='hls', + base_url=url) + add_referer(info['formats']) + ld_info = self._search_json_ld(webpage, video_id, default={}) + return merge_dicts(info, ld_info) + + # Obsolete # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) @@ -165,8 +187,7 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] - for f in formats: - f.setdefault('http_headers', {})['Referer'] = url + add_referer(formats) self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), From 132aece1ed824aafd849ef63dcb44c835eab6ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Dec 2020 04:44:10 +0700 Subject: [PATCH 039/424] [youtube:tab] Extract channels only from channels tab (closes #27266) --- youtube_dl/extractor/youtube.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6fdc379cd..b796f58b2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2796,12 +2796,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # TODO pass - def _shelf_entries(self, shelf_renderer): + def _shelf_entries(self, shelf_renderer, skip_channels=False): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str) shelf_url = urljoin('https://www.youtube.com', ep) if shelf_url: + # Skipping links to another channels, note that checking for + # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL + # will not work + if skip_channels and '/channels?' in shelf_url: + return title = try_get( shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) yield self.url_result(shelf_url, video_title=title) @@ -2912,9 +2917,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): } def _entries(self, tab, identity_token): - slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict) + tab_content = try_get(tab, lambda x: x['content'], dict) + if not tab_content: + return + slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) if not slr_renderer: return + is_channels_tab = tab.get('title') == 'Channels' continuation = None slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] for slr_content in slr_contents: @@ -2941,7 +2950,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continue renderer = isr_content.get('shelfRenderer') if renderer: - for entry in self._shelf_entries(renderer): + for entry in self._shelf_entries(renderer, not is_channels_tab): yield entry continue renderer = isr_content.get('backstagePostThreadRenderer') @@ -3071,7 +3080,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): description = None playlist_id = item_id playlist = self.playlist_result( - self._entries(selected_tab['content'], identity_token), + self._entries(selected_tab, identity_token), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) From d3e142b3faca04ab21163595d7dcf308d52e5253 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 1 Dec 2020 10:11:51 +0100 Subject: [PATCH 040/424] [toggle] Add support for new MeWatch URLs (closes #27256) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/toggle.py | 74 +++++++++++++++++------------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5327d375..048e72f60 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1185,7 +1185,10 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .toggle import ToggleIE +from .toggle import ( + ToggleIE, + MeWatchIE, +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index ca2e36efe..cababa69e 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -11,13 +11,13 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, ) class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P[0-9]+)' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -96,16 +96,6 @@ class ToggleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading video page') - - api_user = self._search_regex( - r'apiUser\s*:\s*(["\'])(?P.+?)\1', webpage, 'apiUser', - default=self._API_USER, group='user') - api_pass = self._search_regex( - r'apiPass\s*:\s*(["\'])(?P.+?)\1', webpage, 'apiPass', - default=self._API_PASS, group='pass') - params = { 'initObj': { 'Locale': { @@ -118,17 +108,16 @@ class ToggleIE(InfoExtractor): 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, - 'ApiPass': api_pass + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS }, 'MediaID': video_id, 'mediaType': 0, } - req = sanitized_Request( + info = self._download_json( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - json.dumps(params).encode('utf-8')) - info = self._download_json(req, video_id, 'Downloading video info json') + video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) title = info['MediaName'] @@ -172,14 +161,6 @@ class ToggleIE(InfoExtractor): raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) - duration = int_or_none(info.get('Duration')) - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - - average_rating = float_or_none(info.get('Rating')) - view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) - like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) - thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -199,15 +180,46 @@ class ToggleIE(InfoExtractor): }) thumbnails.append(thumbnail) + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': created_at, - 'average_rating': average_rating, - 'view_count': view_count, - 'like_count': like_count, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), 'thumbnails': thumbnails, 'formats': formats, } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + def _real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) From 95ac4de229921d7e59781179c7f7a9aaba34b5d5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 1 Dec 2020 10:38:53 +0100 Subject: [PATCH 041/424] [toggle] Detect DRM protected videos (closes #16479)(closes #20805) --- youtube_dl/extractor/toggle.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index cababa69e..91b8023b8 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -84,12 +84,6 @@ class ToggleIE(InfoExtractor): 'only_matching': True, }] - _FORMAT_PREFERENCES = { - 'wvm-STBMain': -10, - 'wvm-iPadMain': -20, - 'wvm-iPhoneMain': -30, - 'wvm-Android': -40, - } _API_USER = 'tvpapi_147' _API_PASS = '11111' @@ -130,11 +124,16 @@ class ToggleIE(InfoExtractor): vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False)) + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id=vid_format, @@ -147,16 +146,17 @@ class ToggleIE(InfoExtractor): note='Downloading %s ISM manifest' % vid_format, errnote='Failed to download %s ISM manifest' % vid_format, fatal=False)) - elif ext in ('mp4', 'wvm'): - # wvm are drm-protected files + elif ext == 'mp4': formats.append({ 'ext': ext, 'url': video_url, 'format_id': vid_format, - 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, - 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) if not formats: + for meta in (info.get('Metas') or []): + if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': + raise ExtractorError( + 'This video is DRM protected.', expected=True) # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) From 37d979ad33717c2b98d4f259d109c3b1c11906ae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 1 Dec 2020 12:25:02 +0100 Subject: [PATCH 042/424] [tva] Add support for qub.ca (closes #27235) --- youtube_dl/extractor/tva.py | 65 +++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 443f46e8a..52a4ddf32 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, smuggle_url, + strip_or_none, ) @@ -23,7 +25,8 @@ class TVAIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://video.tva.ca/details/_5596811470001', 'only_matching': True, @@ -32,26 +35,54 @@ class TVAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ - 'Accept': 'application/json', - }, query={ - 'appId': '5955fc5f23eec60006c951f1', - }) - - def get_attribute(key): - for attribute in video_data.get('attributes', []): - if attribute.get('key') == key: - return attribute.get('value') - return None return { '_type': 'url_transparent', 'id': video_id, - 'title': get_attribute('title'), 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'description': get_attribute('description'), - 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), - 'duration': float_or_none(get_attribute('video-duration'), 1000), 'ie_key': 'BrightcoveNew', } + + +class QubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'md5': '949490fd0e7aee11d0543777611fbd53', + 'info_dict': { + 'id': '6084352463001', + 'ext': 'mp4', + 'title': 'Épisode 01', + 'uploader_id': '5481942443001', + 'upload_date': '20190907', + 'timestamp': 1567899756, + 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + }, + }, { + 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', + 'only_matching': True, + }] + # reference_id also works with old account_id(5481942443001) + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._download_json( + 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', + entity_id, query={'id': entity_id}) + video_id = entity['videoId'] + episode = strip_or_none(entity.get('name')) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': episode, + # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], + 'url': 'https://videos.tva.ca/details/_' + video_id, + 'description': entity.get('longDescription'), + 'duration': float_or_none(entity.get('durationMillis'), 1000), + 'episode': episode, + 'episode_number': int_or_none(entity.get('episodeNumber')), + # 'ie_key': 'BrightcoveNew', + 'ie_key': TVAIE.ie_key(), + } From 09d923f2ebad2a24e11abbe86c47ec1efa2b7dc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Dec 2020 01:22:43 +0700 Subject: [PATCH 043/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index 966f9379f..02dc12645 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version + +Extractors ++ [tva] Add support for qub.ca (#27235) ++ [toggle] Detect DRM protected videos (closes #16479)(closes #20805) ++ [toggle] Add support for new MeWatch URLs (#27256) +* [youtube:tab] Extract channels only from channels tab (#27266) ++ [cspan] Extract info from jwplayer data (#3672, #3734, #10638, #13030, + #18806, #23148, #24461, #26171, #26800, #27263) +* [cspan] Pass Referer header with format's video URL (#26032, #25729) +* [youtube] Improve age-gated videos extraction (#27259) ++ [mediaset] Add support for movie URLs (#27240) +* [yandexmusic] Refactor ++ [yandexmusic] Add support for artist's tracks and albums (#11887, #22284) +* [yandexmusic:track] Fix extraction (#26449, #26669, #26747, #26748, #26762) + + version 2020.11.29 Core From 2bb70750a9bb27aea34cab0e17327f549bae5734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Dec 2020 01:37:40 +0700 Subject: [PATCH 044/424] release 2020.12.02 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 95b6f674f..5f0e33eaf 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.29 + [debug] youtube-dl version 2020.12.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a3b53f5f3..631c60958 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 476fdc3b7..19d1b53e2 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.02** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 2ae00f75e..6d2f3e16d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.29 + [debug] youtube-dl version 2020.12.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index f9b7edb49..82009b505 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.02** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 02dc12645..65406eafd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.02 Extractors + [tva] Add support for qub.ca (#27235) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 702930e71..e5b470041 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -486,6 +486,7 @@ - **META** - **metacafe** - **Metacritic** + - **mewatch** - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** @@ -1132,6 +1133,8 @@ - **yahoo:japannews**: Yahoo! Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы + - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YandexVideo** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ebb8a200d..0b0a6c785 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.29' +__version__ = '2020.12.02' From c0820dd52ad2796b8ef159a0f6ab21ac28d4bf67 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 2 Dec 2020 00:41:00 +0100 Subject: [PATCH 045/424] [extractors] Add QubIE import --- youtube_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 048e72f60..eb5a75f30 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1221,7 +1221,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE -from .tva import TVAIE +from .tva import ( + TVAIE, + QubIE, +) from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, From 4ded9c0f0018f45034070092e4899668f58b4690 Mon Sep 17 00:00:00 2001 From: opusforlife2 <53176348+opusforlife2@users.noreply.github.com> Date: Wed, 2 Dec 2020 18:30:08 +0000 Subject: [PATCH 046/424] [pornhub] Add support for pornhub.org (#27276) Most ISPs block the other two TLDs through deep packet inspection --- youtube_dl/extractor/pornhub.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index a50b6260c..69a9797aa 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -53,7 +53,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -152,6 +152,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', + 'only_matching': True, }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, @@ -160,7 +163,7 @@ class PornHubIE(PornHubBaseIE): @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -422,7 +425,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -490,7 +493,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -605,7 +608,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { From 64554c12e121a5dc0ab3f715fbf2452e5969a722 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 2 Dec 2020 21:36:51 +0100 Subject: [PATCH 047/424] [tver] Add new extractor (closes #26662)(closes #27284) --- youtube_dl/extractor/extractors.py | 2 + youtube_dl/extractor/fujitv.py | 35 ++++++++++++++++ youtube_dl/extractor/tver.py | 67 ++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 youtube_dl/extractor/fujitv.py create mode 100644 youtube_dl/extractor/tver.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eb5a75f30..03aff1d52 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -393,6 +393,7 @@ from .frontendmasters import ( FrontendMastersLessonIE, FrontendMastersCourseIE ) +from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .fusion import FusionIE @@ -1233,6 +1234,7 @@ from .tvc import ( TVCIE, TVCArticleIE, ) +from .tver import TVerIE from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py new file mode 100644 index 000000000..39685e075 --- /dev/null +++ b/youtube_dl/extractor/fujitv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FujiTVFODPlus7IE(InfoExtractor): + _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P[0-9a-z]+)' + _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _BITRATE_MAP = { + 300: (320, 180), + 800: (640, 360), + 1200: (1280, 720), + 2000: (1280, 720), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + for f in formats: + wh = self._BITRATE_MAP.get(f.get('tbr')) + if wh: + f.update({ + 'width': wh[0], + 'height': wh[1], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + } diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py new file mode 100644 index 000000000..c5299722d --- /dev/null +++ b/youtube_dl/extractor/tver.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P(?:corner|episode|feature)/(?Pf?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + info = { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + } + + if service == 'cx': + info.update({ + 'title': main.get('subtitle') or main['title'], + 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), + 'ie_key': 'FujiTVFODPlus7', + }) + else: + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + info.update({ + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + }) + + return info From 664dd8ba854238207fa1421a4f353606406dcded Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 2 Dec 2020 21:37:14 +0100 Subject: [PATCH 048/424] [extractor/common] improve Akamai HTTP format extraction - Allow m3u8 manifest without an additional audio format - Fix extraction for qualities starting with a number Solution provided by @nixxo based on: https://stackoverflow.com/a/5984688 --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 16aff885c..4463e06b3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2623,7 +2623,7 @@ class InfoExtractor(object): REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) - if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1): i = 0 http_formats = [] for f in formats: @@ -2632,7 +2632,7 @@ class InfoExtractor(object): http_f = f.copy() del http_f['manifest_url'] http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) http_f.update({ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), 'url': http_url, From ea89680aeae7c26af355ac23038e3a45f116649e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 2 Dec 2020 22:49:51 +0100 Subject: [PATCH 049/424] [tver] correct episode_number key --- youtube_dl/extractor/tver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index c5299722d..931d4d650 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -43,7 +43,7 @@ class TVerIE(InfoExtractor): info = { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), - 'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), } if service == 'cx': From da4304609dd627b577d7b69c0a05851497fa384c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 3 Dec 2020 00:33:55 +0100 Subject: [PATCH 050/424] [extractor/commons] improve Akamai HTTP formats extraction --- youtube_dl/extractor/common.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4463e06b3..a21afefeb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2614,20 +2614,20 @@ class InfoExtractor(object): hls_host = hosts.get('hls') if hls_host: m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) http_host = hosts.get('http') - if http_host and 'hdnea=' not in manifest_url: - REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + if http_host and m3u8_formats and 'hdnea=' not in m3u8_url: + REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) - if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1): + if len(m3u8_formats) in (qualities_length, qualities_length + 1): i = 0 - http_formats = [] - for f in formats: - if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for f in m3u8_formats: + if f['vcodec'] != 'none': for protocol in ('http', 'https'): http_f = f.copy() del http_f['manifest_url'] @@ -2638,9 +2638,8 @@ class InfoExtractor(object): 'url': http_url, 'protocol': protocol, }) - http_formats.append(http_f) + formats.append(http_f) i += 1 - formats.extend(http_formats) return formats From 00254473691185e567006da00524eedbee9597aa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 3 Dec 2020 00:35:47 +0100 Subject: [PATCH 051/424] [gamespot] Extract DASH and HTTP formats --- youtube_dl/extractor/gamespot.py | 110 +++++++------------------------ 1 file changed, 25 insertions(+), 85 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 4236a5ed8..7a1beae3c 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,16 +1,7 @@ from __future__ import unicode_literals -import re - from .once import OnceIE -from ..compat import ( - compat_urllib_parse_unquote, -) -from ..utils import ( - unescapeHTML, - url_basename, - dict_get, -) +from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -24,17 +15,16 @@ class GameSpotIE(OnceIE): 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', }, + 'skip': 'manifest URL give HTTP Error 404: Not Found', }, { 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'md5': '173ea87ad762cf5d3bf6163dceb255a6', 'info_dict': { 'id': 'gs-2300-6424837', 'ext': 'mp4', 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, }, { 'url': 'https://www.gamespot.com/videos/embed/6439218/', 'only_matching': True, @@ -49,90 +39,40 @@ class GameSpotIE(OnceIE): def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex( - r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = self._parse_json(unescapeHTML(data_video_json), page_id) + data_video = self._parse_json(self._html_search_regex( + r'data-video=(["\'])({.*?})\1', webpage, + 'video data', group=2), page_id) + title = compat_urllib_parse_unquote(data_video['title']) streams = data_video['videoStreams'] - - manifest_url = None formats = [] - f4m_url = streams.get('f4m_stream') - if f4m_url: - manifest_url = f4m_url - formats.extend(self._extract_f4m_formats( - f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) + + m3u8_url = streams.get('adaptive_stream') if m3u8_url: - manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) - if progressive_url and manifest_url: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if qualities_basename: - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) - http_url_basename = url_basename(progressive_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for f in m3u8_formats: + formats.append(f) + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': f['url'].replace('.m3u8', '.mp4'), + }) + formats.append(http_f) - onceux_json = self._search_regex( - r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) - if onceux_json: - onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') - if onceux_url: - formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), - http_formats_preference=-1)) + mpd_url = streams.get('adaptive_dash') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, page_id, mpd_id='dash', fatal=False)) - if not formats: - for quality in ['sd', 'hd']: - # It's actually a link to a flv file - flv_url = streams.get('f4m_{0}'.format(quality)) - if flv_url is not None: - formats.append({ - 'url': flv_url, - 'ext': 'flv', - 'format_id': quality, - }) self._sort_formats(formats) return { - 'id': data_video['guid'], + 'id': data_video.get('guid') or page_id, 'display_id': page_id, - 'title': compat_urllib_parse_unquote(data_video['title']), + 'title': title, 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), From beab2f88c9c83c2dba3d539f06133345542b7f03 Mon Sep 17 00:00:00 2001 From: Matthew Rayermann Date: Wed, 2 Dec 2020 23:47:56 -0800 Subject: [PATCH 052/424] [nhk] Add audio clip test to NHK extractor (#27269) --- youtube_dl/extractor/nhk.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index de6a707c4..6a61a47d2 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -10,7 +10,7 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -21,6 +21,19 @@ class NhkVodIE(InfoExtractor): 'timestamp': 1565965194, 'upload_date': '20190816', }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, From 1c78cb118c306541984c7f5e2c40fd4e3ed1c94c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 3 Dec 2020 23:25:36 +0700 Subject: [PATCH 053/424] [travis] Disable download jobs Until youtube-dl OSS approval by Travis or moving to GitHub Actions [ci skip] --- .travis.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 51afd469a..d828d027d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,29 +12,29 @@ python: dist: trusty env: - YTDL_TEST_SET=core - - YTDL_TEST_SET=download +# - YTDL_TEST_SET=download jobs: include: - python: 3.7 dist: xenial env: YTDL_TEST_SET=core - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.7 +# dist: xenial +# env: YTDL_TEST_SET=download - python: 3.8 dist: xenial env: YTDL_TEST_SET=core - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.8 +# dist: xenial +# env: YTDL_TEST_SET=download - python: 3.8-dev dist: xenial env: YTDL_TEST_SET=core - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.8-dev +# dist: xenial +# env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download +# - env: JYTHON=true; YTDL_TEST_SET=download - name: flake8 python: 3.8 dist: xenial @@ -42,9 +42,9 @@ jobs: script: flake8 . fast_finish: true allow_failures: - - env: YTDL_TEST_SET=download +# - env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download +# - env: JYTHON=true; YTDL_TEST_SET=download before_install: - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi script: ./devscripts/run_tests.sh From 2e4726423544041887641df18e2f4c4c5e3f4f6d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 4 Dec 2020 14:16:03 +0100 Subject: [PATCH 054/424] [zdf] extract webm formats(closes #26659) --- youtube_dl/extractor/zdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 656864b2e..5ed2946c2 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -40,7 +40,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -119,7 +119,7 @@ class ZDFIE(ZDFBaseIE): if not ptmd_path: ptmd_path = t[ 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') + '{playerId}', 'ngplayer_2_4') ptmd = self._call_api( urljoin(url, ptmd_path), player, url, video_id, 'metadata') From c5636e9bcaf37f2190b1e7d73785fde399001c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Dec 2020 23:38:42 +0700 Subject: [PATCH 055/424] [nrktv] Relax _VALID_URL (closes #27299, closes #26185) --- youtube_dl/extractor/nrk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 4a395546f..0c4b126ed 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -146,7 +146,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ + (?:serie(?:/[^/]+){1,}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? @@ -275,6 +275,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, }] _api_host = None From e91df0c5501bd5df9987e310a37df51129ee1cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Dec 2020 23:56:50 +0700 Subject: [PATCH 056/424] [pornhub] Handle HTTP errors gracefully (closes #26414) --- youtube_dl/extractor/pornhub.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 69a9797aa..9ad92a8ec 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -31,7 +31,12 @@ class PornHubBaseIE(InfoExtractor): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r']+\bonload=["\']go\(\)', From 5e95e18ce949e759e8e26de76c386c44e50b2abd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 4 Dec 2020 18:04:38 +0100 Subject: [PATCH 057/424] [nrk] improve format extraction and geo-restriction detection (closes #24221) --- youtube_dl/extractor/nrk.py | 43 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 0c4b126ed..19d820f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -24,6 +24,11 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + def _extract_nrk_formats(self, asset_url, video_id): + return self._extract_m3u8_formats( + re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -94,9 +99,7 @@ class NRKIE(NRKBaseIE): if not format_url: continue if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) data = self._download_json( @@ -298,6 +301,7 @@ class NRKTVIE(NRKBaseIE): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + urls = [] entries = [] conviva = data.get('convivaStatistics') or {} @@ -314,19 +318,13 @@ class NRKTVIE(NRKBaseIE): else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) for num, asset in enumerate(media_assets, 1): asset_url = asset.get('url') - if not asset_url: + if not asset_url or asset_url in urls: continue - formats = self._extract_akamai_formats(asset_url, video_id) + formats = extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - entry_id, entry_title = video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -346,16 +344,17 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] + if media_url and media_url not in urls: + formats = extract_nrk_formats(media_url, video_id) + if formats: + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + }] if not entries: MESSAGES = { @@ -366,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) From 957c65b9eaeaf181d76c6aa815a8f946c090129b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 4 Dec 2020 18:05:27 +0100 Subject: [PATCH 058/424] [nrk] fix typo --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 19d820f61..0f69579c5 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -365,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) From fe0c28f9569bb99f3bddd59b7f4b2be919368f22 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 4 Dec 2020 18:08:08 +0100 Subject: [PATCH 059/424] [nrk] fix call to moved method --- youtube_dl/extractor/nrk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 0f69579c5..8595f55b1 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -320,7 +320,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue - formats = extract_nrk_formats(asset_url, video_id) + formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) @@ -345,7 +345,7 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') if media_url and media_url not in urls: - formats = extract_nrk_formats(media_url, video_id) + formats = self._extract_nrk_formats(media_url, video_id) if formats: self._sort_formats(formats) duration = parse_duration(data.get('duration')) From 41c92b8d02c33d6018991f9e5d3a9b616e325355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 01:19:37 +0700 Subject: [PATCH 060/424] [nrktv:season] Improve extraction --- youtube_dl/extractor/nrk.py | 99 +++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 8595f55b1..4d5f4c5ba 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -17,6 +18,7 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + urljoin, url_or_none, ) @@ -547,44 +549,109 @@ class NRKTVSerieBaseIE(InfoExtractor): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue + if not re.match(NRKTVIE._EPISODE_RE, nrk_id): + continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }] @classmethod def suitable(cls, url): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + # Extract entries + for asset_key in self._ASSETS_KEYS: + entries = try_get( + embedded, + (lambda x: x[asset_key]['_embedded'][asset_key], + lambda x: x[asset_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + for asset_key in self._ASSETS_KEYS: + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][asset_key]['_links']['next']['href']), + compat_str)) + if next_url: + break + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading season JSON page %d' % page_num, fatal=False) + if not data: + break + def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie = mobj.group('serie') + season_id = mobj.group('id') + display_id = '%s/%s' % (serie, season_id) - webpage = self._download_webpage(url, display_id) - - series = self._extract_series(webpage, display_id) - - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) - - title = try_get(season, lambda x: x['titles']['title'], compat_str) + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' + % (domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): From 02b04785ee22036e53e7883ba3217d17cc181916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 02:21:07 +0700 Subject: [PATCH 061/424] [nrktv:series] Improve extraction --- youtube_dl/extractor/nrk.py | 138 +++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 56 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 4d5f4c5ba..7cfbe7856 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -558,6 +558,46 @@ class NRKTVSerieBaseIE(InfoExtractor): 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str)) + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' @@ -603,41 +643,6 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) - _ASSETS_KEYS = ('episodes', 'instalments',) - - def _entries(self, data, display_id): - for page_num in itertools.count(1): - embedded = data.get('_embedded') - if not isinstance(embedded, dict): - break - # Extract entries - for asset_key in self._ASSETS_KEYS: - entries = try_get( - embedded, - (lambda x: x[asset_key]['_embedded'][asset_key], - lambda x: x[asset_key]), - list) - for e in self._extract_entries(entries): - yield e - # Find next URL - for asset_key in self._ASSETS_KEYS: - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][asset_key]['_links']['next']['href']), - compat_str)) - if next_url: - break - if not next_url: - break - data = self._download_json( - next_url, display_id, - 'Downloading season JSON page %d' % page_num, fatal=False) - if not data: - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain = mobj.group('domain') @@ -648,6 +653,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): data = self._download_json( 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( self._entries(data, display_id), @@ -655,26 +661,9 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' + _VALID_URL = r'https?://(?Ptv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/blank', - 'info_dict': { - 'id': 'blank', - 'title': 'Blank', - 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', - }, - 'playlist_mincount': 30, - }, { - # new layout, seasons - 'url': 'https://tv.nrk.no/serie/backstage', - 'info_dict': { - 'id': 'backstage', - 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', - }, - 'playlist_mincount': 60, - }, { # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { @@ -682,7 +671,30 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 10, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/blank', + 'info_dict': { + 'id': 'blank', + 'title': 'Blank', + 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', + }, + 'playlist_mincount': 30, + 'expected_warnings': ['HTTP Error 404: Not Found'], + }, { + # new layout, seasons + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', + }, + 'playlist_mincount': 60, + 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', @@ -711,16 +723,30 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + series_id = mobj.group('id') + + title = description = None webpage = self._download_webpage(url, series_id) - # New layout (e.g. https://tv.nrk.no/serie/backstage) series = self._extract_series(webpage, series_id, fatal=False) if series: title = try_get(series, lambda x: x['titles']['title'], compat_str) description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) + + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' + % (domain, series_id), series_id, query={'pageSize': 50}, + fatal=False) + if data: + return self.playlist_result( + self._entries(data, series_id), series_id, title, description) + + # New layout (e.g. https://tv.nrk.no/serie/backstage) + if series: entries = [] entries.extend(self._extract_seasons(series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) From 5d769860c32cd3c5a4719a46a8c11863d3ce0bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 02:43:33 +0700 Subject: [PATCH 062/424] [nrktv:series] Improve extraction (closes #21926) --- youtube_dl/extractor/nrk.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7cfbe7856..4a82b11fd 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -521,7 +521,8 @@ class NRKTVSerieBaseIE(InfoExtractor): config = self._parse_json( self._search_regex( (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*', + r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False, transform_source=js_to_json) if not config: @@ -531,12 +532,26 @@ class NRKTVSerieBaseIE(InfoExtractor): (lambda x: x['initialState']['series'], lambda x: x['series']), dict) - def _extract_seasons(self, seasons): + def _extract_seasons(self, domain, series_id, seasons): + if isinstance(seasons, dict): + seasons = seasons.get('seasons') if not isinstance(seasons, list): return [] entries = [] for season in seasons: - entries.extend(self._extract_episodes(season)) + if not isinstance(season, dict): + continue + episodes = self._extract_episodes(season) + if episodes: + entries.extend(episodes) + continue + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) return entries def _extract_episodes(self, season): @@ -713,6 +728,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + }, + 'playlist_mincount': 8, + 'expected_warnings': ['HTTP Error 404: Not Found'], }] @classmethod @@ -748,7 +770,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): # New layout (e.g. https://tv.nrk.no/serie/backstage) if series: entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) From e8c0af04b79f84ab3b697498423f8dc27caca24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 02:46:28 +0700 Subject: [PATCH 063/424] [nrktv] Relax _VALID_URL --- youtube_dl/extractor/nrk.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 4a82b11fd..08e331893 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,14 +148,7 @@ class NRKIE(NRKBaseIE): class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', From c67b33888f4b8ea21189a966f087f795c6ea52bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 03:00:10 +0700 Subject: [PATCH 064/424] [nrk] Improve error extraction --- youtube_dl/extractor/nrk.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 08e331893..f5e964753 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -31,6 +31,22 @@ class NRKBaseIE(InfoExtractor): re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -89,6 +105,9 @@ class NRKIE(NRKBaseIE): 'http://psapi.nrk.no/playback/manifest/%s' % video_id, video_id, 'Downloading manifest JSON') + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + playable = manifest['playable'] formats = [] @@ -352,22 +371,7 @@ class NRKTVIE(NRKBaseIE): }] if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) + self._raise_error(data) series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') From 58f7ada235801aae3fdcb959df04a4bfdb790415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 03:24:56 +0700 Subject: [PATCH 065/424] [teachable:course] Improve extraction (closes #24507, closes #27286) --- youtube_dl/extractor/teachable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index a75369dbe..6f264bddc 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE): r'(?s)(?P
  • ]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?
  • )', webpage): li = mobj.group('li') - if 'fa-youtube-play' not in li: + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): continue lecture_url = self._search_regex( r']+href=(["\'])(?P(?:(?!\1).)+)\1', li, From e7f93fbd85549896102fe4e870ec840046f9025a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 03:31:07 +0700 Subject: [PATCH 066/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 65406eafd..aaca9748b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Core +* [extractor/common] Improve Akamai HTTP format extraction + * Allow m3u8 manifest without an additional audio format + * Fix extraction for qualities starting with a number + +Extractors +* [teachable:course] Improve extraction (#24507, #27286) +* [nrk] Improve error extraction +* [nrktv:series] Improve extraction (#21926) +* [nrktv:season] Improve extraction +* [nrk] Improve format extraction and geo-restriction detection (#24221) +* [pornhub] Handle HTTP errors gracefully (#26414) +* [nrktv] Relax URL regular expression (#27299, #26185) ++ [zdf] Extract webm formats (#26659) ++ [gamespot] Extract DASH and HTTP formats ++ [tver] Add support for tver.jp (#26662, #27284) ++ [pornhub] Add support for pornhub.org (#27276) + + version 2020.12.02 Extractors From e1476196692131fa7218f9cf91571b40bb4facb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 03:40:00 +0700 Subject: [PATCH 067/424] release 2020.12.05 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 5f0e33eaf..4cbf763be 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.02** +- [ ] I've verified that I'm running youtube-dl version **2020.12.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.02 + [debug] youtube-dl version 2020.12.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 631c60958..5e20c5505 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.02** +- [ ] I've verified that I'm running youtube-dl version **2020.12.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 19d1b53e2..23c309dc0 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.02** +- [ ] I've verified that I'm running youtube-dl version **2020.12.05** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 6d2f3e16d..43363cfa0 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.02** +- [ ] I've verified that I'm running youtube-dl version **2020.12.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.02 + [debug] youtube-dl version 2020.12.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 82009b505..8b03ed273 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.02** +- [ ] I've verified that I'm running youtube-dl version **2020.12.05** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index aaca9748b..d9b8336ab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.05 Core * [extractor/common] Improve Akamai HTTP format extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e5b470041..651a754d9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -308,6 +308,7 @@ - **FrontendMasters** - **FrontendMastersCourse** - **FrontendMastersLesson** + - **FujiTVFODPlus7** - **Funimation** - **Funk** - **Fusion** @@ -710,6 +711,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV** + - **Qub** - **Quickline** - **QuicklineLive** - **R7** @@ -952,6 +954,7 @@ - **TVANouvellesArticle** - **TVC** - **TVCArticle** + - **TVer** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0b0a6c785..9e876731b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.02' +__version__ = '2020.12.05' From 51579d87e4435d5e661329dcefd0422d263bb56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 04:41:33 +0700 Subject: [PATCH 068/424] [peertube] Recognize audio-only formats (closes #27295) --- youtube_dl/extractor/peertube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 48fb95416..2d00791d2 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -541,6 +541,8 @@ class PeerTubeIE(InfoExtractor): 'format_id': format_id, 'filesize': file_size, }) + if format_id == '0p': + f['vcodec'] = 'none' formats.append(f) self._sort_formats(formats) From 13ec444a982fa0c3b2e61666792069d35111ff31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Dec 2020 04:45:30 +0700 Subject: [PATCH 069/424] [peertube] Extract fps --- youtube_dl/extractor/peertube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 2d00791d2..c39d12728 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -543,6 +543,8 @@ class PeerTubeIE(InfoExtractor): }) if format_id == '0p': f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) From 1b26bfd425eaf5f7e314a6ebc810d62e8608ce5a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 5 Dec 2020 00:32:59 +0100 Subject: [PATCH 070/424] [nrk] improve extraction - improve format extraction for old akamai formats - update some of the tests - add is_live value to entry info dict - request instalments only when their available - fix skole extraction --- youtube_dl/extractor/nrk.py | 252 ++++++++++++------------------------ 1 file changed, 81 insertions(+), 171 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f5e964753..8b31a6ad2 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -13,8 +13,6 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - js_to_json, - NO_DEFAULT, parse_age_limit, parse_duration, try_get, @@ -24,9 +22,10 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['NO'] - def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats( + re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) return self._extract_m3u8_formats( re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) @@ -47,6 +46,12 @@ class NRKBaseIE(InfoExtractor): message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -64,7 +69,7 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', 'info_dict': { 'id': '150533', 'ext': 'mp4', @@ -78,7 +83,7 @@ class NRKIE(NRKBaseIE): # MD5 is unstable 'info_dict': { 'id': '154915', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, @@ -101,9 +106,9 @@ class NRKIE(NRKBaseIE): }] def _extract_from_playback(self, video_id): - manifest = self._download_json( - 'http://psapi.nrk.no/playback/manifest/%s' % video_id, - video_id, 'Downloading manifest JSON') + path_templ = 'playback/%s/' + video_id + call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) + manifest = call_playback_api('manifest') if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -123,9 +128,7 @@ class NRKIE(NRKBaseIE): formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) - data = self._download_json( - 'http://psapi.nrk.no/playback/metadata/%s' % video_id, - video_id, 'Downloading metadata JSON') + data = call_playback_api('metadata') preplay = data['preplay'] titles = preplay['titles'] @@ -171,18 +174,18 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { 'id': 'MDDP12000117AA', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, + 'duration': 2223.44, 'age_limit': 6, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', @@ -200,7 +203,7 @@ class NRKTVIE(NRKBaseIE): 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, + 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': '24.05.2014', }, @@ -223,39 +226,13 @@ class NRKTVIE(NRKBaseIE): 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], 'info_dict': { - 'id': 'MSPO40010515', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'expected_warnings': ['Video is geo restricted'], + 'skip': 'Video is geo restricted', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -286,6 +263,7 @@ class NRKTVIE(NRKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, @@ -354,6 +332,7 @@ class NRKTVIE(NRKBaseIE): 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'is_live': live, }) if not entries: @@ -368,6 +347,7 @@ class NRKTVIE(NRKBaseIE): 'title': make_title(title), 'duration': duration, 'formats': formats, + 'is_live': live, }] if not entries: @@ -513,49 +493,7 @@ class NRKTVEpisodeIE(InfoExtractor): return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*', - r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, domain, series_id, seasons): - if isinstance(seasons, dict): - seasons = seasons.get('seasons') - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - if not isinstance(season, dict): - continue - episodes = self._extract_episodes(season) - if episodes: - entries.extend(episodes) - continue - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): - entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), - video_title=season.get('title'))) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] @@ -579,7 +517,7 @@ class NRKTVSerieBaseIE(InfoExtractor): def _entries(self, data, display_id): for page_num in itertools.count(1): - embedded = data.get('_embedded') + embedded = data.get('_embedded') or data if not isinstance(embedded, dict): break assets_key = self._extract_assets_key(embedded) @@ -594,18 +532,16 @@ class NRKTVSerieBaseIE(InfoExtractor): for e in self._extract_entries(entries): yield e # Find next URL - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][assets_key]['_links']['next']['href']), - compat_str)) - if not next_url: + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: break - data = self._download_json( - next_url, display_id, - 'Downloading %s JSON page %d' % (assets_key, page_num), + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), fatal=False) if not data: break @@ -656,15 +592,12 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - serie = mobj.group('serie') - season_id = mobj.group('id') + domain, serie, season_id = re.match(self._VALID_URL, url).groups() display_id = '%s/%s' % (serie, season_id) - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' - % (domain, serie, season_id), display_id, query={'pageSize': 50}) + data = self._call_api( + '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( @@ -673,8 +606,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?Ptv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -696,7 +628,6 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', }, 'playlist_mincount': 30, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', @@ -706,14 +637,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -729,9 +659,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', 'info_dict': { 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', }, 'playlist_mincount': 8, - 'expected_warnings': ['HTTP Error 404: Not Found'], + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, }] @classmethod @@ -742,57 +676,39 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - series_id = mobj.group('id') + site, series_id = re.match(self._VALID_URL, url).groups() + domain = 'radio' if site == 'radio.nrk' else 'tv' - title = description = None + series = self._call_api( + '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - webpage = self._download_webpage(url, series_id) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' - % (domain, series_id), series_id, query={'pageSize': 50}, - fatal=False) - if data: - return self.playlist_result( - self._entries(data, series_id), series_id, title, description) - - # New layout (e.g. https://tv.nrk.no/serie/backstage) - if series: - entries = [] - entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) - - # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -896,14 +812,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r']+type=["\']application/json["\'][^>]*>({.+?})', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) From 4c93b2fd155cc15588017c14c033bcea1a1ff931 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 5 Dec 2020 09:13:42 +0100 Subject: [PATCH 071/424] [nrk] improve format extraction --- youtube_dl/extractor/nrk.py | 40 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 8b31a6ad2..289a0a3a4 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor @@ -22,13 +23,26 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' + def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): return self._extract_akamai_formats( re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) - return self._extract_m3u8_formats( - re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), - video_id, 'mp4', 'm3u8_native', fatal=False) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats def _raise_error(self, data): MESSAGES = { @@ -107,8 +121,10 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id - call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) - manifest = call_playback_api('manifest') + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -195,7 +211,6 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -214,15 +229,15 @@ class NRKTVIE(NRKBaseIE): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', @@ -232,7 +247,7 @@ class NRKTVIE(NRKBaseIE): 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'skip': 'Video is geo restricted', + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -312,6 +327,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue + urls.append(asset_url) formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue From 6bf95b15ee311d673e63ed5bc98d0b74040f24f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 5 Dec 2020 15:35:29 +0100 Subject: [PATCH 072/424] [nrk] reduce the number of instalments requests --- youtube_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 289a0a3a4..24993b1c8 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -121,6 +121,7 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id + def call_playback_api(item, query=None): return self._call_api(path_templ % item, video_id, item, query=query) # known values for preferredCdn: akamai, iponly, minicdn and telenor @@ -696,7 +697,8 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): domain = 'radio' if site == 'radio.nrk' else 'tv' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + '%s/catalog/series/%s' % (domain, series_id), + series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], From 082da364166b5e33e1aa3ff643a177c0456b43e6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 5 Dec 2020 16:24:49 +0100 Subject: [PATCH 073/424] [nrk] reduce requests for Radio series --- youtube_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 24993b1c8..fdf2d7407 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -694,11 +694,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): def _real_extract(self, url): site, series_id = re.match(self._VALID_URL, url).groups() - domain = 'radio' if site == 'radio.nrk' else 'tv' + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( '%s/catalog/series/%s' % (domain, series_id), - series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) + series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], From cc017e07ca1ce70740f45620f2bceb1b0ac25eb2 Mon Sep 17 00:00:00 2001 From: renalid Date: Sun, 6 Dec 2020 16:12:25 +0100 Subject: [PATCH 074/424] [generic] Extract RSS video description (#27177) --- youtube_dl/extractor/generic.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f10f11244..8ed2789d0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -198,11 +198,19 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'description': 're:.*her unique approach to storytelling.*', + 'upload_date': '20201204', + }, + }], }, # RSS feed with enclosures and unsupported link URLs { @@ -2199,6 +2207,7 @@ class GenericIE(InfoExtractor): '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, + 'description': xpath_text(it, 'description', default=None), }) return { From 5e822c252627905036b164cdbc1220dc7b1f5d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Dec 2020 22:30:30 +0700 Subject: [PATCH 075/424] [generic] Extract RSS video timestamp --- youtube_dl/extractor/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8ed2789d0..d0e0d6919 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -30,6 +30,7 @@ from ..utils import ( smuggle_url, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, UnsupportedError, xpath_text, @@ -2208,6 +2209,8 @@ class GenericIE(InfoExtractor): 'url': next_url, 'title': it.find('title').text, 'description': xpath_text(it, 'description', default=None), + 'timestamp': unified_timestamp( + xpath_text(it, 'pubDate', default=None)), }) return { From f2c704e1126d614a7cd3283ed94a2226587d7a33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Dec 2020 23:08:03 +0700 Subject: [PATCH 076/424] [generic] Extract RSS video itunes metadata --- youtube_dl/extractor/generic.py | 36 ++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d0e0d6919..d2ba07839 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,12 +20,14 @@ from ..utils import ( ExtractorError, float_or_none, HEADRequest, + int_or_none, is_html, js_to_json, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, + parse_duration, sanitized_Request, smuggle_url, unescapeHTML, @@ -33,7 +35,9 @@ from ..utils import ( unified_timestamp, unsmuggle_url, UnsupportedError, + url_or_none, xpath_text, + xpath_with_ns, ) from .commonprotocols import RtmpIE from .brightcove import ( @@ -206,10 +210,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'mov', - 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', - 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335', + 'title': 're:MSNBC Rachel Maddow', 'description': 're:.*her unique approach to storytelling.*', - 'upload_date': '20201204', + 'timestamp': int, + 'upload_date': compat_str, + 'duration': float, }, }], }, @@ -2189,6 +2195,10 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + NS_MAP = { + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + } + entries = [] for it in doc.findall('./channel/item'): next_url = None @@ -2204,6 +2214,20 @@ class GenericIE(InfoExtractor): if not next_url: continue + def itunes(key): + return xpath_text( + it, xpath_with_ns('./itunes:%s' % key, NS_MAP), + default=None) + + duration = itunes('duration') + explicit = itunes('explicit') + if explicit == 'true': + age_limit = 18 + elif explicit == 'false': + age_limit = 0 + else: + age_limit = None + entries.append({ '_type': 'url_transparent', 'url': next_url, @@ -2211,6 +2235,12 @@ class GenericIE(InfoExtractor): 'description': xpath_text(it, 'description', default=None), 'timestamp': unified_timestamp( xpath_text(it, 'pubDate', default=None)), + 'duration': int_or_none(duration) or parse_duration(duration), + 'thumbnail': url_or_none(itunes('image')), + 'episode': itunes('title'), + 'episode_number': int_or_none(itunes('episode')), + 'season_number': int_or_none(itunes('season')), + 'age_limit': age_limit, }) return { From 6ad0d8781e2b156a4599abcebaa0b91b7e3131f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 00:45:16 +0700 Subject: [PATCH 077/424] [extractor/common] Fix media type extraction for HTML5 media tags in start/end form --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a21afefeb..864596e66 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2521,8 +2521,8 @@ class InfoExtractor(object): # Allowing more characters may end up in significant slow down (see # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)', webpage)) - for media_tag, media_type, media_content in media_tags: + r'(?s)(<(?P(?:amp-)?(video|audio))(?:\s+[^>]*)?>)(.*?)', webpage)) + for media_tag, _, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, From 06bf2ac20f2fb64df9f6e4cd1dba267b25effd22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 00:55:49 +0700 Subject: [PATCH 078/424] [extractor/common] Eliminate media tag name regex duplication --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 864596e66..877873ebd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2513,15 +2513,16 @@ class InfoExtractor(object): # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) + _MEDIA_TAG_NAME_RE = r'(?:amp-)?(video|audio)' media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] + in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P(?:amp-)?(video|audio))(?:\s+[^>]*)?>)(.*?)', webpage)) + r'(?s)(<(?P%s)(?:\s+[^>]*)?>)(.*?)' % _MEDIA_TAG_NAME_RE, webpage)) for media_tag, _, media_type, media_content in media_tags: media_info = { 'formats': [], From 91dd25fe1e18aa3f617005799a8f5018a551c7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 00:59:25 +0700 Subject: [PATCH 079/424] [extractor/common] Add support for dl8-* media tags (closes #27283) --- youtube_dl/extractor/common.py | 3 ++- youtube_dl/extractor/generic.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 877873ebd..dd07a1cae 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2513,7 +2513,8 @@ class InfoExtractor(object): # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) - _MEDIA_TAG_NAME_RE = r'(?:amp-)?(video|audio)' + # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' media_tags = [(media_tag, media_type, '') for media_tag, media_type in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d2ba07839..85dc1d02d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2466,7 +2466,9 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) + # FIXME: unescaping the whole page may break URLs, commenting out for now. + # There probably should be a second run of generic extractor on unescaped webpage. + # webpage = compat_urllib_parse_unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294 From dccf4932e15392a35a44c24a2732cc0ea6deada4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 01:15:30 +0700 Subject: [PATCH 080/424] [extractor/common] Extract timestamp from Last-Modified header --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 85dc1d02d..055ca838f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2360,7 +2360,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) } # Check for direct link to a video From 2bf0634d16b5732548652093d77dbd1aa76e8189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 01:48:10 +0700 Subject: [PATCH 081/424] [youtube] Improve youtu.be extraction in non-existing playlists (closes #27324) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/youtube.py | 55 +++++++++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 03aff1d52..575b4b36f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1520,6 +1520,7 @@ from .youtube import ( YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, + YoutubeYtBeIE, YoutubeYtUserIE, YoutubeWatchLaterIE, ) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b796f58b2..b8219a0ad 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3139,8 +3139,7 @@ class YoutubePlaylistIE(InfoExtractor): (?: (?: youtube(?:kids)?\.com| - invidio\.us| - youtu\.be + invidio\.us ) /.*?\?.*?\blist= )? @@ -3185,6 +3184,32 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', } }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeTabIE.suitable(url) else super( + YoutubePlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} + return self.url_result( + update_url_query('https://www.youtube.com/playlist', qs), + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + + +class YoutubeYtBeIE(InfoExtractor): + _VALID_URL = r'https?://youtu\.be/(?P[0-9A-Za-z_-]{11})/*?.*?\blist=(?P%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + _TESTS = [{ 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { 'id': 'yeWKywCrFtk', @@ -3207,28 +3232,18 @@ class YoutubePlaylistIE(InfoExtractor): }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) - def _real_extract(self, url): - playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if not qs: - qs = {'list': playlist_id} + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + playlist_id = mobj.group('playlist_id') return self.url_result( - update_url_query('https://www.youtube.com/playlist', qs), - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + update_url_query('https://www.youtube.com/watch', { + 'v': video_id, + 'list': playlist_id, + 'feature': 'youtu.be', + }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) class YoutubeYtUserIE(InfoExtractor): From fa604d9083805aa2ff398f31d56dc3508b77a37c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 6 Dec 2020 19:38:27 +0100 Subject: [PATCH 082/424] [aenetworks] Fix extraction - Fix Fastly format extraction - Add support for play and watch subdomains - Extract series metadata closes #23363 closes #23390 closes #26795 closes #26985 --- youtube_dl/extractor/aenetworks.py | 277 +++++++++++++++++------------ youtube_dl/extractor/extractors.py | 2 + 2 files changed, 165 insertions(+), 114 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 611b948f5..3d0cf1208 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -5,20 +5,30 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +41,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -61,20 +71,13 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P[^/]+(?:/[^/]+){0,2})| - movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +94,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', - 'info_dict': { - 'id': '71889446852', - }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', - }, - 'playlist_mincount': 2, + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,80 +121,152 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', - 'only_matching': True }, { 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P[^']+)'", - r'data-media-url=(?P(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') + domain, canonical = re.match(self._VALID_URL, url).groups() + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + canonical, query={'filter[canonical]': '/' + canonical})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) auth = None if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) return info +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SH012427480000', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 168, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' @@ -204,6 +280,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +289,8 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 575b4b36f..833d5d6aa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -30,6 +30,8 @@ from .adobetv import ( from .adultswim import AdultSwimIE from .aenetworks import ( AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, HistoryTopicIE, ) from .afreecatv import AfreecaTVIE From 791b743765b1154cff099c543a6723f2c40b822b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 02:01:47 +0700 Subject: [PATCH 083/424] [extractor/generic] Remove unused import --- youtube_dl/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 055ca838f..9f79e0ba4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -31,7 +31,6 @@ from ..utils import ( sanitized_Request, smuggle_url, unescapeHTML, - unified_strdate, unified_timestamp, unsmuggle_url, UnsupportedError, From d9482c0083e1a6ad6602f063bab46edfa7514338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 02:01:53 +0700 Subject: [PATCH 084/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ChangeLog b/ChangeLog index d9b8336ab..eeb7cbb88 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +version + +Core +* [extractor/common] Extract timestamp from Last-Modified header ++ [extractor/common] Add support for dl8-* media tags (#27283) +* [extractor/common] Fix media type extraction for HTML5 media tags + in start/end form + +Extractors +* [aenetworks] Fix extraction (#23363, #23390, #26795, #26985) + * Fix Fastly format extraction + + Add support for play and watch subdomains + + Extract series metadata +* [youtube] Improve youtu.be extraction in non-existing playlists (#27324) ++ [generic] Extract RSS video description, timestamp and itunes metadata + (#27177) +* [nrk] Reduce the number of instalments and episodes requests +* [nrk] Improve extraction + * Improve format extraction for old akamai formats + + Add is_live value to entry info dict + * Request instalments only when available + * Fix skole extraction ++ [peertube] Extract fps ++ [peertube] Recognize audio-only formats (#27295) + + version 2020.12.05 Core From 2717036489924fce092db593fb769d2f20179e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 02:03:34 +0700 Subject: [PATCH 085/424] release 2020.12.07 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 4cbf763be..f72f57d02 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.05** +- [ ] I've verified that I'm running youtube-dl version **2020.12.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.05 + [debug] youtube-dl version 2020.12.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5e20c5505..8160d257a 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.05** +- [ ] I've verified that I'm running youtube-dl version **2020.12.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 23c309dc0..f7b6ba9ea 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.05** +- [ ] I've verified that I'm running youtube-dl version **2020.12.07** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 43363cfa0..efeae549f 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.05** +- [ ] I've verified that I'm running youtube-dl version **2020.12.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.05 + [debug] youtube-dl version 2020.12.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 8b03ed273..0435a0ddf 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.05** +- [ ] I've verified that I'm running youtube-dl version **2020.12.07** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index eeb7cbb88..f3571e77b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.07 Core * [extractor/common] Extract timestamp from Last-Modified header diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 651a754d9..59e73ba59 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -35,6 +35,8 @@ - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault + - **aenetworks:collection** + - **aenetworks:show** - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -1164,6 +1166,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtBe** - **YoutubeYtUser** - **Zapiks** - **Zaq1** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9e876731b..39a685197 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.05' +__version__ = '2020.12.07' From 4f1ecca58d19be96b17893edb44308004c2b47fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 03:27:21 +0700 Subject: [PATCH 086/424] [slideslive] Add support for yoda service videos and extract subtitles (closes #27323) --- youtube_dl/extractor/slideslive.py | 55 +++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py index d9ea76831..cd70841a9 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/youtube_dl/extractor/slideslive.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + bool_or_none, + smuggle_url, + try_get, + url_or_none, +) class SlidesLiveIE(InfoExtractor): @@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor): 'description': 'Watch full version of this video at https://slideslive.com/38902413.', 'uploader': 'SlidesLive Videos - A', 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'timestamp': 1597615266, 'upload_date': '20170925', } + }, { + # video_service_name = yoda + 'url': 'https://slideslive.com/38935785', + 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', + 'info_dict': { + 'id': 'RMraDYN5ozA_', + 'ext': 'mp4', + 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + }, + 'params': { + 'format': 'bestvideo', + }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', @@ -39,18 +57,47 @@ class SlidesLiveIE(InfoExtractor): video_data = self._download_json( 'https://ben.slideslive.com/player/' + video_id, video_id) service_name = video_data['video_service_name'].lower() - assert service_name in ('url', 'vimeo', 'youtube') + assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = video_data['video_service_id'] + subtitles = {} + for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + webvtt_url = url_or_none(sub.get('webvtt_url')) + if not webvtt_url: + continue + lang = sub.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': webvtt_url, + }) info = { 'id': video_id, 'thumbnail': video_data.get('thumbnail'), - 'url': service_id, + 'is_live': bool_or_none(video_data.get('is_live')), + 'subtitles': subtitles, } - if service_name == 'url': + if service_name in ('url', 'yoda'): info['title'] = video_data['title'] + if service_name == 'url': + info['url'] = service_id + else: + formats = [] + _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + formats.extend(self._extract_m3u8_formats( + _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + info.update({ + 'id': service_id, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', + 'url': service_id, 'ie_key': service_name.capitalize(), 'title': video_data.get('title'), }) From 17b01228f84713df850912b554675bbfcbc62fb3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 6 Dec 2020 23:37:16 +0100 Subject: [PATCH 087/424] [americastestkitchen] Fix Extraction and add support for Cook's Country and Cook's Illustrated closes #17234 closes #27322 --- youtube_dl/extractor/americastestkitchen.py | 68 ++++++++------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 9c9d77ae1..e20f00fc3 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,33 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( clean_html, - int_or_none, - js_to_json, try_get, unified_strdate, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', 'timestamp': 1523664000, 'upload_date': '20180414', - 'release_date': '20180414', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { @@ -36,47 +36,31 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'release_date': unified_strdate(video.get('publishDate')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } From 6956db3606e7ffa638856f1816747f544c1b1a53 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 7 Dec 2020 15:12:54 +0100 Subject: [PATCH 088/424] [tvplay:home] Fix extraction(closes #21153) --- youtube_dl/extractor/tvplay.py | 90 +++++++++++++--------------------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3c2450dd0..0d858c025 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -12,11 +12,13 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + parse_duration, parse_iso8601, qualities, try_get, update_url_query, url_or_none, + urljoin, ) @@ -414,7 +416,7 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P\d+)' + _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P\d+)' _TESTS = [{ 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', 'info_dict': { @@ -433,80 +435,58 @@ class TVPlayHomeIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TVPlayIE.ie_key()], }, { 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', 'only_matching': True, }, { 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', 'only_matching': True, + }, { + 'url': 'https://play.tv3.lt/aferistai-10047125', + 'only_matching': True, + }, { + 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + asset = self._download_json( + urljoin(url, '/sb/public/asset/' + video_id), video_id) - video_id = self._search_regex( - r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') - - if len(video_id) < 8: - return self.url_result( - 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) - - m3u8_url = self._search_regex( - r'data-file\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'm3u8 url', group='url') + m3u8_url = asset['movie']['contentUrl'] + video_id = asset['assetId'] + asset_title = asset['title'] + title = asset_title['title'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - title = self._search_regex( - r'data-title\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'title', default=None, group='value') or self._html_search_meta( - 'title', webpage, default=None) or self._og_search_title( - webpage) + thumbnails = None + image_url = asset.get('imageUrl') + if image_url: + thumbnails = [{ + 'url': urljoin(url, image_url), + 'ext': 'jpg', + }] - description = self._html_search_meta( - 'description', webpage, - default=None) or self._og_search_description(webpage) - - thumbnail = self._search_regex( - r'data-image\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'thumbnail', default=None, group='url') or self._html_search_meta( - 'thumbnail', webpage, default=None) or self._og_search_thumbnail( - webpage) - - duration = int_or_none(self._search_regex( - r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', - fatal=False)) - - season = self._search_regex( - (r'data-series-title\s*=\s*(["\'])[^/]+/(?P(?:(?!\1).)+)\1', - r'\bseason\s*:\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, - 'season', default=None, group='value') - season_number = int_or_none(self._search_regex( - r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', - default=None)) - episode = self._search_regex( - (r'\bepisode\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', - r'data-subtitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, - 'episode', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', - default=None)) + metadata = asset.get('metadata') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), + 'thumbnails': thumbnails, + 'duration': parse_duration(asset_title.get('runTime')), + 'series': asset.get('tvSeriesTitle'), + 'season': asset.get('tvSeasonTitle'), + 'season_number': int_or_none(metadata.get('seasonNumber')), + 'episode': asset_title.get('titleBrief'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), 'formats': formats, } From 3ded7519857ef3fd1e3760e49e7ee26a92d44679 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 7 Dec 2020 15:27:13 +0100 Subject: [PATCH 089/424] [generic] comment a test covered now by AmericasTestKitchenIE --- youtube_dl/extractor/generic.py | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9f79e0ba4..9eba9614f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2117,23 +2117,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - { - # Zype embed - 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, - }, - 'add_ie': [ZypeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 'skip_download': True, + # }, + # }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', From 5bd7ad2e811c9bda2e889156bc261a76e9f086ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Dec 2020 01:12:00 +0700 Subject: [PATCH 090/424] [youtube:tab] Capture and output alerts (closes #27340) --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b8219a0ad..173f5a2d8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3060,6 +3060,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) return uploader + @staticmethod + def _extract_alert(data): + alerts = [] + for alert in try_get(data, lambda x: x['alerts'], list) or []: + if not isinstance(alert, dict): + continue + alert_text = try_get( + alert, lambda x: x['alertRenderer']['text'], dict) + if not alert_text: + continue + text = try_get( + alert_text, + (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']), + compat_str) + if text: + alerts.append(text) + return '\n'.join(alerts) + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( @@ -3127,6 +3145,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): compat_str) or video_id if video_id: return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + # Capture and output alerts + alert = self._extract_alert(data) + if alert: + raise ExtractorError(alert, expected=True) # Failed to recognize raise ExtractorError('Unable to recognize tab page') From 07333d00626fe47c4491f762f4d77b51cd69a1b9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 8 Dec 2020 00:31:47 +0100 Subject: [PATCH 091/424] [telequebec] Fix Extraction and Add Support for video.telequebec.tv closes #25733 closes #26883 closes #27339 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/telequebec.py | 160 ++++++++++++++++------------- 2 files changed, 88 insertions(+), 73 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 833d5d6aa..86589a059 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1152,6 +1152,7 @@ from .telequebec import ( TeleQuebecSquatIE, TeleQuebecEmissionIE, TeleQuebecLiveIE, + TeleQuebecVideoIE, ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index b4c485b9b..800d87b70 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -12,25 +12,16 @@ from ..utils import ( class TeleQuebecBaseIE(InfoExtractor): + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + @staticmethod - def _result(url, ie_key): + def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'): return { '_type': 'url_transparent', - 'url': smuggle_url(url, {'geo_countries': ['CA']}), - 'ie_key': ie_key, + 'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': ['CA']}), + 'ie_key': 'BrightcoveNew', } - @staticmethod - def _limelight_result(media_id): - return TeleQuebecBaseIE._result( - 'limelight:media:' + media_id, 'LimelightMedia') - - @staticmethod - def _brightcove_result(brightcove_id): - return TeleQuebecBaseIE._result( - 'http://players.brightcove.net/6150020952001/default_default/index.html?videoId=%s' - % brightcove_id, 'BrightcoveNew') - class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'''(?x) @@ -44,14 +35,18 @@ class TeleQuebecIE(TeleQuebecBaseIE): # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'id': '6155972771001', 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', - 'description': 'md5:067bc84bd6afecad85e69d1000730907', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'timestamp': 1589262469, + 'uploader_id': '6150020952001', + 'upload_date': '20200512', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', 'info_dict': { @@ -65,7 +60,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): }, 'params': { 'format': 'bestvideo', - 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], }, { @@ -79,25 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE): def _real_extract(self, url): media_id = self._match_id(url) - - media_data = self._download_json( - 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id, media_id)['media'] - - source_id = media_data['streamInfo']['sourceId'] - source = (try_get( - media_data, lambda x: x['streamInfo']['source'], - compat_str) or 'limelight').lower() - if source == 'brightcove': - info = self._brightcove_result(source_id) - else: - info = self._limelight_result(source_id) + source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove') + info = self._brightcove_result(source_id, '22gPKdt7f') + product = media.get('product') or {} + season = product.get('season') or {} info.update({ - 'title': media_data.get('title'), - 'description': try_get( - media_data, lambda x: x['descriptions'][0]['text'], compat_str), - 'duration': int_or_none( - media_data.get('durationInMilliseconds'), 1000), + 'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str), + 'series': try_get(season, lambda x: x['serie']['titre']), + 'season': season.get('name'), + 'season_number': int_or_none(season.get('seasonNo')), + 'episode': product.get('titre'), + 'episode_number': int_or_none(product.get('episodeNo')), }) return info @@ -148,7 +137,7 @@ class TeleQuebecSquatIE(InfoExtractor): } -class TeleQuebecEmissionIE(TeleQuebecBaseIE): +class TeleQuebecEmissionIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -160,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): _TESTS = [{ 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', 'info_dict': { - 'id': '66648a6aef914fe3badda25e81a4d50a', + 'id': '6154476028001', 'ext': 'mp4', - 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", - 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', - 'upload_date': '20171024', - 'timestamp': 1508862118, + 'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous tente?', + 'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f', + 'upload_date': '20200505', + 'timestamp': 1588713424, + 'uploader_id': '6150020952001', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', @@ -187,26 +177,26 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): webpage = self._download_webpage(url, display_id) media_id = self._search_regex( - r'mediaUID\s*:\s*["\'][Ll]imelight_(?P[a-z0-9]{32})', webpage, - 'limelight id') + r'mediaId\s*:\s*(?P\d+)', webpage, 'media id') - info = self._limelight_result(media_id) - info.update({ - 'title': self._og_search_title(webpage, default=None), - 'description': self._og_search_description(webpage, default=None), - }) - return info + return self.url_result( + 'http://zonevideo.telequebec.tv/media/' + media_id, + TeleQuebecIE.ie_key()) -class TeleQuebecLiveIE(InfoExtractor): +class TeleQuebecLiveIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?Pendirect)' _TEST = { 'url': 'http://zonevideo.telequebec.tv/endirect/', 'info_dict': { - 'id': 'endirect', + 'id': '6159095684001', 'ext': 'mp4', - 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'description': 'Canal principal de Télé-Québec', + 'uploader_id': '6150020952001', + 'timestamp': 1590439901, + 'upload_date': '20200525', }, 'params': { 'skip_download': True, @@ -214,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + return self._brightcove_result('6159095684001', 'skCsmi2Uw') - m3u8_url = None - webpage = self._download_webpage( - 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, - fatal=False) - if webpage: - m3u8_url = self._search_regex( - r'm3U8Url\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'm3u8 url', default=None, group='url') - if not m3u8_url: - m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title('Télé-Québec - En direct'), - 'is_live': True, - 'formats': formats, - } +class TeleQuebecVideoIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P\d+)' + _TESTS = [{ + 'url': 'https://video.telequebec.tv/player/31110/stream', + 'info_dict': { + 'id': '6202570652001', + 'ext': 'mp4', + 'title': 'Le coût du véhicule le plus vendu au Canada / Tous les frais liés à la procréation assistée', + 'description': 'md5:685a7e4c450ba777c60adb6e71e41526', + 'upload_date': '20201019', + 'timestamp': 1603115930, + 'uploader_id': '6101674910001', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://video.telequebec.tv/player-live/28527', + 'only_matching': True, + }] + + def _call_api(self, path, video_id): + return self._download_json( + 'http://beacon.playback.api.brightcove.com/telequebec/api/assets/' + path, + video_id, query={'device_layout': 'web', 'device_type': 'web'})['data'] + + def _real_extract(self, url): + asset_id = self._match_id(url) + asset = self._call_api(asset_id, asset_id)['asset'] + stream = self._call_api( + asset_id + '/streams/' + asset['streams'][0]['id'], asset_id)['stream'] + stream_url = stream['url'] + account_id = try_get( + stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001' + info = self._brightcove_result(stream_url, 'default', account_id) + info.update({ + 'description': asset.get('long_description') or asset.get('short_description'), + 'series': asset.get('series_original_name'), + 'season_number': int_or_none(asset.get('season_number')), + 'episode': asset.get('original_name'), + 'episode_number': int_or_none(asset.get('episode_number')), + }) + return info From e7eff914cd3ffd8bc2975100036fdac9829dd16d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 8 Dec 2020 10:52:52 +0100 Subject: [PATCH 092/424] [lbry] add support for short and embed URLs and fix channel metadata extraction --- youtube_dl/extractor/lbry.py | 44 +++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 6177297ab..4fd50a60d 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -11,12 +11,14 @@ from ..utils import ( int_or_none, mimetype2ext, try_get, + urljoin, ) class LBRYIE(InfoExtractor): IE_NAME = 'lbry.tv' - _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])' + _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P@[^:]+:{0}/[^:]+:{0}|[^:]+:{0}|\$/embed/[^/]+/{0})'.format(_CLAIM_ID_REGEX) _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -40,6 +42,11 @@ class LBRYIE(InfoExtractor): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'tags': list, + 'duration': 2570, + 'channel': 'The LBRY Foundation', + 'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212', + 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', @@ -47,6 +54,15 @@ class LBRYIE(InfoExtractor): }, { 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", 'only_matching': True, + }, { + 'url': 'https://lbry.tv/Episode-1:e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/$/embed/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/Episode-1:e7', + 'only_matching': True, }] def _call_api_proxy(self, method, display_id, params): @@ -59,22 +75,33 @@ class LBRYIE(InfoExtractor): }).encode())['result'] def _real_extract(self, url): - display_id = self._match_id(url).replace(':', '#') + display_id = self._match_id(url) + if display_id.startswith('$/embed/'): + display_id = display_id[8:].replace('/', ':') + else: + display_id = display_id.replace(':', '#') uri = 'lbry://' + display_id result = self._call_api_proxy( 'resolve', display_id, {'urls': [uri]})[uri] result_value = result['value'] if result_value.get('stream_type') not in ('video', 'audio'): raise ExtractorError('Unsupported URL', expected=True) + claim_id = result['claim_id'] + title = result_value['title'] streaming_url = self._call_api_proxy( - 'get', display_id, {'uri': uri})['streaming_url'] + 'get', claim_id, {'uri': uri})['streaming_url'] source = result_value.get('source') or {} media = result_value.get('video') or result_value.get('audio') or {} - signing_channel = result_value.get('signing_channel') or {} + signing_channel = result.get('signing_channel') or {} + channel_name = signing_channel.get('name') + channel_claim_id = signing_channel.get('claim_id') + channel_url = None + if channel_name and channel_claim_id: + channel_url = urljoin(url, '/%s:%s' % (channel_name, channel_claim_id)) return { - 'id': result['claim_id'], - 'title': result_value['title'], + 'id': claim_id, + 'title': title, 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), 'description': result_value.get('description'), 'license': result_value.get('license'), @@ -83,8 +110,9 @@ class LBRYIE(InfoExtractor): 'width': int_or_none(media.get('width')), 'height': int_or_none(media.get('height')), 'duration': int_or_none(media.get('duration')), - 'channel': signing_channel.get('name'), - 'channel_id': signing_channel.get('claim_id'), + 'channel': try_get(signing_channel, lambda x: x['value']['title']), + 'channel_id': channel_claim_id, + 'channel_url': channel_url, 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), 'filesize': int_or_none(source.get('size')), 'url': streaming_url, From c368dc98e06961a84b0751541ee2869c352253a8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 8 Dec 2020 14:53:22 +0100 Subject: [PATCH 093/424] [lbry] add support for channel extraction(closes #25584) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/lbry.py | 179 ++++++++++++++++++++++------- 2 files changed, 141 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 86589a059..a94502eb3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -534,7 +534,10 @@ from .laola1tv import ( EHFTVIE, ITTFIE, ) -from .lbry import LBRYIE +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) from .lci import LCIIE from .lcp import ( LcpPlayIE, diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 4fd50a60d..41cc245eb 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import json from .common import InfoExtractor @@ -10,15 +11,73 @@ from ..utils import ( ExtractorError, int_or_none, mimetype2ext, + OnDemandPagedList, try_get, urljoin, ) -class LBRYIE(InfoExtractor): - IE_NAME = 'lbry.tv' +class LBRYBaseIE(InfoExtractor): + _BASE_URL_REGEX = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/' _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P@[^:]+:{0}/[^:]+:{0}|[^:]+:{0}|\$/embed/[^/]+/{0})'.format(_CLAIM_ID_REGEX) + _OPT_CLAIM_ID = '[^:/?#&]+(?::%s)?' % _CLAIM_ID_REGEX + _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + + def _call_api_proxy(self, method, display_id, params, resource): + return self._download_json( + 'https://api.lbry.tv/api/v1/proxy', + display_id, 'Downloading %s JSON metadata' % resource, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode())['result'] + + def _resolve_url(self, url, display_id, resource): + return self._call_api_proxy( + 'resolve', display_id, {'urls': url}, resource)[url] + + def _permanent_url(self, url, claim_name, claim_id): + return urljoin(url, '/%s:%s' % (claim_name, claim_id)) + + def _parse_stream(self, stream, url): + stream_value = stream.get('value') or {} + stream_type = stream_value.get('stream_type') + source = stream_value.get('source') or {} + media = stream_value.get(stream_type) or {} + signing_channel = stream.get('signing_channel') or {} + channel_name = signing_channel.get('name') + channel_claim_id = signing_channel.get('claim_id') + channel_url = None + if channel_name and channel_claim_id: + channel_url = self._permanent_url(url, channel_name, channel_claim_id) + + info = { + 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': stream_value.get('description'), + 'license': stream_value.get('license'), + 'timestamp': int_or_none(stream.get('timestamp')), + 'tags': stream_value.get('tags'), + 'duration': int_or_none(media.get('duration')), + 'channel': try_get(signing_channel, lambda x: x['value']['title']), + 'channel_id': channel_claim_id, + 'channel_url': channel_url, + 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + } + if stream_type == 'audio': + info['vcodec'] = 'none' + else: + info.update({ + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + return info + + +class LBRYIE(LBRYBaseIE): + IE_NAME = 'lbry' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -30,6 +89,8 @@ class LBRYIE(InfoExtractor): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'width': 1280, + 'height': 720, } }, { # Audio @@ -47,6 +108,7 @@ class LBRYIE(InfoExtractor): 'channel': 'The LBRY Foundation', 'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212', 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', + 'vcodec': 'none', } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', @@ -63,57 +125,90 @@ class LBRYIE(InfoExtractor): }, { 'url': 'https://lbry.tv/Episode-1:e7', 'only_matching': True, + }, { + 'url': 'https://lbry.tv/@LBRYFoundation/Episode-1', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, }] - def _call_api_proxy(self, method, display_id, params): - return self._download_json( - 'https://api.lbry.tv/api/v1/proxy', display_id, - headers={'Content-Type': 'application/json-rpc'}, - data=json.dumps({ - 'method': method, - 'params': params, - }).encode())['result'] - def _real_extract(self, url): display_id = self._match_id(url) - if display_id.startswith('$/embed/'): - display_id = display_id[8:].replace('/', ':') + if display_id.startswith('$/'): + display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') uri = 'lbry://' + display_id - result = self._call_api_proxy( - 'resolve', display_id, {'urls': [uri]})[uri] + result = self._resolve_url(uri, display_id, 'stream') result_value = result['value'] - if result_value.get('stream_type') not in ('video', 'audio'): + if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES: raise ExtractorError('Unsupported URL', expected=True) claim_id = result['claim_id'] title = result_value['title'] streaming_url = self._call_api_proxy( - 'get', claim_id, {'uri': uri})['streaming_url'] - source = result_value.get('source') or {} - media = result_value.get('video') or result_value.get('audio') or {} - signing_channel = result.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = urljoin(url, '/%s:%s' % (channel_name, channel_claim_id)) - - return { + 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + info = self._parse_stream(result, url) + info.update({ 'id': claim_id, 'title': title, - 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': result_value.get('description'), - 'license': result_value.get('license'), - 'timestamp': int_or_none(result.get('timestamp')), - 'tags': result_value.get('tags'), - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), 'url': streaming_url, - } + }) + return info + + +class LBRYChannelIE(LBRYBaseIE): + IE_NAME = 'lbry:channel' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P@%s)/?(?:[?#&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _TESTS = [{ + 'url': 'https://lbry.tv/@LBRYFoundation:0', + 'info_dict': { + 'id': '0ed629d2b9c601300cacf7eabe9da0be79010212', + 'title': 'The LBRY Foundation', + 'description': 'Channel for the LBRY Foundation. Follow for updates and news.', + }, + 'playlist_count': 29, + }, { + 'url': 'https://lbry.tv/@LBRYFoundation', + 'only_matching': True, + }] + _PAGE_SIZE = 50 + + def _fetch_page(self, claim_id, url, page): + page += 1 + result = self._call_api_proxy( + 'claim_search', claim_id, { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + 'stream_types': self._SUPPORTED_STREAM_TYPES, + }, 'page %d' % page) + for item in (result.get('items') or []): + stream_claim_name = item.get('name') + stream_claim_id = item.get('claim_id') + if not (stream_claim_name and stream_claim_id): + continue + + info = self._parse_stream(item, url) + info.update({ + '_type': 'url', + 'id': stream_claim_id, + 'title': try_get(item, lambda x: x['value']['title']), + 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), + }) + yield info + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + result = self._resolve_url( + 'lbry://' + display_id, display_id, 'channel') + claim_id = result['claim_id'] + entries = OnDemandPagedList( + functools.partial(self._fetch_page, claim_id, url), + self._PAGE_SIZE) + result_value = result.get('value') or {} + return self.playlist_result( + entries, claim_id, result_value.get('title'), + result_value.get('description')) From e2bdf8bf4f3de7698d1d2844687e3acc760b34e7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 8 Dec 2020 17:11:28 +0100 Subject: [PATCH 094/424] [amcnetworks] Fix free content extraction(closes #20354) --- youtube_dl/extractor/amcnetworks.py | 50 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 6fb3d6c53..12b6de0bf 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': '4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,33 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + if properties.get('videoCategory') == 'TVE-Auth': resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( From 5a1fbbf8b7215aab0e6382e93eaa1561093352cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 Dec 2020 00:05:21 +0700 Subject: [PATCH 095/424] [extractor/common] Fix inline HTML5 media tags processing and add test (closes #27345) --- test/test_InfoExtractor.py | 12 ++++++++++++ youtube_dl/extractor/common.py | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608fe..644b3759c 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -108,6 +108,18 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) def test_parse_html5_media_entries(self): + # inline video tag + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://127.0.0.1/video.html', + r'
    ', webpage, 'like') - dislike_count = self._extract_count( - r']+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike') + like_count = extract_vote_count('Up', 'like') + dislike_count = extract_vote_count('Down', 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') From cfa4ffa23b96df7ee3ffbe0dd35ec5c80e17590f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 21:55:12 +0700 Subject: [PATCH 201/424] [spangbang] Add support for playlist videos --- youtube_dl/extractor/spankbang.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 61ca902ce..e3ec8602d 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -17,7 +17,14 @@ from ..utils import ( class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/(?:video|play|embed)\b' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P[\da-z]+)/playlist/[^/?#&]+ + ) + ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor): }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) From 8bc4c6350e0a9c54e79ac3bd28ce97054d8c000d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 21:58:26 +0700 Subject: [PATCH 202/424] [spangbang:playlist] Fix extraction (closes #24087) --- youtube_dl/extractor/spankbang.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e3ec8602d..8a7102d0c 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -13,6 +13,7 @@ from ..utils import ( str_to_int, url_or_none, urlencode_postdata, + urljoin, ) @@ -166,30 +167,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/[^/]+' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r']+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'

    ([^<]+)\s+playlist

    ', webpage, 'playlist title', + r'

    ([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) From d61ed9f2f15b1962646547f582c6637e7fa70df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:14:31 +0700 Subject: [PATCH 203/424] [spankbang] Remove unused import --- youtube_dl/extractor/spankbang.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 8a7102d0c..37cb8c839 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -7,7 +7,6 @@ from ..utils import ( determine_ext, ExtractorError, merge_dicts, - orderedSet, parse_duration, parse_resolution, str_to_int, From 58e55198c1b13949b20f6912feed9ec9d9214950 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Mon, 28 Oct 2019 03:54:03 -0400 Subject: [PATCH 204/424] [instagram] Fix extraction when authenticated (closes #27422) --- youtube_dl/extractor/instagram.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index b061850a1..0755896de 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -137,6 +137,16 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) + if not media: + additional_data = self._parse_json( + self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, + lambda x: x['graphql']['shortcode_media'], + dict) if media: video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) From f22b5a6b960bca57990ede16d0e06603d1b9ab0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:37:41 +0700 Subject: [PATCH 205/424] [instagram] Improve extraction (closes #22880) --- youtube_dl/extractor/instagram.py | 128 +++++++++++++++--------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 0755896de..3f5648c0c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -122,9 +122,9 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - (video_url, description, thumbnail, timestamp, uploader, + (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, - width) = [None] * 11 + width) = [None] * 12 shared_data = self._parse_json( self._search_regex( @@ -137,69 +137,71 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) - if not media: - additional_data = self._parse_json( - self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, - lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/ytdl-org/youtube-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + thumbnail = media.get('display_src') + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + def get_count(key, kind): + return int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + like_count = get_count('preview_like', 'like') + comment_count = get_count('to_comment', 'comment') - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) From 6911312e53dc2b2ac76450741e7e86d963cb939a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:42:58 +0700 Subject: [PATCH 206/424] [instagram] Improve thumbnail extraction --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 3f5648c0c..c1526f865 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -156,7 +156,7 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') - thumbnail = media.get('display_src') + thumbnail = media.get('display_src') or media.get('display_url') timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') From c32a059f52ccb76703785dccdd91ae9ce947a824 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 Dec 2020 16:54:24 +0100 Subject: [PATCH 207/424] [bbc] switch to media selector v6 closes #23232 closes #23933 closes #26303 closes #26432 closes #26821 closes #27538 --- youtube_dl/extractor/bbc.py | 74 ++++++++++--------------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 54cbcdc8e..b4daee54e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r']+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. - # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ From 46cffb0c47bf57b7aba45916bae6cb4092965686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:50:58 +0700 Subject: [PATCH 208/424] [instagram] Add support for reel URLs (closes #26234, closes #26250) --- youtube_dl/extractor/instagram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c1526f865..e15508fca 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/tv/aye83DjauH/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, }] @staticmethod From 12053450dceda14ec63e3e721c72731f2cc40b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:58:27 +0700 Subject: [PATCH 209/424] [instagram] Fix comment count extraction --- youtube_dl/extractor/instagram.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index e15508fca..07c8df7a6 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -164,12 +164,18 @@ class InstagramIE(InfoExtractor): uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) + def get_count(keys, kind): + if not isinstance(keys, (list, tuple)): + keys = [keys] + for key in keys: + count = int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + if count is not None: + return count like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') comments = [{ 'author': comment.get('user', {}).get('username'), From cec1c2f211b93d5c5b1c1ed8cd59d9bbf6f35d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 22:59:10 +0700 Subject: [PATCH 210/424] [instagram] Fix test --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 07c8df7a6..1eeddc3b6 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor): 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', - 'uploader': 'Naomi Leonor Phan-Quang', + 'uploader': 'B E A U T Y F O R A S H E S', 'like_count': int, 'comment_count': int, 'comments': list, From a272fe21a838be843a2d35f0161b75f23edab70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 23:13:26 +0700 Subject: [PATCH 211/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/ChangeLog b/ChangeLog index fbb453ec6..8a4819345 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +version + +Extractors +* [instagram] Fix comment count extraction ++ [instagram] Add support for reel URLs (#26234, #26250) +* [bbc] Switch to media selector v6 (#23232, #23933, #26303, #26432, #26821, + #27538) +* [instagram] Improve thumbnail extraction +* [instagram] Fix extraction when authenticated (#22880, #26377, #26981, + #27422) +* [spankbang:playlist] Fix extraction (#24087) ++ [spankbang] Add support for playlist videos +* [pornhub] Improve like and dislike count extraction (#27356) +* [pornhub] Fix lq formats extraction (#27386, #27393) ++ [bongacams] Add support for bongacams.com (#27440) +* [youtube:tab] Extend URL regular expression (#27501) +* [theweatherchannel] Fix extraction (#25930, #26051) ++ [sprout] Add support for Universal Kids (#22518) +* [theplatform] Allow passing geo bypass countries from other extractors ++ [wistia] Add support for playlists (#27533) ++ [ctv] Add support for ctv.ca (#27525) +* [9c9media] Improve info extraction +* [youtube] Fix automatic captions extraction (#27162, #27388) +* [sonyliv] Fix title for movies +* [sonyliv] Fix extraction (#25667) +* [streetvoice] Fix extraction (#27455, #27492) ++ [facebook] Add support for watchparty pages (#27507) +* [cbslocal] Fix video extraction ++ [brightcove] Add another method to extract policyKey +* [mewatch] Relax URL regular expression (#27506) + + version 2020.12.22 Core From 365b3cc72dc265d20ece3e57f6abe590eb026544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 23:17:35 +0700 Subject: [PATCH 212/424] release 2020.12.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 59de7fd02..52295e426 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.22** +- [ ] I've verified that I'm running youtube-dl version **2020.12.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.22 + [debug] youtube-dl version 2020.12.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 91c1b5b50..40a61bc80 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.22** +- [ ] I've verified that I'm running youtube-dl version **2020.12.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 255b8426a..e71c3ad61 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.22** +- [ ] I've verified that I'm running youtube-dl version **2020.12.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 85cb4bec6..b10168073 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.22** +- [ ] I've verified that I'm running youtube-dl version **2020.12.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.22 + [debug] youtube-dl version 2020.12.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c0a096f11..cba0b3394 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.22** +- [ ] I've verified that I'm running youtube-dl version **2020.12.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 8a4819345..86b12b1c3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.26 Extractors * [instagram] Fix comment count extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1f73b757..c54507e90 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,6 +112,7 @@ - **blinkx** - **Bloomberg** - **BokeCC** + - **BongaCams** - **BostonGlobe** - **Box** - **Bpb**: Bundeszentrale für politische Bildung @@ -146,6 +147,7 @@ - **CBS** - **CBSInteractive** - **CBSLocal** + - **CBSLocalArticle** - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos @@ -198,6 +200,7 @@ - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 + - **CTV** - **CTVNews** - **cu.ntv.co.jp**: Nippon Television Network - **Culturebox** @@ -1108,6 +1111,7 @@ - **WeiboMobile** - **WeiqiTV**: WQTV - **Wistia** + - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** - **WSJ**: Wall Street Journal diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3d45c1190..d1dcebd88 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.22' +__version__ = '2020.12.26' From 0fa67c1d686c1c25b467906307cafefa885c4a80 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 08:55:51 +0100 Subject: [PATCH 213/424] [generic] Add support for VHX Embeds(#27546) --- youtube_dl/extractor/generic.py | 22 ++++++++++++++++++++-- youtube_dl/extractor/vimeo.py | 7 +++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6e46b2c0e..14c27c6da 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -67,7 +67,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -2193,7 +2196,18 @@ class GenericIE(InfoExtractor): # 'params': { # 'force_generic_extractor': True, # }, - # } + # }, + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, ] def report_following_redirect(self, new_url): @@ -2571,6 +2585,10 @@ class GenericIE(InfoExtractor): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) + vhx_url = VHXEmbedIE._extract_url(webpage) + if vhx_url: + return self.url_result(vhx_url, VHXEmbedIE.ie_key()) + vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4c55946f1..15cd06268 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1119,6 +1119,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return unescapeHTML(mobj.group(1)) if mobj else None + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -1127,5 +1133,6 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) + info['id'] = video_id self._vimeo_sort_formats(info['formats']) return info From aed617e311b95d771e10f48cc8dcde25d7816224 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 08:58:34 +0100 Subject: [PATCH 214/424] [amcnetworks] improve auth only video detection(closes #27548) --- youtube_dl/extractor/amcnetworks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 12b6de0bf..b8027bbca 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -80,7 +80,8 @@ class AMCNetworksIE(ThePlatformIE): title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - if properties.get('videoCategory') == 'TVE-Auth': + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( From e4749965412edf2c6d3938d4b8f1d3dbab61b0b5 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 27 Dec 2020 21:15:09 +0700 Subject: [PATCH 215/424] [youtube] Update invidious.snopyta.org (#22667) Co-authored-by: sofutru <54445344+sofutru@users.noreply.github.com> --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 289d9bab1..5ef58d730 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -321,7 +321,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| From f86b299d0ecdba3462d67247cf70cee3a2809a1f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 16:20:49 +0100 Subject: [PATCH 216/424] [telecinco] fix extraction --- youtube_dl/extractor/telecinco.py | 77 ++++++++----------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 9ba3da341..eecd6a5c9 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -5,14 +5,11 @@ import json import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ( clean_html, - determine_ext, int_or_none, str_or_none, try_get, - urljoin, ) @@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', + 'md5': '7ee56d665cfd241c0e6d80fd175068b0', 'info_dict': { 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', @@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor): }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', + 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', + 'md5': 'eddb50291df704ce23c74821b995bcac', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor): def _parse_content(self, content, url): video_id = content['dataMediaId'] - if content.get('dataCmsId') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) - config_url = urljoin(url, content['dataConfig']) config = self._download_json( - config_url, video_id, 'Downloading config JSON') + content['dataConfig'], video_id, 'Downloading config JSON') title = config['info']['title'] - - def mmc_url(mmc_type): - return re.sub( - r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, - config['services']['mmc']) - - duration = None - formats = [] - for mmc_type in ('flash', 'html5'): - mmc = self._download_json( - mmc_url(mmc_type), video_id, - 'Downloading %s mmc JSON' % mmc_type, fatal=False) - if not mmc: - continue - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }, fatal=False) or {} - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + stream = caronte['dls'][0]['stream'] + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'application/json;charset=UTF-8', + 'Origin': re.match(r'https?://[^/]+', url).group(0), + }) + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers=headers)['tokens']['1']['cdn'] + formats = self._extract_m3u8_formats( + stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) return { @@ -149,7 +112,7 @@ class TelecincoIE(InfoExtractor): 'title': title, 'formats': formats, 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, + 'duration': int_or_none(content.get('dataDuration')), } def _real_extract(self, url): From 4c7a4dbc4d07786734324b267a41b74ee7099a85 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 16:22:01 +0100 Subject: [PATCH 217/424] [mitele] fix free video extraction(#24624)(closes #25827)(closes #26757) --- youtube_dl/extractor/mitele.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index ad9da9612..b5937233b 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .telecinco import TelecincoIE from ..utils import ( int_or_none, parse_iso8601, - smuggle_url, ) -class MiTeleIE(InfoExtractor): +class MiTeleIE(TelecincoIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' @@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor): 'timestamp': 1471209401, 'upload_date': '20160814', }, - 'add_ie': ['Ooyala'], }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, @@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', webpage, 'Pre Player'), display_id)['prePlayer'] title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] + video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} info = content.get('info') or {} - return { - '_type': 'url_transparent', - # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), - 'id': video_id, + video_info.update({ 'title': title, 'description': info.get('synopsis'), 'series': content.get('title'), @@ -87,7 +79,7 @@ class MiTeleIE(InfoExtractor): 'episode': content.get('subtitle'), 'episode_number': int_or_none(info.get('episode_number')), 'duration': int_or_none(info.get('duration')), - 'thumbnail': video.get('dataPoster'), 'age_limit': int_or_none(info.get('rating')), 'timestamp': parse_iso8601(pre_player.get('publishedTime')), - } + }) + return video_info From 6f2eaaf73daef3ac0995cd7b51c677b003c04218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Dec 2020 22:57:50 +0700 Subject: [PATCH 218/424] [teachable] Improve embed detection (closes #26923) --- youtube_dl/extractor/teachable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 6f264bddc..2394f86d4 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _is_teachable(webpage): return 'teachableTracker.linker:autoLink' in webpage and re.search( - r']+href=["\']https?://process\.fs\.teachablecdn\.com', + r']+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) @staticmethod From 794771a164009ff94046c98d8a7d45f7706547af Mon Sep 17 00:00:00 2001 From: JamKage Date: Sun, 27 Dec 2020 17:36:21 +0000 Subject: [PATCH 219/424] [go] Added support for FXNetworks (#26826) Co-authored-by: James Kirrage closes #13972 closes #22467 closes #23754 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/fxnetworks.py | 77 ------------------------------ youtube_dl/extractor/go.py | 21 +++++++- 3 files changed, 19 insertions(+), 80 deletions(-) delete mode 100644 youtube_dl/extractor/fxnetworks.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index da472d58e..cf50b897b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -399,7 +399,6 @@ from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .fusion import FusionIE -from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e67426b..000000000 --- a/youtube_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'()', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 03cfba91f..0d731e90a 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -38,13 +38,17 @@ class GoIE(AdobePassIE): 'disneynow': { 'brand': '011', 'resource_id': 'Disney', - } + }, + 'fxnow.fxnetworks': { + 'brand': '025', + 'requestor_id': 'dtci', + }, } _VALID_URL = r'''(?x) https?:// (?: (?:(?P%s)\.)?go| - (?Pabc|freeform|disneynow) + (?Pabc|freeform|disneynow|fxnow\.fxnetworks) )\.com/ (?: (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| @@ -99,6 +103,19 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'info_dict': { + 'id': 'VDKA12782841', + 'ext': 'mp4', + 'title': 'First Look: Better Things - Season 2', + 'description': 'md5:fa73584a95761c605d9d54904e35b407', + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, From af93ecfd88d539cccea97f6cfc33b8cbe362a8ed Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 22:26:20 +0100 Subject: [PATCH 220/424] [toggle] add support for live.mewatch.sg (closes #27555) --- youtube_dl/extractor/toggle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 3b9b54759..270c84daa 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor): class MeWatchIE(InfoExtractor): IE_NAME = 'mewatch' - _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', 'info_dict': { @@ -220,6 +220,9 @@ class MeWatchIE(InfoExtractor): }, { 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', 'only_matching': True, + }, { + 'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759', + 'only_matching': True, }] def _real_extract(self, url): From c0071885987b3737d2c586133007c61ab513a477 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 23:47:28 +0100 Subject: [PATCH 221/424] [zype] Add support for uplynk videos --- youtube_dl/extractor/zype.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 2e2e97a0c..5288f40d8 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -85,7 +85,13 @@ class ZypeIE(InfoExtractor): else: m3u8_url = self._search_regex( r'(["\'])(?P(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', - body, 'm3u8 url', group='url') + body, 'm3u8 url', group='url', default=None) + if not m3u8_url: + source = self._parse_json(self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, + 'source'), video_id, js_to_json) + if source.get('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( From f27224d57b6768569e1aedfaff326605bdb4f049 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 10:50:29 +0100 Subject: [PATCH 222/424] [piksel] import format extraction --- youtube_dl/extractor/nhk.py | 2 +- youtube_dl/extractor/piksel.py | 109 ++++++++++++++++++++++++--------- 2 files changed, 80 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index c5b406573..8a9331a79 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -90,7 +90,7 @@ class NhkVodIE(NhkBaseIE): _TESTS = [{ # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '256a1be14f48d960a7e61e2532d95ec3', + 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', 'info_dict': { 'id': 'a95j5iza', 'ext': 'mp4', diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index 88b6859b0..ecf56ff8f 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -6,16 +6,33 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, dict_get, + ExtractorError, int_or_none, - unescapeHTML, parse_iso8601, + try_get, + unescapeHTML, ) class PikselIE(InfoExtractor): - _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P[a-z0-9_]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?: + player\. + (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P[^/]+)/prefid/)?(?P[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -56,46 +73,41 @@ class PikselIE(InfoExtractor): if mobj: return mobj.group('url') + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + def _real_extract(self, url): - display_id = self._match_id(url) + ref_id, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-de-program-uuid=[\'"]([a-z0-9]+)', - webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' ], webpage, 'app token') - response = self._download_json( - 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, - video_id, query={ - 'v': video_id - })['response'] - failure = response.get('failure') - if failure: - raise ExtractorError(response['failure']['reason'], expected=True) - video_data = response['WsProgramResponse']['program']['asset'] + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) formats = [] - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - for asset_file in video_data.get('assetFiles', []): + def process_asset_file(asset_file): + if not asset_file: + return # TODO: extract rtmp formats http_url = asset_file.get('http_url') if not http_url: - continue + return tbr = None vbr = int_or_none(asset_file.get('videoBitrate'), 1024) abr = int_or_none(asset_file.get('audioBitrate'), 1024) @@ -118,6 +130,43 @@ class PikselIE(InfoExtractor): 'filesize': int_or_none(asset_file.get('filesize')), 'tbr': tbr, }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + transform_source=transform_source, fatal=False)) + self._sort_formats(formats) subtitles = {} From 782ea947b487d4ef2b7b11fa40a00a518b529fed Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 11:12:57 +0100 Subject: [PATCH 223/424] [brightcove] remove sonyliv specific code --- youtube_dl/extractor/brightcove.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 300d75458..65b44c099 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -534,14 +534,6 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) - if not formats: - # for sonyliv.com DRM protected videos - s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') - if s3_source_url: - formats.append({ - 'url': s3_source_url, - 'format_id': 'source', - }) errors = json_data.get('errors') if not formats and errors: From 64e419bd7386b2a16a3c2e7ac5da30427afe856d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 18:19:30 +0100 Subject: [PATCH 224/424] [aparat] Fix extraction closes #22285 closes #22611 closes #23348 closes #24354 closes #24591 closes #24904 closes #25418 closes #26070 closes #26350 closes #26738 closes #27563 --- youtube_dl/extractor/aparat.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 883dcee7a..a9527e785 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + get_element_by_id, int_or_none, merge_dicts, mimetype2ext, @@ -39,23 +40,15 @@ class AparatIE(InfoExtractor): webpage = self._download_webpage(url, video_id, fatal=False) if not webpage: - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - options = self._parse_json( - self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', - webpage, 'options', group='value'), - video_id) - - player = options['plugins']['sabaPlayerPlugin'] + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) formats = [] - for sources in player['multiSRC']: + for sources in (options.get('multiSRC') or []): for item in sources: if not isinstance(item, dict): continue @@ -85,11 +78,12 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) From f1bc56c99bac05dccb01c1b68ef778eb08fbcb71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:11:48 +0700 Subject: [PATCH 225/424] [youtube:tab] Restore retry on browse requests (closes #27313, closes #27564) --- youtube_dl/extractor/youtube.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5ef58d730..77f128285 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_HTTPError, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -3009,10 +3010,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): if not continuation: break - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d' % page_num, - headers=headers, query=continuation, fatal=False) + count = 0 + retries = 3 + while count <= retries: + try: + # Downloading page may result in intermittent 5xx HTTP error + # that is usually worked around with a retry + browse = self._download_json( + 'https://www.youtube.com/browse_ajax', None, + 'Downloading page %d%s' + % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, query=continuation) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): + count += 1 + if count <= retries: + continue + raise if not browse: break response = try_get(browse, lambda x: x[1]['response'], dict) From 71febd1c52d6de89ff571d4c212846aaaafb33ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:19:43 +0700 Subject: [PATCH 226/424] [youtube:tab] Improve URL matching (closes #27559) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 77f128285..0044ed909 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1103,6 +1103,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -2730,6 +2734,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if YoutubeIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( 'channelId', webpage, 'channel id', default=None) From 1a95953867412bc7a785f21f6bff5145b2b13fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:29:34 +0700 Subject: [PATCH 227/424] [youtube] Improve yt initial data extraction (closes #27524) --- youtube_dl/extractor/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0044ed909..87bdc1677 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -280,6 +280,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' + _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta| Date: Tue, 29 Dec 2020 02:49:53 +0700 Subject: [PATCH 228/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 86b12b1c3..7e1bcb237 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors +* [youtube] Improve yt initial data extraction (#27524) +* [youtube:tab] Improve URL matching #27559) +* [youtube:tab] Restore retry on browse requests (#27313, #27564) +* [aparat] Fix extraction (#22285, #22611, #23348, #24354, #24591, #24904, + #25418, #26070, #26350, #26738, #27563) +- [brightcove] Remove sonyliv specific code +* [piksel] Improve format extraction ++ [zype] Add support for uplynk videos ++ [toggle] Add support for live.mewatch.sg (#27555) ++ [go] Add support for fxnow.fxnetworks.com (#13972, #22467, #23754, #26826) +* [teachable] Improve embed detection (#26923) +* [mitele] Fix free video extraction (#24624, #25827, #26757) +* [telecinco] Fix extraction +* [youtube] Update invidious.snopyta.org (#22667) +* [amcnetworks] Improve auth only video detection (#27548) ++ [generic] Add support for VHX Embeds (#27546) + + version 2020.12.26 Extractors From 479cc6d5a166dc2f250687616041c9f3b36c80b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:52:31 +0700 Subject: [PATCH 229/424] release 2020.12.29 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 52295e426..0d5a8e666 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.26 + [debug] youtube-dl version 2020.12.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 40a61bc80..88a6ea3fc 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index e71c3ad61..1d75d1a79 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b10168073..83f1f43cd 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.26 + [debug] youtube-dl version 2020.12.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cba0b3394..d3e03f78b 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 7e1bcb237..2dad14949 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.29 Extractors * [youtube] Improve yt initial data extraction (#27524) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c54507e90..1406ba8b8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -317,7 +317,6 @@ - **Funk** - **Fusion** - **Fux** - - **FXNetworks** - **Gaia** - **GameInformer** - **GameSpot** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d1dcebd88..c7fb697c4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.26' +__version__ = '2020.12.29' From bcfe485e0172ff32c450bb7835cfae7fca7594ae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:05:23 +0100 Subject: [PATCH 230/424] [brightcove] raise ExtractorError for DRM protected videos(closes #23467)(closes #27568) --- youtube_dl/extractor/brightcove.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 65b44c099..6022076ac 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -471,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 + continue + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -535,11 +540,14 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) From 7acd042bbb555962f42fa4f0f236772194d2da64 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:06:58 +0100 Subject: [PATCH 231/424] [tenplay] fix format extraction(closes #26653) --- youtube_dl/extractor/tenplay.py | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index af325fea8..cd30d57f4 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + HEADRequest, parse_age_limit, parse_iso8601, - smuggle_url, + # smuggle_url, ) @@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor): 'uploader_id': '2199827728001', }, 'params': { - 'format': 'bestvideo', + # 'format': 'bestvideo', 'skip_download': True, } }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + _GEO_BYPASS = False + _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect' def _real_extract(self, url): content_id = self._match_id(url) @@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor): video = data.get('video') or {} metadata = data.get('metaData') or {} brightcove_id = video.get('videoId') or metadata['showContentVideoId'] - brightcove_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['AU']}) + # brightcove_url = smuggle_url( + # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + # {'geo_countries': ['AU']}) + m3u8_url = self._request_webpage(HEADRequest( + self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl() + if '10play-not-in-oz' in m3u8_url: + self.raise_geo_restricted(countries=['AU']) + formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4') + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': brightcove_url, - 'id': content_id, - 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + # '_type': 'url_transparent', + # 'url': brightcove_url, + 'formats': formats, + 'id': brightcove_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'], 'description': video.get('description'), 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), 'series': metadata.get('showName'), 'season': metadata.get('showContentSeason'), 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), - 'ie_key': 'BrightcoveNew', + 'thumbnail': video.get('poster'), + 'uploader_id': '2199827728001', + # 'ie_key': 'BrightcoveNew', } From c931c4b8ddb32371cddf48ea52d0c036a6a66240 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:09:10 +0100 Subject: [PATCH 232/424] [sevenplay] detect API errors --- youtube_dl/extractor/sevenplus.py | 32 ++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py index 84568ac69..240afc18f 100644 --- a/youtube_dl/extractor/sevenplus.py +++ b/youtube_dl/extractor/sevenplus.py @@ -4,8 +4,12 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, try_get, update_url_query, ) @@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE): def _real_extract(self, url): path, episode_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - 'https://videoservice.swm.digital/playback', episode_id, query={ - 'appId': '7plus', - 'deviceType': 'web', - 'platformType': 'web', - 'accountId': 5303576322001, - 'referenceId': 'ref:' + episode_id, - 'deliveryId': 'csai', - 'videoType': 'vod', - })['media'] + try: + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + raise for source in media.get('sources', {}): src = source.get('src') From 53528e1d2385494c72349f609907f0164d6f8431 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:10:46 +0100 Subject: [PATCH 233/424] [uktvplay] match new video URLs(closes #17909) --- youtube_dl/extractor/uktvplay.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py index 2137502a1..f28fd514d 100644 --- a/youtube_dl/extractor/uktvplay.py +++ b/youtube_dl/extractor/uktvplay.py @@ -5,10 +5,9 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' - _TEST = { + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' + _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', - 'md5': '', 'info_dict': { 'id': '2117008346001', 'ext': 'mp4', @@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Failed to download MPD manifest'] - } + }, { + 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', + 'only_matching': True, + }] + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' def _real_extract(self, url): From 9ee984fc760c9a3f4818055ff28ea886aadc58cb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 16:13:36 +0100 Subject: [PATCH 234/424] [aenetworks] add support for biography.com (closes #3863) --- youtube_dl/extractor/aenetworks.py | 97 ++++++++++++++++++++++-------- youtube_dl/extractor/extractors.py | 2 + 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 3d0cf1208..237012978 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -6,6 +6,7 @@ import re from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, update_url_query, urlencode_postdata, @@ -28,6 +29,7 @@ class AENetworksBaseIE(ThePlatformIE): 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), 'fyi.tv': ('FYI', 'fyi'), 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), } def _extract_aen_smil(self, smil_url, video_id, auth=None): @@ -54,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -67,6 +71,34 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' @@ -139,32 +171,7 @@ class AENetworksIE(AENetworksBaseIE): def _real_extract(self, url): domain, canonical = re.match(self._VALID_URL, url).groups() - requestor_id, brand = self._DOMAIN_MAP[domain] - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - canonical, query={'filter[canonical]': '/' + canonical})['results'][0] - title = result['title'] - video_id = result['id'] - media_url = result['publicUrl'] - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - info.update({ - 'title': title, - 'series': result.get('seriesName'), - 'season_number': int_or_none(result.get('tvSeasonNumber')), - 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), - }) - return info + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) class AENetworksListBaseIE(AENetworksBaseIE): @@ -294,3 +301,41 @@ class HistoryTopicIE(AENetworksBaseIE): return self.url_result( 'http://www.history.com/videos/' + display_id, AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = self._search_regex( + r']+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cf50b897b..20472f2f7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -33,6 +33,8 @@ from .aenetworks import ( AENetworksCollectionIE, AENetworksShowIE, HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, ) from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE From 5966095e65b5365e8e4d211ffca6ab50514d3c84 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 16:59:31 +0100 Subject: [PATCH 235/424] [aenetworks] fix HistoryPlayerIE tests --- youtube_dl/extractor/aenetworks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 237012978..8e4963131 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -306,6 +306,7 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() From ebdcf70b0d52fd7b4627b2435b3081c7f4f4ce4b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 17:15:13 +0100 Subject: [PATCH 236/424] [nbc] fix NBCSport VPlayer URL extraction(closes #16640) --- youtube_dl/extractor/nbc.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index ea5f5a315..9695a9616 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -158,7 +158,8 @@ class NBCIE(AdobePassIE): class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor): }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): iframe_m = re.search( - r']+src="(?Phttps?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) if iframe_m: return iframe_m.group('url') @@ -192,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https because its certificate is invalid - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P[0-9a-z-]+)' - _TEST = { + _TESTS = [{ + # iframe src 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', 'info_dict': { 'id': 'PHJSaFWbrTY9', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, } - } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From c706fbe9fe4a517ad43ca95393384e5f78870f82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 17:21:05 +0100 Subject: [PATCH 237/424] [nbc] Remove CSNNE extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/nbc.py | 27 --------------------------- 2 files changed, 28 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 20472f2f7..51e6a463a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -692,7 +692,6 @@ from .nba import ( NBAChannelIE, ) from .nbc import ( - CSNNEIE, NBCIE, NBCNewsIE, NBCOlympicsIE, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 9695a9616..0d77648c2 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -286,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE): } -class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' - - _TEST = { - 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', - 'info_dict': { - 'id': 'yvBLLUgQ8WU0', - 'ext': 'mp4', - 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', - 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', - 'timestamp': 1459369979, - 'upload_date': '20160330', - 'uploader': 'NBCU-SPORTS', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': self._html_search_meta('twitter:player:stream', webpage), - 'display_id': display_id, - } - - class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' From 9c1e164e0cd77331ea4f0b474b32fd06f84bad71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 19:29:08 +0100 Subject: [PATCH 238/424] [YoutubeDL] Allow format filtering using audio language(#16209) --- README.md | 1 + youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ffa34493e..85fed6d3a 100644 --- a/README.md +++ b/README.md @@ -678,6 +678,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + - `language`: Language code Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6f477bc32..aaac149e9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1083,7 +1083,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?Pext|acodec|vcodec|container|protocol|format_id|language) \s*(?P!\s*)?(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ From 9dd674e1d20440564a3d25f33cd8785695e110f6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 09:22:30 +0100 Subject: [PATCH 239/424] [utils] accept only supported protocols in url_or_none --- test/test_utils.py | 5 +++++ youtube_dl/utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 925a21d34..d49d3239c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -554,6 +554,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8cefafd79..d5fb6fd24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3640,7 +3640,7 @@ def url_or_none(url): if not url or not isinstance(url, compat_str): return None url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def parse_duration(s): From f7e95fb2a0516f90edffe72d9911222d1ed1a2bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 09:24:37 +0100 Subject: [PATCH 240/424] [yandexvideo] fix extraction(closes #25000) --- youtube_dl/extractor/yandexvideo.py | 116 ++++++++++++++++++---------- 1 file changed, 76 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index 46529be05..36d01cc8e 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -13,26 +13,30 @@ class YandexVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=| + yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| frontend\.vh\.yandex\.ru/player/ ) - (?P[\da-f]+) + (?P(?:[\da-f]{32}|[\w-]{12})) ''' _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { - 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', - 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 0, - 'duration': 30, + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, }, }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, }, { 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', @@ -52,53 +56,85 @@ class YandexVideoIE(InfoExtractor): # DASH with DRM 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) content = self._download_json( - 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - })['content'] + # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + # video_id, query={ + # 'stream_options': 'hires', + # 'disable_trackings': 1, + # })['content'] + 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id.encode())['player']['content']['content'] - content_url = url_or_none(content.get('content_url')) or url_or_none( - content['streams'][0]['url']) - title = content.get('title') or content.get('computed_title') + title = content.get('title') or content['computed_title'] - ext = determine_ext(content_url) - - if ext == 'm3u8': - formats = self._extract_m3u8_formats( - content_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - elif ext == 'mpd': - formats = self._extract_mpd_formats( - content_url, video_id, mpd_id='dash') - else: - formats = [{'url': content_url}] + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) self._sort_formats(formats) - description = content.get('description') - thumbnail = content.get('thumbnail') timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) - duration = int_or_none(content.get('duration')) - series = content.get('program_title') - age_limit = int_or_none(content.get('restriction_age')) + season = content.get('season') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'age_limit': age_limit, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), 'formats': formats, } From bdd044e67b5d10736aa712e9be64beff0d47f490 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 13:30:11 +0100 Subject: [PATCH 241/424] [yandexvideo] use old api call as fallback --- youtube_dl/extractor/yandexvideo.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index 36d01cc8e..ab8c84c93 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + try_get, url_or_none, ) @@ -64,12 +65,7 @@ class YandexVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, - # video_id, query={ - # 'stream_options': 'hires', - # 'disable_trackings': 1, - # })['content'] + player = try_get((self._download_json( 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ player(content_id: "%s") { computed_title @@ -90,7 +86,15 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode())['player']['content']['content'] +}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = player['content'] title = content.get('title') or content['computed_title'] From bd18824c2a99d6d01b00edfa186b9fd227af255c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 13:43:56 +0100 Subject: [PATCH 242/424] [yandexdisk] fix extraction(closes #17861)(closes #27131) --- youtube_dl/extractor/yandexdisk.py | 138 +++++++++++++++++------------ 1 file changed, 81 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py index e8f6ae10f..21f37c192 100644 --- a/youtube_dl/extractor/yandexdisk.py +++ b/youtube_dl/extractor/yandexdisk.py @@ -1,19 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( determine_ext, + ExtractorError, float_or_none, int_or_none, - try_get, - urlencode_postdata, + mimetype2ext, + parse_iso8601, + urljoin, ) class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?:www\.)?yadi\.sk| + disk\.yandex\. + (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', @@ -25,94 +46,97 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', + 'timestamp': 1421396809, + 'upload_date': '20150116', 'view_count': int, }, }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) + try: + resource = self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources', + video_id, query={'public_key': url}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_description = self._parse_json( + e.cause.read().decode(), video_id)['description'] + raise ExtractorError(error_description, expected=True) + raise - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') + title = resource['name'] + public_url = resource.get('public_url') + if public_url: + video_id = self._match_id(public_url) - webpage = self._download_webpage(url, video_id) + self._set_cookie('yadi.sk', 'yandexuid', '0') - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s* Date: Wed, 30 Dec 2020 16:45:53 +0100 Subject: [PATCH 243/424] [yandexdisk] extract info from webpage the public API does not return metadata when download limit is reached --- youtube_dl/extractor/yandexdisk.py | 89 ++++++++++++++++-------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py index 21f37c192..6fcd8ee7e 100644 --- a/youtube_dl/extractor/yandexdisk.py +++ b/youtube_dl/extractor/yandexdisk.py @@ -2,24 +2,23 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( determine_ext, - ExtractorError, float_or_none, int_or_none, mimetype2ext, - parse_iso8601, + try_get, urljoin, ) class YandexDiskIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// - (?: - (?:www\.)?yadi\.sk| + (?P + yadi\.sk| disk\.yandex\. (?: az| @@ -38,7 +37,7 @@ class YandexDiskIE(InfoExtractor): _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', 'info_dict': { 'id': 'VdOeDou8eZs6Y', 'ext': 'mp4', @@ -46,10 +45,9 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', - 'timestamp': 1421396809, - 'upload_date': '20150116', 'view_count': int, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, @@ -59,51 +57,58 @@ class YandexDiskIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.match(self._VALID_URL, url).groups() - try: - resource = self._download_json( - 'https://cloud-api.yandex.net/v1/disk/public/resources', - video_id, query={'public_key': url}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_description = self._parse_json( - e.cause.read().decode(), video_id)['description'] - raise ExtractorError(error_description, expected=True) - raise + webpage = self._download_webpage(url, video_id) + store = self._parse_json(self._search_regex( + r']+id="store-prefetch"[^>]*>\s*({.+?})\s*', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] title = resource['name'] - public_url = resource.get('public_url') + meta = resource.get('meta') or {} + + public_url = meta.get('short_url') if public_url: video_id = self._match_id(public_url) - self._set_cookie('yadi.sk', 'yandexuid', '0') + source_url = (self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources/download', + video_id, query={'public_key': url}, fatal=False) or {}).get('href') + video_streams = resource.get('videoStreams') or {} + video_hash = resource.get('hash') or url + environment = store.get('environment') or {} + sk = environment.get('sk') + yandexuid = environment.get('yandexuid') + if sk and yandexuid and not (source_url and video_streams): + self._set_cookie(domain, 'yandexuid', yandexuid) - def call_api(action): - return (self._download_json( - urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ - 'hash': url, - # obtain sk if needed from call_api('check-auth') while - # the yandexuid cookie is set and sending an empty JSON object - 'sk': 'ya6b52f8c6b12abe91a66d22d3a31084b' - }).encode(), headers={ - 'Content-Type': 'text/plain', - }, fatal=False) or {}).get('data') or {} + def call_api(action): + return (self._download_json( + urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ + 'hash': video_hash, + 'sk': sk, + }).encode(), headers={ + 'Content-Type': 'text/plain', + }, fatal=False) or {}).get('data') or {} + if not source_url: + # TODO: figure out how to detect if download limit has + # been reached and then avoid unnecessary source format + # extraction requests + source_url = call_api('download-url').get('url') + if not video_streams: + video_streams = call_api('get-video-streams') formats = [] - source_url = resource.get('file') - if not source_url: - source_url = call_api('download-url').get('url') if source_url: formats.append({ 'url': source_url, 'format_id': 'source', - 'ext': determine_ext(title, mimetype2ext(resource.get('mime_type')) or 'mp4'), + 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), 'quality': 1, - 'filesize': int_or_none(resource.get('size')) + 'filesize': int_or_none(meta.get('size')) }) - video_streams = call_api('get-video-streams') for video in (video_streams.get('videos') or []): format_url = video.get('url') if not format_url: @@ -128,15 +133,15 @@ class YandexDiskIE(InfoExtractor): }) self._sort_formats(formats) - owner = resource.get('owner') or {} + uid = resource.get('uid') + display_name = try_get(store, lambda x: x['users'][uid]['displayName']) return { 'id': video_id, 'title': title, 'duration': float_or_none(video_streams.get('duration'), 1000), - 'uploader': owner.get('display_name'), - 'uploader_id': owner.get('uid'), - 'view_count': int_or_none(resource.get('views_count')), - 'timestamp': parse_iso8601(resource.get('created')), + 'uploader': display_name, + 'uploader_id': uid, + 'view_count': int_or_none(meta.get('views_counter')), 'formats': formats, } From 2e21b06ea2f7a1ee5e038bf4274e5a74a4f52c2c Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 30 Dec 2020 18:12:17 +0100 Subject: [PATCH 244/424] [vvvvid] add playlists support (#27574) closes #18130 --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/vvvvid.py | 65 +++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 51e6a463a..254206a08 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1425,7 +1425,10 @@ from .vshare import VShareIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE -from .vvvvid import VVVVIDIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .wakanim import WakanimIE diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 6906cd2ab..5b8ea3665 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -12,7 +12,8 @@ from ..utils import ( class VVVVIDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' % _VALID_URL_BASE _TESTS = [{ # video_type == 'video/vvvvid' 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', @@ -45,20 +46,26 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, - }) + }, fatal=fatal) if response['result'] == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, '/season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response['data']))[0] + lambda episode: episode.get('video_id') == vid, response))[0] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -156,3 +163,49 @@ class VVVVIDIE(InfoExtractor): 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), } + + +class VVVVIDShowIE(VVVVIDIE): + _VALID_URL = r'(?P%s(?P\d+)/(?P[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.vvvvid.it/show/156/psyco-pass', + 'info_dict': { + 'id': '156', + 'title': 'Psycho-Pass', + 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', + }, + 'playlist_count': 46, + }] + + def _real_extract(self, url): + base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, '/seasons/', show_title) + + show_infos = self._download_info( + show_id, '/info/', show_title, fatal=False) + + entries = [] + for season in response: + episodes = season.get('episodes') or [] + for episode in episodes: + season_id = str_or_none(episode.get('season_id')) + video_id = str_or_none(episode.get('video_id')) + if not (season_id and video_id): + continue + + video_url = '/'.join([base_url, season_id, video_id]) + + entries.append({ + '_type': 'url_transparent', + 'ie_key': VVVVIDIE.ie_key(), + 'url': video_url, + 'title': episode.get('title'), + 'thumbnail': episode.get('thumbnail'), + 'description': episode.get('description'), + 'season_number': int_or_none(episode.get('season_number')), + 'episode_number': int_or_none(episode.get('number')), + }) + return self.playlist_result( + entries, show_id, show_infos.get('title'), show_infos.get('description')) From 7b0f04ed1f72fd4e4b5b3e935e08a912857fa8c4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 18:16:47 +0100 Subject: [PATCH 245/424] [vvvvid] imporove info extraction --- youtube_dl/extractor/vvvvid.py | 78 ++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 5b8ea3665..014a67e53 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -22,6 +22,16 @@ class VVVVIDIE(InfoExtractor): 'id': '489048', 'ext': 'mp4', 'title': 'Ping Pong', + 'duration': 239, + 'series': '"Perché dovrei guardarlo?" di Dario Moccia', + 'season_id': '437', + 'season_number': 1, + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -38,6 +48,9 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True }] _conn_id = None @@ -48,24 +61,34 @@ class VVVVIDIE(InfoExtractor): def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, }, fatal=fatal) - if response['result'] == 'error': + if not (response or fatal): + return + if response.get('result') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) return response['data'] + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_number': int_or_none(video_data.get('number')), + 'episode_id': str_or_none(video_data.get('id')), + } + def _real_extract(self, url): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, '/season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( lambda episode: episode.get('video_id') == vid, response))[0] + title = video_data['title'] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -148,25 +171,25 @@ class VVVVIDIE(InfoExtractor): 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) self._sort_formats(formats) - return { + info = self._extract_common_video_info(video_data) + info.update({ 'id': video_id, - 'title': video_data['title'], + 'title': title, 'formats': formats, - 'thumbnail': video_data.get('thumbnail'), 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, 'season_number': video_data.get('season_number'), - 'episode_id': str_or_none(video_data.get('id')), - 'episode_number': int_or_none(video_data.get('number')), - 'episode_title': video_data['title'], + 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), - } + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info class VVVVIDShowIE(VVVVIDIE): - _VALID_URL = r'(?P%s(?P\d+)/(?P[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE + _VALID_URL = r'(?P%s(?P\d+)(?:/(?P[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.vvvvid.it/show/156/psyco-pass', 'info_dict': { @@ -175,37 +198,40 @@ class VVVVIDShowIE(VVVVIDIE): 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', }, 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, }] def _real_extract(self, url): base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() - response = self._download_info( - show_id, '/seasons/', show_title) + seasons = self._download_info( + show_id, 'seasons/', show_title) - show_infos = self._download_info( - show_id, '/info/', show_title, fatal=False) + show_info = self._download_info( + show_id, 'info/', show_title, fatal=False) entries = [] - for season in response: + for season in (seasons or []): + season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): continue - - video_url = '/'.join([base_url, season_id, video_id]) - - entries.append({ - '_type': 'url_transparent', + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url', 'ie_key': VVVVIDIE.ie_key(), - 'url': video_url, + 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), - 'thumbnail': episode.get('thumbnail'), 'description': episode.get('description'), - 'season_number': int_or_none(episode.get('season_number')), - 'episode_number': int_or_none(episode.get('number')), + 'season_number': season_number, + 'season_id': season_id, }) + entries.append(info) + return self.playlist_result( - entries, show_id, show_infos.get('title'), show_infos.get('description')) + entries, show_id, show_info.get('title'), show_info.get('description')) From d1d0612160ed2d753cc57b30483776b7c1b03473 Mon Sep 17 00:00:00 2001 From: ozburo Date: Tue, 22 Dec 2020 06:12:14 -0600 Subject: [PATCH 246/424] [redditr] Extract all thumbnails --- youtube_dl/extractor/reddit.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 3b2abb262..2d1a1fd99 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, try_get, url_or_none, + unescapeHTML, ) @@ -118,11 +119,23 @@ class RedditRIE(InfoExtractor): else: age_limit = None + thumbnails = [] + images = try_get( + data, lambda x: x['preview']['images'][0]['resolutions']) or [] + for image in images: + url = url_or_none(unescapeHTML(image['url'])) + if url is not None: + thumbnails.append({ + 'url': url, + 'width': int_or_none(image['width']), + 'height': int_or_none(image['height']), + }) + return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), - 'thumbnail': url_or_none(data.get('thumbnail')), + 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), 'duration': int_or_none(try_get( From 4046ffe1e16a65196e113cb12aa2d935f7f17c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:06:54 +0700 Subject: [PATCH 247/424] [redditr] Fix review issues and extract source thumbnail (closes #27503) --- youtube_dl/extractor/reddit.py | 36 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 2d1a1fd99..222fa0172 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -8,8 +8,8 @@ from ..utils import ( int_or_none, float_or_none, try_get, - url_or_none, unescapeHTML, + url_or_none, ) @@ -57,7 +57,8 @@ class RedditRIE(InfoExtractor): 'id': 'zv89llsvexdz', 'ext': 'mp4', 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', @@ -120,16 +121,27 @@ class RedditRIE(InfoExtractor): age_limit = None thumbnails = [] - images = try_get( - data, lambda x: x['preview']['images'][0]['resolutions']) or [] - for image in images: - url = url_or_none(unescapeHTML(image['url'])) - if url is not None: - thumbnails.append({ - 'url': url, - 'width': int_or_none(image['width']), - 'height': int_or_none(image['height']), - }) + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) return { '_type': 'url_transparent', From 2a84694b1e8d91b271eb5bb74b9d63cbfcc58cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:14:33 +0700 Subject: [PATCH 248/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2dad14949..1045fef5e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version + +Core +* [utils] Accept only supported protocols in url_or_none +* [YoutubeDL] Allow format filtering using audio language (#16209) + +Extractors ++ [redditr] Extract all thumbnails (#27503) +* [vvvvid] Improve info extraction ++ [vvvvid] Add support for playlists (#18130, #27574) ++ [yandexdisk] Extract info from webpage +* [yandexdisk] Fix extraction (#17861, #27131) +* [yandexvideo] Use old API call as fallback +* [yandexvideo] Fix extraction (#25000) +- [nbc] Remove CSNNE extractor +* [nbc] Fix NBCSport VPlayer URL extraction (#16640) ++ [aenetworks] Add support for biography.com (#3863) +* [uktvplay] Match new video URLs (#17909) +* [sevenplay] Detect API errors +* [tenplay] Fix format extraction (#26653) +* [brightcove] Raise error for DRM protected videos (#23467, #27568) + + version 2020.12.29 Extractors From 4066945919a3f51a01f551ade8957f4856c67e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:17:55 +0700 Subject: [PATCH 249/424] release 2020.12.31 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 +++- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 0d5a8e666..d1a6ad1f6 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.29 + [debug] youtube-dl version 2020.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 88a6ea3fc..ded5beadf 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 1d75d1a79..0e06de8dc 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 83f1f43cd..dfbfe3701 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.29 + [debug] youtube-dl version 2020.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d3e03f78b..30061808d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1045fef5e..3782ad090 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.31 Core * [utils] Accept only supported protocols in url_or_none diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1406ba8b8..b1112f83b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -104,6 +104,7 @@ - **BilibiliAudioAlbum** - **BiliBiliPlayer** - **BioBioChileTV** + - **Biography** - **BIQLE** - **BitChute** - **BitChuteChannel** @@ -197,7 +198,6 @@ - **CrooksAndLiars** - **crunchyroll** - **crunchyroll:playlist** - - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTV** @@ -349,6 +349,7 @@ - **hgtv.com:show** - **HiDive** - **HistoricFilms** + - **history:player** - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** @@ -1088,6 +1089,7 @@ - **vube**: Vube.com - **VuClip** - **VVVVID** + - **VVVVIDShow** - **VyboryMos** - **Vzaar** - **Wakanim** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c7fb697c4..f795f0735 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.29' +__version__ = '2020.12.31' From 2f56caf0835786fa5af92da428ef90a530d90f26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 11:58:52 +0100 Subject: [PATCH 250/424] [yandexvideo] fix extraction for Python 3.4 --- youtube_dl/extractor/yandexvideo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index ab8c84c93..6a166ec9b 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -66,7 +66,7 @@ class YandexVideoIE(InfoExtractor): video_id = self._match_id(url) player = try_get((self._download_json( - 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ player(content_id: "%s") { computed_title content_url @@ -86,7 +86,7 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) if not player or player.get('error'): player = self._download_json( 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, From c35bc8260665e071d5088f5be3bb37f7664ef06e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:04:35 +0100 Subject: [PATCH 251/424] [vvvvid] skip unplayable episodes and extract akamai formats(closes #27599) --- youtube_dl/extractor/vvvvid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 014a67e53..145805492 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -152,7 +152,6 @@ class VVVVIDIE(InfoExtractor): embed_code = ds(embed_code) video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') if video_type == 'video/kenc': kenc = self._download_json( 'https://www.vvvvid.it/kenc', video_id, query={ @@ -163,9 +162,7 @@ class VVVVIDIE(InfoExtractor): kenc_message = kenc.get('message') if kenc_message: embed_code += '?' + ds(kenc_message) - formats.extend(self._extract_m3u8_formats( - embed_code, video_id, 'mp4', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_akamai_formats(embed_code, video_id)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) @@ -217,6 +214,8 @@ class VVVVIDShowIE(VVVVIDIE): season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: + if episode.get('playable') is False: + continue season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): From 4d7d056909665f68e2aaa95fe42a000c287265b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:17:55 +0100 Subject: [PATCH 252/424] [sky] add support for Sports News articles and Brighcove videos(closes #13054) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sky.py | 99 +++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 254206a08..54cbbe9d2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1052,6 +1052,7 @@ from .skynewsarabia import ( from .sky import ( SkyNewsIE, SkySportsIE, + SkySportsNewsIE, ) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE diff --git a/youtube_dl/extractor/sky.py b/youtube_dl/extractor/sky.py index 681691004..ff2c977a0 100644 --- a/youtube_dl/extractor/sky.py +++ b/youtube_dl/extractor/sky.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( extract_attributes, @@ -11,36 +13,59 @@ from ..utils import ( class SkyBaseIE(InfoExtractor): - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_data = extract_attributes(self._search_regex( - r'(]+>)', - webpage, 'video data')) + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + _SDC_EL_REGEX = r'(?s)(]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)' - video_url = 'ooyala:%s' % video_data['data-video-id'] - if video_data.get('data-token-required') == 'true': - token_fetch_options = self._parse_json(video_data.get( - 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} - token_fetch_url = token_fetch_options.get('url') - if token_fetch_url: - embed_token = self._download_webpage(urljoin( - url, token_fetch_url), video_id, fatal=False) - if embed_token: - video_url = smuggle_url( - video_url, {'embed_token': embed_token.strip('"')}) + def _process_ooyala_element(self, webpage, sdc_el, url): + sdc = extract_attributes(sdc_el) + provider = sdc.get('data-provider') + if provider == 'ooyala': + video_id = sdc['data-sdc-video-id'] + video_url = 'ooyala:%s' % video_id + ie_key = 'Ooyala' + ooyala_el = self._search_regex( + r'(]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id, + webpage, 'video data', fatal=False) + if ooyala_el: + ooyala_attrs = extract_attributes(ooyala_el) or {} + if ooyala_attrs.get('data-token-required') == 'true': + token_fetch_url = (self._parse_json(ooyala_attrs.get( + 'data-token-fetch-options', '{}'), + video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): + IE_NAME = 'sky:sports' _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', @@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE): class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) From ab89a8678b77a53ef3ca701868a3acd5a4c300c1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 23:31:07 +0100 Subject: [PATCH 253/424] [arcpublishing] Add new extractor closes #2298 closes #9340 closes #17200 --- youtube_dl/extractor/arcpublishing.py | 173 +++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 19 +++ youtube_dl/extractor/washingtonpost.py | 101 +++------------ 4 files changed, 210 insertions(+), 84 deletions(-) create mode 100644 youtube_dl/extractor/arcpublishing.py diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..d1fb1a054 --- /dev/null +++ b/youtube_dl/extractor/arcpublishing.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P[a-z]+):(?P%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] + + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 54cbbe9d2..d9d3f4940 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -56,6 +56,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE from .ard import ( ARDBetaMediathekIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 14c27c6da..780971a92 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -127,6 +127,7 @@ from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .arcpublishing import ArcPublishingIE class GenericIE(InfoExtractor): @@ -2208,6 +2209,20 @@ class GenericIE(InfoExtractor): 'uploader': 'OTT Videos', }, }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, ] def report_following_redirect(self, new_url): @@ -2574,6 +2589,10 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') + arc_urls = ArcPublishingIE._extract_urls(webpage) + if arc_urls: + return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 625d0a1cc..8afb1af83 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -4,17 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - strip_jsonp, -) class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' - _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _TEST = { + _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', 'info_dict': { @@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Egypt finds belongings, debris from plane crash', 'description': 'md5:a17ceee432f215a5371388c1f680bd86', 'upload_date': '20160520', - 'uploader': 'Reuters', - 'timestamp': 1463778452, + 'timestamp': 1463775187, }, - } + }, { + 'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html', + 'only_matching': True, + }, { + 'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html', + 'only_matching': True, + }] @classmethod def _extract_urls(cls, webpage): @@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, - video_id, transform_source=strip_jsonp)[0]['contentConfig'] - title = video_data['title'] - - urls = [] - formats = [] - for s in video_data.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - urls.append(s_url) - video_type = s.get('type') - if video_type == 'smil': - continue - elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): - m3u8_formats = self._extract_m3u8_formats( - s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - for m3u8_format in m3u8_formats: - width = m3u8_format.get('width') - if not width: - continue - vbr = self._search_regex( - r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) - if vbr: - m3u8_format.update({ - 'vbr': int_or_none(vbr), - }) - formats.extend(m3u8_formats) - else: - width = int_or_none(s.get('width')) - vbr = int_or_none(s.get('bitrate')) - has_width = width != 0 - formats.append({ - 'format_id': ( - '%s-%d-%d' % (video_type, width, vbr) - if width - else video_type), - 'vbr': vbr if has_width else None, - 'width': width, - 'height': int_or_none(s.get('height')), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if has_width else 'none', - 'filesize': int_or_none(s.get('fileSize')), - 'url': s_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, - }) - source_media_url = video_data.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats( - formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('blurb'), - 'uploader': video_data.get('credits', {}).get('source'), - 'formats': formats, - 'duration': int_or_none(video_data.get('videoDuration'), 100), - 'timestamp': int_or_none( - video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), - } + return self.url_result( + 'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id) class WashingtonPostArticleIE(InfoExtractor): @@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Breaking Points: The Paper Mine', 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', - 'uploader': 'The Washington Post', - 'timestamp': 1395527908, - 'upload_date': '20140322', + 'timestamp': 1395440416, + 'upload_date': '20140321', }, }, { 'md5': '1fff6a689d8770966df78c8cb6c8c17c', @@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', 'duration': 2220, - 'timestamp': 1395528005, - 'upload_date': '20140322', - 'uploader': 'The Washington Post', + 'timestamp': 1395441819, + 'upload_date': '20140321', }, }], }, { @@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'ext': 'mp4', 'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.', 'upload_date': '20141230', - 'uploader': 'The Washington Post', - 'timestamp': 1419974765, + 'timestamp': 1419972442, 'title': 'Why black boxes don’t transmit data in real time', } }] From 40d66e07dfef72fbef761ac720b82eb77deb7398 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 23:42:17 +0100 Subject: [PATCH 254/424] [arcpublishing] add missing staticmethod decorator --- youtube_dl/extractor/arcpublishing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py index d1fb1a054..ca6a6c4d8 100644 --- a/youtube_dl/extractor/arcpublishing.py +++ b/youtube_dl/extractor/arcpublishing.py @@ -73,6 +73,7 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] + @staticmethod def _extract_urls(webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview From 8c17afc4710cb39801bdae5027fe3bf641bf485d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 17:05:42 +0100 Subject: [PATCH 255/424] [acast] fix extraction(closes #21444)(closes #27612)(closes #27613) --- youtube_dl/extractor/acast.py | 116 ++++++++++++++++------------------ 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index b17c792d2..60378db1b 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -2,21 +2,47 @@ from __future__ import unicode_literals import re -import functools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, - float_or_none, int_or_none, - try_get, - unified_timestamp, - OnDemandPagedList, + parse_iso8601, ) -class ACastIE(InfoExtractor): +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': episode['url'], + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): IE_NAME = 'acast' _VALID_URL = r'''(?x) https?:// @@ -28,15 +54,15 @@ class ACastIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', 'timestamp': 1477346700, 'upload_date': '20161024', - 'duration': 2766.602563, + 'duration': 2766, 'creator': 'Anton Berg & Martin Johnson', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', @@ -45,7 +71,7 @@ class ACastIE(InfoExtractor): 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', 'only_matching': True, }, { 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', @@ -54,40 +80,14 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - s = self._download_json( - 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), - display_id) - media_url = s['url'] - if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): - episode_url = s.get('episodeUrl') - if episode_url: - display_id = episode_url - else: - channel, display_id = re.match(self._VALID_URL, s['link']).groups() - cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), - display_id)['result'] - e = cast_data['episode'] - title = e.get('name') or s['title'] - return { - 'id': compat_str(e['id']), - 'display_id': display_id, - 'url': media_url, - 'title': title, - 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), - 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), - 'duration': float_or_none(e.get('duration') or s.get('duration')), - 'filesize': int_or_none(e.get('contentLength')), - 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), - 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), - 'season_number': int_or_none(e.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(e.get('episodeNumber')), - } + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) -class ACastChannelIE(InfoExtractor): +class ACastChannelIE(ACastBaseIE): IE_NAME = 'acast:channel' _VALID_URL = r'''(?x) https?:// @@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor): 'info_dict': { 'id': '4efc5294-5385-4847-98bd-519799ce5786', 'title': 'Today in Focus', - 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', }, - 'playlist_mincount': 35, + 'playlist_mincount': 200, }, { 'url': 'http://play.acast.com/s/ft-banking-weekly', 'only_matching': True, }] - _API_BASE_URL = 'https://play.acast.com/api/' - _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _fetch_page(self, channel_slug, page): - casts = self._download_json( - self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), - channel_slug, note='Download page %d of channel data' % page) - for cast in casts: - yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) From d0d838638c8b49514c8bf093d6c76fea98019971 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 18:30:56 +0100 Subject: [PATCH 256/424] [stitcher] fix extraction(closes #20811)(closes #27606) --- youtube_dl/extractor/stitcher.py | 60 ++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 97d1ff681..b8b5711b1 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -4,25 +4,28 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, + clean_html, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, ) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20180126', + 'timestamp': 1516989316, }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + display_id, audio_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + resp = self._download_json( + 'https://api.prod.stitcher.com/episode/' + audio_id, + display_id or audio_id) + episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) + if not episode: + raise ExtractorError(resp['errors'][0]['message'], expected=True) - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] + title = episode['title'].strip() + audio_url = episode['audio_url'] - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + thumbnail = None + show_id = episode.get('show_id') + if show_id and episode.get('classic_id') != -1: + thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id return { 'id': audio_id, 'display_id': display_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': clean_html(episode.get('html_description') or episode.get('description')), + 'duration': int_or_none(episode.get('duration')), 'thumbnail': thumbnail, - 'formats': formats, + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_created')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), } From 75972e200d033429bf9d34793ad3ffc813042347 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 20:21:59 +0100 Subject: [PATCH 257/424] [vvvvid] fix season metadata extraction(#18130) --- youtube_dl/extractor/vvvvid.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 145805492..f4cae7fe9 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -25,7 +25,6 @@ class VVVVIDIE(InfoExtractor): 'duration': 239, 'series': '"Perché dovrei guardarlo?" di Dario Moccia', 'season_id': '437', - 'season_number': 1, 'episode': 'Ping Pong', 'episode_number': 1, 'episode_id': '3334', @@ -75,7 +74,6 @@ class VVVVIDIE(InfoExtractor): def _extract_common_video_info(self, video_data): return { 'thumbnail': video_data.get('thumbnail'), - 'episode_number': int_or_none(video_data.get('number')), 'episode_id': str_or_none(video_data.get('id')), } @@ -145,6 +143,17 @@ class VVVVIDIE(InfoExtractor): return d + info = {} + + def metadata_from_url(r_url): + if not info and r_url: + mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) + if mobj: + info['episode_number'] = int(mobj.group(2)) + season_number = mobj.group(1) + if season_number: + info['season_number'] = int(season_number) + for quality in ('_sd', ''): embed_code = video_data.get('embed_info' + quality) if not embed_code: @@ -166,9 +175,12 @@ class VVVVIDIE(InfoExtractor): else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + metadata_from_url(embed_code) + self._sort_formats(formats) - info = self._extract_common_video_info(video_data) + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, @@ -176,7 +188,6 @@ class VVVVIDIE(InfoExtractor): 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, - 'season_number': video_data.get('season_number'), 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), @@ -211,7 +222,6 @@ class VVVVIDShowIE(VVVVIDIE): entries = [] for season in (seasons or []): - season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: if episode.get('playable') is False: @@ -227,7 +237,6 @@ class VVVVIDShowIE(VVVVIDIE): 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), - 'season_number': season_number, 'season_id': season_id, }) entries.append(info) From 2df93a0c4ada8eff399844ac4a249a743e0a980d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 05:01:45 +0700 Subject: [PATCH 258/424] [nrktv] Switch to playback endpoint mediaelement endpoint is no longer in use. --- youtube_dl/extractor/nrk.py | 273 ++++++++---------------------------- 1 file changed, 57 insertions(+), 216 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index b545f291b..871e4845c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,15 +6,11 @@ import random import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_age_limit, parse_duration, try_get, urljoin, @@ -63,60 +59,8 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('http://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query) - - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?\#&]+) - ''' - - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }] + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id @@ -178,6 +122,59 @@ class NRKIE(NRKBaseIE): 'formats': formats, } + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) @@ -187,7 +184,6 @@ class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -290,164 +286,9 @@ class NRKTVIE(NRKBaseIE): 'only_matching': True, }] - _api_host = None - - def _extract_from_mediaelement(self, video_id): - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - urls = [] - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url or asset_url in urls: - continue - urls.append(asset_url) - formats = self._extract_nrk_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - 'is_live': live, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url and media_url not in urls: - formats = self._extract_nrk_formats(media_url, video_id) - if formats: - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - 'is_live': live, - }] - - if not entries: - self._raise_error(data) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P\d{,2})e(?P\d{,2})\.', - r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, - } - - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) - def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_mediaelement(video_id) + return self._extract_from_playback(video_id) class NRKTVEpisodeIE(InfoExtractor): From 21a42e2588226b781a02124e26d709a9562c8fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:29:52 +0700 Subject: [PATCH 259/424] [nrk] Improve extraction (closes #27634, closes #27635) + Add support for mp3 formats * Generalize and delegate all item extractors to nrk, beware ie key breakages + Add support for podcasts + Generalize nrk shortcut form to support all kind of ids --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 248 ++++++++++++++++++++--------- 2 files changed, 172 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d9d3f4940..74743a449 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -790,6 +790,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKRadioPodkastIE, NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeasonIE, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 871e4845c..9621522d4 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -62,66 +62,6 @@ class NRKBaseIE(InfoExtractor): fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) - def _extract_from_playback(self, video_id): - path_templ = 'playback/%s/' + video_id - - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) - # known values for preferredCdn: akamai, iponly, minicdn and telenor - manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) - - if manifest.get('playability') == 'nonPlayable': - self._raise_error(manifest['nonPlayable']) - - playable = manifest['playable'] - - formats = [] - for asset in playable['assets']: - if not isinstance(asset, dict): - continue - if asset.get('encrypted'): - continue - format_url = url_or_none(asset.get('url')) - if not format_url: - continue - if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_nrk_formats(format_url, video_id)) - self._sort_formats(formats) - - data = call_playback_api('metadata') - - preplay = data['preplay'] - titles = preplay['titles'] - title = titles['title'] - alt_title = titles.get('subtitle') - - description = preplay.get('description') - duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) - - thumbnails = [] - for image in try_get( - preplay, lambda x: x['poster']['images'], list) or []: - if not isinstance(image, dict): - continue - image_url = url_or_none(image.get('url')) - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('pixelWidth')), - 'height': int_or_none(image.get('pixelHeight')), - }) - - return { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - } - class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -173,14 +113,97 @@ class NRKIE(NRKBaseIE): }, { 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + # episode + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, }] + def _extract_from_playback(self, video_id): + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) -class NRKTVIE(NRKBaseIE): +class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE @@ -288,7 +311,8 @@ class NRKTVIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_playback(video_id) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -359,8 +383,6 @@ class NRKTVSerieBaseIE(NRKBaseIE): nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue - if not re.match(NRKTVIE._EPISODE_RE, nrk_id): - continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries @@ -372,6 +394,10 @@ class NRKTVSerieBaseIE(NRKBaseIE): if embedded.get(asset_key): return asset_key + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + def _entries(self, data, display_id): for page_num in itertools.count(1): embedded = data.get('_embedded') or data @@ -405,7 +431,16 @@ class NRKTVSerieBaseIE(NRKBaseIE): class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?Ptv|radio)\.nrk\.no/ + (?Pserie|pod[ck]ast)/ + (?P[^/]+)/ + (?: + (?:sesong/)?(?P\d+)| + sesong/(?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { @@ -441,19 +476,34 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): # 180 entries, single page 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - domain, serie, season_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') display_id = '%s/%s' % (serie, season_id) data = self._call_api( - '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id @@ -463,7 +513,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?Pserie|pod[ck]ast)/(?P[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -523,23 +573,33 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://nrksuper.no/serie/labyrint', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, }] @classmethod def suitable(cls, url): return ( False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - site, series_id = re.match(self._VALID_URL, url).groups() + site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() is_radio = site == 'radio.nrk' domain = 'radio' if is_radio else 'tv' size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], @@ -554,12 +614,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): embedded_seasons = embedded.get('seasons') or [] if len(linked_seasons) > len(embedded_seasons): for season in linked_seasons: - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), + season_url, ie=NRKTVSeasonIE.ie_key(), video_title=season.get('title'))) else: for season in embedded_seasons: @@ -584,6 +646,38 @@ class NRKTVDirekteIE(NRKTVIE): }] +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?Pl_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + class NRKPlaylistBaseIE(InfoExtractor): def _extract_description(self, webpage): pass From 1f1d01d498fee8120bb3415b1b1a20e21259bbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:41:37 +0700 Subject: [PATCH 260/424] [nrk] Add more shortcut tests --- youtube_dl/extractor/nrk.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 9621522d4..61a7c9aad 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -117,18 +117,30 @@ class NRKIE(NRKBaseIE): # podcast 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, }, { # clip 'url': 'nrk:150533', 'only_matching': True, }, { - # episode + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program 'url': 'nrk:MDDP12000117', 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, }, { # direkte 'url': 'nrk:nrk1', 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, }] def _extract_from_playback(self, video_id): From 7b643d4cd0f5aa02149e29d8212acd42038da63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:44:28 +0700 Subject: [PATCH 261/424] [nrk] Improve video id extraction --- youtube_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 61a7c9aad..5f12b0d9e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -151,6 +151,8 @@ class NRKIE(NRKBaseIE): # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -211,7 +213,7 @@ class NRKIE(NRKBaseIE): } def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).split('/')[-1] return self._extract_from_playback(video_id) From cabfd4b1f0354518068bbdf0718cd36497d5c8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:45:17 +0700 Subject: [PATCH 262/424] [nrk] Inline _extract_from_playback --- youtube_dl/extractor/nrk.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5f12b0d9e..520206534 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -143,7 +143,9 @@ class NRKIE(NRKBaseIE): 'only_matching': True, }] - def _extract_from_playback(self, video_id): + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] + path_templ = 'playback/%s/' + video_id def call_playback_api(item, query=None): @@ -212,10 +214,6 @@ class NRKIE(NRKBaseIE): 'formats': formats, } - def _real_extract(self, url): - video_id = self._match_id(url).split('/')[-1] - return self._extract_from_playback(video_id) - class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From a25d03d7cb52111fab906c2c180f9bdf2c52ed12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:51:21 +0700 Subject: [PATCH 263/424] [nrk] Fix age limit extraction --- youtube_dl/extractor/nrk.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 520206534..d023de7f7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -204,6 +204,9 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + age_limit = int_or_none(try_get( + data, lambda x: x['legalAge']['body']['rating']['code'])) + return { 'id': video_id, 'title': title, @@ -211,6 +214,7 @@ class NRKIE(NRKBaseIE): 'description': description, 'duration': duration, 'thumbnails': thumbnails, + 'age_limit': age_limit, 'formats': formats, } From 7dfd966848e17e3d438e5e49e1088b8b1444f9e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:02:13 +0700 Subject: [PATCH 264/424] [nrk] Extract subtitles --- youtube_dl/extractor/nrk.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index d023de7f7..bd96d9d14 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + str_or_none, try_get, urljoin, url_or_none, @@ -204,6 +205,21 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) @@ -216,6 +232,7 @@ class NRKIE(NRKBaseIE): 'thumbnails': thumbnails, 'age_limit': age_limit, 'formats': formats, + 'subtitles': subtitles, } From 85de33b04ef339005cea949576dda4b553296f50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:30:45 +0700 Subject: [PATCH 265/424] [nrk] Improve series metadata extraction --- youtube_dl/extractor/nrk.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index bd96d9d14..20a5d7673 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -223,7 +223,9 @@ class NRKIE(NRKBaseIE): age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) - return { + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { 'id': video_id, 'title': title, 'alt_title': alt_title, @@ -235,6 +237,27 @@ class NRKIE(NRKBaseIE): 'subtitles': subtitles, } + if is_series: + series = title + if alt_title: + title += ' - %s' % alt_title + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + episode = alt_title if is_series else None + episode_number = int_or_none(self._search_regex( + r'(\d+)\.\s+episode', episode or '', 'episode number', + default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From bc2f83b95e02c9a75e576109ca0bced8a6c67eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:31:03 +0700 Subject: [PATCH 266/424] [nrktv] Fix tests --- youtube_dl/extractor/nrk.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 20a5d7673..4fb7df959 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -267,7 +267,7 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { - 'id': 'MDDP12000117AA', + 'id': 'MDDP12000117', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', @@ -278,24 +278,25 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { - 'id': 'MUHH48000314AA', + 'id': 'MUHH48000314', 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, 'series': '20 spørsmål', - 'episode': '23.05.2014', + 'episode': '23. mai 2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { - 'id': 'MDFP15000514CA', + 'id': 'MDFP15000514', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', }, 'params': { 'skip_download': True, @@ -304,7 +305,7 @@ class NRKTVIE(InfoExtractor): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', @@ -317,22 +318,23 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { - 'id': 'KMTE50001317AA', + 'id': 'KMTE50001317', 'ext': 'mp4', - 'title': 'Anno 13:30', + 'title': 'Anno - 13. episode', 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', 'duration': 2340, 'series': 'Anno', - 'episode': '13:30', + 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, }, @@ -342,7 +344,7 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { - 'id': 'MUHH46000317AA', + 'id': 'MUHH46000317', 'ext': 'mp4', 'title': 'Nytt på Nytt 27.01.2017', 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', From b2d9fd9c9f7d4b9f6b6bebbbb8f7e799cb08d4b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:53:05 +0700 Subject: [PATCH 267/424] [nrk] Improve episode and season number extraction --- youtube_dl/extractor/nrk.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 4fb7df959..48387420c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -246,7 +246,9 @@ class NRKIE(NRKBaseIE): default=None)) episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( - r'(\d+)\.\s+episode', episode or '', 'episode number', + r'^(\d+)\.', episode or '', 'episode number', + default=None)) or int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) info.update({ 'title': title, @@ -374,19 +376,19 @@ class NRKTVIE(InfoExtractor): class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', 'info_dict': { - 'id': 'MUHH36005220BA', + 'id': 'MUHH36005220', 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, 'series': 'Hellums kro', - 'season_number': 1, + # 'season_number': 1, 'episode_number': 2, - 'episode': '2:6', + 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, }, 'params': { @@ -395,15 +397,15 @@ class NRKTVEpisodeIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { - 'id': 'MSUI14000816AA', + 'id': 'MSUI14000816', 'ext': 'mp4', - 'title': 'Backstage 8:30', + 'title': 'Backstage - 8. episode', 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', 'duration': 1320, 'series': 'Backstage', 'season_number': 1, 'episode_number': 8, - 'episode': '8:30', + 'episode': '8. episode', }, 'params': { 'skip_download': True, @@ -412,7 +414,7 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -424,10 +426,12 @@ class NRKTVEpisodeIE(InfoExtractor): assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ - '_type': 'url_transparent', + '_type': 'url', 'id': nrk_id, 'url': 'nrk:%s' % nrk_id, 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), }) return info From 1aaee908b9ba12fc363c65daa2ef7d1690472a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:57:12 +0700 Subject: [PATCH 268/424] [nrk] PEP 8 --- youtube_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 48387420c..2873d7938 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -247,9 +247,11 @@ class NRKIE(NRKBaseIE): episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( r'^(\d+)\.', episode or '', 'episode number', - default=None)) or int_or_none(self._search_regex( - r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) info.update({ 'title': title, 'series': series, From 5c6e84c0ff732559d8307e9e823b5a07b0bcc9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 08:31:26 +0700 Subject: [PATCH 269/424] [nrk] Improve series metadata extraction (closes #27473) --- youtube_dl/extractor/nrk.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 2873d7938..5d33355e7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -238,16 +238,29 @@ class NRKIE(NRKBaseIE): } if is_series: - series = title + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title if alt_title: title += ' - %s' % alt_title - season_number = int_or_none(self._search_regex( - r'Sesong\s+(\d+)', description or '', 'season number', - default=None)) - episode = alt_title if is_series else None - episode_number = int_or_none(self._search_regex( - r'^(\d+)\.', episode or '', 'episode number', - default=None)) + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) if not episode_number: episode_number = int_or_none(self._search_regex( r'\((\d+)\s*:\s*\d+\)', description or '', @@ -255,6 +268,7 @@ class NRKIE(NRKBaseIE): info.update({ 'title': title, 'series': series, + 'season_id': season_id, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, @@ -388,7 +402,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', 'duration': 1563.92, 'series': 'Hellums kro', - # 'season_number': 1, + 'season_number': 1, 'episode_number': 2, 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, From d5b8cf093cdcf3ba873dba9aad898a3af91429e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 08:58:05 +0700 Subject: [PATCH 270/424] [nrk] Fix age limit extraction --- youtube_dl/extractor/nrk.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5d33355e7..69178e157 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -220,8 +220,15 @@ class NRKIE(NRKBaseIE): 'url': sub_url, }) - age_limit = int_or_none(try_get( - data, lambda x: x['legalAge']['body']['rating']['code'])) + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + else: + age_limit = None is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' @@ -304,6 +311,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1741, 'series': '20 spørsmål', 'episode': '23. mai 2014', + 'age_limit': 0, }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', @@ -315,6 +323,7 @@ class NRKTVIE(InfoExtractor): 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -327,6 +336,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -340,6 +350,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'Ikke tilgjengelig utenfor Norge', @@ -355,6 +366,7 @@ class NRKTVIE(InfoExtractor): 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -369,6 +381,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1796, 'series': 'Nytt på nytt', 'episode': '27.01.2017', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -422,6 +435,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'season_number': 1, 'episode_number': 8, 'episode': '8. episode', + 'age_limit': 0, }, 'params': { 'skip_download': True, From f4afb9a6a8832c48888956d1e8722cb5bd36a78e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 09:10:40 +0700 Subject: [PATCH 271/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 3782ad090..4f5fbc7f8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors +* [nrk] Improve series metadata extraction (#27473) ++ [nrk] Extract subtitles +* [nrk] Fix age limit extraction +* [nrk] Improve video id extraction ++ [nrk] Add support for podcasts (#27634, #27635) +* [nrk] Generalize and delegate all item extractors to nrk ++ [nrk] Add support for mp3 formats +* [nrktv] Switch to playback endpoint +* [vvvvid] Fix season metadata extraction (#18130) +* [stitcher] Fix extraction (#20811, #27606) +* [acast] Fix extraction (#21444, #27612, #27613) ++ [arcpublishing] Add support for arcpublishing.com (#2298, #9340, #17200) ++ [sky] Add support for Sports News articles and Brighcove videos (#13054) ++ [vvvvid] Extract akamai formats +* [vvvvid] Skip unplayable episodes (#27599) +* [yandexvideo] Fix extraction for Python 3.4 + + version 2020.12.31 Core From 8e953dcbb10a1a42f4e12e4e132657cb0100a1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 09:12:47 +0700 Subject: [PATCH 272/424] release 2021.01.03 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d1a6ad1f6..ee2da644b 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.31 + [debug] youtube-dl version 2021.01.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index ded5beadf..7d535b6d1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 0e06de8dc..74674ce62 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index dfbfe3701..7b0402d41 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.31 + [debug] youtube-dl version 2021.01.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 30061808d..17ffff465 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4f5fbc7f8..9942ee578 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.01.03 Extractors * [nrk] Improve series metadata extraction (#27473) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b1112f83b..8178af5b6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -57,6 +57,7 @@ - **appletrailers** - **appletrailers:section** - **archive.org**: archive.org videos + - **ArcPublishing** - **ARD** - **ARD:mediathek** - **ARDBetaMediathek** @@ -610,6 +611,7 @@ - **Npr** - **NRK** - **NRKPlaylist** + - **NRKRadioPodkast** - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte @@ -813,12 +815,13 @@ - **ShowRoomLive** - **Sina** - **sky.it** + - **sky:news** + - **sky:sports** + - **sky:sports:news** - **skyacademy.it** - **SkylineWebcams** - - **SkyNews** - **skynewsarabia:article** - **skynewsarabia:video** - - **SkySports** - **Slideshare** - **SlidesLive** - **Slutload** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f795f0735..1588ee8e4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.31' +__version__ = '2021.01.03' From ac71fd5919302f0d42c0cd79e04522cab8ab0318 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 10:04:32 +0100 Subject: [PATCH 273/424] [stv] improve episode id extraction(closes #23083) --- youtube_dl/extractor/stv.py | 42 ++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/youtube_dl/extractor/stv.py +++ b/youtube_dl/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r']+id="__NEXT_DATA__"[^>]*>({.+?})', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, From 8522bcd97c4173407261a3fa0283dd7800c39601 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 12:12:06 +0100 Subject: [PATCH 274/424] [stitcher] Add support for shows and show metadata extraction(closes #20510) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/stitcher.py | 120 +++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 74743a449..d1e1e9a60 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1092,7 +1092,10 @@ from .spike import ( BellatorIE, ParamountNetworkIE, ) -from .stitcher import StitcherIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index b8b5711b1..3dd0d3b5f 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -1,19 +1,60 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, int_or_none, str_or_none, try_get, + url_or_none, ) -class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' +class StitcherBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' + + def _call_api(self, path, video_id, query): + resp = self._download_json( + 'https://api.prod.stitcher.com/' + path, + video_id, query=query) + error_massage = try_get(resp, lambda x: x['errors'][0]['message']) + if error_massage: + raise ExtractorError(error_massage, expected=True) + return resp['data'] + + def _extract_description(self, data): + return clean_html(data.get('html_description') or data.get('description')) + + def _extract_audio_url(self, episode): + return url_or_none(episode.get('audio_url') or episode.get('guid')) + + def _extract_show_info(self, show): + return { + 'thumbnail': show.get('image_base_url'), + 'series': show.get('title'), + } + + def _extract_episode(self, episode, audio_url, show_info): + info = { + 'id': compat_str(episode['id']), + 'display_id': episode.get('slug'), + 'title': episode['title'].strip(), + 'description': self._extract_description(episode), + 'duration': int_or_none(episode.get('duration')), + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_published')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + } + info.update(show_info) + return info + + +class StitcherIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': 'e9635098e0da10b21a0e2b85585530f6', @@ -24,8 +65,9 @@ class StitcherIE(InfoExtractor): 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20180126', - 'timestamp': 1516989316, + 'upload_date': '20151008', + 'timestamp': 1444285800, + 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -55,33 +97,47 @@ class StitcherIE(InfoExtractor): }] def _real_extract(self, url): - display_id, audio_id = re.match(self._VALID_URL, url).groups() + audio_id = self._match_id(url) + data = self._call_api( + 'shows/episodes', audio_id, {'episode_ids': audio_id}) + episode = data['episodes'][0] + audio_url = self._extract_audio_url(episode) + if not audio_url: + self.raise_login_required() + show = try_get(data, lambda x: x['shows'][0], dict) or {} + return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) - resp = self._download_json( - 'https://api.prod.stitcher.com/episode/' + audio_id, - display_id or audio_id) - episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) - if not episode: - raise ExtractorError(resp['errors'][0]['message'], expected=True) - title = episode['title'].strip() - audio_url = episode['audio_url'] +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] - thumbnail = None - show_id = episode.get('show_id') - if show_id and episode.get('classic_id') != -1: - thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': clean_html(episode.get('html_description') or episode.get('description')), - 'duration': int_or_none(episode.get('duration')), - 'thumbnail': thumbnail, - 'url': audio_url, - 'vcodec': 'none', - 'timestamp': int_or_none(episode.get('date_created')), - 'season_number': int_or_none(episode.get('season')), - 'season_id': str_or_none(episode.get('season_id')), - } + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) + + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) From 491ee7efe45c287cfb8b28e6d74290d844b8bbb8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 13:29:00 +0100 Subject: [PATCH 275/424] [twitter] try to use a Generic fallback for unknown twitter cards(closes #25982) --- youtube_dl/extractor/twitter.py | 52 +++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 4284487db..a35e1686c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -380,6 +381,14 @@ class TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, }] def _real_extract(self, url): @@ -462,7 +471,25 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + # amplify, promo_video_website, promo_video_convo, appplayer, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +515,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: From 19ec46863587758fa6de274df293ede09b10eeb3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 13:37:24 +0100 Subject: [PATCH 276/424] [twitter] Add support for summary card(closes #25121) --- youtube_dl/extractor/twitter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a35e1686c..1190d721e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -488,6 +488,11 @@ class TwitterIE(TwitterBaseIE): 'url': get_binding_value('broadcast_url'), 'ie_key': TwitterBroadcastIE.ie_key(), }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) # amplify, promo_video_website, promo_video_convo, appplayer, ... else: is_amplify = card_name == 'amplify' From 0021a2b9a14626b1b56c77bec4d93e70a217d32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 22:25:17 +0700 Subject: [PATCH 277/424] [nrktv] Add subtitles test --- youtube_dl/extractor/nrk.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 69178e157..cafb85616 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From 0889eb33e0d40d567be5b2f8431952a5517276fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 4 Jan 2021 00:17:51 +0700 Subject: [PATCH 278/424] [xfileshare] Add support for aparat.cam (closes #27651) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 48ef07ed1..cbd5d1cbb 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod From e88c9ef62a4a26cc77370b741a4244d298c7d45a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:51:55 +0100 Subject: [PATCH 279/424] [utils] add a function to clean podcast URLs --- test/test_utils.py | 5 +++++ youtube_dl/utils.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index d49d3239c..259c4763e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from youtube_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -1470,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5fb6fd24..8e4d144c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -5706,3 +5706,20 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random_date.month), day_field: str(random_date.day), } + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) From a563c97c5cddf55f8989ed7ea8314ef78e30107f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:53:32 +0100 Subject: [PATCH 280/424] [stitcher] clean podcast URLs --- youtube_dl/extractor/stitcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 3dd0d3b5f..822782507 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, + clean_podcast_url, ExtractorError, int_or_none, str_or_none, @@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor): 'title': episode['title'].strip(), 'description': self._extract_description(episode), 'duration': int_or_none(episode.get('duration')), - 'url': audio_url, + 'url': clean_podcast_url(audio_url), 'vcodec': 'none', 'timestamp': int_or_none(episode.get('date_published')), 'season_number': int_or_none(episode.get('season')), From 0e96b4b5ce79a5f5c990247f6260da40f7d71a7c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:53:58 +0100 Subject: [PATCH 281/424] [acast] clean podcast URLs --- youtube_dl/extractor/acast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 60378db1b..b9355a2c8 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + clean_podcast_url, int_or_none, parse_iso8601, ) @@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor): info = { 'id': episode['id'], 'display_id': episode.get('episodeUrl'), - 'url': episode['url'], + 'url': clean_podcast_url(episode['url']), 'title': title, 'description': clean_html(episode.get('description') or episode.get('summary')), 'thumbnail': episode.get('image'), From 9c484c0019ba16cd4b4d686930bfa038f1bf9320 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:00:31 +0100 Subject: [PATCH 282/424] [iheart] Add new extractor for iHeartRadio(#27037) --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/iheart.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/iheart.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d1e1e9a60..82d440a6d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -463,6 +463,10 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py new file mode 100644 index 000000000..6710baeb4 --- /dev/null +++ b/youtube_dl/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + str_or_none, +) + + +class IHeartRadioBaseIE(InfoExtractor): + def _call_api(self, path, video_id, fatal=True, query=None): + return self._download_json( + 'https://api.iheart.com/api/v3/podcast/' + path, + video_id, fatal=fatal, query=query) + + def _extract_episode(self, episode): + return { + 'thumbnail': episode.get('imageUrl'), + 'description': episode.get('description'), + 'timestamp': int_or_none(episode.get('startDate'), 1000), + 'duration': int_or_none(episode.get('duration')), + } + + +class IHeartRadioIE(IHeartRadioBaseIE): + IENAME = 'iheartradio' + _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P[^/?&#]+)-|iheartradio:)(?P\d+)' + _TEST = { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'timestamp': 1597741200, + 'upload_date': '20200818', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api( + 'episodes/' + episode_id, episode_id)['episode'] + info = self._extract_episode(episode) + print(episode['mediaUrl']) + info.update({ + 'id': episode_id, + 'title': episode['title'], + 'url': clean_podcast_url(episode['mediaUrl']), + }) + return info + + +class IHeartRadioPodcastIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio:podcast' + _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P\d+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'id': '30717896', + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277', + 'only_matching': True, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + path = 'podcasts/' + podcast_id + episodes = self._call_api( + path + '/episodes', podcast_id, query={'limit': 1000000000})['data'] + + entries = [] + for episode in episodes: + episode_id = str_or_none(episode.get('id')) + if not episode_id: + continue + info = self._extract_episode(episode) + info.update({ + '_type': 'url', + 'id': episode_id, + 'title': episode.get('title'), + 'url': 'iheartradio:' + episode_id, + 'ie_key': IHeartRadioIE.ie_key(), + }) + entries.append(info) + + podcast = self._call_api(path, podcast_id, False) or {} + + return self.playlist_result( + entries, podcast_id, podcast.get('title'), podcast.get('description')) From 8487e8b98afd1b469c2b9d29ee53bd173ff9a7e0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:04:20 +0100 Subject: [PATCH 283/424] [googlepodcasts] Add new extractor --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/googlepodcasts.py | 88 ++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/googlepodcasts.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 82d440a6d..3668197b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -423,6 +423,10 @@ from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE diff --git a/youtube_dl/extractor/googlepodcasts.py b/youtube_dl/extractor/googlepodcasts.py new file mode 100644 index 000000000..31ad79907 --- /dev/null +++ b/youtube_dl/extractor/googlepodcasts.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/]+)/episode/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) From ac61f2e0581ad15727870e8dd9a80ddacf01636e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:09:11 +0100 Subject: [PATCH 284/424] [applepodcasts] Add new extractor(#25918) --- youtube_dl/extractor/applepodcasts.py | 61 +++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/applepodcasts.py diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/youtube_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3668197b1..dbc8f89a6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -55,6 +55,7 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg import ArchiveOrgIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE From 964a8eb754dd082069419a2fda1ecedea62b7cc5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:13:53 +0100 Subject: [PATCH 285/424] [googleplus] Remove Extractor(closes #4955)(closes #7400) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/googleplus.py | 73 ------------------------------ 2 files changed, 74 deletions(-) delete mode 100644 youtube_dl/extractor/googleplus.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dbc8f89a6..a82f6e5f0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -423,7 +423,6 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE from .googlepodcasts import ( GooglePodcastsIE, GooglePodcastsFeedIE, diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/youtube_dl/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x) - ([0-9]{4}-[0-9]{2}-[0-9]{2})''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r' Date: Mon, 4 Jan 2021 09:42:27 +0100 Subject: [PATCH 286/424] [iheart] remove print statement --- youtube_dl/extractor/iheart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 6710baeb4..7a7295ff4 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -45,7 +45,6 @@ class IHeartRadioIE(IHeartRadioBaseIE): episode = self._call_api( 'episodes/' + episode_id, episode_id)['episode'] info = self._extract_episode(episode) - print(episode['mediaUrl']) info.update({ 'id': episode_id, 'title': episode['title'], From 745db8899d77c56bf14443be60970aed1d9e2bdd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 09:59:20 +0100 Subject: [PATCH 287/424] [iheart] clean HTML tags from episode description --- youtube_dl/extractor/iheart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 7a7295ff4..266c67a76 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, int_or_none, str_or_none, @@ -18,7 +19,7 @@ class IHeartRadioBaseIE(InfoExtractor): def _extract_episode(self, episode): return { 'thumbnail': episode.get('imageUrl'), - 'description': episode.get('description'), + 'description': clean_html(episode.get('description')), 'timestamp': int_or_none(episode.get('startDate'), 1000), 'duration': int_or_none(episode.get('duration')), } From 8a3797a4abdc0b63b6fcbd1fbc9d81acc57ec142 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 12:16:54 +0100 Subject: [PATCH 288/424] [nrk] fix extraction for videos without a legalAge rating --- youtube_dl/extractor/nrk.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cafb85616..40dee2162 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -223,12 +223,12 @@ class NRKIE(NRKBaseIE): legal_age = try_get( data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - else: - age_limit = None + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' From f6ea29e24bf67d13d21de3c623975891a8d61ac1 Mon Sep 17 00:00:00 2001 From: Yurii H Date: Mon, 4 Jan 2021 18:16:27 +0200 Subject: [PATCH 289/424] [iheart] Update test description value (#27037) the description has no HTML tags now. --- youtube_dl/extractor/iheart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 266c67a76..b54c05eeb 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -35,7 +35,7 @@ class IHeartRadioIE(IHeartRadioBaseIE): 'id': '70346499', 'ext': 'mp3', 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', - 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', 'timestamp': 1597741200, 'upload_date': '20200818', } From 766fcdd0fad8495222b2b5a14f1626960cd79d89 Mon Sep 17 00:00:00 2001 From: Kevin O'Connor Date: Mon, 4 Jan 2021 13:24:01 -0500 Subject: [PATCH 290/424] [downloader/hls] Disable decryption in tests (#27660) Tests truncate the download to 10241 bytes, which is not divisible by 16 and cannot be decrypted. Tests don't really care about the decrypted content, just that the data they retrieved is the expected data. Therefore, it's fine to just return the encrypted data to tests. See: #27621 and #27620 --- youtube_dl/downloader/hls.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 5e1ff4f6b..7aaebc940 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -172,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: From 9237aaa77f7e2f0b9fc17c66589423632172f473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 01:34:28 +0700 Subject: [PATCH 291/424] [workflows/ci.yml] Add support for jython --- .github/workflows/ci.yml | 26 +++++++++++++++++++++++++- devscripts/install_jython.sh | 5 ----- 2 files changed, 25 insertions(+), 6 deletions(-) delete mode 100755 devscripts/install_jython.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99944e848..ac34196cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,28 +10,52 @@ jobs: os: [ubuntu-latest] # TODO: python 2.6 python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + python-impl: [cpython] ytdl-test-set: [core, download] run-tests-ext: [sh] include: # python 3.2 is only available on windows via setup-python - os: windows-latest python-version: 3.2 + python-impl: cpython ytdl-test-set: core run-tests-ext: bat - os: windows-latest python-version: 3.2 + python-impl: cpython ytdl-test-set: download run-tests-ext: bat + # jython + - os: ubuntu-latest + python-impl: jython + ytdl-test-set: core + run-tests-ext: sh + - os: ubuntu-latest + python-impl: jython + ytdl-test-set: download + run-tests-ext: sh steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 + if: ${{ matrix.python-impl == 'cpython' }} with: python-version: ${{ matrix.python-version }} + - name: Set up Java 8 + if: ${{ matrix.python-impl == 'jython' }} + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Install Jython + if: ${{ matrix.python-impl == 'jython' }} + run: | + wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + java -jar jython-installer.jar -s -d "$HOME/jython" + echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose run: pip install nose - name: Run tests - continue-on-error: ${{ matrix.ytdl-test-set == 'download' }} + continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh deleted file mode 100755 index bafca4da4..000000000 --- a/devscripts/install_jython.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython" -$HOME/jython/bin/jython -m pip install nose From c3399cac19a51897024545fb267f7579ea42318b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 02:44:27 +0700 Subject: [PATCH 292/424] [travis] Drop Travis CI (closes #7193, closes #12366) --- .travis.yml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d828d027d..000000000 --- a/.travis.yml +++ /dev/null @@ -1,50 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "3.6" - - "pypy" - - "pypy3" -dist: trusty -env: - - YTDL_TEST_SET=core -# - YTDL_TEST_SET=download -jobs: - include: - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.7 -# dist: xenial -# env: YTDL_TEST_SET=download - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.8 -# dist: xenial -# env: YTDL_TEST_SET=download - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.8-dev -# dist: xenial -# env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core -# - env: JYTHON=true; YTDL_TEST_SET=download - - name: flake8 - python: 3.8 - dist: xenial - install: pip install flake8 - script: flake8 . - fast_finish: true - allow_failures: -# - env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core -# - env: JYTHON=true; YTDL_TEST_SET=download -before_install: - - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi -script: ./devscripts/run_tests.sh From f318882955b90bead8206ee411641e65037b1011 Mon Sep 17 00:00:00 2001 From: cladmi Date: Wed, 16 Dec 2020 09:54:48 +0100 Subject: [PATCH 293/424] [motherless] Fix recent videos upload date extraction (closes #27661) Less than a week old videos use a '20h ago' or '1d ago' format. I kept the support for 'Ago' with uppercase start at is was already in the code. --- youtube_dl/extractor/motherless.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index b1615b4d8..6cc36b308 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -85,18 +85,27 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', # 1,234,567 Views + r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') + relative = re.match(r'(\d+)([hd])$', upload_date) + if relative: + delta = int(relative.group(1)) + unit = relative.group(2) + if unit == 'h': + delta_t = datetime.timedelta(hours=delta) + else: # unit == 'd' + delta_t = datetime.timedelta(days=delta) + upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) From ecae54a98d2a8d9300142bf3d586f31e8144ccd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 07:40:06 +0700 Subject: [PATCH 294/424] [motherless] Fix review issues and improve extraction (closes #26495, closes #27450) --- youtube_dl/extractor/motherless.py | 52 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6cc36b308..ef1e081f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor): # no keywords 'url': 'http://motherless.com/8B4BBC1', 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Views<', # 1,234,567 Views - r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + (r'>([\d,.]+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago - r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - relative = re.match(r'(\d+)([hd])$', upload_date) - if relative: - delta = int(relative.group(1)) - unit = relative.group(2) - if unit == 'h': - delta_t = datetime.timedelta(hours=delta) - else: # unit == 'd' - delta_t = datetime.timedelta(days=delta) - upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') - else: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, + 'upload date', default=None)) + if not upload_date: + uploaded_ago = self._search_regex( + r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', + default=None) + if uploaded_ago: + delta = int(uploaded_ago[:-1]) + _AGO_UNITS = { + 'h': 'hours', + 'd': 'days', + } + kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} + upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( From be1a3f2d11b9d88c10b624b12b00b508d923983d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 17:31:19 +0100 Subject: [PATCH 295/424] [dplay] Add suport Discovery+ domains(closes #27680) --- youtube_dl/extractor/dplay.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a7b9db568..47501dbe6 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -17,7 +17,12 @@ from ..utils import ( class DPlayIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P - (?:www\.)?(?Pdplay\.(?Pdk|fi|jp|se|no))| + (?:www\.)?(?Pd + (?: + play\.(?Pdk|fi|jp|se|no)| + iscoveryplus\.(?Pdk|es|fi|it|se|no) + ) + )| (?Pes|it)\.dplay\.com )/[^/]+/(?P[^/]+/[^/?#]+)''' @@ -126,6 +131,24 @@ class DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] def _get_disco_api_info(self, url, display_id, disco_host, realm, country): @@ -241,7 +264,7 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) From 8f757c735306355e1743a444364b79b681dea661 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 18:17:04 +0100 Subject: [PATCH 296/424] [ketnet] fix extraction(closes #27662) --- youtube_dl/extractor/ketnet.py | 119 ++++++++++++++------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py index 93a98e1e0..e0599d02f 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/youtube_dl/extractor/ketnet.py @@ -2,92 +2,71 @@ from __future__ import unicode_literals from .canvas import CanvasIE from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_iso8601, +) class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': '6bdeb65998930251bbd1c510750edba9', + 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { - 'id': 'zomerse-filmpjes', + 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', 'ext': 'mp4', - 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', - 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'title': 'Nachtwacht - Reeks 3: Aflevering 1', + 'description': 'De Nachtwacht krijgt te maken met een parasiet', 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - # mzid in playerConfig instead of sources - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', - 'title': 'Nachtwacht: De Greystook', - 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.02, + 'timestamp': 1609225200, + 'upload_date': '20201229', + 'series': 'Nachtwacht', + 'season': 'Reeks 3', + 'episode': 'De Greystook', + 'episode_number': 1, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { - 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', - 'only_matching': True, - }, { - 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', - 'only_matching': True, - }, { - # mzsource, geo restricted to Belgium - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] - config = self._parse_json( - self._search_regex( - r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, - 'player config'), - video_id) - - mzid = config.get('mzid') - if mzid: - return self.url_result( - 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, - CanvasIE.ie_key(), video_id=mzid) - - title = config['title'] - - formats = [] - for source_key in ('', 'mz'): - source = config.get('%ssource' % source_key) - if not isinstance(source, dict): - continue - for format_id, format_url in source.items(): - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif format_id == 'hds': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) + mz_id = compat_urllib_parse_unquote(video['mediaReference']) return { - 'id': video_id, - 'title': title, - 'description': config.get('description'), - 'thumbnail': config.get('image'), - 'series': config.get('program'), - 'episode': config.get('episode'), - 'formats': formats, + '_type': 'url_transparent', + 'id': mz_id, + 'title': video['titleVideodetail'], + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, + 'thumbnail': video.get('imageUrl'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publicationDate')), + 'series': video.get('programTitle'), + 'season': video.get('seasonTitle'), + 'episode': video.get('subtitleVideodetail'), + 'episode_number': int_or_none(video.get('episodeNr')), + 'ie_key': CanvasIE.ie_key(), } From fcd90d258305cdafa5bd23c50443229205fcb9ed Mon Sep 17 00:00:00 2001 From: nixxo Date: Tue, 5 Jan 2021 19:49:56 +0100 Subject: [PATCH 297/424] [rai] Detect ContentItem in iframe (closes #12652) (#27673) Co-authored-by: Sergey M. --- youtube_dl/extractor/rai.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index ecb628f14..0a68d16b0 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -326,6 +326,19 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # ContentItem in iframe (see #12652) + 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', + 'info_dict': { + 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', + 'ext': 'mp4', + 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', + 'description': 'md5:d291b03407ec505f95f27970c0b025f4', + 'upload_date': '20150913', + }, + 'params': { + 'skip_download': True, + }, }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', @@ -403,7 +416,8 @@ class RaiIE(RaiBaseIE): r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)= + <(?:[^>]+\bdata-id|var\s+uniquename)=| + ]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P%s) From 5b24f8f505582f353c3c2f7b79b5f67612ba9c87 Mon Sep 17 00:00:00 2001 From: 23rd <23rd@vivaldi.net> Date: Sun, 3 Jan 2021 16:30:56 +0300 Subject: [PATCH 298/424] [twitch] Switch access token to GraphQL and refactor. --- youtube_dl/extractor/twitch.py | 114 ++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ab6654432..50dcb93ef 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -160,7 +160,64 @@ class TwitchBaseIE(InfoExtractor): return compat_str(self._parse_json(token, channel_name)['channel_id']) -class TwitchVodIE(TwitchBaseIE): +class TwitchGraphQLBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + } + + def _download_base_gql(self, video_id, ops, note, fatal=True): + return self._download_json( + 'https://gql.twitch.tv/gql', video_id, note, + data=json.dumps(ops).encode(), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + }, fatal=fatal) + + def _download_gql(self, video_id, ops, note, fatal=True): + for op in ops: + op['extensions'] = { + 'persistedQuery': { + 'version': 1, + 'sha256Hash': self._OPERATION_HASHES[op['operationName']], + } + } + return self._download_base_gql(video_id, ops, note) + + def _download_access_token_gql(self, video_id, item_type=None): + if item_type == 'vod': + method = 'videoPlaybackAccessToken' + param_name = 'id' + else: + method = 'streamPlaybackAccessToken' + param_name = 'channelName' + ops = { + 'query': '''{ + %s( + %s: "%s", + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + }) { + value + signature + } + }''' % (method, param_name, video_id), + } + note = 'Downloading access token GraphQL' + return self._download_base_gql(video_id, ops, note)['data'][method] + + +class TwitchVodIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:vod' _VALID_URL = r'''(?x) https?:// @@ -276,9 +333,7 @@ class TwitchVodIE(TwitchBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._call_api( - 'api/vods/%s/access_token' % vod_id, vod_id, - 'Downloading %s access token' % self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -289,8 +344,8 @@ class TwitchVodIE(TwitchBaseIE): 'allow_spectre': 'true', 'player': 'twitchweb', 'playlist_include_framerate': 'true', - 'nauth': access_token['token'], - 'nauthsig': access_token['sig'], + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], })), vod_id, 'mp4', entry_protocol='m3u8_native') @@ -333,36 +388,6 @@ def _make_video_result(node): } -class TwitchGraphQLBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - } - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) - - class TwitchCollectionIE(TwitchGraphQLBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P[^/]+)' @@ -814,8 +839,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token(channel_name) - token = access_token['token'] + access_token = self._download_access_token_gql(channel_name) + token = access_token['value'] stream_id = stream.get('id') or channel_name query = { @@ -826,7 +851,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): 'player': 'twitchweb', 'playlist_include_framerate': 'true', 'segment_preference': '4', - 'sig': access_token['sig'].encode('utf-8'), + 'sig': access_token['signature'].encode('utf-8'), 'token': token.encode('utf-8'), } formats = self._extract_m3u8_formats( @@ -866,7 +891,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): } -class TwitchClipsIE(TwitchBaseIE): +class TwitchClipsIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'''(?x) https?:// @@ -912,8 +937,8 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( - 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + clip = self._download_base_gql( + video_id, { 'query': '''{ clip(slug: "%s") { broadcaster { @@ -937,10 +962,7 @@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id, - }).encode(), headers={ - 'Client-ID': self._CLIENT_ID, - })['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] if not clip: raise ExtractorError( From ccc71122915e630d99e8266c73a2eba26707f199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 6 Jan 2021 02:10:44 +0700 Subject: [PATCH 299/424] [twitch] Improve access token extraction and remove unused code (closes #27646) --- youtube_dl/extractor/twitch.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 50dcb93ef..a939ea24e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -192,29 +192,27 @@ class TwitchGraphQLBaseIE(TwitchBaseIE): } return self._download_base_gql(video_id, ops, note) - def _download_access_token_gql(self, video_id, item_type=None): - if item_type == 'vod': - method = 'videoPlaybackAccessToken' - param_name = 'id' - else: - method = 'streamPlaybackAccessToken' - param_name = 'channelName' + def _download_access_token_gql(self, video_id, token_kind, param_name): + method = '%sPlaybackAccessToken' % token_kind ops = { 'query': '''{ %s( %s: "%s", - params: { - platform: "web", - playerBackend: "mediaplayer", - playerType: "site" - }) { + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + } + ) + { value signature } }''' % (method, param_name, video_id), } - note = 'Downloading access token GraphQL' - return self._download_base_gql(video_id, ops, note)['data'][method] + return self._download_base_gql( + video_id, ops, + 'Downloading %s access token GraphQL' % token_kind)['data'][method] class TwitchVodIE(TwitchGraphQLBaseIE): @@ -227,8 +225,6 @@ class TwitchVodIE(TwitchGraphQLBaseIE): ) (?P\d+) ''' - _ITEM_TYPE = 'vod' - _ITEM_SHORTCUT = 'v' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -333,7 +329,7 @@ class TwitchVodIE(TwitchGraphQLBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -839,7 +835,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token_gql(channel_name) + access_token = self._download_access_token_gql( + channel_name, 'stream', 'channelName') token = access_token['value'] stream_id = stream.get('id') or channel_name From 1ae7ae0b969d378ea41e6b90b9c5d44358d3e36b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 21:17:08 +0100 Subject: [PATCH 300/424] [canvas] Fix VRT NU extraction(closes #26957)(closes #27053) --- youtube_dl/extractor/canvas.py | 120 ++++++++++++--------------------- 1 file changed, 43 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8667a0d04..65d65d52e 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,12 +7,12 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + extract_attributes, ExtractorError, strip_or_none, float_or_none, int_or_none, merge_dicts, - parse_iso8601, str_or_none, url_or_none, ) @@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +48,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -208,17 +214,21 @@ class VrtNUIE(GigyaBaseIE): _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +310,25 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) + webpage = self._download_webpage(url, display_id) + + attrs = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id + + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)

    (.+?)

    ', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)
    (.+?)
    ', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)\s* - seizoen\ (.+?)\s* - ''', - r'

    ', r'', r'

    (.*?)

    ', r']*>(.*?)'], webpage, 'title') - description = self._html_search_meta( + description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( @@ -249,18 +249,18 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', - 'id': '102', + 'display_id': 'maischberger-die-woche', + 'id': '100', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. Januar 2021', + 'upload_date': '20210107', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -315,17 +315,17 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', + 'id': '78566716', 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', 'ext': 'mp4', }, }, { From de026a6acd3cbc2e62d0988b21c97e71e3730cb0 Mon Sep 17 00:00:00 2001 From: Tatsh Date: Sun, 17 Jan 2021 08:05:39 -0500 Subject: [PATCH 341/424] [Minds] Add new extractor (#17934) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/minds.py | 164 +++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 youtube_dl/extractor/minds.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c50e1419f..90012fc4f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -651,6 +651,11 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .minds import ( + MindsIE, + MindsActivityIE, + MindsChannelIE, +) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py new file mode 100644 index 000000000..4523d0938 --- /dev/null +++ b/youtube_dl/extractor/minds.py @@ -0,0 +1,164 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import (int_or_none, sanitized_Request, str_or_none, + unified_strdate) + + +class MindsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', + 'uploader_id': '100000000000000341', + 'description': '', + 'upload_date': '20130524', + 'timestamp': 1369404826, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(video_api_url, video_id, headers=headers, + query={'children': 'false'}) + formats = [] + owner = data.get('ownerObj', {}) + + transcodes = data.get('transcodes', {}) + # These keys are the width so keep the highest width last + keys = sorted(transcodes.keys()) + + for format_id in keys: + is_numeric = re.match('^[0-9]+\.mp4', format_id) + video_url = transcodes[format_id] + info = { + 'url': video_url, + 'format_id': format_id, + 'http_headers': headers, + } + if is_numeric: + info['width'] = int(format_id.split('.')[0]) + formats.append(info) + + uploader_id = str_or_none(owner.get('guid') or + data.get('owner_guid') or + owner.get('legacy_guid') or + owner.get('owner_guid')) + description = str_or_none(data.get('description')) + if description: + description = description.strip() + uploader_url = age_limit = thumbnail = None + + if owner.get('username'): + uploader_url = 'https://www.minds.com/%s' % owner.get('username') + if data.get('mature') is True: + age_limit = 18 + + thumbnail_api_url = data.get('thumbnail_src') + if thumbnail_api_url: + req = sanitized_Request(thumbnail_api_url) + req.get_method = lambda: 'HEAD' + res = self._request_webpage(req, video_id) + if res.headers.get('content-type', '').startswith('image/'): + thumbnail = getattr(res, 'url', None) + tags = data.get('tags', '').strip() + if isinstance(tags, compat_str) and tags: + tags = [x.strip() for x in tags.split(',')] + else: + tags = None + category = data.get('category') + if isinstance(category, compat_str) and category: + category = [category] + else: + category = None + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': description, + 'license': str_or_none(data.get('license')), + 'creator': str_or_none(owner.get('name') or owner.get('username')), + 'release_date': unified_strdate(data.get('time_created')), + 'timestamp': int_or_none(data.get('time_created')), + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': int_or_none(data.get('play:count')), + 'like_count': int_or_none(data.get('thumbs:up:count')), + 'dislike_count': int_or_none(data.get('thumbs:down:count')), + 'average_rating': int_or_none(data.get('rating')), + 'age_limit': age_limit, + 'categories': [str_or_none(data.get('category'))], + 'tags': tags, + # As of 20181020 the API is returning `false` for this value both + # at top level and within the entity.comments:count path. The only + # other way to get this is to fetch all comments and count. + 'comment_count': int_or_none(data.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsActivityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' + + def _real_extract(self, url): + guid = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, guid, headers=headers) + return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) + + +class MindsChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' + + def _real_extract(self, url): + channel_name = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, channel_name, headers=headers) + channel = data.get('channel', {}) + params = {'limit': 12, 'offset': ''} + api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] + entries = [] + while True: + data = self._download_json(api_url, channel['guid'], + headers=headers, query=params) + activity = data.get('activity', []) + if len(activity) == 0 or not data.get('load-next'): + break + for info in activity: + if info.get('custom_type') != 'video': + continue + entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) + params['offset'] = data['load-next'] + return self.playlist_result(entries, + playlist_title='%s activity' % channel_name) From 58f6c2112d55cdd77ad76b323760bb934d7e7576 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jan 2021 14:07:56 +0100 Subject: [PATCH 342/424] [minds] improve extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/minds.py | 288 ++++++++++++++++------------- 2 files changed, 161 insertions(+), 129 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 90012fc4f..29b0e615e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -653,8 +653,8 @@ from .microsoftvirtualacademy import ( ) from .minds import ( MindsIE, - MindsActivityIE, MindsChannelIE, + MindsGroupIE, ) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 4523d0938..8e9f0f825 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -1,164 +1,196 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import (int_or_none, sanitized_Request, str_or_none, - unified_strdate) +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) -class MindsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' - _TEST = { +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ 'url': 'https://www.minds.com/media/100000000000086822', 'md5': '215a658184a419764852239d4970b045', 'info_dict': { 'id': '100000000000086822', 'ext': 'mp4', 'title': 'Minds intro sequence', - 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', - 'uploader_id': '100000000000000341', - 'description': '', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', 'upload_date': '20130524', 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', }, - 'params': { - 'skip_download': True, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', }, - } + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(video_api_url, video_id, headers=headers, - query={'children': 'false'}) + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + formats = [] - owner = data.get('ownerObj', {}) + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) - transcodes = data.get('transcodes', {}) - # These keys are the width so keep the highest width last - keys = sorted(transcodes.keys()) + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') - for format_id in keys: - is_numeric = re.match('^[0-9]+\.mp4', format_id) - video_url = transcodes[format_id] - info = { - 'url': video_url, - 'format_id': format_id, - 'http_headers': headers, - } - if is_numeric: - info['width'] = int(format_id.split('.')[0]) - formats.append(info) + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] - uploader_id = str_or_none(owner.get('guid') or - data.get('owner_guid') or - owner.get('legacy_guid') or - owner.get('owner_guid')) - description = str_or_none(data.get('description')) - if description: - description = description.strip() - uploader_url = age_limit = thumbnail = None - - if owner.get('username'): - uploader_url = 'https://www.minds.com/%s' % owner.get('username') - if data.get('mature') is True: - age_limit = 18 - - thumbnail_api_url = data.get('thumbnail_src') - if thumbnail_api_url: - req = sanitized_Request(thumbnail_api_url) - req.get_method = lambda: 'HEAD' - res = self._request_webpage(req, video_id) - if res.headers.get('content-type', '').startswith('image/'): - thumbnail = getattr(res, 'url', None) - tags = data.get('tags', '').strip() - if isinstance(tags, compat_str) and tags: - tags = [x.strip() for x in tags.split(',')] - else: - tags = None - category = data.get('category') - if isinstance(category, compat_str) and category: - category = [category] - else: - category = None + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() return { 'id': video_id, - 'title': data['title'], + 'title': entity.get('title') or video_id, 'formats': formats, - 'description': description, - 'license': str_or_none(data.get('license')), - 'creator': str_or_none(owner.get('name') or owner.get('username')), - 'release_date': unified_strdate(data.get('time_created')), - 'timestamp': int_or_none(data.get('time_created')), + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': int_or_none(data.get('play:count')), - 'like_count': int_or_none(data.get('thumbs:up:count')), - 'dislike_count': int_or_none(data.get('thumbs:down:count')), - 'average_rating': int_or_none(data.get('rating')), - 'age_limit': age_limit, - 'categories': [str_or_none(data.get('category'))], + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), 'tags': tags, - # As of 20181020 the API is returning `false` for this value both - # at top level and within the entity.comments:count path. The only - # other way to get this is to fetch all comments and count. - 'comment_count': int_or_none(data.get('comments:count')), + 'comment_count': int_or_none(entity.get('comments:count')), 'thumbnail': thumbnail, } -class MindsActivityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 - def _real_extract(self, url): - guid = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, guid, headers=headers) - return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) - - -class MindsChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' - - def _real_extract(self, url): - channel_name = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, channel_name, headers=headers) - channel = data.get('channel', {}) - params = {'limit': 12, 'offset': ''} - api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] - entries = [] + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 while True: - data = self._download_json(api_url, channel['guid'], - headers=headers, query=params) - activity = data.get('activity', []) - if len(activity) == 0 or not data.get('load-next'): - break - for info in activity: - if info.get('custom_type') != 'video': + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: continue - entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) - params['offset'] = data['load-next'] - return self.playlist_result(entries, - playlist_title='%s activity' % channel_name) + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + } From 26499ba823ecba99e18cf5cc76e001f4bbbe4293 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jan 2021 17:35:10 +0100 Subject: [PATCH 343/424] [aljazeera] fix extraction(closes #20911)(closes #27779) --- youtube_dl/extractor/aljazeera.py | 41 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?Pprogram/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) From f9201cef58c2ef6bd99315ea06b57d0c69a83f59 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Mon, 18 Jan 2021 09:47:06 -0500 Subject: [PATCH 344/424] [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) --- youtube_dl/extractor/americastestkitchen.py | 30 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index e20f00fc3..7d2c375c4 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -6,8 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, try_get, unified_strdate, + unified_timestamp, ) @@ -22,8 +24,8 @@ class AmericasTestKitchenIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, @@ -33,6 +35,27 @@ class AmericasTestKitchenIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, @@ -60,7 +83,10 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } From 1dd12708c2042e5d78887e4c5026d6196cc02bb2 Mon Sep 17 00:00:00 2001 From: DrWursterich <31037782+DrWursterich@users.noreply.github.com> Date: Tue, 19 Jan 2021 10:21:37 +0100 Subject: [PATCH 345/424] [9gag] Fix Extraction (#23022) --- youtube_dl/extractor/ninegag.py | 200 +++++++++++++++++++------------- 1 file changed, 122 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index dc6a27d36..3753bc0a2 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -3,102 +3,146 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + determine_ext, + url_or_none, + int_or_none, + float_or_none, + ExtractorError +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + 'url': 'https://9gag.com/gag/an5Qz5b', 'info_dict': { - 'id': 'kXzwOKyGlSA', - 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], + 'id': 'an5Qz5b', + 'ext': 'webm', + 'title': 'Dogs playing tetherball', + 'upload_date': '20191108', + 'timestamp': 1573243994, + 'age_limit': 0, + 'categories': [ + 'Wholesome' + ], + 'tags': [ + 'Dog' + ] + } }, { - 'url': 'http://9gag.com/tv/p/aKolP3', + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', - 'only_matching': True, + 'id': 'ae5Ag7B', + 'ext': 'webm', + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'age_limit': 0, + 'categories': [ + 'Awesome' + ], + 'tags': [ + 'Weimaraner', + 'American Pit Bull Terrier' + ] + } }] - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, + _EXTERNAL_VIDEO_PROVIDERS = { + 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + rawJsonData = self._search_regex( + r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', + webpage, + 'data') + rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') + data = self._parse_json(rawJsonData, video_id)['data']['post'] - webpage = self._download_webpage(url, display_id) + if data['type'] == 'Video': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, + 'ie_key': ie_key, + 'id': vid, + 'duration': data['video'].get('duration'), + 'start_time': data['video'].get('startTs') + } - post_view = self._parse_json( - self._search_regex( - r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + if data['type'] == 'EmbedVideo': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': data['video']['embedUrl'], + #'ie_key': vid, + 'start_time': data['video'].get('startTs') + } - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + if data['type'] != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) + + duration = None + formats = [] + thumbnails = [] + for key in data['images']: + image = data['images'][key] + if 'duration' in image and duration is None: + duration = int_or_none(image['duration']) + url = url_or_none(image.get('url')) + if url == None: + continue + ext = determine_ext(url) + if ext == 'jpg' or ext == 'png': + thumbnail = { + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + } + thumbnails.append(thumbnail) + elif ext == 'webm' or ext == 'mp4': + formats.append({ + 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + 'ext': ext, + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + }) + section = None + postSection = data.get('postSection') + if postSection != None and 'name' in postSection: + section = re.sub(r'\\[^\\]{5}', '', postSection['name']) + age_limit = int_or_none(data.get('nsfw')) + if age_limit != None: + age_limit = age_limit * 18 + tags = None + if 'tags' in data: + tags = [] + for tag in data.get('tags') or []: + tags.append(tag.get('key')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'title': data['title'], + 'timestamp': int_or_none(data.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': int_or_none(data.get('upVoteCount')), + 'dislike_count': int_or_none(data.get('downVoteCount')), + 'comment_count': int_or_none(data.get('commentsCount')), + 'age_limit': age_limit, + 'categories': [section], + 'tags': tags, + 'is_live': False } From 54856480d7bac670c9d571d4191f5f35aadc5270 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 10:23:02 +0100 Subject: [PATCH 346/424] [ninegag] improve extraction --- youtube_dl/extractor/ninegag.py | 189 ++++++++++++++------------------ 1 file changed, 83 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 3753bc0a2..440f865bc 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,148 +1,125 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( determine_ext, - url_or_none, + ExtractorError, int_or_none, - float_or_none, - ExtractorError + try_get, + url_or_none, ) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TESTS = [{ - 'url': 'https://9gag.com/gag/an5Qz5b', - 'info_dict': { - 'id': 'an5Qz5b', - 'ext': 'webm', - 'title': 'Dogs playing tetherball', - 'upload_date': '20191108', - 'timestamp': 1573243994, - 'age_limit': 0, - 'categories': [ - 'Wholesome' - ], - 'tags': [ - 'Dog' - ] - } - }, { + _TEST = { 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, - 'age_limit': 0, - 'categories': [ - 'Awesome' - ], - 'tags': [ - 'Weimaraner', - 'American Pit Bull Terrier' - ] + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, } - }] - - _EXTERNAL_VIDEO_PROVIDERS = { - 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - rawJsonData = self._search_regex( - r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', - webpage, - 'data') - rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') - data = self._parse_json(rawJsonData, video_id)['data']['post'] + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - if data['type'] == 'Video': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, - 'ie_key': ie_key, - 'id': vid, - 'duration': data['video'].get('duration'), - 'start_time': data['video'].get('startTs') - } - - if data['type'] == 'EmbedVideo': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': data['video']['embedUrl'], - #'ie_key': vid, - 'start_time': data['video'].get('startTs') - } - - if data['type'] != 'Animated': + if post.get('type') != 'Animated': raise ExtractorError( 'The given url does not contain a video', expected=True) + title = post['title'] + duration = None formats = [] thumbnails = [] - for key in data['images']: - image = data['images'][key] - if 'duration' in image and duration is None: - duration = int_or_none(image['duration']) - url = url_or_none(image.get('url')) - if url == None: + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: continue - ext = determine_ext(url) - if ext == 'jpg' or ext == 'png': - thumbnail = { - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) - } - thumbnails.append(thumbnail) - elif ext == 'webm' or ext == 'mp4': - formats.append({ - 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, 'ext': ext, - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) }) - section = None - postSection = data.get('postSection') - if postSection != None and 'name' in postSection: - section = re.sub(r'\\[^\\]{5}', '', postSection['name']) - age_limit = int_or_none(data.get('nsfw')) - if age_limit != None: - age_limit = age_limit * 18 + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + tags = None - if 'tags' in data: + post_tags = post.get('tags') + if post_tags: tags = [] - for tag in data.get('tags') or []: - tags.append(tag.get('key')) + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - 'id': video_id, - 'title': data['title'], - 'timestamp': int_or_none(data.get('creationTs')), + 'id': post_id, + 'title': title, + 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, 'formats': formats, 'thumbnails': thumbnails, - 'like_count': int_or_none(data.get('upVoteCount')), - 'dislike_count': int_or_none(data.get('downVoteCount')), - 'comment_count': int_or_none(data.get('commentsCount')), - 'age_limit': age_limit, - 'categories': [section], + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, 'tags': tags, - 'is_live': False } From 015e19b350a4bb7868008fb9df9092f4a60a5f00 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 11:54:39 +0100 Subject: [PATCH 347/424] [lbry] unescape lbry uri(closes #27872) --- youtube_dl/extractor/lbry.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 41cc245eb..413215a99 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -5,7 +5,10 @@ import functools import json from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, ExtractorError, @@ -131,6 +134,9 @@ class LBRYIE(LBRYBaseIE): }, { 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', 'only_matching': True, + }, { + 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1', + 'only_matching': True, }] def _real_extract(self, url): @@ -139,6 +145,7 @@ class LBRYIE(LBRYBaseIE): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') + display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') result_value = result['value'] From bc7c8f3d4ea6cb98fa62a5fe457046dd4e9b5379 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 14:47:39 +0100 Subject: [PATCH 348/424] [yahoo] fix single video extraction --- youtube_dl/extractor/yahoo.py | 80 +++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From b79977fb6b1b2cea0231e5e25e201db05c0d1dba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 14:49:45 +0100 Subject: [PATCH 349/424] [aol] add support for yahoo videos(closes #26650) --- youtube_dl/extractor/aol.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, From 62acf5fa2c23d2eb52fb1dd07804352116bba12f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 22:43:59 +0100 Subject: [PATCH 350/424] [trovo] Add new extractor(closes #26125) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/trovo.py | 193 +++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 youtube_dl/extractor/trovo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29b0e615e..536b184bc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1238,6 +1238,10 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py new file mode 100644 index 000000000..43745213d --- /dev/null +++ b/youtube_dl/extractor/trovo.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! - Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info From 657221c81d966115cf6ac263805168e49a48abce Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Thu, 21 Jan 2021 10:46:29 -0500 Subject: [PATCH 351/424] [americastestkitchen] Add support for downloading entire seasons (#27861) --- youtube_dl/extractor/americastestkitchen.py | 67 +++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 7d2c375c4..35d3220c1 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -90,3 +91,69 @@ class AmericasTestKitchenIE(InfoExtractor): 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season-1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season-12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }, { + # Multi-digit season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', + 'only_matching': True, + }] + + def _real_extract(self, url): + show_name, season = re.match(self._VALID_URL, url).groups() + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + filters = [ + 'search_season_list:Season %s' % season, + 'search_document_klass:episode', + 'search_show_slug:%s' % slug, + ] + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps(filters), + 'attributesToRetrieve': 'search_url', + 'attributesToHighlight': '', + # ATK and CCO generally have less than 26 episodes per season + 'hitsPerPage': '100', + }) + + entries = [ + self.url_result( + 'https://www.%s.com%s' % (show_name, episode['search_url']), + 'AmericasTestKitchen', + try_get(episode, lambda e: e['objectID'].split('_')[-1])) + for episode in season_search['hits'] + if 'search_url' in episode and episode['search_url'] + ] + + return { + '_type': 'playlist', + 'id': 'season-%s' % season, + 'title': 'Season %s' % season, + 'entries': sorted(entries, key=lambda e: e.get('id')), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 536b184bc..52b8db0f9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -42,7 +42,10 @@ from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE -from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE From cff72b4cc0330f6d635353083eea7e570036b1ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 16:47:55 +0100 Subject: [PATCH 352/424] [americastestkitchen] improve season extraction --- youtube_dl/extractor/americastestkitchen.py | 62 ++++++++++----------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 35d3220c1..be960c0f9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -99,7 +99,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', 'info_dict': { - 'id': 'season-1', + 'id': 'season_1', 'title': 'Season 1', }, 'playlist_count': 13, @@ -107,53 +107,53 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # Cooks Country Season 'url': 'https://www.cookscountry.com/episodes/browse/season_12', 'info_dict': { - 'id': 'season-12', + 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, - }, { - # Multi-digit season - 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', - 'only_matching': True, }] def _real_extract(self, url): - show_name, season = re.match(self._VALID_URL, url).groups() + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) slug = 'atk' if show_name == 'americastestkitchen' else 'cco' - filters = [ - 'search_season_list:Season %s' % season, - 'search_document_klass:episode', - 'search_show_slug:%s' % slug, - ] + season = 'Season %d' % season_number season_search = self._download_json( - 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ 'Origin': 'https://www.%s.com' % show_name, 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps(filters), - 'attributesToRetrieve': 'search_url', + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, 'attributesToHighlight': '', - # ATK and CCO generally have less than 26 episodes per season - 'hitsPerPage': '100', + 'hitsPerPage': 1000, }) - entries = [ - self.url_result( - 'https://www.%s.com%s' % (show_name, episode['search_url']), - 'AmericasTestKitchen', - try_get(episode, lambda e: e['objectID'].split('_')[-1])) - for episode in season_search['hits'] - if 'search_url' in episode and episode['search_url'] - ] + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } - return { - '_type': 'playlist', - 'id': 'season-%s' % season, - 'title': 'Season %s' % season, - 'entries': sorted(entries, key=lambda e: e.get('id')), - } + return self.playlist_result( + entries(), 'season_%d' % season_number, season) From 8d286bd5b67004d9c5420f6e3f6b7f75d2ba6395 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 17:20:32 +0100 Subject: [PATCH 353/424] [wat] fix format extraction(closes #27901) --- youtube_dl/extractor/wat.py | 55 +++++++++---------------------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 8ef3e0906..7214bfebf 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, unified_strdate, HEADRequest, int_or_none, @@ -97,46 +94,20 @@ class WatIE(InfoExtractor): return red_url return None - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) - formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') upload_date = unified_strdate(date_diffusion) if date_diffusion else None From 3bb7769c405e02fc1078252cafbbd982913fe50c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 17:22:30 +0100 Subject: [PATCH 354/424] [wat] remove unused variable --- youtube_dl/extractor/wat.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7214bfebf..f6940b371 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -43,15 +43,6 @@ class WatIE(InfoExtractor): }, ] - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) - def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) From fa8f6d858064cf07b9167b73647545b3007c6b21 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 23:53:09 +0100 Subject: [PATCH 355/424] [comedycentral] fix extraction(closes #27905) --- youtube_dl/extractor/comedycentral.py | 141 +++++--------------------- youtube_dl/extractor/extractors.py | 3 - youtube_dl/extractor/mtv.py | 23 ++--- youtube_dl/extractor/spike.py | 15 +-- 4 files changed, 37 insertions(+), 145 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index d08b909a6..1bfa912be 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,142 +1,51 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) - /(?P.*)''' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 52b8db0f9..ef57f5556 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -235,11 +235,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index df1034fc5..f5e30d22d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + @staticmethod + def _extract_child_with_type(parent, t): + return next(c for c in parent['children'] if c.get('type') == t) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): @@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @staticmethod - def extract_child_with_type(parent, t): - children = parent['children'] - return next(c for c in children if c.get('type') == t) - - def _extract_mgid(self, webpage): - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self.extract_child_with_type(data, 'MainContainer') - video_player = self.extract_child_with_type(main_container, 'VideoPlayer') - return video_player['props']['media']['video']['config']['uri'] - class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 4c5e3f7c2..5805f3d44 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', 'mgid': uri, } - - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] From 4542e3e55538a8e7115bde777e7ddbd781c2e446 Mon Sep 17 00:00:00 2001 From: aarubui <aarubui@users.noreply.github.com> Date: Fri, 22 Jan 2021 19:56:51 +1100 Subject: [PATCH 356/424] [njpwworld] fix extraction (#27890) --- youtube_dl/extractor/njpwworld.py | 54 ++++++++++++++++--------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r'<li[^>]+class="tag-[^"]+"><a[^>]*>([^<]+)</a></li>', post_content - ) if post_content else None + r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } From 142c584063ec02406e636522fe11d0d2be22b299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jan 2021 23:51:22 +0700 Subject: [PATCH 357/424] Introduce --output-na-placeholder (closes #27896) --- README.md | 2 +- test/test_YoutubeDL.py | 15 +++++++++++---- youtube_dl/YoutubeDL.py | 7 ++++--- youtube_dl/__init__.py | 1 + youtube_dl/options.py | 4 ++++ 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 85fed6d3a..8c2569aaa 100644 --- a/README.md +++ b/README.md @@ -583,7 +583,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 4d62ba145..a35effe0e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -633,13 +633,20 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - ydl = YoutubeDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + ydl = YoutubeDL(params) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index efd42fa63..ecac31f7a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -163,6 +163,7 @@ class YoutubeDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor @@ -658,7 +659,7 @@ class YoutubeDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -678,8 +679,8 @@ class YoutubeDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..e1bd67919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3000ba41e..6b0c62e19 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -689,6 +689,10 @@ def parseOpts(overrideArguments=None): '-o', '--output', dest='outtmpl', metavar='TEMPLATE', help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, From 5519bba3e14d05ccc8c7114e2f2909294c65a26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jan 2021 00:00:53 +0700 Subject: [PATCH 358/424] [options] Clarify --extract-audio help string (closes #27878) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6b0c62e19..241cf110f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -786,7 +786,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') + help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') From d8dab85419ea7e35bd234479abe751334f3e7116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Grosdidier?= <aurelien.grosdidier@gmail.com> Date: Fri, 22 Jan 2021 19:03:45 +0100 Subject: [PATCH 359/424] [franceculture] Fix extraction (closes #27891) (#27903) Co-authored-by: Sergey M. <dstftw@gmail.com> --- youtube_dl/extractor/franceculture.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 306b45fc9..7e9ceabbc 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -20,7 +20,7 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } } @@ -36,12 +36,12 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? - (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', @@ -64,6 +64,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } From f08c31cf33beb0687c4df58b83e319ec8dfe03ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jan 2021 01:10:35 +0700 Subject: [PATCH 360/424] [franceculture] Make thumbnail optional (closes #18807) --- youtube_dl/extractor/franceculture.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 7e9ceabbc..14f4cb489 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -23,7 +23,11 @@ class FranceCultureIE(InfoExtractor): 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -48,7 +52,7 @@ class FranceCultureIE(InfoExtractor): webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) From dbf3fa8af67dfdab42d56fdc5f35610658bc2746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 17:36:40 +0700 Subject: [PATCH 361/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 27f01c438..f09049fea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +* Introduce --output-na-placeholder (#27896) + +Extractors +* [franceculture] Make thumbnail optional (#18807) +* [franceculture] Fix extraction (#27891, #27903) +* [njpwworld] Fix extraction (#27890) +* [comedycentral] Fix extraction (#27905) +* [wat] Fix format extraction (#27901) ++ [americastestkitchen:season] Add support for seasons (#27861) ++ [trovo] Add support for trovo.live (#26125) ++ [aol] Add support for yahoo videos (#26650) +* [yahoo] Fix single video extraction +* [lbry] Unescape lbry URI (#27872) +* [9gag] Fix and improve extraction (#23022) +* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) +* [aljazeera] Fix extraction (#20911, #27779) ++ [minds] Add support for minds.com (#17934) +* [ard] Fix title and description extraction (#27761) ++ [spotify] Add support for Spotify Podcasts (#27443) + + version 2021.01.16 Core From 186cbaffb954f00c4ff1f58f9fe378c65b4ef87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:00:58 +0700 Subject: [PATCH 362/424] release 2021.01.24 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +- .../ISSUE_TEMPLATE/2_site_support_request.md | 4 +- .../ISSUE_TEMPLATE/3_site_feature_request.md | 4 +- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 +- ChangeLog | 2 +- README.md | 761 +++++++++--------- docs/supportedsites.md | 13 +- youtube_dl/version.py | 2 +- 9 files changed, 422 insertions(+), 380 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index aedcfa6b3..935806b5e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.16 + [debug] youtube-dl version 2021.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5c0dfea4e..453a5e147 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 772147a75..07094c10d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 2fcaa3a23..b234f8ccd 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.16 + [debug] youtube-dl version 2021.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index f1adfce8f..557e59ca5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index f09049fea..9b2f38b25 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.01.24 Core * Introduce --output-na-placeholder (#27896) diff --git a/README.md b/README.md index 8c2569aaa..94c34d89a 100644 --- a/README.md +++ b/README.md @@ -52,394 +52,431 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help Print this help text and exit - --version Print program version and exit - -U, --update Update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs - --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported - extractors - --force-generic-extractor Force extraction to use the generic - extractor - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --config-location PATH Location of the configuration file; either - the path to the config or its containing - directory. - --flat-playlist Do not extract the videos of a playlist, - only list them. - --mark-watched Mark videos watched (YouTube only) - --no-mark-watched Do not mark videos watched (YouTube only) - --no-color Do not emit color codes in output + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. + Make sure that you have sufficient + permissions (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for + example to skip unavailable videos in a + playlist + --abort-on-error Abort downloading of further videos (in + the playlist or the command line) if an + error occurs + --dump-user-agent Display the current browser + identification + --list-extractors List all supported extractors + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. + For example "gvsearch2:" downloads two + videos from google videos for youtube- + dl "large apple". Use the value "auto" + to let youtube-dl guess ("auto_warning" + to emit a warning when guessing). + "error" just throws an error. The + default value "fixup_error" repairs + broken URLs, but emits an error if this + is not possible instead of searching. + --ignore-config Do not read configuration files. When + given in the global configuration file + /etc/youtube-dl.conf: Do not read the + user configuration in + ~/.config/youtube-dl/config + (%APPDATA%/youtube-dl/config.txt on + Windows) + --config-location PATH Location of the configuration file; + either the path to the config or its + containing directory. + --flat-playlist Do not extract the videos of a + playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube + only) + --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable SOCKS proxy, specify a proper - scheme. For example - socks5://127.0.0.1:1080/. Pass in an empty - string (--proxy "") for direct connection - --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - -4, --force-ipv4 Make all connections via IPv4 - -6, --force-ipv6 Make all connections via IPv6 + --proxy URL Use the specified HTTP/HTTPS/SOCKS + proxy. To enable SOCKS proxy, specify a + proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an + empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in + seconds + --source-address IP Client-side IP address to bind to + -4, --force-ipv4 Make all connections via IPv4 + -6, --force-ipv6 Make all connections via IPv6 ## Geo Restriction: - --geo-verification-proxy URL Use this proxy to verify the IP address for - some geo-restricted sites. The default - proxy specified by --proxy (or none, if the - option is not present) is used for the - actual downloading. - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR - notation + --geo-verification-proxy URL Use this proxy to verify the IP address + for some geo-restricted sites. The + default proxy specified by --proxy (or + none, if the option is not present) is + used for the actual downloading. + --geo-bypass Bypass geographic restriction via + faking X-Forwarded-For HTTP header + --no-geo-bypass Do not bypass geographic restriction + via faking X-Forwarded-For HTTP header + --geo-bypass-country CODE Force bypass geographic restriction + with explicitly provided two-letter ISO + 3166-2 country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction + with explicitly provided IP block in + CIDR notation ## Video Selection: - --playlist-start NUMBER Playlist video to start at (default is 1) - --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify - indices of the videos in the playlist - separated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER Generic video filter. Specify any key (see - the "OUTPUT TEMPLATE" for a list of - available keys) to match if the key is - present, !key to check if the key is not - present, key > NUMBER (like "comment_count - > 12", also works with >=, <, <=, !=, =) to - compare against a number, key = 'LITERAL' - (like "uploader = 'Mike Smith'", also works - with !=) to match against a string literal - and & to require multiple matches. Values - which are not known are excluded unless you - put a question mark (?) after the operator. - For example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count <? 50 & description" . - --no-playlist Download only the video, if the URL refers - to a video and a playlist. - --yes-playlist Download the playlist, if the URL refers to - a video and a playlist. - --age-limit YEARS Download only videos suitable for the given - age - --download-archive FILE Download only videos not listed in the - archive file. Record the IDs of all - downloaded videos in it. - --include-ads Download advertisements as well - (experimental) + --playlist-start NUMBER Playlist video to start at (default is + 1) + --playlist-end NUMBER Playlist video to end at (default is + last) + --playlist-items ITEM_SPEC Playlist video items to download. + Specify indices of the videos in the + playlist separated by commas like: "-- + playlist-items 1,2,5,8" if you want to + download videos indexed 1, 2, 5, 8 in + the playlist. You can specify range: " + --playlist-items 1-3,7,10-13", it will + download the videos at index 1, 2, 3, + 7, 10, 11, 12 and 13. + --match-title REGEX Download only matching titles (regex or + caseless sub-string) + --reject-title REGEX Skip download for matching titles + (regex or caseless sub-string) + --max-downloads NUMBER Abort after downloading NUMBER files + --min-filesize SIZE Do not download any videos smaller than + SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than + SIZE (e.g. 50k or 44.6m) + --date DATE Download only videos uploaded in this + date + --datebefore DATE Download only videos uploaded on or + before this date (i.e. inclusive) + --dateafter DATE Download only videos uploaded on or + after this date (i.e. inclusive) + --min-views COUNT Do not download any videos with less + than COUNT views + --max-views COUNT Do not download any videos with more + than COUNT views + --match-filter FILTER Generic video filter. Specify any key + (see the "OUTPUT TEMPLATE" for a list + of available keys) to match if the key + is present, !key to check if the key is + not present, key > NUMBER (like + "comment_count > 12", also works with + >=, <, <=, !=, =) to compare against a + number, key = 'LITERAL' (like "uploader + = 'Mike Smith'", also works with !=) to + match against a string literal and & to + require multiple matches. Values which + are not known are excluded unless you + put a question mark (?) after the + operator. For example, to only match + videos that have been liked more than + 100 times and disliked less than 50 + times (or the dislike functionality is + not available at the given service), + but who also have a description, use + --match-filter "like_count > 100 & + dislike_count <? 50 & description" . + --no-playlist Download only the video, if the URL + refers to a video and a playlist. + --yes-playlist Download the playlist, if the URL + refers to a video and a playlist. + --age-limit YEARS Download only videos suitable for the + given age + --download-archive FILE Download only videos not listed in the + archive file. Record the IDs of all + downloaded videos in it. + --include-ads Download advertisements as well + (experimental) ## Download Options: - -r, --limit-rate RATE Maximum download rate in bytes per second - (e.g. 50K or 4.2M) - -R, --retries RETRIES Number of retries (default is 10), or - "infinite". - --fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH, hlsnative and - ISM) - --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative - and ISM) - --abort-on-unavailable-fragment Abort downloading when some fragment is not - available - --keep-fragments Keep downloaded fragments on disk after - downloading is finished; fragments are - erased by default - --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) - (default is 1024) - --no-resize-buffer Do not automatically adjust the buffer - size. By default, the buffer size is - automatically resized from an initial value - of SIZE. - --http-chunk-size SIZE Size of a chunk for chunk-based HTTP - downloading (e.g. 10485760 or 10M) (default - is disabled). May be useful for bypassing - bandwidth throttling imposed by a webserver - (experimental) - --playlist-reverse Download playlist videos in reverse order - --playlist-random Download playlist videos in random order - --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size - --hls-prefer-native Use the native HLS downloader instead of - ffmpeg - --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS - downloader - --hls-use-mpegts Use the mpegts container for HLS videos, - allowing to play the video while - downloading (some players may not be able - to play it) - --external-downloader COMMAND Use the specified external downloader. - Currently supports - aria2c,avconv,axel,curl,ffmpeg,httpie,wget - --external-downloader-args ARGS Give these arguments to the external - downloader + -r, --limit-rate RATE Maximum download rate in bytes per + second (e.g. 50K or 4.2M) + -R, --retries RETRIES Number of retries (default is 10), or + "infinite". + --fragment-retries RETRIES Number of retries for a fragment + (default is 10), or "infinite" (DASH, + hlsnative and ISM) + --skip-unavailable-fragments Skip unavailable fragments (DASH, + hlsnative and ISM) + --abort-on-unavailable-fragment Abort downloading when some fragment is + not available + --keep-fragments Keep downloaded fragments on disk after + downloading is finished; fragments are + erased by default + --buffer-size SIZE Size of download buffer (e.g. 1024 or + 16K) (default is 1024) + --no-resize-buffer Do not automatically adjust the buffer + size. By default, the buffer size is + automatically resized from an initial + value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) + (default is disabled). May be useful + for bypassing bandwidth throttling + imposed by a webserver (experimental) + --playlist-reverse Download playlist videos in reverse + order + --playlist-random Download playlist videos in random + order + --xattr-set-filesize Set file xattribute ytdl.filesize with + expected file size + --hls-prefer-native Use the native HLS downloader instead + of ffmpeg + --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS + downloader + --hls-use-mpegts Use the mpegts container for HLS + videos, allowing to play the video + while downloading (some players may not + be able to play it) + --external-downloader COMMAND Use the specified external downloader. + Currently supports aria2c,avconv,axel,c + url,ffmpeg,httpie,wget + --external-downloader-args ARGS Give these arguments to the external + downloader ## Filesystem Options: - -a, --batch-file FILE File containing URLs to download ('-' for - stdin), one URL per line. Lines starting - with '#', ';' or ']' are considered as - comments and ignored. - --id Use only video ID in file name - -o, --output TEMPLATE Output filename template, see the "OUTPUT - TEMPLATE" for all the info - --autonumber-start NUMBER Specify the start value for %(autonumber)s - (default is 1) - --restrict-filenames Restrict filenames to only ASCII - characters, and avoid "&" and spaces in - filenames - -w, --no-overwrites Do not overwrite files - -c, --continue Force resume of partially downloaded files. - By default, youtube-dl will resume - downloads if possible. - --no-continue Do not resume partially downloaded files - (restart from beginning) - --no-part Do not use .part files - write directly - into output file - --no-mtime Do not use the Last-modified header to set - the file modification time - --write-description Write video description to a .description - file - --write-info-json Write video metadata to a .info.json file - --write-annotations Write video annotations to a - .annotations.xml file - --load-info-json FILE JSON file containing the video information - (created with the "--write-info-json" - option) - --cookies FILE File to read cookies from and dump cookie - jar in - --cache-dir DIR Location in the filesystem where youtube-dl - can store some downloaded information - permanently. By default - $XDG_CACHE_HOME/youtube-dl or - ~/.cache/youtube-dl . At the moment, only - YouTube player files (for videos with - obfuscated signatures) are cached, but that - may change. - --no-cache-dir Disable filesystem caching - --rm-cache-dir Delete all filesystem cache files + -a, --batch-file FILE File containing URLs to download ('-' + for stdin), one URL per line. Lines + starting with '#', ';' or ']' are + considered as comments and ignored. + --id Use only video ID in file name + -o, --output TEMPLATE Output filename template, see the + "OUTPUT TEMPLATE" for all the info + --output-na-placeholder PLACEHOLDER Placeholder value for unavailable meta + fields in output filename template + (default is "NA") + --autonumber-start NUMBER Specify the start value for + %(autonumber)s (default is 1) + --restrict-filenames Restrict filenames to only ASCII + characters, and avoid "&" and spaces in + filenames + -w, --no-overwrites Do not overwrite files + -c, --continue Force resume of partially downloaded + files. By default, youtube-dl will + resume downloads if possible. + --no-continue Do not resume partially downloaded + files (restart from beginning) + --no-part Do not use .part files - write directly + into output file + --no-mtime Do not use the Last-modified header to + set the file modification time + --write-description Write video description to a + .description file + --write-info-json Write video metadata to a .info.json + file + --write-annotations Write video annotations to a + .annotations.xml file + --load-info-json FILE JSON file containing the video + information (created with the "--write- + info-json" option) + --cookies FILE File to read cookies from and dump + cookie jar in + --cache-dir DIR Location in the filesystem where + youtube-dl can store some downloaded + information permanently. By default + $XDG_CACHE_HOME/youtube-dl or + ~/.cache/youtube-dl . At the moment, + only YouTube player files (for videos + with obfuscated signatures) are cached, + but that may change. + --no-cache-dir Disable filesystem caching + --rm-cache-dir Delete all filesystem cache files ## Thumbnail images: - --write-thumbnail Write thumbnail image to disk - --write-all-thumbnails Write all thumbnail image formats to disk - --list-thumbnails Simulate and list all available thumbnail - formats + --write-thumbnail Write thumbnail image to disk + --write-all-thumbnails Write all thumbnail image formats to + disk + --list-thumbnails Simulate and list all available + thumbnail formats ## Verbosity / Simulation Options: - -q, --quiet Activate quiet mode - --no-warnings Ignore warnings - -s, --simulate Do not download the video and do not write - anything to disk - --skip-download Do not download the video - -g, --get-url Simulate, quiet but print URL - -e, --get-title Simulate, quiet but print title - --get-id Simulate, quiet but print id - --get-thumbnail Simulate, quiet but print thumbnail URL - --get-description Simulate, quiet but print video description - --get-duration Simulate, quiet but print video length - --get-filename Simulate, quiet but print output filename - --get-format Simulate, quiet but print output format - -j, --dump-json Simulate, quiet but print JSON information. - See the "OUTPUT TEMPLATE" for a description - of available keys. - -J, --dump-single-json Simulate, quiet but print JSON information - for each command-line argument. If the URL - refers to a playlist, dump the whole - playlist information in a single line. - --print-json Be quiet and print the video information as - JSON (video is still being downloaded). - --newline Output progress bar as new lines - --no-progress Do not print progress bar - --console-title Display progress in console titlebar - -v, --verbose Print various debugging information - --dump-pages Print downloaded pages encoded using base64 - to debug problems (very verbose) - --write-pages Write downloaded intermediary pages to - files in the current directory to debug - problems - --print-traffic Display sent and read HTTP traffic - -C, --call-home Contact the youtube-dl server for debugging - --no-call-home Do NOT contact the youtube-dl server for - debugging + -q, --quiet Activate quiet mode + --no-warnings Ignore warnings + -s, --simulate Do not download the video and do not + write anything to disk + --skip-download Do not download the video + -g, --get-url Simulate, quiet but print URL + -e, --get-title Simulate, quiet but print title + --get-id Simulate, quiet but print id + --get-thumbnail Simulate, quiet but print thumbnail URL + --get-description Simulate, quiet but print video + description + --get-duration Simulate, quiet but print video length + --get-filename Simulate, quiet but print output + filename + --get-format Simulate, quiet but print output format + -j, --dump-json Simulate, quiet but print JSON + information. See the "OUTPUT TEMPLATE" + for a description of available keys. + -J, --dump-single-json Simulate, quiet but print JSON + information for each command-line + argument. If the URL refers to a + playlist, dump the whole playlist + information in a single line. + --print-json Be quiet and print the video + information as JSON (video is still + being downloaded). + --newline Output progress bar as new lines + --no-progress Do not print progress bar + --console-title Display progress in console titlebar + -v, --verbose Print various debugging information + --dump-pages Print downloaded pages encoded using + base64 to debug problems (very verbose) + --write-pages Write downloaded intermediary pages to + files in the current directory to debug + problems + --print-traffic Display sent and read HTTP traffic + -C, --call-home Contact the youtube-dl server for + debugging + --no-call-home Do NOT contact the youtube-dl server + for debugging ## Workarounds: - --encoding ENCODING Force the specified encoding (experimental) - --no-check-certificate Suppress HTTPS certificate validation - --prefer-insecure Use an unencrypted connection to retrieve - information about the video. (Currently - supported only for YouTube) - --user-agent UA Specify a custom user agent - --referer URL Specify a custom referer, use if the video - access is restricted to one domain - --add-header FIELD:VALUE Specify a custom HTTP header and its value, - separated by a colon ':'. You can use this - option multiple times - --bidi-workaround Work around terminals that lack - bidirectional text support. Requires bidiv - or fribidi executable in PATH - --sleep-interval SECONDS Number of seconds to sleep before each - download when used alone or a lower bound - of a range for randomized sleep before each - download (minimum possible number of - seconds to sleep) when used along with - --max-sleep-interval. - --max-sleep-interval SECONDS Upper bound of a range for randomized sleep - before each download (maximum possible - number of seconds to sleep). Must only be - used along with --min-sleep-interval. + --encoding ENCODING Force the specified encoding + (experimental) + --no-check-certificate Suppress HTTPS certificate validation + --prefer-insecure Use an unencrypted connection to + retrieve information about the video. + (Currently supported only for YouTube) + --user-agent UA Specify a custom user agent + --referer URL Specify a custom referer, use if the + video access is restricted to one + domain + --add-header FIELD:VALUE Specify a custom HTTP header and its + value, separated by a colon ':'. You + can use this option multiple times + --bidi-workaround Work around terminals that lack + bidirectional text support. Requires + bidiv or fribidi executable in PATH + --sleep-interval SECONDS Number of seconds to sleep before each + download when used alone or a lower + bound of a range for randomized sleep + before each download (minimum possible + number of seconds to sleep) when used + along with --max-sleep-interval. + --max-sleep-interval SECONDS Upper bound of a range for randomized + sleep before each download (maximum + possible number of seconds to sleep). + Must only be used along with --min- + sleep-interval. ## Video Format Options: - -f, --format FORMAT Video format code, see the "FORMAT - SELECTION" for all the info - --all-formats Download all available video formats - --prefer-free-formats Prefer free video formats unless a specific - one is requested - -F, --list-formats List all available formats of requested - videos - --youtube-skip-dash-manifest Do not download the DASH manifests and - related data on YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. - bestvideo+bestaudio), output to given - container format. One of mkv, mp4, ogg, - webm, flv. Ignored if no merge is required + -f, --format FORMAT Video format code, see the "FORMAT + SELECTION" for all the info + --all-formats Download all available video formats + --prefer-free-formats Prefer free video formats unless a + specific one is requested + -F, --list-formats List all available formats of requested + videos + --youtube-skip-dash-manifest Do not download the DASH manifests and + related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. + bestvideo+bestaudio), output to given + container format. One of mkv, mp4, ogg, + webm, flv. Ignored if no merge is + required ## Subtitle Options: - --write-sub Write subtitle file - --write-auto-sub Write automatically generated subtitle file - (YouTube only) - --all-subs Download all the available subtitles of the - video - --list-subs List all available subtitles for the video - --sub-format FORMAT Subtitle format, accepts formats - preference, for example: "srt" or - "ass/srt/best" - --sub-lang LANGS Languages of the subtitles to download - (optional) separated by commas, use --list- - subs for available language tags + --write-sub Write subtitle file + --write-auto-sub Write automatically generated subtitle + file (YouTube only) + --all-subs Download all the available subtitles of + the video + --list-subs List all available subtitles for the + video + --sub-format FORMAT Subtitle format, accepts formats + preference, for example: "srt" or + "ass/srt/best" + --sub-lang LANGS Languages of the subtitles to download + (optional) separated by commas, use + --list-subs for available language tags ## Authentication Options: - -u, --username USERNAME Login with this account ID - -p, --password PASSWORD Account password. If this option is left - out, youtube-dl will ask interactively. - -2, --twofactor TWOFACTOR Two-factor authentication code - -n, --netrc Use .netrc authentication data - --video-password PASSWORD Video password (vimeo, youku) + -u, --username USERNAME Login with this account ID + -p, --password PASSWORD Account password. If this option is + left out, youtube-dl will ask + interactively. + -2, --twofactor TWOFACTOR Two-factor authentication code + -n, --netrc Use .netrc authentication data + --video-password PASSWORD Video password (vimeo, youku) ## Adobe Pass Options: - --ap-mso MSO Adobe Pass multiple-system operator (TV - provider) identifier, use --ap-list-mso for - a list of available MSOs - --ap-username USERNAME Multiple-system operator account login - --ap-password PASSWORD Multiple-system operator account password. - If this option is left out, youtube-dl will - ask interactively. - --ap-list-mso List all supported multiple-system - operators + --ap-mso MSO Adobe Pass multiple-system operator (TV + provider) identifier, use --ap-list-mso + for a list of available MSOs + --ap-username USERNAME Multiple-system operator account login + --ap-password PASSWORD Multiple-system operator account + password. If this option is left out, + youtube-dl will ask interactively. + --ap-list-mso List all supported multiple-system + operators ## Post-processing Options: - -x, --extract-audio Convert video files to audio-only files - (requires ffmpeg or avconv and ffprobe or - avprobe) - --audio-format FORMAT Specify audio format: "best", "aac", - "flac", "mp3", "m4a", "opus", "vorbis", or - "wav"; "best" by default; No effect without - -x - --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert - a value between 0 (better) and 9 (worse) - for VBR or a specific bitrate like 128K - (default 5) - --recode-video FORMAT Encode the video to another format if - necessary (currently supported: - mp4|flv|ogg|webm|mkv|avi) - --postprocessor-args ARGS Give these arguments to the postprocessor - -k, --keep-video Keep the video file on disk after the post- - processing; the video is erased by default - --no-post-overwrites Do not overwrite post-processed files; the - post-processed files are overwritten by - default - --embed-subs Embed subtitles in the video (only for mp4, - webm and mkv videos) - --embed-thumbnail Embed thumbnail in the audio as cover art - --add-metadata Write metadata to the video file - --metadata-from-title FORMAT Parse additional metadata like song title / - artist from the video title. The format - syntax is the same as --output. Regular - expression with named capture groups may - also be used. The parsed parameters replace - existing values. Example: --metadata-from- - title "%(artist)s - %(title)s" matches a - title like "Coldplay - Paradise". Example - (regex): --metadata-from-title - "(?P<artist>.+?) - (?P<title>.+)" - --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. One of never (do nothing), warn (only - emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors (default) - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' - --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt|lrc) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg/avconv and + ffprobe/avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "flac", "mp3", "m4a", "opus", "vorbis", + or "wav"; "best" by default; No effect + without -x + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate + like 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the + postprocessor + -k, --keep-video Keep the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites Do not overwrite post-processed files; + the post-processed files are + overwritten by default + --embed-subs Embed subtitles in the video (only for + mp4, webm and mkv videos) + --embed-thumbnail Embed thumbnail in the audio as cover + art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song + title / artist from the video title. + The format syntax is the same as + --output. Regular expression with named + capture groups may also be used. The + parsed parameters replace existing + values. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". + Example (regex): --metadata-from-title + "(?P<artist>.+?) - (?P<title>.+)" + --xattrs Write metadata to the video file's + xattrs (using dublin core and xdg + standards) + --fixup POLICY Automatically correct known faults of + the file. One of never (do nothing), + warn (only emit a warning), + detect_or_warn (the default; fix file + if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running + the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running + the postprocessors (default) + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading and post-processing, + similar to find's -exec syntax. + Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subs FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index aa8026a32..13bac6e27 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -46,10 +46,11 @@ - **Amara** - **AMCNetworks** - **AmericasTestKitchen** + - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **Anvato** - - **aol.com** + - **aol.com**: Yahoo screen and movies - **APA** - **Aparat** - **AppleConnect** @@ -192,8 +193,6 @@ - **CNNArticle** - **CNNBlogs** - **ComedyCentral** - - **ComedyCentralFullEpisodes** - - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** @@ -506,6 +505,9 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **minds** + - **minds:channel** + - **minds:group** - **MinistryGrid** - **Minoto** - **miomio.tv** @@ -859,6 +861,8 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **spotify** + - **spotify:show** - **Spreaker** - **SpreakerPage** - **SpreakerShow** @@ -940,12 +944,13 @@ - **TNAFlixNetworkEmbed** - **toggle** - **ToonGoggles** - - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **Trovo** + - **TrovoVod** - **TruNews** - **TruTV** - **Tube8** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ac7242abb..080460d50 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.16' +__version__ = '2021.01.24' From b63981e85095542e056d7180496de4a2d85ddf9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:11:20 +0700 Subject: [PATCH 363/424] release 2021.01.24.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- youtube_dl/version.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 935806b5e..2dde97a2c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24 + [debug] youtube-dl version 2021.01.24.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 453a5e147..c520d1ee0 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 07094c10d..4aacd3bdc 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b234f8ccd..91bbed506 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24 + [debug] youtube-dl version 2021.01.24.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 557e59ca5..a0a2c989a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 080460d50..c52f1d9ca 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24' +__version__ = '2021.01.24.1' From 811a183eb6a7dabcc8600f6bf9dc4aac11e72b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:15:32 +0700 Subject: [PATCH 364/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 9b2f38b25..7f2e0aad1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version 2021.01.24 +version 2021.01.24.1 Core * Introduce --output-na-placeholder (#27896) From ea399a53eb9744c0b1530d72bf9d5e1e21ed3489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Mon, 25 Jan 2021 09:25:57 +0100 Subject: [PATCH 365/424] [ADN] Implement login (#27937) closes #17091 closes #27841 --- youtube_dl/extractor/adn.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index d611ee237..40111586d 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -26,6 +26,7 @@ from ..utils import ( strip_or_none, try_get, unified_strdate, + urlencode_postdata, ) @@ -51,9 +52,11 @@ class ADNIE(InfoExtractor): } } + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -129,19 +132,32 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if username: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', errnote='Unable to log in', fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata')['player'] + 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] options = player['options'] user = options['user'] if not user.get('hasAccess'): - raise ExtractorError( - 'This video is only available for paying users', expected=True) - # self.raise_login_required() # FIXME: Login is not implemented + self.raise_login_required() token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), From 57f2488bbe4345eee545488e937e0f8426664f6f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 09:20:48 +0100 Subject: [PATCH 366/424] [zype] fix uplynk id extraction(closes #27956) --- youtube_dl/extractor/zype.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 5288f40d8..f20f953cb 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor): r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', body, 'm3u8 url', group='url', default=None) if not m3u8_url: - source = self._parse_json(self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, - 'source'), video_id, js_to_json) - if source.get('integration') == 'verizon-media': - m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( From 55bb3556c8c5ec088ef88edbdef925860e0926dc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 09:28:24 +0100 Subject: [PATCH 367/424] [adn] improve login warning reporting --- youtube_dl/extractor/adn.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 40111586d..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -57,6 +57,7 @@ class ADNIE(InfoExtractor): _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -134,10 +135,12 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_initialize(self): username, password = self._get_login_info() - if username: + if not username: + return + try: access_token = (self._download_json( self._API_BASE_URL + 'authentication/login', None, - 'Logging in', errnote='Unable to log in', fatal=False, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -146,13 +149,21 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' })) or {}).get('accessToken') if access_token: self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] options = player['options'] user = options['user'] @@ -204,8 +215,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) - else: - raise ExtractorError(message) + raise ExtractorError(message) else: raise ExtractorError('Giving up retrying') From 395981288ba0b2e1afabd4e595cb9c959ef62356 Mon Sep 17 00:00:00 2001 From: tpikonen <tpikonen@gmail.com> Date: Mon, 25 Jan 2021 15:43:41 +0200 Subject: [PATCH 368/424] [tv2] Add support for mtvuutiset.fi (#27744) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tv2.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef57f5556..2331b0e15 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1260,6 +1260,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVuutisetIE, ) from .tv2dk import ( TV2DKIE, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 4a19b9be6..42a9af126 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -190,3 +190,32 @@ class KatsomoIE(TV2IE): _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVuutisetIE(KatsomoIE): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' + _TEST = { + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'MTV Uutiset Live', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + art_id = self._match_id(url) + webpage = self._download_webpage(url, art_id) + video_id = self._html_search_regex( + r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') + return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo") From 286e5d6724f86ace38f6ccf0e33c511eeb1ab65d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 14:46:04 +0100 Subject: [PATCH 369/424] [tv2] improve MTV Uutiset Article extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tv2.py | 73 ++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2331b0e15..c554a8504 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1260,7 +1260,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, - MTVuutisetIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 42a9af126..334b7d540 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,20 +189,29 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] -class MTVuutisetIE(KatsomoIE): - _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' - _TEST = { +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', 'info_dict': { 'id': '1311159', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'timestamp': 1600608966, 'upload_date': '20200920', @@ -211,11 +223,26 @@ class MTVuutisetIE(KatsomoIE): # m3u8 download 'skip_download': True, }, - } + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] def _real_extract(self, url): - art_id = self._match_id(url) - webpage = self._download_webpage(url, art_id) - video_id = self._html_search_regex( - r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') - return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo") + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) From 0f7d413d5b2637e2fb091745ab4f70811a6cc600 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 15:15:45 +0100 Subject: [PATCH 370/424] [tv4] relax _VALID_URL(closes #27964) --- youtube_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] From d18f4419a72a01abc2cb45ef23f2400cd3eb5f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Tue, 26 Jan 2021 22:43:11 +0100 Subject: [PATCH 371/424] [AMP] Fix upload_date and timestamp extraction (#27970) --- youtube_dl/extractor/abcnews.py | 2 ++ youtube_dl/extractor/amp.py | 3 ++- youtube_dl/extractor/bleacherreport.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8b407bf9c..64ea6e6ed 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -36,6 +36,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): From 11b68df7a4980f7f6175cdf2d7334fde11ff76b1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 27 Jan 2021 12:28:22 +0100 Subject: [PATCH 372/424] [abcnews] fix extraction(closes #12394)(closes #27920) --- youtube_dl/extractor/abcnews.py | 126 +++++++++++++++++--------------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 64ea6e6ed..908c83377 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -49,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -69,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -102,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) From c669554ef5491302eb20fc2bcb52339ea1a4ac1a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 27 Jan 2021 14:51:30 +0100 Subject: [PATCH 373/424] [medialaan] add support DPG Media MyChannels based websites closes #14871 closes #15597 closes #16106 closes #16489 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 23 +++ youtube_dl/extractor/medialaan.py | 303 +++++++---------------------- youtube_dl/extractor/vtm.py | 62 ++++++ 4 files changed, 160 insertions(+), 229 deletions(-) create mode 100644 youtube_dl/extractor/vtm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c554a8504..ab8d6a5a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1459,6 +1459,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 780971a92..09e680c96 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -128,6 +128,7 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE class GenericIE(InfoExtractor): @@ -2223,6 +2224,20 @@ class GenericIE(InfoExtractor): 'duration': 1581, }, }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, ] def report_following_redirect(self, new_url): @@ -2462,6 +2477,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? @@ -2593,6 +2611,11 @@ class GenericIE(InfoExtractor): if arc_urls: return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + mychannels_urls = MedialaanIE._extract_urls(webpage) + if mychannels_urls: + return self.playlist_from_matches( + mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 50d5db802..788acf7fb 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -2,268 +2,113 @@ from __future__ import unicode_literals import re -from .gigya import GigyaBaseIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, - parse_duration, - try_get, - unified_timestamp, + mimetype2ext, + parse_iso8601, ) -class MedialaanIE(GigyaBaseIE): +class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.|nieuws\.)? (?: - (?Pvtm|q2|vtmkzoom)\.be/ - (?: - video(?:/[^/]+/id/|/?\?.*?\baid=)| - (?:[^/]+/)* - ) + (?:embed\.)?mychannels.video/embed/| + embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/| + (?:www\.)?(?: + (?: + 7sur7| + demorgen| + hln| + joe| + qmusic + )\.be| + (?: + [abe]d| + bndestem| + destentor| + gelderlander| + pzc| + tubantia| + volkskrant + )\.nl + )/video/(?:[^/]+/)*[^/?&#]+~p ) - (?P[^/?#&]+) + (?P\d+) ''' - _NETRC_MACHINE = 'medialaan' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _SITE_TO_APP_ID = { - 'vtm': 'vtm_watch', - 'q2': 'q2', - 'vtmkzoom': 'vtmkzoom', - } _TESTS = [{ - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993', 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'id': '193993', 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - 'timestamp': 1487533280, - 'upload_date': '20170219', - 'duration': 2562, - 'series': 'Allemaal Chris', - 'season': 'Allemaal Chris', - 'season_number': 1, - 'season_id': '256936078124527', - 'episode': 'Allemaal Chris afl. 6', - 'episode_number': 6, - 'episode_id': '256936078591527', + 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?', + 'timestamp': 1611663540, + 'upload_date': '20210126', + 'duration': 238, }, 'params': { 'skip_download': True, }, - 'skip': 'Requires account credentials', }, { - # clip - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': '168332', - 'ext': 'mp4', - 'title': '"Veronique liegt!"', - 'description': 'md5:1385e2b743923afe54ba4adc38476155', - 'timestamp': 1489002029, - 'upload_date': '20170308', - 'duration': 96, - }, - }, { - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093', 'only_matching': True, }, { - # vod - 'url': 'http://vtm.be/video?aid=163157', + 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default', 'only_matching': True, }, { - # vod - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'url': 'https://embed.mychannels.video/script/production/193993', 'only_matching': True, }, { - # clip - 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'url': 'https://embed.mychannels.video/production/193993', 'only_matching': True, }, { - # http/s redirect - 'url': 'https://vtmkzoom.be/video?aid=45724', - 'info_dict': { - 'id': '257136373657000', - 'ext': 'mp4', - 'title': 'K3 Dansstudio Ushuaia afl.6', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', + 'url': 'https://mychannels.video/embed/193993', + 'only_matching': True, }, { - # nieuws.vtm.be - 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'url': 'https://embed.mychannels.video/embed/193993', 'only_matching': True, }] - def _real_initialize(self): - self._logged_in = False - - def _login(self): - username, password = self._get_login_info() - if username is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._gigya_login(auth_data) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True + @staticmethod + def _extract_urls(webpage): + entries = [] + for element in re.findall(r'(]+data-mychannels-type="video"[^>]*>)', webpage): + mychannels_id = extract_attributes(element).get('data-mychannels-id') + if mychannels_id: + entries.append('https://mychannels.video/embed/' + mychannels_id) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, site_id = mobj.group('id', 'site_id') + production_id = self._match_id(url) + production = self._download_json( + 'https://embed.mychannels.video/sdk/production/' + production_id, + production_id, query={'options': 'UUUU_default'})['productions'][0] + title = production['title'] - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', - webpage, 'config', default='{}'), video_id, - transform_source=lambda s: s.replace( - '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) - - vod_id = config.get('vodId') or self._search_regex( - (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', - r'"vodId"\s*:\s*"(.+?)"', - r'<[^>]+id=["\']vod-(\d+)'), - webpage, 'video_id', default=None) - - # clip, no authentication required - if not vod_id: - player = self._parse_json( - self._search_regex( - r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', - default=''), - video_id, transform_source=lambda s: '[%s]' % s, fatal=False) - if player: - video = player[-1] - if video['videoUrl'] in ('http', 'https'): - return self.url_result(video['url'], MedialaanIE.ie_key()) - info = { - 'id': video_id, - 'url': video['videoUrl'], - 'title': video['title'], - 'thumbnail': video.get('imageUrl'), - 'timestamp': int_or_none(video.get('createdDate')), - 'duration': int_or_none(video.get('duration')), - } + formats = [] + for source in (production.get('sources') or []): + src = source.get('src') + if not src: + continue + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, production_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - info = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_id='hls')[0] - info.update({ - 'id': video_id, - 'title': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), + formats.append({ + 'ext': ext, + 'url': src, }) - # vod, authentication required - else: - if not self._logged_in: - self._login() + self._sort_formats(formats) - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', default='{}'), - video_id) - - def get(container, item): - return try_get( - settings, lambda x: x[container][item], - compat_str) or self._search_regex( - r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, - default=None) - - app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') - sso = get('vod', 'gigyaDatabase') or 'vtm-sso' - - data = self._download_json( - 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, - video_id, query={ - 'app_id': app_id, - 'user_network': sso, - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - }) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'formats': formats, - } - - api_key = get('vod', 'apiKey') - channel = get('medialaanGigya', 'channel') - - if api_key: - videos = self._download_json( - 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, - query={ - 'channels': channel, - 'ids': vod_id, - 'limit': 1, - 'apikey': api_key, - }) - if videos: - video = try_get( - videos, lambda x: x['response']['videos'][0], dict) - if video: - def get(container, item, expected_type=None): - return try_get( - video, lambda x: x[container][item], expected_type) - - def get_string(container, item): - return get(container, item, compat_str) - - info.update({ - 'series': get_string('program', 'title'), - 'season': get_string('season', 'title'), - 'season_number': int_or_none(get('season', 'number')), - 'season_id': get_string('season', 'id'), - 'episode': get_string('episode', 'title'), - 'episode_number': int_or_none(get('episode', 'number')), - 'episode_id': get_string('episode', 'id'), - 'duration': int_or_none( - video.get('duration')) or int_or_none( - video.get('durationMillis'), scale=1000), - 'title': get_string('episode', 'title'), - 'description': get_string('episode', 'text'), - 'timestamp': unified_timestamp(get_string( - 'publication', 'begin')), - }) - - if not info.get('title'): - info['title'] = try_get( - config, lambda x: x['videoConfig']['title'], - compat_str) or self._html_search_regex( - r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', - default=None) or self._og_search_title(webpage) - - if not info.get('description'): - info['description'] = self._html_search_regex( - r']+class="field-item\s+even">\s*

    (.+?)

    ', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/youtube_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } From fd95fc33b13d732002d53c35521f17184d14cc21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Wed, 27 Jan 2021 20:06:12 +0100 Subject: [PATCH 374/424] [awaan] Extract uploader id (#27963) --- youtube_dl/extractor/awaan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/youtube_dl/extractor/awaan.py +++ b/youtube_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download From 7b8fa658f88d53066f7a2ad00df19697552cf286 Mon Sep 17 00:00:00 2001 From: knapior Date: Wed, 27 Jan 2021 20:43:20 +0100 Subject: [PATCH 375/424] [cda] Improve birth validation detection (closes #14022) (#27929) Co-authored-by: Sergey M --- youtube_dl/extractor/cda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index d67900e62..6429454fb 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -96,7 +96,7 @@ class CDAIE(InfoExtractor): raise ExtractorError('This video is only available for premium users.', expected=True) need_confirm_age = False - if self._html_search_regex(r'(]+action="/a/validatebirth")', + if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( url, video_id, note='Confirming age') From 0b4f03a56394dee070f6e1723af8ce3a9ce44bfb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 28 Jan 2021 16:22:24 +0100 Subject: [PATCH 376/424] [youtube] improve DASH formats file size extraction --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f57099f8c..b254ceced 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2370,7 +2370,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) + df['filesize'] = _extract_filesize(df.get('fragment_base_url') or df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df From 56a7ee903360fe89cb5372805473b508153fb856 Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 29 Jan 2021 16:02:18 +0800 Subject: [PATCH 377/424] [vlive] Fix error message decoding for python 2 (#28004) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 6224e6200..e2f5d81b8 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -116,7 +116,7 @@ class VLiveIE(VLiveBaseIE): headers={'Referer': 'https://www.vlive.tv/'}, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode())['message']) + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) raise def _real_extract(self, url): From ba15b2fee673f76c2cff2f193d5b4a19029b501f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Jan 2021 11:52:48 +0100 Subject: [PATCH 378/424] [googledrive] report download page errors(closes #28005) --- youtube_dl/extractor/googledrive.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index de8c80e36..3f2de00f1 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -7,6 +7,7 @@ from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, + get_element_by_class, int_or_none, lowercase_escape, try_get, @@ -237,7 +238,7 @@ class GoogleDriveIE(InfoExtractor): if confirmation_webpage: confirm = self._search_regex( r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', fatal=False) + 'confirmation code', default=None) if confirm: confirmed_source_url = update_url_query(source_url, { 'confirm': confirm, @@ -245,6 +246,11 @@ class GoogleDriveIE(InfoExtractor): urlh = request_source_file(confirmed_source_url, 'confirmed source') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) + else: + self.report_warning( + get_element_by_class('uc-error-subcaption', confirmation_webpage) + or get_element_by_class('uc-error-caption', confirmation_webpage) + or 'unable to extract confirmation code') if not formats and reason: raise ExtractorError(reason, expected=True) From a800838f5afad43a76a53bdb5f4c1b20c80ff202 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 29 Jan 2021 14:24:28 +0100 Subject: [PATCH 379/424] [vvvvid] add support for youtube embeds (#27825) --- youtube_dl/extractor/vvvvid.py | 35 ++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index f4cae7fe9..778ce8b76 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -47,6 +48,22 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -154,12 +171,13 @@ class VVVVIDIE(InfoExtractor): if season_number: info['season_number'] = int(season_number) - for quality in ('_sd', ''): + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): if video_type == 'video/kenc': kenc = self._download_json( @@ -172,19 +190,28 @@ class VVVVIDIE(InfoExtractor): if kenc_message: embed_code += '?' + ds(kenc_message) formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) - self._sort_formats(formats) + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, - 'formats': formats, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, From 8bf9591a70757c624a8ea5bf686040ed752246e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Sat, 30 Jan 2021 13:44:01 +0100 Subject: [PATCH 380/424] [AENetworks] update AENetworksShowIE test playlist id (#27851) --- youtube_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index a5d88ebbe..e55c03fd7 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -252,7 +252,7 @@ class AENetworksShowIE(AENetworksListBaseIE): _TESTS = [{ 'url': 'http://www.history.com/shows/ancient-aliens', 'info_dict': { - 'id': 'SH012427480000', + 'id': 'SERIES1574', 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, From 67299f23d8b1894120e875edf97440de87e22308 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 1 Feb 2021 14:30:59 +0100 Subject: [PATCH 381/424] [youtube] Rewrite Extractor - improve format sorting - remove unused code(swf parsing, ...) - fix series metadata extraction - fix trailer video extraction - improve error reporting - extract video location --- youtube_dl/extractor/common.py | 17 +- youtube_dl/extractor/youtube.py | 1717 +++++++++---------------------- 2 files changed, 483 insertions(+), 1251 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d5faa0eb7..8eb110f4e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2064,7 +2064,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', @@ -2078,10 +2078,9 @@ class InfoExtractor(object): mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) + mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2359,15 +2358,7 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. f['url'] = base_url - - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. There are numerous examples - # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111, - # https://github.com/ytdl-org/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) + formats.append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b254ceced..5f6769878 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,41 +2,34 @@ from __future__ import unicode_literals - import itertools import json import os.path import random import re -import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, compat_HTTPError, compat_parse_qs, - compat_urllib_parse_unquote, + compat_str, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, - compat_str, ) +from ..jsinterp import JSInterpreter from ..utils import ( - bool_or_none, - clean_html, - error_to_compat_str, ExtractorError, + clean_html, float_or_none, - get_element_by_id, int_or_none, mimetype2ext, parse_codecs, parse_duration, - remove_quotes, + qualities, remove_start, smuggle_url, str_or_none, @@ -46,7 +39,6 @@ from ..utils import ( unified_strdate, unsmuggle_url, update_url_query, - uppercase_escape, url_or_none, urlencode_postdata, urljoin, @@ -68,12 +60,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - def _ids_to_results(self, ids): return [ self.url_result(vid_id, 'Youtube', video_id=vid_id) @@ -265,7 +251,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _real_initialize(self): if self._downloader is None: return - self._set_language() if not self._login(): return @@ -282,19 +267,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', - r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False @@ -566,7 +443,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, - } + }, + 'skip': 'Private video', }, { 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', @@ -640,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, # Normal age-gate video (No vevo, embed allowed), available via embed page @@ -676,11 +554,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'Dada Life, deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'creator': 'deadmau5', + 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'This Machine Kills Some Chords', + 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -775,69 +653,64 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, { # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', + 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'id': 'jvGDaLqkpTg', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever', + 'description': 'md5:e03b909557865076822aa169218d6a5d', }, 'playlist': [{ 'info_dict': { - 'id': 'jqWvoWXjCVs', + 'id': 'jvGDaLqkpTg', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10643, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': '6h8e8xoXJzg', + 'id': '3AKt1R1aDnw', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10991, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': 'PUOgX5z9xZw', + 'id': 'RtAMM00gpVc', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10995, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': 'teuwxikvS5k', + 'id': '6N2fdlP3C5U', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10990, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }], 'params': { 'skip_download': True, }, - 'skip': 'This video is not available.', }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -931,7 +804,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'eQcmzGIKrzg', 'ext': 'mp4', 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'description': 'md5:13a2503d7b5904ef4b223aa101628f39', 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie Sanders', @@ -978,7 +851,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', + 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1013,6 +886,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.', }, { # itag 212 @@ -1098,6 +972,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Video unavailable', }, { # empty description results in an empty string @@ -1147,24 +1022,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) + self._code_cache = {} self._player_cache = {} - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -1177,40 +1037,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break else: raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') + return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) + func_id = 'js_%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( player_url, video_id, - note=download_note, + note='Downloading player ' + player_id, errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type + code = self._code_cache[player_id] + res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) @@ -1279,14 +1126,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" if player_url is None: @@ -1313,158 +1153,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, player_response, player_config): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not (player_response or player_config): - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config.get('args') if player_config else {} - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1531,289 +1223,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_chapters_from_json(self, webpage, video_id, duration): - if not webpage: - return - data = self._extract_yt_initial_data(video_id, webpage) - if not data or not isinstance(data, dict): - return - chapters_list = try_get( - data, - lambda x: x['playerOverlays'] - ['playerOverlayRenderer'] - ['decoratedPlayerBarRenderer'] - ['decoratedPlayerBarRenderer'] - ['playerBar'] - ['chapteredPlayerBarRenderer'] - ['chapters'], - list) - if not chapters_list: - return - - def chapter_time(chapter): - return float_or_none( - try_get( - chapter, - lambda x: x['chapterRenderer']['timeRangeStartMillis'], - int), - scale=1000) - chapters = [] - for next_num, chapter in enumerate(chapters_list, start=1): - start_time = chapter_time(chapter) - if start_time is None: - continue - end_time = (chapter_time(chapters_list[next_num]) - if next_num < len(chapters_list) else duration) - if end_time is None: - continue - title = try_get( - chapter, lambda x: x['chapterRenderer']['title']['simpleText'], - compat_str) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': title, - }) - return chapters - - @staticmethod - def _extract_chapters_from_description(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|)([^<]*]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)
    [^>]*)(?=$|)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r']+>[^<]+', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _extract_chapters(self, webpage, description, video_id, duration): - return (self._extract_chapters_from_json(webpage, video_id, duration) - or self._extract_chapters_from_description(description, duration)) + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + webpage = self._download_webpage(webpage_url, video_id, fatal=False) - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') + player_response = None + if webpage: + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, + video_id, 'initial player response') + if not player_response: + player_response = self._call_api( + 'player', {'videoId': video_id}, video_id) - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) + playability_status = player_response.get('playabilityStatus') or {} + if playability_status.get('reason') == 'Sign in to confirm your age': + pr = self._parse_json(try_get(compat_parse_qs( + self._download_webpage( + base_url + 'get_video_info', video_id, + 'Refetching age-gated info webpage', + 'unable to download video info webpage', query={ + 'video_id': video_id, + }, fatal=False)), + lambda x: x['player_response'][0], + compat_str) or '{}', video_id) + if pr: + player_response = pr - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) + trailer_video_id = try_get( + playability_status, + lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], + compat_str) + if trailer_video_id: + return self.url_result( + trailer_video_id, self.ie_key(), trailer_video_id) - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage, urlh = self._download_webpage_handle(url, video_id) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) - video_id = qs.get('v', [None])[0] or video_id - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: + def get_text(x): + if not x: return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response + return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) - player_response = {} - - # Get video info - video_info = {} - embed_webpage = None - ytplayer_config = None - - if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - - if not video_info and not player_response: - player_response = extract_player_response( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage, - 'initial player response', default='{}'), - video_id) - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info and not player_response: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - if not isinstance(video_info, dict): - video_info = {} - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} + search_meta = ( + lambda x: self._html_search_meta(x, webpage, default=None)) \ + if webpage else lambda x: None + video_details = player_response.get('videoDetails') or {} microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - ]*> - [^<]+\.{3}\s* - - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = video_details.get('shortDescription') - if video_description is None: - video_description = self._html_search_meta('description', video_webpage) + player_response, + lambda x: x['microformat']['playerMicroformatRenderer'], + dict) or {} + video_title = video_details.get('title') \ + or get_text(microformat.get('title')) \ + or search_meta(['og:title', 'twitter:title', 'title']) + video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] @@ -1821,10 +1298,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) + return try_get( + feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: @@ -1837,7 +1316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) @@ -1845,631 +1324,393 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) + return self.playlist_result( + entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) + formats = [] + itags = [] + player_url = None + q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + streaming_data = player_response.get('streamingData') or {} + streaming_formats = streaming_data.get('formats') or [] + streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') - if not cipher: + if not player_url: + if not webpage: continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage, 'player URL', fatal=False) + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + itag = str_or_none(fmt.get('itag')) + if itag: + itags.append(itag) + quality = fmt.get('quality') + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': fmt.get('qualityLabel') or quality, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + 'tbr': float_or_none(fmt.get( + 'averageBitrate') or fmt.get('bitrate'), 1000), + 'url': fmt_url, + 'width': fmt.get('width'), + } + mimetype = fmt.get('mimeType') + if mimetype: + mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) + if mobj: + dct['ext'] = mimetype2ext(mobj.group(1)) + dct.update(parse_codecs(mobj.group(2))) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + + hls_manifest_url = streaming_data.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats( + hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag: + f['format_id'] = itag + formats.append(f) + + if self._downloader.params.get('youtube_include_dash_manifest'): + dash_manifest_url = streaming_data.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats( + dash_manifest_url, video_id, fatal=False): + if f['format_id'] in itags: continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + formats.append(f) - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue + if not formats: + if streaming_data.get('licenseInfos'): + raise ExtractorError( + 'This video is DRM protected.', expected=True) + pemr = try_get( + playability_status, + lambda x: x['errorScreen']['playerErrorMessageRenderer'], + dict) or {} + reason = get_text(pemr.get('reason')) or playability_status.get('reason') + subreason = pemr.get('subreason') + if subreason: + subreason = clean_html(get_text(subreason)) + if subreason == 'The uploader has not made this video available in your country.': + countries = microformat.get('availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + subreason, countries) + reason += '\n' + subreason + if reason: + raise ExtractorError(reason, expected=True) - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) + self._sort_formats(formats) - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = ( - r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', - r'"jsUrl"\s*:\s*("[^"]+")', - r'"assets":.+?"js":\s*("[^"]+")') - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_desc = 'unknown' - else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - reason_list = try_get( - player_response, - lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], - list) or [] - for reason in reason_list: - if not isinstance(reason, dict): - continue - reason_text = try_get(reason, lambda x: x['text'], compat_str) - if reason_text: - if not error_message: - error_message = '' - error_message += reason_text - if error_message: - error_message = clean_html(error_message) - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + keywords = video_details.get('keywords') or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + w, h = keyword.split('=')[1].split(':') + w, h = int(w), int(h) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: - video_thumbnail = None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if thumbnail_url: - video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) - if video_thumbnail: - thumbnails.append({'url': video_thumbnail}) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = microformat.get('publishDate') or microformat.get('uploadDate') - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) + for container in (video_details, microformat): + for thumbnail in (try_get( + container, + lambda x: x['thumbnail']['thumbnails'], list) or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'height': int_or_none(thumbnail.get('height')), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + }) + if thumbnails: + break else: - video_alt_title = video_creator = None + thumbnail = search_meta(['og:image', 'twitter:image']) + if thumbnail: + thumbnails = [{'url': thumbnail}] - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) + category = microformat.get('category') or search_meta('genre') + channel_id = video_details.get('channelId') \ + or microformat.get('externalChannelId') \ + or search_meta('channelId') + duration = int_or_none( + video_details.get('lengthSeconds') + or microformat.get('lengthSeconds')) \ + or parse_duration(search_meta('duration')) + is_live = video_details.get('isLive') + owner_profile_url = microformat.get('ownerProfileUrl') - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') + info = { + 'id': video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_description, + 'upload_date': unified_strdate( + microformat.get('uploadDate') + or search_meta('uploadDate')), + 'uploader': video_details['author'], + 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, + 'duration': duration, + 'view_count': int_or_none( + video_details.get('viewCount') + or microformat.get('viewCount') + or search_meta('interactionCount')), + 'average_rating': float_or_none(video_details.get('averageRating')), + 'age_limit': 18 if ( + microformat.get('isFamilySafe') is False + or search_meta('isFamilyFriendly') == 'false' + or search_meta('og:restrictions:age') == '18+') else 0, + 'webpage_url': webpage_url, + 'categories': [category] if category else None, + 'tags': keywords, + 'is_live': is_live, + } + + pctr = try_get( + player_response, + lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) + if pctr: + def process_language(container, base_url, caption, query): + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + query.update({ + 'fmt': fmt, + }) + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, query), + }) + subtitles[caption['languageCode']] = lang_subs + + subtitles = {} + for caption_track in pctr['captionTracks']: + base_url = caption_track['baseUrl'] + if caption_track.get('kind') != 'asr': + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + }), + }) + subtitles[caption_track['languageCode']] = lang_subs + continue + automatic_captions = {} + for translation_language in pctr['translationLanguages']: + translation_language_code = translation_language['languageCode'] + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + 'tlang': translation_language_code, + }), + }) + automatic_captions[translation_language_code] = lang_subs + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + for k, v in query.items(): + for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: + d_k += '_time' + if d_k not in info and k in s_ks: + info[d_k] = parse_duration(query[k][0]) - # Youtube Music Auto-generated description - release_date = release_year = None if video_description: mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) release_year = mobj.group('release_year') release_date = mobj.group('release_date') if release_date: release_date = release_date.replace('-', '') if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) + release_year = release_date[:4] + info.update({ + 'album': mobj.group('album'.strip()), + 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), + 'track': mobj.group('track').strip(), + 'release_date': release_date, + 'release_year': int(release_year), + }) - yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) - contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - for content in contents: - rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] - multiple_songs = False - for row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True - break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = try_get( - mrr, lambda x: x['title']['simpleText'], compat_str) - mrr_contents = try_get( - mrr, lambda x: x['contents'][0], dict) or {} - mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) - if not (mrr_title and mrr_contents_text): + initial_data = None + if webpage: + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, + 'yt initial data') + if not initial_data: + initial_data = self._call_api( + 'next', {'videoId': video_id}, video_id, fatal=False) + + if initial_data: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - if mrr_title == 'License': - video_license = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - album = mrr_contents_text - elif mrr_title == 'Artist': - artist = mrr_contents_text - elif mrr_title == 'Song': - track = mrr_contents_text - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - category = None - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - if not category: - category = try_get( - microformat, lambda x: x['category'], compat_str) - video_categories = None if category is None else [category] + chapters = [] + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + info['chapters'] = chapters - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - if not video_tags: - video_tags = try_get(video_details, lambda x: x['keywords'], list) + contents = try_get( + initial_data, + lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], + list) or [] + for content in contents: + vpir = content.get('videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: + info.update({ + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), + }) + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = content.get('videoSecondaryInfoRenderer') + if vsir: + info['channel'] = get_text(try_get( + vsir, + lambda x: x['owner']['videoOwnerRenderer']['title'], + compat_str)) + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = get_text(mrr['title']) + mrr_contents_text = get_text(mrr['contents'][0]) + if mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text - def _extract_count(count_name): - return str_to_int(self._search_regex( - (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), - r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), - video_webpage, count_name, default=None)) + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + v = info.get(s_k) + if v: + info[d_k] = v - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') + self.mark_watched(video_id, player_response) - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = None - ytcfg = self._extract_ytcfg(video_id, video_webpage) - if ytcfg: - xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) - if not xsrf_token: - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = None - if ytcfg: - xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) - if not xsrf_field_name: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df.get('fragment_base_url') or df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnails': thumbnails, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } + return info class YoutubeTabIE(YoutubeBaseInfoExtractor): From 9c724601ba234085dc5071ec9c1c3d98e6834817 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:08:50 +0100 Subject: [PATCH 382/424] [youtube] remove description chapters tests video description no longer contain yt.www.watch.player.seekTo function --- test/test_youtube_chapters.py | 275 -------------------------------- youtube_dl/extractor/youtube.py | 91 ++++++++--- 2 files changed, 67 insertions(+), 299 deletions(-) delete mode 100644 test/test_youtube_chapters.py diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', - }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5f6769878..edaca0658 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1223,6 +1223,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id + def _extract_chapters_from_json(self, data, video_id, duration): + chapters_list = try_get( + data, + lambda x: x['playerOverlays'] + ['playerOverlayRenderer'] + ['decoratedPlayerBarRenderer'] + ['decoratedPlayerBarRenderer'] + ['playerBar'] + ['chapteredPlayerBarRenderer'] + ['chapters'], + list) + if not chapters_list: + return + + def chapter_time(chapter): + return float_or_none( + try_get( + chapter, + lambda x: x['chapterRenderer']['timeRangeStartMillis'], + int), + scale=1000) + chapters = [] + for next_num, chapter in enumerate(chapters_list, start=1): + start_time = chapter_time(chapter) + if start_time is None: + continue + end_time = (chapter_time(chapters_list[next_num]) + if next_num < len(chapters_list) else duration) + if end_time is None: + continue + title = try_get( + chapter, lambda x: x['chapterRenderer']['title']['simpleText'], + compat_str) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': title, + }) + return chapters + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): return self._parse_json(self._search_regex( (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), @@ -1597,31 +1637,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'next', {'videoId': video_id}, video_id, fatal=False) if initial_data: - for engagment_pannel in (initial_data.get('engagementPanels') or []): - contents = try_get( - engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], - list) - if not contents: - continue - - def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) - - chapters = [] - for next_num, content in enumerate(contents, start=1): - mmlir = content.get('macroMarkersListItemRenderer') or {} - start_time = chapter_time(mmlir) - end_time = chapter_time(try_get( - contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ - if next_num < len(contents) else duration - if not (start_time and end_time): + chapters = self._extract_chapters_from_json( + initial_data, video_id, duration) + if not chapters: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': get_text(mmlir.get('title')), - }) + + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) + + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + if chapters: info['chapters'] = chapters contents = try_get( From b46483a6ec6a42889fc16d53afd76d147748785f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:35:07 +0100 Subject: [PATCH 383/424] [youtube/test_youtube_signature] fix test --- test/test_youtube_signature.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..b5a4d0d5f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -86,13 +86,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) From 159a3d48dfb2b4ed77dc691433e420506c9340c3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:36:19 +0100 Subject: [PATCH 384/424] [youtube] keep _formats array for format sorting tests --- youtube_dl/extractor/youtube.py | 105 ++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index edaca0658..ed844e2a3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,111 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, ] + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + } def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) From efef4ddf51c375c3a9eb12355a61a21d69aec33f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:49:52 +0100 Subject: [PATCH 385/424] [youtube] fix chapter extraction fallback --- youtube_dl/extractor/youtube.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ed844e2a3..65fa777e4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1753,22 +1753,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) + return parse_duration( + get_text(mmlir.get('timeDescription'))) + chapters = [] for next_num, content in enumerate(contents, start=1): mmlir = content.get('macroMarkersListItemRenderer') or {} start_time = chapter_time(mmlir) end_time = chapter_time(try_get( contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ if next_num < len(contents) else duration - if not (start_time and end_time): + if start_time is None or end_time is None: continue chapters.append({ 'start_time': start_time, 'end_time': end_time, 'title': get_text(mmlir.get('title')), }) + if chapters: + break if chapters: info['chapters'] = chapters From 65eee5a745f705a7904709accdba47efb852cc6a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 18:12:35 +0100 Subject: [PATCH 386/424] [youtube] improve subtitle extraction --- youtube_dl/extractor/youtube.py | 43 ++++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 65fa777e4..75a007353 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1664,7 +1664,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response, lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) if pctr: - def process_language(container, base_url, caption, query): + def process_language(container, base_url, lang_code, query): lang_subs = [] for fmt in self._SUBTITLE_FORMATS: query.update({ @@ -1674,35 +1674,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': fmt, 'url': update_url_query(base_url, query), }) - subtitles[caption['languageCode']] = lang_subs + container[lang_code] = lang_subs subtitles = {} - for caption_track in pctr['captionTracks']: - base_url = caption_track['baseUrl'] + for caption_track in (pctr.get('captionTracks') or []): + base_url = caption_track.get('baseUrl') + if not base_url: + continue if caption_track.get('kind') != 'asr': - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - }), - }) - subtitles[caption_track['languageCode']] = lang_subs + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, {}) continue automatic_captions = {} - for translation_language in pctr['translationLanguages']: - translation_language_code = translation_language['languageCode'] - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - 'tlang': translation_language_code, - }), - }) - automatic_captions[translation_language_code] = lang_subs + for translation_language in (pctr.get('translationLanguages') or []): + translation_language_code = translation_language.get('languageCode') + if not translation_language_code: + continue + process_language( + automatic_captions, base_url, translation_language_code, + {'tlang': translation_language_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles From 8fa7cc387d699899114f7430bcf61837d58557a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 21:35:18 +0100 Subject: [PATCH 387/424] [vidio] improve metadata extraction --- youtube_dl/extractor/vidio.py | 86 ++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r'<span[^>]+data-comment-vote-count=["\'](\d+)', - r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } From c11f7cf9bd6ef239f25e7fb9c54e092ae1490e2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 22:35:28 +0100 Subject: [PATCH 388/424] [vidzi] remove extractor(closes #12629) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/vidzi.py | 68 ------------------------------ 2 files changed, 69 deletions(-) delete mode 100644 youtube_dl/extractor/vidzi.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab8d6a5a5..97b0b4034 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1399,7 +1399,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py deleted file mode 100644 index 42ea4952c..000000000 --- a/youtube_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict From 0e3a9684795c6c53546dace9e917ed11c4ae72a5 Mon Sep 17 00:00:00 2001 From: Viren Rajput <virendra.rajput567@gmail.com> Date: Mon, 1 Feb 2021 04:56:33 +0000 Subject: [PATCH 389/424] [egghead] update API domain(closes #28038) --- youtube_dl/extractor/egghead.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index df11dc206..94dd75b9b 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': ['free', 'javascript'], }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] From b111a64135244b73b86a1720e9a5212e726afcbf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 2 Feb 2021 19:05:37 +0100 Subject: [PATCH 390/424] [egghead] fix typo --- youtube_dl/extractor/egghead.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index 94dd75b9b..aff9b88c0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -16,7 +16,7 @@ class EggheadBaseIE(InfoExtractor): def _call_api(self, path, video_id, resource, fatal=True): return self._download_json( 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource) + video_id, 'Downloading %s JSON' % resource, fatal=fatal) class EggheadCourseIE(EggheadBaseIE): @@ -79,7 +79,7 @@ class EggheadLessonIE(EggheadBaseIE): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['free', 'javascript'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, From 1e2575df8714ce9056e559058a187ec0ffd2d739 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 00:21:46 +0100 Subject: [PATCH 391/424] Credit @adrianheine for #27732 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file From 07f7aad81c47a11483a357e53380fae1ffbadea9 Mon Sep 17 00:00:00 2001 From: Guillem Vela <guillemglez@gmail.com> Date: Thu, 27 Feb 2020 22:18:47 +0100 Subject: [PATCH 392/424] [ccma] improve metadata extraction(closes #27994) - extract age_limit, alt_title, categories, series and episode_number - fix timestamp multiple subtitles extraction --- youtube_dl/extractor/ccma.py | 65 ++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 544647f92..4db51e650 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re from .common import InfoExtractor @@ -8,8 +9,8 @@ from ..utils import ( clean_html, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +25,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +37,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +90,27 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timestamp = datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +125,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +144,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } From ab25f3f43196ca56964ba34ba4674fcb2d08f69a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 17:15:31 +0100 Subject: [PATCH 393/424] [youtube] pass embed URL to get_video_info request --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75a007353..42b0f452c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1397,6 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, + 'eurl': 'https://www.youtube.com/embed/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 1b731ebcaa3ef2a1e52cf6968cf93e08d50fe0d4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 18:13:17 +0100 Subject: [PATCH 394/424] [bravotv] add support for oxygen.com(closes #13357)(closes #22500) --- youtube_dl/extractor/bravotv.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] From 83031d749b11f062b9ba97023c228329e771cbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 00:25:26 +0700 Subject: [PATCH 395/424] [pornhub:user] Add support for URLs unavailable via /videos page and improve paging (closes #27853) --- youtube_dl/extractor/pornhub.py | 56 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..67e3731c8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, url_or_none, ) @@ -405,6 +406,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -463,14 +468,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -488,17 +506,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num): + note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) From e22ff4e35681a600ed61918beab8ed316728ec39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:09:11 +0700 Subject: [PATCH 396/424] [pornhub] Add support for authentication (closes #18797, closes #21416, closes #24294) --- youtube_dl/extractor/pornhub.py | 106 +++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 67e3731c8..83307a233 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -23,6 +23,7 @@ from ..utils import ( remove_quotes, str_to_int, update_url_query, + urlencode_postdata, url_or_none, ) @@ -53,6 +54,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -164,12 +225,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -181,12 +250,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -427,26 +491,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -506,12 +550,14 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') + self._login(host) + page = self._extract_page(url) VIDEOS = '/videos' - def download_page(base_url, num): - note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) @@ -532,7 +578,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num) + webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: From 1f0910bc2742b16be8425841d5ed6a0fd96f82a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:17:45 +0700 Subject: [PATCH 397/424] [svtplay] Fix video id extraction (closes #28058) --- youtube_dl/extractor/svt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..4acc29fce 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -255,8 +255,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From 2adc0c51cdf38e039fba0ede11f65bbd9c71bde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:20:09 +0700 Subject: [PATCH 398/424] [pornhub] Add placeholder netrc machine --- youtube_dl/extractor/pornhub.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83307a233..83773aebb 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -29,6 +29,8 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) From 89c5a7d5aabd138a14c76453d79d5d66ef573bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:36:57 +0700 Subject: [PATCH 399/424] [pornhub] Implement lazy playlist extraction --- youtube_dl/extractor/pornhub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83773aebb..b7631e4e1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -547,13 +547,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): <button[^>]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - + def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' @@ -566,7 +560,6 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - entries = [] base_url = url has_page = page is not None first_page = page if has_page else 1 @@ -590,11 +583,19 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): From 3c07d007ca5376719a0cfe6b9c6627b38cbd3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:47:30 +0700 Subject: [PATCH 400/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7f2e0aad1..bd753d524 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,45 @@ +version <unreleased> + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + version 2021.01.24.1 Core From cfefb7d854f87e02c971170fcfa08f3ff2cb1bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:49:25 +0700 Subject: [PATCH 401/424] release 2021.02.04 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2dde97a2c..86e48bc4e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c520d1ee0..fa369b744 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 4aacd3bdc..806c7c58d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 91bbed506..1d1a36dda 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a0a2c989a..c19052a7a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index bd753d524..d5d9c00a2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04 Extractors * [pornhub] Implement lazy playlist extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13bac6e27..e1b85b1d1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -537,6 +537,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -1058,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1103,6 +1103,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c52f1d9ca..d898525c9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24.1' +__version__ = '2021.02.04' From fc88e8f0e3e66f17f787cbc1ea45c87fdc70781e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Thu, 4 Feb 2021 00:57:56 +0100 Subject: [PATCH 402/424] [azmedien] Fix extraction (#28064) --- youtube_dl/extractor/azmedien.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): From 7215691ab7cabc858b17c16928c372da3e35ec59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:07:43 +0700 Subject: [PATCH 403/424] [youtube] Prefer DASH formats (closes #28070) --- youtube_dl/extractor/youtube.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 42b0f452c..a3b10c094 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1549,16 +1549,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: + dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): - if f['format_id'] in itags: - continue filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - formats.append(f) + dash_formats.append(f) + # Until further investigation prefer DASH formats as non-DASH + # may not be available (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 + if dash_formats: + dash_formats_keys = [f['format_id'] for f in dash_formats] + formats = [f for f in formats if f['format_id'] not in dash_formats_keys] + formats.extend(dash_formats) if not formats: if streaming_data.get('licenseInfos'): From c7d407bca205d8eb248b94b611435187265b79da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:09:28 +0700 Subject: [PATCH 404/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5d9c00a2..4392a4e6f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version <unreleased> + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + version 2021.02.04 Extractors From a4bdc3112bf0e925afc2e512d5f23f9097f6bc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:11:33 +0700 Subject: [PATCH 405/424] release 2021.02.04.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 86e48bc4e..19b750f86 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index fa369b744..8acb80b60 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 806c7c58d..66edcf752 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 1d1a36dda..18203fb34 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c19052a7a..20df40cc5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4392a4e6f..784b73d8d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04.1 Extractors * [youtube] Prefer DASH formats (#28070) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d898525c9..425f15589 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04' +__version__ = '2021.02.04.1' From 1641b132323b544b9ae0dad06707425eba1f926b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 4 Feb 2021 13:05:35 +0100 Subject: [PATCH 406/424] [youtube] skip OTF formats(#28070) --- youtube_dl/extractor/youtube.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3b10c094..eb5f70763 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1477,6 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] + itag_qualities = {} player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} @@ -1486,6 +1487,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue + fmt_url = fmt.get('url') if not fmt_url: sc = compat_parse_qs(fmt.get('signatureCipher')) @@ -1505,10 +1516,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature - itag = str_or_none(fmt.get('itag')) if itag: itags.append(itag) - quality = fmt.get('quality') dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1549,22 +1558,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: - dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: + continue + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - dash_formats.append(f) - # Until further investigation prefer DASH formats as non-DASH - # may not be available (see [1]) - # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 - if dash_formats: - dash_formats_keys = [f['format_id'] for f in dash_formats] - formats = [f for f in formats if f['format_id'] not in dash_formats_keys] - formats.extend(dash_formats) + formats.append(f) if not formats: if streaming_data.get('licenseInfos'): From 0156ce95c5ba83de6c68a149d352ccecd983a294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 01:54:46 +0700 Subject: [PATCH 407/424] [youtube] Extract abr and vbr (closes #28100) --- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5f70763..b5e0f4eaa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -500,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', + 'abr': 129.495, }, 'params': { 'youtube_include_dash_manifest': True, @@ -1518,6 +1519,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if itag: itags.append(itag) + tbr = float_or_none( + fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1526,8 +1529,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fps': int_or_none(fmt.get('fps')), 'height': int_or_none(fmt.get('height')), 'quality': q(quality), - 'tbr': float_or_none(fmt.get( - 'averageBitrate') or fmt.get('bitrate'), 1000), + 'tbr': tbr, 'url': fmt_url, 'width': fmt.get('width'), } @@ -1538,7 +1540,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mobj: dct['ext'] = mimetype2ext(mobj.group(1)) dct.update(parse_codecs(mobj.group(2))) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + no_audio = dct.get('acodec') == 'none' + no_video = dct.get('vcodec') == 'none' + if no_audio: + dct['vbr'] = tbr + if no_video: + dct['abr'] = tbr + if no_audio or no_video: dct['downloader_options'] = { # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, From 0cf09c2b4168cb99800836d8c1ff0d6d8b16fb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 02:17:03 +0700 Subject: [PATCH 408/424] [youtube] Fix release date extraction (closes #28094) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b5e0f4eaa..c87e54e6b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', 'only_matching': True, }, + { + # https://github.com/ytdl-org/youtube-dl/pull/28094 + 'url': 'OtqTfy26tG0', + 'info_dict': { + 'id': 'OtqTfy26tG0', + 'ext': 'mp4', + 'title': 'Burn Out', + 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', + 'upload_date': '20141120', + 'uploader': 'The Cinematic Orchestra - Topic', + 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'artist': 'The Cinematic Orchestra', + 'track': 'Burn Out', + 'album': 'Every Day', + 'release_data': None, + 'release_year': None, + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1743,7 +1765,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), 'track': mobj.group('track').strip(), 'release_date': release_date, - 'release_year': int(release_year), + 'release_year': int_or_none(release_year), }) initial_data = None From 240585470539d31d9c3785a67861491fa3696451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Sat, 6 Feb 2021 20:46:05 +0100 Subject: [PATCH 409/424] [urplay] Fix extraction (closes #28073) (#28074) --- youtube_dl/extractor/urplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 10b817760..5452c7ca1 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -42,8 +42,8 @@ class URPlayIE(InfoExtractor): url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) urplayer_data = self._parse_json(self._html_search_regex( - r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['currentProduct'] + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From 7a9161578e42abe681c9d3352ecc9a18a9b8df6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 19:18:06 +0700 Subject: [PATCH 410/424] [cda] Detect geo restricted videos (refs #28106) --- youtube_dl/extractor/cda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 6429454fb..1b4362144 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -95,6 +95,9 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): + self.raise_geo_restricted() + need_confirm_age = False if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): From 5fc53690cbe6abb11941a3f4846b566a7472753e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 20:34:41 +0700 Subject: [PATCH 411/424] [archiveorg] Fix and improve extraction (closes #21330, closes #23586, closes #25277, closes #26780, closes #27109, closes #27236, closes #28063) --- youtube_dl/extractor/archiveorg.py | 54 +++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info From 99c68db0a8adc634e2e928ea2756a2ceee3ae863 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 8 Feb 2021 09:20:28 +0100 Subject: [PATCH 412/424] [youtube] add support phone/tablet JS player(closes #26424) --- test/test_youtube_signature.py | 31 ++++++++++--------------------- youtube_dl/extractor/youtube.py | 6 +++++- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index b5a4d0d5f..627d4cb92 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -19,55 +19,46 @@ from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 'js', 83, '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) @@ -78,6 +69,10 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -100,13 +95,13 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_input, expected_sig): +def make_tfunc(url, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): - basename = 'player-%s.%s' % (test_id, stype) + basename = 'player-%s.js' % test_id fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): @@ -114,22 +109,16 @@ def make_tfunc(url, stype, sig_input, expected_sig): ydl = FakeYDL() ie = YoutubeIE(ydl) - if stype == 'js': - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - else: - assert stype == 'swf' - with open(fn, 'rb') as testf: - swfcode = testf.read() - func = ie._parse_sig_swf(swfcode) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) - test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + test_func.__name__ = str('test_signature_js_' + test_id) setattr(TestSignature, test_func.__name__, test_func) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c87e54e6b..346311d9b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -398,7 +398,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _PLAYER_INFO_RE = ( - r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$', + r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', + r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') @@ -1237,6 +1238,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns From 311ebdd9a57e72116136a464fbc0fa8cad32db42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 8 Feb 2021 15:46:32 +0700 Subject: [PATCH 413/424] [xhamster] Extract formats from xplayer settings and extract filesizes (closes #28114) --- youtube_dl/extractor/xhamster.py | 80 +++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 76aeaf9a4..f73b9778f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): From 7f8b8bc418b8831ea1c2ae8de64e3bf0e8b707f8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 8 Feb 2021 15:56:42 +0100 Subject: [PATCH 414/424] [ign] fix extraction(closes #24771) --- youtube_dl/extractor/extractors.py | 4 +- youtube_dl/extractor/ign.py | 371 +++++++++++++++-------------- 2 files changed, 200 insertions(+), 175 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 97b0b4034..84998316c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -470,8 +470,8 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, ) from .iheart import ( IHeartRadioIE, diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index a96ea8010..0d9f50ed2 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,230 +3,255 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + HEADRequest, + determine_ext, int_or_none, parse_iso8601, + strip_or_none, + try_get, ) -class IGNIE(InfoExtractor): +class IGNBaseIE(InfoExtractor): + def _call_api(self, slug): + return self._download_json( + 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + + +class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r'<object id="vid_(.+?)"', - r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', - r'videoId"\s*:\s*"(.+?)"', - r'videoId["\']\s*:\s*["\']([^"\']+?)["\']', - ] - return self._search_regex(res_id, webpage, 'video id', default=None) + _TESTS = [{ + 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + 'md5': 'd2e1586d9987d40fad7867bf96a018ea', + 'info_dict': { + 'id': '8f862beef863986b2785559b9e1aa599', + 'ext': 'mp4', + 'title': 'The Last of Us Review', + 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', + 'tags': 'count:9', + } + }, { + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:817a20299de610bd56f13175386da6fa', + 'timestamp': 1420571160, + 'upload_date': '20150106', + 'tags': 'count:4', + } + }, { + 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name_or_id = mobj.group('name_or_id') - page_type = mobj.group('type') - webpage = self._download_webpage(url, name_or_id) - if page_type != 'video': - multiple_urls = re.findall( - r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' - IE_NAME = 'pcmag' - - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' - +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) From a4c7ed6b1e9100be8ef65c44e7e6e43b9314ff5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:28:58 +0700 Subject: [PATCH 415/424] [youtube:tab] Improve grid continuation extraction (closes #28130) --- youtube_dl/extractor/youtube.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 346311d9b..c78996629 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2374,9 +2374,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation - contents = renderer.get('contents') - if not isinstance(contents, list): - return + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: if not isinstance(content, dict): continue @@ -2509,6 +2509,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue + renderer = continuation_item.get('gridVideoRenderer') + if renderer: + grid_renderer = {'items': continuation_items} + for entry in self._grid_entries(grid_renderer): + yield entry + continuation = self._extract_continuation(grid_renderer) + continue renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} From cd493c5adcb526cdfa2a9d5194269b671a0dc343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:32:25 +0700 Subject: [PATCH 416/424] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index 784b73d8d..5951372b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Extractors +* [youtube:tab] Improve grid continuation extraction (#28130) +* [ign] Fix extraction (#24771) ++ [xhamster] Extract format filesize ++ [xhamster] Extract formats from xplayer settings (#28114) ++ [youtube] Add support phone/tablet JS player (#26424) +* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, + #27109, #27236, #28063) ++ [cda] Detect geo restricted videos (#28106) +* [urplay] Fix extraction (#28073, #28074) +* [youtube] Fix release date extraction (#28094) ++ [youtube] Extract abr and vbr (#28100) +* [youtube] Skip OTF formats (#28070) + + version 2021.02.04.1 Extractors From 360d5f0daac879a1371c6a45e0d3310ced60e352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:34:47 +0700 Subject: [PATCH 417/424] release 2021.02.10 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 19b750f86..ea0a59dca 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8acb80b60..d24855c72 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 66edcf752..8b96a2883 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 18203fb34..e46971047 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20df40cc5..a9ca379ca 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 5951372b3..384bd19c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.10 Extractors * [youtube:tab] Improve grid continuation extraction (#28130) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1b85b1d1..1373cc4f6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,6 +1,5 @@ # Supported sites - **1tv**: Первый канал - - **1up.com** - **20min** - **220.ro** - **23video** @@ -376,6 +375,8 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IGNArticle** + - **IGNVideo** - **IHeartRadio** - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers @@ -676,7 +677,6 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **pcmag** - **PearVideo** - **PeerTube** - **People** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 425f15589..79d2be625 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04.1' +__version__ = '2021.02.10' From f28f1b4d6ed053f1dbfbc7fc992162c4ea4f2ce7 Mon Sep 17 00:00:00 2001 From: Kevin Velghe <kevin@paretje.be> Date: Thu, 11 Feb 2021 09:04:16 +0100 Subject: [PATCH 418/424] [canvas] Add new extractor for Dagelijkse Kost (#28119) --- youtube_dl/extractor/canvas.py | 56 ++++++++++++++++++++++++++++-- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8b76a0200..eefbab241 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,19 +7,21 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE): 'display_id': display_id, 'season_number': int_or_none(page.get('episode_season')), }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 84998316c..e4c475fd8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -163,6 +163,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, From f94d76499362017f673520286bc3848916735275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Feb 2021 05:03:15 +0700 Subject: [PATCH 419/424] [ard] Improve formats extraction (closes #28155) --- youtube_dl/extractor/ard.py | 44 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf5b3f13..143fc51e9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) From 6d32c6c6d3b0588b193eaeb4178592219c3b4df8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 14 Feb 2021 16:22:45 +0100 Subject: [PATCH 420/424] [xboxclips] fix extraction(closes #27151) --- youtube_dl/extractor/xboxclips.py | 45 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - video_url = self._html_search_regex( - r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'<title>XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info From d8085580f63ad3b146a31712ff76cf41d5a4558a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 19:48:26 +0100 Subject: [PATCH 421/424] [kakao] improve info extraction and detect geo restriction(closes #26577) --- youtube_dl/extractor/kakao.py | 64 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), From be2e9b76eea73b073f00871ea831ee3f4a1000b3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 22:10:06 +0100 Subject: [PATCH 422/424] [videopress] add support for video.wordpress.com --- youtube_dl/extractor/videopress.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index e5f964d39..6376ff096 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r']+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) From 4b5410c5c841b826965ea76d62bea82a26a9e1b8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Feb 2021 13:06:54 +0100 Subject: [PATCH 423/424] [ccma] fix timestamp parsing in python 2 --- youtube_dl/extractor/ccma.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 4db51e650..e6ae49352 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, parse_resolution, @@ -97,8 +99,9 @@ class CCMAIE(InfoExtractor): timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) try: - timestamp = datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) except TypeError: pass From 07eb8f19169c58bce0e784607ea350ae16ed5363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Feb 2021 05:57:53 +0700 Subject: [PATCH 424/424] [youtube] Fix controversial videos when authenticated with cookies (closes #28174) --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c78996629..7db4503e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1042,6 +1042,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # controversial video, only works with bpctr when authenticated with cookies + 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', + 'only_matching': True, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1405,7 +1410,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = self._match_id(url) base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id - webpage = self._download_webpage(webpage_url, video_id, fatal=False) + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999', video_id, fatal=False) player_response = None if webpage: