From 06159135ef148a6ddc632d0c89b90c937d5bb021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 00:07:32 +0700 Subject: [PATCH 01/41] [youtube] Improve URL to extractor routing (closes #27572, closes #28335, closes #28742) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 79e47c919..4d7f3f837 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,6 +46,10 @@ from ..utils import ( ) +def parse_qs(url): + return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -413,16 +417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" % { - 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, 'invidious': '|'.join(_INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( @@ -1208,6 +1205,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + @classmethod + def suitable(cls, url): + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) + def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} @@ -2275,6 +2279,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '#cctv9', }, 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, }] @classmethod @@ -2764,7 +2771,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # Handle both video/playlist URLs - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] if video_id and playlist_id: @@ -2860,12 +2867,16 @@ class YoutubePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) + if YoutubeTabIE.suitable(url): + return False + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) if not qs: qs = {'list': playlist_id} return self.url_result( From 79e4ccfc4b395127bb3e5e957b20b04e75cba355 Mon Sep 17 00:00:00 2001 From: quyleanh Date: Sat, 17 Apr 2021 00:30:10 +0700 Subject: [PATCH 02/41] [pluralsight] Extend anti-throttling timeout (#28712) --- youtube_dl/extractor/pluralsight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index abd08bc28..2d63855df 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -393,7 +393,7 @@ query viewClip { # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. self._sleep( - random.randint(2, 5), display_id, + random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: From d01e261a15abd779decae6e0858d8586f7a71621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20=C3=81vila?= Date: Fri, 16 Apr 2021 14:31:34 -0300 Subject: [PATCH 03/41] [youtube] Add more invidious instances (#28706) --- youtube_dl/extractor/youtube.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4d7f3f837..7fa9b473a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -359,21 +359,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', r'(?:www\.)?yewtu\.be', r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', From ea87ed8394127c4bf824688b8780eaf5a804e7a3 Mon Sep 17 00:00:00 2001 From: zraktvor <=> Date: Sat, 10 Apr 2021 15:11:35 +0200 Subject: [PATCH 04/41] [youtube:tab] Detect series playlist on playlists page (closes #28723) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fa9b473a..581687d96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2019,6 +2019,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, + }, { + # playlists, series + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', @@ -2311,7 +2320,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel'): + for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): renderer = item.get('grid%sRenderer' % item_kind) if renderer: return renderer @@ -2344,6 +2353,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) + # show + if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice + show_playlist_url = try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + if show_playlist_url: + playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, + 'playlist id', default=None) + if playlist_id: + title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + "https://www.youtube.com/playlist?list=%s" % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 7c5239547928079513b65f62e4c84aea21ce76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 01:05:44 +0700 Subject: [PATCH 05/41] [youtube:tab] Improve grid extraction (closes #28725) --- youtube_dl/extractor/youtube.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 581687d96..b6945570f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2320,10 +2320,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): - renderer = item.get('grid%sRenderer' % item_kind) - if renderer: - return renderer + assert isinstance(item, dict) + for key, renderer in item.items(): + if not key.startswith('grid') or not key.endswith('Renderer'): + continue + if not isinstance(renderer, dict): + continue + return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: @@ -2333,7 +2336,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not isinstance(renderer, dict): continue title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2341,10 +2345,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + continue # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) + continue # channel channel_id = renderer.get('channelId') if channel_id: @@ -2353,19 +2359,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - # show - if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice - show_playlist_url = try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) - if show_playlist_url: - playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, - 'playlist id', default=None) - if playlist_id: - title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if ep_url: + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): yield self.url_result( - "https://www.youtube.com/playlist?list=%s" % playlist_id, - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 54558e0baa4d62a94af105cd1d7f8abcbd16b468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 02:27:54 +0700 Subject: [PATCH 06/41] [youtube] Improve stretch extraction and fix stretched ratio calculation (closes #28769) --- youtube_dl/extractor/youtube.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b6945570f..75751d5a6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -812,6 +812,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This video does not exist.', }, + { + # Video with incomplete 'yt:stretch=16:' + 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', + 'only_matching': True, + }, { # Video licensed under Creative Commons 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', @@ -1717,13 +1722,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for keyword in keywords: if keyword.startswith('yt:stretch='): - w, h = keyword.split('=')[1].split(':') - w, h = int(w), int(h) - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break thumbnails = [] for container in (video_details, microformat): From a00a7e0cad3308d999599bf17df5d3e6aba502d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:22:13 +0700 Subject: [PATCH 07/41] [utils] Add support for support for experimental HTTP response status code 308 Permanent Redirect (refs #27877, refs #28768) --- youtube_dl/utils.py | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9..538cc2b63 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ import zlib from .compat import ( compat_HTMLParseError, compat_HTMLParser, + compat_HTTPError, compat_basestring, compat_chr, compat_cookiejar, @@ -2879,12 +2880,61 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - if sys.version_info[0] < 3: - def redirect_request(self, req, fp, code, msg, headers, newurl): - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request(newurl, + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From 30a3a4c70fdcad10ef1dc6c3402457a95fe1ae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:23:47 +0700 Subject: [PATCH 08/41] [lbry] Add support for HLS videos (closes #27877, closes #28768) --- youtube_dl/extractor/lbry.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index ae43d56ea..cfd6b8393 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -120,6 +120,26 @@ class LBRYIE(LBRYBaseIE): 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -163,10 +183,18 @@ class LBRYIE(LBRYBaseIE): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url info.update({ 'id': claim_id, 'title': title, - 'url': streaming_url, }) return info From cfee2dfe83c5593d46bd0c8e8ce6a3d8c6e42db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:32:04 +0700 Subject: [PATCH 09/41] [utils] PEP 8 --- youtube_dl/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 538cc2b63..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2908,7 +2908,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ m = req.get_method() if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): + or code in (301, 302, 303) and m == "POST")): raise compat_HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation @@ -2930,11 +2930,10 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): CONTENT_HEADERS = ("content-length", "content-type") # NB: don't use dict comprehension for python 2.6 compatibility newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request(newurl, - headers=newheaders, - origin_req_host=req.origin_req_host, - unverifiable=True) + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From f20b505b46e6654464635ca4afb0f37e1f14c57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:47:00 +0700 Subject: [PATCH 10/41] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22b4fa67d..c249412e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ [utils] Add support for experimental HTTP response status code + 308 Permanent Redirect (#27877, #28768) + +Extractors ++ [lbry] Add support for HLS videos (#27877, #28768) +* [youtube] Fix stretched ratio calculation +* [youtube] Improve stretch extraction (#28769) +* [youtube:tab] Improve grid extraction (#28725) ++ [youtube:tab] Detect series playlist on playlists page (#28723) ++ [youtube] Add more invidious instances (#28706) +* [pluralsight] Extend anti-throttling timeout (#28712) +* [youtube] Improve URL to extractor routing (#27572, #28335, #28742) ++ [maoritv] Add support for maoritelevision.com (#24552) ++ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with + continuation requests (#28702) +* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) ++ [pornhub] Extract DASH and HLS formats from get_media end point (#28698) +* [cbssports] Fix extraction (#28682) +* [jamendo] Fix track extraction (#28686) +* [curiositystream] Fix format extraction (#26845, #28668) + + version 2021.04.07 Core From 596b26606cfe20aa9f776ac0658b6bfb1ea95397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:50:09 +0700 Subject: [PATCH 11/41] release 2021.04.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index febbd2344..e2a89d5c2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d7296d0a9..4d7abd775 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 92e616a1a..d8dce6fd4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b55739f6c..d95ee291a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbdb8356a..ac5dd2f27 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c249412e2..45d5c2ebf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.17 Core + [utils] Add support for experimental HTTP response status code diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ff9177a2c..a23da1a31 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,6 +3,7 @@ - **20min** - **220.ro** - **23video** + - **247sports** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -160,7 +161,8 @@ - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - - **CBSSports** + - **cbssports** + - **cbssports:embed** - **CCMA** - **CCTV**: 央视网 - **CDA** @@ -490,6 +492,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **MaoriTV** - **Markiza** - **MarkizaPage** - **massengeschmack.tv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a6b1b8dce..2b041d593 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.07' +__version__ = '2021.04.17' From 9f6c03a00602eb1119e43a522cf50682f6d6a6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 05:05:31 +0700 Subject: [PATCH 12/41] [cbsnews] Fix extraction for python <3.6 (closes #23359) --- youtube_dl/extractor/cbsnews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') From 41920fc80e4fe4a8996aeb31a04826a5a2534814 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 20 Apr 2021 20:51:55 +0100 Subject: [PATCH 13/41] [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) --- youtube_dl/extractor/bbc.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e8d000bbb..71ea25881 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -25,8 +26,10 @@ from ..utils import ( js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = try_get( + media, + lambda x: x['summary']['blocks'][0]['model']['text'], + compat_str) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') From dab83a25972e0dbcc69583bf78d2a992f581563d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:00:56 +0700 Subject: [PATCH 14/41] [bbc] Extract full description from __INITIAL_DATA__ (refs #28774) --- youtube_dl/extractor/bbc.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 71ea25881..247d982ce 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -770,7 +770,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xzws1', 'ext': 'mp4', 'title': "Pluto may have 'nitrogen glaciers'", - 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', @@ -1176,10 +1176,16 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) - item_desc = try_get( - media, - lambda x: x['summary']['blocks'][0]['model']['text'], - compat_str) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) item_time = None for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: if try_get(meta, lambda x: x['label']) == 'Published': From 32290307a45260885b2210aa3c2a57e64abf8c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:56:04 +0700 Subject: [PATCH 15/41] [youtube] Fix lazy extractors (closes #28780) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75751d5a6..c16dc7ab8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1219,6 +1219,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: return False @@ -2910,6 +2913,9 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False From 5ad69d3d0e7d1d8d15a1f2497d602b1b91fcc74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:45:13 +0700 Subject: [PATCH 16/41] [test_youtube_misc] Move YoutubeIE.extract_id test into separate module --- test/test_all_urls.py | 9 --------- test/test_youtube_misc.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 test/test_youtube_misc.py diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..365b66bad 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -70,15 +70,6 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..e18e71101 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() From c4a451bcdd2b743fdb96fcbae261c86ed91022ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:47:29 +0700 Subject: [PATCH 17/41] [test_execution] Add test for lazy extractors (refs #28780) --- test/test_execution.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_execution.py b/test/test_execution.py index 11661bb68..32948d93e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('youtube_dl/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() From ac19c3ac8035fbf9369fd4bd336c9045d4eeafa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:35:39 +0700 Subject: [PATCH 18/41] [go] Improve video id extraction (closes #25207, closes #25216, closes #26058) --- youtube_dl/extractor/go.py | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 0d731e90a..878ba14e6 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, + try_get, urlencode_postdata, ExtractorError, ) @@ -116,6 +118,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -149,14 +163,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From 7e8b3f9439ebefb3a3a4e5da9c0bd2b595976438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:37:51 +0700 Subject: [PATCH 19/41] [youtube] Remove unused code --- youtube_dl/extractor/youtube.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c16dc7ab8..0c52e5a8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -65,11 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. From 0db79d8181c1c3ebf74b9b6d38262c8dcaaf0f4f Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Sat, 24 Apr 2021 20:58:03 +0900 Subject: [PATCH 20/41] [tver] Redirect all downloads to Brightcove (#28849) --- youtube_dl/extractor/tver.py | 37 +++++++++++------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index a54f49319..a4a30b1e6 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, - strip_or_none, try_get, ) @@ -45,32 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - title = main['title'] - subtitle = strip_or_none(main.get('subtitle')) - if subtitle: - title += ' - ' + subtitle - info.update({ - 'title': title, - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info From c6ab79299034492c5af9dd29b0c49585e4efc4cd Mon Sep 17 00:00:00 2001 From: catboy <79282513+catboy-oss@users.noreply.github.com> Date: Sat, 24 Apr 2021 12:10:35 +0000 Subject: [PATCH 21/41] [medaltv] Fix extraction (#28807) numeric clip ids are no longer used by medal, and integer user ids are now sent as strings. --- youtube_dl/extractor/medaltv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 1603b55f6..ef2283dea 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,32 +15,32 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } }] From 999329cf6b29878d054c6ccdd24573489a2886d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Apr 2021 23:52:16 +0700 Subject: [PATCH 22/41] [workflows/ci.yml] Fix install nose for Jython --- .github/workflows/ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9dc47a71..97b50aedc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,7 +53,14 @@ jobs: java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: From 57eaaff5cf1ae63d4e3ae89f301c0f9b3e86bb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Apr 2021 22:52:28 +0700 Subject: [PATCH 23/41] [francetvinfo] Improve video id extraction (closes #28792) --- youtube_dl/extractor/francetv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7cc88bf18..e4ec2e200 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "
]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + r'(?:data-id| Date: Sun, 25 Apr 2021 19:32:47 +0200 Subject: [PATCH 24/41] [xfileshare] Add support for wolfstream.tv (#28858) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod From 346dd3b5e87503c52fc6800d9f73cd6bdbce71bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:29:50 +0700 Subject: [PATCH 25/41] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d5c2ebf..c59984d63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version + +Extractors ++ [xfileshare] Add support for wolfstream.tv (#28858) +* [francetvinfo] Improve video id extraction (#28792) +* [medaltv] Fix extraction (#28807) +* [tver] Redirect all downloads to Brightcove (#28849) +* [go] Improve video id extraction (#25207, #25216, #26058) +* [youtube] Fix lazy extractors (#28780) ++ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) +* [cbsnews] Fix extraction for python <3.6 (#23359) + + version 2021.04.17 Core From 273964d190fb048477e71114c4734fcb819c5c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:33:30 +0700 Subject: [PATCH 26/41] release 2021.04.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2a89d5c2..6ece3e031 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 4d7abd775..f923b2d5f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d8dce6fd4..97d605653 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d95ee291a..73a806833 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index ac5dd2f27..ee19a75f5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c59984d63..f15c84225 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.26 Extractors + [xfileshare] Add support for wolfstream.tv (#28858) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a23da1a31..88d474de4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1162,7 +1162,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b041d593..576f721db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.17' +__version__ = '2021.04.26' From 94520568b399d5b4f35a9708f5643d8b16c6c4ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 02:16:47 +0700 Subject: [PATCH 27/41] [workflows/ci.yml] Update link to jython-installer --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97b50aedc..90bd63c32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,7 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose From e33dfb445c547f210a7060e8b7abd592dbe42808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 17:53:27 +0700 Subject: [PATCH 28/41] [tv2dk] Fix extraction (closes #28888) --- youtube_dl/extractor/tv2dk.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) From d2f72c40db0d1fe1102c98c017682b283579ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:09:32 +0700 Subject: [PATCH 29/41] [svtplay] Improve extraction (closes #28507, closes #28876) --- youtube_dl/extractor/svt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index aba9bb447..a5bb6daa7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) - (?:.*?modalId=(?P[\da-zA-Z-]+))? + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ @@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From ff04d43c469e4cf8c14ba3e2e79da0d35ef3c7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:33:05 +0700 Subject: [PATCH 30/41] [xtube] Fix formats extraction (closes #28870) --- youtube_dl/extractor/xtube.py | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 18969058f..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) From d1b9a5e2eff1c075b38815a3d2b25eb8b3f626bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 19:00:39 +0700 Subject: [PATCH 31/41] [twitter] Improve formats extraction from vmap URL (closes #28909) --- youtube_dl/extractor/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ed495f297..cfa7a7326 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f] def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] urls = [] From a0df8a06178e530a1097f177a1faf1d2c609ac99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 22:53:30 +0700 Subject: [PATCH 32/41] [cda] Improve extraction (closes #28709, closes #28937) --- youtube_dl/extractor/cda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -133,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) From 0204838163bd4068fe23b40414573d1307d817ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 01:57:02 +0700 Subject: [PATCH 33/41] [kaltura] Make embed code alternatives actually work --- youtube_dl/extractor/kaltura.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460d..5d0ff0418 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,8 +142,8 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -151,7 +151,7 @@ class KalturaIE(InfoExtractor): [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: From fe05191b8c59538a48b6cbc95f4fe54fc7e6a0ac Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Tue, 4 May 2021 14:14:35 -0500 Subject: [PATCH 34/41] [kaltura] Improve iframe extraction (#28969) Co-authored-by: Sergey M. --- youtube_dl/extractor/gdcvault.py | 15 +++++++++++++++ youtube_dl/extractor/kaltura.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 2f555c1d4..5ad40ee23 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -102,6 +102,21 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, ] def _login(self, webpage_url, display_id): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 5d0ff0418..c731612c4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -145,7 +145,7 @@ class KalturaIE(InfoExtractor): ''', webpage)) or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) From b8645c1f5885522ec8bb77649f49ce842e947c25 Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Sat, 17 Apr 2021 23:15:10 -0500 Subject: [PATCH 35/41] [dispeak] Improve FLV extraction (closes #13513) --- youtube_dl/extractor/dispeak.py | 50 ++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c345e0274..e776ac00c 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -32,6 +32,14 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624 + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, }] def _parse_mp4(self, metadata): @@ -84,26 +92,28 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) + slide_video_path = xpath_text(metadata, './slideVideo') + if slide_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xpath_text(metadata, './speakerVideo') + if speaker_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) return formats def _real_extract(self, url): From 1786cd3fe4e555b83bdd3eea77ade3477293330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:30:42 +0700 Subject: [PATCH 36/41] [dispeak] DRY and update tests (closes #28970) --- youtube_dl/extractor/dispeak.py | 34 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index e776ac00c..276fd4b09 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -33,13 +33,17 @@ class DigitallySpeakingIE(InfoExtractor): 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, }, { - # From https://gdcvault.com/play/1016624 + # From https://gdcvault.com/play/1016624, empty speakerVideo 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', 'info_dict': { 'id': '201210-822101_1349794556671DDDD', 'ext': 'flv', 'title': 'Pre-launch - Preparing to Take the Plunge', }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -92,27 +96,19 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo') - if slide_video_path: + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), + 'play_path': remove_end(video_path, '.flv'), 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo') - if speaker_video_path: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'preference': preference, + 'format_id': format_id, }) return formats From 504e4d804df0ee666d80ba6796017cf97e026c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:44:29 +0700 Subject: [PATCH 37/41] [gdcvault] Add support for HTML5 videos --- youtube_dl/extractor/gdcvault.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 5ad40ee23..acc6478b8 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -117,6 +118,11 @@ class GDCVaultIE(InfoExtractor): 'skip_download': True, }, }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -190,7 +196,18 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex( r'