From d947ffe8e385a541f44c6125b4cbc269de6055a4 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 00:19:48 +0000 Subject: [PATCH 01/35] [IGN] Overhaul extractor to avoid URL redirection loop Consequently/also: * centralise video data extraction * detect 404 and 503 expected errors * handle the test video in IGNVideo * handle two additional page formats for the tests in IGNArticle --- youtube_dl/extractor/ign.py | 347 ++++++++++++++++++++++++++---------- 1 file changed, 252 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 0d9f50ed2..c7daa30e5 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -1,19 +1,29 @@ +# coding: utf-8 + from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( + compat_filter as filter, + compat_HTTPError, compat_parse_qs, - compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( - HEADRequest, determine_ext, + error_to_compat_str, + extract_attributes, + ExtractorError, int_or_none, + merge_dicts, + orderedSet, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise + + def _extract_video_info(self, video, fatal=True): + video_id = video['videoId'] + + formats = [] + refs = traverse_obj(video, 'refs', expected_type=dict) or {} + + m3u8_url = url_or_none(refs.get('m3uUrl')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = url_or_none(refs.get('f4mUrl')) + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + for asset in (video.get('assets') or []): + asset_url = url_or_none(asset.get('url')) + if not asset_url: + continue + formats.append({ + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + + if formats or fatal: + self._sort_formats(formats) + else: + return + + thumbnails = traverse_obj( + video, ('thumbnails', Ellipsis, {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', Ellipsis, 'displayName'), + expected_type=lambda x: x.strip() or None) + + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnails': thumbnails, + 
'formats': formats,
+            'tags': tags,
+        }
+
+    # yt-dlp shim
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for embed_url in orderedSet(
+                cls._extract_embed_urls(url, webpage) or [], lazy=True):
+            yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
+
 
 class IGNIE(IGNBaseIE):
     """
     Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
     Some videos of it.ign.com are also supported
     """
-
-    _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
+    _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
+    _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
+    _VALID_URL = (
+        r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
+        % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
     IE_NAME = 'ign.com'
     _PAGE_TYPE = 'video'
 
@@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE):
             'timestamp': 1370440800,
             'upload_date': '20130605',
             'tags': 'count:9',
-        }
+        },
+        'params': {
+            'nocheckcertificate': True,
+        },
     }, {
         'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
         'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
@@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE):
             'timestamp': 1420571160,
             'upload_date': '20150106',
             'tags': 'count:4',
-        }
+        },
+        'skip': '404 Not Found',
     }, {
         'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
         'only_matching': True,
     }]
 
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        grids = re.findall(
+            r'''(?s)<section[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
+            webpage)
+        return filter(None,
+                      (urljoin(url, m.group('path')) for m in re.finditer(
+                          r'''<a[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
+                          % cls._VIDEO_PATH_RE, grids[0] if grids else '')))
+
     def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        display_id = m.group('id')
+        if display_id:
+            return self._extract_video(url, display_id)
+        display_id = m.group('filt') or 'all'
+        return self._extract_playlist(url, display_id)
+
+    def _extract_playlist(self, url, display_id):
+        webpage = self._download_webpage(url, display_id)
+
+        return self.playlist_result(
+            (self.url_result(u, ie=self.ie_key())
+             for u in self._extract_embed_urls(url, webpage)),
+            playlist_id=display_id)
+
+    def _extract_video(self, url, display_id):
         display_id = self._match_id(url)
-        video = self._call_api(display_id)
-        video_id = video['videoId']
-        metadata = video['metadata']
-        title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
+        video = self._checked_call_api(display_id)
 
-        formats = []
-        refs = video.get('refs') or {}
+        info = self._extract_video_info(video)
 
-        m3u8_url = refs.get('m3uUrl')
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, video_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False))
-
-        f4m_url = refs.get('f4mUrl')
-        if f4m_url:
-            formats.extend(self._extract_f4m_formats(
-                f4m_url, video_id, f4m_id='hds', fatal=False))
-
-        for asset in (video.get('assets') or []):
-            asset_url = asset.get('url')
-            if not asset_url:
-                continue
-            formats.append({
-                'url': asset_url,
-                'tbr': int_or_none(asset.get('bitrate'), 1000),
-                'fps': int_or_none(asset.get('frame_rate')),
-                'height': int_or_none(asset.get('height')),
-                'width': int_or_none(asset.get('width')),
-            })
-
-        mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
-        if mezzanine_url:
-            formats.append({
-                'ext': determine_ext(mezzanine_url, 'mp4'),
-                'format_id': 'mezzanine',
-                'preference': 1,
-                'url': mezzanine_url,
}) - - self._sort_formats(formats) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), + return merge_dicts({ 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } + }, info) -class IGNVideoIE(InfoExtractor): +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = compat_urlparse.urlparse(url) + embed_url = compat_urlparse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.geturl() ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + compat_urlparse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. 
Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', @@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', @@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE): }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + if e.cause.code == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.code == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): - yield self.url_result(video_url) + if article: + # obsolete ? 
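+            # (legacy v3 API payload: yield the lead media URL, then any video
+            # links embedded in the article body)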
+ def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P.+?)]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) + + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None) From cd987e6fca336cf6570b4938442c23cd0bdf7256 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 01:53:47 +0000 Subject: [PATCH 02/35] [jsinterp] Nits --- youtube_dl/jsinterp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 1e7b342ac..60fa2b1b9 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -201,7 +201,7 @@ class JSInterpreter(object): def __init__(self, msg, *args, **kwargs): expr = kwargs.pop('expr', None) if expr is not None: - msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) + msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) class JS_RegExp(object): @@ -699,7 +699,7 @@ class JSInterpreter(object): """ assert, but without risk of getting optimized out """ if not cndn: memb = member - raise self.Exception('{member} {msg}'.format(**locals()), expr=expr) + raise self.Exception('{memb} {msg}'.format(**locals()), expr=expr) def eval_method(): if (variable, member) == ('console', 'debug'): From f2f90887ca7a452dfafa7ca221fe981a4ec56707 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 00:21:35 +0000 Subject: [PATCH 03/35] [Vimeo] Fix `Unable to extract info section` redux * as reported in yt-dlp/yt-dlp#6149 * also allow newline in target JSON object --- 
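Standalone check of the loosened pattern, as a sketch using only the stdlib
re module (the page snippets are invented for illustration):

    import re

    OLD = r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;'
    NEW = r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]'

    pages = [
        'var config = {"video": {"id": 1}};',     # single-line, ';'-terminated
        'var config = {"video":\n {"id": 1}}\n',  # newline inside and after the JSON
    ]
    for page in pages:
        # OLD matches only the first page; NEW matches both
        print([bool(re.search(p, page)) for p in (OLD, NEW)])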
youtube_dl/extractor/vimeo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 853b38402..14f8dd034 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -663,7 +663,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
 
         if '//player.vimeo.com/video/' in url:
             config = self._parse_json(self._search_regex(
-                r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+                r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id)
             if config.get('view') == 4:
                 config = self._verify_player_video_password(
                     redirect_url, video_id, headers)

From e19ec5232216fd801ded88728df5b50bfb05c1cc Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sat, 11 Feb 2023 03:25:14 +0000
Subject: [PATCH 04/35] [Vimeo] Support /user{video_id}/{slug} URL format

---
 youtube_dl/extractor/vimeo.py | 55 +++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 14f8dd034..7f2731d83 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -261,27 +261,33 @@ class VimeoIE(VimeoBaseInfoExtractor):
     # _VALID_URL matches Vimeo URLs
     _VALID_URL = r'''(?x)
-                    https?://
-                    (?:
-                        (?:
-                            www|
-                            player
-                        )
-                        \.
-                    )?
-                    vimeo(?:pro)?\.com/
-                    (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
-                    (?:.*?/)??
-                    (?:
-                        (?:
-                            play_redirect_hls|
-                            moogaloop\.swf)\?clip_id=
-                    )?
-                    (?:videos?/)?
-                    (?P<id>[0-9]+)
-                    (?:/(?P<unlisted_hash>[\da-f]{10}))?
-                    /?(?:[?&].*)?(?:[#].*)?$
-                '''
+                    https?://
+                    (?:
+                        (?:
+                            www|
+                            player
+                        )
+                        \.
+                    )?
+                    vimeo(?:pro)?\.com/
+                    (?:
+                        (?P<u>user)|
+                        (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
+                        (?:.*?/)??
+                        (?P<q>
+                            (?:
+                                play_redirect_hls|
+                                moogaloop\.swf)\?clip_id=
+                        )?
+                        (?:videos?/)?
+                    )
+                    (?P<id>[0-9]+)
+                    (?(u)
+                        /(?!videos|likes)[^/?#]+/?|
+                        (?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
+ ) + (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _TESTS = [ { @@ -539,7 +545,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - } + }, + { + # user playlist alias -> https://vimeo.com/258705797 + 'url': 'https://vimeo.com/user26785108/newspiritualguide', + 'only_matching': True, + }, # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] From 58988c1421b88875a33015b08e4d2ada43021e09 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 31 Jan 2022 04:28:54 +0000 Subject: [PATCH 05/35] [YouTube] Bypass age-gating for certain restricted videos * Use TVHTML5_SIMPLY_EMBEDDED_PLAYER client * Also add and fix tests * Introduce and use new utility function `update_url()` --- youtube_dl/extractor/youtube.py | 202 +++++++++++++++++++++++++------- youtube_dl/utils.py | 11 ++ 2 files changed, 168 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28fdb086a..65428528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -42,6 +42,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + update_url, update_url_query, url_or_none, urlencode_postdata, @@ -286,15 +287,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|= 18): + + self.report_age_confirmation() + + # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 + pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + query = { + 'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, + 'contentCheckOk': True, + 'racyCheckOk': True, + 'context': { + 'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'}, + 'thirdParty': {'embedUrl': 'https://google.com'}, + }, + 'videoId': video_id, + } + headers = { + 'X-YouTube-Client-Name': '85', + 'X-YouTube-Client-Version': '2.0', + 'Origin': 'https://www.youtube.com' + } + + video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) + age_gate_status = get_playability_status(video_info) + if age_gate_status.get('status') == 'OK': + player_response = video_info + playability_status = age_gate_status trailer_video_id = try_get( playability_status, @@ -1932,12 +2048,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for thumbnail in (try_get( container, lambda x: x['thumbnail']['thumbnails'], list) or []): - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ 'height': int_or_none(thumbnail.get('height')), - 'url': thumbnail_url, + 'url': update_url(thumbnail_url, query=None, fragment=None), 'width': int_or_none(thumbnail.get('width')), }) if thumbnails: @@ -2142,6 +2258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: + # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT like_count, dislike_count = sbr_tooltip.split(' / ') info.update({ 'like_count': str_to_int(like_count), @@ -2411,7 +2528,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'tags': list, 'view_count': int, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2438,7 +2554,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 
'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2458,7 +2573,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -3043,8 +3157,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = update_url(url, netloc='www.youtube.com') # Handle both video/playlist URLs qs = parse_qs(url) video_id = qs.get('v', [None])[0] @@ -3178,7 +3291,6 @@ class YoutubeYtBeIE(InfoExtractor): 'categories': ['Nonprofits & Activism'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'noplaylist': True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e3c3ccff9..d5cc6386d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4121,6 +4121,17 @@ def update_url_query(url, query): query=compat_urllib_parse_urlencode(qs, True))) +def update_url(url, **kwargs): + """Replace URL components specified by kwargs + url: compat_str or parsed URL tuple + returns: compat_str""" + if not kwargs: + return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url + if not isinstance(url, tuple): + url = compat_urlparse.urlparse(url) + return compat_urlparse.urlunparse(url._replace(**kwargs)) + + def update_Request(req, url=None, data=None, headers={}, query={}): req_headers = req.headers.copy() req_headers.update(headers) From 30e986b83493f68bd4c2405b5f4d801891c9bdde Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 20 Jun 2022 23:15:20 +0100 Subject: [PATCH 06/35] [YouTube] Add `signatureTimestamp` for age-gate bypass --- youtube_dl/extractor/youtube.py | 34 +++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 65428528d..6c1cfe7f2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1642,6 +1642,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt['url'] = compat_urlparse.urlunparse( parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + # from yt-dlp, with tweaks + def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): + """ + Extract signatureTimestamp (sts) + Required to tell API what sig/player version is in use. + """ + sts = int_or_none(ytcfg.get('STS')) if isinstance(ytcfg, dict) else None + if not sts: + # Attempt to extract from player + if player_url is None: + error_msg = 'Cannot extract signature timestamp without player_url.' 
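+                # honour the caller's fatal flag: either raise or just warn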
+ if fatal: + raise ExtractorError(error_msg) + self._downloader.report_warning(error_msg) + return + code = self._get_player_code(video_id, player_url) + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code or '', + 'JS player signature timestamp', group='sts', fatal=fatal)) + return sts + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, @@ -1766,6 +1787,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None + player_url = None if webpage: player_response = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, @@ -1799,8 +1821,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + + # Use signatureTimestamp if available + # Thanks https://github.com/ytdl-org/youtube-dl/issues/31034#issuecomment-1160718026 + player_url = self._extract_player_url(webpage) + ytcfg = self._extract_ytcfg(video_id, webpage) + sts = self._extract_signature_timestamp(video_id, player_url, ytcfg) + if sts: + pb_context['signatureTimestamp'] = sts + query = { - 'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, + 'playbackContext': {'contentPlaybackContext': pb_context}, 'contentCheckOk': True, 'racyCheckOk': True, 'context': { @@ -1901,7 +1932,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] itag_qualities = {} - player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] From d6b14ba3163b255d0dd8d3b9ddf25d977b8262e7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 23:18:24 +0000 Subject: [PATCH 07/35] [test] Fix TestAgeRestriction * age restriction may cause DownloadError * update obsolete test URLs [skip ci] --- test/test_age_restriction.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 6f5513faa..db98494ab 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -11,6 +11,7 @@ from test.helper import try_rm from youtube_dl import YoutubeDL +from youtube_dl.utils import DownloadError def _download_restricted(url, filename, age): @@ -26,7 +27,10 @@ def _download_restricted(url, filename, age): ydl.add_default_info_extractors() json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) - ydl.download([url]) + try: + ydl.download([url]) + except DownloadError: + try_rm(json_filename) res = os.path.exists(json_filename) try_rm(json_filename) return res @@ -38,12 +42,12 @@ class TestAgeRestriction(unittest.TestCase): self.assertFalse(_download_restricted(url, filename, age)) def test_youtube(self): - self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10) def test_youporn(self): self._assert_restricted( - 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - '505835.mp4', 2, old_age=25) + 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/', + '16715086.mp4', 2, old_age=25) if __name__ == '__main__': From 249f2b631629471af5cfee2993e62de58c8f5990 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Feb 2023 
15:43:43 +0000 Subject: [PATCH 08/35] [compat] Systematise compat_ naming [skip ci] --- test/test_compat.py | 3 +- youtube_dl/compat.py | 221 +++++++++++++++++++++++++++---------------- 2 files changed, 139 insertions(+), 85 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 0986cff37..4dddd9a38 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -48,7 +48,8 @@ class TestCompat(unittest.TestCase): def test_all_present(self): import youtube_dl.compat - all_names = youtube_dl.compat.__all__ + all_names = sorted( + youtube_dl.compat.__all__ + youtube_dl.compat.legacy) present_names = set(filter( lambda c: '_' in c and not c.startswith('_'), dir(youtube_dl.compat))) - set(['unicode_literals']) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 28942a8c1..39551f810 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,10 @@ import subprocess import sys import xml.etree.ElementTree +# naming convention +# 'compat_' + Python3_name.replace('.', '_') +# other aliases exist for convenience and/or legacy + # deal with critical unicode/str things first try: # Python 2 @@ -28,6 +32,7 @@ try: unicode, basestring, unichr ) from .casefold import casefold as compat_casefold + except NameError: compat_str, compat_basestring, compat_chr = ( str, str, chr @@ -53,16 +58,15 @@ try: import urllib.parse as compat_urllib_parse except ImportError: # Python 2 import urllib as compat_urllib_parse + import urlparse as _urlparse + for a in dir(_urlparse): + if not hasattr(compat_urllib_parse, a): + setattr(compat_urllib_parse, a, getattr(_urlparse, a)) + del _urlparse -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse +# unfavoured aliases +compat_urlparse = compat_urllib_parse +compat_urllib_parse_urlparse = compat_urllib_parse.urlparse try: import urllib.response as compat_urllib_response @@ -73,6 +77,7 @@ try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 import cookielib as compat_cookiejar +compat_http_cookiejar = compat_cookiejar if sys.version_info[0] == 2: class compat_cookiejar_Cookie(compat_cookiejar.Cookie): @@ -84,11 +89,13 @@ if sys.version_info[0] == 2: compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) else: compat_cookiejar_Cookie = compat_cookiejar.Cookie +compat_http_cookiejar_Cookie = compat_cookiejar_Cookie try: import http.cookies as compat_cookies except ImportError: # Python 2 import Cookie as compat_cookies +compat_http_cookies = compat_cookies if sys.version_info[0] == 2: class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): @@ -98,6 +105,7 @@ if sys.version_info[0] == 2: return super(compat_cookies_SimpleCookie, self).load(rawdata) else: compat_cookies_SimpleCookie = compat_cookies.SimpleCookie +compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie try: import html.entities as compat_html_entities @@ -2351,16 +2359,19 @@ try: from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +compat_urllib_HTTPError = compat_HTTPError try: from urllib.request import urlretrieve as compat_urlretrieve except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +compat_urllib_request_urlretrieve = compat_urlretrieve try: from html.parser 
import HTMLParser as compat_HTMLParser except ImportError: # Python 2 from HTMLParser import HTMLParser as compat_HTMLParser +compat_html_parser_HTMLParser = compat_HTMLParser try: # Python 2 from HTMLParser import HTMLParseError as compat_HTMLParseError @@ -2374,6 +2385,7 @@ except ImportError: # Python <3.4 # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass +compat_html_parser_HTMLParseError = compat_HTMLParseError try: from subprocess import DEVNULL @@ -2390,6 +2402,8 @@ try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus + from urllib.parse import urlencode as compat_urllib_parse_urlencode + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') else re.compile(r'([\x00-\x7f]+)')) @@ -2456,9 +2470,6 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 # Python 2 will choke in urlencode on mixture of byte and unicode strings. # Possible solutions are to either port it from python 3 with all # the friends or manually ensure input query contains only byte strings. @@ -2480,7 +2491,62 @@ except ImportError: # Python 2 def encode_list(l): return [encode_elem(e) for e in l] - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq) + + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
+ # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, compat_str + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError('bad query field: %r' % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + setattr(compat_urllib_parse, '_urlencode', + getattr(compat_urllib_parse, 'urlencode')) + for name, fix in ( + ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes), + ('parse_unquote', compat_urllib_parse_unquote), + ('unquote_plus', compat_urllib_parse_unquote_plus), + ('urlencode', compat_urllib_parse_urlencode), + ('parse_qs', compat_parse_qs)): + setattr(compat_urllib_parse, name, fix) + +compat_urllib_parse_parse_qs = compat_parse_qs try: from urllib.request import DataHandler as compat_urllib_request_DataHandler @@ -2520,6 +2586,7 @@ try: from xml.etree.ElementTree import ParseError as compat_xml_parse_error except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error etree = xml.etree.ElementTree @@ -2533,10 +2600,11 @@ try: # xml.etree.ElementTree.Element is a method in Python <=2.6 and # the following will crash with: # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types - isinstance(None, xml.etree.ElementTree.Element) + isinstance(None, etree.Element) from xml.etree.ElementTree import Element as compat_etree_Element except TypeError: # Python <=2.6 from xml.etree.ElementTree import _ElementInterface as compat_etree_Element +compat_xml_etree_ElementTree_Element = compat_etree_Element if sys.version_info[0] >= 3: def compat_etree_fromstring(text): @@ -2592,6 +2660,7 @@ else: if k == uri or v == prefix: del etree._namespace_map[k] etree._namespace_map[uri] = prefix +compat_xml_etree_register_namespace = compat_etree_register_namespace if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, @@ -2603,53 +2672,6 @@ if sys.version_info < (2, 7): else: compat_xpath = lambda xpath: xpath -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
- # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - compat_os_name = os._name if os.name == 'java' else os.name @@ -2774,6 +2796,8 @@ else: else: compat_expanduser = os.path.expanduser +compat_os_path_expanduser = compat_expanduser + if compat_os_name == 'nt' and sys.version_info < (3, 8): # os.path.realpath on Windows does not follow symbolic links @@ -2785,6 +2809,8 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8): else: compat_realpath = os.path.realpath +compat_os_path_realpath = compat_realpath + if sys.version_info < (3, 0): def compat_print(s): @@ -2805,11 +2831,15 @@ if sys.version_info < (3, 0) and sys.platform == 'win32': else: compat_getpass = getpass.getpass +compat_getpass_getpass = compat_getpass + + try: compat_input = raw_input except NameError: # Python 3 compat_input = input + # Python < 2.6.5 require kwargs to be bytes try: def _testfunc(x): @@ -2915,15 +2945,16 @@ else: lines = _lines return _terminal_size(columns, lines) + try: itertools.count(start=0, step=1) compat_itertools_count = itertools.count except TypeError: # Python 2.6 def compat_itertools_count(start=0, step=1): - n = start while True: - yield n - n += step + yield start + start += step + if sys.version_info >= (3, 0): from tokenize import tokenize as compat_tokenize_tokenize @@ -3075,6 +3106,8 @@ if sys.version_info < (3, 3): else: compat_b64decode = base64.b64decode +compat_base64_b64decode = compat_b64decode + if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): # PyPy2 prior to version 5.4.0 expects byte strings as Windows function @@ -3094,30 +3127,53 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) -__all__ = [ +legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_Struct', 'compat_b64decode', + 'compat_cookiejar', + 'compat_cookiejar_Cookie', + 'compat_cookies', + 'compat_cookies_SimpleCookie', + 'compat_etree_Element', + 'compat_etree_register_namespace', + 'compat_expanduser', + 'compat_getpass', + 'compat_parse_qs', + 'compat_realpath', + 'compat_urllib_parse_parse_qs', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', 
+ 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', +] + + +__all__ = [ + 'compat_html_parser_HTMLParseError', + 'compat_html_parser_HTMLParser', + 'compat_Struct', + 'compat_base64_b64decode', 'compat_basestring', 'compat_casefold', 'compat_chr', 'compat_collections_abc', 'compat_collections_chain_map', - 'compat_cookiejar', - 'compat_cookiejar_Cookie', - 'compat_cookies', - 'compat_cookies_SimpleCookie', + 'compat_http_cookiejar', + 'compat_http_cookiejar_Cookie', + 'compat_http_cookies', + 'compat_http_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', - 'compat_etree_Element', 'compat_etree_fromstring', - 'compat_etree_register_namespace', - 'compat_expanduser', 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', - 'compat_getpass', + 'compat_getpass_getpass', 'compat_html_entities', 'compat_html_entities_html5', 'compat_http_client', @@ -3131,11 +3187,11 @@ __all__ = [ 'compat_numeric_types', 'compat_ord', 'compat_os_name', - 'compat_parse_qs', + 'compat_os_path_expanduser', + 'compat_os_path_realpath', 'compat_print', 'compat_re_Match', 'compat_re_Pattern', - 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', @@ -3147,17 +3203,14 @@ __all__ = [ 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', + 'compat_urllib_request_urlretrieve', + 'compat_urllib_HTTPError', + 'compat_xml_etree_ElementTree_Element', + 'compat_xml_etree_ElementTree_ParseError', + 'compat_xml_etree_register_namespace', 'compat_xpath', 'compat_zip', 'workaround_optparse_bug9161', From 90c9f789d94fc2c0b4c28c57ba2e0b2f09ef95e3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Feb 2023 13:46:43 +0000 Subject: [PATCH 09/35] [utils] Add parse_qs, update_url [skip ci] --- youtube_dl/utils.py | 64 ++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5cc6386d..4edbfa27b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -42,6 +42,7 @@ from .compat import ( compat_HTMLParser, compat_HTTPError, compat_basestring, + compat_casefold, compat_chr, compat_collections_abc, compat_cookiejar, @@ -54,18 +55,18 @@ from .compat import ( compat_integer_types, compat_kwargs, compat_os_name, - compat_parse_qs, + compat_re_Match, compat_shlex_quote, compat_str, compat_struct_pack, compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_parse_unquote_plus, compat_urllib_request, - compat_urlparse, compat_xpath, ) @@ -80,12 +81,12 @@ def register_socks_protocols(): # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 # URLs with protocols not in urlparse.uses_netloc are not handled correctly for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) + if scheme not in compat_urllib_parse.uses_netloc: + compat_urllib_parse.uses_netloc.append(scheme) -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) +# Unfavoured alias 
+compiled_regex_type = compat_re_Match def random_user_agent(): @@ -2725,7 +2726,7 @@ def make_socks_conn_class(base_class, socks_proxy): assert issubclass(base_class, ( compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) - url_components = compat_urlparse.urlparse(socks_proxy) + url_components = compat_urllib_parse.urlparse(socks_proxy) if url_components.scheme.lower() == 'socks5': socks_type = ProxyType.SOCKS5 elif url_components.scheme.lower() in ('socks', 'socks4'): @@ -3673,7 +3674,7 @@ def remove_quotes(s): def url_basename(url): - path = compat_urlparse.urlparse(url).path + path = compat_urllib_parse.urlparse(url).path return path.strip('/').split('/')[-1] @@ -3693,7 +3694,7 @@ def urljoin(base, path): if not isinstance(base, compat_str) or not re.match( r'^(?:https?:)?//', base): return None - return compat_urlparse.urljoin(base, path) + return compat_urllib_parse.urljoin(base, path) class HEADRequest(compat_urllib_request.Request): @@ -4091,6 +4092,10 @@ def escape_url(url): ).geturl() +def parse_qs(url): + return compat_parse_qs(compat_urllib_parse.urlparse(url).query) + + def read_batch_urls(batch_fd): def fixup(url): if not isinstance(url, compat_str): @@ -4111,25 +4116,28 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') -def update_url_query(url, query): - if not query: - return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) - qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) - - def update_url(url, **kwargs): """Replace URL components specified by kwargs url: compat_str or parsed URL tuple - returns: compat_str""" + if query_update is in kwargs, update query with + its value instead of replacing (overrides any `query`) + returns: compat_str + """ if not kwargs: - return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url + return compat_urllib_parse.urlunparse(url) if isinstance(url, tuple) else url if not isinstance(url, tuple): - url = compat_urlparse.urlparse(url) - return compat_urlparse.urlunparse(url._replace(**kwargs)) + url = compat_urllib_parse.urlparse(url) + query = kwargs.pop('query_update', None) + if query: + qs = compat_parse_qs(url.query) + qs.update(query) + kwargs['query'] = compat_urllib_parse_urlencode(qs, True) + kwargs = compat_kwargs(kwargs) + return compat_urllib_parse.urlunparse(url._replace(**kwargs)) + + +def update_url_query(url, query): + return update_url(url, query_update=query) def update_Request(req, url=None, data=None, headers={}, query={}): @@ -5597,7 +5605,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + if compat_urllib_parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): req.add_header('Ytdl-socks-proxy', proxy) # youtube-dl's http/https handlers do wrapping the socket with socks return None @@ -6035,14 +6043,6 @@ def traverse_obj(obj, *paths, **kwargs): str = compat_str is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) - # stand-in until compat_re_Match is added - compat_re_Match = type(re.match('a', 'a')) - # stand-in until casefold.py is added - try: - ''.casefold() - compat_casefold = lambda s: s.casefold() - except AttributeError: - compat_casefold = lambda s: s.lower() 
casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k if isinstance(expected_type, type): From 4e04f104994c5dac2cb74b64ba7725716ce939d7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Feb 2023 15:50:28 +0000 Subject: [PATCH 10/35] [compat] Update test_compat [skip ci] --- test/test_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 4dddd9a38..e233b1ae1 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -50,9 +50,9 @@ class TestCompat(unittest.TestCase): import youtube_dl.compat all_names = sorted( youtube_dl.compat.__all__ + youtube_dl.compat.legacy) - present_names = set(filter( + present_names = set(map(compat_str, filter( lambda c: '_' in c and not c.startswith('_'), - dir(youtube_dl.compat))) - set(['unicode_literals']) + dir(youtube_dl.compat)))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): From bafb6dec72865cc494feb35ecc94481c30a81069 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Feb 2023 16:19:21 +0000 Subject: [PATCH 11/35] [YouTube] Refresh compat/utils usage * import parse_qs() * import parse_qs in lazy_extractors (clears old TODO) * clean up old compiled lazy_extractors for Py2 * use update_url() --- devscripts/make_lazy_extractors.py | 10 ++++- test/test_execution.py | 12 +++--- youtube_dl/extractor/youtube.py | 61 +++++++++++------------------- 3 files changed, 39 insertions(+), 44 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 878ae72b1..edc19183d 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) lazy_extractors_filename = sys.argv[1] if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) +# Py2: may be confused by leftover lazy_extractors.pyc +try: + os.remove(lazy_extractors_filename + 'c') +except OSError: + pass from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor @@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f: module_contents = [ module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', - 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', + # needed for suitable() methods of Youtube extractor (see #28780) + 'from youtube_dl.utils import parse_qs\n', +] ie_template = ''' class {name}({bases}): diff --git a/test/test_execution.py b/test/test_execution.py index 32948d93e..704e14612 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -40,14 +40,16 @@ class TestExecution(unittest.TestCase): self.assertFalse(stderr) def test_lazy_extractors(self): + lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py' try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) finally: - try: - os.remove('youtube_dl/extractor/lazy_extractors.py') - except (IOError, OSError): - pass + for x in ['', 'c'] if sys.version_info[0] < 3 else ['']: + try: + os.remove(lazy_extractors + x) + except 
(IOError, OSError): + pass if __name__ == '__main__': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c1cfe7f2..6c70a98d1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,12 +14,11 @@ from ..compat import ( compat_chr, compat_HTTPError, compat_map as map, - compat_parse_qs, compat_str, + compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..jsinterp import JSInterpreter from ..utils import ( @@ -33,6 +32,7 @@ from ..utils import ( mimetype2ext, parse_codecs, parse_duration, + parse_qs, qualities, remove_start, smuggle_url, @@ -50,10 +50,6 @@ from ..utils import ( ) -def parse_qs(url): - return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -636,6 +632,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, @@ -671,7 +669,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - 'note': 'Age-gated video embedable only with clientScreen=EMBED', + 'note': 'Age-gated video embeddable only with clientScreen=EMBED', 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', 'info_dict': { 'id': 'Tq92D6wQ1mg', @@ -1392,11 +1390,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('list', [None])[0]: + if parse_qs(url).get('list', [None])[0]: return False return super(YoutubeIE, cls).suitable(url) @@ -1546,7 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( + player_url = compat_urllib_parse.urljoin( 'https://www.youtube.com', player_url) return player_url @@ -1628,9 +1622,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _unthrottle_format_urls(self, video_id, player_url, formats): for fmt in formats: - parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) - qs = compat_urlparse.parse_qs(parsed_fmt_url.query) - n_param = qs.get('n') + parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url']) + n_param = compat_parse_qs(parsed_fmt_url.query).get('n') if not n_param: continue n_param = n_param[-1] @@ -1638,9 +1631,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if n_response is None: # give up if descrambling failed break - qs['n'] = [n_response] - fmt['url'] = compat_urlparse.urlunparse( - parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + fmt['url'] = update_url( + parsed_fmt_url, query_update={'n': [n_response]}) # from yt-dlp, with tweaks def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -1669,20 +1661,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return - parsed_playback_url = 
compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) # cpn generation algorithm is reverse engineered from base.js. # In fact it works even with dummy cpn. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + playback_url = update_url( + playback_url, query_update={ + 'ver': ['2'], + 'cpn': [cpn], + }) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -2075,9 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails = [] for container in (video_details, microformat): - for thumbnail in (try_get( + for thumbnail in try_get( container, - lambda x: x['thumbnail']['thumbnails'], list) or []): + lambda x: x['thumbnail']['thumbnails'], list) or []: thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -3287,11 +3276,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: + if parse_qs(url).get('v', [None])[0]: return False return super(YoutubePlaylistIE, cls).suitable(url) @@ -3430,9 +3415,9 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): }] def _real_extract(self, url): - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - query = (qs.get('search_query') or qs.get('q'))[0] - params = qs.get('sp', ('',))[0] + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[-1] + params = qs.get('sp', ('',))[-1] return self.playlist_result(self._search_results(query, params), query, query) From e8198c517b70301dd5a459927b5d5976304d6482 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 8 Feb 2023 18:16:51 +0000 Subject: [PATCH 12/35] [YouTube] Fix tests --- youtube_dl/extractor/youtube.py | 55 ++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c70a98d1..ba0f5c8b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,8 @@ from ..utils import ( dict_get, error_to_compat_str, float_or_none, + extract_attributes, + get_element_by_attribute, int_or_none, js_to_json, mimetype2ext, @@ -38,6 +40,7 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + traverse_obj, try_get, unescapeHTML, unified_strdate, @@ -656,6 +659,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'duration': 177, 'uploader': 'FlyingKitty', + 'uploader_id': 'FlyingKitty900', 'upload_date': '20200408', 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', 'age_limit': 18, @@ -678,6 +682,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:17eccca93a786d51bc67646756894066', 'duration': 106, 'uploader': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'upload_date': '20191227', 'age_limit': 18, 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -929,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - 
Position Music',
+        'alt_title': 'Dark Walk',
         'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
         'duration': 133,
         'upload_date': '20151119',
         'uploader_id': 'IronSoulElf',
         'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
         'uploader': 'IronSoulElf',
-        'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
-        'track': 'Dark Walk - Position Music',
-        'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
+        'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan',
+        'track': 'Dark Walk',
+        'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan',
         'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
     },
     'params': {
@@ -2091,7 +2096,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             or microformat.get('lengthSeconds')) \
             or parse_duration(search_meta('duration'))
         is_live = video_details.get('isLive')
-        owner_profile_url = microformat.get('ownerProfileUrl')
+
+        def gen_owner_profile_url():
+            yield microformat.get('ownerProfileUrl')
+            yield extract_attributes(self._search_regex(
+                r'''(?s)(<link[^>]+\bitemprop\s*=\s*("|')url\2[^>]*>)''',
+                get_element_by_attribute('itemprop', 'author', webpage),
+                'owner_profile_url', default='')).get('href')
+
+        owner_profile_url = next(
+            (x for x in map(url_or_none, gen_owner_profile_url()) if x),
+            None)
 
         if not player_url:
             player_url = self._extract_player_url(webpage)
@@ -2176,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             info[d_k] = parse_duration(query[k][0])
 
         if video_description:
+            # Youtube Music Auto-generated description
             mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
             if mobj:
                 release_year = mobj.group('release_year')
@@ -2250,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
                     info['location'] = stl
                 else:
-                    mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
+                    # •? doesn't match, but [•]? does; \xa0 = non-breaking space
+                    mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl)
                     if mobj:
                         info.update({
                             'series': mobj.group(1),
@@ -2261,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     vpir,
                     lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
                     list) or []):
-                tbr = tlb.get('toggleButtonRenderer') or {}
+                tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {}
                 for getter, regex in [(
                         lambda x: x['defaultText']['accessibility']['accessibilityData'],
                         r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
@@ -2315,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 elif mrr_title == 'Song':
                     info['track'] = mrr_contents_text
 
+                # this is not extraction but spelunking!
+                carousel_lockups = traverse_obj(
+                    initial_data,
+                    ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer',
+                     'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis,
+                     'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis),
+                    expected_type=dict) or []
+                # try to reproduce logic from metadataRowContainerRenderer above (if it still is)
+                fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license'))
+                # multiple_songs ?
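+                # assumption (rationale not stated in the code): with more than one
+                # lockup in the carousel, per-song ALBUM/ARTIST/SONG rows would be
+                # ambiguous for the video as a whole, so only the trailing
+                # ('LICENSES', 'license') field is kept below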
+                if len(carousel_lockups) > 1:
+                    fields = fields[-1:]
+                for info_row in traverse_obj(
+                        carousel_lockups,
+                        (0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'),
+                        expected_type=dict):
+                    row_title = traverse_obj(info_row, ('title', 'simpleText'))
+                    row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text)
+                    if not row_text:
+                        continue
+                    for name, field in fields:
+                        if name == row_title and not info.get(field):
+                            info[field] = row_text
+
         for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
             v = info.get(s_k)
             if v:

From f33923cba7670ea2e82f233c1f88210eb41f7c3b Mon Sep 17 00:00:00 2001
From: Valentin Metz <31850924+Valentin-Metz@users.noreply.github.com>
Date: Thu, 9 Feb 2023 12:25:28 +0100
Subject: [PATCH 13/35] [rbgtum] Add new extractor (#31305)

* [rbgtum] Add new extractor

* Small update, force CI

---------

Co-authored-by: dirkf
---
 youtube_dl/extractor/extractors.py |  4 ++
 youtube_dl/extractor/rbgtum.py     | 97 ++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 youtube_dl/extractor/rbgtum.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 96b27b179..dfaef0cc3 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1010,6 +1010,10 @@ from .raywenderlich import (
     RayWenderlichIE,
     RayWenderlichCourseIE,
 )
+from .rbgtum import (
+    RbgTumIE,
+    RbgTumCourseIE,
+)
 from .rbmaradio import RBMARadioIE
 from .rds import RDSIE
 from .redbulltv import (
diff --git a/youtube_dl/extractor/rbgtum.py b/youtube_dl/extractor/rbgtum.py
new file mode 100644
index 000000000..da48ebbc4
--- /dev/null
+++ b/youtube_dl/extractor/rbgtum.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RbgTumIE(InfoExtractor):
+    _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
+    _TESTS = [{
+        # Combined view
+        'url': 'https://live.rbg.tum.de/w/cpp/22128',
+        'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
+        'info_dict': {
+            'id': 'cpp/22128',
+            'ext': 'mp4',
+            'title': 'Lecture: October 18. 
2022',
+            'series': 'Concepts of C++ programming (IN2377)',
+        }
+    }, {
+        # Presentation only
+        'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
+        'md5': '36c584272179f3e56b0db5d880639cba',
+        'info_dict': {
+            'id': 'I2DL/12349/PRES',
+            'ext': 'mp4',
+            'title': 'Lecture 3: Introduction to Neural Networks',
+            'series': 'Introduction to Deep Learning (IN2346)',
+        }
+    }, {
+        # Camera only
+        'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
+        'md5': 'e04189d92ff2f56aedf5cede65d37aad',
+        'info_dict': {
+            'id': 'fvv-info/16130/CAM',
+            'ext': 'mp4',
+            'title': 'Fachschaftsvollversammlung',
+            'series': 'Fachschaftsvollversammlung Informatik',
+        }
+    }, ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
+        lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
+        lecture_series_title = self._html_search_regex(
+            r'(?s)<title[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
+
+        formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': lecture_title,
+            'series': lecture_series_title,
+            'formats': formats,
+        }
+
+
+class RbgTumCourseIE(InfoExtractor):
+    _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
+    _TESTS = [{
+        'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
+        'info_dict': {
+            'title': 'Funktionale Programmierung und Verifikation (IN0003)',
+            'id': '2022/S/fpv',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 13,
+    }, {
+        'url': 'https://live.rbg.tum.de/course/2022/W/set',
+        'info_dict': {
+            'title': 'SET FSMPIC',
+            'id': '2022/W/set',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 6,
+    }, ]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+        webpage = self._download_webpage(url, course_id)
+
+        lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
+
+        lecture_urls = []
+        for lecture_url in re.findall(r'(?i)href="/w/(.+)(?
Date: Fri, 10 Feb 2023 04:19:27 +0800
Subject: [PATCH 14/35] [feat]: Add support for external downloader aria2p
 (#31500)

* feat: add class Aria2pFD

* feat: create call_downloader function

* feat: a colorful download interface for Aria2pFD

* feat: rename a variable

* Apply suggestions from code review

Co-authored-by: dirkf

* Typo in suggestion

* fix: remove unused value

* fix: negate the return value (0 means success); use total_seconds() on download.eta (a timedelta object); report a 'waiting' status in the progress hook

* fix: remove unused method ..utils.format_bytes

* fix: conform to flake8

* fix: conform to flake8

* Apply suggestions from code review

* [feat] test external downloader aria2p

* [feat] test external downloader aria2p

* [fix] test_external_downloader.py

* Apply suggestions from code review

Co-authored-by: dirkf

* Apply suggestions from code review

Co-authored-by: dirkf

* Update test/test_external_downloader.py

Co-authored-by: dirkf

* Update test/test_external_downloader.py

Co-authored-by: dirkf

* Update youtube_dl/downloader/external.py

Co-authored-by: dirkf

* refactor code and fix bugs

* Apply suggestions from code review

* Rename test_external_downloader.py to test_downloader_external.py

---------

Co-authored-by: dirkf
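A usage sketch (illustrative only -- the flags and module come from the
docstring and tests below; the URL is a placeholder):

    $ pip install aria2p
    $ aria2c --enable-rpc &
    $ youtube-dl --external-downloader aria2p 'https://example.com/video'

If the aria2p module cannot be imported, Aria2pFD.available() returns False
and the native downloader is used instead.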
---
 test/helper.py                    |  11 +++
 test/test_downloader_external.py  | 115 ++++++++++++++++++++++++++++++
 test/test_downloader_http.py      |  17 ++---
 test/test_http.py                 |  16 ++---
 youtube_dl/downloader/external.py |  58 +++++++++++++++
 5 files changed, 193 insertions(+), 24 deletions(-)
 create mode 100644 test/test_downloader_external.py

diff --git a/test/helper.py b/test/helper.py
index c6a2f0667..883b2e877 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -89,6 +89,17 @@ class FakeYDL(YoutubeDL):
         self.report_warning = types.MethodType(report_warning, self)
 
 
+class FakeLogger(object):
+    def debug(self, msg):
+        pass
+
+    def warning(self, msg):
+        pass
+
+    def error(self, msg):
+        pass
+
+
 def gettestcases(include_onlymatching=False):
     for ie in youtube_dl.extractor.gen_extractors():
         for tc in ie.get_testcases(include_onlymatching):
diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py
new file mode 100644
index 000000000..c0239502b
--- /dev/null
+++ b/test/test_downloader_external.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import re
+import sys
+import subprocess
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import (
+    FakeLogger,
+    http_server_port,
+    try_rm,
+)
+from youtube_dl import YoutubeDL
+from youtube_dl.compat import compat_http_server
+from youtube_dl.utils import encodeFilename
+from youtube_dl.downloader.external import Aria2pFD
+import threading
+
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+TEST_SIZE = 10 * 1024
+
+
+class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+    def log_message(self, format, *args):
+        pass
+
+    def send_content_range(self, total=None):
+        range_header = self.headers.get('Range')
+        start = end = None
+        if range_header:
+            mobj = re.match(r'bytes=(\d+)-(\d+)', range_header)
+            if mobj:
+                start, end = (int(mobj.group(i)) for i in (1, 2))
+        valid_range = start is not None and end is not None
+        if valid_range:
+            content_range = 'bytes %d-%d' % (start, end)
+            if total:
+                content_range += '/%d' % total
+            self.send_header('Content-Range', content_range)
+        return (end - start + 1) if valid_range else total
+
+    def serve(self, range=True, content_length=True):
+        self.send_response(200)
+        self.send_header('Content-Type', 'video/mp4')
+        size = TEST_SIZE
+        if range:
+            size = self.send_content_range(TEST_SIZE)
+        if content_length:
+            self.send_header('Content-Length', size)
+        self.end_headers()
+        self.wfile.write(b'#' * size)
+
+    def do_GET(self):
+        if self.path == '/regular':
+            self.serve()
+        elif self.path == '/no-content-length':
+            self.serve(content_length=False)
+        elif self.path == '/no-range':
+            self.serve(range=False)
+        elif self.path == '/no-range-no-content-length':
+            self.serve(range=False, content_length=False)
+        else:
+            assert False, 'unrecognised server path'
+
+
+@unittest.skipUnless(Aria2pFD.available(), 'aria2p module not found')
+class TestAria2pFD(unittest.TestCase):
+    def setUp(self):
+        self.httpd = compat_http_server.HTTPServer(
+            ('127.0.0.1', 0), HTTPTestRequestHandler)
+        self.port = http_server_port(self.httpd)
+        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
+        self.server_thread.daemon = True
+        self.server_thread.start()
+
+    def download(self, params, ep):
+        with subprocess.Popen(
+            ['aria2c', '--enable-rpc'],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL
+        ) as process:
+            if not process.poll():
+                filename = 'testfile.mp4'
+                params['logger'] = FakeLogger()
+                params['outtmpl'] = filename
+                ydl = YoutubeDL(params)
+                try_rm(encodeFilename(filename))
+                self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0)
+                self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE)
+                try_rm(encodeFilename(filename))
+            process.kill()
+
+    def download_all(self, params):
+        for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'):
+            self.download(params, ep)
+
+    def test_regular(self):
+        self.download_all({'external_downloader': 'aria2p'})
+
+    def test_chunked(self):
+        self.download_all({
+            'external_downloader': 'aria2p',
+            'http_chunk_size': 1000,
+        })
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py
index 750472281..4e6d7a2a0 100644
--- a/test/test_downloader_http.py
+++ b/test/test_downloader_http.py
@@ -9,7 +9,11 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import http_server_port, try_rm
+from test.helper import (
+    FakeLogger,
+    http_server_port,
+    try_rm,
+)
 from youtube_dl import YoutubeDL
 from youtube_dl.compat import compat_http_server
 from youtube_dl.downloader.http import HttpFD
@@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
         assert False
 
 
-class FakeLogger(object):
-    def debug(self, msg):
-        pass
-
-    def warning(self, msg):
-        pass
-
-    def error(self, msg):
-        pass
-
-
 class TestHttpFD(unittest.TestCase):
     def setUp(self):
         self.httpd = compat_http_server.HTTPServer(
diff --git a/test/test_http.py b/test/test_http.py
index 3ee0a5dda..487a9bc77 100644
--- a/test/test_http.py
+++ b/test/test_http.py
@@ -8,7 +8,10 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import http_server_port
+from test.helper import (
+    FakeLogger,
+    http_server_port,
+)
 from youtube_dl import YoutubeDL
 from youtube_dl.compat import compat_http_server, compat_urllib_request
 import ssl
@@ -52,17 +55,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
         assert False
 
 
-class FakeLogger(object):
-    def debug(self, msg):
-        pass
-
-    def 
warning(self, msg):
-        pass
-
-    def error(self, msg):
-        pass
-
-
 class TestHTTP(unittest.TestCase):
     def setUp(self):
         self.httpd = compat_http_server.HTTPServer(
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index a06ab2e50..bffcd10b6 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -200,6 +200,64 @@ class Aria2cFD(ExternalFD):
         return cmd
 
 
+class Aria2pFD(ExternalFD):
+    ''' Aria2pFD class
+    This class supports using aria2p as the downloader.
+    (aria2p is a command-line tool and Python library for interacting with an
+    aria2c daemon process over JSON-RPC.)
+    It makes download progress easier to report.
+    To use aria2p as the downloader, install both aria2c and aria2p (aria2p can
+    be installed with pip), then run aria2c in the background with the
+    --enable-rpc option.
+    '''
+    try:
+        import aria2p
+        __avail = True
+    except ImportError:
+        __avail = False
+
+    @classmethod
+    def available(cls):
+        return cls.__avail
+
+    def _call_downloader(self, tmpfilename, info_dict):
+        aria2 = self.aria2p.API(
+            self.aria2p.Client(
+                host='http://localhost',
+                port=6800,
+                secret=''
+            )
+        )
+
+        options = {
+            'min-split-size': '1M',
+            'max-connection-per-server': 4,
+            'auto-file-renaming': 'false',
+        }
+        options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.')
+        options['out'] = os.path.basename(tmpfilename)
+        options['header'] = []
+        for key, val in info_dict['http_headers'].items():
+            options['header'].append('{0}: {1}'.format(key, val))
+        download = aria2.add_uris([info_dict['url']], options)
+        status = {
+            'status': 'downloading',
+            'tmpfilename': tmpfilename,
+        }
+        started = time.time()
+        while download.status in ['active', 'waiting']:
+            download = aria2.get_download(download.gid)
+            status.update({
+                'downloaded_bytes': download.completed_length,
+                'total_bytes': download.total_length,
+                'elapsed': time.time() - started,
+                'eta': download.eta.total_seconds(),
+                'speed': download.download_speed,
+            })
+            self._hook_progress(status)
+            time.sleep(.5)
+        # non-zero (True) signals failure to the caller, matching external
+        # downloaders that return a process exit code
+        return download.status != 'complete'
+
+
 class HttpieFD(ExternalFD):
     @classmethod
     def available(cls):

From 822f19f05d0ab1a4a945a85f691f2079f7cb3bbb Mon Sep 17 00:00:00 2001
From: fonkap
Date: Sat, 11 Feb 2023 03:37:45 +0100
Subject: [PATCH 15/35] [FileMoonIE] Add extractor for filemoon.sx (#31515)

---------

Co-authored-by: dirkf
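The page wraps its JWPlayer setup in a P.A.C.K.E.R.-packed script, which the
extractor below reverses with the existing youtube_dl.utils.decode_packed_codes
helper. A rough standalone sketch of that step (the webpage value is a
placeholder, not a real server response):

    import re
    from youtube_dl.utils import decode_packed_codes

    webpage = '<script>eval(function(p,a,c,k,e,d){...}(...))</script>'  # placeholder
    packed = re.findall(r'(?s)<script[^>]*>(eval.*?)</script>', webpage)[-1]
    # decoding reveals the player.setup({sources: [...]}) call parsed below
    unpacked = decode_packed_codes(packed)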
---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/filemoon.py   | 43 ++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 youtube_dl/extractor/filemoon.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index dfaef0cc3..f63a2e030 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -376,6 +376,7 @@ from .fc2 import (
     FC2EmbedIE,
 )
 from .fczenit import FczenitIE
+from .filemoon import FileMoonIE
 from .fifa import FifaIE
 from .filmon import (
     FilmOnIE,
diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py
new file mode 100644
index 000000000..654df9b69
--- /dev/null
+++ b/youtube_dl/extractor/filemoon.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    js_to_json,
+)
+
+
+class FileMoonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P<id>\w+)'
+    _TEST = {
+        'url': 'https://filemoon.sx/e/dw40rxrzruqz',
+        'md5': '5a713742f57ac4aef29b74733e8dda01',
+        'info_dict': {
+            'id': 'dw40rxrzruqz',
+            'title': 'dw40rxrzruqz',
+            'ext': 'mp4'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        matches = re.findall(r'(?s)<script[^>]*>(eval.*?)</script>', webpage)
+        packed = matches[-1]
+        unpacked = decode_packed_codes(packed)
+        jwplayer_sources = self._parse_json(
+            self._search_regex(
+                r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'),
+            video_id, transform_source=js_to_json)
+
+        formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)
+
+        return {
+            'id': video_id,
+            'title': self._generic_title(url) or video_id,
+            'formats': formats
+        }

From de48105dd870e353af468bfb8d49b14d9894e649 Mon Sep 17 00:00:00 2001
From: fonkap
Date: Sat, 11 Feb 2023 03:47:43 +0100
Subject: [PATCH 16/35] [KommunetvIE] Add extractor for kommunetv.no (#31516)

* Add extractor for kommunetv.no

* Using utils.update_url instead of regex

---------

Co-authored-by: dirkf
---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/kommunetv.py  | 35 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 youtube_dl/extractor/kommunetv.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index f63a2e030..d8428f46f 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -557,6 +557,7 @@ from .khanacademy import (
 from .kickstarter import KickStarterIE
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
+from .kommunetv import KommunetvIE
 from .konserthusetplay import KonserthusetPlayIE
 from .krasview import KrasViewIE
 from .kth import KTHIE
diff --git a/youtube_dl/extractor/kommunetv.py b/youtube_dl/extractor/kommunetv.py
new file mode 100644
index 000000000..91d06a74f
--- /dev/null
+++ b/youtube_dl/extractor/kommunetv.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import update_url
+
+
+class KommunetvIE(InfoExtractor):
+    _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)'
+    _TEST = {
+        'url': 'https://oslo.kommunetv.no/archive/921',
+        'md5': '5f102be308ee759be1e12b63d5da4bbc',
+        'info_dict': {
+            'id': '921',
+            'title': 'Bystyremøte',
+            'ext': 'mp4'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        headers = {
+            'Accept': 'application/json'
+        }
+        data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers)
+        title = data['stream']['title']
+        file = data['playlist'][0]['playlist'][0]['file']
+        url = update_url(file, query=None, fragment=None)
+        formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title
+        }

From 6f8c2635a573c84ef66c02f73b4aeff1cc36ae4e Mon Sep 17 00:00:00 2001
From: fonkap
Date: Sat, 11 Feb 2023 03:54:45 +0100
Subject: [PATCH 17/35] [StreamsbIE] Add extractor for streamsb.com (viewsb.com)
 (#31517)

* Add extractor for streamsb.com (viewsb.com)

* make data url using app.js version

---------

Co-authored-by: dirkf
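The "data url" mentioned above is a hex-encoded token of the form
random||<video code>||random||streamsb, requested from /sources<app.js version>/.
A rough sketch with fixed stand-in values (real requests use twelve random
alphanumerics and the site's current app version, defaulting to 50):

    import binascii

    req = '||'.join(('aaaabbbbcccc', 'dxfvlu4qanjx', 'ddddeeeeffff', 'streamsb'))
    token = binascii.hexlify(req.encode('utf-8')).decode('ascii')
    data_url = 'https://viewsb.com/sources50/' + token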
---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/streamsb.py   | 61 ++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 youtube_dl/extractor/streamsb.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d8428f46f..3a87f9e33 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1206,6 +1206,7 @@ from .storyfire import (
 from .streamable import StreamableIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
+from .streamsb import StreamsbIE
 from .streetvoice import StreetVoiceIE
 from .stretchinternet import StretchInternetIE
 from .stv import STVPlayerIE
diff --git a/youtube_dl/extractor/streamsb.py b/youtube_dl/extractor/streamsb.py
new file mode 100644
index 000000000..bffcb3de1
--- /dev/null
+++ b/youtube_dl/extractor/streamsb.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import binascii
+import random
+import re
+import string
+
+from .common import InfoExtractor
+from ..utils import urljoin, url_basename
+
+
+def to_ascii_hex(str1):
+    return binascii.hexlify(str1.encode('utf-8')).decode('ascii')
+
+
+def generate_random_string(length):
+    return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length))
+
+
+class StreamsbIE(InfoExtractor):
+    _DOMAINS = ('viewsb.com', )
+    _VALID_URL = r'https://(?P<domain>%s)/(?P<id>.+)' % '|'.join(_DOMAINS)
+    _TEST = {
+        'url': 'https://viewsb.com/dxfvlu4qanjx',
+        'md5': '488d111a63415369bf90ea83adc8a325',
+        'info_dict': {
+            'id': 'dxfvlu4qanjx',
+            'ext': 'mp4',
+            'title': 'Sintel'
+        }
+    }
+
+    def _real_extract(self, url):
+        domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id')
+        webpage = self._download_webpage(url, video_id)
+
+        iframe_rel_url = self._search_regex(r'''(?i)<iframe\b[^>]+\bsrc\s*=\s*('|")(?P<path>/.*\.html)\1''', webpage, 'iframe', group='path')
+        iframe_url = urljoin('https://' + domain, iframe_rel_url)
+
+        iframe_data = self._download_webpage(iframe_url, video_id)
+        app_version = self._search_regex(r'''<script[^>]+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50'
+
+        video_code = url_basename(iframe_url).rsplit('.')[0]
+
+        length = 12
+        req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb'))
+        ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req))
+
+        video_data = self._download_webpage(ereq, video_id, headers={
+            'Referer': iframe_url,
+            'watchsb': 'sbstream',
+        })
+        player_data = self._parse_json(video_data, video_id)
+        title = player_data['stream_data']['title']
+        formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+        }

From 42b098dd79e91295376ca98f394876555481a3eb Mon Sep 17 00:00:00 2001
From: dirkf
Date: Tue, 14 Feb 2023 02:47:09 +0000
Subject: [PATCH 18/35] [InfoExtractor] Handle unquoted values in OpenGraph
 searches

---
 test/test_InfoExtractor.py     | 2 ++
 youtube_dl/extractor/common.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index dd69a681b..4db5c93f1 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -62,6 +62,7 @@ class TestInfoExtractor(unittest.TestCase):
+            <meta name=og:test4 content=unquoted-value/>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@@ -74,6 +75,7 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
         self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
+        self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value')
 
     def test_html_search_meta(self):
         ie = self.ie
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index a0a796d7b..7244e5df6 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1087,7 +1087,7 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                        % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'

From dd9aa74beefc179f943051c4e19eecad87ab1124 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Tue, 14 Feb 2023 16:33:01 +0000
Subject: [PATCH 19/35] [test] Avoid name TestIE which causes a pytest warning

See: https://github.com/yt-dlp/yt-dlp/commit/060ac76257a8c1f7370a8a571821c1d73377701f
---
 test/test_InfoExtractor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 4db5c93f1..6d25441db 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -35,13 +35,13 @@ class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler)
         assert False
 
 
-class TestIE(InfoExtractor):
+class DummyIE(InfoExtractor):
     pass
 
 
 class TestInfoExtractor(unittest.TestCase):
     def setUp(self):
-        self.ie = TestIE(FakeYDL())
+        self.ie = DummyIE(FakeYDL())
 
     def test_ie_key(self):
         self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)

From 2dd6c6edd8e0fc5e45865b8e6d865e35147de772 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Fri, 17 Feb 2023 11:16:54 +0000
Subject: [PATCH 20/35] [YouTube] Avoid crash if uploader_id extraction fails

See #31530.
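The owner profile URL is no longer guaranteed to contain /channel/ or /user/:
pages increasingly return handle-style URLs that the pattern cannot match,
e.g. (illustrative):

    import re
    re.search(r'/(?:channel|user)/([^/?&#]+)',
              'https://www.youtube.com/@PhilippHagemeister')  # -> None

With fatal=False the extractor now records uploader_id as None (with a
warning) instead of aborting the whole extraction.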
---
 youtube_dl/extractor/youtube.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index ba0f5c8b6..66b0257df 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -2122,7 +2122,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 microformat.get('uploadDate')
                 or search_meta('uploadDate')),
             'uploader': video_details['author'],
-            'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
+            'uploader_id': self._search_regex(
+                r'/(?:channel|user)/([^/?&#]+)', owner_profile_url,
+                'uploader id', fatal=False) if owner_profile_url else None,
             'uploader_url': owner_profile_url,
             'channel_id': channel_id,
             'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,

From 57802e632f5a741df6fd9b30a455c32632944489 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 19 Feb 2023 13:47:49 +0000
Subject: [PATCH 21/35] [jsinterp] Fix dict comprehension for Py2.6

Resolves #31600
---
 youtube_dl/jsinterp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 60fa2b1b9..a3bc42a61 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -262,7 +262,7 @@ class JSInterpreter(object):
         if not expr:
             return
         # collections.Counter() is ~10% slower in both 2.7 and 3.9
-        counters = {k: 0 for k in _MATCHING_PARENS.values()}
+        counters = dict((k, 0) for k in _MATCHING_PARENS.values())
         start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
         in_quote, escaping, skipping = None, False, 0
         after_op, in_regex_char_group, skip_re = True, False, 0

From 6067451e432fb65d487a8a67bb5cff52efb9ccf4 Mon Sep 17 00:00:00 2001
From: df
Date: Mon, 20 Feb 2023 01:41:46 +0000
Subject: [PATCH 22/35] [Vimeo] Fix e19ec52 for tween-age Pythons

* a pattern check in older Pythons (2.7 and earlier, and the 3.3/3.4 series)
  raised "sre_constants.error: nothing to repeat"
* satisfy the check by avoiding nested quantifiers that can match the empty
  string

Resolves #31597
---
 youtube_dl/extractor/vimeo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 7f2731d83..8e1a805f6 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -286,7 +286,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                         /(?!videos|likes)[^/?#]+/?|
                         (?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
                     )
-                    (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$
+                    (?:(?(q)[&]|(?(u)|/?)[?]).+?)?(?:[#].*)?$
                     '''
     IE_NAME = 'vimeo'
     _TESTS = [

From 1d3751c3fe50b203d3e2bff71d866c8c500f8288 Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Tue, 1 Jun 2021 18:05:41 +0530
Subject: [PATCH 23/35] Escape URLs in `sanitized_Request`, not `sanitize_url`

d2558234cf5dd12d6896eed5427b7dcdb3ab7b5a added escaping of URLs while
sanitizing. However, `sanitize_url` may not always receive an actual URL.

E.g. when using `youtube-dl "search query" --default-search ytsearch`,
`search query` gets escaped to `search%20query` before being prefixed with
`ytsearch:`, which is not the intended behavior. So the escaping is moved to
`sanitized_Request` instead.
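An illustration of the intended split (interactive session; the URLs are
examples):

    >>> from youtube_dl.utils import sanitize_url, sanitized_Request
    >>> sanitize_url('foo bar')  # no longer escaped, per the new test below
    'foo bar'
    >>> sanitized_Request('http://example.com/foo bar').get_full_url()
    'http://example.com/foo%20bar'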
--- test/test_utils.py | 1 + youtube_dl/extractor/generic.py | 19 +++++++++++++++++++ youtube_dl/utils.py | 4 ++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9d364c863..ea2b96ed2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -250,6 +250,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar') self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') + self.assertEqual(sanitize_url('foo bar'), 'foo bar') def test_expand_path(self): def env(var): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0e473e952..b01900afa 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2320,6 +2320,25 @@ class GenericIE(InfoExtractor): 'height': 720, 'age_limit': 18, }, + }, { + # would like to use the yt-dl test video but searching for + # '"\'/\\ä↭𝕐' fails, so using an old vid from YouTube Korea + 'note': 'Test default search', + 'url': 'Shorts로 허락 필요없이 놀자! (BTS편)', + 'info_dict': { + 'id': 'usDGO4Zb-dc', + 'ext': 'mp4', + 'title': 'YouTube Shorts로 허락 필요없이 놀자! (BTS편)', + 'description': 'md5:96e31607eba81ab441567b5e289f4716', + 'upload_date': '20211107', + 'uploader': 'YouTube Korea', + 'location': '대한민국', + }, + 'params': { + 'default_search': 'ytsearch', + 'skip_download': True, + }, + 'expected_warnings': ['uploader id'], }, ] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4edbfa27b..761edcd49 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2176,11 +2176,11 @@ def sanitize_url(url): for mistake, fixup in COMMON_TYPOS: if re.match(mistake, url): return re.sub(mistake, fixup, url) - return escape_url(url) + return url def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) + return compat_urllib_request.Request(escape_url(sanitize_url(url)), *args, **kwargs) def expand_path(s): From e67e52a8f8fd7e76253e416da76570af8da200d0 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 24 Feb 2023 02:32:40 +0000 Subject: [PATCH 24/35] [test] Support test-case with volatile ID (eg live show) Signalled by regexp ID value, eg: `'id': r're:[\da-zA-Z_-]{8,}'` --- test/test_download.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_download.py b/test/test_download.py index 19936969f..d50008307 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -148,6 +148,7 @@ def generator(test_case, tname): try_rm(tc_filename) try_rm(tc_filename + '.part') try_rm(os.path.splitext(tc_filename)[0] + '.info.json') + try_rm_tcs_files() try: try_num = 1 @@ -213,7 +214,15 @@ def generator(test_case, tname): # First, check test cases' data against extracted data alone expect_info_dict(self, tc_res_dict, tc.get('info_dict', {})) # Now, check downloaded file consistency + # support test-case with volatile ID, signalled by regexp value + if tc.get('info_dict', {}).get('id', '').startswith('re:'): + test_id = tc['info_dict']['id'] + tc['info_dict']['id'] = tc_res_dict['id'] + else: + test_id = None tc_filename = get_tc_filename(tc) + if test_id: + tc['info_dict']['id'] = test_id if not test_case.get('params', {}).get('skip_download', False): self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) self.assertTrue(tc_filename in finished_hook_called) From f7ce98a21e15cb094c772e9082796d009c61578b Mon Sep 17 00:00:00 2001 From: 
dirkf
Date: Fri, 24 Feb 2023 02:48:37 +0000
Subject: [PATCH 25/35] [YouTube] Support @owner format in uploader_id etc

* implement https://github.com/ytdl-org/youtube-dl/issues/31530#issuecomment-1435734719
* update affected tests
* misc clean-ups

---
 youtube_dl/extractor/youtube.py | 319 +++++++++++++++++++-------------
 1 file changed, 194 insertions(+), 125 deletions(-)

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 66b0257df..4246d84f9 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -31,6 +31,7 @@ from ..utils import (
     get_element_by_attribute,
     int_or_none,
     js_to_json,
+    merge_dicts,
     mimetype2ext,
     parse_codecs,
     parse_duration,
@@ -400,6 +401,62 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 break
             data['continuation'] = token
 
+    @staticmethod
+    def _owner_endpoints_path():
+        return [
+            Ellipsis,
+            lambda k, _: k.endswith('SecondaryInfoRenderer'),
+            ('owner', 'videoOwner'), 'videoOwnerRenderer', 'title',
+            'runs', Ellipsis]
+
+    def _extract_channel_id(self, webpage, videodetails={}, metadata={}, renderers=[]):
+        channel_id = None
+        if any((videodetails, metadata, renderers)):
+            channel_id = (
+                traverse_obj(videodetails, 'channelId')
+                or traverse_obj(metadata, 'externalChannelId', 'externalId')
+                or traverse_obj(renderers,
+                                self._owner_endpoints_path() + [
+                                    'navigationEndpoint', 'browseEndpoint', 'browseId'],
+                                get_all=False)
+            )
+        return channel_id or self._html_search_meta(
+            'channelId', webpage, 'channel id', default=None)
+
+    def _extract_author_var(self, webpage, var_name,
+                            videodetails={}, metadata={}, renderers=[]):
+        result = None
+        paths = {
+            # (HTML, videodetails, metadata, renderers)
+            'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']),
+            'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl',
+                    ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl'])
+        }
+        if any((videodetails, metadata, renderers)):
+            result = (
+                traverse_obj(videodetails, paths[var_name][1], get_all=False)
+                or traverse_obj(metadata, paths[var_name][2], get_all=False)
+                or traverse_obj(renderers,
+                                self._owner_endpoints_path() + paths[var_name][3],
+                                get_all=False)
+            )
+        return result or traverse_obj(
+            extract_attributes(self._search_regex(
+                r'''(?s)(<link[^>]+\bitemprop\s*=\s*("|')%s\2[^>]*>)'''
+                % re.escape(var_name),
+                get_element_by_attribute('itemprop', 'author', webpage) or '',
+                'author link', default='')),
+            paths[var_name][0])
+
+    @staticmethod
+    def _yt_urljoin(url_or_path):
+        return urljoin('https://www.youtube.com', url_or_path)
+
+    def _extract_uploader_id(self, uploader_url):
+        return self._search_regex(
+            r'/(?:(?:channel|user)/|(?=@))([^/?&#]+)', uploader_url or '',
+            'uploader id', default=None)
+
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
@@ -516,8 +573,8 @@
             'ext': 'mp4',
             'title': 'youtube-dl test video "\'/\\ä↭𝕐',
             'uploader': 'Philipp Hagemeister',
-            'uploader_id': 'phihag',
-            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+            'uploader_id': '@PhilippHagemeister',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/@PhilippHagemeister',
             'channel': 'Philipp Hagemeister',
             'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
             'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
@@ -557,8 +614,8 @@
             'ext': 'mp4',
             'title': 'youtube-dl test video "\'/\\ä↭𝕐',
             'uploader': 'Philipp Hagemeister',
-            'uploader_id': 'phihag',
-            'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'uploader_id': '@PhilippHagemeister', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@PhilippHagemeister', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], @@ -588,7 +645,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, - 'skip': 'format 141 not served anymore', + 'skip': 'format 141 not served any more', }, # DASH manifest with encrypted signature { @@ -600,7 +657,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', + 'uploader_id': '@AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, }, @@ -618,8 +675,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 219, 'upload_date': '20100909', 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'uploader_id': '@theamazingatheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -635,8 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'uploader_id': '@thewitcher', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@thewitcher', 'upload_date': '20140605', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, @@ -659,7 +716,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'duration': 177, 'uploader': 'FlyingKitty', - 'uploader_id': 'FlyingKitty900', + 'uploader_id': '@FlyingKitty900', 'upload_date': '20200408', 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', 'age_limit': 18, @@ -682,7 +739,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:17eccca93a786d51bc67646756894066', 'duration': 106, 'uploader': 'Projekt Melody', - 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'uploader_id': '@ProjektMelody', 'upload_date': '20191227', 'age_limit': 18, 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -704,10 +761,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', 'description': 'Fan Video. 
Music & Lyrics by OOMPH!.', 'duration': 210, - 'uploader': 'Herr Lurik', - 'uploader_id': 'st3in234', 'upload_date': '20130730', - 'uploader_url': 'http://www.youtube.com/user/st3in234', + 'uploader': 'Herr Lurik', + 'uploader_id': '@HerrLurik', + 'uploader_url': 'http://www.youtube.com/@HerrLurik', 'age_limit': 0, 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg', 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'], @@ -740,8 +797,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 266, 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'uploader_id': '@deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@deadmau5', 'creator': 'deadmau5', 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', 'uploader': 'deadmau5', @@ -762,8 +819,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s)(?:.+\s)?HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*', 'duration': 6085, 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'uploader_id': '@Olympics', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@Olympics', 'uploader': r're:Olympics?', 'age_limit': 0, 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg', @@ -785,8 +842,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'duration': 85, 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'uploader_id': '@AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', @@ -824,7 +881,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'dorappi2000', 'formats': 'mincount:31', }, - 'skip': 'not actual anymore', + 'skip': 'not actual any more', }, # DASH manifest with segment_list { @@ -905,6 +962,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Not multifeed any more', }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -914,7 +972,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', }, 'playlist_count': 2, - 'skip': 'Not multifeed anymore', + 'skip': 'Not multifeed any more', }, { 'url': 'https://vid.plus/FlRa-iH7PGw', @@ -938,8 +996,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', + 'uploader_id': '@IronSoulElf', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@IronSoulElf', 'uploader': 'IronSoulElf', 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', 'track': 'Dark Walk', @@ -987,8 +1045,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:a677553cf0840649b731a3024aeff4cc', 'duration': 721, 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader_id': '@BKCHarvard', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@BKCHarvard', 'uploader': 'The Berkman Klein Center for 
Internet & Society',
             'license': 'Creative Commons Attribution license (reuse allowed)',
         },
@@ -1007,8 +1065,8 @@
             'duration': 4060,
             'upload_date': '20151119',
             'uploader': 'Bernie Sanders',
-            'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
-            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+            'uploader_id': '@BernieSanders',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/@BernieSanders',
             'license': 'Creative Commons Attribution license (reuse allowed)',
         },
         'params': {
@@ -1054,8 +1112,8 @@
             'duration': 2085,
             'upload_date': '20170118',
             'uploader': 'Vsauce',
-            'uploader_id': 'Vsauce',
-            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
+            'uploader_id': '@Vsauce',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/@Vsauce',
             'series': 'Mind Field',
             'season_number': 1,
             'episode_number': 1,
@@ -1134,7 +1192,7 @@
             'skip_download': True,
             'youtube_include_dash_manifest': False,
         },
-        'skip': 'not actual anymore',
+        'skip': 'not actual any more',
     },
     {
         # Youtube Music Auto-generated description
@@ -1191,8 +1249,8 @@
             'title': 'IMG 3456',
             'description': '',
             'upload_date': '20170613',
-            'uploader_id': 'ElevageOrVert',
             'uploader': 'ElevageOrVert',
+            'uploader_id': '@ElevageOrVert',
         },
         'params': {
             'skip_download': True,
@@ -1210,8 +1268,8 @@
             'title': 'Part 77   Sort a list of simple types in c#',
             'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
             'upload_date': '20130831',
-            'uploader_id': 'kudvenkat',
             'uploader': 'kudvenkat',
+            'uploader_id': '@Csharp-video-tutorialsBlogspot',
         },
         'params': {
             'skip_download': True,
@@ -1263,8 +1321,8 @@
             'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
             'upload_date': '20201120',
             'uploader': 'Walk around Japan',
-            'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
-            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+            'uploader_id': '@walkaroundjapan7124',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/@walkaroundjapan7124',
         },
         'params': {
             'skip_download': True,
@@ -1276,11 +1334,11 @@
         'info_dict': {
             'id': '4L2J27mJ3Dc',
             'ext': 'mp4',
+            'title': 'Midwest Squid Game #Shorts',
+            'description': 'md5:976512b8a29269b93bbd8a61edc45a6d',
             'upload_date': '20211025',
             'uploader': 'Charlie Berens',
-            'description': 'md5:976512b8a29269b93bbd8a61edc45a6d',
-            'uploader_id': 'fivedlrmilkshake',
-            'title': 'Midwest Squid Game #Shorts',
+            'uploader_id': '@CharlieBerens',
         },
         'params': {
             'skip_download': True,
@@ -2088,25 +2146,19 @@
         thumbnails = [{'url': thumbnail}]
 
         category = microformat.get('category') or search_meta('genre')
-        channel_id = video_details.get('channelId') \
-            or microformat.get('externalChannelId') \
-            or search_meta('channelId')
+        channel_id = self._extract_channel_id(
+            webpage, videodetails=video_details, metadata=microformat)
         duration = int_or_none(
             video_details.get('lengthSeconds')
             or microformat.get('lengthSeconds')) \
             or parse_duration(search_meta('duration'))
         is_live = video_details.get('isLive')
 
-        def gen_owner_profile_url():
-            yield microformat.get('ownerProfileUrl')
-            yield extract_attributes(self._search_regex(
-                r'''(?s)(<link[^>]+\bitemprop\s*=\s*("|')url\2[^>]*>)''',
-                get_element_by_attribute('itemprop', 'author', webpage),
-                
'owner_profile_url', default='')).get('href') + owner_profile_url = self._yt_urljoin(self._extract_author_var( + webpage, 'url', videodetails=video_details, metadata=microformat)) - owner_profile_url = next( - (x for x in map(url_or_none, gen_owner_profile_url()) if x), - None) + uploader = self._extract_author_var( + webpage, 'name', videodetails=video_details, metadata=microformat) if not player_url: player_url = self._extract_player_url(webpage) @@ -2121,13 +2173,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': unified_strdate( microformat.get('uploadDate') or search_meta('uploadDate')), - 'uploader': video_details['author'], - 'uploader_id': self._search_regex( - r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, - 'uploader id', fatal=False) if owner_profile_url else None, - 'uploader_url': owner_profile_url, + 'uploader': uploader, 'channel_id': channel_id, - 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, 'duration': duration, 'view_count': int_or_none( video_details.get('viewCount') @@ -2257,6 +2304,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + if not info['channel_id']: + channel_id = self._extract_channel_id('', renderers=contents) + if not info['uploader']: + info['uploader'] = self._extract_author_var('', 'name', renderers=contents) + if not owner_profile_url: + owner_profile_url = self._yt_urljoin(self._extract_author_var('', 'url', renderers=contents)) + for content in contents: vpir = content.get('videoPrimaryInfoRenderer') if vpir: @@ -2304,10 +2358,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) vsir = content.get('videoSecondaryInfoRenderer') if vsir: - info['channel'] = get_text(try_get( - vsir, - lambda x: x['owner']['videoOwnerRenderer']['title'], - dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], @@ -2365,7 +2415,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.mark_watched(video_id, player_response) - return info + return merge_dicts( + info, { + 'uploader_id': self._extract_uploader_id(owner_profile_url), + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': channel_id and self._yt_urljoin('/channel/' + channel_id), + 'channel': info['uploader'], + }) class YoutubeTabIE(YoutubeBaseInfoExtractor): @@ -2394,6 +2451,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'description': 'Short clips from Super Cooper Sundays!', 'id': 'UCKMA8kHZ8bPYpnMNaUSxfEQ', 'title': 'Super Cooper Shorts - Shorts', + 'uploader': 'Super Cooper Shorts', + 'uploader_id': '@SuperCooperShorts', } }, { # Channel that does not have a Shorts tab. 
Test should just download videos on Home tab instead @@ -2404,14 +2463,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Emergency Awesome - Home', }, 'playlist_mincount': 5, + 'skip': 'new test page needed to replace `Emergency Awesome - Shorts`', }, { # playlists, multipage 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Igor Kleiner', + 'uploader_id': '@IgorDataScience', }, }, { # playlists, multipage, different order @@ -2419,8 +2481,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Igor Kleiner', + 'uploader_id': '@IgorDataScience', }, }, { # playlists, series @@ -2430,6 +2494,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Playlists', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': '@3blue1brown', }, }, { # playlists, singlepage @@ -2439,6 +2505,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', 'title': 'ThirstForScience - Playlists', 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': '@ThirstForScience', } }, { 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', @@ -2447,20 +2515,22 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # basic, single video playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'title': 'youtube-dl public playlist', + 'uploader': 'Sergey M.', + 'uploader_id': '@sergeym.6173', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', }, 'playlist_count': 1, }, { # empty playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'title': 'youtube-dl empty playlist', + 'uploader': 'Sergey M.', + 'uploader_id': '@sergeym.6173', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', }, 'playlist_count': 0, }, { @@ -2470,6 +2540,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Home', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 2, }, { @@ -2479,6 +2551,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 975, }, { @@ -2488,6 +2562,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 199, }, { @@ -2497,6 +2573,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Playlists', 
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 17, }, { @@ -2506,6 +2584,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 18, }, { @@ -2515,8 +2595,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, - 'playlist_mincount': 138, + 'playlist_mincount': 75, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, @@ -2533,7 +2615,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'uploader_id': '@ChRiStIaAn008', + 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, }, { @@ -2543,7 +2626,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader_id': '@Cauchemar89', + 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 1123, }, { @@ -2557,7 +2641,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + 'uploader_id': '@InterstellarMovie', + 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }, { @@ -2566,8 +2651,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'uploader': 'Computerphile', + 'uploader_id': '@Computerphile', + 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', }, 'playlist_mincount': 11, }, { @@ -2605,14 +2691,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': '9Auq9mYxFEE', + 'id': r're:[\da-zA-Z_-]{8,}', 'ext': 'mp4', - 'title': 'Watch Sky News live', + 'title': r're:(?s)[A-Z].{20,}', 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': '20191102', - 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'uploader_id': '@SkyNews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@SkyNews', + 'upload_date': r're:\d{8}', + 'description': r're:(?s)(?:.*\n)+SUBSCRIBE to our YouTube channel for more videos: http://www\.youtube\.com/skynews *\n.*', 'categories': ['News & Politics'], 'tags': list, 'like_count': int, @@ -2701,34 +2787,22 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, + 'playlist_mincount': 20, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader_id': '@3blue1brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', } }] @classmethod def 
suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( + return not YoutubeIE.suitable(url) and super( YoutubeTabIE, cls).suitable(url) - def _extract_channel_id(self, webpage): - channel_id = self._html_search_meta( - 'channelId', webpage, 'channel id', default=None) - if channel_id: - return channel_id - channel_url = self._html_search_meta( - ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', - 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', - 'twitter:app:url:googleplay'), webpage, 'channel url') - return self._search_regex( - r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', - channel_url, 'channel id') - @staticmethod def _extract_grid_item_renderer(item): assert isinstance(item, dict) @@ -3116,27 +3190,18 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): else: raise ExtractorError('Unable to find selected tab') - @staticmethod - def _extract_uploader(data): + def _extract_uploader(self, metadata, data): uploader = {} - sidebar_renderer = try_get( - data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) - if sidebar_renderer: - for item in sidebar_renderer: - if not isinstance(item, dict): - continue - renderer = item.get('playlistSidebarSecondaryInfoRenderer') - if not isinstance(renderer, dict): - continue - owner = try_get( - renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) - if owner: - uploader['uploader'] = owner.get('text') - uploader['uploader_id'] = try_get( - owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) - uploader['uploader_url'] = urljoin( - 'https://www.youtube.com/', - try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + renderers = traverse_obj(data, + ('sidebar', 'playlistSidebarRenderer', 'items')) + uploader['channel_id'] = self._extract_channel_id('', metadata=metadata, renderers=renderers) + uploader['uploader'] = ( + self._extract_author_var('', 'name', renderers=renderers) + or self._extract_author_var('', 'name', metadata=metadata)) + uploader['uploader_url'] = self._yt_urljoin( + self._extract_author_var('', 'url', metadata=metadata, renderers=renderers)) + uploader['uploader_id'] = self._extract_uploader_id(uploader['uploader_url']) + uploader['channel'] = uploader['uploader'] return uploader @staticmethod @@ -3187,8 +3252,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) - playlist.update(self._extract_uploader(data)) - return playlist + return merge_dicts(playlist, self._extract_uploader(renderer, data)) def _extract_from_playlist(self, item_id, url, data, playlist): title = playlist.get('title') or try_get( @@ -3275,8 +3339,9 @@ class YoutubePlaylistIE(InfoExtractor): 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + 'uploader': 'Wickman', + 'uploader_id': '@WickmanVT', + 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', }, 'playlist_mincount': 29, }, { @@ -3290,21 +3355,25 @@ class YoutubePlaylistIE(InfoExtractor): }, { 'note': 'embedded', 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, + # TODO: full playlist requires _reload_with_unavailable_videos() + # 'playlist_count': 4, + 'playlist_mincount': 1, 'info_dict': { 'title': 'JODA15', 'id': 
'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + 'uploader_id': '@milan5503', + 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', } }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 982, + 'playlist_mincount': 455, 'info_dict': { 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'uploader': 'LBK', - 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + 'uploader_id': '@music_king', + 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', } }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', @@ -3342,8 +3411,8 @@ class YoutubeYtBeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Small Scale Baler and Braiding Rugs', 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'uploader_id': '@backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum', 'upload_date': '20161008', 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', 'categories': ['Nonprofits & Activism'], From 3da17834a49fad2a97c308fdd89aa26781ef4d60 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 28 Feb 2023 23:03:44 +0530 Subject: [PATCH 26/35] [Youtube] Construct dash formats with `range` query See yt-dlp/yt_dlp#6369 --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4246d84f9..89711c84e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1694,8 +1694,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if n_response is None: # give up if descrambling failed break - fmt['url'] = update_url( - parsed_fmt_url, query_update={'n': [n_response]}) + for fmt_dct in traverse_obj(fmt, (None, (None, ('fragments', Ellipsis))), expected_type=dict): + fmt_dct['url'] = update_url( + fmt_dct['url'], query_update={'n': [n_response]}) # from yt-dlp, with tweaks def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -2047,10 +2048,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if no_video: dct['abr'] = tbr if no_audio or no_video: - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } + CHUNK_SIZE = 10 << 20 + # avoid Youtube throttling + dct.update({ + 'protocol': 'http_dash_segments', + 'fragments': [{ + 'url': update_url_query(dct['url'], { + 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, dct['filesize'])) + }) + } for range_start in range(0, dct['filesize'], CHUNK_SIZE)] + } if dct['filesize'] else { + 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful? 
+ }) + if dct.get('ext'): dct['container'] = dct['ext'] + '_dash' formats.append(dct) From 3e92c60fcd94c37428d57153dbdd14cd0a1f9226 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 3 Mar 2023 16:48:54 +0530 Subject: [PATCH 27/35] [jsinterp] Handle `Date` at epoch 0 See yt-dlp/yt_dlp#6400 --- test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index ac37ffa45..decf7ee38 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -67,6 +67,10 @@ _SIG_TESTS = [ ] _NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/7862ca1f/player_ias.vflset/en_US/base.js', + 'X_LCxVDjAavgE5t', 'yxJ1dM6iz5ogUg', + ), ( 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a3bc42a61..e28670a3f 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -405,7 +405,7 @@ class JSInterpreter(object): left, right = self._separate_at_paren(obj[len(klass):]) argvals = self.interpret_iter(left, local_vars, allow_recursion) expr = konstr(*argvals) - if not expr: + if expr is None: raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) expr = self._dump(expr, local_vars) + right break From 040271022709c4d20d33c604d1dbc72dc2da472d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Mar 2023 23:07:07 +0000 Subject: [PATCH 28/35] [jsinterp] Fix regexp parsing and .replace[All] method * For performance, make regexp object instantiation lazy * Other small performance improvements --- test/test_jsinterp.py | 46 ++++++++++++++++++----- youtube_dl/jsinterp.py | 84 ++++++++++++++++++++++++++++-------------- 2 files changed, 93 insertions(+), 37 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b5962356c..5d129433d 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -139,21 +139,16 @@ class TestJSInterpreter(unittest.TestCase): self.assertTrue(math.isnan(jsi.call_function('x'))) def test_Date(self): - jsi = JSInterpreter(''' - function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) - jsi = JSInterpreter(''' function x(dt) { return new Date(dt) - 0; } ''') self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) # date format m/d/y - jsi = JSInterpreter(''' - function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) + self.assertEqual(jsi.call_function('x', '12/31/1969 18:01:26 MDT'), 86000) + + # epoch 0 + self.assertEqual(jsi.call_function('x', '1 January 1970 00:00:00 UTC'), 0) def test_call(self): jsi = JSInterpreter(''' @@ -445,7 +440,7 @@ class TestJSInterpreter(unittest.TestCase): self.assertIs(jsi.call_function('x'), None) jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; return a; } + function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; } ''') attrs = set(('findall', 'finditer', 'flags', 'groupindex', 'groups', 'match', 'pattern', 'scanner', @@ -457,6 +452,31 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace("data-", ""); return a } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = 
JSInterpreter(r''' + function x() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace(/^.+-/, ""); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace(/a/g, "o"); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'doto-nome') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replaceAll("a", "o"); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'doto-nome') + jsi = JSInterpreter(r''' function x() { let a=[/[)\\]/]; return a[0]; } ''') @@ -485,6 +505,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + """ # fails so far + def test_packed(self): + jsi = JSInterpreter('''function x(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') + self.assertEqual(jsi.call_function('x', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 
9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + """ + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index e28670a3f..ab7d6f926 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -12,9 +12,11 @@ from .utils import ( js_to_json, remove_quotes, unified_timestamp, + variadic, ) from .compat import ( compat_basestring, + compat_chr, compat_collections_chain_map as ChainMap, compat_itertools_zip_longest as zip_longest, compat_str, @@ -205,10 +207,10 @@ class JSInterpreter(object): super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) class JS_RegExp(object): - _RE_FLAGS = { + RE_FLAGS = { # special knowledge: Python's re flags are bitmask values, current max 128 # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags + # TODO: execute matches with these flags (remaining: d, y) 'd': 1024, # Generate indices for substring matches 'g': 2048, # Global search 'i': re.I, # Case-insensitive search @@ -218,12 +220,19 @@ class JSInterpreter(object): 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string } - def __init__(self, pattern_txt, flags=''): + def __init__(self, pattern_txt, flags=0): if isinstance(flags, compat_str): flags, _ = self.regex_flags(flags) - # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern # First, avoid 
https://github.com/python/cpython/issues/74534
-            self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags)
+            self.__self = None
+            self.__pattern_txt = pattern_txt.replace('[[', r'[\[')
+            self.__flags = flags
+
+        def __instantiate(self):
+            if self.__self:
+                return
+            self.__self = re.compile(self.__pattern_txt, self.__flags)
+            # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
             for name in dir(self.__self):
                 # Only these? Obviously __class__, __init__.
                 # PyPy creates a __weakref__ attribute with value None
@@ -232,15 +241,21 @@ class JSInterpreter(object):
                     continue
                 setattr(self, name, getattr(self.__self, name))
 
+        def __getattr__(self, name):
+            self.__instantiate()
+            if hasattr(self, name):
+                return getattr(self, name)
+            return super(JSInterpreter.JS_RegExp, self).__getattr__(name)
+
         @classmethod
         def regex_flags(cls, expr):
             flags = 0
             if not expr:
                 return flags, expr
             for idx, ch in enumerate(expr):
-                if ch not in cls._RE_FLAGS:
+                if ch not in cls.RE_FLAGS:
                     break
-                flags |= cls._RE_FLAGS[ch]
+                flags |= cls.RE_FLAGS[ch]
             return flags, expr[idx + 1:]
 
     @classmethod
@@ -265,17 +280,17 @@ class JSInterpreter(object):
         counters = dict((k, 0) for k in _MATCHING_PARENS.values())
         start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
         in_quote, escaping, skipping = None, False, 0
-        after_op, in_regex_char_group, skip_re = True, False, 0
+        after_op, in_regex_char_group = True, False
 
         for idx, char in enumerate(expr):
-            if skip_re > 0:
-                skip_re -= 1
-                continue
+            paren_delta = 0
             if not in_quote:
                 if char in _MATCHING_PARENS:
                     counters[_MATCHING_PARENS[char]] += 1
+                    paren_delta = 1
                 elif char in counters:
                     counters[char] -= 1
+                    paren_delta = -1
             if not escaping:
                 if char in _QUOTES and in_quote in (char, None):
                     if in_quote or after_op or char != '/':
                         in_quote = None if in_quote and not in_regex_char_group else char
                 elif in_quote == '/' and char in '[]':
                     in_regex_char_group = char == '['
             escaping = not escaping and in_quote and char == '\\'
-            after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op))
+            after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and char.isspace()))
 
             if char != delim[pos] or any(counters.values()) or in_quote:
                 pos = skipping = 0
                 continue
             elif pos == 0 and skip_delims:
                 here = expr[idx:]
-                for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]:
+                for s in variadic(skip_delims):
                     if here.startswith(s) and s:
                         skipping = len(s) - 1
                         break
@@ -316,7 +331,7 @@ class JSInterpreter(object):
         separated = list(cls._separate(expr, delim, 1))
         if len(separated) < 2:
-            raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals()))
+            raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals()))
         return separated[0][1:].strip(), separated[1].strip()
 
     @staticmethod
@@ -361,6 +376,20 @@ class JSInterpreter(object):
         except TypeError:
             return self._named_object(namespace, obj)
 
+    # used below
+    _VAR_RET_THROW_RE = re.compile(r'''(?x)
+        (?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+)
+        ''')
+    _COMPOUND_RE = re.compile(r'''(?x)
+        (?P<try>try)\s*\{|
+        (?P<if>if)\s*\(|
+        (?P<switch>switch)\s*\(|
+        (?P<for>for)\s*\(|
+        (?P<while>while)\s*\(
+        ''')
+    _FINALLY_RE = re.compile(r'finally\s*\{')
+    _SWITCH_RE = re.compile(r'switch\s*\(')
+
     def interpret_statement(self, stmt, local_vars, allow_recursion=100):
         if allow_recursion < 0:
             raise self.Exception('Recursion limit reached')
@@ -375,7 +404,7 @@ class JSInterpreter(object):
             if should_return:
                return ret, should_return
 
-        m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt)
+        m = self._VAR_RET_THROW_RE.match(stmt)
         if m:
             expr = stmt[len(m.group(0)):].strip()
             if m.group('throw'):
@@ -447,13 +476,7 @@ class JSInterpreter(object):
                              for item in self._separate(inner)])
             expr = name + outer
 
-        m = re.match(r'''(?x)
-            (?P<try>try)\s*\{|
-            (?P<if>if)\s*\(|
-            (?P<switch>switch)\s*\(|
-            (?P<for>for)\s*\(|
-            (?P<while>while)\s*\(
-            ''', expr)
+        m = self._COMPOUND_RE.match(expr)
         md = m.groupdict() if m else {}
         if md.get('if'):
             cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
@@ -512,7 +535,7 @@ class JSInterpreter(object):
                 err = None
                 pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion)
 
-            m = re.match(r'finally\s*\{', expr)
+            m = self._FINALLY_RE.match(expr)
             if m:
                 sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
                 ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
@@ -531,7 +554,7 @@ class JSInterpreter(object):
             if remaining.startswith('{'):
                 body, expr = self._separate_at_paren(remaining)
             else:
-                switch_m = re.match(r'switch\s*\(', remaining)  # FIXME
+                switch_m = self._SWITCH_RE.match(remaining)  # FIXME
                 if switch_m:
                     switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:])
                     body, expr = self._separate_at_paren(remaining, '}')
@@ -735,7 +758,7 @@ class JSInterpreter(object):
         if obj == compat_str:
             if member == 'fromCharCode':
                 assertion(argvals, 'takes one or more arguments')
-                return ''.join(map(chr, argvals))
+                return ''.join(map(compat_chr, argvals))
             raise self.Exception('Unsupported string method ' + member, expr=expr)
         elif obj == float:
             if member == 'pow':
@@ -808,10 +831,17 @@ class JSInterpreter(object):
                     if idx >= len(obj):
                         return None
                     return ord(obj[idx])
-                elif member == 'replace':
+                elif member in ('replace', 'replaceAll'):
                     assertion(isinstance(obj, compat_str), 'must be applied on a string')
                     assertion(len(argvals) == 2, 'takes exactly two arguments')
-                    return re.sub(argvals[0], argvals[1], obj)
+                    # TODO: argvals[1] callable, other Py vs JS edge cases
+                    if isinstance(argvals[0], self.JS_RegExp):
+                        count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1
+                        assertion(member != 'replaceAll' or count == 0,
+                                  'replaceAll must be called with a global RegExp')
+                        return argvals[0].sub(argvals[1], obj, count=count)
+                    count = ('replaceAll', 'replace').index(member)
+                    return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count)
 
         idx = int(member) if isinstance(obj, list) else member
         return obj[idx](argvals, allow_recursion=allow_recursion)

From 27d41d73655b8fbf2dedf88cac96220520d526b5 Mon Sep 17 00:00:00 2001
From: Sophira
Date: Tue, 7 Mar 2023 15:49:31 +0000
Subject: [PATCH 29/35] [doc] Recommend "Get cookies.txt LOCALLY" extension in README.md (#31763)

* remove link to suspect "Get cookies.txt" extension, dropped from Chrome store

* link to new Manifest V3-compatible open-source "Get cookies.txt LOCALLY" extension.

Fixes #31465.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6e07ddb1c..227e34046 100644
--- a/README.md
+++ b/README.md
@@ -918,7 +918,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op
 
 Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`.
 
-In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
+In order to extract cookies from your browser, use any conforming browser extension for exporting cookies. For example, [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
 
 Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.

From 8c86fd33dca48ebb505ed04150d9e35993b9fe7e Mon Sep 17 00:00:00 2001
From: dirkf
Date: Thu, 9 Mar 2023 16:40:30 +0000
Subject: [PATCH 30/35] [doc] Improve "guidance" on bug reporting

---
 README.md | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 227e34046..14a3d6c86 100644
--- a/README.md
+++ b/README.md
@@ -1408,7 +1408,11 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
 
 # BUGS
 
-Bugs and suggestions should be reported at: <https://github.com/ytdl-org/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
+Bugs and suggestions should be reported in the issue tracker: <https://github.com/ytdl-org/youtube-dl/issues> (<https://yt-dl.org/bug> is an alias for this). Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
+
+## Opening a bug report or suggestion
+
+Be sure to follow instructions provided **below** and **in the issue tracker**. Complete the appropriate issue template fully. Consider whether your problem is covered by an existing issue: if so, follow the discussion there. Avoid commenting on existing duplicate issues as such comments do not add to the discussion of the issue and are liable to be treated as spam.
 
 **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
 ```
$ youtube-dl -v <your command line>
 The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
 
-Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist):
+Finally, please review your issue to avoid the common mistakes listed below (you can and should use this as a checklist).
### Is the description of the issue itself sufficient? -We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. +We often get issue reports that are hard to understand. To avoid subsequent clarifications, and to assist participants who are not native English speakers, please elaborate on what feature you are requesting, or what bug you want to be fixed. -So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious +Make sure that it's obvious - What the problem is - How it could be fixed -- How your proposed solution would look like +- How your proposed solution would look If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. @@ -1448,14 +1452,14 @@ If your server has multiple IPs or you suspect censorship, adding `--call-home` **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. +### Is the issue already documented? + +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. Initially, at least, use the search term `-label:duplicate` to focus on active issues. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. + ### Are you using the latest version? Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. -### Is the issue already documented? - -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. - ### Why are existing options not enough? Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. 
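The `http_dash_segments` protocol that the next patch adds to FFmpegFD's whitelist is the one produced by the `range`-query change in [PATCH 26/35] above. As a rough illustration only — a minimal sketch with a made-up helper name and example values, not the extractor's actual code — each progressive format URL is split into ~10 MiB byte-range fragments:

```
# Sketch: split one progressive DASH URL into ~10 MiB byte-range fragments,
# as in "[Youtube] Construct dash formats with `range` query" above.
# `range_fragments` and the example URL are illustrative, not youtube-dl API.
CHUNK_SIZE = 10 << 20  # YouTube throttles single HTTP chunks above ~10 MiB


def range_fragments(url, filesize):
    # one fragment per CHUNK_SIZE window: 0-10485759, 10485760-20971519, ...
    return [{
        'url': '{0}&range={1}-{2}'.format(
            url, start, min(start + CHUNK_SIZE - 1, filesize)),
    } for start in range(0, filesize, CHUNK_SIZE)]


# e.g. a 25 MiB file yields 3 such fragments
print(len(range_fragments('https://example.com/videoplayback?a=1', 25 << 20)))
```

Each fragment is then fetched by the fragment downloader instead of in a single request, which is why an external downloader such as ffmpeg must also recognise the `http_dash_segments` protocol to handle these formats.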
From 5c985d4f81a43ada75dafb23233e7fe39913907a Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 11 Mar 2023 12:09:55 +0000 Subject: [PATCH 31/35] [downloader] Let _ffmpeg_ handle DASH segments Fixes https://github.com/ytdl-org/youtube-dl/issues/31792 after 3da1783. --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bffcd10b6..1b6bd1fa2 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -273,7 +273,7 @@ class HttpieFD(ExternalFD): class FFmpegFD(ExternalFD): @classmethod def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms', 'http_dash_segments') @classmethod def available(cls): From baa6c5e95cb307e7d716645780ff8aef22de6aca Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 11 Mar 2023 12:17:00 +0000 Subject: [PATCH 32/35] [FragmentFD] Respect `--no-continue` * discard partial fragment on `--no-continue` * continue with correct progress display otherwise Resolves #21467 --- youtube_dl/downloader/common.py | 24 +++++++++++----- youtube_dl/downloader/dash.py | 10 +++---- youtube_dl/downloader/fragment.py | 46 +++++++++++++++++++++---------- youtube_dl/downloader/http.py | 15 ++++------ 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1cdba89cd..c86ce2aa5 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -88,17 +88,21 @@ class FileDownloader(object): return '---.-%' return '%6s' % ('%3.1f%%' % percent) - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, *args): + if len(args) < 2: + rate, remaining = (start_or_rate, now_or_remaining) + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + start, now = (start_or_rate, now_or_remaining) + total, current = args if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def format_eta(eta): @@ -123,6 +127,12 @@ class FileDownloader(object): def format_retries(retries): return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod + def filesize_or_none(unencoded_filename): + fn = encodeFilename(unencoded_filename) + if os.path.isfile(fn): + return os.path.getsize(fn) + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index c6d674bc6..cc30485f8 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,8 +38,7 @@ class DashSegmentsFD(FragmentFD): # In DASH, the first segment contains necessary headers to # generate a valid MP4 file, so always abort for the first segment fatal = i == 0 or not skip_unavailable_fragments - count = 0 - while count <= fragment_retries: + for count in range(fragment_retries + 1): try: fragment_url = fragment.get('url') if not fragment_url: @@ -57,9 +56,8 @@ class DashSegmentsFD(FragmentFD): # is usually enough) thus allowing 
to download the whole file successfully. # To be future-proof we will retry all fragments that fail with any # HTTP error. - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count < fragment_retries: + self.report_retry_fragment(err, frag_index, count + 1, fragment_retries) except DownloadError: # Don't retry fragment if error occurred during HTTP downloading # itself since it has own retry settings @@ -68,7 +66,7 @@ class DashSegmentsFD(FragmentFD): break raise - if count > fragment_retries: + if count >= fragment_retries: if not fatal: self.report_skip_fragment(frag_index) continue diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 35c76feba..913e91b64 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -71,7 +71,7 @@ class FragmentFD(FileDownloader): @staticmethod def __do_ytdl_file(ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' + return ctx['live'] is not True and ctx['tmpfilename'] != '-' def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx @@ -101,6 +101,13 @@ class FragmentFD(FileDownloader): 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none( + self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = frag_resume_len + ctx['frag_resume_len'] = frag_resume_len or 0 + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None @@ -124,9 +131,7 @@ class FragmentFD(FileDownloader): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -136,10 +141,11 @@ class FragmentFD(FileDownloader): self.to_screen( '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) self.report_destination(ctx['filename']) + continuedl = self.params.get('continuedl', True) dl = HttpQuietDownloader( self.ydl, { - 'continuedl': True, + 'continuedl': continuedl, 'quiet': True, 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), @@ -150,12 +156,11 @@ class FragmentFD(FileDownloader): ) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) or 0 + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -164,7 +169,8 @@ class FragmentFD(FileDownloader): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -178,7 +184,12 @@ class FragmentFD(FileDownloader): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -209,6 +220,7 @@ class 
FragmentFD(FileDownloader): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -218,6 +230,9 @@ class FragmentFD(FileDownloader): if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + time_now = time.time() state['elapsed'] = time_now - start frag_total_bytes = s.get('total_bytes') or 0 @@ -232,16 +247,17 @@ class FragmentFD(FileDownloader): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) @@ -268,7 +284,7 @@ class FragmentFD(FileDownloader): os.utime(ctx['filename'], (time.time(), filetime)) except Exception: pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + downloaded_bytes = self.filesize_or_none(ctx['filename']) or 0 self._hook_progress({ 'downloaded_bytes': downloaded_bytes, diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index d8ac41dcc..440471aa0 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,9 +58,9 @@ class HttpFD(FileDownloader): if self.params.get('continuedl', True): # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) + ctx.resume_len = info_dict.get('frag_resume_len') + if ctx.resume_len is None: + ctx.resume_len = self.filesize_or_none(ctx.tmpfilename) or 0 ctx.is_resume = ctx.resume_len > 0 @@ -115,9 +115,9 @@ class HttpFD(FileDownloader): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked - # to match the value of requested Range HTTP header. This is due to a webservers + # to match the value of requested Range HTTP header. 
This is due to webservers # that don't support resuming and serve a whole file with no Content-Range - # set in response despite of requested Range (see + # set in response despite requested Range (see # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) if has_range: content_range = ctx.data.headers.get('Content-Range') @@ -293,10 +293,7 @@ class HttpFD(FileDownloader): # Progress message speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if ctx.data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - ctx.resume_len)) self._hook_progress({ 'status': 'downloading', From e8de54bce50f6f77a4d7e8e80675f7003d5bf630 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 13 Mar 2023 19:45:54 +0000 Subject: [PATCH 33/35] [core] Handle `/../` sequences in HTTP URLs * use Python's RFC implementation for embedded sequences * hack: strip unbalanced leading `../` from path, like eg Firefox See https://github.com/yt-dlp/yt-dlp/issues/3355 --- youtube_dl/YoutubeDL.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8e8546596..bcf781744 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,7 @@ from .compat import ( compat_str, compat_tokenize_tokenize, compat_urllib_error, + compat_urllib_parse, compat_urllib_request, compat_urllib_request_DataHandler, ) @@ -60,6 +61,7 @@ from .utils import ( format_bytes, formatSeconds, GeoRestrictedError, + HEADRequest, int_or_none, ISO3166Utils, locked_file, @@ -74,6 +76,7 @@ from .utils import ( preferredencoding, prepend_extension, process_communicate_or_kill, + PUTRequest, register_socks_protocols, render_table, replace_extension, @@ -2297,6 +2300,27 @@ class YoutubeDL(object): """ Start an HTTP download """ if isinstance(req, compat_basestring): req = sanitized_Request(req) + # an embedded /../ sequence is not automatically handled by urllib2 + # see https://github.com/yt-dlp/yt-dlp/issues/3355 + url = req.get_full_url() + parts = url.partition('/../') + if parts[1]: + url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2]) + if url: + # worse, URL path may have initial /../ against RFCs: work-around + # by stripping such prefixes, like eg Firefox + parts = compat_urllib_parse.urlsplit(url) + path = parts.path + while path.startswith('/../'): + path = path[3:] + url = parts._replace(path=path).geturl() + # get a new Request with the munged URL + if url != req.get_full_url(): + req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get( + req.get_method(), compat_urllib_request.Request) + req = req_type( + url, data=req.data, headers=dict(req.header_items()), + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): From 70ff01391068c98b4377c5cc17a8d00d5645e734 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 14 Mar 2023 00:58:59 +0000 Subject: [PATCH 34/35] [devscripts] Add a hack to convert command-line options to API options --- devscripts/cli_to_api.py | 64 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 devscripts/cli_to_api.py diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py new file mode 100755 index 000000000..2f4d6a458 --- /dev/null +++ b/devscripts/cli_to_api.py @@ -0,0 +1,64 @@ 
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+"""
+This script displays the API parameters corresponding to a yt-dl command line
+
+Example:
+$ ./cli_to_api.py -f best
+{u'format': 'best'}
+$
+"""
+
+# Allow direct execution
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import youtube_dl
+from types import MethodType
+
+
+def cli_to_api(*opts):
+    YDL = youtube_dl.YoutubeDL
+
+    # to extract the parsed options, break out of YoutubeDL instantiation
+
+    # return options via this Exception
+    class ParseYTDLResult(Exception):
+        def __init__(self, result):
+            super(ParseYTDLResult, self).__init__('result')
+            self.opts = result
+
+    # replacement constructor that raises ParseYTDLResult
+    def ytdl_init(ydl, ydl_opts):
+        super(YDL, ydl).__init__(ydl_opts)
+        raise ParseYTDLResult(ydl_opts)
+
+    # patch in the constructor
+    YDL.__init__ = MethodType(ytdl_init, YDL)
+
+    # core parser
+    def parsed_options(argv):
+        try:
+            youtube_dl._real_main(list(argv))
+        except ParseYTDLResult as result:
+            return result.opts
+
+    # from https://github.com/yt-dlp/yt-dlp/issues/5859#issuecomment-1363938900
+    default = parsed_options([])
+    diff = dict((k, v) for k, v in parsed_options(opts).items() if default[k] != v)
+    if 'postprocessors' in diff:
+        diff['postprocessors'] = [pp for pp in diff['postprocessors'] if pp not in default['postprocessors']]
+    return diff
+
+
+def main():
+    from pprint import pprint
+    pprint(cli_to_api(*sys.argv[1:]))  # drop the script name from argv
+
+
+if __name__ == '__main__':
+    main()

From 6fece0a96b3cd8677f5c1185a57c6e21403fcb44 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Tue, 14 Mar 2023 13:01:32 +0000
Subject: [PATCH 35/35] [AENetworksBaseIE] Report missing show data instead of crash

---
 youtube_dl/extractor/aenetworks.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 2a1f08e39..59fbe048a 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -8,6 +8,8 @@ from ..utils import (
     ExtractorError,
     GeoRestrictedError,
     int_or_none,
+    remove_start,
+    traverse_obj,
     update_url_query,
     urlencode_postdata,
 )
@@ -33,14 +35,17 @@ class AENetworksBaseIE(ThePlatformIE):
     }
 
     def _extract_aen_smil(self, smil_url, video_id, auth=None):
-        query = {'mbr': 'true'}
+        query = {
+            'mbr': 'true',
+            'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
+        }
         if auth:
             query['auth'] = auth
         TP_SMIL_QUERY = [{
             'assetTypes': 'high_video_ak',
-            'switch': 'hls_high_ak'
+            'switch': 'hls_high_ak',
         }, {
-            'assetTypes': 'high_video_s3'
+            'assetTypes': 'high_video_s3',
        }, {
             'assetTypes': 'high_video_s3',
             'switch': 'hls_high_fastly',
@@ -75,7 +80,14 @@ class AENetworksBaseIE(ThePlatformIE):
         requestor_id, brand = self._DOMAIN_MAP[domain]
         result = self._download_json(
             'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
-            filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
+            filter_value, query={'filter[%s]' % filter_key: filter_value})
+        result = traverse_obj(
+            result, ('results',
+                     lambda k, v: k == 0 and v[filter_key] == filter_value),
+            get_all=False)
+        if not result:
+            raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
+                                 video_id=remove_start(filter_value, '/'))
         title = result['title']
         video_id = result['id']
         media_url = result['publicUrl']
@@ -126,7 +138,7 @@ class AENetworksIE(AENetworksBaseIE):
             'skip_download': True,
         },
         'add_ie': ['ThePlatform'],
-        'skip':
'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -143,6 +155,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True