From d947ffe8e385a541f44c6125b4cbc269de6055a4 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 00:19:48 +0000 Subject: [PATCH] [IGN] Overhaul extractor to avoid URL redirection loop Consequently/also: * centralise video data extraction * detect 404 and 503 expected errors * handle the test video in IGNVideo * handle two additional page formats for the tests in IGNArticle --- youtube_dl/extractor/ign.py | 347 ++++++++++++++++++++++++++---------- 1 file changed, 252 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 0d9f50ed2..c7daa30e5 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -1,19 +1,29 @@ +# coding: utf-8 + from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( + compat_filter as filter, + compat_HTTPError, compat_parse_qs, - compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( - HEADRequest, determine_ext, + error_to_compat_str, + extract_attributes, + ExtractorError, int_or_none, + merge_dicts, + orderedSet, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise + + def _extract_video_info(self, video, fatal=True): + video_id = video['videoId'] + + formats = [] + refs = traverse_obj(video, 'refs', expected_type=dict) or {} + + m3u8_url = url_or_none(refs.get('m3uUrl')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = url_or_none(refs.get('f4mUrl')) + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + for asset in (video.get('assets') or []): + asset_url = url_or_none(asset.get('url')) + if not asset_url: + continue + formats.append({ + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + + if formats or fatal: + self._sort_formats(formats) + else: + return + + thumbnails = traverse_obj( + video, ('thumbnails', Ellipsis, {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', Ellipsis, 'displayName'), + expected_type=lambda x: x.strip() or None) + + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + 'tags': tags, + } + + # yt-dlp shim + @classmethod + def _extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) + class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' + _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P.+?)' + _PLAYLIST_PATH_RE = r'(?:/?\?(?P[^&#]+))?' + _VALID_URL = ( + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' + % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) IE_NAME = 'ign.com' _PAGE_TYPE = 'video' @@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE): 'timestamp': 1370440800, 'upload_date': '20130605', 'tags': 'count:9', - } + }, + 'params': { + 'nocheckcertificate': True, + }, }, { 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', @@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE): 'timestamp': 1420571160, 'upload_date': '20150106', 'tags': 'count:4', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + grids = re.findall( + r'''(?s)]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)]*>''', + webpage) + return filter(None, + (urljoin(url, m.group('path')) for m in re.finditer( + r''']+\bhref\s*=\s*('|")(?P/videos%s)\1''' + % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + display_id = m.group('id') + if display_id: + return self._extract_video(url, display_id) + display_id = m.group('filt') or 'all' + return self._extract_playlist(url, display_id) + + def _extract_playlist(self, url, display_id): + webpage = self._download_webpage(url, display_id) + + return self.playlist_result( + (self.url_result(u, ie=self.ie_key()) + for u in self._extract_embed_urls(url, webpage)), + playlist_id=display_id) + + def _extract_video(self, url, display_id): display_id = self._match_id(url) - video = self._call_api(display_id) - video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] + video = self._checked_call_api(display_id) - formats = [] - refs = video.get('refs') or {} + info = self._extract_video_info(video) - m3u8_url = refs.get('m3uUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - f4m_url = refs.get('f4mUrl') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) - - for asset in (video.get('assets') or []): - asset_url = asset.get('url') - if not asset_url: - continue - formats.append({ - 'url': asset_url, - 'tbr': int_or_none(asset.get('bitrate'), 1000), - 'fps': int_or_none(asset.get('frame_rate')), - 'height': int_or_none(asset.get('height')), - 'width': int_or_none(asset.get('width')), - }) - - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) - if mezzanine_url: - formats.append({ - 'ext': determine_ext(mezzanine_url, 'mp4'), - 'format_id': 'mezzanine', - 'preference': 1, - 'url': mezzanine_url, - }) - - self._sort_formats(formats) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), + return merge_dicts({ 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } + }, info) -class IGNVideoIE(InfoExtractor): +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = compat_urlparse.urlparse(url) + embed_url = compat_urlparse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.geturl() ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + compat_urlparse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', @@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', @@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE): }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + if e.cause.code == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.code == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): - yield self.url_result(video_url) + if article: + # obsolete ? + def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P.+?)]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) + + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)