From b6a24ad93d9d4d92356b0c489d3e4a3dd2a0a3f6 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 22 Feb 2022 18:47:03 +0000 Subject: [PATCH 1/4] [Anvato] Update extractor for new API; fix tests --- youtube_dl/extractor/anvato.py | 84 ++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index b7398563b..eb00975c5 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -17,6 +17,7 @@ from ..utils import ( intlist_to_bytes, int_or_none, strip_jsonp, + try_get, unescapeHTML, unsmuggle_url, ) @@ -203,6 +204,7 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + _API_PREFIX = 'https://tkx.mp.lura.live/rest/v2/' _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' _ANVP_RE = r']+\bdata-anvp\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' @@ -211,15 +213,18 @@ class AnvatoIE(InfoExtractor): _TESTS = [{ # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', + 'only_matching': True, + }, { + # from https://miami.cbslocal.com/2022/02/12/no-appetite-for-new-miami-restaurant-glorifying-castro-communism/ + 'url': 'anvato:5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl:6197559', # 8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', 'info_dict': { - 'id': '4465496', + 'id': '6197559', 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', + 'upload_date': '20220209', + 'uploader': 'CBS', + 'description': 'CBS4\'s Joel Waldman has more on the backlash Cafe Habana is receiving.', + 'timestamp': 1644381300, + 'title': 'Miamians Want No Part Of New Restaurant Set To Open In Brickell That Glorifies Fidel Castro & Communism', }, 'params': { 'skip_download': True, @@ -228,46 +233,85 @@ class AnvatoIE(InfoExtractor): # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', 'only_matching': True, + }, { + # from https://sanfrancisco.cbslocal.com/2022/02/16/san-francisco-voters-recall-embattled-school-board-members/ + 'url': 'anvato:5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl:6201051', + 'info_dict': { + 'id': '6201051', + 'ext': 'mp4', + 'upload_date': '20220216', + 'uploader': 'CBS', + 'description': 'Voters were successful in their high-profile effort to recall three San Francisco school board members. Anne Makovec reports.', + 'timestamp': 1645043880, + 'title': 'Voters Recall San Francisco School Board Members', + }, + 'params': { + 'skip_download': True, + }, }] def __init__(self, *args, **kwargs): super(AnvatoIE, self).__init__(*args, **kwargs) self.__server_time = None - def _server_time(self, access_key, video_id): + def _server_time(self, access_key, video_id, server_url=None): if self.__server_time is not None: return self.__server_time + if not server_url: + server_url = self._API_PREFIX + 'server_time?anvack={ANVACK}' + self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, + server_url.format(ANVACK=access_key), video_id, note='Fetching server time')['server_time']) return self.__server_time - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - def _get_video_json(self, access_key, video_id): - # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) - server_time = self._server_time(access_key, video_id) + + def fix_template_vars(template): + return re.sub(r'\{(\{\w+})}', r'\1', template) + + # https://access.mp.lura.live/anvacks/5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl?apikey=3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA + access_info = self._download_json( + 'https://access.mp.lura.live/anvacks/%s?apikey=%s' + % (access_key, self._API_KEY), + video_id, note='Downloading access details') + server_time_url = try_get(access_info, lambda x: x['api']['time']) + + server_time_url = ( + server_time_url + and fix_template_vars(server_time_url).format(ANVACK=access_key)) + server_time = self._server_time(access_key, video_id, server_time_url) + + video_data_url = access_info['api'].get('video') + + if not video_data_url: + # use special knowledge + video_data_url = self._API_PREFIX + 'mcp/video/{{VIDEO_ID}}?anvack={{ANVACK}}' + + video_data_url = fix_template_vars(video_data_url).format(ANVACK=access_key, VIDEO_ID=video_id) + input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) auth_secret = intlist_to_bytes(aes_encrypt( bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') anvrid = md5_text(time.time() * 1000 * random.random())[:30] + + query = { + 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), + 'rtyp': 'fp', + } api = { 'anvrid': anvrid, 'anvts': server_time, + 'anvstk2': 'default', } - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) - return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, + note='Downloading video details', + query=query, data=json.dumps({'api': api}).encode('utf-8')) def _get_anvato_videos(self, access_key, video_id): From ebf0fcd916db3d462ed9ac75bc2d04381e091ba8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 22 Feb 2022 19:01:44 +0000 Subject: [PATCH 2/4] [SendtoNews] Improve _VALID_URL, fix handling API result, add/fix tests Media links now come in `configuration.sources.src` of playlist item --- youtube_dl/extractor/sendtonews.py | 133 ++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 9d9652949..2d63918a3 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -5,43 +5,45 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, + ExtractorError, float_or_none, parse_iso8601, update_url_query, int_or_none, + determine_ext, determine_protocol, + strip_or_none, + try_get, unescapeHTML, + urljoin, ) class SendtoNewsIE(InfoExtractor): - _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' - - _TEST = { + # TODO handle items with ?fk=XXXX6789&cid=1234 -> SC=XXXX6789-???????-1234 + _VALID_URL = r'https?://embed\.sendtonews\.com/(?:player\d/embed(?:player|code)\.(?:php|js)|oembed/?)\?.*\bSC=(?P[\w-]+)' + _TESTS = [{ # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', 'info_dict': { 'id': 'GxfCe0Zo7D-175909-5588' }, - 'playlist_count': 8, - # test the first video only to prevent lengthy tests - 'playlist': [{ - 'info_dict': { - 'id': '240385', - 'ext': 'mp4', - 'title': 'Indians introduce Encarnacion', - 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', - 'duration': 137.898, - 'thumbnail': r're:https?://.*\.jpg$', - 'upload_date': '20170105', - 'timestamp': 1483649762, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, + 'playlist_count': 10, + }, { + 'url': 'https://embed.sendtonews.com/player4/embedplayer.php?SC=mq3wIKSb68-1206898-8402&type=single', + 'info_dict': { + 'id': '1752278', + 'ext': 'mp4', + 'title': 'Las vegas homebuilders had banner sales year in 2021, and other top stories from January 24, 2022.', + 'description': 'LAS VEGAS HOMEBUILDERS HAD BANNER SALES YEAR IN 2021., and other top stories from January 24, 2022.', + 'timestamp': 1643063702, + 'upload_date': '20220124', + 'thumbnail': r're:https?://.*\.(?:png|jpg)$', + 'categories': ['Business'], + 'tags': list, }, - } + }] _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @@ -59,17 +61,62 @@ class SendtoNewsIE(InfoExtractor): playlist_id = self._match_id(url) data_url = update_url_query( - url.replace('embedplayer.php', 'data_read.php'), - {'cmd': 'loadInitial'}) + re.sub( + r'(?Pplayer\d)?(?embed.+?|oembed/)\?', + lambda m: '/%s/data_read.php?' % ((m.group('player') or 'player4'), ), + url), + {'cmd': 'loadInitial', 'type': 'single', }) playlist_data = self._download_json(data_url, playlist_id) + playlist = try_get(playlist_data, lambda x: x['playlistData'][0], (dict, list)) or {} + if isinstance(playlist, dict): + err = playlist.get('error', 'No or invalid data returned from API') + raise ExtractorError(err) entries = [] - for video in playlist_data['playlistData'][0]: - info_dict = self._parse_jwplayer_data( - video['jwconfiguration'], - require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) + info_dict = {} + for video in playlist: + try: + err = video.get('error') + if err and video.get('S_ID') is not None: + e = ExtractorError(err) + e.msg = err + raise e + except AttributeError: + continue + except ExtractorError as e: + self.report_warning(e.msg, playlist_id) + continue + if 'jwconfiguration' in video: + info_dict.update(self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True})) + elif 'configuration' not in video: + continue + else: + fmt_url = urljoin( + url, + try_get(video, lambda x: x['configuration']['sources']['src'])) + if not fmt_url: + continue + video_id = strip_or_none(video.get('SM_ID') or video['configuration']['mediaid']) + title = strip_or_none(video.get('S_headLine') or video['configuration']['title']) + if not video_id or not title: + continue + ext = determine_ext(fmt_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats( + fmt_url, playlist_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + else: + formats = [{ + 'url': fmt_url, + 'ext': ext, + 'width': int_or_none(video.get('SM_M_VIDEO_WIDTH')), + 'height': int_or_none(video.get('SM_M_VIDEO_HEIGHT')), + }] + info_dict['formats'] = formats - for f in info_dict['formats']: + for f in info_dict.get('formats') or []: if f.get('tbr'): continue tbr = int_or_none(self._search_regex( @@ -83,23 +130,31 @@ class SendtoNewsIE(InfoExtractor): self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) thumbnails = [] - if video.get('thumbnailUrl'): + for tn_id, tn in (('poster', video['configuration'].get('poster')), + ('normal', video.get('thumbnailUrl')), + ('small', video.get('smThumbnailUrl'))): + tn = urljoin(url, tn) + if not tn: + continue thumbnails.append({ - 'id': 'normal', - 'url': video['thumbnailUrl'], - }) - if video.get('smThumbnailUrl'): - thumbnails.append({ - 'id': 'small', - 'url': video['smThumbnailUrl'], + 'id': tn_id, + 'url': tn, }) info_dict.update({ - 'title': video['S_headLine'].strip(), - 'description': unescapeHTML(video.get('S_fullStory')), + 'id': video_id, + 'title': title, + 'description': unescapeHTML(dict_get(video, ('S_fullStory', 'S_shortSummary'))), 'thumbnails': thumbnails, - 'duration': float_or_none(video.get('SM_length')), + 'duration': float_or_none( + dict_get(video, ('SM_length', 'SM_M_LENGTH')) + or video['configuration'].get('duration')), 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + 'tags': [t for t in video.get('S_tags', '').split(',') if t], + 'categories': [c for c in video.get('S_category', '').split(',') if c], }) entries.append(info_dict) + if len(entries) == 1: + entries[0]['display_id'] = playlist_id + return entries[0] return self.playlist_result(entries, playlist_id) From 015954f21a369aa63101b3e9aca0ee06a340f93a Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 22 Feb 2022 22:20:11 +0000 Subject: [PATCH 3/4] [core] Make default upload_/release_date a compat_str Ensures CBSLocal download test passes in Python 2 (pre-release from PR #29698) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..69736acff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1529,7 +1529,7 @@ class YoutubeDL(object): # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) except (ValueError, OverflowError, OSError): pass From 108737d60067d92357358765724090b8bccdbe39 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 23 Feb 2022 02:55:38 +0000 Subject: [PATCH 4/4] [CBSLocal] Handle rehosted legacy Anvato video, add/fix tests Update regex for Anvato player JSON --- youtube_dl/extractor/anvato.py | 46 +++++++++++--------- youtube_dl/extractor/cbslocal.py | 74 ++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index eb00975c5..ae4c5f2e9 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -207,7 +207,9 @@ class AnvatoIE(InfoExtractor): _API_PREFIX = 'https://tkx.mp.lura.live/rest/v2/' _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - _ANVP_RE = r']+\bdata-anvp\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' + _ANVP_RE = ( + r']*>[^<]*?\bAnvatoPlayer\s*\(\s*["\w]+\s*\)\s*\.\s*init\s*\(\s*(?P{[^<]+?})\s*\);', + r']+\bdata-anvp\s*=\s*(["\'])(?P(?:(?!\1).)+)\1') _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' _TESTS = [{ @@ -381,26 +383,28 @@ class AnvatoIE(InfoExtractor): @staticmethod def _extract_urls(ie, webpage, video_id): entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) - if not access_key: - continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) + anvp_res = AnvatoIE._ANVP_RE + for anvp_re in anvp_res if isinstance(anvp_res, (list, tuple, )) else (anvp_res, ): + for mobj in re.finditer(anvp_re, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) return entries def _extract_anvato_videos(self, webpage, video_id): diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 3b7e1a8b9..f8bffab27 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -5,6 +5,7 @@ from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse from ..utils import ( + merge_dicts, parse_iso8601, unified_timestamp, ) @@ -14,6 +15,8 @@ class CBSLocalIE(AnvatoIE): _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' _VALID_URL = _VALID_URL_BASE + r'video/(?P\d+)' + _OLD_ANVATO_KEY = 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67' + _TESTS = [{ 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', 'info_dict': { @@ -30,10 +33,6 @@ class CBSLocalIE(AnvatoIE): }, 'categories': [ 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', 'Content\\News', 'Content\\News\\Local News', ], @@ -42,12 +41,21 @@ class CBSLocalIE(AnvatoIE): 'params': { 'skip_download': True, }, + 'expected_warnings': ('Failed to download m3u8 information', ), }] def _real_extract(self, url): + mcp_id = self._match_id(url) - return self.url_result( - 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) + webpage = self._download_webpage(url, mcp_id) + + json_ld = self._search_json_ld(webpage, mcp_id, fatal=False) or {} + json_ld.pop('url', None) + + return merge_dicts( + self._extract_anvato_videos(webpage, mcp_id) + or self.url_result(self._OLD_ANVATO_KEY + ':' + mcp_id, 'Anvato', mcp_id), + json_ld) class CBSLocalArticleIE(AnvatoIE): @@ -56,30 +64,25 @@ class CBSLocalArticleIE(AnvatoIE): _TESTS = [{ # Anvato backend 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', - 'md5': 'f0ee3081e3843f575fccef901199b212', + 'only_matching': True + }, { + 'url': 'https://losangeles.cbslocal.com/2022/02/16/rams-super-bowl-parade-to-take-place-wednesday/', + 'md5': '36bdac3fb24ec8a6d7790218a0357b08', 'info_dict': { - 'id': '3401037', + 'id': '6201053', 'ext': 'mp4', - 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', - 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.', - 'thumbnail': 're:^https?://.*', - 'timestamp': 1463440500, - 'upload_date': '20160516', + 'display_id': 'rams-super-bowl-parade-to-take-place-wednesday', + 'upload_date': '20220216', 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, + 'description': 'Jeff Nguyen is live from outside the L.A. Memorial Coliseum where fans cheered on the Los Angeles Rams.', + 'timestamp': 1645044990, + 'title': 'Rams Fans Gather Outside The LA Memorial Coliseum', 'categories': [ - 'Stations\\Spoken Word\\KCBSTV', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\AOL', - 'Syndication\\Yahoo', - 'Syndication\\Tribune', - 'Syndication\\Curb.tv', - 'Content\\News' + 'Stations\\Spoken Word\\KCALTV', + 'Content\\News', + 'Content\\Top Story', ], - 'tags': ['CBS 2 News Evening'], + 'tags': ['KCAL 9 News Afternoon'], }, }, { # SendtoNews embed @@ -92,18 +95,24 @@ class CBSLocalArticleIE(AnvatoIE): # m3u8 download 'skip_download': True, }, + 'skip': 'Redirects to CBS News home page', }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + json_ld = self._search_json_ld(webpage, display_id, fatal=False) or {} + json_ld.pop('url', None) + sendtonews_url = SendtoNewsIE._extract_url(webpage) if sendtonews_url: - return self.url_result( + result = self.url_result( compat_urlparse.urljoin(url, sendtonews_url), ie=SendtoNewsIE.ie_key()) + return merge_dicts(result, json_ld) + # returns a dict, or raises info_dict = self._extract_anvato_videos(webpage, display_id) timestamp = unified_timestamp(self._html_search_regex( @@ -111,9 +120,10 @@ class CBSLocalArticleIE(AnvatoIE): 'released date', default=None)) or parse_iso8601( self._html_search_meta('uploadDate', webpage)) - info_dict.update({ - 'display_id': display_id, - 'timestamp': timestamp, - }) - - return info_dict + return merge_dicts( + info_dict, + json_ld, + { + 'display_id': display_id, + 'timestamp': timestamp, + })