diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5dcd4ced3..bee3ba22c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -426,10 +426,7 @@ from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE -from .gbnews import ( - GBNewsIE, - GBNewsLiveIE, -) +from .gbnews import GBNewsIE from .gdcvault import GDCVaultIE from .gedidigital import GediDigitalIE from .generic import GenericIE diff --git a/youtube_dl/extractor/gbnews.py b/youtube_dl/extractor/gbnews.py index f8ecef02d..f04f30e5a 100644 --- a/youtube_dl/extractor/gbnews.py +++ b/youtube_dl/extractor/gbnews.py @@ -2,23 +2,24 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( extract_attributes, ExtractorError, - try_get, + T, + traverse_obj, + txt_or_none, + url_or_none, ) class GBNewsIE(InfoExtractor): - '''GB News clips and features''' + IE_DESC = 'GB News clips, features and live stream' # \w+ is normally shows or news, but apparently any word redirects to the correct URL - _VALID_URL = r'https?://(?:www\.)?gbnews\.uk/(?:\w+(?:/(?P[^/]+))?|a)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?gbnews\.(?:uk|com)/(?:\w+/)?(?P[^#?]+)' + _PLATFORM = 'safari' - _SSMP_URL = 'https://mm-dev.simplestream.com/ssmp/api.php' + _SSMP_URL = 'https://mm-v2.simplestream.com/ssmp/api.php' _TESTS = [{ 'url': 'https://www.gbnews.uk/shows/andrew-neils-message-to-companies-choosing-to-boycott-gb-news/106889', 'info_dict': { @@ -27,11 +28,32 @@ class GBNewsIE(InfoExtractor): 'title': "Andrew Neil's message to companies choosing to boycott GB News", 'description': 'md5:b281f5d22fd6d5eda64a4e3ba771b351', }, - }, - ] + 'skip': '404 not found', + }, { + 'url': 'https://www.gbnews.com/news/bbc-claudine-gay-harvard-university-antisemitism-row', + 'info_dict': { + 'id': '52264136', + 'display_id': 'bbc-claudine-gay-harvard-university-antisemitism-row', + 'ext': 'mp4', + 'title': 'BBC deletes post after furious backlash over headline downplaying antisemitism', + 'description': 'The post was criticised by former employers of the broadcaster', + }, + }, { + 'url': 'https://www.gbnews.uk/watchlive', + 'info_dict': { + 'id': '1069', + 'display_id': 'watchlive', + 'ext': 'mp4', + 'title': 'GB News Live', + 'is_live': True, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id = self._match_id(url).split('/')[-1] webpage = self._download_webpage(url, display_id) # extraction based on https://github.com/ytdl-org/youtube-dl/issues/29341 @@ -50,35 +72,32 @@ class GBNewsIE(InfoExtractor): ''' # exception if no match video_data = self._search_regex( - r'<\s*div\s[^>]*class\s*=\s*([\'"])simplestream\1[^>]*>', - webpage, "video data", group=0) + r'(]*\bclass\s*=\s*(\'|")(?!.*sidebar\b)simplestream(?:\s[\s\w$-]*)?\2[^>]*>)', + webpage, 'video data') - # print(video_data) video_data = extract_attributes(video_data) - ss_id = try_get(video_data, lambda x: x['data-id']) + ss_id = video_data.get('data-id') if not ss_id: raise ExtractorError('Simplestream ID not found') - # exception if no JSON json_data = self._download_json( self._SSMP_URL, display_id, note='Downloading Simplestream JSON metadata', errnote='Unable to download Simplestream JSON metadata', query={ 'id': ss_id, - 'env': video_data.get('data-env'), - }) + 'env': video_data.get('data-env', 'production'), + }, fatal=False) - meta_url = try_get(json_data, lambda x: x['response']['api_hostname'], compat_str) + meta_url = traverse_obj(json_data, ('response', 'api_hostname')) if not meta_url: raise ExtractorError('No API host found') - uvid = video_data.get('data-uvid') + uvid = video_data['data-uvid'] dtype = video_data.get('data-type') - # exception if no JSON stream_data = self._download_json( '%s/api/%s/stream/%s' % (meta_url, 'show' if dtype == 'vod' else dtype, uvid), - display_id, + uvid, query={ 'key': video_data.get('data-key'), 'platform': self._PLATFORM, @@ -87,66 +106,34 @@ class GBNewsIE(InfoExtractor): 'Token': video_data.get('data-token'), 'Token-Expiry': video_data.get('data-expiry'), 'Uvid': uvid, - }) + }, fatal=False) - stream_url = try_get(stream_data, lambda x: x['response']['stream'], compat_str) + stream_url = traverse_obj(stream_data, ( + 'response', 'stream', T(url_or_none))) if not stream_url: - raise ExtractorError('No stream data') + raise ExtractorError('No stream data/URL') # now known to be a dict stream_data = stream_data['response'] drm = stream_data.get('drm') if drm: - raise ExtractorError( - 'Stream is requesting DRM (%s) playback: unsupported' % drm, - expected=True) - - formats = [] - formats.extend( - self._extract_m3u8_formats(stream_url, display_id, ext='mp4', fatal=False)) + self.report_drm(uvid) + formats = self._extract_m3u8_formats( + stream_url, uvid, ext='mp4', entry_protocol='m3u8_native', + fatal=False) # exception if no formats self._sort_formats(formats) - # no 'title' attribute seen, but if it comes ... - title = stream_data.get('title') or self._og_search_title(webpage) - return { - 'id': display_id, - 'title': title, + 'id': uvid, + 'display_id': display_id, + 'title': (traverse_obj(stream_data, ('title', T(txt_or_none))) + or self._og_search_title(webpage, default=None) + or display_id.replace('-', ' ').capitalize()), 'description': self._og_search_description(webpage, default=None), - 'thumbnail': video_data.get('data-poster') or None, + 'thumbnail': (traverse_obj(video_data, ('data-poster', T(url_or_none))) + or self._og_search_thumbnail(webpage)), 'formats': formats, - 'is_live': 'Live' in self.IE_NAME, + 'is_live': (dtype == 'live') or None, } - - -class GBNewsLiveIE(GBNewsIE): - '''GB News live programme stream''' - - _VALID_URL = r'https?://(?:www.)?gbnews.uk/(?Pwatchlive)(?:$|[/?#])' - _TESTS = [{ - 'url': 'https://www.gbnews.uk/watchlive', - 'info_dict': { - 'id': 'watchlive', - 'ext': 'mp4', - 'title': "Watchlive", - 'is_live': True, - }, - }, - ] - - ''' -
-
- '''