From 9e96dc8b3561c1e6e62ce6a34efba485e5e49054 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:36:59 -0500 Subject: [PATCH 01/13] Support BBC News (bbc.com/news) --- docs/supportedsites.md | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbcnews.py | 162 +++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b98..d4ccbbd3a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..51d2d20e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bbcnews import BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py new file mode 100644 index 000000000..b10e30a81 --- /dev/null +++ b/youtube_dl/extractor/bbcnews.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) +from ..compat import compat_HTTPError +import re +from .bbccouk import BBCCoUkIE + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _duration_str2int(self, str): + if not str: + return None + ret = re.match(r'^\d+$', str) + if ret: + return int(ret.group(0)) + ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) + if ret: + total=int(ret.group('s')) + if ret.group('m'): + total+=(int(ret.group('m'))*60) + if ret.group('h'): + total+=(int(ret.group('h'))*3600) + return total + return None + + def _download_media_selector(self, programme_id): + # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not + # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ + # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it + + try: + media_selection = self._download_xml( + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + else: + raise + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = self._duration_str2int(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) From a8b081a0523c412fd4e01d5cddec7ae382c4793e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:52:25 -0500 Subject: [PATCH 02/13] BBCNewsIE: eliminate redundant function. BBCCoUkIE._download_media_selector: use class variable instead of hardcoded string for mediaselector_url template. --- youtube_dl/extractor/bbccouk.py | 4 +++- youtube_dl/extractor/bbcnews.py | 42 ++------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b5..dcc5fc2fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -15,6 +15,8 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -277,7 +279,7 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): try: media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, + self.mediaselector_url % programme_id, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index b10e30a81..9bb8d42e6 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -14,6 +14,8 @@ class BBCNewsIE(BBCCoUkIE): IE_DESC = 'BBC news' _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _TESTS = [{ 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { @@ -59,46 +61,6 @@ class BBCNewsIE(BBCCoUkIE): return total return None - def _download_media_selector(self, programme_id): - # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not - # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ - # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it - - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) From d5552a3477a0970f4aaaa746ce07c816267bb9cf Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 06:25:50 -0500 Subject: [PATCH 03/13] bbcnews: Switch to parse_duration, revert change to docs/supportedsites.md --- docs/supportedsites.md | 1 - youtube_dl/extractor/bbcnews.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4ccbbd3a..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,7 +50,6 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer - - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index 9bb8d42e6..fd4a5e38f 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError @@ -45,22 +46,6 @@ class BBCNewsIE(BBCCoUkIE): } }] - def _duration_str2int(self, str): - if not str: - return None - ret = re.match(r'^\d+$', str) - if ret: - return int(ret.group(0)) - ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) - if ret: - total=int(ret.group('s')) - if ret.group('m'): - total+=(int(ret.group('m'))*60) - if ret.group('h'): - total+=(int(ret.group('h'))*3600) - return total - return None - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) @@ -88,7 +73,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href', None) title = jent['caption'] - duration = self._duration_str2int(jent.get('duration',None)) + duration = parse_duration(jent.get('duration',None)) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From 10273d6e0846cd8f3762e3777712d5cd2a0cafcd Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:22:13 -0500 Subject: [PATCH 04/13] toss new stuff into old file --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bbccouk.py | 101 ++++++++++++++++++++++++++++ youtube_dl/extractor/bbcnews.py | 109 ------------------------------- 3 files changed, 102 insertions(+), 111 deletions(-) delete mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 51d2d20e9..f9f7bdfaf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,8 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE -from .bbcnews import BBCNewsIE +from .bbccouk import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index dcc5fc2fa..ea682fb6f 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -5,9 +5,11 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError +import re class BBCCoUkIE(InfoExtractor): @@ -394,3 +396,102 @@ class BBCCoUkIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = parse_duration(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py deleted file mode 100644 index fd4a5e38f..000000000 --- a/youtube_dl/extractor/bbcnews.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re -from .bbccouk import BBCCoUkIE - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) - - for ent in matches: - jent = self._parse_json(ent,list_id) - - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) - - title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') - - self._sort_formats(formats) - - ret.append( { - 'id': programme_id, - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) From 75ab0ebcf593ec91a46d83e69854ffa313d33309 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:24:02 -0500 Subject: [PATCH 05/13] no .get('..',None) --- youtube_dl/extractor/bbccouk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index ea682fb6f..de4d7f9c0 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -457,15 +457,15 @@ class BBCNewsIE(BBCCoUkIE): for ent in matches: jent = self._parse_json(ent,list_id) - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) + programme_id = jent.get('externalId') + xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) + duration = parse_duration(jent.get('duration') description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) + thumbnail=jent['image'].get('href') if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 77c975f536befbe89bf718e86282958d391d9ffe Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:28:14 -0500 Subject: [PATCH 06/13] typofix --- youtube_dl/extractor/bbccouk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index de4d7f9c0..f9404f3fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -461,7 +461,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration') + duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From de939d89eb83c851c6db66933e5fc0c401a1a679 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:04:46 -0500 Subject: [PATCH 07/13] Support BBC news in other languages, non-mediaselector videos --- youtube_dl/extractor/bbccouk.py | 87 +++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f9404f3fa..72e20857b 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -401,7 +401,7 @@ class BBCCoUkIE(InfoExtractor): class BBCNewsIE(BBCCoUkIE): IE_NAME = 'bbc.com' IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' @@ -432,56 +432,115 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: pubdate = pubdate.replace('-','') ret = [] + jsent = [] + # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) + + if len(jsent) == 0: raise ExtractorError('No video found', expected=True) - for ent in matches: - jent = self._parse_json(ent,list_id) - + for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('href') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) - title = jent['caption'] duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') + formats = [] + subtitles = [] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) elif xml_url: # Cheap fallback # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') self._sort_formats(formats) ret.append( { - 'id': programme_id, + 'id': jent.get('programme_id',jent.get('id')), 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 7bb23aeca4e9076528e3d31d501a9a288dcd444c Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:08:13 -0500 Subject: [PATCH 08/13] rename bbccouk.py -> bbc.py --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{bbccouk.py => bbc.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename youtube_dl/extractor/{bbccouk.py => bbc.py} (100%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9f7bdfaf..a48346e60 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,7 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE, BBCNewsIE +from .bbc import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbc.py similarity index 100% rename from youtube_dl/extractor/bbccouk.py rename to youtube_dl/extractor/bbc.py From 2a282a3b5f366ba0569bae477d5060329ba254fb Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:11:41 -0500 Subject: [PATCH 09/13] Unbreak breakage that was broken to test breakage --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 72e20857b..310db9d1d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -502,7 +502,7 @@ class BBCNewsIE(BBCCoUkIE): for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('hxref') + xml_url = jent.get('href') title = jent.get('caption',list_title) From a9dcf4a860214e37971ab05f27f74bbae65ff8ae Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 23 Jun 2015 01:08:07 -0500 Subject: [PATCH 10/13] Prefer externalId over non-mediaserver-specific hashkey for video id. --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 310db9d1d..fed344ea0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -540,7 +540,7 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) ret.append( { - 'id': jent.get('programme_id',jent.get('id')), + 'id': jent.get('id') if programme_id == None else programme_id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From da92eeae42f556926cb676b3c14e270603b7e38e Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 25 Jun 2015 00:31:32 -0500 Subject: [PATCH 11/13] Fix tests, description formatting --- youtube_dl/extractor/bbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fed344ea0..bb671d473 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -428,6 +428,8 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'upload_date': '20150324', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -438,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': 'NA', 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'duration': 47, + 'upload_date': '20150615', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -450,8 +455,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': '39275083', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'duration': 87, + 'upload_date': '20150619', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -507,7 +515,9 @@ class BBCNewsIE(BBCCoUkIE): title = jent.get('caption',list_title) duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') + description = list_title + if jent.get('caption'): + description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') @@ -539,8 +549,12 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) + id = jent.get('id') if programme_id == None else programme_id + if id == None: + id = 'NA' + ret.append( { - 'id': jent.get('id') if programme_id == None else programme_id, + 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 36da48798a28b8261d2f39f73f2522651d58a364 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:27:50 -0500 Subject: [PATCH 12/13] handle titles and captions set to '' --- youtube_dl/extractor/bbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 471d865d2..c910eb55a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -497,11 +497,13 @@ class BBCNewsIE(BBCCoUkIE): programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption',list_title) + title = jent.get('caption','') + if title == '': + title = list_title duration = parse_duration(jent.get('duration')) description = list_title - if jent.get('caption'): + if jent.get('caption', '') != '': description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): From a3bfddfa5ee33cf085b959536f1025c0aa53cc77 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:47:02 -0500 Subject: [PATCH 13/13] bbc.py: correct syntax --- youtube_dl/extractor/bbc.py | 106 ++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c910eb55a..c8f285165 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -397,14 +397,14 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Russia stages massive WW2 parade despite Western boycott', }, 'playlist_count': 2, - },{ + }, { 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', }, 'playlist_count': 9, - },{ + }, { 'url': 'http://www.bbc.com/news/world-europe-32041533', 'note': 'Video', 'info_dict': { @@ -419,7 +419,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'note': 'Video', 'info_dict': { @@ -434,7 +434,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'note': 'Video', 'info_dict': { @@ -459,88 +459,88 @@ class BBCNewsIE(BBCCoUkIE): pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: - pubdate = pubdate.replace('-','') + pubdate = pubdate.replace('-', '') ret = [] jsent = [] # works with bbc.com/news/something-something-123456 articles jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) + lambda m: self._parse_json(m, list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) ) if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset, list_id) + for key, val in jmasset.get('videos', {}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m, list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) + raise ExtractorError('No video found', expected=True) for jent in jsent: programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption','') + title = jent.get('caption', '') if title == '': - title = list_title + title = list_title duration = parse_duration(jent.get('duration')) description = list_title if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') + description += ' - ' + jent.get('caption') thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') + if jent.get('image') is not None: + thumbnail = jent['image'].get('href') formats = [] subtitles = [] if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) + formats, subtitles = self._download_media_selector(programme_id) + elif jent.get('sourceFiles') is not None: + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append({ + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + }) elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - + raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + self._sort_formats(formats) - id = jent.get('id') if programme_id == None else programme_id - if id == None: - id = 'NA' + id = jent.get('id') if programme_id is None else programme_id + if id is None: + id = 'NA' - ret.append( { + ret.append({ 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, @@ -550,8 +550,8 @@ class BBCNewsIE(BBCCoUkIE): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - } ) + }) if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) + return self.playlist_result(ret, list_id, list_title) raise ExtractorError('No video found', expected=True)