From dade9111f11c2eba8216e88e57bace2c948d4163 Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sun, 4 Sep 2022 19:44:05 -0400 Subject: [PATCH 1/5] [RadioFrance] Remove old Radio France stations extractors These are not working anymore after their respectives websites were merged into www.radiofrance.fr. --- youtube_dl/extractor/extractors.py | 3 -- youtube_dl/extractor/franceculture.py | 73 --------------------------- youtube_dl/extractor/franceinter.py | 59 ---------------------- youtube_dl/extractor/radiofrance.py | 59 ---------------------- 4 files changed, 194 deletions(-) delete mode 100644 youtube_dl/extractor/franceculture.py delete mode 100644 youtube_dl/extractor/franceinter.py delete mode 100644 youtube_dl/extractor/radiofrance.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 947cbe8fd..e583d42fb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -403,8 +403,6 @@ from .foxnews import ( FoxNewsArticleIE, ) from .foxsports import FoxSportsIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, FranceTVSiteIE, @@ -995,7 +993,6 @@ from .radiocanada import ( from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE from .rai import ( RaiPlayIE, RaiPlayLiveIE, diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py deleted file mode 100644 index 14f4cb489..000000000 --- a/youtube_dl/extractor/franceculture.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, -) - - -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', - 'info_dict': { - 'id': 'rendez-vous-au-pays-des-geeks', - 'display_id': 'rendez-vous-au-pays-des-geeks', - 'ext': 'mp3', - 'title': 'Rendez-vous au pays des geeks', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140301', - 'timestamp': 1393700400, - 'vcodec': 'none', - } - }, { - # no thumbnail - 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - | - ]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? - (]+data-(?:url|asset-source)="[^"]+"[^>]+>) - ''', - webpage, 'video data')) - - video_url = video_data.get('data-url') or video_data['data-asset-source'] - title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) - - description = self._html_search_regex( - r'(?s)]+class="intro"[^>]*>.*?

(.+?)

', - webpage, 'description', default=None) - thumbnail = self._search_regex( - r'(?s)]+itemtype="https://schema.org/ImageObject"[^>]*>.*?]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', default=None) - uploader = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'uploader', default=None) - ext = determine_ext(video_url.lower()) - - return { - 'id': display_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'ext': ext, - 'vcodec': 'none' if ext == 'mp3' else None, - 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), - 'duration': int_or_none(video_data.get('data-duration')), - } diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py deleted file mode 100644 index ae822a50e..000000000 --- a/youtube_dl/extractor/franceinter.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import month_by_name - - -class FranceInterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P[^?#]+)' - - _TEST = { - 'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016', - 'md5': '9e54d7bdb6fdc02a841007f8a975c094', - 'info_dict': { - 'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016', - 'ext': 'mp3', - 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', - 'description': 'md5:401969c5d318c061f86bda1fa359292b', - 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20160907', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'(?s)]+class=["\']page-diffusion["\'][^>]*>.*?]+data-url=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'video url', group='url') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) - - upload_date_str = self._search_regex( - r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', - webpage, 'upload date', fatal=False) - if upload_date_str: - upload_date_list = upload_date_str.split() - upload_date_list.reverse() - upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0) - upload_date_list[2] = '%02d' % int(upload_date_list[2]) - upload_date = ''.join(upload_date_list) - else: - upload_date = None - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': [{ - 'url': video_url, - 'vcodec': 'none', - }], - } diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py deleted file mode 100644 index a8afc0014..000000000 --- a/youtube_dl/extractor/radiofrance.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P[^?#]+)' - IE_NAME = 'radiofrance' - - _TEST = { - 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', - 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', - 'info_dict': { - 'id': 'one-one', - 'ext': 'ogg', - 'title': 'One to one', - 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", - 'uploader': 'Thomas Hercouët', - }, - } - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') - description = self._html_search_regex( - r'
(.*?)
', - webpage, 'description', fatal=False) - uploader = self._html_search_regex( - r'
  © (.*?)
', - webpage, 'uploader', fatal=False) - - formats_str = self._html_search_regex( - r'class="jp-jplayer[^"]*" data-source="([^"]+)">', - webpage, 'audio URLs') - formats = [ - { - 'format_id': fm[0], - 'url': fm[1], - 'vcodec': 'none', - 'preference': i, - } - for i, fm in - enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - } From 7270ecf3d6c393d3a09d377bc2d95466048b9ff9 Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sun, 4 Sep 2022 19:51:40 -0400 Subject: [PATCH 2/5] [RadioFrance] Extractor for podcast of Radio France stations --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/radiofrance.py | 99 +++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 youtube_dl/extractor/radiofrance.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e583d42fb..fd92a69df 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -993,6 +993,7 @@ from .radiocanada import ( from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE +from .radiofrance import RadioFrancePodcastIE from .rai import ( RaiPlayIE, RaiPlayLiveIE, diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py new file mode 100644 index 000000000..454601762 --- /dev/null +++ b/youtube_dl/extractor/radiofrance.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_attribute, + int_or_none, + parse_iso8601, + strip_or_none, + url_or_none +) + + +class RadioFrancePodcastIE(InfoExtractor): + _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.*-(?P\d+)$' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713', + 'info_dict': { + 'id': '8310713', + 'ext': 'mp3', + 'url': r're:^https?://.*\.mp3$', + 'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar', + 'description': str, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': int, + 'duration': int, + 'upload_date': str, + } + }, { + 'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742', + 'only_matching': True, + }] + + def extract_api_data(self, id, html): + pattern = r'' + json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json') + if json: + json = self._parse_json(json, id) + if json and 'body' in json: + json = self._parse_json(json.get('body'), id) + if not json: + raise ExtractorError('%s: JSON data not found' % id) + return json + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + + api_data = self.extract_api_data(media_id, webpage) + api_data = api_data['content'] + + url = url_or_none(api_data['manifestations'][0]['url']) + duration = int_or_none(api_data['manifestations'][0].get('duration')) + + title = strip_or_none(api_data.get('title')) + title = title or strip_or_none(self._og_search_title(webpage)) + title = title or strip_or_none(get_element_by_attribute('h1', None, webpage, False)) + + description = strip_or_none(api_data.get('standFirst')) + description = description or strip_or_none(self._og_search_description(webpage)) + + visual = api_data.get('visual') + thumbnail = None + if visual: + thumbnail = url_or_none(visual.get('src')) + if not thumbnail: + thumbnail = self._og_search_thumbnail(webpage) + + channel_id = self._og_search_property('site_name', webpage, 'Station name', fatal=False) + + publication_time = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) + + return { + 'id': media_id, + 'title': title, + 'url': url, + 'description': description, + 'thumbnail': thumbnail, + 'channel_id': channel_id, + 'timestamp': publication_time, + 'duration': duration, + 'is_live': False + } From ea02c4053943c5913c0f433522a965ed8eeadeab Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sun, 25 Sep 2022 11:12:58 -0400 Subject: [PATCH 3/5] [RadioFrance] Extractor for podcast playlists --- youtube_dl/extractor/radiofrance.py | 169 ++++++++++++++++++++++------ 1 file changed, 135 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 454601762..72e8cda05 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -15,9 +15,17 @@ from ..utils import ( class RadioFrancePodcastIE(InfoExtractor): - _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.*-(?P\d+)$' + _BASE_URL = r'https://www.radiofrance.fr/' + _VALID_URL = r'''(?x)https?://www\.radiofrance\.fr/ + (?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/ + podcasts/( + .+/.+-(?P\d+) + | + (?P[^/]+?)(?:[?#].*)? + )$''' _TESTS = [{ + 'note': 'Podcast episode with audio from France Info', 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713', 'info_dict': { 'id': '8310713', @@ -28,72 +36,165 @@ class RadioFrancePodcastIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': int, 'duration': int, - 'upload_date': str, + 'upload_date': str } }, { + 'note': 'Podcast episode from France Musique', 'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228', - 'only_matching': True, + 'only_matching': True }, { + 'note': 'Podcast episode from FranceInter', 'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281', - 'only_matching': True, + 'only_matching': True }, { + 'note': 'Podcast episode from France Culture', 'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610', - 'only_matching': True, + 'only_matching': True }, { + 'note': 'Podcast episode from Le Mouv', 'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950', - 'only_matching': True, + 'only_matching': True }, { + 'note': 'Podcast episode from FIP', 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742', - 'only_matching': True, + 'only_matching': True + }, { + 'note': 'Podcast show with multiple pages of episodes and some of them are missing', + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2', + 'info_dict': { + 'id': 'une-semaine-dans-le-monde-10-11', + 'title': 'Une semaine dans le monde | 10-11', + 'description': str, + 'timestamp': int + }, + 'playlist_count': 23, }] def extract_api_data(self, id, html): pattern = r'' json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json') - if json: - json = self._parse_json(json, id) - if json and 'body' in json: - json = self._parse_json(json.get('body'), id) if not json: raise ExtractorError('%s: JSON data not found' % id) - return json - def _real_extract(self, url): - media_id = self._match_id(url) - webpage = self._download_webpage(url, media_id) - - api_data = self.extract_api_data(media_id, webpage) - api_data = api_data['content'] - - url = url_or_none(api_data['manifestations'][0]['url']) - duration = int_or_none(api_data['manifestations'][0].get('duration')) + try: + json = self._parse_json(json, id) + json = self._parse_json(json['body'], id) + return json['content'] + except KeyError: + raise ExtractorError('%s: Invalid JSON' % id) + def parse_api_data_info(self, api_data): title = strip_or_none(api_data.get('title')) - title = title or strip_or_none(self._og_search_title(webpage)) - title = title or strip_or_none(get_element_by_attribute('h1', None, webpage, False)) - description = strip_or_none(api_data.get('standFirst')) - description = description or strip_or_none(self._og_search_description(webpage)) - + channel_id = strip_or_none(api_data.get('brand')) visual = api_data.get('visual') + publication_time = api_data.get('publishedDate') thumbnail = None if visual: thumbnail = url_or_none(visual.get('src')) - if not thumbnail: - thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'description': description, + 'channel_id': channel_id, + 'thumbnail': thumbnail, + 'timestamp': publication_time, + } + + def parse_html_info(self, webpage): + title = strip_or_none(self._og_search_title(webpage)) or strip_or_none(get_element_by_attribute('h1', None, webpage, False)) + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = self._og_search_thumbnail(webpage) channel_id = self._og_search_property('site_name', webpage, 'Station name', fatal=False) - publication_time = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) return { - 'id': media_id, 'title': title, - 'url': url, 'description': description, 'thumbnail': thumbnail, 'channel_id': channel_id, - 'timestamp': publication_time, - 'duration': duration, - 'is_live': False + 'timestamp': publication_time } + + def extract_episode(self, episode_id, api_data): + manifestations = api_data.get('manifestations') + if manifestations is None or len(manifestations) == 0: + return None + + url = url_or_none(manifestations[0]['url']) + duration = int_or_none(manifestations[0].get('duration')) + episode_info = { + 'id': episode_id, + 'url': url, + 'duration': duration + } + return self.parse_api_data_info(api_data) | episode_info + + def extract_playlist_entries(self, url, playlist_id, api_data, direction): + playlist_data = api_data['expressions'] + + entries = [] + items = playlist_data.get('items') + for item in items: + episode_path = item.get('path') + if episode_path is None: + self.report_warning('No path found for episode "%s"', item.get('title')) + continue + episode_id = self._match_id(self._BASE_URL + item.get('path')) + if episode_id is None: + self.report_warning('Could not parse id of episode from path: "%s"' % item.get('path')) + continue + entry = self.extract_episode(episode_id, item) + if entry is None: + msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' + self.to_screen('Episode "%s" is not available' % episode_path) + continue + entries.append(entry) + + page_number = int_or_none(playlist_data.get('pageNumber')) + if page_number: + if direction in ['both', 'prev'] and playlist_data.get('prev') is not None: + webpage, other_api_data = self.get_data(url, playlist_id, page=page_number - 1) + entries = self.extract_playlist_entries(url, playlist_id, other_api_data, direction='prev') + entries + if direction in ['both', 'next'] and playlist_data.get('next') is not None: + webpage, other_api_data = self.get_data(url, playlist_id, page=page_number + 1) + entries = entries + self.extract_playlist_entries(url, playlist_id, other_api_data, direction='next') + + return entries + + def extract_playlist(self, playlist_id, url, api_data): + entries = self.extract_playlist_entries(url, playlist_id, api_data, direction='both') + entries = list(filter(lambda e: e is not None, entries)) + entries.reverse() + playlist_info = { + '_type': 'playlist', + 'id': playlist_id, + 'entries': entries + } + return self.parse_api_data_info(api_data) | playlist_info + + def get_data(self, url, id, page=None): + query = {} + note = None + if page: + query['p'] = page + note = "Downloading page %i" % page + webpage = self._download_webpage(url, id, query=query, note=note) + api_data = self.extract_api_data(id, webpage) + return webpage, api_data + + def _real_extract(self, url): + episode_id, playlist_id = re.match(self._VALID_URL, url).group('id', 'playlist_id') + media_id = episode_id or playlist_id + + webpage, api_data = self.get_data(url, media_id) + + html_info = self.parse_html_info(webpage) + if episode_id: + api_data_info = self.extract_episode(episode_id, api_data) + if api_data_info is None: + msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' + raise ExtractorError(msg, expected=True, video_id=episode_id) + return html_info | api_data_info + + return html_info | self.extract_playlist(playlist_id, url, api_data) From fc933e686b5e540b1b41cf290cda3abdfe0d7576 Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sat, 1 Oct 2022 15:06:19 -0400 Subject: [PATCH 4/5] [RadioFrance] Refactoring --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/radiofrance.py | 137 ++++++++++++++-------------- 2 files changed, 75 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fd92a69df..d838b3981 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -993,7 +993,10 @@ from .radiocanada import ( from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE -from .radiofrance import RadioFrancePodcastIE +from .radiofrance import ( + RadioFrancePodcastEpisodeIE, + RadioFrancePodcastPlaylistIE +) from .rai import ( RaiPlayIE, RaiPlayLiveIE, diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 72e8cda05..7a8eeb327 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -14,61 +14,8 @@ from ..utils import ( ) -class RadioFrancePodcastIE(InfoExtractor): +class RadioFranceBaseIE(InfoExtractor): _BASE_URL = r'https://www.radiofrance.fr/' - _VALID_URL = r'''(?x)https?://www\.radiofrance\.fr/ - (?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/ - podcasts/( - .+/.+-(?P\d+) - | - (?P[^/]+?)(?:[?#].*)? - )$''' - - _TESTS = [{ - 'note': 'Podcast episode with audio from France Info', - 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713', - 'info_dict': { - 'id': '8310713', - 'ext': 'mp3', - 'url': r're:^https?://.*\.mp3$', - 'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar', - 'description': str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': int, - 'duration': int, - 'upload_date': str - } - }, { - 'note': 'Podcast episode from France Musique', - 'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228', - 'only_matching': True - }, { - 'note': 'Podcast episode from FranceInter', - 'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281', - 'only_matching': True - }, { - 'note': 'Podcast episode from France Culture', - 'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610', - 'only_matching': True - }, { - 'note': 'Podcast episode from Le Mouv', - 'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950', - 'only_matching': True - }, { - 'note': 'Podcast episode from FIP', - 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742', - 'only_matching': True - }, { - 'note': 'Podcast show with multiple pages of episodes and some of them are missing', - 'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2', - 'info_dict': { - 'id': 'une-semaine-dans-le-monde-10-11', - 'title': 'Une semaine dans le monde | 10-11', - 'description': str, - 'timestamp': int - }, - 'playlist_count': 23, - }] def extract_api_data(self, id, html): pattern = r'' @@ -140,7 +87,7 @@ class RadioFrancePodcastIE(InfoExtractor): if episode_path is None: self.report_warning('No path found for episode "%s"', item.get('title')) continue - episode_id = self._match_id(self._BASE_URL + item.get('path')) + episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + item.get('path')) if episode_id is None: self.report_warning('Could not parse id of episode from path: "%s"' % item.get('path')) continue @@ -183,18 +130,76 @@ class RadioFrancePodcastIE(InfoExtractor): api_data = self.extract_api_data(id, webpage) return webpage, api_data - def _real_extract(self, url): - episode_id, playlist_id = re.match(self._VALID_URL, url).group('id', 'playlist_id') - media_id = episode_id or playlist_id - webpage, api_data = self.get_data(url, media_id) +class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE): + _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P\d+)$' + + _TESTS = [{ + 'note': 'Podcast episode with audio from France Info', + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713', + 'info_dict': { + 'id': '8310713', + 'ext': 'mp3', + 'url': r're:^https?://.*\.mp3$', + 'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar', + 'description': str, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': int, + 'duration': int, + 'upload_date': str + } + }, { + 'note': 'Podcast episode from France Musique', + 'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228', + 'only_matching': True + }, { + 'note': 'Podcast episode from FranceInter', + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281', + 'only_matching': True + }, { + 'note': 'Podcast episode from France Culture', + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610', + 'only_matching': True + }, { + 'note': 'Podcast episode from Le Mouv', + 'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950', + 'only_matching': True + }, { + 'note': 'Podcast episode from FIP', + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742', + 'only_matching': True + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage, api_data = self.get_data(url, id) + api_data_info = self.extract_episode(id, api_data) + if api_data_info is None: + msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' + raise ExtractorError(msg, expected=True, video_id=id) html_info = self.parse_html_info(webpage) - if episode_id: - api_data_info = self.extract_episode(episode_id, api_data) - if api_data_info is None: - msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' - raise ExtractorError(msg, expected=True, video_id=episode_id) - return html_info | api_data_info + return html_info | api_data_info - return html_info | self.extract_playlist(playlist_id, url, api_data) + +class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE): + _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P[^/]+?)(?:[?#].*)?$' + + _TESTS = [{ + 'note': 'Podcast show with multiple pages of episodes and some of them are missing', + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2', + 'info_dict': { + 'id': 'une-semaine-dans-le-monde-10-11', + 'title': 'Une semaine dans le monde | 10-11', + 'description': str, + 'timestamp': int + }, + 'playlist_count': 23, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage, api_data = self.get_data(url, id) + + html_info = self.parse_html_info(webpage) + return html_info | self.extract_playlist(id, url, api_data) From 72db2172897a76665414fc9da3fc79f096df9fab Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sun, 2 Oct 2022 15:24:03 -0400 Subject: [PATCH 5/5] [RadioFrance] Extractor fo thematic webradios --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/radiofrance.py | 214 +++++++++++++++++++--------- 2 files changed, 150 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d838b3981..874ef6de7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -995,7 +995,8 @@ from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import ( RadioFrancePodcastEpisodeIE, - RadioFrancePodcastPlaylistIE + RadioFrancePodcastPlaylistIE, + RadioFranceWebradioIE, ) from .rai import ( RaiPlayIE, diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 7a8eeb327..c495f464d 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -17,67 +17,69 @@ from ..utils import ( class RadioFranceBaseIE(InfoExtractor): _BASE_URL = r'https://www.radiofrance.fr/' - def extract_api_data(self, id, html): - pattern = r'' + def extract_api_data(self, api_path, id, html): + pattern = r'' % api_path json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json') + if not json: raise ExtractorError('%s: JSON data not found' % id) try: json = self._parse_json(json, id) json = self._parse_json(json['body'], id) - return json['content'] + + if api_path == 'path': + return json['content'] + elif api_path == 'stations': + return json + else: + raise ExtractorError('Coding error') except KeyError: raise ExtractorError('%s: Invalid JSON' % id) - def parse_api_data_info(self, api_data): + def get_title(self, api_data, webpage=None): title = strip_or_none(api_data.get('title')) + if not title and webpage: + title = strip_or_none(get_element_by_attribute('h1', None, webpage, False)) or strip_or_none(self._og_search_title(webpage)) + return title + + def get_description(self, api_data, webpage=None): description = strip_or_none(api_data.get('standFirst')) - channel_id = strip_or_none(api_data.get('brand')) - visual = api_data.get('visual') - publication_time = api_data.get('publishedDate') + if not description and webpage: + description = strip_or_none(self._og_search_description(webpage)) + return description + + def get_thumbnail(self, api_data, webpage=None): thumbnail = None + visual = api_data.get('visual') if visual: thumbnail = url_or_none(visual.get('src')) + if not thumbnail and webpage: + thumbnail = self._og_search_thumbnail(webpage) + return thumbnail - return { - 'title': title, - 'description': description, - 'channel_id': channel_id, - 'thumbnail': thumbnail, - 'timestamp': publication_time, - } + def get_timestamp(self, api_data, webpage=None): + timestamp = api_data.get('publishedDate') + if not timestamp and webpage: + timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) + return timestamp - def parse_html_info(self, webpage): - title = strip_or_none(self._og_search_title(webpage)) or strip_or_none(get_element_by_attribute('h1', None, webpage, False)) - description = strip_or_none(self._og_search_description(webpage)) - thumbnail = self._og_search_thumbnail(webpage) - channel_id = self._og_search_property('site_name', webpage, 'Station name', fatal=False) - publication_time = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) - - return { - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'channel_id': channel_id, - 'timestamp': publication_time - } + def get_brand(self, api_data, webpage=None): + brand = strip_or_none(api_data.get('brand')) + if not brand and webpage: + brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False) + return brand def extract_episode(self, episode_id, api_data): manifestations = api_data.get('manifestations') if manifestations is None or len(manifestations) == 0: - return None + return None, None url = url_or_none(manifestations[0]['url']) duration = int_or_none(manifestations[0].get('duration')) - episode_info = { - 'id': episode_id, - 'url': url, - 'duration': duration - } - return self.parse_api_data_info(api_data) | episode_info + return url, duration - def extract_playlist_entries(self, url, playlist_id, api_data, direction): + def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction): playlist_data = api_data['expressions'] entries = [] @@ -87,47 +89,44 @@ class RadioFranceBaseIE(InfoExtractor): if episode_path is None: self.report_warning('No path found for episode "%s"', item.get('title')) continue - episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + item.get('path')) + episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path) if episode_id is None: - self.report_warning('Could not parse id of episode from path: "%s"' % item.get('path')) + self.report_warning('Could not parse id of episode from path: "%s"' % episode_path) continue - entry = self.extract_episode(episode_id, item) - if entry is None: - msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' + episode_url, duration = self.extract_episode(episode_id, item) + if episode_url is None: self.to_screen('Episode "%s" is not available' % episode_path) continue + entry = { + 'id': episode_id, + 'url': episode_url, + 'title': self.get_title(item), + 'description': self.get_description(item), + 'timestamp': self.get_timestamp(item), + 'thumbnail': self.get_thumbnail(item), + 'duration': duration, + } entries.append(entry) page_number = int_or_none(playlist_data.get('pageNumber')) if page_number: if direction in ['both', 'prev'] and playlist_data.get('prev') is not None: - webpage, other_api_data = self.get_data(url, playlist_id, page=page_number - 1) - entries = self.extract_playlist_entries(url, playlist_id, other_api_data, direction='prev') + entries + webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1) + entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries if direction in ['both', 'next'] and playlist_data.get('next') is not None: - webpage, other_api_data = self.get_data(url, playlist_id, page=page_number + 1) - entries = entries + self.extract_playlist_entries(url, playlist_id, other_api_data, direction='next') + webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1) + entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next') return entries - def extract_playlist(self, playlist_id, url, api_data): - entries = self.extract_playlist_entries(url, playlist_id, api_data, direction='both') - entries = list(filter(lambda e: e is not None, entries)) - entries.reverse() - playlist_info = { - '_type': 'playlist', - 'id': playlist_id, - 'entries': entries - } - return self.parse_api_data_info(api_data) | playlist_info - - def get_data(self, url, id, page=None): + def get_data(self, url, api_path, id, page=None): query = {} note = None if page: query['p'] = page note = "Downloading page %i" % page webpage = self._download_webpage(url, id, query=query, note=note) - api_data = self.extract_api_data(id, webpage) + api_data = self.extract_api_data(api_path, id, webpage) return webpage, api_data @@ -172,14 +171,22 @@ class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE): def _real_extract(self, url): id = self._match_id(url) - webpage, api_data = self.get_data(url, id) - api_data_info = self.extract_episode(id, api_data) - if api_data_info is None: + webpage, api_data = self.get_data(url, 'path', id) + url, duration = self.extract_episode(id, api_data) + if url is None: msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' raise ExtractorError(msg, expected=True, video_id=id) - html_info = self.parse_html_info(webpage) - return html_info | api_data_info + return { + 'id': id, + 'url': url, + 'title': self.get_title(api_data, webpage), + 'description': self.get_description(api_data, webpage), + 'timestamp': self.get_timestamp(api_data, webpage), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + 'duration': duration, + } class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE): @@ -199,7 +206,82 @@ class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE): def _real_extract(self, url): id = self._match_id(url) - webpage, api_data = self.get_data(url, id) + webpage, api_data = self.get_data(url, 'path', id) - html_info = self.parse_html_info(webpage) - return html_info | self.extract_playlist(id, url, api_data) + entries = self.get_playlist_entries(url, id, api_data, direction='both') + entries.reverse() + + return { + 'id': id, + '_type': 'playlist', + 'entries': entries, + 'title': self.get_title(api_data, webpage), + 'description': self.get_description(api_data, webpage), + 'timestamp': self.get_timestamp(api_data, webpage), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + } + + +class RadioFranceWebradioIE(RadioFranceBaseIE): + _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?Pradio-[^/]+)$' + + _TESTS = [{ + 'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique', + 'url': 'https://www.radiofrance.fr/fip/radio-metal', + 'info_dict': { + 'id': 'radio-metal', + 'ext': 'aac', + 'title': str, + }, + 'params': { + 'format': 'aac', + 'skip_download': True, + } + }] + + def get_livestream_formats(self, id, api_data): + sources = api_data['media']['sources'] + + formats = [] + for source in sources: + url = source.get('url') + if not url: + continue + + format_id = source.get('format') + format = { + 'url': url, + 'format_id': format_id, + 'asr': 48000, + 'vcodec': 'none' + } + if format_id == 'mp3': + format['preference'] = 1 + format['acodec'] = 'mp3' + format['abr'] = source.get('bitrate') + elif format_id == 'aac': + format['preference'] = 2 + format['acodec'] = 'aac' + format['abr'] = source.get('bitrate') + elif format_id == 'hls': + format['preference'] = 0 + format['manifest_url'] = url + formats.append(format) + + if len(formats) == 0: + raise ExtractorError('No live streaming URL found') + return formats + + def _real_extract(self, url): + id = self._match_id(url) + webpage, api_data = self.get_data(url, 'stations', id) + + return { + 'id': id, + 'title': self.get_title(api_data, webpage), + 'formats': self.get_livestream_formats(id, api_data), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + 'is_live': True + }