From 72db2172897a76665414fc9da3fc79f096df9fab Mon Sep 17 00:00:00 2001 From: Olivier Trichet Date: Sun, 2 Oct 2022 15:24:03 -0400 Subject: [PATCH] [RadioFrance] Extractor for thematic webradios --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/radiofrance.py | 214 +++++++++++++++++++--------- 2 files changed, 150 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d838b3981..874ef6de7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -995,7 +995,8 @@ from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import ( RadioFrancePodcastEpisodeIE, - RadioFrancePodcastPlaylistIE + RadioFrancePodcastPlaylistIE, + RadioFranceWebradioIE, ) from .rai import ( RaiPlayIE, diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 7a8eeb327..c495f464d 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -17,67 +17,69 @@ from ..utils import ( class RadioFranceBaseIE(InfoExtractor): _BASE_URL = r'https://www.radiofrance.fr/' - def extract_api_data(self, id, html): - pattern = r'' + def extract_api_data(self, api_path, id, html): + pattern = r'' % api_path json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json') + if not json: raise ExtractorError('%s: JSON data not found' % id) try: json = self._parse_json(json, id) json = self._parse_json(json['body'], id) - return json['content'] + + if api_path == 'path': + return json['content'] + elif api_path == 'stations': + return json + else: + raise ExtractorError('Coding error') except KeyError: raise ExtractorError('%s: Invalid JSON' % id) - def parse_api_data_info(self, api_data): + def get_title(self, api_data, webpage=None): title = strip_or_none(api_data.get('title')) + if not title and webpage: + title = strip_or_none(get_element_by_attribute('h1', None, webpage, 
False)) or strip_or_none(self._og_search_title(webpage)) + return title + + def get_description(self, api_data, webpage=None): description = strip_or_none(api_data.get('standFirst')) - channel_id = strip_or_none(api_data.get('brand')) - visual = api_data.get('visual') - publication_time = api_data.get('publishedDate') + if not description and webpage: + description = strip_or_none(self._og_search_description(webpage)) + return description + + def get_thumbnail(self, api_data, webpage=None): thumbnail = None + visual = api_data.get('visual') if visual: thumbnail = url_or_none(visual.get('src')) + if not thumbnail and webpage: + thumbnail = self._og_search_thumbnail(webpage) + return thumbnail - return { - 'title': title, - 'description': description, - 'channel_id': channel_id, - 'thumbnail': thumbnail, - 'timestamp': publication_time, - } + def get_timestamp(self, api_data, webpage=None): + timestamp = api_data.get('publishedDate') + if not timestamp and webpage: + timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) + return timestamp - def parse_html_info(self, webpage): - title = strip_or_none(self._og_search_title(webpage)) or strip_or_none(get_element_by_attribute('h1', None, webpage, False)) - description = strip_or_none(self._og_search_description(webpage)) - thumbnail = self._og_search_thumbnail(webpage) - channel_id = self._og_search_property('site_name', webpage, 'Station name', fatal=False) - publication_time = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', )) - - return { - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'channel_id': channel_id, - 'timestamp': publication_time - } + def get_brand(self, api_data, webpage=None): + brand = strip_or_none(api_data.get('brand')) + if not brand and webpage: + brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False) + return brand def extract_episode(self, 
episode_id, api_data): manifestations = api_data.get('manifestations') if manifestations is None or len(manifestations) == 0: - return None + return None, None url = url_or_none(manifestations[0]['url']) duration = int_or_none(manifestations[0].get('duration')) - episode_info = { - 'id': episode_id, - 'url': url, - 'duration': duration - } - return self.parse_api_data_info(api_data) | episode_info + return url, duration - def extract_playlist_entries(self, url, playlist_id, api_data, direction): + def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction): playlist_data = api_data['expressions'] entries = [] @@ -87,47 +89,44 @@ class RadioFranceBaseIE(InfoExtractor): if episode_path is None: self.report_warning('No path found for episode "%s"', item.get('title')) continue - episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + item.get('path')) + episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path) if episode_id is None: - self.report_warning('Could not parse id of episode from path: "%s"' % item.get('path')) + self.report_warning('Could not parse id of episode from path: "%s"' % episode_path) continue - entry = self.extract_episode(episode_id, item) - if entry is None: - msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' 
+ episode_url, duration = self.extract_episode(episode_id, item) + if episode_url is None: self.to_screen('Episode "%s" is not available' % episode_path) continue + entry = { + 'id': episode_id, + 'url': episode_url, + 'title': self.get_title(item), + 'description': self.get_description(item), + 'timestamp': self.get_timestamp(item), + 'thumbnail': self.get_thumbnail(item), + 'duration': duration, + } entries.append(entry) page_number = int_or_none(playlist_data.get('pageNumber')) if page_number: if direction in ['both', 'prev'] and playlist_data.get('prev') is not None: - webpage, other_api_data = self.get_data(url, playlist_id, page=page_number - 1) - entries = self.extract_playlist_entries(url, playlist_id, other_api_data, direction='prev') + entries + webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1) + entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries if direction in ['both', 'next'] and playlist_data.get('next') is not None: - webpage, other_api_data = self.get_data(url, playlist_id, page=page_number + 1) - entries = entries + self.extract_playlist_entries(url, playlist_id, other_api_data, direction='next') + webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1) + entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next') return entries - def extract_playlist(self, playlist_id, url, api_data): - entries = self.extract_playlist_entries(url, playlist_id, api_data, direction='both') - entries = list(filter(lambda e: e is not None, entries)) - entries.reverse() - playlist_info = { - '_type': 'playlist', - 'id': playlist_id, - 'entries': entries - } - return self.parse_api_data_info(api_data) | playlist_info - - def get_data(self, url, id, page=None): + def get_data(self, url, api_path, id, page=None): query = {} note = None if page: query['p'] = page note = "Downloading 
page %i" % page webpage = self._download_webpage(url, id, query=query, note=note) - api_data = self.extract_api_data(id, webpage) + api_data = self.extract_api_data(api_path, id, webpage) return webpage, api_data @@ -172,14 +171,22 @@ class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE): def _real_extract(self, url): id = self._match_id(url) - webpage, api_data = self.get_data(url, id) - api_data_info = self.extract_episode(id, api_data) - if api_data_info is None: + webpage, api_data = self.get_data(url, 'path', id) + url, duration = self.extract_episode(id, api_data) + if url is None: msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.' raise ExtractorError(msg, expected=True, video_id=id) - html_info = self.parse_html_info(webpage) - return html_info | api_data_info + return { + 'id': id, + 'url': url, + 'title': self.get_title(api_data, webpage), + 'description': self.get_description(api_data, webpage), + 'timestamp': self.get_timestamp(api_data, webpage), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + 'duration': duration, + } class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE): @@ -199,7 +206,82 @@ class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE): def _real_extract(self, url): id = self._match_id(url) - webpage, api_data = self.get_data(url, id) + webpage, api_data = self.get_data(url, 'path', id) - html_info = self.parse_html_info(webpage) - return html_info | self.extract_playlist(id, url, api_data) + entries = self.get_playlist_entries(url, id, api_data, direction='both') + entries.reverse() + + return { + 'id': id, + '_type': 'playlist', + 'entries': entries, + 'title': self.get_title(api_data, webpage), + 'description': self.get_description(api_data, webpage), + 'timestamp': self.get_timestamp(api_data, webpage), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + 
} + + +class RadioFranceWebradioIE(RadioFranceBaseIE): + _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?Pradio-[^/]+)$' + + _TESTS = [{ + 'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique', + 'url': 'https://www.radiofrance.fr/fip/radio-metal', + 'info_dict': { + 'id': 'radio-metal', + 'ext': 'aac', + 'title': str, + }, + 'params': { + 'format': 'aac', + 'skip_download': True, + } + }] + + def get_livestream_formats(self, id, api_data): + sources = api_data['media']['sources'] + + formats = [] + for source in sources: + url = source.get('url') + if not url: + continue + + format_id = source.get('format') + format = { + 'url': url, + 'format_id': format_id, + 'asr': 48000, + 'vcodec': 'none' + } + if format_id == 'mp3': + format['preference'] = 1 + format['acodec'] = 'mp3' + format['abr'] = source.get('bitrate') + elif format_id == 'aac': + format['preference'] = 2 + format['acodec'] = 'aac' + format['abr'] = source.get('bitrate') + elif format_id == 'hls': + format['preference'] = 0 + format['manifest_url'] = url + formats.append(format) + + if len(formats) == 0: + raise ExtractorError('No live streaming URL found') + return formats + + def _real_extract(self, url): + id = self._match_id(url) + webpage, api_data = self.get_data(url, 'stations', id) + + return { + 'id': id, + 'title': self.get_title(api_data, webpage), + 'formats': self.get_livestream_formats(id, api_data), + 'thumbnail': self.get_thumbnail(api_data, webpage), + 'channel_id': self.get_brand(api_data, webpage), + 'is_live': True + }