From 7bb8d9418465f7d66a3977c37908a946ba93351d Mon Sep 17 00:00:00 2001
From: Glenn Pavlovic
Date: Tue, 17 Jan 2023 18:40:37 -0800
Subject: [PATCH] Rumble + UsaWatchdog - improves Rumble support and adds UsaWatchdog support

---
Notes (not part of the commit message):

* extractors.py: registers RumblePageIE, RumblePlaylistIE,
  UsaWatchdogStoryIE and UsaWatchdogIE next to the existing RumbleEmbedIE.
* rumble.py: the embedJS lookup moves from RumbleEmbedIE into
  rumbleBase.rumble_video_info() so that RumbleEmbedIE and RumblePageIE
  share it. RumblePageIE pulls the embed id out of the page's
  Rumble("play", {...}) call; RumblePlaylistIE collects the video links of
  a /c/ or /user/ page; rumble_embedded_id() lets other extractors find
  Rumble embeds in an arbitrary page.
* usawatchdog.py: UsaWatchdogStoryIE hands the first Rumble embed of a
  story page to RumbleEmbedIE; UsaWatchdogIE turns the front page into a
  playlist of stories.
* NOTE(review): the "<a ..." prefixes of the two link-scraping regexes
  (RumblePlaylistIE._real_extract and UsaWatchdogIE._real_extract) are
  assumed from the surrounding code - confirm against the live markup.
* NOTE(review): the From: header above carries no e-mail address -
  restore it before mailing or applying with git am.

 youtube_dl/extractor/extractors.py  |  10 ++-
 youtube_dl/extractor/rumble.py      | 105 +++++++++++++++++++++++-----
 youtube_dl/extractor/usawatchdog.py |  46 ++++++++++++
 3 files changed, 141 insertions(+), 20 deletions(-)
 create mode 100644 youtube_dl/extractor/usawatchdog.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 947cbe8fd..6c3990189 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1049,7 +1049,11 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
 from .rtvnh import RTVNHIE
 from .rtvs import RTVSIE
 from .ruhd import RUHDIE
-from .rumble import RumbleEmbedIE
+from .rumble import (
+    RumbleEmbedIE,
+    RumblePageIE,
+    RumblePlaylistIE,
+)
 from .rutube import (
     RutubeIE,
     RutubeChannelIE,
@@ -1414,6 +1418,10 @@ from .urort import UrortIE
 from .urplay import URPlayIE
 from .usanetwork import USANetworkIE
 from .usatoday import USATodayIE
+from .usawatchdog import (
+    UsaWatchdogStoryIE,
+    UsaWatchdogIE,
+)
 from .ustream import UstreamIE, UstreamChannelIE
 from .ustudio import (
     UstudioIE,
diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py
index 4a0225109..7431a3b7e 100644
--- a/youtube_dl/extractor/rumble.py
+++ b/youtube_dl/extractor/rumble.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
+import re
 
 from .common import InfoExtractor
 from ..compat import compat_str
@@ -11,28 +12,14 @@ from ..utils import (
 )
 
 
-class RumbleEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
-    _TESTS = [{
-        'url': 'https://rumble.com/embed/v5pv5f',
-        'md5': '36a18a049856720189f30977ccbb2c34',
-        'info_dict': {
-            'id': 'v5pv5f',
-            'ext': 'mp4',
-            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
-            'timestamp': 1571611968,
-            'upload_date': '20191020',
-        }
-    }, {
-        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
+class rumbleBase(InfoExtractor):
+    def rumble_video_info(self, video_id):
         video = self._download_json(
             'https://rumble.com/embedJS/', video_id,
             query={'request': 'video', 'v': video_id})
+        if not video:
+            return None
+
         title = video['title']
 
         formats = []
@@ -65,3 +52,83 @@ class RumbleEmbedIE(InfoExtractor):
             'channel_url': author.get('url'),
             'duration': int_or_none(video.get('duration')),
         }
+
+
+class RumbleEmbedIE(rumbleBase):
+    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
+    _TESTS = [{
+        'url': 'https://rumble.com/embed/v5pv5f',
+        'md5': '36a18a049856720189f30977ccbb2c34',
+        'info_dict': {
+            'id': 'v5pv5f',
+            'ext': 'mp4',
+            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
+            'timestamp': 1571611968,
+            'upload_date': '20191020',
+        }
+    }, {
+        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.rumble_video_info(video_id)
+
+
+class RumblePageIE(rumbleBase):
+    _VALID_URL = r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html'
+    _TEST = {
+        'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html',
+        'md5': '36a18a049856720189f30977ccbb2c34',
+        'info_dict': {
+            'id': 'v5pv5f',
+            'ext': 'mp4',
+            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
+            'timestamp': 1571611968,
+            'upload_date': '20191020',
+        }}
+
+    _RUMBLE_JS_RE = r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)'
+
+    def _real_extract(self, url):
+        page = self._download_webpage(url, 'Rumble Page')
+        video_id = self._search_regex(self._RUMBLE_JS_RE, page, 'id')
+        return self.rumble_video_info(video_id)
+
+
+class RumblePlaylistIE(rumbleBase):
+    _VALID_URL = r'https?://rumble\.com/(?:c|user)/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'https://rumble.com/c/PeakProsperity',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 'PeakProsperity',
+        }}
+
+    def _real_extract(self, url):
+        urls = []
+        playlist_id = self._match_id(url)
+        page = self._download_webpage(url, playlist_id)
+        for mobj in re.finditer(r'<a\s+class=video-item--a\s+href=/(?P<href>[a-zA-Z0-9\-.]+)>', page):
+            urls.append('https://rumble.com/' + mobj.group('href'))
+
+        return self.playlist_from_matches(urls, playlist_id)
+
+
+def rumble_embedded_id(page_data):
+    '''For use by extractors of sites which use embedded Rumble videos. Given
+    a webpage as a string returns a list of url result dicts for each embedded
+    rumble video found. None is returned if no embeds were found. Duplicates
+    are not removed'''
+
+    embeds = []
+    # The JS embeds
+    for mobj in re.finditer(RumblePageIE._RUMBLE_JS_RE, page_data):
+        embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'RumbleEmbed', mobj.group('id')))
+
+    # The iframes embeds
+    for mobj in re.finditer(RumbleEmbedIE._VALID_URL, page_data):
+        embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'RumbleEmbed', mobj.group('id')))
+
+    return embeds if embeds else None
diff --git a/youtube_dl/extractor/usawatchdog.py b/youtube_dl/extractor/usawatchdog.py
new file mode 100644
index 000000000..4bb576a9b
--- /dev/null
+++ b/youtube_dl/extractor/usawatchdog.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+
+from .rumble import rumble_embedded_id
+
+
+class UsaWatchdogStoryIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'https://usawatchdog.com/cv-19-vaccine-warning-cv-19-cure-must-watch-videos/',
+        'md5': 'bf40e20aebca9016ca195534028cbb6f',
+        'info_dict': {
+            'id': 'vcl8gx',
+            'ext': 'mp4',
+            'timestamp': 1617141926,
+            'upload_date': '20210330',
+            'title': 'Vaccine Warning \u2013 CV-19 Cure Must Watch Videos',
+        }}
+
+    def _real_extract(self, url):
+        title = self._match_id(url)
+        embeds = rumble_embedded_id(self._download_webpage(url, title))
+        return embeds[0] if embeds is not None else None
+
+
+class UsaWatchdogIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/$'
+    _TEST = {
+        'url': 'https://usawatchdog.com/',
+        'playlist_mincount': 15,
+        'info_dict': {
+            'id': 'USA Watchdog',
+        }}
+
+    def _real_extract(self, url):
+        matches = []
+        for mobj in re.finditer(r'front-view-title[^<]+<a\s+href=["\']?(?P<href>https?://(?:www\.)?usawatchdog\.com/[^/]+\/?)[^>]+>(?P<title>[^<]+)',
+                                self._download_webpage(url, 'Site Root')):
+            matches.append(self.url_result(mobj.group('href'),
+                                           'UsaWatchdogStory', None,
+                                           mobj.group('title')))
+
+        return self.playlist_result(matches, 'USA Watchdog')