From 9c644a641922e5ac3b5b4a1c9386fa599973e885 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 09:51:01 +0100 Subject: [PATCH 001/120] [fujitv] fix HLS formats extension(closes #28416) --- youtube_dl/extractor/fujitv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/youtube_dl/extractor/fujitv.py +++ b/youtube_dl/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: From 43d986acd8bf7247725fc9de34648c0eda560daf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:14:28 +0100 Subject: [PATCH 002/120] [tver] improve title extraction(closes #28418) --- youtube_dl/extractor/tver.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 931d4d650..a54f49319 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, + strip_or_none, try_get, ) @@ -25,6 +26,10 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/episode/79622438', 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -47,8 +52,12 @@ class TVerIE(InfoExtractor): } if service == 'cx': + title = main['title'] + subtitle = strip_or_none(main.get('subtitle')) + if subtitle: + title += ' - ' + subtitle info.update({ - 'title': main.get('subtitle') or main['title'], + 'title': title, 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), 'ie_key': 'FujiTVFODPlus7', }) From ef414343e5fa2bc4fddae3097ecde5a8e32c2d4c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:48:58 +0100 Subject: [PATCH 003/120] [peertube] improve thumbnail extraction(closes #28419) --- youtube_dl/extractor/peertube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 32ff51653..d9b13adc2 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -599,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -621,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } From 1182f9567b86f2af747cdb8769ab87649c8ce4c2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 18:11:11 +0100 Subject: [PATCH 004/120] [pinterest] reduce the number of HLS format 
requests --- youtube_dl/extractor/pinterest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index b249c9eda..42528d746 100644 --- a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': From 60845121ca2f49172e7cd941c0cb43363cb86e46 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Mar 2021 15:19:24 +0100 Subject: [PATCH 005/120] [sportdeutschland] fix extraction(closes #21856)(closes #28425) --- youtube_dl/extractor/sportdeutschland.py | 147 +++++++++++++---------- 1 file changed, 85 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -1,82 +1,105 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + clean_html, + float_or_none, + int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, + try_get, ) class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' + _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': '5318cac0275701382770543d7edaf0a0', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], - 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', + 'duration': 16106.36, }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sport_id = mobj.group('sport') - - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( - sport_id, video_id) - req = sanitized_Request(api_url, headers={ - 'Accept': 
'application/vnd.vidibus.v2.html+json', - 'Referer': url, - }) - data = self._download_json(req, video_id) - + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) - - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) - else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info From 1860d0f41cf50b1a0876174c4e1ee7adbbd4a0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:26:54 +0700 Subject: [PATCH 006/120] [southpark] Fix extraction and add support for southparkstudios.com (closes 
#26763, closes #28413) --- youtube_dl/extractor/southpark.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index da75a43a7..0774da06e 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, }] + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' From b509d24b2fef8d5994b7d925db51befcbbf996fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:36:11 +0700 Subject: [PATCH 007/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 238ca3965..924d202b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ Introduce release_timestamp meta field (#28386) + +Extractors ++ [southpark] Add support for southparkstudios.com (#28413) +* [southpark] Fix extraction (#26763, #28413) +* [sportdeutschland] Fix extraction (#21856, #28425) +* [pinterest] Reduce the number of HLS format requests +* [peertube] Improve thumbnail extraction (#28419) +* [tver] Improve title extraction (#28418) +* [fujitv] Fix HLS formats extension (#28416) +* [shahid] Fix format extraction (#28383) ++ [lbry] Add support for channel filters (#28385) ++ [bandcamp] Extract release timestamp ++ [lbry] Extract release timestamp (#28386) +* [pornhub] Detect flagged videos ++ [pornhub] Extract formats from get_media end point (#28395) +* [bilibili] Fix video info extraction (#28341) ++ [cbs] Add support for Paramount+ (#28342) ++ [trovo] Add Origin header to VOD formats (#28346) +* [voxmedia] Fix volume embed extraction (#28338) + + version 2021.03.03 Extractors From ebfd66c4b1d6ffabd8a5bc52737f2bacac341d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:38:16 +0700 Subject: [PATCH 008/120] release 2021.03.14 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index a8eba3214..9cc79eff4 100644 
--- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7d59a9f2d..3296e44a8 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 523408f03..f74c29736 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 6e9e094e4..ae9e273ef 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe 
N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 46af4e420..04fbea2f6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 924d202b0..73fe316b9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.14 Core + Introduce release_timestamp meta field (#28386) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a1c68e384..5a540119c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.03' +__version__ = '2021.03.14' From 9955bb4a2704f98b74a448c82dfd690ec6775b8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 14:49:23 +0100 Subject: [PATCH 009/120] [rtve] improve extraction - extract all formats - fix RTVE Infantil extraction(closes #24851) - extract is_live and series --- youtube_dl/extractor/rtve.py | 232 ++++++++++++++++------------------- 1 file changed, 104 insertions(+), 128 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, 
video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' not in video_url: - video_url = video_url.replace('resources/', 'auth/resources/') - video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': video_url, - }) - self._sort_formats(formats) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) subtitles = None - if info.get('sbtFile') is not None: - subtitles = self.extract_subtitles(video_id, info['sbtFile']) + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': info.get('image'), - 'page_url': url, 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), scale=1000), + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), } def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P[^/]*)/video/(?P[^/]*)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '915319587b33720b8e0357caaa6617e6', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', 'info_dict': { 'id': '3040283', 'ext': 'mp4', 'title': 'Maneras de vivir', - 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', 'duration': 357.958, }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }] - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - webpage = self._download_webpage(url, video_id) - vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - - return { - 'id': video_id, - 'ext': 'mp4', - 'title': info['title'], - 'url': video_url, - 'thumbnail': info.get('image'), - 'duration': float_or_none(info.get('duration'), scale=1000), - } - - -class RTVELiveIE(InfoExtractor): +class 
RTVELiveIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor): 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', }, 'params': { 'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - start_time = time.gmtime() video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') title = remove_start(title, 'Estoy viendo ') - title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( (r'playerId=player([0-9]+)', r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', r'data-id=["\'](\d+)'), webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - m3u8_url = _decrypt_url(png) - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'formats': formats, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } From 3be098010f667b14075e3dfad1e74e5e2becc8ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 20:08:46 +0100 Subject: [PATCH 010/120] [applepodcasts] fix extraction(closes #28445) --- youtube_dl/extractor/applepodcasts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} From 357bfe251d7f4f8bb9319bc6531a3813b5a355a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Mar 2021 20:42:20 +0700 Subject: [PATCH 011/120] [svtplay] Improve extraction (closes #28448) --- youtube_dl/extractor/svt.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 4acc29fce..aba9bb447 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) + (?:.*?modalId=(?P[\da-zA-Z-]+))? 
) ''' _TESTS = [{ - 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'url': 'https://www.svtplay.se/video/30479064', 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': 'jNwpV9P', + 'id': '8zVbDPA', 'ext': 'mp4', - 'title': 'Det här är himlen', - 'timestamp': 1586044800, - 'upload_date': '20200405', - 'duration': 3515, + 'title': 'Designdrömmar i Stenungsund', + 'timestamp': 1615770000, + 'upload_date': '20210315', + 'duration': 3519, 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { @@ -173,6 +174,9 @@ class SVTPlayIE(SVTPlayBaseIE): # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B 'skip_download': True, }, + }, { + 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -219,7 +223,8 @@ class SVTPlayIE(SVTPlayBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id, svt_id = mobj.group('id', 'svt_id') + video_id = mobj.group('id') + svt_id = mobj.group('svt_id') or mobj.group('modal_id') if svt_id: return self._extract_by_video_id(svt_id) @@ -254,6 +259,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From f912d6c8cf5a68d576abe4426e12554c3404a7dd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Mar 2021 21:43:53 +0100 Subject: [PATCH 012/120] [mlb] fix video extracion(#21241) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/mlb.py | 189 +++++++++++++++++++++++++---- 2 files changed, 172 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 07a8af055..c2f67323b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,7 +673,10 @@ from .mixcloud import ( MixcloudUserIE, MixcloudPlaylistIE, ) -from .mlb import MLBIE +from .mlb import ( + MLBIE, + MLBVideoIE, +) from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index b907f6b49..b69301d97 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,15 +1,91 @@ from __future__ import unicode_literals -from .nhl import NHLBaseIE +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + try_get, +) -class MLBIE(NHLBaseIE): +class MLBBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + video = self._download_video_data(display_id) + video_id = video['id'] + title = video['title'] + feed = self._get_feed(video) + + formats = [] + for playback in (feed.get('playbacks') or []): + playback_url = playback.get('url') + if not playback_url: + continue + name = playback.get('name') + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=name, fatal=False)) + else: + f = 
{ + 'format_id': name, + 'url': playback_url, + } + mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name) + if mobj: + f.update({ + 'height': int(mobj.group(3)), + 'tbr': int(mobj.group(1)), + 'width': int(mobj.group(2)), + }) + mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url) + if mobj: + f.update({ + 'fps': int(mobj.group(3)), + 'height': int(mobj.group(2)), + 'tbr': int(mobj.group(4)), + 'width': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnails = [] + for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): + src = cut.get('src') + if not src: + continue + thumbnails.append({ + 'height': int_or_none(cut.get('height')), + 'url': src, + 'width': int_or_none(cut.get('width')), + }) + + language = (video.get('language') or 'EN').lower() + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': video.get('description'), + 'duration': parse_duration(feed.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)), + 'subtitles': self._extract_mlb_subtitles(feed, language), + } + + +class MLBIE(MLBBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*(?Pmlb)\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:[^/]+/)*c-| + (?:[^/]+/)*video/[^/]+/c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE): (?P\d+) ) ''' - _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -76,18 +151,6 @@ class MLBIE(NHLBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - { - 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', - 'md5': 'e09e37b552351fddbf4d9e699c924d68', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429139220, - 'upload_date': '20150415', - } - }, { 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', 'only_matching': True, @@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE): 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, - { - 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', - 'only_matching': True, - } ] + _TIMESTAMP_KEY = 'date' + + @staticmethod + def _get_feed(video): + return video + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for keyword in (feed.get('keywordsAll') or []): + keyword_type = keyword.get('type') + if keyword_type and keyword_type.startswith('closed_captions_location_'): + cc_location = keyword.get('value') + if cc_location: + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + return subtitles + + def _download_video_data(self, display_id): + return self._download_json( + 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + display_id) + + +class MLBVideoIE(MLBBaseIE): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933', + 'md5': 
'632358dacfceec06bad823b83d21df2d', + 'info_dict': { + 'id': 'c04a8863-f569-42e6-9f87-992393657614', + 'ext': 'mp4', + 'title': "Ackley's spectacular catch", + 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', + 'duration': 66, + 'timestamp': 1405995000, + 'upload_date': '20140722', + 'thumbnail': r're:^https?://.+', + }, + } + _TIMESTAMP_KEY = 'timestamp' + + @classmethod + def suitable(cls, url): + return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + + @staticmethod + def _get_feed(video): + return video['feeds'][0] + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for cc_location in (feed.get('closedCaptions') or []): + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + + def _download_video_data(self, display_id): + # https://www.mlb.com/data-service/en/videos/[SLUG] + return self._download_json( + 'https://fastball-gateway.mlb.com/graphql', + display_id, query={ + 'query': '''{ + mediaPlayback(ids: "%s") { + description + feeds(types: CMS) { + closedCaptions + duration + image { + cuts { + width + height + src + } + } + playbacks { + name + url + } + } + id + timestamp + title + } +}''' % display_id, + })['data']['mediaPlayback'][0] From fa6bf0a7112e83d36567072985d56440bb34de72 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Mar 2021 12:37:22 +0100 Subject: [PATCH 013/120] [vvvvid] fix kenc format extraction(closes #28473) --- youtube_dl/extractor/vvvvid.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 7c94c4ee2..bc196f8a0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -182,17 +182,20 @@ class VVVVIDIE(InfoExtractor): if not embed_code: continue embed_code = ds(embed_code) - if video_type in ('video/rcs', 'video/kenc'): - if video_type == 'video/kenc': - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) + if video_type == 'video/kenc': + embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' 
+ ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': formats.extend(self._extract_akamai_formats(embed_code, video_id)) elif video_type == 'video/youtube': info.update({ From 7e79ba7dd6e6649dd2ce3a74004b2044f2182881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Mar 2021 05:45:36 +0700 Subject: [PATCH 014/120] [vimeo:album] Fix extraction for albums with number of videos multiple to page size (closes #28486) --- youtube_dl/extractor/vimeo.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd2663fe0..955651bec 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -939,11 +939,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): } if hashed_pass: query['_hashed_pass'] = hashed_pass - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return for video in videos: link = video.get('link') if not link: From 21ccd0d7f46002acc61eb21bd0d4e492064c7fe1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Mar 2021 09:10:38 +0100 Subject: [PATCH 015/120] [tiktok] detect private videos(closes #28453) --- youtube_dl/extractor/tiktok.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index ea1beb8af..4faa6de54 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( + page_props = self._parse_json(self._search_regex( r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s* Date: Mon, 22 Mar 2021 14:56:58 +0100 Subject: [PATCH 016/120] [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) Co-authored-by: Sergey M --- youtube_dl/extractor/vgtv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index fe7a26b62..22e99e8f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -23,6 +23,8 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect 'tv.aftonbladet.se/abtv': 'abtv', 'www.aftonbladet.se/tv': 'abtv', } @@ -140,6 +142,10 @@ class VGTVIE(XstreamIE): 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', 'only_matching': True, }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, From 8117d613acdd0a2874e52bfa52c3574f46e3a4fb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Mar 2021 15:58:44 +0100 Subject: [PATCH 017/120] [zingmp3] 
fix extraction(closes #11589, closes #16409, closes #16968, closes #27205) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/zingmp3.py | 212 ++++++++++++++++------------- 2 files changed, 119 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c2f67323b..8b55947f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1624,5 +1624,8 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zhihu import ZhihuIE -from .zingmp3 import ZingMp3IE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, +) from .zype import ZypeIE diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index adfdcaabf..207c04f5e 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,93 +1,94 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - update_url_query, ) -class ZingMp3BaseInfoExtractor(InfoExtractor): +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P\w+)\.html' + _GEO_COUNTRIES = ['VN'] - def _extract_item(self, item, page_type, fatal=True): - error_message = item.get('msg') - if error_message: - if not fatal: - return - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] formats = [] - for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): - if not source_url or source_url == 'require vip': + for k, v in (item.get('source') or {}).items(): + if not v: continue - if not re.match(r'https?://', source_url): - source_url = '//' + source_url - source_url = self._proto_relative_url(source_url, 'http:') - quality_num = int_or_none(quality) - f = { - 'format_id': quality, - 'url': source_url, - } - if page_type == 'video': - f.update({ - 'height': quality_num, - 'ext': 'mp4', - }) + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) else: - f.update({ - 'abr': quality_num, + formats.append({ 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', }) - formats.append(f) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(msg, expected=True) + self._sort_formats(formats) - cover = item.get('cover') + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } + + album = item.get('album') or {} return { - 'title': (item.get('name') or item.get('title')).strip(), + 'id': item_id, + 'title': title, 'formats': formats, - 'thumbnail': 'http:/' + cover if cover else None, - 'artist': item.get('artist'), + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 
'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), } - def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): - player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') - items = player_json['data'] - if 'item' in items: - items = items['item'] - - if len(items) == 1: - # one single song - data = self._extract_item(items[0], page_type) - data['id'] = id - - return data - else: - # playlist of songs - entries = [] - - for i, item in enumerate(items, 1): - entry = self._extract_item(item, page_type, fatal=False) - if not entry: - continue - entry['id'] = '%s-%d' % (id, i) - entries.append(entry) - - return { - '_type': 'playlist', - 'id': id, - 'title': playlist_title, - 'entries': entries, - } + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) -class ZingMp3IE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P\w+)\.html' +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', }, }, { - 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', - 'md5': '870295a9cd8045c0e15663565902618d', + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', 'info_dict': { - 'id': 'ZW6BAEA0', - 'title': 'Let It Go (Frozen OST)', + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', }, }, { - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', - }, - 'playlist_count': 10, - 'skip': 'removed at the request of the owner', - }, { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'only_matching': True, }] IE_NAME = 'zingmp3' IE_DESC = 'mp3.zing.vn' - def _real_extract(self, url): - page_id = self._match_id(url) + def _process_data(self, data): + return self._extract_item(data, True) - webpage = self._download_webpage(url, page_id) - player_json_url = self._search_regex([ - r'data-xml="([^"]+)', - r'&xmlURL=([^&]+)&' - ], webpage, 'player xml url') +class ZingMp3AlbumIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' + _TESTS = [{ + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 
'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái', + }, + 'playlist_count': 10, + }, { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + }, { + 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3:album' - playlist_title = None - page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') - if page_type == 'video': - player_json_url = update_url_query(player_json_url, {'format': 'json'}) - else: - player_json_url = player_json_url.replace('/xml/', '/html5xml/') - if page_type == 'album': - playlist_title = self._og_search_title(webpage) - - return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) From 5208ae92fc3e2916cdccae45c6b9a516be3d5796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 24 Mar 2021 02:57:35 +0700 Subject: [PATCH 018/120] [youtube] Fix default value for youtube_include_dash_manifest (closes #28523) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5a58807..badca3977 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1617,7 +1617,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_id'] = itag formats.append(f) - if self._downloader.params.get('youtube_include_dash_manifest'): + if self._downloader.params.get('youtube_include_dash_manifest', True): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats( From a40002444e64957594a1305bb2740fddb477beeb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 15:10:19 +0100 Subject: [PATCH 019/120] [bbc] fix BBC IPlayer Episodes/Group extraction(closes #28360) --- youtube_dl/extractor/bbc.py | 205 ++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 3 +- 2 files changed, 192 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 92e6f1bea..e8d000bbb 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,17 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor from ..compat import ( compat_etree_Element, compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, dict_get, float_or_none, @@ -811,7 +816,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -1338,21 +1343,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - 
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 
'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1361,14 +1494,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'
<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
-        description = self._search_regex(
-            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>
', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b55947f6..e0fd0b648 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -95,7 +95,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, - BBCCoUkIPlayerPlaylistIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, BBCIE, ) From eafcadea261dba64c44c5c17ea8a47ac17256617 Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Wed, 24 Mar 2021 23:33:19 +0900 Subject: [PATCH 020/120] [extractor] escape forgotten dot for hostnames in regular expression (#28530) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/mtv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2b1b3bdf..f99d887ca 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2953,7 +2953,7 @@ class GenericIE(InfoExtractor): webpage) if not mobj: mobj = re.search( - r'data-video-link=["\'](?Phttp://m.mlb.com/video/[^"\']+)', + r'data-video-link=["\'](?Phttp://m\.mlb\.com/video/[^"\']+)', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f5e30d22d..600cf2d89 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -320,7 +320,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') From d1069d33b4ad3987acc2452756459065ce635d68 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Sun, 12 Apr 2020 23:27:58 +0200 Subject: [PATCH 021/120] [zoom] Add new extractor(closes #16597, closes #27002, closes #28531) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zoom.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/zoom.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e0fd0b648..b2b39e4dd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1629,4 +1629,5 @@ from .zingmp3 import ( ZingMp3IE, ZingMp3AlbumIE, ) +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py new file mode 100644 index 000000000..db073d91d --- /dev/null +++ b/youtube_dl/extractor/zoom.py @@ -0,0 
+1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?Phttps?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self._downloader.params.get('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } From c2fbfb49da2657002fafcf4c609f8d91030a6ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 00:03:00 +0700 Subject: [PATCH 022/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 73fe316b9..1c3313280 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors ++ [zoom] Add support for zoom.us (#16597, #27002, #28531) +* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) +* [youtube] Fix default value for youtube_include_dash_manifest (#28523) +* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) ++ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) ++ [tiktok] Detect private videos (#28453) +* [vimeo:album] Fix extraction for albums with number of videos multiple + to page size (#28486) +* [vvvvid] Fix kenc format extraction (#28473) +* [mlb] Fix video extraction (#21241) +* [svtplay] Improve extraction (#28448) +* [applepodcasts] Fix extraction (#28445) +* [rtve] Improve extraction + + Extract all formats + * Fix RTVE Infantil extraction (#24851) + + Extract is_live and series + + version 2021.03.14 Core From 76da1c954aebba4af8def73dd2319fab2e27e50a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 
00:04:10 +0700 Subject: [PATCH 023/120] release 2021.03.25 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 8 files changed, 19 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 9cc79eff4..7feb0298c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 3296e44a8..49e18173d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index f74c29736..a1486b133 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ae9e273ef..7eaf5a202 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special 
characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 04fbea2f6..20042d98c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1c3313280..1b49e411a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.25 Extractors + [zoom] Add support for zoom.us (#16597, #27002, #28531) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2c00ec406..d2ad937a4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -90,7 +90,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV** - **Beatport** @@ -522,6 +523,7 @@ - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBVideo** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -1212,4 +1214,6 @@ - **ZDFChannel** - **Zhihu** - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5a540119c..e87b820fa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.14' +__version__ = '2021.03.25' From 8562218350a79d4709da8593bb0c538aa0824acf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 19:28:51 +0100 Subject: [PATCH 024/120] [ard] improve clip id extraction(#22724)(closes #28528) --- youtube_dl/extractor/ard.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 143fc51e9..d57c5ba0f 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -335,7 +335,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?PY3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -365,22 +365,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 
'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -410,7 +410,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -435,7 +435,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), From 605e7b5e47c60c3ed7c2ca71df4d6bbd49fa8a77 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 12:53:18 +0100 Subject: [PATCH 025/120] [youtube:tab] fix playlist/comunity continuation items extraction(closes #28266) --- youtube_dl/extractor/youtube.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index badca3977..faf3a344e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,6 +24,7 @@ from ..jsinterp import JSInterpreter from ..utils import ( ExtractorError, clean_html, + dict_get, float_or_none, int_or_none, mimetype2ext, @@ -2541,13 +2542,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation = self._extract_continuation(continuation_renderer) continue + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) continuation_items = try_get( - response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) if continuation_items: continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('gridVideoRenderer') + renderer = self._extract_grid_item_renderer(continuation_item) if renderer: grid_renderer = {'items': continuation_items} for entry in self._grid_entries(grid_renderer): @@ -2561,6 +2563,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(video_list_renderer) continue + renderer = continuation_item.get('backstagePostThreadRenderer') + if renderer: + continuation_renderer = {'contents': continuation_items} + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = 
self._extract_continuation(continuation_renderer) + continue break From c78591187080e7316c0042309fe956bfd0d38d30 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 17:06:57 +0100 Subject: [PATCH 026/120] [vimeo] fix unlisted video extraction(closes #28414) --- youtube_dl/extractor/vimeo.py | 39 ++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 955651bec..5800962ea 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -24,6 +24,7 @@ from ..utils import ( merge_dicts, OnDemandPagedList, parse_filesize, + parse_iso8601, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -278,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P[0-9]+) - (?:/[\da-f]+)? + (?:/(?P[\da-f]{10}))? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -577,11 +578,37 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - # Extract ID from URL - video_id = self._match_id(url) + video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + if unlisted_hash: + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + video = self._download_json( + 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), + video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + orig_url = url is_pro = 'vimeopro.com/' in url is_player = '://player.vimeo.com/video/' in url @@ -756,6 +783,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { From cc777dcaa0f3331626f33a7e4c61d804c43f4b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 17:37:45 +0100 Subject: [PATCH 027/120] [picarto] fix live stream extraction(closes #28532) --- youtube_dl/extractor/picarto.py | 96 ++++++++++++--------------------- 1 file changed, 35 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 8099ef1d6..e6c51e16b 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,22 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time - from .common import InfoExtractor -from 
..compat import compat_str from ..utils import ( ExtractorError, js_to_json, - try_get, - update_url_query, - urlencode_postdata, ) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)(?:/(?P[a-zA-Z0-9]+))?' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + channel_id = self._match_id(url) - metadata = self._download_json( - 'https://api.picarto.tv/v1/channel/name/' + channel_id, - channel_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] - if metadata.get('online') is False: + if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] cdn_data = self._download_json( - 'https://picarto.tv/process/channel', channel_id, - data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Downloading load balancing info') + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') - token = mobj.group('token') or 'public' - params = { - 'con': int(time.time() * 1000), - 'token': token, - } - - prefered_edge = cdn_data.get('preferedEdge') formats = [] - - for edge in cdn_data['edges']: - edge_ep = edge.get('ep') - if not edge_ep or not isinstance(edge_ep, compat_str): + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: continue - edge_id = edge.get('id') - for tech in cdn_data['techs']: - tech_label = tech.get('label') - tech_type = tech.get('type') - preference = 0 - if edge_id == prefered_edge: - preference += 1 - format_id = [] - if edge_id: - format_id.append(edge_id) - if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': - format_id.append('hls') - formats.extend(self._extract_m3u8_formats( - update_url_query( - 'https://%s/hls/%s/index.m3u8' - % (edge_ep, channel_id), params), - channel_id, 'mp4', preference=preference, - m3u8_id='-'.join(format_id), fatal=False)) - continue - elif tech_type == 'video/mp4' or tech_label == 'MP4': - format_id.append('mp4') - formats.append({ - 'url': update_url_query( - 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), - params), - 'format_id': '-'.join(format_id), - 'preference': preference, - }) - else: - # rtmp format does not seem to work - continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) self._sort_formats(formats) mature = metadata.get('adult') @@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(metadata.get('title') or channel_id), + 'title': self._live_title(title.strip()), 'is_live': True, - 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), 'channel': channel_id, + 'channel_id': metadata.get('id'), 'channel_url': 
'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, From 49fc0a567febda65709cc5154ff046684a3b8427 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 19:11:41 +0100 Subject: [PATCH 028/120] [youtube] fix video's channel extraction(closes #28562) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index faf3a344e..e48c5a7d2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1896,7 +1896,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['channel'] = get_text(try_get( vsir, lambda x: x['owner']['videoOwnerRenderer']['title'], - compat_str)) + dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], From 87a8bde7775ebc31175ebb111015b4052b50b7db Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Mar 2021 08:46:33 +0100 Subject: [PATCH 029/120] [sbs] add support for ondemand watch URLs(closes #28566) --- youtube_dl/extractor/sbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index f722528cd..0a806ee4e 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -43,6 +43,9 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', + 'only_matching': True, }] def _real_extract(self, url): From da762c4e329f6158956ddd51ac294e9183e5ce89 Mon Sep 17 00:00:00 2001 From: Chris Hranj Date: Mon, 29 Mar 2021 15:05:19 -0400 Subject: [PATCH 030/120] [instagram] Improve title extraction and extract duration (#28469) Co-authored-by: Sergey M. 
--- youtube_dl/extractor/instagram.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1eeddc3b6..12e10143c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, @@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by britneyspears', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') + title = media.get('title') thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') @@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor): continue entries.append({ 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, + 'title': node.get('title') or 'Video %d' % edge_num, 'url': node_video_url, 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 'view_count': int_or_none(node.get('video_view_count')), @@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor): 'id': video_id, 'formats': formats, 'ext': 'mp4', - 'title': 'Video by %s' % uploader_id, + 'title': title or 'Video by %s' % uploader_id, 'description': description, + 'duration': duration, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader_id': uploader_id, From 287e50b56b4c71da8fd0c3ffdeca9bff5ab0b005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Mar 2021 03:37:43 +0700 Subject: [PATCH 031/120] [francetvinfo] Improve video id extraction (closes #28584) --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3ca415077..7cc88bf18 
100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -399,7 +399,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), webpage, 'video id') return self._make_url_result(video_id) From 955894e72fd8d4fdce5d85fc006d548278e6d9eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 10:00:40 +0100 Subject: [PATCH 032/120] [vlive] fix inkey request(closes #28589) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e2f5d81b8..42da34d44 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR'} + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} if fields: query['fields'] = fields try: From 207bc35d348efdfdfe2bd7119e004a1acf0ab3d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:58:01 +0700 Subject: [PATCH 033/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1b49e411a..1297c19f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Extractors +* [vlive] Fix inkey request (#28589) +* [francetvinfo] Improve video id extraction (#28584) ++ [instagram] Extract duration (#28469) +* [instagram] Improve title extraction (#28469) ++ [sbs] Add support for ondemand watch URLs (#28566) +* [youtube] Fix video's channel extraction (#28562) +* [picarto] Fix live stream extraction (#28532) +* [vimeo] Fix unlisted video extraction (#28414) +* [youtube:tab] Fix playlist/community continuation items extraction (#28266) +* [ard] Improve clip id extraction (#22724, #28528) + + version 2021.03.25 Extractors From 8f493de9fb3a7f123bdf887163efa06ce9d6b051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:59:07 +0700 Subject: [PATCH 034/120] release 2021.03.31 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 7feb0298c..2ac4df8db 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 
@@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 49e18173d..5ad5590bf 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index a1486b133..ea96c4c20 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7eaf5a202..ed3abd45c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20042d98c..c1067ee1f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar feature requests 
including closed ones diff --git a/ChangeLog b/ChangeLog index 1297c19f7..4c094b771 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.31 Extractors * [vlive] Fix inkey request (#28589) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e87b820fa..bcfdae23d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.25' +__version__ = '2021.03.31' From 28bab774a0df2c80b689c277390da8617131db35 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 21:44:41 +0100 Subject: [PATCH 035/120] [youtube] imporve age-restricted video extraction(#28578) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e48c5a7d2..6a92938a5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1432,7 +1432,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999', video_id, fatal=False) + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None if webpage: From b97fb2edac25182ff3dcf4cb8537517a1ec9e4de Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 31 Mar 2021 20:07:13 +0100 Subject: [PATCH 036/120] [vimeo] fix password protected review extraction(closes #27591) --- youtube_dl/extractor/vimeo.py | 64 ++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5800962ea..a90cf0630 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -75,25 +75,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') - def _verify_video_password(self, url, video_id, webpage): + def _get_video_password(self): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + return password + + def _verify_video_password(self, url, video_id, password, token, vuid): if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) self._set_vimeo_cookie('vuid', vuid) return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + url + '/password', video_id, 'Verifying the password', + 'Wrong password', data=urlencode_postdata({ + 'password': password, + 'token': token, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -332,9 +335,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '54469442', 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of 
Software 2012', - 'uploader': 'The BLN & Business of Software', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', - 'uploader_id': 'theblnbusinessofsoftware', + 'uploader': 'Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', + 'uploader_id': 'businessofsoftware', 'duration': 3610, 'description': None, }, @@ -469,6 +472,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Unable to download JSON metadata'], + 'skip': 'this page is no longer available.', }, { 'url': 'http://player.vimeo.com/video/68375962', @@ -551,9 +555,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return urls[0] if urls else None def _verify_player_video_password(self, url, video_id, headers): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + password = self._get_video_password() data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -697,7 +699,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if re.search(r']+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') - self._verify_video_password(redirect_url, video_id, webpage) + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) return self._real_extract( smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: @@ -1091,10 +1096,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - clip_data = self._download_json( - page_url.replace('/review/', '/review/data/'), - video_id)['clipData'] - config_url = clip_data['configUrl'] + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( From 14f29f087e6097feb46bdb84878924bc410a57eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:05:10 +0700 Subject: [PATCH 037/120] [youtube] Setup CONSENT cookie when needed (closes #28604) --- youtube_dl/extractor/youtube.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6a92938a5..b940c0bad 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -249,7 +249,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + 
return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _real_initialize(self): + self._initialize_consent() if self._downloader is None: return if not self._login(): From e789bb1aa4cb627d3d7ca79a5e5daa8d2f58cda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:43:08 +0700 Subject: [PATCH 038/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4c094b771..ee2dc88bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version + +Extractors +* [youtube] Setup CONSENT cookie when needed (#28604) +* [vimeo] Fix password protected review extraction (#27591) +* [youtube] Improve age-restricted video extraction (#28578) + + version 2021.03.31 Extractors From ca304beb1538e54c5a18fdd50846ed2259d63b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:47:11 +0700 Subject: [PATCH 039/120] release 2021.04.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2ac4df8db..98ec799e8 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5ad5590bf..5387a6cd1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for 
similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index ea96c4c20..945c80366 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ed3abd45c..0acc8b679 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c1067ee1f..42c3126a3 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index ee2dc88bc..4304ecd9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.01 Extractors * [youtube] Setup CONSENT cookie when needed (#28604) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bcfdae23d..0457d1a15 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.31' +__version__ = '2021.04.01' From 37488630703944b4f2bda84a26391ae61d29e15b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 11:50:30 +0100 Subject: [PATCH 040/120] [youtube:tab] Add support for hashtag videos extraction(closes #28308) --- youtube_dl/extractor/youtube.py | 137 ++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b940c0bad..1f5497e24 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1959,7 +1959,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): invidio\.us )/ (?: - (?:channel|c|user|feed)/| + (?:channel|c|user|feed|hashtag)/| 
(?:playlist|watch)\?.*?\blist=| (?!(?:watch|embed|v|e)\b) ) @@ -2245,6 +2245,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, }] @classmethod @@ -2392,6 +2399,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for entry in self._post_thread_entries(renderer): yield entry + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + @staticmethod def _build_continuation_query(continuation, ctp=None): query = { @@ -2442,55 +2457,60 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not tab_content: return slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if not slr_renderer: - return - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): + if slr_renderer: + is_channels_tab = tab.get('title') == 'Channels' + continuation = None + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer, not is_channels_tab): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for 
entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation: + continuation = self._extract_continuation(is_renderer) if not continuation: - continuation = self._extract_continuation(is_renderer) - - if not continuation: - continuation = self._extract_continuation(slr_renderer) + continuation = self._extract_continuation(slr_renderer) + else: + rich_grid_renderer = tab_content.get('richGridRenderer') + if not rich_grid_renderer: + return + for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): + yield entry + continuation = self._extract_continuation(rich_grid_renderer) headers = { 'x-youtube-client-name': '1', @@ -2586,6 +2606,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(continuation_renderer) continue + renderer = continuation_item.get('richItemRenderer') + if renderer: + for entry in self._rich_grid_entries(continuation_items): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @@ -2642,7 +2668,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None + playlist_id = item_id + title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -2651,12 +2678,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title += ' - %s' % tab_title description = renderer.get('description') playlist_id = renderer.get('externalId') - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - description = None - playlist_id = item_id + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + else: + renderer = try_get( + data, lambda x: x['header']['hashtagHeaderRenderer'], dict) + if renderer: + title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( self._entries(selected_tab, identity_token), playlist_id=playlist_id, playlist_title=title, From c5aa8f36bf636c3db81afd556d0e95d91b72b9c7 Mon Sep 17 00:00:00 2001 From: Vid Date: Thu, 18 Mar 2021 18:53:06 +0100 Subject: [PATCH 041/120] [arnes] Add new extractor(closes #28483) --- youtube_dl/extractor/arnes.py | 101 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/arnes.py diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 
'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2b39e4dd..8cf348772 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -72,6 +72,7 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arnes import ArnesIE from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE, From 392c467f95cbf89114235038e1938c72d97144d9 Mon Sep 17 00:00:00 2001 From: Allan Daemon Date: Mon, 15 May 2017 00:04:39 -0300 Subject: [PATCH 042/120] [palcomp3] Add new extractor(closes #13120) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/palcomp3.py | 148 +++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 youtube_dl/extractor/palcomp3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8cf348772..65fefabe8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -879,6 +879,11 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) 
+from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py new file mode 100644 index 000000000..fb29d83f9 --- /dev/null +++ b/youtube_dl/extractor/palcomp3.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') + if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = re.match(self._VALID_URL, url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @ classmethod + def suitable(cls, url): + return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = 
r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) From 04d4a3b136060158438c3f2c1b31c884c6961712 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 19:03:45 +0100 Subject: [PATCH 043/120] [screencastomatic] fix extraction(closes #11976, closes #24489) --- youtube_dl/extractor/screencastomatic.py | 48 +++++++++++++++--------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index b5e76c9af..0afdc1715 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + get_element_by_class, + int_or_none, + remove_start, + strip_or_none, + unified_strdate, +) class ScreencastOMaticIE(InfoExtractor): - _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', 'md5': '483583cb80d92588f15ccbedd90f0c18', 'info_dict': { @@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! 
also: some general info re 1) VCE philosophy and 2) distance learning.', - 'duration': 369.163, + 'duration': 369, + 'upload_date': '20141216', } - } + }, { + 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl', + 'only_matching': True, + }, { + 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - jwplayer_data = self._parse_json( - self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), - video_id, transform_source=js_to_json) - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict.update({ - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/' + video_id, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': get_element_by_class('overlayTitle', webpage), + 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None, + 'duration': int_or_none(self._search_regex( + r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};', + webpage, 'duration', default=None)), + 'upload_date': unified_strdate(remove_start( + get_element_by_class('overlayPublished', webpage), 'Published: ')), }) - return info_dict + return info From 1df2596f81695bf452ffbfd89596d115d9b2daf5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 07:54:02 +0100 Subject: [PATCH 044/120] [extractor/common] fix _get_cookies method for python 2(#20673, #23256, #20326, closes #28640) --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d3b6724df..fcbf18ee6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2899,7 +2899,10 @@ class InfoExtractor(object): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) + cookie = req.get_header('Cookie') + if cookie and sys.version_info[0] == 2: + cookie = str(cookie) + return compat_cookies.SimpleCookie(cookie) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 654b4f4ff2718f38b3182c1188c5d569c14cc70a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 08:23:35 +0100 Subject: [PATCH 045/120] [youtube] prioritize information from YoutubeIE for playlist entries(closes #28619, closes #28636) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f5497e24..2e027528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,7 +329,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (lambda x: x['ownerText']['runs'][0]['text'], lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { - '_type': 'url_transparent', + '_type': 'url', 'ie_key': YoutubeIE.ie_key(), 'id': video_id, 'url': video_id, From aee6feb02adaa316455ea9497e92cc82b720f231 Mon Sep 17 00:00:00 2001 From: RomanEmelyanov Date: Sun, 4 Apr 2021 11:14:37 +0300 Subject: [PATCH 046/120] [youku] Update ccode(closes #17852, closes 
#28447, closes #28460) (#28648) --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 61d1ab209..880c89687 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0590', + 'ccode': '0532', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From e165f5641fdf62975d3b6a40132a475c9cbaea2a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 15:52:14 +0100 Subject: [PATCH 047/120] [extractor/common] fix JSON-LD VideoObject author extraction --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fcbf18ee6..8ef22779a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,6 +70,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1282,7 +1283,7 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 3ae9c0f410b1d4f63e8bada67dd62a8d2852be32 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 16:26:56 +0100 Subject: [PATCH 048/120] [vimeo] improve extraction(closes #28591) --- youtube_dl/extractor/vimeo.py | 239 ++++++++++++++-------------------- 1 file changed, 100 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a90cf0630..102687b82 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,15 +16,14 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, OnDemandPagedList, parse_filesize, parse_iso8601, - RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, @@ -127,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue @@ -146,7 +145,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: 
continue @@ -192,17 +191,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): f['preference'] = -40 subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -322,6 +319,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', }, 'params': { 'format': 'best[protocol=https]', @@ -400,6 +398,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, + 'subtitles': { + 'de': [{'ext': 'vtt'}], + 'en': [{'ext': 'vtt'}], + 'es': [{'ext': 'vtt'}], + 'fr': [{'ext': 'vtt'}], + }, } }, { @@ -572,6 +576,37 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) headers = std_headers.copy() @@ -580,48 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - # Extract ID from URL - video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') if unlisted_hash: - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest' - })['token'] - video = self._download_json( - 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), - video_id, headers={ - 'Authorization': 'jwt ' + token, - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) - get_timestamp = lambda x: 
parse_iso8601(video.get(x + '_time')) - info.update({ - 'description': video.get('description'), - 'license': video.get('license'), - 'release_timestamp': get_timestamp('release'), - 'timestamp': get_timestamp('created'), - 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), - }) - connections = try_get( - video, lambda x: x['metadata']['connections'], dict) or {} - for k in ('comment', 'like'): - info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) - return info + return self._extract_from_api(video_id, unlisted_hash) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -641,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) + if '://player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r']+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -657,70 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name 
is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r']+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, + 'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) video = config.get('video') or {} vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -735,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description - if not video_description: - video_description = self._html_search_regex( - r'(?s)]*>(.*?)', - webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -751,25 +734,14 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: 
self._downloader.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r']+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None - formats = [] source_format = self._extract_original_format( @@ -788,31 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } + }) - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) - - return info_dict + return merge_dicts(info_dict, info_dict_config, json_ld) class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', From 6beb1ac65b03415764c487fd139298f22e1e0313 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:16:17 +0100 Subject: [PATCH 049/120] [extractor/common] keep support for non standard JSON-LD VideoObject author values --- youtube_dl/extractor/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ef22779a..78ff5b6d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,7 +70,6 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1276,6 +1275,7 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), @@ -1283,7 +1283,11 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. 
https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 162bf9e10a4e6a08f5ed156a68054ef9b4d2b60e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:49:24 +0100 Subject: [PATCH 050/120] [compat] add compat_SimpleCookie --- youtube_dl/compat.py | 9 +++++++++ youtube_dl/extractor/common.py | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6c3d49d45..8bbebebcf 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -73,6 +73,15 @@ try: except ImportError: # Python 2 import Cookie as compat_cookies +if sys.version_info[0] == 2: + class compat_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + if isinstance(rawdata, unicode): + rawdata = str(rawdata) + return super(compat_SimpleCookie, self).load(rawdata) +else: + compat_SimpleCookie = compat_cookies.SimpleCookie + try: import html.entities as compat_html_entities except ImportError: # Python 2 diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78ff5b6d0..af289d705 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, + compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,13 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + """ Return a compat_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - cookie = req.get_header('Cookie') - if cookie and sys.version_info[0] == 2: - cookie = str(cookie) - return compat_cookies.SimpleCookie(cookie) + return compat_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 760c911299aa607ca967d6d4be2985528bacf29f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 5 Apr 2021 07:16:50 +0100 Subject: [PATCH 051/120] [compat] add compat_SimpleCookie to __all__ array --- youtube_dl/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8bbebebcf..8a5262dc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3002,6 +3002,7 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', From 25b1287323f5836c9416a8183096adc63809d5ce Mon Sep 17 00:00:00 2001 From: guredora Date: Sun, 4 Apr 2021 22:12:07 +0900 Subject: [PATCH 052/120] [line] add support live.line.me (closes #17205)(closes #28658) --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/line.py | 142 ++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 65fefabe8..d5cd364e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -595,7 +595,11 @@ from .limelight import ( LimelightChannelIE, 
LimelightChannelListIE, ) -from .line import LineTVIE +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 7f5fa446e..2526daa77 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + str_or_none, +) class LineTVIE(InfoExtractor): @@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor): for thumbnail in video_info.get('thumbnails', {}).get('list', [])], 'view_count': video_info.get('meta', {}).get('count'), } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = 
formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) From 6b315d96bc0b07ddc3abaa7318583775828cce30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:15:13 +0700 Subject: [PATCH 053/120] [compat] flake8 --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8a5262dc8..566e9d5ec 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -76,7 +76,7 @@ except ImportError: # Python 2 if sys.version_info[0] == 2: class compat_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): - if isinstance(rawdata, unicode): + if isinstance(rawdata, compat_str): rawdata = str(rawdata) return super(compat_SimpleCookie, self).load(rawdata) else: From 70d0d4f9beba0e5b6d95ee50ad62ae7ab5be9be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:22:28 +0700 Subject: [PATCH 054/120] [compat] Use more conventional name for compat SimpleCookie --- youtube_dl/compat.py | 8 ++++---- youtube_dl/extractor/common.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 566e9d5ec..9e45c454b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -74,13 +74,13 @@ except ImportError: # Python 2 import Cookie as compat_cookies if sys.version_info[0] == 2: - class compat_SimpleCookie(compat_cookies.SimpleCookie): + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): if isinstance(rawdata, compat_str): rawdata = str(rawdata) - return super(compat_SimpleCookie, self).load(rawdata) + return super(compat_cookies_SimpleCookie, self).load(rawdata) else: - compat_SimpleCookie = compat_cookies.SimpleCookie + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie try: import html.entities as compat_html_entities @@ -3002,7 +3002,6 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', @@ -3010,6 +3009,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', + 'compat_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', 'compat_etree_fromstring', diff --git 
a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index af289d705..797c35fd5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, - compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,10 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_SimpleCookie with the cookies for the url """ + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_SimpleCookie(req.get_header('Cookie')) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 6b116f0c03ac0b1aff01cd08bbe1d5cb87dff853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:34:43 +0700 Subject: [PATCH 055/120] [youtube] Fix videos with restricted location (closes #28685) --- youtube_dl/extractor/youtube.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e027528d..6b4c7912c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1084,6 +1084,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', 'only_matching': True, }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1485,7 +1502,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def get_text(x): if not x: return - return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) + text = x.get('simpleText') + if text and isinstance(text, compat_str): + return text + runs = x.get('runs') + if not isinstance(runs, list): + return + return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) search_meta = ( lambda x: self._html_search_meta(x, webpage, default=None)) \ From 445db582a27c44cb02d57ac9171d58651cafbd76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:35:25 +0700 Subject: [PATCH 056/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4304ecd9e..e5e546744 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version + +Core +* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies ++ [compat] Introduce compat_cookies_SimpleCookie +* [extractor/common] Improve JSON-LD author extraction +* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, + #28640) + +Extractors +* [youtube] Fix extraction of 
videos with restricted location (#28685) ++ [line] Add support for live.line.me (#17205, #28658) +* [vimeo] Improve extraction (#28591) +* [youku] Update ccode (#17852, #28447, #28460, #28648) +* [youtube] Prefer direct entry metadata over entry metadata from playlist + (#28619, #28636) +* [screencastomatic] Fix extraction (#11976, #24489) ++ [palcomp3] Add support for palcomp3.com (#13120) ++ [arnes] Add support for video.arnes.si (#28483) ++ [youtube:tab] Add support for hashtags (#28308) + + version 2021.04.01 Extractors From 72a2c0a9ede04c6b82235e453b1a933faf072a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:42:24 +0700 Subject: [PATCH 057/120] release 2021.04.07 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++++++ youtube_dl/version.py | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 98ec799e8..febbd2344 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5387a6cd1..d7296d0a9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 945c80366..92e616a1a 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] 
I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 0acc8b679..b55739f6c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 42c3126a3..dbdb8356a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index e5e546744..22b4fa67d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.07 Core * [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d2ad937a4..ff9177a2c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -463,6 +463,8 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** - **LineTV** - **linkedin:learning** - **linkedin:learning:course** @@ -679,6 +681,9 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -1059,6 +1064,7 @@ - **Vidbit** - **Viddler** - **Videa** + - **video.arnes.si**: Arnes Video - **video.google:search**: Google Video search - **video.sky.it** - **video.sky.it:live** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0457d1a15..a6b1b8dce 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.01' +__version__ = '2021.04.07' From c0c5134c5771dd2a1caeeaee62dcd207d169e981 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 09:27:05 +0100 Subject: [PATCH 058/120] [curiositystream] fix format extraction(closes #26845, closes #28668) --- youtube_dl/extractor/curiositystream.py | 103 +++++++++++++----------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 
e4a7fca6c..ae64a07d7 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': 
encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, From 281b8e34432d8dba9902be2c1eb77d3e6371cd73 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 10:41:06 +0100 Subject: [PATCH 059/120] [jamendo] fix track extraction(closes #28686) --- youtube_dl/extractor/jamendo.py | 74 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 490efa8fb..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Maya Filipič - Stories from Emona I', - 'artist': 'Maya Filipič', + # 'title': 'Maya Filipič - Stories from Emona I', + 'title': 'Stories from Emona I', + # 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1217438117, 'upload_date': '20080730', + 'license': 'by-nc-nd', + 'view_count': int, + 'like_count': int, + 'average_rating': int, + 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage( - 'https://www.jamendo.com/track/' + track_id, track_id) - models = self._parse_json(self._html_search_regex( - r"data-bundled-models='([^']+)", - webpage, 'bundled models'), track_id) - track = models['track']['models'][0] + # webpage = self._download_webpage( + # 'https://www.jamendo.com/track/' + track_id, track_id) + # models = self._parse_json(self._html_search_regex( + # r"data-bundled-models='([^']+)", + # webpage, 'bundled models'), track_id) + # track = models['track']['models'][0] + track = self._call_api('track', track_id) title = track_name = track['name'] - get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} - artist = get_model('artist') - artist_name = artist.get('name') - if artist_name: - title = '%s - %s' % (artist_name, title) - album = get_model('album') + # get_model = lambda x: 
try_get(models, lambda y: y[x]['models'][0], dict) or {} + # artist = get_model('artist') + # artist_name = artist.get('name') + # if artist_name: + # title = '%s - %s' % (artist_name, title) + # album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor): urls = [] thumbnails = [] - for _, covers in track.get('cover', {}).items(): + for covers in (track.get('cover') or {}).values(): for cover_id, cover_url in covers.items(): if not cover_url or cover_url in urls: continue @@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor): }) tags = [] - for tag in track.get('tags', []): + for tag in (track.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue tags.append(tag_name) stats = track.get('stats') or {} + license = track.get('licenseCC') or [] return { 'id': track_id, @@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - 'artist': artist_name, + # 'artist': artist_name, 'track': track_name, - 'album': album.get('name'), + # 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(track.get('licenseCC', [])) or None, + 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', @@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 } - } - - def _call_api(self, resource, resource_id): - path = '/api/%ss' % resource - rand = compat_str(random.random()) - return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ - 'id[]': resource_id, - }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) - })[0] + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor): album_name = album.get('name') entries = [] - for track in album.get('tracks', []): + for track in (album.get('tracks') or []): track_id = track.get('id') if not track_id: continue From 006eea564d55130bb2e2ea7feb3a0e286d75d91f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 14:01:48 +0100 Subject: [PATCH 060/120] [cbssports] fix extraction(closes #28682) --- youtube_dl/extractor/cbssports.py | 125 +++++++++++++++++++++++------ youtube_dl/extractor/extractors.py | 6 +- 2 files changed, 105 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re + +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' - +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = 
r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? + (?: + ids%3D(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + 
iframe_url = self._search_regex( + r']+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d5cd364e8..5ff9110b4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -191,7 +191,11 @@ from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) -from .cbssports import CBSSportsIE +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) from .ccc import ( CCCIE, CCCPlaylistIE, From 545d6cb9d06a8bf32bcd24463c0fd25e650bb2c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Apr 2021 15:32:59 +0700 Subject: [PATCH 061/120] [pornhub] Extract DASH and HLS formats from get_media end point (closes #28698) --- youtube_dl/extractor/pornhub.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2a7818e41..031454600 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -398,6 +398,16 @@ class PornHubIE(PornHubBaseIE): formats = [] def add_format(format_url, height=None): + ext = determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return tbr = None mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) if mobj: @@ -417,16 +427,6 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue if '/video/get_media' in video_url: medias = self._download_json(video_url, video_id, fatal=False) if isinstance(medias, list): From 27e5a4464d1d4c418d4937492e18a9d47d30fc50 Mon Sep 17 00:00:00 2001 From: Remita 
Amine Date: Thu, 8 Apr 2021 18:53:36 +0100 Subject: [PATCH 062/120] [mtv] Fix Viacom A/B Testing Video Player extraction(closes #28703) --- youtube_dl/extractor/mtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 600cf2d89..5a5205c0e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -255,7 +255,9 @@ class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _extract_child_with_type(parent, t): - return next(c for c in parent['children'] if c.get('type') == t) + for c in parent['children']: + if c.get('type') == t: + return c def _extract_mgid(self, webpage): try: @@ -286,7 +288,8 @@ class MTVServicesInfoExtractor(InfoExtractor): data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = self._extract_child_with_type(data, 'MainContainer') - video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') mgid = video_player['props']['media']['video']['config']['uri'] return mgid From 1b0a13f33cfb3644cc718d35951ea85bb1905459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 9 Apr 2021 02:09:52 +0700 Subject: [PATCH 063/120] [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (closes #28702) --- youtube_dl/extractor/youtube.py | 42 +++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6b4c7912c..79e47c919 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -306,7 +306,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._parse_json( self._search_regex( r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) + default='{}'), video_id, fatal=False) or {} def _extract_video(self, renderer): video_id = renderer['videoId'] @@ -2475,7 +2475,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) - def _entries(self, tab, identity_token): + def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -2535,26 +2535,37 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(rich_grid_renderer) + ytcfg = self._extract_ytcfg(item_id, webpage) + client_version = try_get( + ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' + headers = { 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201112.04.01', + 'x-youtube-client-version': client_version, 'content-type': 'application/json', } + + context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { + 'client': { + 'clientName': 'WEB', + 'clientVersion': client_version, + } + } + visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) + + identity_token = self._extract_identity_token(ytcfg, webpage) if identity_token: headers['x-youtube-identity-token'] = identity_token data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, + 'context': context, } for page_num in itertools.count(1): if 
not continuation: break + if visitor_data: + headers['x-goog-visitor-id'] = visitor_data data['continuation'] = continuation['continuation'] data['clickTracking'] = { 'clickTrackingParams': continuation['itct'] @@ -2579,6 +2590,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not response: break + visitor_data = try_get( + response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data + continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) if continuation_contents: @@ -2687,7 +2701,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): alerts.append(text) return '\n'.join(alerts) - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) @@ -2712,7 +2726,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if renderer: title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( - self._entries(selected_tab, identity_token), + self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) @@ -2736,8 +2750,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) - def _extract_identity_token(self, webpage, item_id): - ytcfg = self._extract_ytcfg(item_id, webpage) + def _extract_identity_token(self, ytcfg, webpage): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: @@ -2760,12 +2773,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) - identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + return self._extract_from_tabs(item_id, webpage, data, tabs) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: From 4fb25ff5a3be5206bb72e5c4046715b1529fb2c7 Mon Sep 17 00:00:00 2001 From: Aaron Lipinski Date: Thu, 8 Apr 2021 19:59:36 +1200 Subject: [PATCH 064/120] [maoritv] Add new extractor(closes #24552) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/maoritv.py | 31 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 youtube_dl/extractor/maoritv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ff9110b4..ac33cd996 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -640,6 +640,7 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE from .markiza import ( MarkizaIE, MarkizaPageIE, diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py new file mode 100644 index 000000000..0d23fec75 --- /dev/null +++ b/youtube_dl/extractor/maoritv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import 
InfoExtractor + + +class MaoriTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54', + 'md5': '5ade8ef53851b6a132c051b1cd858899', + 'info_dict': { + 'id': '4774724855001', + 'ext': 'mp4', + 'title': 'Kōrero Mai, Series 1 Episode 54', + 'upload_date': '20160226', + 'timestamp': 1456455018, + 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb', + 'uploader_id': '1614493167001', + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) From 06159135ef148a6ddc632d0c89b90c937d5bb021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 00:07:32 +0700 Subject: [PATCH 065/120] [youtube] Improve URL to extractor routing (closes #27572, closes #28335, closes #28742) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 79e47c919..4d7f3f837 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,6 +46,10 @@ from ..utils import ( ) +def parse_qs(url): + return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -413,16 +417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow $""" % { - 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, 'invidious': '|'.join(_INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( @@ -1208,6 +1205,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + @classmethod + def suitable(cls, url): + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) + def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} @@ -2275,6 +2279,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '#cctv9', }, 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, }] @classmethod @@ -2764,7 +2771,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # Handle both video/playlist URLs - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] if video_id and playlist_id: @@ -2860,12 +2867,16 @@ class YoutubePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) + if YoutubeTabIE.suitable(url): + return False + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) if not qs: qs = {'list': playlist_id} return self.url_result( From 79e4ccfc4b395127bb3e5e957b20b04e75cba355 Mon Sep 17 00:00:00 2001 From: quyleanh Date: Sat, 17 Apr 2021 00:30:10 +0700 Subject: [PATCH 066/120] [pluralsight] Extend anti-throttling timeout (#28712) --- youtube_dl/extractor/pluralsight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index abd08bc28..2d63855df 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -393,7 +393,7 @@ query viewClip { # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. 
self._sleep( - random.randint(2, 5), display_id, + random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: From d01e261a15abd779decae6e0858d8586f7a71621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20=C3=81vila?= Date: Fri, 16 Apr 2021 14:31:34 -0300 Subject: [PATCH 067/120] [youtube] Add more invidious instances (#28706) --- youtube_dl/extractor/youtube.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4d7f3f837..7fa9b473a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -359,21 +359,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', r'(?:www\.)?yewtu\.be', r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', From ea87ed8394127c4bf824688b8780eaf5a804e7a3 Mon Sep 17 00:00:00 2001 From: zraktvor <=> Date: Sat, 10 Apr 2021 15:11:35 +0200 Subject: [PATCH 068/120] [youtube:tab] Detect series playlist on playlists page (closes #28723) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fa9b473a..581687d96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2019,6 +2019,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, + }, { + # playlists, series + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', @@ -2311,7 +2320,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel'): + for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): renderer = item.get('grid%sRenderer' % item_kind) if renderer: return renderer @@ -2344,6 +2353,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) + # show + if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice + show_playlist_url = try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + if show_playlist_url: 
+ playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, + 'playlist id', default=None) + if playlist_id: + title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + "https://www.youtube.com/playlist?list=%s" % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 7c5239547928079513b65f62e4c84aea21ce76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 01:05:44 +0700 Subject: [PATCH 069/120] [youtube:tab] Improve grid extraction (closes #28725) --- youtube_dl/extractor/youtube.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 581687d96..b6945570f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2320,10 +2320,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): - renderer = item.get('grid%sRenderer' % item_kind) - if renderer: - return renderer + assert isinstance(item, dict) + for key, renderer in item.items(): + if not key.startswith('grid') or not key.endswith('Renderer'): + continue + if not isinstance(renderer, dict): + continue + return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: @@ -2333,7 +2336,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not isinstance(renderer, dict): continue title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2341,10 +2345,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + continue # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) + continue # channel channel_id = renderer.get('channelId') if channel_id: @@ -2353,19 +2359,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - # show - if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice - show_playlist_url = try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) - if show_playlist_url: - playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, - 'playlist id', default=None) - if playlist_id: - title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if ep_url: + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): yield self.url_result( - "https://www.youtube.com/playlist?list=%s" % playlist_id, - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break def 
_shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 54558e0baa4d62a94af105cd1d7f8abcbd16b468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 02:27:54 +0700 Subject: [PATCH 070/120] [youtube] Improve stretch extraction and fix stretched ratio calculation (closes #28769) --- youtube_dl/extractor/youtube.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b6945570f..75751d5a6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -812,6 +812,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This video does not exist.', }, + { + # Video with incomplete 'yt:stretch=16:' + 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', + 'only_matching': True, + }, { # Video licensed under Creative Commons 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', @@ -1717,13 +1722,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for keyword in keywords: if keyword.startswith('yt:stretch='): - w, h = keyword.split('=')[1].split(':') - w, h = int(w), int(h) - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break thumbnails = [] for container in (video_details, microformat): From a00a7e0cad3308d999599bf17df5d3e6aba502d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:22:13 +0700 Subject: [PATCH 071/120] [utils] Add support for support for experimental HTTP response status code 308 Permanent Redirect (refs #27877, refs #28768) --- youtube_dl/utils.py | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9..538cc2b63 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ import zlib from .compat import ( compat_HTMLParseError, compat_HTMLParser, + compat_HTTPError, compat_basestring, compat_chr, compat_cookiejar, @@ -2879,12 +2880,61 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - if sys.version_info[0] < 3: - def redirect_request(self, req, fp, code, msg, headers, newurl): - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. 
https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request(newurl, + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From 30a3a4c70fdcad10ef1dc6c3402457a95fe1ae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:23:47 +0700 Subject: [PATCH 072/120] [lbry] Add support for HLS videos (closes #27877, closes #28768) --- youtube_dl/extractor/lbry.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index ae43d56ea..cfd6b8393 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -120,6 +120,26 @@ class LBRYIE(LBRYBaseIE): 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. 
THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -163,10 +183,18 @@ class LBRYIE(LBRYBaseIE): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url info.update({ 'id': claim_id, 'title': title, - 'url': streaming_url, }) return info From cfee2dfe83c5593d46bd0c8e8ce6a3d8c6e42db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:32:04 +0700 Subject: [PATCH 073/120] [utils] PEP 8 --- youtube_dl/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 538cc2b63..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2908,7 +2908,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ m = req.get_method() if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): + or code in (301, 302, 303) and m == "POST")): raise compat_HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation @@ -2930,11 +2930,10 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): CONTENT_HEADERS = ("content-length", "content-type") # NB: don't use dict comprehension for python 2.6 compatibility newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request(newurl, - headers=newheaders, - origin_req_host=req.origin_req_host, - unverifiable=True) + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From f20b505b46e6654464635ca4afb0f37e1f14c57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:47:00 +0700 Subject: [PATCH 074/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22b4fa67d..c249412e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ [utils] Add support for experimental HTTP response status code + 308 Permanent Redirect (#27877, #28768) + +Extractors ++ [lbry] Add support for HLS videos (#27877, #28768) +* [youtube] Fix stretched ratio calculation +* [youtube] Improve stretch extraction (#28769) +* [youtube:tab] Improve grid extraction (#28725) ++ [youtube:tab] Detect series 
playlist on playlists page (#28723) ++ [youtube] Add more invidious instances (#28706) +* [pluralsight] Extend anti-throttling timeout (#28712) +* [youtube] Improve URL to extractor routing (#27572, #28335, #28742) ++ [maoritv] Add support for maoritelevision.com (#24552) ++ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with + continuation requests (#28702) +* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) ++ [pornhub] Extract DASH and HLS formats from get_media end point (#28698) +* [cbssports] Fix extraction (#28682) +* [jamendo] Fix track extraction (#28686) +* [curiositystream] Fix format extraction (#26845, #28668) + + version 2021.04.07 Core From 596b26606cfe20aa9f776ac0658b6bfb1ea95397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:50:09 +0700 Subject: [PATCH 075/120] release 2021.04.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index febbd2344..e2a89d5c2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d7296d0a9..4d7abd775 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 92e616a1a..d8dce6fd4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a 
site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b55739f6c..d95ee291a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbdb8356a..ac5dd2f27 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c249412e2..45d5c2ebf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.17 Core + [utils] Add support for experimental HTTP response status code diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ff9177a2c..a23da1a31 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,6 +3,7 @@ - **20min** - **220.ro** - **23video** + - **247sports** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -160,7 +161,8 @@ - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - - **CBSSports** + - **cbssports** + - **cbssports:embed** - **CCMA** - **CCTV**: 央视网 - **CDA** @@ -490,6 +492,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **MaoriTV** - **Markiza** - **MarkizaPage** - **massengeschmack.tv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a6b1b8dce..2b041d593 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.07' +__version__ = '2021.04.17' From 9f6c03a00602eb1119e43a522cf50682f6d6a6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 05:05:31 +0700 Subject: [PATCH 076/120] [cbsnews] Fix extraction for python <3.6 (closes #23359) --- youtube_dl/extractor/cbsnews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 
100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') From 41920fc80e4fe4a8996aeb31a04826a5a2534814 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 20 Apr 2021 20:51:55 +0100 Subject: [PATCH 077/120] [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) --- youtube_dl/extractor/bbc.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e8d000bbb..71ea25881 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -25,8 +26,10 @@ from ..utils import ( js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = try_get( + media, + lambda x: x['summary']['blocks'][0]['model']['text'], + compat_str) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') From dab83a25972e0dbcc69583bf78d2a992f581563d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:00:56 +0700 Subject: [PATCH 078/120] [bbc] Extract full description from __INITIAL_DATA__ (refs #28774) --- youtube_dl/extractor/bbc.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 71ea25881..247d982ce 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -770,7 +770,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xzws1', 'ext': 'mp4', 'title': "Pluto may have 'nitrogen glaciers'", - 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', 'thumbnail': 
r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', @@ -1176,10 +1176,16 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) - item_desc = try_get( - media, - lambda x: x['summary']['blocks'][0]['model']['text'], - compat_str) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) item_time = None for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: if try_get(meta, lambda x: x['label']) == 'Published': From 32290307a45260885b2210aa3c2a57e64abf8c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:56:04 +0700 Subject: [PATCH 079/120] [youtube] Fix lazy extractors (closes #28780) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75751d5a6..c16dc7ab8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1219,6 +1219,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: return False @@ -2910,6 +2913,9 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False From 5ad69d3d0e7d1d8d15a1f2497d602b1b91fcc74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:45:13 +0700 Subject: [PATCH 080/120] [test_youtube_misc] Move YoutubeIE.extract_id test into separate module --- test/test_all_urls.py | 9 --------- test/test_youtube_misc.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 test/test_youtube_misc.py diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..365b66bad 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -70,15 +70,6 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) 
self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..e18e71101 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() From c4a451bcdd2b743fdb96fcbae261c86ed91022ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:47:29 +0700 Subject: [PATCH 081/120] [test_execution] Add test for lazy extractors (refs #28780) --- test/test_execution.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_execution.py b/test/test_execution.py index 11661bb68..32948d93e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('youtube_dl/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() From ac19c3ac8035fbf9369fd4bd336c9045d4eeafa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:35:39 +0700 Subject: [PATCH 082/120] [go] Improve video id extraction (closes #25207, closes #25216, closes #26058) --- youtube_dl/extractor/go.py | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 0d731e90a..878ba14e6 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, + try_get, urlencode_postdata, ExtractorError, ) @@ -116,6 +118,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 
'only_matching': True, @@ -149,14 +163,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From 7e8b3f9439ebefb3a3a4e5da9c0bd2b595976438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:37:51 +0700 Subject: [PATCH 083/120] [youtube] Remove unused code --- youtube_dl/extractor/youtube.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c16dc7ab8..0c52e5a8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -65,11 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. 
From 0db79d8181c1c3ebf74b9b6d38262c8dcaaf0f4f Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Sat, 24 Apr 2021 20:58:03 +0900 Subject: [PATCH 084/120] [tver] Redirect all downloads to Brightcove (#28849) --- youtube_dl/extractor/tver.py | 37 +++++++++++------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index a54f49319..a4a30b1e6 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, - strip_or_none, try_get, ) @@ -45,32 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - title = main['title'] - subtitle = strip_or_none(main.get('subtitle')) - if subtitle: - title += ' - ' + subtitle - info.update({ - 'title': title, - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info From c6ab79299034492c5af9dd29b0c49585e4efc4cd Mon Sep 17 00:00:00 2001 From: catboy <79282513+catboy-oss@users.noreply.github.com> Date: Sat, 24 Apr 2021 12:10:35 +0000 Subject: [PATCH 085/120] [medaltv] Fix extraction (#28807) numeric clip ids are no longer used by medal, and integer user ids are now sent as strings. 
--- youtube_dl/extractor/medaltv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 1603b55f6..ef2283dea 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,32 +15,32 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } }] From 999329cf6b29878d054c6ccdd24573489a2886d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Apr 2021 23:52:16 +0700 Subject: [PATCH 086/120] [workflows/ci.yml] Fix install nose for Jython --- .github/workflows/ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9dc47a71..97b50aedc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,7 +53,14 @@ jobs: java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: From 57eaaff5cf1ae63d4e3ae89f301c0f9b3e86bb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Apr 2021 22:52:28 +0700 Subject: [PATCH 087/120] [francetvinfo] Improve video id extraction (closes #28792) --- youtube_dl/extractor/francetv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7cc88bf18..e4ec2e200 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "
]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + r'(?:data-id| Date: Sun, 25 Apr 2021 19:32:47 +0200 Subject: [PATCH 088/120] [xfileshare] Add support for wolfstream.tv (#28858) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod From 346dd3b5e87503c52fc6800d9f73cd6bdbce71bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:29:50 +0700 Subject: [PATCH 089/120] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d5c2ebf..c59984d63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version + +Extractors ++ [xfileshare] Add support for wolfstream.tv (#28858) +* [francetvinfo] Improve video id extraction (#28792) +* [medaltv] Fix extraction (#28807) +* [tver] Redirect all downloads to Brightcove (#28849) +* [go] Improve video id extraction (#25207, #25216, #26058) +* [youtube] Fix lazy extractors (#28780) ++ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) +* [cbsnews] Fix extraction for python <3.6 (#23359) + + version 2021.04.17 Core From 273964d190fb048477e71114c4734fcb819c5c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:33:30 +0700 Subject: [PATCH 090/120] release 2021.04.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2a89d5c2..6ece3e031 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - 
Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 4d7abd775..f923b2d5f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d8dce6fd4..97d605653 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d95ee291a..73a806833 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index ac5dd2f27..ee19a75f5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c59984d63..f15c84225 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.26 Extractors + [xfileshare] Add support for wolfstream.tv (#28858) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a23da1a31..88d474de4 100644 --- a/docs/supportedsites.md +++ 
b/docs/supportedsites.md @@ -1162,7 +1162,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b041d593..576f721db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.17' +__version__ = '2021.04.26' From 94520568b399d5b4f35a9708f5643d8b16c6c4ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 02:16:47 +0700 Subject: [PATCH 091/120] [workflows/ci.yml] Update link to jython-installer --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97b50aedc..90bd63c32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,7 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose From e33dfb445c547f210a7060e8b7abd592dbe42808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 17:53:27 +0700 Subject: [PATCH 092/120] [tv2dk] Fix extraction (closes #28888) --- youtube_dl/extractor/tv2dk.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) From d2f72c40db0d1fe1102c98c017682b283579ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:09:32 +0700 Subject: [PATCH 093/120] [svtplay] Improve extraction (closes #28507, closes #28876) --- youtube_dl/extractor/svt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index aba9bb447..a5bb6daa7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) - (?:.*?modalId=(?P[\da-zA-Z-]+))? + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ @@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From ff04d43c469e4cf8c14ba3e2e79da0d35ef3c7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:33:05 +0700 Subject: [PATCH 094/120] [xtube] Fix formats extraction (closes #28870) --- youtube_dl/extractor/xtube.py | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 18969058f..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 
'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) From d1b9a5e2eff1c075b38815a3d2b25eb8b3f626bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 19:00:39 +0700 Subject: [PATCH 095/120] [twitter] Improve formats extraction from vmap URL (closes #28909) --- youtube_dl/extractor/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ed495f297..cfa7a7326 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f] def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] urls = [] From a0df8a06178e530a1097f177a1faf1d2c609ac99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 22:53:30 +0700 Subject: [PATCH 096/120] [cda] Improve extraction (closes #28709, closes #28937) --- youtube_dl/extractor/cda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -133,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) From 0204838163bd4068fe23b40414573d1307d817ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 01:57:02 +0700 Subject: [PATCH 097/120] [kaltura] Make embed code alternatives actually work --- youtube_dl/extractor/kaltura.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460d..5d0ff0418 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: 
https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,8 +142,8 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -151,7 +151,7 @@ class KalturaIE(InfoExtractor): [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: From fe05191b8c59538a48b6cbc95f4fe54fc7e6a0ac Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Tue, 4 May 2021 14:14:35 -0500 Subject: [PATCH 098/120] [kaltura] Improve iframe extraction (#28969) Co-authored-by: Sergey M. --- youtube_dl/extractor/gdcvault.py | 15 +++++++++++++++ youtube_dl/extractor/kaltura.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 2f555c1d4..5ad40ee23 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -102,6 +102,21 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, ] def _login(self, webpage_url, display_id): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 5d0ff0418..c731612c4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -145,7 +145,7 @@ class KalturaIE(InfoExtractor): ''', webpage)) or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) From b8645c1f5885522ec8bb77649f49ce842e947c25 Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Sat, 17 Apr 2021 23:15:10 -0500 Subject: [PATCH 099/120] [dispeak] Improve FLV extraction (closes #13513) --- youtube_dl/extractor/dispeak.py | 50 ++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c345e0274..e776ac00c 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -32,6 +32,14 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624 + 'url': 
'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, }] def _parse_mp4(self, metadata): @@ -84,26 +92,28 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) + slide_video_path = xpath_text(metadata, './slideVideo') + if slide_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xpath_text(metadata, './speakerVideo') + if speaker_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) return formats def _real_extract(self, url): From 1786cd3fe4e555b83bdd3eea77ade3477293330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:30:42 +0700 Subject: [PATCH 100/120] [dispeak] DRY and update tests (closes #28970) --- youtube_dl/extractor/dispeak.py | 34 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index e776ac00c..276fd4b09 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -33,13 +33,17 @@ class DigitallySpeakingIE(InfoExtractor): 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, }, { - # From https://gdcvault.com/play/1016624 + # From https://gdcvault.com/play/1016624, empty speakerVideo 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', 'info_dict': { 'id': '201210-822101_1349794556671DDDD', 'ext': 'flv', 'title': 'Pre-launch - Preparing to Take the Plunge', }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -92,27 +96,19 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo') - if slide_video_path: + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), + 'play_path': remove_end(video_path, 
'.flv'), 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo') - if speaker_video_path: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'preference': preference, + 'format_id': format_id, }) return formats From 504e4d804df0ee666d80ba6796017cf97e026c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:44:29 +0700 Subject: [PATCH 101/120] [gdcvault] Add support for HTML5 videos --- youtube_dl/extractor/gdcvault.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 5ad40ee23..acc6478b8 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -117,6 +118,11 @@ class GDCVaultIE(InfoExtractor): 'skip_download': True, }, }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -190,7 +196,18 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex( r'