From fecc1dc727c1a6964b9b5eb063d45830c14447df Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Mon, 25 Jan 2021 00:06:57 -0800 Subject: [PATCH 1/9] [kan] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kan.py | 101 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/kan.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef57f5556..ed7f1c3ba 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -521,6 +521,7 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE +from .kan import KanIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py new file mode 100644 index 000000000..4bc43c197 --- /dev/null +++ b/youtube_dl/extractor/kan.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..utils import unified_strdate, parse_duration + + +def get_thumbnail(data): + for media in data.get('media_group', []): + if media.get('type') == 'image': + for item in media.get('media_item'): + thumbnail = item.get('src') + if thumbnail: + return thumbnail + + +class KanIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kan\.org\.il/(?:[iI]tem/\?item[iI]d|program/\?cat[iI]d)=(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.kan.org.il/Item/?itemId=74658', + 'md5': 'c28763bdb61c1bb7823528dd024e6129', + 'info_dict': { + 'id': '74658', + 'ext': 'mp4', + 'title': 'העד - פרק 2', + 'thumbnail': r're:^https://.*36805_A\.jpeg$', + 'description': 'הגופות ממשיכות להיערם, אך איזי עדיין מפקפק בחשדות נגד ברק', + 'creator': 'מערכת כאן', + 'release_date': '20200803', + 'duration': 2393} + }, { + 'url': 'https://www.kan.org.il/program/?catId=1636', + 'playlist_mincount': 9, + 'info_dict': { + 'id': '1636', + 'title': 'מנאייכ - פרקים מלאים לצפייה ישירה | כאן', + 'description': 'md5:9dfbd501189d08674d20762464c5301b' + } + }] + _GEO_COUNTRIES = ['IL'] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, + video_id, + headers=self.geo_verification_headers()) + if 'itemId' in url.lower(): + return self._extract_item(video_id, webpage) + elif 'catid' in url.lower(): + return self._extract_list(video_id, webpage) + return {} + + def _extract_list(self, list_id, webpage): + ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + entries = [] + for video_id in ids: + url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id + webpage = self._download_webpage( + url, + video_id, + headers=self.geo_verification_headers()) + entries.append(self._extract_item(video_id, webpage)) + return { + '_type': 'playlist', + 'id': list_id, + 'entries': entries, + 'title': title, + 'description': description + } + + def _extract_item(self, video_id, webpage): + data = self._parse_json( + self._search_regex( + r'', + webpage, 'data'), + video_id) + title = data.get('title') or \ + self._og_search_title(webpage) or \ + self._html_search_regex(r'([^<]+)', webpage, 'title') + description = data.get('summary') or \ + self._og_search_description(webpage, fatal=False) + creator = data.get('author', {}).get('name') or \ + self._og_search_property('site_name', webpage, fatal=False) + thumbnail = get_thumbnail(data) + m3u8_url = data.get('content', {}).get('src') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + return { + '_type': 'video', + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + 'description': description, + 'creator': creator, + 'release_date': unified_strdate(data.get('published')), + 'duration': parse_duration(data.get('extensions', {}).get('duration')) + } From 55079794029de49161dd88f91dd34b7bbd4e30f1 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Mon, 25 Jan 2021 01:19:43 -0800 Subject: [PATCH 2/9] typo fix --- youtube_dl/extractor/kan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index 4bc43c197..b8b1124d9 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -46,7 +46,7 @@ class KanIE(InfoExtractor): url, video_id, headers=self.geo_verification_headers()) - if 'itemId' in url.lower(): + if 'itemid' in url.lower(): return self._extract_item(video_id, webpage) elif 'catid' in url.lower(): return self._extract_list(video_id, webpage) From e3a900e707fecdd352b0a87b13c1ff09af668e76 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Tue, 26 Jan 2021 00:44:23 -0800 Subject: [PATCH 3/9] minor fixes --- youtube_dl/extractor/kan.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index b8b1124d9..8ca4a160e 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -9,7 +9,7 @@ from ..utils import unified_strdate, parse_duration def get_thumbnail(data): for media in data.get('media_group', []): if media.get('type') == 'image': - for item in media.get('media_item'): + for item in media.get('media_item', []): thumbnail = item.get('src') if thumbnail: return thumbnail @@ -53,11 +53,11 @@ class KanIE(InfoExtractor): return {} def _extract_list(self, list_id, webpage): - ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) + video_ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) title = self._og_search_title(webpage) description = self._og_search_description(webpage) entries = [] - for video_id in ids: + for video_id in video_ids: url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id webpage = self._download_webpage( url, From e6c7b3c1543cd0b9f9f1a3a5ab11bb1c25adc818 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Wed, 27 Jan 2021 16:03:01 -0800 Subject: [PATCH 4/9] code review fixes --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/kan.py | 132 ++++++++++++++++------------- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ed7f1c3ba..28b3e84be 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -521,7 +521,10 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kan import KanIE +from .kan import ( + KanEpisodeIE, + KanPlaylistIE +) from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index 8ca4a160e..c5ed61d05 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -3,7 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import unified_strdate, parse_duration +from ..utils import ( + ExtractorError, + parse_duration, + try_get, + unified_strdate, +) def get_thumbnail(data): @@ -15,9 +20,49 @@ def get_thumbnail(data): return thumbnail -class KanIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kan\.org\.il/(?:[iI]tem/\?item[iI]d|program/\?cat[iI]d)=(?P[0-9]+)' - _TESTS = [{ +class KanBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['IL'] + + def download_webpage(self, url, video_id): + return self._download_webpage( + url, + video_id, + headers=self.geo_verification_headers()) + + def extract_item(self, video_id, webpage): + data = self._parse_json( + self._search_regex( + r'', + webpage, + 'data', + ), + video_id, + ) + title = data.get('title') or self._og_search_title(webpage) + description = data.get('summary') or \ + self._og_search_description(webpage, fatal=False) + creator = try_get(data, lambda x: x['author']['name'], str) or \ + self._og_search_property('site_name', webpage, fatal=False) + thumbnail = get_thumbnail(data) + m3u8_url = try_get(data, lambda x: x['content']['src'], str) + if not m3u8_url: + raise ExtractorError('Unable to extract m3u8 url') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + 'description': description, + 'creator': creator, + 'release_date': unified_strdate(data.get('published')), + 'duration': parse_duration(data.get('extensions', {}).get('duration')), + } + + +class KanEpisodeIE(KanBaseIE): + _VALID_URL = r'https?://(?:www\.)?kan\.org\.il/[iI]tem/\?item[iI]d=(?P[0-9]+)' + _TEST = { 'url': 'https://www.kan.org.il/Item/?itemId=74658', 'md5': 'c28763bdb61c1bb7823528dd024e6129', 'info_dict': { @@ -28,74 +73,45 @@ class KanIE(InfoExtractor): 'description': 'הגופות ממשיכות להיערם, אך איזי עדיין מפקפק בחשדות נגד ברק', 'creator': 'מערכת כאן', 'release_date': '20200803', - 'duration': 2393} - }, { + 'duration': 2393, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.extract_item(video_id, self.download_webpage(url, video_id)) + + +class KanPlaylistIE(KanBaseIE): + _VALID_URL = r'https?://(?:www\.)?kan\.org\.il/program/\?cat[iI]d=(?P[0-9]+)' + _TEST = { 'url': 'https://www.kan.org.il/program/?catId=1636', 'playlist_mincount': 9, 'info_dict': { 'id': '1636', 'title': 'מנאייכ - פרקים מלאים לצפייה ישירה | כאן', - 'description': 'md5:9dfbd501189d08674d20762464c5301b' - } - }] - _GEO_COUNTRIES = ['IL'] + 'description': 'md5:9dfbd501189d08674d20762464c5301b', + }, + } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, - video_id, - headers=self.geo_verification_headers()) - if 'itemid' in url.lower(): - return self._extract_item(video_id, webpage) - elif 'catid' in url.lower(): - return self._extract_list(video_id, webpage) - return {} - - def _extract_list(self, list_id, webpage): + list_id = self._match_id(url) + webpage = self.download_webpage(url, list_id) video_ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) entries = [] for video_id in video_ids: url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id - webpage = self._download_webpage( - url, + entries.append(self.extract_item( video_id, - headers=self.geo_verification_headers()) - entries.append(self._extract_item(video_id, webpage)) + self.download_webpage(url, video_id)) + ) + if not entries: + raise ExtractorError('Unable to extract playlist entries') + return { '_type': 'playlist', 'id': list_id, 'entries': entries, - 'title': title, - 'description': description - } - - def _extract_item(self, video_id, webpage): - data = self._parse_json( - self._search_regex( - r'', - webpage, 'data'), - video_id) - title = data.get('title') or \ - self._og_search_title(webpage) or \ - self._html_search_regex(r'([^<]+)', webpage, 'title') - description = data.get('summary') or \ - self._og_search_description(webpage, fatal=False) - creator = data.get('author', {}).get('name') or \ - self._og_search_property('site_name', webpage, fatal=False) - thumbnail = get_thumbnail(data) - m3u8_url = data.get('content', {}).get('src') - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - return { - '_type': 'video', - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': description, - 'creator': creator, - 'release_date': unified_strdate(data.get('published')), - 'duration': parse_duration(data.get('extensions', {}).get('duration')) + 'title': self._og_search_title(webpage, fatal=False), + 'description': self._og_search_description(webpage), } From c0fd80c113c05cdddedc74078cf66b29400ee970 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Sat, 13 Feb 2021 15:52:37 -0800 Subject: [PATCH 5/9] code review fixes --- youtube_dl/extractor/kan.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index c5ed61d05..fb57ac2e3 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -32,31 +32,33 @@ class KanBaseIE(InfoExtractor): def extract_item(self, video_id, webpage): data = self._parse_json( self._search_regex( - r'', + r']+id="kan_app_search_data"[^>]*>([^<]+)', webpage, 'data', ), video_id, ) title = data.get('title') or self._og_search_title(webpage) + m3u8_url = try_get(data, lambda x: x['content']['src'], compat_str) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + if not formats: + raise ExtractorError('Unable to extract video formats') description = data.get('summary') or \ self._og_search_description(webpage, fatal=False) - creator = try_get(data, lambda x: x['author']['name'], str) or \ + creator = try_get(data, lambda x: x['author']['name'], compat_str) or \ self._og_search_property('site_name', webpage, fatal=False) thumbnail = get_thumbnail(data) - m3u8_url = try_get(data, lambda x: x['content']['src'], str) - if not m3u8_url: - raise ExtractorError('Unable to extract m3u8 url') return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, - 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + 'formats': formats, 'description': description, 'creator': creator, 'release_date': unified_strdate(data.get('published')), - 'duration': parse_duration(data.get('extensions', {}).get('duration')), + 'duration': parse_duration( + try_get(data, lambda x: x['extensions']['duration'])) } @@ -100,10 +102,10 @@ class KanPlaylistIE(KanBaseIE): video_ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) entries = [] for video_id in video_ids: - url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id + video_url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id entries.append(self.extract_item( video_id, - self.download_webpage(url, video_id)) + self.download_webpage(video_url, video_id)) ) if not entries: raise ExtractorError('Unable to extract playlist entries') From 440aba21de485dfa23ffc135a6d11516cc5199b0 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Sat, 13 Feb 2021 20:37:15 -0800 Subject: [PATCH 6/9] fix typo --- youtube_dl/extractor/kan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index fb57ac2e3..84f8c4f77 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +from ..compat import compat_str import re from .common import InfoExtractor From 279539e995e5aca23c3be39a442574f013391d66 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Tue, 18 May 2021 18:17:43 -0700 Subject: [PATCH 7/9] fix trailing parentheses --- youtube_dl/extractor/kan.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index 84f8c4f77..48d000007 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -8,8 +8,7 @@ from ..utils import ( ExtractorError, parse_duration, try_get, - unified_strdate, -) + unified_strdate) def get_thumbnail(data): @@ -35,10 +34,8 @@ class KanBaseIE(InfoExtractor): self._search_regex( r']+id="kan_app_search_data"[^>]*>([^<]+)', webpage, - 'data', - ), - video_id, - ) + 'data'), + video_id) title = data.get('title') or self._og_search_title(webpage) m3u8_url = try_get(data, lambda x: x['content']['src'], compat_str) formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') @@ -106,8 +103,7 @@ class KanPlaylistIE(KanBaseIE): video_url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id entries.append(self.extract_item( video_id, - self.download_webpage(video_url, video_id)) - ) + self.download_webpage(video_url, video_id))) if not entries: raise ExtractorError('Unable to extract playlist entries') From 803b071036b33d0378a2e01e37c7e0bd33e64db2 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Tue, 18 May 2021 18:32:26 -0700 Subject: [PATCH 8/9] add skip_download to tests using ffmpeg --- youtube_dl/extractor/kan.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index 48d000007..503dd2287 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -75,6 +75,9 @@ class KanEpisodeIE(KanBaseIE): 'release_date': '20200803', 'duration': 2393, }, + 'params': { + 'skip_download': True + }, } def _real_extract(self, url): From 57c3cb420c31e78732566b77dfb16fd78ebfd310 Mon Sep 17 00:00:00 2001 From: Yuval Hager Date: Wed, 19 May 2021 07:38:25 -0700 Subject: [PATCH 9/9] remove geo_countries --- youtube_dl/extractor/kan.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/kan.py b/youtube_dl/extractor/kan.py index 503dd2287..9dd18bc8f 100644 --- a/youtube_dl/extractor/kan.py +++ b/youtube_dl/extractor/kan.py @@ -21,13 +21,10 @@ def get_thumbnail(data): class KanBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['IL'] - def download_webpage(self, url, video_id): return self._download_webpage( url, - video_id, - headers=self.geo_verification_headers()) + video_id) def extract_item(self, video_id, webpage): data = self._parse_json(