1
0
Fork 0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-11-24 19:22:06 +00:00

Add Ant1NewsGr IEs

* Add new IEs
  * Ant1NewsGrBaseIE: Base IE class
  * Ant1NewsGrWatchIE: Extract videos from TV VOD pages
  * Ant1NewsGrArticleIE: Extract videos from news articles
  * Ant1NewsGrEmbedIE: Extract iframe-embeddable ant1news.gr videos
* NB: There is a different platform at vod.antenna.gr, which is not
  covered here
* The Generic extractor can also be used to extract videos from news
  article pages (through Ant1NewsGrEmbedIE._extract_urls), however a
  specific IE is used to extract JSON-LD @type='NewsArticle' metadata
  * However that does not work currently, as _json_ld does not support
    @graph nesting
* Ant1NewsGrArticleIE defers to Ant1NewsGrEmbedIE, either as a playlist
  or a single video
* Ant1NewsGrWatchIE and Ant1NewsGrEmbedIE query an API endpoint to
  extract metadata, get the respective stream/source URLs and detect
  video formats
  * The endpoint HTTP path varies per IE
  * Ant1NewsGrEmbedIE first resolves any redirects for its own URL, to
    derive the proper base URL for the API query
This commit is contained in:
Zenon Mousmoulas 2021-11-11 08:21:41 +02:00
parent a803582717
commit 9af0f299bf
3 changed files with 203 additions and 0 deletions

View file

@ -0,0 +1,190 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
smuggle_url,
unsmuggle_url,
unescapeHTML,
)
class Ant1NewsGrBaseIE(InfoExtractor):
    """Shared helpers for the ant1news.gr extractors.

    Subclasses are expected to define ``_API_PATH``, the HTTP path of the
    JSON endpoint queried by ``_download_api_data``.
    """

    @staticmethod
    def _smuggle_parent_info(url, **info_dict):
        """Return ``url`` with ``info_dict`` smuggled under 'parent_info'."""
        return smuggle_url(url, {'parent_info': info_dict})

    @staticmethod
    def _unsmuggle_parent_info(url):
        """Return ``(plain_url, parent_info)`` for a possibly smuggled URL.

        ``parent_info`` is an empty dict when nothing was smuggled or when
        the smuggled data carries no 'parent_info' entry.
        """
        unsmuggled_url, data = unsmuggle_url(url, default={'parent_info': {}})
        return unsmuggled_url, data.get('parent_info', {})

    def _download_api_data(self, netloc, cid, scheme='https'):
        """Download the JSON document describing the video ``cid``.

        The request URL is assembled from ``scheme``, ``netloc`` and the
        subclass-provided ``_API_PATH``; ``cid`` is sent as a query parameter.
        """
        url_parts = (scheme, netloc, self._API_PATH, None, None, None)
        url = compat_urlparse.urlunparse(url_parts)
        return self._download_json(
            url, cid,
            'Downloading JSON',
            'Unable to download JSON',
            query={'cid': cid})

    def _download_and_extract_api_data(self, video_id, *args, **kwargs):
        """Query the API and build an info dict for ``video_id``.

        Positional and keyword arguments after ``video_id`` are forwarded to
        ``_download_api_data``.

        Raises ExtractorError when the API response carries no source URL.
        """
        info = self._download_api_data(*args, **kwargs)
        source = info.get('url')
        if not source:
            raise ExtractorError('no source found for %s' % video_id)
        if determine_ext(source) == 'm3u8':
            formats = self._extract_m3u8_formats(source, video_id, 'mp4')
        else:
            # A direct media URL must still be wrapped in a format dict;
            # a bare string is not a valid entry of 'formats'.
            formats = [{'url': source}]
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': info['title'],
            # The thumbnail is optional, so its absence must not abort
            # the extraction.
            'thumbnail': info.get('thumb'),
            'formats': formats,
        }
class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
    """Extractor for ant1news.gr ``/watch/<id>/`` pages."""

    IE_NAME = 'ant1newsgr:watch'
    IE_DESC = 'ant1news.gr videos'
    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/watch/(?P<id>\d+)/'
    _API_PATH = '/templates/data/player'

    _TEST = {
        'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
        'md5': '60a984da5ffc98c9924e6d9dd46c6f04',
        'info_dict': {
            'id': '1506168',
            'ext': 'mp4',
            'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
            'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for a watch page.

        The page itself is fetched only for its OpenGraph description; all
        other fields come from the API queried on the same scheme and host.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The API lives on the same scheme/host as the watch page.
        page_location = compat_urllib_parse_urlparse(url)
        result = self._download_and_extract_api_data(
            video_id, page_location.netloc, video_id,
            scheme=page_location.scheme)

        result['description'] = self._og_search_description(webpage)
        return result
class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
    """Extractor for ant1news.gr news articles embedding player iframes.

    The actual extraction is delegated to Ant1NewsGrEmbedIE, either as a
    single video or as a playlist when the article embeds several videos.
    """

    IE_NAME = 'ant1newsgr:article'
    IE_DESC = 'ant1news.gr articles'
    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'

    _TESTS = [{
        'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
        'md5': 'eb635a194c15272c2611a751766b0200',
        'info_dict': {
            'id': '_xvg/m_cmbatw=',
            'ext': 'mp4',
            'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
        },
        'expected_warnings': [r'^[Uu]nable to extract JSON-LD'],
    }, {
        'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
        'info_dict': {
            'id': '620286',
            'title': 'md5:91fe569e952e4d146485740ae927662b',
        },
        'expected_warnings': [r'^[Uu]nable to extract JSON-LD'],
        'playlist_mincount': 2,
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return a url_result (one embed) or a playlist (several embeds).

        Raises ExtractorError when the article contains no player embeds.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        info = self._search_json_ld(
            webpage, video_id, expected_type='NewsArticle', fatal=False) or {}
        # workaround as _json_ld does not recognize @graph nesting; also
        # covers JSON-LD that was parsed but carries no usable title, which
        # would otherwise fail with a KeyError further down
        if not info.get('title'):
            info['title'] = self._og_search_title(webpage)

        # The article metadata is smuggled into every embed URL so that the
        # embed extractor can reuse it.
        embed_urls = list(
            Ant1NewsGrEmbedIE._extract_urls(webpage, url, **info))
        if not embed_urls:
            raise ExtractorError('no videos found for %s' % video_id)
        if len(embed_urls) == 1:
            return self.url_result(
                embed_urls[0], ie=Ant1NewsGrEmbedIE.ie_key(),
                video_title=info['title'])
        return self.playlist_from_matches(
            embed_urls, video_id, info['title'],
            ie=Ant1NewsGrEmbedIE.ie_key())
class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
    """Extractor for the iframe-embeddable antenna.gr / ant1news.gr player."""

    IE_NAME = 'ant1newsgr:embed'
    IE_DESC = 'ant1news.gr embedded videos'
    # Verbose-mode pattern: the line breaks inside it are not significant.
    _VALID_URL = r'''(?x)https?://(?:[a-zA-Z0-9\-]+\.)?
(?:antenna|ant1news)\.gr/templates/pages/player
\?(?:(?:cid=(?P<id>[^&#]+)|[^&=#]+=[^&#]+)&?)+'''
    _API_PATH = '/news/templates/data/jsonPlayer'

    _TEST = {
        'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
        'md5': '12872b12af18b5dbf76528786728de8c',
        'info_dict': {
            'id': '3f_li_c_az_jw_y_u=',
            'ext': 'mp4',
            'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
        },
    }

    @classmethod
    def _extract_urls(cls, webpage, origin_url=None, **parent_info):
        """Yield the player URLs embedded as iframes in ``webpage``.

        ``origin_url`` supplies the scheme for protocol-relative iframe
        sources ('https' is used when it is absent or has no scheme).
        ``parent_info`` is smuggled into every yielded URL.
        """
        # make the scheme in _VALID_URL optional
        _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
        # simplify the query string part of _VALID_URL; after extracting iframe
        # src, the URL will be matched again
        _URL_RE = _URL_RE.split(r'\?', 1)[0] + r'\?(?:(?!(?P=_q1)).)+'
        EMBED_RE = r'''(?x)
<iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)
''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE}

        # The fallback scheme does not depend on the individual match, so
        # it is resolved once; an origin URL without a scheme must not
        # produce a '://...' result.
        origin_scheme = None
        if origin_url:
            origin_scheme = compat_urllib_parse_urlparse(origin_url).scheme
        default_scheme = origin_scheme or 'https'

        for mobj in re.finditer(EMBED_RE, webpage):
            url = unescapeHTML(mobj.group('url'))
            if url.startswith('//'):
                url = '%s:%s' % (default_scheme, url)
            if not cls.suitable(url):
                continue
            # parent_info is always a dict here (it collects **kwargs)
            yield cls._smuggle_parent_info(url, **parent_info)

    def _real_extract(self, url):
        """Return the info dict for an embedded player URL.

        A 'timestamp' smuggled in by a parent extractor is used when the
        API data does not provide one.
        """
        url, parent_info = self._unsmuggle_parent_info(url)
        video_id = self._match_id(url)

        # resolve any redirects, to derive the proper base URL for the API query
        canonical_url = self._request_webpage(
            HEADRequest(url), video_id,
            note='Resolve canonical player URL',
            errnote='Could not resolve canonical player URL').geturl()
        scheme, netloc, _, _, query, _ = compat_urllib_parse_urlparse(
            canonical_url)
        # Fall back to the id taken from the original URL if the redirect
        # target no longer carries a cid parameter.
        cid = compat_parse_qs(query).get('cid', [video_id])[0]

        info = self._download_and_extract_api_data(
            video_id, netloc, cid, scheme=scheme)
        if 'timestamp' not in info and 'timestamp' in parent_info:
            info['timestamp'] = parent_info['timestamp']
        return info

View file

@ -1038,6 +1038,11 @@ from .rutube import (
RutubePersonIE,
RutubePlaylistIE,
)
from .ant1newsgr import (
Ant1NewsGrWatchIE,
Ant1NewsGrArticleIE,
Ant1NewsGrEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE

View file

@ -100,6 +100,7 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .ant1newsgr import Ant1NewsGrEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3199,6 +3200,13 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for ant1news.gr embeds
ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage, url,
title=video_title))
if ant1newsgr_urls:
return self.playlist_from_matches(
ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls: