From 91dd25fe1e18aa3f617005799a8f5018a551c7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Dec 2020 00:59:25 +0700 Subject: [PATCH] [extractor/common] Add support for dl8-* media tags (closes #27283) --- youtube_dl/extractor/common.py | 3 ++- youtube_dl/extractor/generic.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 877873ebd..dd07a1cae 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2513,7 +2513,8 @@ class InfoExtractor(object): # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) - _MEDIA_TAG_NAME_RE = r'(?:amp-)?(video|audio)' + # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' media_tags = [(media_tag, media_type, '') for media_tag, media_type in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d2ba07839..85dc1d02d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2466,7 +2466,9 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) + # FIXME: unescaping the whole page may break URLs, so this is commented out for now. + # There should probably be a second run of the generic extractor on the unescaped webpage. + # webpage = compat_urllib_parse_unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294