From 96a0ad4778da7f30ed5be627f2c10df6d0af3ca8 Mon Sep 17 00:00:00 2001
From: Zenon Mousmoulas
Date: Sat, 13 Nov 2021 11:50:05 +0200
Subject: [PATCH] MegaTVComEmbedIE: Make canonical URL extraction more robust

---
 youtube_dl/extractor/megatvcom.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py
index 26fbcff4b..46db816d8 100644
--- a/youtube_dl/extractor/megatvcom.py
+++ b/youtube_dl/extractor/megatvcom.py
@@ -166,10 +166,26 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
                 url = '%s:%s' % (scheme, url)
             yield url
 
+    def _match_canonical_url(self, webpage):
+        LINK_RE = r'''(?x)
+        <link(?:
+            rel=(?P<_q1>%(quot_re)s)(?P<canonical>canonical)(?P=_q1)|
+            href=(?P<_q2>%(quot_re)s)(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
+            [^>]*?
+        )+>
+        ''' % {'quot_re': r'["\']'}
+        for mobj in re.finditer(LINK_RE, webpage):
+            canonical, href = mobj.group('canonical', 'href')
+            if canonical and href:
+                return unescapeHTML(href)
+
     def _real_extract(self, url):
         webpage = self._download_webpage(url, 'N/A')
         player_attrs = self._extract_player_attrs(webpage)
-        canonical_url = player_attrs['share_url']
+        canonical_url = player_attrs.get('share_url') or \
+            self._match_canonical_url(webpage)
+        if not canonical_url:
+            raise ExtractorError('canonical URL not found')
         video_id = compat_parse_qs(compat_urllib_parse_urlparse(
             canonical_url).query)['p'][0]