mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2024-11-22 02:01:50 +00:00
[InfoExtractor] Add _match_valid_url() class method and refactor
* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
This commit is contained in:
parent
a190b55964
commit
b2ba24bb02
3 changed files with 49 additions and 22 deletions
|
@ -4,6 +4,7 @@ from inspect import getsource
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
from os.path import dirname as dirn
|
from os.path import dirname as dirn
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
|
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
|
||||||
|
@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
|
||||||
with open('devscripts/lazy_load_template.py', 'rt') as f:
|
with open('devscripts/lazy_load_template.py', 'rt') as f:
|
||||||
module_template = f.read()
|
module_template = f.read()
|
||||||
|
|
||||||
|
|
||||||
|
def get_source(m):
|
||||||
|
return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
|
||||||
|
|
||||||
|
|
||||||
module_contents = [
|
module_contents = [
|
||||||
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
|
module_template,
|
||||||
|
get_source(InfoExtractor.suitable),
|
||||||
|
get_source(InfoExtractor._match_valid_url) + '\n',
|
||||||
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
|
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
|
||||||
# needed for suitable() methods of Youtube extractor (see #28780)
|
# needed for suitable() methods of Youtube extractor (see #28780)
|
||||||
'from youtube_dl.utils import parse_qs\n',
|
'from youtube_dl.utils import parse_qs, variadic\n',
|
||||||
]
|
]
|
||||||
|
|
||||||
ie_template = '''
|
ie_template = '''
|
||||||
|
@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
|
||||||
valid_url=valid_url,
|
valid_url=valid_url,
|
||||||
module=ie.__module__)
|
module=ie.__module__)
|
||||||
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
|
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
|
||||||
s += '\n' + getsource(ie.suitable)
|
s += '\n' + get_source(ie.suitable)
|
||||||
if hasattr(ie, '_make_valid_url'):
|
if hasattr(ie, '_make_valid_url'):
|
||||||
# search extractors
|
# search extractors
|
||||||
s += make_valid_template.format(valid_url=ie._make_valid_url())
|
s += make_valid_template.format(valid_url=ie._make_valid_url())
|
||||||
|
|
|
@ -83,6 +83,7 @@ from ..utils import (
|
||||||
urljoin,
|
urljoin,
|
||||||
url_basename,
|
url_basename,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
|
variadic,
|
||||||
xpath_element,
|
xpath_element,
|
||||||
xpath_text,
|
xpath_text,
|
||||||
xpath_with_ns,
|
xpath_with_ns,
|
||||||
|
@ -371,9 +372,22 @@ class InfoExtractor(object):
|
||||||
title, description etc.
|
title, description etc.
|
||||||
|
|
||||||
|
|
||||||
Subclasses of this one should re-define the _real_initialize() and
|
A subclass of InfoExtractor must be defined to handle each specific site (or
|
||||||
_real_extract() methods and define a _VALID_URL regexp.
|
several sites). Such a concrete subclass should be added to the list of
|
||||||
Probably, they should also be added to the list of extractors.
|
extractors. It should also:
|
||||||
|
* define its _VALID_URL attribute as a regexp, or a Sequence of alternative
|
||||||
|
regexps (but see below)
|
||||||
|
* re-define the _real_extract() method
|
||||||
|
* optionally re-define the _real_initialize() method.
|
||||||
|
|
||||||
|
An extractor subclass may also override suitable() if necessary, but the
|
||||||
|
function signature must be preserved and the function must import everything
|
||||||
|
it needs (except other extractors), so that lazy_extractors works correctly.
|
||||||
|
If the subclass's suitable() and _real_extract() functions avoid using
|
||||||
|
_VALID_URL, the subclass need not set that class attribute.
|
||||||
|
|
||||||
|
An abstract subclass of InfoExtractor may be used to simplify implementation
|
||||||
|
within an extractor module; it should not be added to the list of extractors.
|
||||||
|
|
||||||
_GEO_BYPASS attribute may be set to False in order to disable
|
_GEO_BYPASS attribute may be set to False in order to disable
|
||||||
geo restriction bypass mechanisms for a particular extractor.
|
geo restriction bypass mechanisms for a particular extractor.
|
||||||
|
@ -408,22 +422,33 @@ class InfoExtractor(object):
|
||||||
self._x_forwarded_for_ip = None
|
self._x_forwarded_for_ip = None
|
||||||
self.set_downloader(downloader)
|
self.set_downloader(downloader)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __match_valid_url(cls, url):
|
||||||
|
# This does not use has/getattr intentionally - we want to know whether
|
||||||
|
# we have cached the regexp for cls, whereas getattr would also
|
||||||
|
# match its superclass
|
||||||
|
if '_VALID_URL_RE' not in cls.__dict__:
|
||||||
|
# _VALID_URL can now be a list/tuple of patterns
|
||||||
|
cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
|
||||||
|
# 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
|
||||||
|
for p in cls._VALID_URL_RE:
|
||||||
|
p = p.match(url)
|
||||||
|
if p:
|
||||||
|
return p
|
||||||
|
|
||||||
|
# The public alias can safely be overridden, as in some back-ports
|
||||||
|
_match_valid_url = __match_valid_url
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def suitable(cls, url):
|
def suitable(cls, url):
|
||||||
"""Receives a URL and returns True if suitable for this IE."""
|
"""Receives a URL and returns True if suitable for this IE."""
|
||||||
|
# This function must import everything it needs (except other extractors),
|
||||||
# This does not use has/getattr intentionally - we want to know whether
|
# so that lazy_extractors works correctly
|
||||||
# we have cached the regexp for *this* class, whereas getattr would also
|
return cls.__match_valid_url(url) is not None
|
||||||
# match the superclass
|
|
||||||
if '_VALID_URL_RE' not in cls.__dict__:
|
|
||||||
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
|
|
||||||
return cls._VALID_URL_RE.match(url) is not None
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _match_id(cls, url):
|
def _match_id(cls, url):
|
||||||
if '_VALID_URL_RE' not in cls.__dict__:
|
m = cls.__match_valid_url(url)
|
||||||
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
|
|
||||||
m = cls._VALID_URL_RE.match(url)
|
|
||||||
assert m
|
assert m
|
||||||
return compat_str(m.group('id'))
|
return compat_str(m.group('id'))
|
||||||
|
|
||||||
|
|
|
@ -18,12 +18,6 @@ from ..utils import (
|
||||||
|
|
||||||
class GlobalPlayerBaseIE(InfoExtractor):
|
class GlobalPlayerBaseIE(InfoExtractor):
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _match_valid_url(cls, url):
|
|
||||||
return cls.re.match(cls._VALID_URL, url)
|
|
||||||
|
|
||||||
def _get_page_props(self, url, video_id):
|
def _get_page_props(self, url, video_id):
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
|
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
|
||||||
|
|
Loading…
Reference in a new issue