From 9be9fffb25fb26c124f30c36f1a01da91d5fd9a7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 19 Aug 2022 15:34:33 +0100 Subject: [PATCH] [jsinterp] Clean up and pull yt-dlp style * add compat_re_Pattern * improve compat_collections_chain_map * use class JS_Undefined * remove unused code --- test/test_jsinterp.py | 20 +++--- test/test_youtube_signature.py | 3 +- youtube_dl/compat.py | 21 +++++- youtube_dl/jsinterp.py | 123 ++++++++++++--------------------- 4 files changed, 77 insertions(+), 90 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index faddf00d5..96786a84c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -11,8 +11,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math import re -from youtube_dl.jsinterp import JSInterpreter -undefined = JSInterpreter.undefined +from youtube_dl.compat import compat_re_Pattern + +from youtube_dl.jsinterp import JS_Undefined, JSInterpreter class TestJSInterpreter(unittest.TestCase): @@ -261,12 +262,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { return undefined; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { let v; return v; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } @@ -307,7 +308,7 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } ''') - self.assertEqual(jsi.call_function('x'), [False, False, undefined, undefined]) + self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) jsi = JSInterpreter('function x(){return undefined ?? 42; }') self.assertEqual(jsi.call_function('x'), 42) @@ -326,12 +327,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a; return a?.qq; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) def test_regex(self): jsi = JSInterpreter(''' @@ -342,13 +343,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; return a; } ''') - # Pythons disagree on the type of a pattern - self.assertTrue(isinstance(jsi.call_function('x'), type(re.compile('')))) + self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } ''') - self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) if __name__ == '__main__': diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 43e22388d..327d4c40d 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -12,10 +12,11 @@ import io import re import string +from youtube_dl.compat import compat_str, compat_urlretrieve + from test.helper import FakeYDL from youtube_dl.extractor import YoutubeIE from youtube_dl.jsinterp import JSInterpreter -from youtube_dl.compat import compat_str, compat_urlretrieve _SIG_TESTS = [ ( diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6d2c31a61..3002109ca 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3023,18 +3023,34 @@ except ImportError: self.maps[0].__setitem__(k, v) return - def __delitem__(self, k): + def __contains__(self, k): + return any((k in m) for m in self.maps) + + def __delitem(self, k): if k in self.maps[0]: del self.maps[0][k] return raise KeyError(k) + def __delitem__(self, k): + self.__delitem(k) + def __iter__(self): return itertools.chain(*reversed(self.maps)) def __len__(self): return len(iter(self)) + # to match Py3, don't del directly + def pop(self, k, *args): + if self.__contains__(k): + off = self.__getitem__(k) + self.__delitem(k) + return off + elif len(args) > 0: + return args[0] + raise KeyError(k) + def new_child(self, m=None, **kwargs): m = m or {} m.update(kwargs) @@ -3044,6 +3060,8 @@ except ImportError: def parents(self): return compat_collections_chain_map(*(self.maps[1:])) +# Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) +compat_re_Pattern = type(re.compile('')) if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): @@ -3110,6 +3128,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_re_Pattern', 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 48c27a1c0..6719d0dfd 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -19,16 +19,12 @@ from .compat import ( compat_str, ) -_NAME_RE = r'[a-zA-Z_$][\w$]*' - -_UNDEFINED = object() - def _js_bit_op(op): def wrapped(a, b): def zeroise(x): - return 0 if x in (None, _UNDEFINED) else x + return 0 if x in (None, JS_Undefined) else x return op(zeroise(a), zeroise(b)) return wrapped @@ -37,7 +33,7 @@ def _js_bit_op(op): def _js_arith_op(op): def wrapped(a, b): - if _UNDEFINED in (a, b): + if JS_Undefined in (a, b): return float('nan') return op(a or 0, b or 0) @@ -45,22 +41,21 @@ def _js_arith_op(op): def _js_div(a, b): - if _UNDEFINED in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a and b): return float('nan') return float('inf') if not b else operator.truediv(a or 0, b) def _js_mod(a, b): - if _UNDEFINED in (a, b) or not b: + if JS_Undefined in (a, b) or not b: return float('nan') return (a or 0) % b def _js_exp(a, b): if not b: - # even 0 ** 0 !! - return 1 - if _UNDEFINED in (a, b): + return 1 # even 0 ** 0 !! + elif JS_Undefined in (a, b): return float('nan') return (a or 0) ** b @@ -68,7 +63,7 @@ def _js_exp(a, b): def _js_eq_op(op): def wrapped(a, b): - if set((a, b)) <= set((None, _UNDEFINED)): + if set((a, b)) <= set((None, JS_Undefined)): return op(a, a) return op(a, b) @@ -78,21 +73,28 @@ def _js_eq_op(op): def _js_comp_op(op): def wrapped(a, b): - if _UNDEFINED in (a, b): + if JS_Undefined in (a, b): return False return op(a or 0, b or 0) return wrapped +def _js_ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, '', JS_Undefined): + return if_false + try: + if math.isnan(cndn): # NB: NaN cannot be checked by membership + return if_false + except TypeError: + pass + return if_true + + # (op, definition) in order of binding priority, tightest first # avoid dict to maintain order # definition None => Defined in JSInterpreter._operator -_DOT_OPERATORS = ( - ('.', None), - # TODO: ('?.', None), -) - _OPERATORS = ( ('>>', _js_bit_op(operator.rshift)), ('<<', _js_bit_op(operator.lshift)), @@ -130,20 +132,13 @@ _SC_OPERATORS = ( _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) +_NAME_RE = r'[a-zA-Z_$][\w$]*' _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _QUOTES = '\'"/' -def _ternary(cndn, if_true=True, if_false=False): - """Simulate JS's ternary operator (cndn?if_true:if_false)""" - if cndn in (False, None, 0, '', _UNDEFINED): - return if_false - try: - if math.isnan(cndn): # NB: NaN cannot be checked by membership - return if_false - except TypeError: - pass - return if_true +class JS_Undefined(object): + pass class JS_Break(ExtractorError): @@ -167,7 +162,7 @@ class LocalNameSpace(ChainMap): try: return super(LocalNameSpace, self).__getitem__(key) except KeyError: - return _UNDEFINED + return JS_Undefined def __setitem__(self, key, value): for scope in self.maps: @@ -179,24 +174,6 @@ class LocalNameSpace(ChainMap): def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') - # except - def pop(self, key, *args): - try: - off = self.__getitem__(key) - super(LocalNameSpace, self).__delitem__(key) - return off - except KeyError: - if len(args) > 0: - return args[0] - raise - - def __contains__(self, key): - try: - super(LocalNameSpace, self).__getitem__(key) - return True - except KeyError: - return False - def __repr__(self): return 'LocalNameSpace%s' % (self.maps, ) @@ -204,9 +181,7 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 - undefined = _UNDEFINED - - RE_FLAGS = { + _RE_FLAGS = { # special knowledge: Python's re flags are bitmask values, current max 128 # invent new bitmask values well above that for literal parsing # TODO: new pattern class to execute matches with these flags @@ -257,10 +232,10 @@ class JSInterpreter(object): if not expr: return flags, expr for idx, ch in enumerate(expr): - if ch not in cls.RE_FLAGS: + if ch not in cls._RE_FLAGS: break - flags |= cls.RE_FLAGS[ch] - return flags, expr[idx:] if idx > 0 else expr + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] @classmethod def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): @@ -283,14 +258,6 @@ class JSInterpreter(object): if not escaping and char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': in_quote = None if in_quote and not in_regex_char_group else char - if in_quote is None and char == '/' and delim != '/': - # regexp flags - n_idx = idx + 1 - while n_idx < len(expr) and expr[n_idx] in cls.RE_FLAGS: - n_idx += 1 - skip_re = n_idx - idx - 1 - if skip_re > 0: - continue elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' @@ -336,13 +303,13 @@ class JSInterpreter(object): def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): if op in ('||', '&&'): - if (op == '&&') ^ _ternary(left_val): + if (op == '&&') ^ _js_ternary(left_val): return left_val # short circuiting elif op == '??': - if left_val not in (None, self.undefined): + if left_val not in (None, JS_Undefined): return left_val elif op == '?': - right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) + right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) opfunc = op and next((v for k, v in self._all_operators() if k == op), None) @@ -361,7 +328,7 @@ class JSInterpreter(object): return obj[int(idx)] if isinstance(obj, list) else obj[idx] except Exception as e: if allow_undefined: - return self.undefined + return JS_Undefined raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): @@ -395,9 +362,8 @@ class JSInterpreter(object): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - flags, _ = self._regex_flags(outer) - inner, outer = inner.replace('"', r'\"'), '' - inner = re.compile(js_to_json(inner + expr[0]), flags=flags) # , strict=True)) + flags, outer = self._regex_flags(outer) + inner = re.compile(inner[1:], flags=flags) # , strict=True)) else: inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: @@ -422,7 +388,7 @@ class JSInterpreter(object): if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') - # try for object expression + # try for object expression (Map) sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] if all(len(sub_expr) == 2 for sub_expr in sub_expressions): return dict( @@ -455,7 +421,8 @@ class JSInterpreter(object): (?Ptry|finally)\s*| (?Pcatch\s*(?P\(\s*{_NAME_RE}\s*\)))| (?Pswitch)\s*\(| - (?Pfor)\s*\(|'''.format(**globals()), expr) + (?Pfor)\s*\(| + '''.format(**globals()), expr) md = m.groupdict() if m else {} if md.get('try'): if expr[m.end()] == '{': @@ -500,7 +467,7 @@ class JSInterpreter(object): start, cndn, increment = self._separate(constructor, ';') self.interpret_expression(start, local_vars, allow_recursion) while True: - if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): + if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) @@ -587,7 +554,7 @@ class JSInterpreter(object): local_vars[m.group('out')] = self._operator( m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) return local_vars[m.group('out')], should_return - elif left_val in (None, self.undefined): + elif left_val in (None, JS_Undefined): raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) @@ -607,7 +574,7 @@ class JSInterpreter(object): raise JS_Continue() elif expr == 'undefined': - return self.undefined, should_return + return JS_Undefined, should_return elif md.get('return'): return local_vars[m.group('name')], should_return @@ -663,9 +630,9 @@ class JSInterpreter(object): 'Math': float, } obj = local_vars.get(variable) - if obj in (self.undefined, None): - obj = types.get(variable, self.undefined) - if obj is self.undefined: + if obj in (JS_Undefined, None): + obj = types.get(variable, JS_Undefined) + if obj is JS_Undefined: try: if variable not in self._objects: self._objects[variable] = self.extract_object(variable) @@ -674,8 +641,8 @@ class JSInterpreter(object): if not nullish: raise - if nullish and obj is self.undefined: - return self.undefined + if nullish and obj is JS_Undefined: + return JS_Undefined # Member access if arg_str is None: