From 80cb917ef60335055810df1772bdfe61b15af9aa Mon Sep 17 00:00:00 2001
From: dirkf
Date: Mon, 11 Mar 2024 15:49:59 +0000
Subject: [PATCH] [utils] Unescape HTML5 named character references (with no ;)

HTML5 allows a fixed set of legacy named character references to be
written without the terminating ';'. Extend unescapeHTML() to match
those names as well as the generic &xxx; form, and let
_htmlentity_transform() accept an entity with or without its final ';'.
The numeric entity match is also made case-insensitive.
---
 test/test_utils.py  |  5 ++++-
 youtube_dl/utils.py | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 06a7a5d2b..ea81ef01f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -337,6 +337,9 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unescapeHTML('&a&quot;'), '&a"')
         # HTML5 entities
         self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
+        # non-semicolon HTML5 (bah!) entities
+        self.assertEqual(unescapeHTML('&amp&AMPetc'), '&&etc')
+        self.assertEqual(unescapeHTML('&pound&POUNDetc'), '£&POUNDetc')
 
     def test_date_from_str(self):
         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
@@ -1251,7 +1254,7 @@ class TestUtil(unittest.TestCase):
     def test_args_to_str(self):
         self.assertEqual(
             args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
-            'foo ba/r -baz \'2 be\' \'\'' if not(compat_os_name in ('nt', 'ce')) else 'foo ba/r -baz "2 be" ""'
+            'foo ba/r -baz \'2 be\' \'\'' if not (compat_os_name in ('nt', 'ce')) else 'foo ba/r -baz "2 be" ""'
         )
 
     def test_parse_filesize(self):
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 60fa29103..459502554 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2232,25 +2232,25 @@ def orderedSet(iterable):
     return res
 
 
-def _htmlentity_transform(entity_with_semicolon):
+def _htmlentity_transform(entity):
     """Transforms an HTML entity to a character."""
+    entity_with_semicolon = entity if entity[-1] == ';' else (entity + ';')
     entity = entity_with_semicolon[:-1]
 
     # Known non-numeric HTML entity
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
-    # TODO: HTML5 allows entities without a semicolon. For example,
-    # '&Eacuteric' should be decoded as 'Éric'.
     if entity_with_semicolon in compat_html_entities_html5:
         return compat_html_entities_html5[entity_with_semicolon]
 
-    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
+    # numeric entity
+    mobj = re.match(r'(?i)#(x[0-9a-f]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
-        if numstr.startswith('x'):
+        if numstr[0] in 'xX':
             base = 16
-            numstr = '0%s' % numstr
+            numstr = '0%s' % numstr.lower()
         else:
             base = 10
         # See https://github.com/ytdl-org/youtube-dl/issues/7518
@@ -2263,13 +2263,34 @@ def _htmlentity_transform(entity_with_semicolon):
     return '&%s;' % entity
 
 
+# Based on https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#cite_note-semicolon-2
+# (someone else read WHATWG so we didn't have to)
+_html5_CI_non_semicolon_entities = (
+    # canonically use lower-case REs
+    'quot', 'amp', '[lg]t', 'copy', 'reg', 'eth', 'thorn',
+)
+_html5_non_semicolon_entities = itertools.chain(
+    _html5_CI_non_semicolon_entities,
+    (e.upper() for e in _html5_CI_non_semicolon_entities),
+    ('nbsp', 'i(?:excl|quest)', 'cent', 'pound', 'curren', 'yen', 'brvbar',
+     'sect', 'ord[fm]', '[lr]aquo', 'not', 'shy', 'macr', 'deg',
+     'plusmn', 'sup[231]', 'micro', 'para', 'middot', '[cC]?cedil',
+     'frac(?:12|[13]4)', '[aAeEiIoOuUyY]?(?:acute|uml)',
+     '[aAeEiIoOuU](?:grave|circ)', '[aA]ring', '[aAnNoO]tilde',
+     '(?:ae|AE|sz|SZ)lig', '[oO]slash', 'divide', 'times', )
+)
+_html5_entities_re = '&([^&;]+;|%s)' % '|'.join(_html5_non_semicolon_entities)
+
+
 def unescapeHTML(s):
     if s is None:
         return None
     assert isinstance(s, compat_str)
 
     return re.sub(
-        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+        # match generic &xxx;, &nnn; entities, and also
+        # HTML5 "named character references" with *omitted* final ;
+        _html5_entities_re, lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def process_communicate_or_kill(p, *args, **kwargs):