mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2024-11-25 11:41:52 +00:00
[utils] Rework decoding of Content-Encodings
* support nested encodings
* support optional `br` encoding, if brotli package is installed
* support optional 'compress' encoding, if ncompress package is installed
* response `Content-Encoding` has only unprocessed encodings, or removed
* response `Content-Length` is decoded length (usable for filesize metadata)
* use zlib for both deflate and gzip decompression
* some elements taken from yt-dlp: thx especially coletdjnz
This commit is contained in:
parent
87e578c9b8
commit
e7926ae9f4
3 changed files with 107 additions and 43 deletions
|
@ -461,33 +461,23 @@ class TestHTTP(unittest.TestCase):
|
||||||
sanitized_Request(
|
sanitized_Request(
|
||||||
self._test_url('content-encoding'),
|
self._test_url('content-encoding'),
|
||||||
headers={'ytdl-encoding': encoding}))
|
headers={'ytdl-encoding': encoding}))
|
||||||
self.assertEqual(res.headers.get('Content-Encoding'), encoding)
|
# decoded encodings are removed: only check for valid decompressed data
|
||||||
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
|
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
|
||||||
|
|
||||||
@unittest.skipUnless(brotli, 'brotli support is not installed')
|
@unittest.skipUnless(brotli, 'brotli support is not installed')
|
||||||
@unittest.expectedFailure
|
|
||||||
def test_brotli(self):
|
def test_brotli(self):
|
||||||
self.__test_compression('br')
|
self.__test_compression('br')
|
||||||
|
|
||||||
@unittest.expectedFailure
|
|
||||||
def test_deflate(self):
|
def test_deflate(self):
|
||||||
self.__test_compression('deflate')
|
self.__test_compression('deflate')
|
||||||
|
|
||||||
@unittest.expectedFailure
|
|
||||||
def test_gzip(self):
|
def test_gzip(self):
|
||||||
self.__test_compression('gzip')
|
self.__test_compression('gzip')
|
||||||
|
|
||||||
@unittest.expectedFailure # not yet implemented
|
|
||||||
def test_multiple_encodings(self):
|
def test_multiple_encodings(self):
|
||||||
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
|
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
|
||||||
with FakeYDL() as ydl:
|
|
||||||
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
|
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
|
||||||
res = ydl.urlopen(
|
self.__test_compression(pair)
|
||||||
sanitized_Request(
|
|
||||||
self._test_url('content-encoding'),
|
|
||||||
headers={'ytdl-encoding': pair}))
|
|
||||||
self.assertEqual(res.headers.get('Content-Encoding'), pair)
|
|
||||||
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
|
|
||||||
|
|
||||||
def test_unsupported_encoding(self):
|
def test_unsupported_encoding(self):
|
||||||
# it should return the raw content
|
# it should return the raw content
|
||||||
|
|
|
@ -3200,6 +3200,18 @@ except AttributeError:
|
||||||
def compat_datetime_timedelta_total_seconds(td):
|
def compat_datetime_timedelta_total_seconds(td):
|
||||||
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
|
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
|
||||||
|
|
||||||
|
# optional decompression packages
|
||||||
|
# PyPi brotli package implements 'br' Content-Encoding
|
||||||
|
try:
|
||||||
|
import brotli as compat_brotli
|
||||||
|
except ImportError:
|
||||||
|
compat_brotli = None
|
||||||
|
# PyPi ncompress package implements 'compress' Content-Encoding
|
||||||
|
try:
|
||||||
|
import ncompress as compat_ncompress
|
||||||
|
except ImportError:
|
||||||
|
compat_ncompress = None
|
||||||
|
|
||||||
|
|
||||||
legacy = [
|
legacy = [
|
||||||
'compat_HTMLParseError',
|
'compat_HTMLParseError',
|
||||||
|
@ -3234,6 +3246,7 @@ __all__ = [
|
||||||
'compat_Struct',
|
'compat_Struct',
|
||||||
'compat_base64_b64decode',
|
'compat_base64_b64decode',
|
||||||
'compat_basestring',
|
'compat_basestring',
|
||||||
|
'compat_brotli',
|
||||||
'compat_casefold',
|
'compat_casefold',
|
||||||
'compat_chr',
|
'compat_chr',
|
||||||
'compat_collections_abc',
|
'compat_collections_abc',
|
||||||
|
@ -3259,6 +3272,7 @@ __all__ = [
|
||||||
'compat_itertools_zip_longest',
|
'compat_itertools_zip_longest',
|
||||||
'compat_kwargs',
|
'compat_kwargs',
|
||||||
'compat_map',
|
'compat_map',
|
||||||
|
'compat_ncompress',
|
||||||
'compat_numeric_types',
|
'compat_numeric_types',
|
||||||
'compat_open',
|
'compat_open',
|
||||||
'compat_ord',
|
'compat_ord',
|
||||||
|
|
|
@ -15,7 +15,6 @@ import email.utils
|
||||||
import email.header
|
import email.header
|
||||||
import errno
|
import errno
|
||||||
import functools
|
import functools
|
||||||
import gzip
|
|
||||||
import inspect
|
import inspect
|
||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
|
@ -42,6 +41,7 @@ from .compat import (
|
||||||
compat_HTMLParseError,
|
compat_HTMLParseError,
|
||||||
compat_HTMLParser,
|
compat_HTMLParser,
|
||||||
compat_basestring,
|
compat_basestring,
|
||||||
|
compat_brotli as brotli,
|
||||||
compat_casefold,
|
compat_casefold,
|
||||||
compat_chr,
|
compat_chr,
|
||||||
compat_collections_abc,
|
compat_collections_abc,
|
||||||
|
@ -55,6 +55,7 @@ from .compat import (
|
||||||
compat_http_client,
|
compat_http_client,
|
||||||
compat_integer_types,
|
compat_integer_types,
|
||||||
compat_kwargs,
|
compat_kwargs,
|
||||||
|
compat_ncompress as ncompress,
|
||||||
compat_os_name,
|
compat_os_name,
|
||||||
compat_re_Match,
|
compat_re_Match,
|
||||||
compat_re_Pattern,
|
compat_re_Pattern,
|
||||||
|
@ -2638,11 +2639,44 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
|
||||||
req)
|
req)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def deflate(data):
|
def deflate_gz(data):
|
||||||
try:
|
try:
|
||||||
return zlib.decompress(data, -zlib.MAX_WBITS)
|
# format:zlib,gzip + windowsize:32768
|
||||||
|
return data and zlib.decompress(data, 32 + zlib.MAX_WBITS)
|
||||||
except zlib.error:
|
except zlib.error:
|
||||||
return zlib.decompress(data)
|
# raw zlib * windowsize:32768 (RFC 9110: "non-conformant")
|
||||||
|
return zlib.decompress(data, -zlib.MAX_WBITS)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def gzip(data):
|
||||||
|
|
||||||
|
from gzip import GzipFile
|
||||||
|
|
||||||
|
def _gzip(data):
|
||||||
|
with io.BytesIO(data) as data_buf:
|
||||||
|
gz = GzipFile(fileobj=data_buf, mode='rb')
|
||||||
|
return gz.read()
|
||||||
|
|
||||||
|
try:
|
||||||
|
return _gzip(data)
|
||||||
|
except IOError as original_ioerror:
|
||||||
|
# There may be junk at the end of the file
|
||||||
|
# See http://stackoverflow.com/q/4928560/35070 for details
|
||||||
|
for i in range(1, 1024):
|
||||||
|
try:
|
||||||
|
return _gzip(data[:-i])
|
||||||
|
except IOError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise original_ioerror
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def brotli(data):
|
||||||
|
return data and brotli.decompress(data)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def compress(data):
|
||||||
|
return data and ncompress.decompress(data)
|
||||||
|
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
|
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
|
||||||
|
@ -2679,33 +2713,59 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
|
||||||
|
|
||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
# gzip
|
|
||||||
if resp.headers.get('Content-encoding', '') == 'gzip':
|
# Content-Encoding header lists the encodings in order that they were applied [1].
|
||||||
content = resp.read()
|
# To decompress, we simply do the reverse.
|
||||||
gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
|
# [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
|
||||||
|
decoded_response = None
|
||||||
|
decoders = {
|
||||||
|
'gzip': self.deflate_gz,
|
||||||
|
'deflate': self.deflate_gz,
|
||||||
|
}
|
||||||
|
if brotli:
|
||||||
|
decoders['br'] = self.brotli
|
||||||
|
if ncompress:
|
||||||
|
decoders['compress'] = self.compress
|
||||||
|
if sys.platform.startswith('java'):
|
||||||
|
# Jython zlib implementation misses gzip
|
||||||
|
decoders['gzip'] = self.gzip
|
||||||
|
|
||||||
|
def encodings(hdrs):
|
||||||
|
# A header field that allows multiple values can have multiple instances [2].
|
||||||
|
# [2]: https://datatracker.ietf.org/doc/html/rfc9110#name-fields
|
||||||
|
for e in reversed(','.join(hdrs).split(',')):
|
||||||
|
if e:
|
||||||
|
yield e.strip()
|
||||||
|
|
||||||
|
encodings_left = []
|
||||||
try:
|
try:
|
||||||
uncompressed = io.BytesIO(gz.read())
|
resp.headers.get_all
|
||||||
except IOError as original_ioerror:
|
hdrs = resp.headers
|
||||||
# There may be junk at the end of the file
|
except AttributeError:
|
||||||
# See http://stackoverflow.com/q/4928560/35070 for details
|
# Py2 has no get_all() method: headers are rfc822.Message
|
||||||
for i in range(1, 1024):
|
from email.message import Message
|
||||||
try:
|
hdrs = Message()
|
||||||
gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
|
for k, v in resp.headers.items():
|
||||||
uncompressed = io.BytesIO(gz.read())
|
hdrs[k] = v
|
||||||
except IOError:
|
|
||||||
|
decoder, decoded_response = True, None
|
||||||
|
for encoding in encodings(hdrs.get_all('Content-Encoding', [])):
|
||||||
|
# "SHOULD consider" x-compress, x-gzip as compress, gzip
|
||||||
|
decoder = decoder and decoders.get(remove_start(encoding, 'x-'))
|
||||||
|
if not decoder:
|
||||||
|
encodings_left.insert(0, encoding)
|
||||||
continue
|
continue
|
||||||
break
|
decoded_response = decoder(decoded_response or resp.read())
|
||||||
else:
|
if decoded_response is not None:
|
||||||
raise original_ioerror
|
resp = compat_urllib_request.addinfourl(
|
||||||
resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
|
io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
|
||||||
resp.msg = old_resp.msg
|
resp.msg = old_resp.msg
|
||||||
del resp.headers['Content-encoding']
|
del resp.headers['Content-Length']
|
||||||
# deflate
|
resp.headers['Content-Length'] = '%d' % len(decoded_response)
|
||||||
if resp.headers.get('Content-encoding', '') == 'deflate':
|
del resp.headers['Content-Encoding']
|
||||||
gz = io.BytesIO(self.deflate(resp.read()))
|
if encodings_left:
|
||||||
resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
|
resp.headers['Content-Encoding'] = ', '.join(encodings_left)
|
||||||
resp.msg = old_resp.msg
|
|
||||||
del resp.headers['Content-encoding']
|
|
||||||
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
|
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
|
||||||
# https://github.com/ytdl-org/youtube-dl/issues/6457).
|
# https://github.com/ytdl-org/youtube-dl/issues/6457).
|
||||||
if 300 <= resp.code < 400:
|
if 300 <= resp.code < 400:
|
||||||
|
|
Loading…
Reference in a new issue