1
0
Fork 0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-05-29 00:19:32 +00:00

Merge branch 'master' into df-testdl-patch

This commit is contained in:
dirkf 2024-02-21 11:27:17 +00:00 committed by GitHub
commit 0203fa9bb6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
88 changed files with 8902 additions and 2030 deletions

View file

@ -1,81 +1,476 @@
name: CI
on: [push, pull_request]
env:
all-cpython-versions: 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12
main-cpython-versions: 2.7, 3.2, 3.5, 3.9, 3.11
pypy-versions: pypy-2.7, pypy-3.6, pypy-3.7
cpython-versions: main
test-set: core
# Python beta version to be built using pyenv before setup-python support
# Must also be included in all-cpython-versions
next: 3.13
on:
push:
# push inputs aren't known to GitHub
inputs:
cpython-versions:
type: string
default: all
test-set:
type: string
default: core
pull_request:
# pull_request inputs aren't known to GitHub
inputs:
cpython-versions:
type: string
default: main
test-set:
type: string
default: both
workflow_dispatch:
inputs:
cpython-versions:
type: choice
description: CPython versions (main = 2.7, 3.2, 3.5, 3.9, 3.11)
options:
- all
- main
required: true
default: main
test-set:
type: choice
description: core, download
options:
- both
- core
- download
required: true
default: both
permissions:
contents: read
jobs:
select:
name: Select tests from inputs
runs-on: ubuntu-latest
outputs:
cpython-versions: ${{ steps.run.outputs.cpython-versions }}
test-set: ${{ steps.run.outputs.test-set }}
own-pip-versions: ${{ steps.run.outputs.own-pip-versions }}
steps:
# push and pull_request inputs aren't known to GitHub (pt3)
# `push` events provide no `inputs`; publish the defaults through GITHUB_ENV so
# that later steps can read them from the `env` context.
- name: Set push defaults
  if: ${{ github.event_name == 'push' }}
  env:
    cpython-versions: all
    test-set: core
  run: |
    # The names written here must match the hyphenated keys declared in `env`
    # above: `env.test_set` (underscore) is not defined and expands to nothing.
    echo "cpython-versions=${{ env.cpython-versions }}" >> "$GITHUB_ENV"
    echo "test-set=${{ env.test-set }}" >> "$GITHUB_ENV"
# `pull_request` events provide no `inputs`; publish the defaults through
# GITHUB_ENV so that later steps can read them from the `env` context.
- name: Get pull_request inputs
  if: ${{ github.event_name == 'pull_request' }}
  env:
    cpython-versions: main
    test-set: both
  run: |
    # The names written here must match the hyphenated keys declared in `env`
    # above: `env.test_set` (underscore) is not defined and expands to nothing,
    # which would leave the workflow-level `test-set` value in effect.
    echo "cpython-versions=${{ env.cpython-versions }}" >> "$GITHUB_ENV"
    echo "test-set=${{ env.test-set }}" >> "$GITHUB_ENV"
# Turns the selected test set and Python version lists into JSON arrays and
# publishes them as the step outputs `test-set`, `cpython-versions` and
# `own-pip-versions`.
- name: Make version array
  id: run
  run: |
    # Make a JSON Array from comma/space-separated string (no extra escaping)
    # NB: the arguments are passed unquoted on purpose so that the shell
    # splits them on whitespace and (via the extended IFS) on commas.
    json_list() { \
      ret=""; IFS="${IFS},"; set -- $*; \
      for a in "$@"; do \
        ret=$(printf '%s"%s"' "${ret}${ret:+, }" "$a"); \
      done; \
      printf '[%s]' "$ret"; }
    tests="${{ inputs.test-set || env.test-set }}"
    # quoted so that an empty value compares as false instead of making `[`
    # fail with a syntax error
    [ "$tests" = both ] && tests="core download"
    printf 'test-set=%s\n' "$(json_list $tests)" >> "$GITHUB_OUTPUT"
    versions="${{ inputs.cpython-versions || env.cpython-versions }}"
    # anything other than `all` selects the main version list
    if [ "$versions" = all ]; then \
      versions="${{ env.all-cpython-versions }}"; else \
      versions="${{ env.main-cpython-versions }}"; \
    fi
    # the PyPy versions are always appended to the selected CPython versions
    printf 'cpython-versions=%s\n' \
      "$(json_list ${versions}${versions:+, }${{ env.pypy-versions }})" >> "$GITHUB_OUTPUT"
    # versions with a special get-pip.py in a per-version subdirectory
    printf 'own-pip-versions=%s\n' \
      "$(json_list 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6)" >> "$GITHUB_OUTPUT"
tests:
name: Tests
name: Run tests
needs: select
permissions:
contents: read
packages: write
runs-on: ${{ matrix.os }}
env:
PIP: python -m pip
PIP_DISABLE_PIP_VERSION_CHECK: true
PIP_NO_PYTHON_VERSION_WARNING: true
strategy:
fail-fast: true
matrix:
os: [ubuntu-18.04]
# TODO: python 2.6
python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7]
os: [ubuntu-20.04]
python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }}
python-impl: [cpython]
ytdl-test-set: [core, download]
ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }}
run-tests-ext: [sh]
include:
# python 3.2 is only available on windows via setup-python
- os: windows-2019
python-version: 3.2
python-version: 3.4
python-impl: cpython
ytdl-test-set: core
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }}
run-tests-ext: bat
- os: windows-2019
python-version: 3.2
python-version: 3.4
python-impl: cpython
ytdl-test-set: download
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }}
run-tests-ext: bat
# jython
- os: ubuntu-18.04
- os: ubuntu-20.04
python-version: 2.7
python-impl: jython
ytdl-test-set: core
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }}
run-tests-ext: sh
- os: ubuntu-18.04
- os: ubuntu-20.04
python-version: 2.7
python-impl: jython
ytdl-test-set: download
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }}
run-tests-ext: sh
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: ${{ matrix.python-impl == 'cpython' }}
- name: Prepare Linux
if: ${{ startswith(matrix.os, 'ubuntu') }}
shell: bash
run: |
# apt in runner, if needed, may not be up-to-date
sudo apt-get update
- name: Checkout
uses: actions/checkout@v3
#-------- Python 3 -----
- name: Set up supported Python ${{ matrix.python-version }}
id: setup-python
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version != '2.6' && matrix.python-version != '2.7' && matrix.python-version != env.next }}
# wrap broken actions/setup-python@v4
# NB may run apt-get install in Linux
uses: ytdl-org/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
cache-build: true
allow-build: info
# Runs when `pythonLocation` is set in the environment: records it as
# PYTHONHOME and makes sure that a `python` command next to the interpreter
# reported by the setup-python step can be found on PATH.
- name: Locate supported Python ${{ matrix.python-version }}
  if: ${{ env.pythonLocation }}
  shell: bash
  run: |
    echo "PYTHONHOME=${pythonLocation}" >> "$GITHUB_ENV"
    export expected="${{ steps.setup-python.outputs.python-path }}"
    # local replacement for dirname(1) that asks the expected interpreter,
    # so the result follows that interpreter's own path rules
    dirname() { printf '%s\n' \
        'import os, sys' \
        'print(os.path.dirname(sys.argv[1]))' \
      | ${expected} - "$1"; }
    expd="$(dirname "$expected")"
    export python="$(command -v python)"
    # put the expected interpreter's directory first if `python` resolves elsewhere
    [ "$expd" = "$(dirname "$python")" ] || echo "PATH=$expd:${PATH}" >> "$GITHUB_ENV"
    # no executable `python` was found: create one beside the expected
    # interpreter, as a symlink or (where os.symlink is missing) by renaming
    [ -x "$python" ] || printf '%s\n' \
        'import os' \
        'exp = os.environ["expected"]' \
        'python = os.environ["python"]' \
        '# os.path.split() returns a tuple: copy to a list so the basename can be replaced' \
        'exps = list(os.path.split(exp))' \
        '# compare directories (exps[0]), not the first character of the path' \
        'if python and (os.path.dirname(python) == exps[0]):' \
        '    exit(0)' \
        'exps[1] = "python" + os.path.splitext(exps[1])[1]' \
        'python = os.path.join(*exps)' \
        'try:' \
        '    os.symlink(exp, python)' \
        'except AttributeError:' \
        '    os.rename(exp, python)' \
      | ${expected} -
    # show the module search path of the interpreter, for the job log
    printf '%s\n' \
        'import sys' \
        'print(sys.path)' \
      | ${expected} -
#-------- Python next (was 3.12) -
# Publishes the pyenv installation directory as PYENV_ROOT for the cache,
# build and locate steps of the `next` Python version.
- name: Set up CPython 3.next environment
  if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }}
  shell: bash
  run: |
    printf 'PYENV_ROOT=%s\n' "${HOME}/.local/share/pyenv" >> "$GITHUB_ENV"
- name: Cache Python 3.next
id: cachenext
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }}
uses: actions/cache@v3
with:
key: python-${{ env.next }}
path: |
${{ env.PYENV_ROOT }}
- name: Build and set up Python 3.next
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next && ! steps.cachenext.outputs.cache-hit }}
# dl and build locally
shell: bash
run: |
# Install build environment
sudo apt-get install -y build-essential llvm libssl-dev tk-dev \
libncursesw5-dev libreadline-dev libsqlite3-dev \
libffi-dev xz-utils zlib1g-dev libbz2-dev liblzma-dev
# Download PyEnv from its GitHub repository.
export PYENV_ROOT=${{ env.PYENV_ROOT }}
export PATH=$PYENV_ROOT/bin:$PATH
git clone "https://github.com/pyenv/pyenv.git" "$PYENV_ROOT"
pyenv install ${{ env.next }}
# Finds the pyenv-built interpreter for the `next` version and exposes it
# through PYTHONHOME and PATH.
- name: Locate Python 3.next
  if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }}
  shell: bash
  run: |
    # the installed directory name carries a patch/pre-release suffix, hence the glob
    PYTHONHOME="$(echo "${{ env.PYENV_ROOT }}/versions/${{ env.next }}."*)"
    # an unmatched glob is left as the literal pattern, which is never empty:
    # check for an existing directory so that a missing build fails here
    test -d "$PYTHONHOME"
    echo "PYTHONHOME=$PYTHONHOME" >> "$GITHUB_ENV"
    echo "PATH=${PYTHONHOME}/bin:$PATH" >> "$GITHUB_ENV"
#-------- Python 2.7 --
- name: Set up Python 2.7
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.7' }}
# install 2.7
shell: bash
run: |
sudo apt-get install -y python2 python-is-python2
echo "PYTHONHOME=/usr" >> "$GITHUB_ENV"
#-------- Python 2.6 --
# Publishes the OpenSSL and pyenv locations used by the Python 2.6 cache,
# build and locate steps, then installs the system certificate packages.
- name: Set up Python 2.6 environment
  if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' }}
  shell: bash
  run: |
    openssl_name=openssl-1.0.2u
    # write all three settings to the environment file in one go
    {
      printf 'openssl_name=%s\n' "$openssl_name"
      printf 'openssl_dir=%s\n' "$HOME/.local/opt/$openssl_name"
      printf 'PYENV_ROOT=%s\n' "$HOME/.local/share/pyenv"
    } >> "$GITHUB_ENV"
    sudo apt-get install -y openssl ca-certificates
# Restores/saves the locally built OpenSSL and pyenv trees for Python 2.6.
# Guarded like the other 2.6 steps: the cached paths come from env values
# that are only set by the cpython-only environment step.
- name: Cache Python 2.6
  id: cache26
  if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' }}
  uses: actions/cache@v3
  with:
    key: python-2.6.9
    path: |
      ${{ env.openssl_dir }}
      ${{ env.PYENV_ROOT }}
# Builds OpenSSL 1.0.2 and then Python 2.6.9 (via pyenv) from source.
# Skipped when the `cache26` step reported a cache hit.
- name: Build and set up Python 2.6
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' && ! steps.cache26.outputs.cache-hit }}
# dl and build locally
shell: bash
run: |
# Install build environment
sudo apt-get install -y build-essential llvm libssl-dev tk-dev \
libncursesw5-dev libreadline-dev libsqlite3-dev \
libffi-dev xz-utils zlib1g-dev libbz2-dev liblzma-dev
# Download and install OpenSSL 1.0.2, back in time
# (name and install prefix come from the environment set up earlier)
openssl_name=${{ env.openssl_name }}
openssl_targz=${openssl_name}.tar.gz
openssl_dir=${{ env.openssl_dir }}
openssl_inc=$openssl_dir/include
openssl_lib=$openssl_dir/lib
openssl_ssl=$openssl_dir/ssl
# NOTE(review): assumes this URL still serves the 1.0.2u tarball - confirm
curl -L "https://www.openssl.org/source/$openssl_targz" -o $openssl_targz
tar -xf $openssl_targz
# configure with an rpath to the private lib directory, then build and
# install; runs in a subshell so the working directory is not changed
( cd $openssl_name; \
./config --prefix=$openssl_dir --openssldir=${openssl_dir}/ssl \
--libdir=lib -Wl,-rpath=${openssl_dir}/lib shared zlib-dynamic && \
make && \
make install )
rm -rf $openssl_name
# replace the installed (empty) certs directory with a link to the system certificates
rmdir $openssl_ssl/certs && ln -s /etc/ssl/certs $openssl_ssl/certs
# Download PyEnv from its GitHub repository.
export PYENV_ROOT=${{ env.PYENV_ROOT }}
export PATH=$PYENV_ROOT/bin:$PATH
git clone "https://github.com/pyenv/pyenv.git" "$PYENV_ROOT"
# Prevent pyenv build trying (and failing) to update pip
# by pointing GET_PIP at a stub script that just exits successfully
export GET_PIP=get-pip-2.6.py
echo 'import sys; sys.exit(0)' > ${GET_PIP}
# use the absolute path of the stub
GET_PIP=$(realpath $GET_PIP)
# Build and install Python
# against the private OpenSSL headers and libraries built above
export CFLAGS="-I$openssl_inc"
export LDFLAGS="-L$openssl_lib"
export LD_LIBRARY_PATH="$openssl_lib"
pyenv install 2.6.9
- name: Locate Python 2.6
if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' }}
shell: bash
run: |
PYTHONHOME="${{ env.PYENV_ROOT }}/versions/2.6.9"
echo "PYTHONHOME=$PYTHONHOME" >> "$GITHUB_ENV"
echo "PATH=${PYTHONHOME}/bin:$PATH" >> "$GITHUB_ENV"
echo "LD_LIBRARY_PATH=${{ env.openssl_dir }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
#-------- Jython ------
- name: Set up Java 8
if: ${{ matrix.python-impl == 'jython' }}
uses: actions/setup-java@v1
uses: actions/setup-java@v3
with:
java-version: 8
distribution: 'zulu'
- name: Setup Jython environment
if: ${{ matrix.python-impl == 'jython' }}
shell: bash
run: |
echo "JYTHON_ROOT=${HOME}/jython" >> "$GITHUB_ENV"
echo "PIP=pip" >> "$GITHUB_ENV"
- name: Cache Jython
id: cachejy
if: ${{ matrix.python-impl == 'jython' && matrix.python-version == '2.7' }}
uses: actions/cache@v3
with:
# 2.7.3 now available, may solve SNI issue
key: jython-2.7.1
path: |
${{ env.JYTHON_ROOT }}
- name: Install Jython
if: ${{ matrix.python-impl == 'jython' }}
if: ${{ matrix.python-impl == 'jython' && matrix.python-version == '2.7' && ! steps.cachejy.outputs.cache-hit }}
shell: bash
run: |
wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar
java -jar jython-installer.jar -s -d "$HOME/jython"
echo "$HOME/jython/bin" >> $GITHUB_PATH
- name: Install nose
if: ${{ matrix.python-impl != 'jython' }}
run: pip install nose
- name: Install nose (Jython)
if: ${{ matrix.python-impl == 'jython' }}
# Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb)
JYTHON_ROOT="${{ env.JYTHON_ROOT }}"
curl -L "https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar" -o jython-installer.jar
java -jar jython-installer.jar -s -d "${JYTHON_ROOT}"
echo "${JYTHON_ROOT}/bin" >> "$GITHUB_PATH"
- name: Set up cached Jython
if: ${{ steps.cachejy.outputs.cache-hit }}
shell: bash
run: |
wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl
pip install nose-1.3.7-py2-none-any.whl
JYTHON_ROOT="${{ env.JYTHON_ROOT }}"
echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH
- name: Install supporting Python 2.7 if possible
if: ${{ steps.cachejy.outputs.cache-hit }}
shell: bash
run: |
sudo apt-get install -y python2.7 || true
#-------- pip ---------
- name: Set up supported Python ${{ matrix.python-version }} pip
if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.7' }}
# This step may run in either Linux or Windows
shell: bash
run: |
echo "$PATH"
echo "$PYTHONHOME"
# curl is available on both Windows and Linux, -L follows redirects, -O gets name
python -m ensurepip || python -m pip --version || { \
get_pip="${{ contains(needs.select.outputs.own-pip-versions, matrix.python-version) && format('{0}/', matrix.python-version) || '' }}"; \
curl -L -O "https://bootstrap.pypa.io/pip/${get_pip}get-pip.py"; \
python get-pip.py; }
- name: Set up Python 2.6 pip
if: ${{ matrix.python-version == '2.6' }}
shell: bash
run: |
python -m pip --version || { \
curl -L -O "https://bootstrap.pypa.io/pip/2.6/get-pip.py"; \
curl -L -O "https://files.pythonhosted.org/packages/ac/95/a05b56bb975efa78d3557efa36acaf9cf5d2fd0ee0062060493687432e03/pip-9.0.3-py2.py3-none-any.whl"; \
python get-pip.py --no-setuptools --no-wheel pip-9.0.3-py2.py3-none-any.whl; }
# work-around to invoke pip module on 2.6: https://bugs.python.org/issue2751
echo "PIP=python -m pip.__main__" >> "$GITHUB_ENV"
- name: Set up other Python ${{ matrix.python-version }} pip
if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }}
shell: bash
run: |
python -m pip --version || { \
curl -L -O "https://bootstrap.pypa.io/pip/3.2/get-pip.py"; \
curl -L -O "https://files.pythonhosted.org/packages/b2/d0/cd115fe345dd6f07ec1c780020a7dfe74966fceeb171e0f20d1d4905b0b7/pip-7.1.2-py2.py3-none-any.whl"; \
python get-pip.py --no-setuptools --no-wheel pip-7.1.2-py2.py3-none-any.whl; }
#-------- unittest ----
# Installs unittest2 and its dependencies for Python 2.6 from explicitly
# downloaded wheels, then rewrites the test sources to import it.
- name: Upgrade Unittest for Python 2.6
if: ${{ matrix.python-version == '2.6' }}
shell: bash
run: |
# Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb)
# Skipped if unittest2 is already installed; otherwise each wheel is
# fetched with curl and installed from the local file, in the order listed
$PIP -qq show unittest2 || { \
for u in "65/26/32b8464df2a97e6dd1b656ed26b2c194606c16fe163c695a992b36c11cdf/six-1.13.0-py2.py3-none-any.whl" \
"f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl" \
"c7/a3/c5da2a44c85bfbb6eebcfc1dde24933f8704441b98fdde6528f4831757a6/linecache2-1.0.0-py2.py3-none-any.whl" \
"17/0a/6ac05a3723017a967193456a2efa0aa9ac4b51456891af1e2353bb9de21e/traceback2-1.4.0-py2.py3-none-any.whl" \
"72/20/7f0f433060a962200b7272b8c12ba90ef5b903e218174301d0abfd523813/unittest2-1.1.0-py2.py3-none-any.whl"; do \
curl -L -O "https://files.pythonhosted.org/packages/${u}"; \
$PIP install ${u##*/}; \
done; }
# make tests use unittest2
# (a line that is exactly `import unittest` becomes `import unittest2 as unittest`)
for test in ./test/test_*.py ./test/helper.py; do
sed -r -i -e '/^import unittest$/s/test/test2 as unittest/' "$test"
done
#-------- nose --------
- name: Install nose for Python ${{ matrix.python-version }}
if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || (matrix.python-impl == 'cpython' && (matrix.python-version == '2.7' || matrix.python-version == env.next)) }}
shell: bash
run: |
echo "$PATH"
echo "$PYTHONHOME"
# Use PyNose for recent Pythons instead of Nose
py3ver="${{ matrix.python-version }}"
py3ver=${py3ver#3.}
[ "$py3ver" != "${{ matrix.python-version }}" ] && py3ver=${py3ver%.*} || py3ver=0
[ "$py3ver" -ge 9 ] && nose=pynose || nose=nose
$PIP -qq show $nose || $PIP install $nose
- name: Install nose for other Python 2
if: ${{ matrix.python-impl == 'jython' || (matrix.python-impl == 'cpython' && matrix.python-version == '2.6') }}
shell: bash
run: |
# Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb)
$PIP -qq show nose || { \
curl -L -O "https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl"; \
$PIP install nose-1.3.7-py2-none-any.whl; }
- name: Install nose for other Python 3
if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }}
shell: bash
run: |
$PIP -qq show nose || { \
curl -L -O "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl"; \
$PIP install nose-1.3.7-py3-none-any.whl; }
- name: Set up nosetest test
if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }}
shell: bash
run: |
# set PYTHON_VER
PYTHON_VER=${{ matrix.python-version }}
[ "${PYTHON_VER#*-}" != "$PYTHON_VER" ] || PYTHON_VER="${{ matrix.python-impl }}-${PYTHON_VER}"
echo "PYTHON_VER=$PYTHON_VER" >> "$GITHUB_ENV"
echo "PYTHON_IMPL=${{ matrix.python-impl }}" >> "$GITHUB_ENV"
# define a test to validate the Python version used by nosetests
printf '%s\n' \
'from __future__ import unicode_literals' \
'import sys, os, platform' \
'try:' \
' import unittest2 as unittest' \
'except ImportError:' \
' import unittest' \
'class TestPython(unittest.TestCase):' \
' def setUp(self):' \
' self.ver = os.environ["PYTHON_VER"].split("-")' \
' def test_python_ver(self):' \
' self.assertEqual(["%d" % v for v in sys.version_info[:2]], self.ver[-1].split(".")[:2])' \
' self.assertTrue(sys.version.startswith(self.ver[-1]))' \
' self.assertIn(self.ver[0], ",".join((sys.version, platform.python_implementation())).lower())' \
' def test_python_impl(self):' \
' self.assertIn(platform.python_implementation().lower(), (os.environ["PYTHON_IMPL"], self.ver[0]))' \
> test/test_python.py
#-------- TESTS -------
- name: Run tests
if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }}
continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }}
env:
YTDL_TEST_SET: ${{ matrix.ytdl-test-set }}
run: ./devscripts/run_tests.${{ matrix.run-tests-ext }}
run: |
./devscripts/run_tests.${{ matrix.run-tests-ext }}
flake8:
name: Linter
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install flake8
run: pip install flake8
- name: Run flake8
run: flake8 .

139
README.md
View file

@ -33,7 +33,7 @@ Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.ex
You can also use pip:
sudo -H pip install --upgrade youtube-dl
This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
macOS users can install youtube-dl with [Homebrew](https://brew.sh/):
@ -563,7 +563,7 @@ The basic usage is not to set any template arguments when downloading a single f
- `is_live` (boolean): Whether this video is a live stream or a fixed-length video
- `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL
- `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL
- `format` (string): A human-readable description of the format
- `format` (string): A human-readable description of the format
- `format_id` (string): Format code specified by `--format`
- `format_note` (string): Additional info about the format
- `width` (numeric): Width of the video
@ -632,7 +632,7 @@ To use percent literals in an output template use `%%`. To output to stdout use
The current default template is `%(title)s-%(id)s.%(ext)s`.
In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title.
#### Output template and Windows batch files
@ -675,7 +675,7 @@ The general syntax for format selection is `--format FORMAT` or shorter `-f FORM
**tl;dr:** [navigate me to examples](#format-selection-examples).
The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific.
The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific.
You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file.
@ -760,7 +760,7 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb
- Absolute dates: Dates in the format `YYYYMMDD`.
- Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?`
Examples:
```bash
@ -918,7 +918,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op
Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`.
In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
To extract cookies from your browser, use any conforming browser extension for exporting cookies. For example, [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.
@ -1000,6 +1000,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file
python test/test_download.py
nosetests
For Python versions 3.6 and later, you can use [pynose](https://pypi.org/project/pynose/) to implement `nosetests`. The original [nose](https://pypi.org/project/nose/) has not been upgraded for 3.10 and later.
See item 6 of the [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor-specific test cases.
If you want to create a build of youtube-dl yourself, you'll need
@ -1091,7 +1093,7 @@ In any case, thank you very much for your contributions!
## youtube-dl coding conventions
This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code.
This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all.
@ -1114,7 +1116,7 @@ Say you have some source dictionary `meta` that you've fetched as JSON with HTTP
```python
meta = self._download_json(url, video_id)
```
Assume at this point `meta`'s layout is:
```python
@ -1158,7 +1160,7 @@ description = self._search_regex(
```
On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
### Provide fallbacks
When extracting metadata try to do so from multiple sources. For example if `title` is present in several places, try extracting from at least some of them. This makes it more future-proof in case some of the sources become unavailable.
@ -1206,7 +1208,7 @@ r'(id|ID)=(?P<id>\d+)'
#### Make regular expressions relaxed and flexible
When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on.
##### Example
Say you need to extract `title` from the following HTML code:
@ -1230,7 +1232,7 @@ title = self._search_regex(
webpage, 'title', group='title')
```
Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute:
Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute:
The code definitely should not look like:
@ -1331,27 +1333,114 @@ Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`]
Use `url_or_none` for safe URL processing.
Use `try_get` for safe metadata extraction from parsed JSON.
Use `traverse_obj` for safe metadata extraction from parsed JSON.
Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions.
#### More examples
##### Safely extract optional description from parsed JSON
When processing complex JSON, as often returned by site API requests or stashed in web pages for "hydration", you can use the `traverse_obj()` utility function to handle multiple fallback values and to ensure the expected type of metadata items. The function's docstring defines how the function works: also review usage in the codebase for more examples.
In this example, a text `description`, or `None`, is pulled from the `.result.video[0].summary` member of the parsed JSON `response`, if available.
```python
description = traverse_obj(response, ('result', 'video', 0, 'summary', T(compat_str)))
```
`T(...)` is a shorthand for a set literal; if you don't need to support Python 2.6, which lacks set literals, `T(type_or_transformation)` could be written as a set literal `{type_or_transformation}`.
Some extractors use the older and less capable `try_get()` function in the same way.
```python
description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str)
```
##### Safely extract more optional metadata
In this example, various optional metadata values are extracted from the `.result.video[0]` member of the parsed JSON `response`, which is expected to be a JS object, parsed into a `dict`, with no crash if that isn't so, or if any of the target values are missing or invalid.
```python
video = try_get(response, lambda x: x['result']['video'][0], dict) or {}
video = traverse_obj(response, ('result', 'video', 0, T(dict))) or {}
# formerly:
# video = try_get(response, lambda x: x['result']['video'][0], dict) or {}
description = video.get('summary')
duration = float_or_none(video.get('durationMs'), scale=1000)
view_count = int_or_none(video.get('views'))
```
##### Safely extract nested lists
Suppose you've extracted JSON like this into a Python data structure named `media_json` using, say, the `_download_json()` or `_parse_json()` methods of `InfoExtractor`:
```json
{
"title": "Example video",
"comment": "try extracting this",
"media": [{
"type": "bad",
"size": 320,
"url": "https://some.cdn.site/bad.mp4"
}, {
"type": "streaming",
"url": "https://some.cdn.site/hls.m3u8"
}, {
"type": "super",
"size": 1280,
"url": "https://some.cdn.site/good.webm"
}],
"moreStuff": "more values",
...
}
```
Then extractor code like this can collect the various fields of the JSON:
```python
...
from ..utils import (
determine_ext,
int_or_none,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
...
...
info_dict = {}
# extract title and description if valid and not empty
info_dict.update(traverse_obj(media_json, {
'title': ('title', T(txt_or_none)),
'description': ('comment', T(txt_or_none)),
}))
# extract any recognisable media formats
fmts = []
# traverse into "media" list, extract `dict`s with desired keys
for fmt in traverse_obj(media_json, ('media', Ellipsis, {
'format_id': ('type', T(txt_or_none)),
'url': ('url', T(url_or_none)),
'width': ('size', T(int_or_none)), })):
# bad `fmt` values were `None` and removed
if 'url' not in fmt:
continue
fmt_url = fmt['url'] # known to be valid URL
ext = determine_ext(fmt_url)
if ext == 'm3u8':
fmts.extend(self._extract_m3u8_formats(fmt_url, video_id, 'mp4', fatal=False))
else:
fmt['ext'] = ext
fmts.append(fmt)
# sort, raise if no formats
self._sort_formats(fmts)
info_dict['formats'] = fmts
...
```
The extractor raises an exception, rather than crashing randomly, if the JSON structure changes so that no formats are found.
# EMBEDDING YOUTUBE-DL
youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/ytdl-org/youtube-dl/issues/new).
@ -1408,7 +1497,11 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
# BUGS
Bugs and suggestions should be reported at: <https://github.com/ytdl-org/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
Bugs and suggestions should be reported in the issue tracker: <https://github.com/ytdl-org/youtube-dl/issues> (<https://yt-dl.org/bug> is an alias for this). Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
## Opening a bug report or suggestion
Be sure to follow instructions provided **below** and **in the issue tracker**. Complete the appropriate issue template fully. Consider whether your problem is covered by an existing issue: if so, follow the discussion there. Avoid commenting on existing duplicate issues as such comments do not add to the discussion of the issue and are liable to be treated as spam.
**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
```
@ -1428,17 +1521,17 @@ $ youtube-dl -v <your command line>
The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist):
Finally, please review your issue to avoid the common mistakes listed below (you can and should use this as a checklist).
### Is the description of the issue itself sufficient?
We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts.
We often get issue reports that are hard to understand. To avoid subsequent clarifications, and to assist participants who are not native English speakers, please elaborate on what feature you are requesting, or what bug you want to be fixed.
So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious
Make sure that it's obvious
- What the problem is
- How it could be fixed
- How your proposed solution would look like
- How your proposed solution would look
If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over.
@ -1448,14 +1541,14 @@ If your server has multiple IPs or you suspect censorship, adding `--call-home`
**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL.
### Is the issue already documented?
Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. Initially, at least, use the search term `-label:duplicate` to focus on active issues. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
### Are you using the latest version?
Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well.
### Is the issue already documented?
Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
### Why are existing options not enough?
Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.

1
devscripts/__init__.py Normal file
View file

@ -0,0 +1 @@
# Empty file needed to make devscripts.utils properly importable from outside

View file

@ -5,8 +5,12 @@ import os
from os.path import dirname as dirn
import sys
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
import youtube_dl
from youtube_dl.compat import compat_open as open
from utils import read_file
BASH_COMPLETION_FILE = "youtube-dl.bash-completion"
BASH_COMPLETION_TEMPLATE = "devscripts/bash-completion.in"
@ -18,9 +22,8 @@ def build_completion(opt_parser):
for option in group.option_list:
# for every long flag
opts_flag.append(option.get_opt_string())
with open(BASH_COMPLETION_TEMPLATE) as f:
template = f.read()
with open(BASH_COMPLETION_FILE, "w") as f:
template = read_file(BASH_COMPLETION_TEMPLATE)
with open(BASH_COMPLETION_FILE, "w", encoding='utf-8') as f:
# just using the special char
filled_template = template.replace("{{flags}}", " ".join(opts_flag))
f.write(filled_template)

83
devscripts/cli_to_api.py Executable file
View file

@ -0,0 +1,83 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
"""
This script displays the API parameters corresponding to a yt-dl command line
Example:
$ ./cli_to_api.py -f best
{u'format': 'best'}
$
"""
# Allow direct execution
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import youtube_dl
from types import MethodType
def cli_to_api(*opts):
    """Return the API options that a command line sets to non-default values.

    `opts` are the command-line arguments. They are parsed by running
    `youtube_dl._real_main()` with a replacement `YoutubeDL` constructor that
    raises as soon as it receives the parsed options, so nothing is downloaded.

    Returns a dict of the options whose values differ from those produced by
    an empty command line; for 'postprocessors', only the entries that are
    not in the default list are kept.

    NOTE: the replacement constructor is left installed on
    `youtube_dl.YoutubeDL` when this function returns.
    """
    YDL = youtube_dl.YoutubeDL

    # to extract the parsed options, break out of YoutubeDL instantiation

    # return options via this Exception
    class ParseYTDLResult(Exception):
        def __init__(self, result):
            super(ParseYTDLResult, self).__init__('result')
            self.opts = result

    # replacement constructor that raises ParseYTDLResult
    def ytdl_init(ydl, ydl_opts):
        super(YDL, ydl).__init__(ydl_opts)
        raise ParseYTDLResult(ydl_opts)

    # patch in the constructor
    YDL.__init__ = MethodType(ytdl_init, YDL)

    # core parser
    def parsed_options(argv):
        try:
            youtube_dl._real_main(list(argv))
        except ParseYTDLResult as result:
            return result.opts

    # from https://github.com/yt-dlp/yt-dlp/issues/5859#issuecomment-1363938900
    default = parsed_options([])

    def neq_opt(a, b):
        """Tell whether option value `b` differs from its default `a`."""
        if a == b:
            return False
        # Inspect the type of the candidate value `b`: testing the builtin
        # `object` here would always give `type`, making this branch dead.
        if a is None and repr(type(b)).endswith(".utils.DateRange'>"):
            # a DateRange that formats as the full range counts as "unset"
            return '0001-01-01 - 9999-12-31' != '{0}'.format(b)
        return a != b

    diff = dict((k, v) for k, v in parsed_options(opts).items() if neq_opt(default[k], v))
    if 'postprocessors' in diff:
        diff['postprocessors'] = [pp for pp in diff['postprocessors'] if pp not in default['postprocessors']]
    return diff
def main():
    """Pretty-print the API options matching the command line in `sys.argv`.

    Objects whose type name ends with `.utils.DateRange` are rendered with
    their string form appended to the repr; everything else uses the stock
    `PrettyPrinter` formatting.
    """
    from pprint import PrettyPrinter

    printer = PrettyPrinter()
    stock_format = printer.format

    def format_with_daterange(obj, context, maxlevels, level):
        is_daterange = repr(type(obj)).endswith(".utils.DateRange'>")
        if not is_daterange:
            return stock_format(obj, context, maxlevels, level)
        # (formatted text, is readable, is recursive)
        return '{0}: {1}>'.format(repr(obj)[:-2], obj), True, False

    printer.format = format_with_daterange
    printer.pprint(cli_to_api(*sys.argv))


if __name__ == '__main__':
    main()

View file

@ -1,7 +1,6 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import io
import json
import mimetypes
import netrc
@ -10,7 +9,9 @@ import os
import re
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
from youtube_dl.compat import (
compat_basestring,
@ -22,6 +23,7 @@ from youtube_dl.utils import (
make_HTTPS_handler,
sanitized_Request,
)
from utils import read_file
class GitHubReleaser(object):
@ -89,8 +91,7 @@ def main():
changelog_file, version, build_path = args
with io.open(changelog_file, encoding='utf-8') as inf:
changelog = inf.read()
changelog = read_file(changelog_file)
mobj = re.search(r'(?s)version %s\n{2}(.+?)\n{3}' % version, changelog)
body = mobj.group(1) if mobj else ''

View file

@ -6,10 +6,13 @@ import os
from os.path import dirname as dirn
import sys
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
import youtube_dl
from youtube_dl.utils import shell_quote
from utils import read_file, write_file
FISH_COMPLETION_FILE = 'youtube-dl.fish'
FISH_COMPLETION_TEMPLATE = 'devscripts/fish-completion.in'
@ -38,11 +41,9 @@ def build_completion(opt_parser):
complete_cmd.extend(EXTRA_ARGS.get(long_option, []))
commands.append(shell_quote(complete_cmd))
with open(FISH_COMPLETION_TEMPLATE) as f:
template = f.read()
template = read_file(FISH_COMPLETION_TEMPLATE)
filled_template = template.replace('{{commands}}', '\n'.join(commands))
with open(FISH_COMPLETION_FILE, 'w') as f:
f.write(filled_template)
write_file(FISH_COMPLETION_FILE, filled_template)
parser = youtube_dl.parseOpts()[0]

View file

@ -6,16 +6,21 @@ import sys
import hashlib
import os.path
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(dirn(os.path.abspath(__file__)))))
from devscripts.utils import read_file, write_file
from youtube_dl.compat import compat_open as open
if len(sys.argv) <= 1:
print('Specify the version number as parameter')
sys.exit()
version = sys.argv[1]
with open('update/LATEST_VERSION', 'w') as f:
f.write(version)
write_file('update/LATEST_VERSION', version)
versions_info = json.load(open('update/versions.json'))
versions_info = json.loads(read_file('update/versions.json'))
if 'signature' in versions_info:
del versions_info['signature']
@ -39,5 +44,5 @@ for key, filename in filenames.items():
versions_info['versions'][version] = new_version
versions_info['latest'] = version
with open('update/versions.json', 'w') as jsonf:
json.dump(versions_info, jsonf, indent=4, sort_keys=True)
with open('update/versions.json', 'w', encoding='utf-8') as jsonf:
    # json.dump() serialises straight into the open file; json.dumps() only
    # returns a string and accepts no file argument, so it would raise
    # TypeError here and leave versions.json empty.
    json.dump(versions_info, jsonf, indent=4, sort_keys=True)

View file

@ -2,14 +2,21 @@
from __future__ import unicode_literals
import json
import os.path
import sys
versions_info = json.load(open('update/versions.json'))
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
from utils import read_file, write_file
versions_info = json.loads(read_file('update/versions.json'))
version = versions_info['latest']
version_dict = versions_info['versions'][version]
# Read template page
with open('download.html.in', 'r', encoding='utf-8') as tmplf:
template = tmplf.read()
template = read_file('download.html.in')
template = template.replace('@PROGRAM_VERSION@', version)
template = template.replace('@PROGRAM_URL@', version_dict['bin'][0])
@ -18,5 +25,5 @@ template = template.replace('@EXE_URL@', version_dict['exe'][0])
template = template.replace('@EXE_SHA256SUM@', version_dict['exe'][1])
template = template.replace('@TAR_URL@', version_dict['tar'][0])
template = template.replace('@TAR_SHA256SUM@', version_dict['tar'][1])
with open('download.html', 'w', encoding='utf-8') as dlf:
dlf.write(template)
write_file('download.html', template)

View file

@ -5,17 +5,22 @@ from __future__ import with_statement, unicode_literals
import datetime
import glob
import io # For Python 2 compatibility
import os
import re
import sys
year = str(datetime.datetime.now().year)
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(dirn(os.path.abspath(__file__)))))
from devscripts.utils import read_file, write_file
from youtube_dl import compat_str
year = compat_str(datetime.datetime.now().year)
for fn in glob.glob('*.html*'):
with io.open(fn, encoding='utf-8') as f:
content = f.read()
content = read_file(fn)
newc = re.sub(r'(?P<copyright>Copyright © 2011-)(?P<year>[0-9]{4})', 'Copyright © 2011-' + year, content)
if content != newc:
tmpFn = fn + '.part'
with io.open(tmpFn, 'wt', encoding='utf-8') as outf:
outf.write(newc)
write_file(tmpFn, newc)
os.rename(tmpFn, fn)

View file

@ -2,10 +2,16 @@
from __future__ import unicode_literals
import datetime
import io
import json
import os.path
import textwrap
import sys
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
from utils import write_file
atom_template = textwrap.dedent("""\
<?xml version="1.0" encoding="utf-8"?>
@ -72,5 +78,4 @@ for v in versions:
entries_str = textwrap.indent(''.join(entries), '\t')
atom_template = atom_template.replace('@ENTRIES@', entries_str)
with io.open('update/releases.atom', 'w', encoding='utf-8') as atom_file:
atom_file.write(atom_template)
write_file('update/releases.atom', atom_template)

View file

@ -5,15 +5,17 @@ import sys
import os
import textwrap
dirn = os.path.dirname
# We must be able to import youtube_dl
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.insert(0, dirn(dirn(dirn(os.path.abspath(__file__)))))
import youtube_dl
from devscripts.utils import read_file, write_file
def main():
with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf:
template = tmplf.read()
template = read_file('supportedsites.html.in')
ie_htmls = []
for ie in youtube_dl.list_extractors(age_limit=None):
@ -29,8 +31,7 @@ def main():
template = template.replace('@SITES@', textwrap.indent('\n'.join(ie_htmls), '\t'))
with open('supportedsites.html', 'w', encoding='utf-8') as sitesf:
sitesf.write(template)
write_file('supportedsites.html', template)
if __name__ == '__main__':

View file

@ -1,10 +1,11 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import io
import optparse
import re
from utils import read_file, write_file
def main():
parser = optparse.OptionParser(usage='%prog INFILE OUTFILE')
@ -14,8 +15,7 @@ def main():
infile, outfile = args
with io.open(infile, encoding='utf-8') as inf:
readme = inf.read()
readme = read_file(infile)
bug_text = re.search(
r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1)
@ -25,8 +25,7 @@ def main():
out = bug_text + dev_text
with io.open(outfile, 'w', encoding='utf-8') as outf:
outf.write(out)
write_file(outfile, out)
if __name__ == '__main__':

View file

@ -1,8 +1,11 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import io
import optparse
import os.path
import sys
from utils import read_file, read_version, write_file
def main():
@ -13,17 +16,11 @@ def main():
infile, outfile = args
with io.open(infile, encoding='utf-8') as inf:
issue_template_tmpl = inf.read()
issue_template_tmpl = read_file(infile)
# Get the version from youtube_dl/version.py without importing the package
exec(compile(open('youtube_dl/version.py').read(),
'youtube_dl/version.py', 'exec'))
out = issue_template_tmpl % {'version': read_version()}
out = issue_template_tmpl % {'version': locals()['__version__']}
with io.open(outfile, 'w', encoding='utf-8') as outf:
outf.write(out)
write_file(outfile, out)
if __name__ == '__main__':
main()

View file

@ -1,28 +1,49 @@
from __future__ import unicode_literals, print_function
from inspect import getsource
import io
import os
from os.path import dirname as dirn
import re
import sys
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
lazy_extractors_filename = sys.argv[1]
if os.path.exists(lazy_extractors_filename):
os.remove(lazy_extractors_filename)
# Py2: may be confused by leftover lazy_extractors.pyc
if sys.version_info[0] < 3:
    # remove both compiled variants (.pyc and .pyo) next to the source file
    for c in ('c', 'o'):
        try:
            # append the loop variable, not the literal 'c', so that the
            # .pyo file is removed as well
            os.remove(lazy_extractors_filename + c)
        except OSError:
            # the compiled file did not exist or could not be removed
            pass
from devscripts.utils import read_file, write_file
from youtube_dl.compat import compat_register_utf8
compat_register_utf8()
from youtube_dl.extractor import _ALL_CLASSES
from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
with open('devscripts/lazy_load_template.py', 'rt') as f:
module_template = f.read()
module_template = read_file('devscripts/lazy_load_template.py')
def get_source(m):
    """Return the source text of `m` with comment-only lines removed."""
    source_text = getsource(m)
    comment_line = re.compile(r'(?m)^\s*#.*\n')
    return comment_line.sub('', source_text)
module_contents = [
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n']
module_template,
get_source(InfoExtractor.suitable),
get_source(InfoExtractor._match_valid_url) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780)
'from youtube_dl.utils import parse_qs, variadic\n',
]
ie_template = '''
class {name}({bases}):
@ -54,7 +75,7 @@ def build_lazy_ie(ie, name):
valid_url=valid_url,
module=ie.__module__)
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
s += '\n' + getsource(ie.suitable)
s += '\n' + get_source(ie.suitable)
if hasattr(ie, '_make_valid_url'):
# search extractors
s += make_valid_template.format(valid_url=ie._make_valid_url())
@ -94,7 +115,17 @@ for ie in ordered_cls:
module_contents.append(
'_ALL_CLASSES = [{0}]'.format(', '.join(names)))
module_src = '\n'.join(module_contents) + '\n'
module_src = '\n'.join(module_contents)
with io.open(lazy_extractors_filename, 'wt', encoding='utf-8') as f:
f.write(module_src)
write_file(lazy_extractors_filename, module_src + '\n')
# work around JVM byte code module limit in Jython
if sys.platform.startswith('java') and sys.version_info[:2] == (2, 7):
import subprocess
from youtube_dl.compat import compat_subprocess_get_DEVNULL
# best effort only: any failure of the external compile step is ignored
try:
# if Python 2.7 is available, use it to compile the module for Jython
subprocess.check_call(['python2.7', '-m', 'py_compile', lazy_extractors_filename], stdout=compat_subprocess_get_DEVNULL())
except Exception:
pass

View file

@ -1,8 +1,14 @@
from __future__ import unicode_literals
import io
import sys
import os.path
import re
import sys
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
from utils import read_file
from youtube_dl.compat import compat_open as open
README_FILE = 'README.md'
helptext = sys.stdin.read()
@ -10,8 +16,7 @@ helptext = sys.stdin.read()
if isinstance(helptext, bytes):
helptext = helptext.decode('utf-8')
with io.open(README_FILE, encoding='utf-8') as f:
oldreadme = f.read()
oldreadme = read_file(README_FILE)
header = oldreadme[:oldreadme.index('# OPTIONS')]
footer = oldreadme[oldreadme.index('# CONFIGURATION'):]
@ -20,7 +25,7 @@ options = helptext[helptext.index(' General Options:') + 19:]
options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options)
options = '# OPTIONS\n' + options + '\n'
with io.open(README_FILE, 'w', encoding='utf-8') as f:
with open(README_FILE, 'w', encoding='utf-8') as f:
f.write(header)
f.write(options)
f.write(footer)

View file

@ -1,17 +1,19 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import io
import optparse
import os
import os.path
import sys
# Import youtube_dl
ROOT_DIR = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, ROOT_DIR)
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
import youtube_dl
from utils import write_file
def main():
parser = optparse.OptionParser(usage='%prog OUTFILE.md')
@ -38,8 +40,7 @@ def main():
' - ' + md + '\n'
for md in gen_ies_md(ies))
with io.open(outfile, 'w', encoding='utf-8') as outf:
outf.write(out)
write_file(outfile, out)
if __name__ == '__main__':

View file

@ -1,13 +1,13 @@
from __future__ import unicode_literals
import io
import optparse
import os.path
import re
from utils import read_file, write_file
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
README_FILE = os.path.join(ROOT_DIR, 'README.md')
PREFIX = r'''%YOUTUBE-DL(1)
# NAME
@ -29,8 +29,7 @@ def main():
outfile, = args
with io.open(README_FILE, encoding='utf-8') as f:
readme = f.read()
readme = read_file(README_FILE)
readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
@ -38,8 +37,7 @@ def main():
readme = filter_options(readme)
with io.open(outfile, 'w', encoding='utf-8') as outf:
outf.write(readme)
write_file(outfile, readme)
def filter_options(readme):

62
devscripts/utils.py Normal file
View file

@ -0,0 +1,62 @@
# coding: utf-8
from __future__ import unicode_literals
import argparse
import functools
import os.path
import subprocess
import sys
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
from youtube_dl.compat import (
compat_kwargs,
compat_open as open,
)
def read_file(fname):
    """Return the whole content of `fname` as text decoded from UTF-8."""
    with open(fname, encoding='utf-8') as src:
        text = src.read()
    return text
def write_file(fname, content, mode='w'):
    """Write `content` to `fname` as UTF-8 text, opened with `mode`.

    Returns the value returned by the file object's write().
    """
    with open(fname, mode, encoding='utf-8') as dest:
        written = dest.write(content)
    return written
def read_version(fname='youtube_dl/version.py'):
    """Get the version without importing the package

    Executes the file `fname` and returns the value it assigns to
    `__version__`. Raises KeyError if the file does not define it.
    """
    # Run the file in an explicit namespace rather than the function's own
    # locals: from Python 3.13 (PEP 667) names created by exec() inside a
    # function are not visible through a later locals() call.
    namespace = {}
    exec(compile(read_file(fname), fname, 'exec'), namespace)
    return namespace['__version__']
def get_filename_args(has_infile=False, default_outfile=None):
    """Parse the script's file name arguments from the command line.

    With `has_infile`, a positional `infile` is expected before `outfile`
    and `(infile, outfile)` is returned; otherwise only `outfile` is
    returned. When `default_outfile` is truthy, `outfile` becomes optional
    and defaults to it.
    """
    parser = argparse.ArgumentParser()
    if has_infile:
        parser.add_argument('infile', help='Input file')

    if default_outfile:
        outfile_spec = {'nargs': '?', 'default': default_outfile}
    else:
        outfile_spec = {}
    outfile_spec['help'] = 'Output file'
    parser.add_argument('outfile', **compat_kwargs(outfile_spec))

    parsed = parser.parse_args()
    if not has_infile:
        return parsed.outfile
    return parsed.infile, parsed.outfile
def compose_functions(*functions):
    """Return a one-argument callable that applies `functions` left to right.

    The first function receives the argument, each later one receives the
    previous result; with no functions the argument is returned unchanged.
    """
    def composed(x):
        value = x
        for func in functions:
            value = func(value)
        return value

    return composed
def run_process(*args, **kwargs):
# Run the command whose argument vector is `args` through subprocess.run()
# and return its result. Each setting below is only a default: a value the
# caller passes in `kwargs` takes precedence.
kwargs.setdefault('text', True)
kwargs.setdefault('check', True)
kwargs.setdefault('capture_output', True)
# encoding/errors defaults are only supplied for text-mode calls
if kwargs['text']:
kwargs.setdefault('encoding', 'utf-8')
kwargs.setdefault('errors', 'replace')
# NOTE(review): presumably normalises keyword names for ** expansion -
# confirm against youtube_dl.compat
kwargs = compat_kwargs(kwargs)
return subprocess.run(args, **kwargs)

View file

@ -7,6 +7,8 @@ import sys
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
import youtube_dl
from utils import read_file, write_file
ZSH_COMPLETION_FILE = "youtube-dl.zsh"
ZSH_COMPLETION_TEMPLATE = "devscripts/zsh-completion.in"
@ -34,15 +36,13 @@ def build_completion(opt_parser):
flags = [opt.get_opt_string() for opt in opts]
with open(ZSH_COMPLETION_TEMPLATE) as f:
template = f.read()
template = read_file(ZSH_COMPLETION_TEMPLATE)
template = template.replace("{{fileopts}}", "|".join(fileopts))
template = template.replace("{{diropts}}", "|".join(diropts))
template = template.replace("{{flags}}", " ".join(flags))
with open(ZSH_COMPLETION_FILE, "w") as f:
f.write(template)
write_file(ZSH_COMPLETION_FILE, template)
parser = youtube_dl.parseOpts()[0]

View file

@ -1,7 +1,6 @@
from __future__ import unicode_literals
import errno
import io
import hashlib
import json
import os.path
@ -9,14 +8,17 @@ import re
import types
import ssl
import sys
import unittest
import youtube_dl.extractor
from youtube_dl import YoutubeDL
from youtube_dl.compat import (
compat_open as open,
compat_os_name,
compat_str,
)
from youtube_dl.utils import (
IDENTITY,
preferredencoding,
write_string,
)
@ -27,10 +29,10 @@ def get_params(override=None):
"parameters.json")
LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"local_parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
with open(PARAMETERS_FILE, encoding='utf-8') as pf:
parameters = json.load(pf)
if os.path.exists(LOCAL_PARAMETERS_FILE):
with io.open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf:
with open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf:
parameters.update(json.load(pf))
if override:
parameters.update(override)
@ -72,7 +74,8 @@ class FakeYDL(YoutubeDL):
def to_screen(self, s, skip_eol=None):
print(s)
def trouble(self, s, tb=None):
def trouble(self, *args, **kwargs):
s = args[0] if len(args) > 0 else kwargs.get('message', 'Missing message')
raise Exception(s)
def download(self, x):
@ -89,6 +92,17 @@ class FakeYDL(YoutubeDL):
self.report_warning = types.MethodType(report_warning, self)
class FakeLogger(object):
    """Logger stand-in whose debug/warning/error methods discard the message."""

    def debug(self, msg):
        return None

    def warning(self, msg):
        return None

    def error(self, msg):
        return None
def gettestcases(include_onlymatching=False):
for ie in youtube_dl.extractor.gen_extractors():
for tc in ie.get_testcases(include_onlymatching):
@ -128,7 +142,7 @@ def expect_value(self, got, expected, field):
self.assertTrue(
contains_str in got,
'field %s (value: %r) should contain %r' % (field, got, contains_str))
elif isinstance(expected, compat_str) and re.match(r'^lambda \w+:', expected):
elif isinstance(expected, compat_str) and re.match(r'lambda \w+:', expected):
fn = eval(expected)
suite = expected.split(':', 1)[1].strip()
self.assertTrue(
@ -286,3 +300,7 @@ def http_server_port(httpd):
else:
sock = httpd.socket
return sock.getsockname()[1]
def expectedFailureIf(cond):
    """Return `unittest.expectedFailure` when `cond` is truthy, else `IDENTITY`."""
    if cond:
        return unittest.expectedFailure
    return IDENTITY

View file

@ -3,19 +3,37 @@
from __future__ import unicode_literals
# Allow direct execution
import io
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
from youtube_dl.compat import compat_etree_fromstring, compat_http_server
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import YoutubeIE, get_info_extractor
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
import threading
from test.helper import (
expect_dict,
expect_value,
FakeYDL,
http_server_port,
)
from youtube_dl.compat import (
compat_etree_fromstring,
compat_http_server,
compat_open as open,
)
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import (
get_info_extractor,
YoutubeIE,
)
from youtube_dl.utils import (
encode_data_uri,
ExtractorError,
RegexNotFoundError,
strip_jsonp,
)
TEAPOT_RESPONSE_STATUS = 418
TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
@ -35,13 +53,13 @@ class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler)
assert False
class TestIE(InfoExtractor):
class DummyIE(InfoExtractor):
pass
class TestInfoExtractor(unittest.TestCase):
def setUp(self):
self.ie = TestIE(FakeYDL())
self.ie = DummyIE(FakeYDL())
def test_ie_key(self):
self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)
@ -62,6 +80,7 @@ class TestInfoExtractor(unittest.TestCase):
<meta name="og:test1" content='foo > < bar'/>
<meta name="og:test2" content="foo >//< bar"/>
<meta property=og-test3 content='Ill-formatted opengraph'/>
<meta property=og:test4 content=unquoted-value/>
'''
self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@ -74,6 +93,7 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value')
def test_html_search_meta(self):
ie = self.ie
@ -98,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
def test_search_nextjs_data(self):
# Page fixture whose <script id="__NEXT_DATA__"> element carries the JSON
# that _search_nextjs_data() is expected to return as a parsed structure
html = '''
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content=
"text/html; charset=utf-8">
<meta name="viewport" content="width=device-width">
<title>Test _search_nextjs_data()</title>
</head>
<body>
<div id="__next">
<div style="background-color:#17171E" class="FU" dir="ltr">
<div class="sc-93de261d-0 dyzzYE">
<div>
<header class="HD"></header>
<main class="MN">
<div style="height:0" class="HT0">
<div style="width:NaN%" data-testid=
"stream-container" class="WDN"></div>
</div>
</main>
</div>
<footer class="sc-6e5faf91-0 dEGaHS"></footer>
</div>
</div>
</div>
<script id="__NEXT_DATA__" type="application/json">
{"props":{"pageProps":{"video":{"id":"testid"}}}}
</script>
</body>
</html>
'''
# 'testID' is the video id argument; the assertion checks the parsed JSON
search = self.ie._search_nextjs_data(html, 'testID')
self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
def test_search_nuxt_data(self):
# Page fixture assigning window.__NUXT__ from a function call: the wanted
# values ("testid", "Nuxt.js title") are passed as arguments f and g, and
# the returned object also holds a `decoy` key next to `data`
html = '''
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content=
"text/html; charset=utf-8">
<title>Nuxt.js Test Page</title>
<meta name="viewport" content=
"width=device-width, initial-scale=1">
<meta data-hid="robots" name="robots" content="all">
</head>
<body class="BD">
<div id="__layout">
<h1 class="H1">Example heading</h1>
<div class="IN">
<p>Decoy text</p>
</div>
</div>
<script>
window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null));
</script>
<script src="/_nuxt/a12345b.js" defer="defer"></script>
</body>
</html>
'''
# the result is expected to expose the `track` entry with `f` resolved
search = self.ie._search_nuxt_data(html, 'testID')
self.assertEqual(search['track']['id'], 'testid')
def test_search_json_ld_realworld(self):
# https://github.com/ytdl-org/youtube-dl/issues/23306
expect_dict(
@ -346,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):
}],
})
# from https://0000.studio/
# with type attribute but without extension in URL
expect_dict(
self,
self.ie._parse_html5_media_entries(
'https://0000.studio',
r'''
<video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
</video>
''', None)[0],
{
'formats': [{
'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
'ext': 'mp4',
}],
})
def test_extract_jwplayer_data_realworld(self):
# from http://www.suffolk.edu/sjc/
expect_dict(
@ -799,8 +902,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
]
for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
mode='r', encoding='utf-8') as f:
with open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_m3u8_formats(
f.read(), m3u8_url, ext='mp4')
self.ie._sort_formats(formats)
@ -890,7 +993,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 5997.485,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/pull/14844
'urls_only',
@ -973,7 +1077,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 4400,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/20346
# Media considered unfragmented even though it contains
@ -1019,18 +1124,185 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'width': 360,
'height': 360,
'fps': 30,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/30235
# Bento4 generated test mpd
# mp4dash --mpd-name=manifest.mpd --no-split --use-segment-list mediafiles
'url_and_range',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'm4a',
'format_id': 'audio-und-mp4a.40.2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.808,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'mp4',
'format_id': 'video-avc1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 699.597,
'width': 768,
'height': 432
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/27575
# GPAC generated test mpd
# MP4Box -dash 10000 -single-file -out manifest.mpd mediafiles
'range_only',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/audio_dashinit.mp4',
'ext': 'm4a',
'format_id': '2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.096,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/video_dashinit.mp4',
'ext': 'mp4',
'format_id': '1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 526.987,
'width': 768,
'height': 432
}],
{},
), (
'subtitles',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/',
[{
'format_id': 'audio=128001',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'm4a',
'tbr': 128.001,
'asr': 48000,
'format_note': 'DASH audio',
'container': 'm4a_dash',
'vcodec': 'none',
'acodec': 'mp4a.40.2',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=100000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 336,
'height': 144,
'tbr': 100,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=326000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 562,
'height': 240,
'tbr': 326,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=698000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 844,
'height': 360,
'tbr': 698,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=1493000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1126,
'height': 480,
'tbr': 1493,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=4482000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1688,
'height': 720,
'tbr': 4482,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}],
{
'en': [
{
'ext': 'mp4',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}
]
},
)
]
for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES:
with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_mpd_formats(
for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
with open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f:
formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')),
mpd_base_url=mpd_base_url, mpd_url=mpd_url)
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subtitles, expected_subtitles, None)
def test_parse_f4m_formats(self):
_TEST_CASES = [
@ -1051,8 +1323,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
]
for f4m_file, f4m_url, expected_formats in _TEST_CASES:
with io.open('./test/testdata/f4m/%s.f4m' % f4m_file,
mode='r', encoding='utf-8') as f:
with open('./test/testdata/f4m/%s.f4m' % f4m_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_f4m_formats(
compat_etree_fromstring(f.read().encode('utf-8')),
f4m_url, None)
@ -1099,8 +1371,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
]
for xspf_file, xspf_url, expected_entries in _TEST_CASES:
with io.open('./test/testdata/xspf/%s.xspf' % xspf_file,
mode='r', encoding='utf-8') as f:
with open('./test/testdata/xspf/%s.xspf' % xspf_file,
mode='r', encoding='utf-8') as f:
entries = self.ie._parse_xspf(
compat_etree_fromstring(f.read().encode('utf-8')),
xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url)

View file

@ -10,14 +10,31 @@ import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import copy
import json
from test.helper import FakeYDL, assertRegexpMatches
from test.helper import (
FakeYDL,
assertRegexpMatches,
try_rm,
)
from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_str, compat_urllib_error
from youtube_dl.compat import (
compat_http_cookiejar_Cookie,
compat_http_cookies_SimpleCookie,
compat_kwargs,
compat_open as open,
compat_str,
compat_urllib_error,
)
from youtube_dl.extractor import YoutubeIE
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.postprocessor.common import PostProcessor
from youtube_dl.utils import ExtractorError, match_filter_func
from youtube_dl.utils import (
ExtractorError,
match_filter_func,
traverse_obj,
)
TEST_URL = 'http://localhost/sample.mp4'
@ -29,11 +46,14 @@ class YDL(FakeYDL):
self.msgs = []
def process_info(self, info_dict):
self.downloaded_info_dicts.append(info_dict)
self.downloaded_info_dicts.append(info_dict.copy())
def to_screen(self, msg):
self.msgs.append(msg)
def dl(self, *args, **kwargs):
assert False, 'Downloader must not be invoked for test_YoutubeDL'
def _make_result(formats, **kwargs):
res = {
@ -42,8 +62,9 @@ def _make_result(formats, **kwargs):
'title': 'testttitle',
'extractor': 'testex',
'extractor_key': 'TestEx',
'webpage_url': 'http://example.com/watch?v=shenanigans',
}
res.update(**kwargs)
res.update(**compat_kwargs(kwargs))
return res
@ -681,12 +702,12 @@ class TestYoutubeDL(unittest.TestCase):
class SimplePP(PostProcessor):
def run(self, info):
with open(audiofile, 'wt') as f:
with open(audiofile, 'w') as f:
f.write('EXAMPLE')
return [info['filepath']], info
def run_pp(params, PP):
with open(filename, 'wt') as f:
with open(filename, 'w') as f:
f.write('EXAMPLE')
ydl = YoutubeDL(params)
ydl.add_post_processor(PP())
@ -705,7 +726,7 @@ class TestYoutubeDL(unittest.TestCase):
class ModifierPP(PostProcessor):
def run(self, info):
with open(info['filepath'], 'wt') as f:
with open(info['filepath'], 'w') as f:
f.write('MODIFIED')
return [], info
@ -930,17 +951,11 @@ class TestYoutubeDL(unittest.TestCase):
# Test case for https://github.com/ytdl-org/youtube-dl/issues/27064
def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self):
class _YDL(YDL):
def __init__(self, *args, **kwargs):
super(_YDL, self).__init__(*args, **kwargs)
def trouble(self, s, tb=None):
pass
ydl = _YDL({
ydl = YDL({
'format': 'extra',
'ignoreerrors': True,
})
ydl.trouble = lambda *_, **__: None
class VideoIE(InfoExtractor):
_VALID_URL = r'video:(?P<id>\d+)'
@ -1017,5 +1032,160 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(out_info['release_date'], '20210930')
class TestYoutubeDLCookies(unittest.TestCase):
    """Tests for cookie loading, scoping and info.json round-tripping in YoutubeDL."""

    @staticmethod
    def encode_cookie(cookie):
        """Yield (attribute name, text value) pairs for a cookie.

        `cookie` may be a dict or any object whose attributes are exposed
        through vars(); every value is converted with compat_str.
        """
        if not isinstance(cookie, dict):
            cookie = vars(cookie)
        for name, value in cookie.items():
            yield name, compat_str(value)

    @classmethod
    def comparable_cookies(cls, cookies):
        """Return a sorted list of sorted attribute tuples, one per cookie."""
        # Work around cookiejar cookies not being unicode strings
        return sorted(map(tuple, map(sorted, map(cls.encode_cookie, cookies))))

    def assertSameCookies(self, c1, c2, msg=None):
        """Assert that two cookie collections are equal once normalised."""
        return self.assertEqual(
            *map(self.comparable_cookies, (c1, c2)),
            msg=msg)

    def assertSameCookieStrings(self, c1, c2, msg=None):
        """Assert that two cookie strings parse to the same set of cookies."""
        return self.assertSameCookies(
            *map(lambda c: compat_http_cookies_SimpleCookie(c).values(), (c1, c2)),
            msg=msg)

    def test_header_cookies(self):
        ydl = FakeYDL()
        # silence warnings for most of the test; re-routed to errors near the end
        ydl.report_warning = lambda *_, **__: None

        def cookie(name, value, version=None, domain='', path='', secure=False, expires=None):
            # positional layout follows the cookiejar Cookie constructor
            return compat_http_cookiejar_Cookie(
                version or 0, name, value, None, False,
                domain, bool(domain), bool(domain), path, bool(path),
                secure, expires, False, None, None, rest={})

        test_url, test_domain = (t % ('yt.dl',) for t in ('https://%s/test', '.%s'))

        def test(encoded_cookies, cookies, headers=False, round_trip=None, error_re=None):
            def _test():
                ydl.cookiejar.clear()
                ydl._load_cookies(encoded_cookies, autoscope=headers)
                if headers:
                    ydl._apply_header_cookies(test_url)
                data = {'url': test_url}
                ydl._calc_headers(data)
                self.assertSameCookies(
                    cookies, ydl.cookiejar,
                    'Extracted cookiejar.Cookie is not the same')
                if not headers:
                    self.assertSameCookieStrings(
                        data.get('cookies'), round_trip or encoded_cookies,
                        msg='Cookie is not the same as round trip')
                # reset the name-mangled private attribute between cases
                ydl.__dict__['_YoutubeDL__header_cookies'] = []

            try:
                _test()
            except AssertionError:
                # genuine test failures must always propagate
                raise
            except Exception as e:
                if not error_re:
                    raise
                assertRegexpMatches(self, e.args[0], error_re.join(('.*',) * 2))

        test('test=value; Domain=' + test_domain, [cookie('test', 'value', domain=test_domain)])
        test('test=value', [cookie('test', 'value')], error_re='Unscoped cookies are not allowed')
        test('cookie1=value1; Domain={0}; Path=/test; cookie2=value2; Domain={0}; Path=/'.format(test_domain), [
            cookie('cookie1', 'value1', domain=test_domain, path='/test'),
            cookie('cookie2', 'value2', domain=test_domain, path='/')])
        cookie_kw = compat_kwargs(
            {'domain': test_domain, 'path': '/test', 'secure': True, 'expires': '9999999999', })
        test('test=value; Domain={domain}; Path={path}; Secure; Expires={expires}'.format(**cookie_kw), [
            cookie('test', 'value', **cookie_kw)])
        test('test="value; "; path=/test; domain=' + test_domain, [
            cookie('test', 'value; ', domain=test_domain, path='/test')],
            round_trip='test="value\\073 "; Domain={0}; Path=/test'.format(test_domain))
        test('name=; Domain=' + test_domain, [cookie('name', '', domain=test_domain)],
             round_trip='name=""; Domain=' + test_domain)
        test('test=value', [cookie('test', 'value', domain=test_domain)], headers=True)
        test('cookie1=value; Domain={0}; cookie2=value'.format(test_domain), [],
             headers=True, error_re='Invalid syntax')
        ydl.report_warning = ydl.report_error
        test('test=value', [], headers=True, error_re='Passing cookies as a header is a potential security risk')

    def test_infojson_cookies(self):
        TEST_FILE = 'test_infojson_cookies.info.json'
        TEST_URL = 'https://example.com/example.mp4'
        COOKIES = 'a=b; Domain=.example.com; c=d; Domain=.example.com'
        COOKIE_HEADER = {'Cookie': 'a=b; c=d'}

        ydl = FakeYDL()
        # instead of downloading, write the processed info dict to TEST_FILE
        ydl.process_info = lambda x: ydl._write_info_json('test', x, TEST_FILE)

        def make_info(info_header_cookies=False, fmts_header_cookies=False, cookies_field=False):
            fmt = {'url': TEST_URL}
            if fmts_header_cookies:
                fmt['http_headers'] = COOKIE_HEADER
            if cookies_field:
                fmt['cookies'] = COOKIES
            return _make_result([fmt], http_headers=COOKIE_HEADER if info_header_cookies else None)

        def test(initial_info, note):

            def failure_msg(why):
                return ' when '.join((why, note))

            result = {}
            result['processed'] = ydl.process_ie_result(initial_info)
            self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL),
                            msg=failure_msg('No cookies set in cookiejar after initial process'))
            ydl.cookiejar.clear()
            with open(TEST_FILE) as infojson:
                result['loaded'] = ydl.sanitize_info(json.load(infojson), True)
            result['final'] = ydl.process_ie_result(result['loaded'].copy(), download=False)
            self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL),
                            msg=failure_msg('No cookies set in cookiejar after final process'))
            ydl.cookiejar.clear()
            for key in ('processed', 'loaded', 'final'):
                info = result[key]
                self.assertIsNone(
                    traverse_obj(info, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False),
                    msg=failure_msg('Cookie header not removed in {0} result'.format(key)))
                self.assertSameCookieStrings(
                    traverse_obj(info, ((None, ('formats', 0)), 'cookies'), get_all=False), COOKIES,
                    msg=failure_msg('No cookies field found in {0} result'.format(key)))

        try:
            test({'url': TEST_URL, 'http_headers': COOKIE_HEADER, 'id': '1', 'title': 'x'}, 'no formats field')
            test(make_info(info_header_cookies=True), 'info_dict header cookies')
            test(make_info(fmts_header_cookies=True), 'format header cookies')
            test(make_info(info_header_cookies=True, fmts_header_cookies=True), 'info_dict and format header cookies')
            test(make_info(info_header_cookies=True, fmts_header_cookies=True, cookies_field=True), 'all cookies fields')
            test(make_info(cookies_field=True), 'cookies format field')
            test({'url': TEST_URL, 'cookies': COOKIES, 'id': '1', 'title': 'x'}, 'info_dict cookies field only')
        finally:
            # remove the generated info.json even when an assertion above fails
            try_rm(TEST_FILE)

    def test_add_headers_cookie(self):
        def check_for_cookie_header(result):
            return traverse_obj(result, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False)

        ydl = FakeYDL({'http_headers': {'Cookie': 'a=b'}})
        ydl._apply_header_cookies(_make_result([])['webpage_url'])  # Scope to input webpage URL: .example.com

        fmt = {'url': 'https://example.com/video.mp4'}
        result = ydl.process_ie_result(_make_result([fmt]), download=False)
        self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies in result info_dict')
        self.assertEqual(result.get('cookies'), 'a=b; Domain=.example.com', msg='No cookies were set in cookies field')
        self.assertIn('a=b', ydl.cookiejar.get_cookie_header(fmt['url']), msg='No cookies were set in cookiejar')

        fmt = {'url': 'https://wrong.com/video.mp4'}
        result = ydl.process_ie_result(_make_result([fmt]), download=False)
        self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies for wrong domain')
        self.assertFalse(result.get('cookies'), msg='Cookies set in cookies field for wrong domain')
        self.assertFalse(ydl.cookiejar.get_cookie_header(fmt['url']), msg='Cookies set in cookiejar for wrong domain')
if __name__ == '__main__':
unittest.main()

View file

@ -46,6 +46,20 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
# will be ignored
self.assertFalse(cookiejar._cookies)
def test_get_cookie_header(self):
    """The Cookie header built for the test URL names the HTTPONLY_COOKIE entry."""
    jar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)
    self.assertIn(
        'HTTPONLY_COOKIE',
        jar.get_cookie_header('https://www.foobar.foobar'))
def test_get_cookies_for_url(self):
    """Two cookies match the www host; none match the bare domain."""
    jar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)

    matching = jar.get_cookies_for_url('https://www.foobar.foobar/')
    self.assertEqual(len(matching), 2)

    non_matching = jar.get_cookies_for_url('https://foobar.foobar/')
    self.assertFalse(non_matching)
if __name__ == '__main__':
unittest.main()

View file

@ -11,6 +11,7 @@ from test.helper import try_rm
from youtube_dl import YoutubeDL
from youtube_dl.utils import DownloadError
def _download_restricted(url, filename, age):
@ -26,7 +27,10 @@ def _download_restricted(url, filename, age):
ydl.add_default_info_extractors()
json_filename = os.path.splitext(filename)[0] + '.info.json'
try_rm(json_filename)
ydl.download([url])
try:
ydl.download([url])
except DownloadError:
try_rm(json_filename)
res = os.path.exists(json_filename)
try_rm(json_filename)
return res
@ -38,12 +42,12 @@ class TestAgeRestriction(unittest.TestCase):
self.assertFalse(_download_restricted(url, filename, age))
def test_youtube(self):
self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10)
self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10)
def test_youporn(self):
self._assert_restricted(
'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'505835.mp4', 2, old_age=25)
'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/',
'16715086.mp4', 2, old_age=25)
if __name__ == '__main__':

View file

@ -23,6 +23,7 @@ from youtube_dl.compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_request,
)
@ -48,10 +49,11 @@ class TestCompat(unittest.TestCase):
def test_all_present(self):
import youtube_dl.compat
all_names = youtube_dl.compat.__all__
present_names = set(filter(
all_names = sorted(
youtube_dl.compat.__all__ + youtube_dl.compat.legacy)
present_names = set(map(compat_str, filter(
lambda c: '_' in c and not c.startswith('_'),
dir(youtube_dl.compat))) - set(['unicode_literals'])
dir(youtube_dl.compat)))) - set(['unicode_literals'])
self.assertEqual(all_names, sorted(present_names))
def test_compat_urllib_parse_unquote(self):
@ -134,6 +136,19 @@ class TestCompat(unittest.TestCase):
self.assertEqual(compat_casefold('\u03a3'), '\u03c3')
self.assertEqual(compat_casefold('A\u0345\u03a3'), 'a\u03b9\u03c3')
def test_compat_urllib_request_Request(self):
    """Request honours both a method= argument and a get_method() override."""
    url = 'http://127.0.0.1'

    # method passed as a constructor keyword
    with_keyword = compat_urllib_request.Request(url, method='PUT')
    self.assertEqual(with_keyword.get_method(), 'PUT')

    # method supplied by a subclass override
    class PUTrequest(compat_urllib_request.Request):
        def get_method(self):
            return 'PUT'

    overridden = PUTrequest(url)
    self.assertEqual(overridden.get_method(), 'PUT')
if __name__ == '__main__':
unittest.main()

View file

@ -20,7 +20,6 @@ from test.helper import (
import hashlib
import io
import json
import socket
import re
@ -32,6 +31,8 @@ from youtube_dl.compat import (
compat_map as map,
compat_urllib_error,
compat_HTTPError,
compat_open as open,
compat_urllib_error,
)
from youtube_dl.utils import (
DownloadError,
@ -172,6 +173,7 @@ def generator(test_case, tname):
try_rm(tc_filename)
try_rm(tc_filename + '.part')
try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
try_rm_tcs_files()
try:
try_num = 1
@ -237,7 +239,15 @@ def generator(test_case, tname):
# First, check test cases' data against extracted data alone
expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
# Now, check downloaded file consistency
# support test-case with volatile ID, signalled by regexp value
if tc.get('info_dict', {}).get('id', '').startswith('re:'):
test_id = tc['info_dict']['id']
tc['info_dict']['id'] = tc_res_dict['id']
else:
test_id = None
tc_filename = get_tc_filename(tc)
if test_id:
tc['info_dict']['id'] = test_id
if not test_case.get('params', {}).get('skip_download', False):
self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
self.assertTrue(tc_filename in finished_hook_called)
@ -260,7 +270,7 @@ def generator(test_case, tname):
self.assertTrue(
os.path.exists(info_json_fn),
'Missing info file %s' % info_json_fn)
with io.open(info_json_fn, encoding='utf-8') as infof:
with open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
expect_info_dict(self, info_dict, tc.get('info_dict', {}))
finally:

View file

@ -0,0 +1,258 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
# Allow direct execution
import os
import re
import sys
import subprocess
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
FakeLogger,
FakeYDL,
http_server_port,
try_rm,
)
from youtube_dl import YoutubeDL
from youtube_dl.compat import (
compat_http_cookiejar_Cookie,
compat_http_server,
compat_kwargs,
)
from youtube_dl.utils import (
encodeFilename,
join_nonempty,
)
from youtube_dl.downloader.external import (
Aria2cFD,
Aria2pFD,
AxelFD,
CurlFD,
FFmpegFD,
HttpieFD,
WgetFD,
)
import threading
# Number of bytes served by HTTPTestRequestHandler for a full response, and
# the file size the download test expects on disk
TEST_SIZE = 10 * 1024
# Keyword arguments used to build the test cookie via cookiejar_Cookie(**TEST_COOKIE)
# NOTE(review): 'domain' starts with a dot while 'domain_initial_dot' is False - confirm this is intended
TEST_COOKIE = {
'version': 0,
'name': 'test',
'value': 'ytdlp',
'port': None,
'port_specified': False,
'domain': '.example.com',
'domain_specified': True,
'domain_initial_dot': False,
'path': '/',
'path_specified': True,
'secure': False,
'expires': None,
'discard': False,
'comment': None,
'comment_url': None,
'rest': {},
}
# 'name' and 'value' of TEST_COOKIE joined with '=' (presumably 'test=ytdlp' - confirm join_nonempty semantics)
TEST_COOKIE_VALUE = join_nonempty('name', 'value', delim='=', from_dict=TEST_COOKIE)
# Minimal info dict handed to each downloader's _make_cmd() / _call_downloader()
TEST_INFO = {'url': 'http://www.example.com/'}
def cookiejar_Cookie(**cookie_args):
    """Create a cookiejar Cookie from keyword arguments passed through compat_kwargs."""
    prepared = compat_kwargs(cookie_args)
    return compat_http_cookiejar_Cookie(**prepared)
def ifExternalFDAvailable(externalFD):
    """Return a unittest skip decorator that applies unless `externalFD` reports itself available."""
    usable = externalFD.available()
    reason = externalFD.get_basename() + ' not found'
    return unittest.skipUnless(usable, reason)
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
    """Request handler that serves TEST_SIZE bytes of '#' on a few fixed paths."""

    def log_message(self, format, *args):
        # deliberately discard request logging
        pass

    def send_content_range(self, total=None):
        """Emit a Content-Range header when the request has a `bytes=N-M` Range.

        Returns the number of bytes in the requested range, or `total` when
        there is no Range header or it does not match that form.
        """
        requested = self.headers.get('Range')
        bounds = re.match(r'bytes=(\d+)-(\d+)', requested) if requested else None
        if not bounds:
            return total

        first, last = int(bounds.group(1)), int(bounds.group(2))
        header_value = 'bytes %d-%d' % (first, last)
        if total:
            header_value += '/%d' % total
        self.send_header('Content-Range', header_value)
        return last - first + 1

    def serve(self, range=True, content_length=True):
        """Send a 200 video/mp4 response, optionally with range and length headers."""
        self.send_response(200)
        self.send_header('Content-Type', 'video/mp4')
        body_size = self.send_content_range(TEST_SIZE) if range else TEST_SIZE
        if content_length:
            self.send_header('Content-Length', body_size)
        self.end_headers()
        self.wfile.write(b'#' * body_size)

    def do_GET(self):
        """Dispatch on the request path to the matching serve() variant."""
        # path -> (range, content_length) arguments for serve()
        routes = {
            '/regular': (True, True),
            '/no-content-length': (True, False),
            '/no-range': (False, True),
            '/no-range-no-content-length': (False, False),
        }
        serve_args = routes.get(self.path)
        if serve_args is None:
            assert False, 'unrecognised server path'
        else:
            self.serve(*serve_args)
@ifExternalFDAvailable(Aria2pFD)
class TestAria2pFD(unittest.TestCase):
    """Download tests for the aria2p downloader against a local HTTP server."""

    def setUp(self):
        # serve HTTPTestRequestHandler on an ephemeral local port in a daemon thread
        self.httpd = compat_http_server.HTTPServer(
            ('127.0.0.1', 0), HTTPTestRequestHandler)
        self.port = http_server_port(self.httpd)
        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()

    def download(self, params, ep):
        """Download endpoint `ep` with an `aria2c --enable-rpc` process running.

        `params` are YoutubeDL options; they are copied so that the caller's
        dict is not modified. The download is only attempted while the
        aria2c process is still running.
        """
        params = dict(params)
        with subprocess.Popen(
            ['aria2c', '--enable-rpc'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        ) as process:
            try:
                # poll() returns None while the process runs; an exit status
                # of 0 is falsy too, so it must not be tested with `not`
                if process.poll() is None:
                    filename = 'testfile.mp4'
                    params['logger'] = FakeLogger()
                    params['outtmpl'] = filename
                    ydl = YoutubeDL(params)
                    try_rm(encodeFilename(filename))
                    self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0)
                    self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE)
                    try_rm(encodeFilename(filename))
            finally:
                # Popen.__exit__ waits for the child: kill it even when an
                # assertion fails, or the test would hang on the RPC daemon
                process.kill()

    def download_all(self, params):
        """Run download() for every endpoint the test server knows."""
        for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'):
            self.download(params, ep)

    def test_regular(self):
        self.download_all({'external_downloader': 'aria2p'})

    def test_chunked(self):
        self.download_all({
            'external_downloader': 'aria2p',
            'http_chunk_size': 1000,
        })
@ifExternalFDAvailable(HttpieFD)
class TestHttpieFD(unittest.TestCase):
    """Checks the command line built by HttpieFD, without and with a cookie."""

    def test_make_cmd(self):
        plain_cmd = ['http', '--download', '--output', 'test', 'http://www.example.com/']
        with FakeYDL() as ydl:
            fd = HttpieFD(ydl, {})
            self.assertEqual(fd._make_cmd('test', TEST_INFO), plain_cmd)

            # Test cookie header is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            self.assertEqual(
                fd._make_cmd('test', TEST_INFO),
                plain_cmd + ['Cookie:' + TEST_COOKIE_VALUE])
@ifExternalFDAvailable(AxelFD)
class TestAxelFD(unittest.TestCase):
    """Checks the command line built by AxelFD, without and with a cookie."""

    def test_make_cmd(self):
        prefix = ['axel', '-o', 'test']
        suffix = ['--', 'http://www.example.com/']
        with FakeYDL() as ydl:
            fd = AxelFD(ydl, {})
            self.assertEqual(fd._make_cmd('test', TEST_INFO), prefix + suffix)

            # Test cookie header is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            cookie_args = ['-H', 'Cookie: ' + TEST_COOKIE_VALUE, '--max-redirect=0']
            self.assertEqual(
                fd._make_cmd('test', TEST_INFO),
                prefix + cookie_args + suffix)
@ifExternalFDAvailable(WgetFD)
class TestWgetFD(unittest.TestCase):
    """Checks that WgetFD adds --load-cookies only once a cookie is set."""

    def test_make_cmd(self):
        with FakeYDL() as ydl:
            fd = WgetFD(ydl, {})
            cmd_without_cookie = fd._make_cmd('test', TEST_INFO)
            self.assertNotIn('--load-cookies', cmd_without_cookie)

            # Test cookiejar tempfile arg is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            cmd_with_cookie = fd._make_cmd('test', TEST_INFO)
            self.assertIn('--load-cookies', cmd_with_cookie)
@ifExternalFDAvailable(CurlFD)
class TestCurlFD(unittest.TestCase):
    """Checks that CurlFD passes --cookie and the cookie value once a cookie is set."""

    def test_make_cmd(self):
        with FakeYDL() as ydl:
            fd = CurlFD(ydl, {})
            cmd_without_cookie = fd._make_cmd('test', TEST_INFO)
            self.assertNotIn('--cookie', cmd_without_cookie)

            # Test cookie header is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            # the command is rebuilt for each assertion, as in the original test
            for expected_arg in ('--cookie', TEST_COOKIE_VALUE):
                self.assertIn(expected_arg, fd._make_cmd('test', TEST_INFO))
@ifExternalFDAvailable(Aria2cFD)
class TestAria2cFD(unittest.TestCase):
    """Checks that Aria2cFD passes a cookie file argument only once a cookie is set."""

    def test_make_cmd(self):
        with FakeYDL() as ydl:
            fd = Aria2cFD(ydl, {})
            fd._make_cmd('test', TEST_INFO)
            # no cookie yet, so no temp file attribute has been set
            self.assertFalse(hasattr(fd, '_cookies_tempfile'))

            # Test cookiejar tempfile arg is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            built = fd._make_cmd('test', TEST_INFO)
            expected_arg = '--load-cookies=%s' % fd._cookies_tempfile
            self.assertIn(expected_arg, built)
@ifExternalFDAvailable(FFmpegFD)
class TestFFmpegFD(unittest.TestCase):
    """Checks the argument list FFmpegFD reports through its _debug_cmd hook."""

    # last argument list captured by _test_cmd
    _args = []

    def _test_cmd(self, args):
        # installed as downloader._debug_cmd to record the reported arguments
        self._args = args

    def test_make_cmd(self):
        source = ['-i', 'http://www.example.com/']
        output = ['-c', 'copy', '-f', 'mp4', 'file:test']
        with FakeYDL() as ydl:
            fd = FFmpegFD(ydl, {})
            fd._debug_cmd = self._test_cmd

            info = TEST_INFO.copy()
            info['ext'] = 'mp4'

            fd._call_downloader('test', info)
            self.assertEqual(self._args, ['ffmpeg', '-y'] + source + output)

            # Test cookies arg is added
            ydl.cookiejar.set_cookie(cookiejar_Cookie(**TEST_COOKIE))
            fd._call_downloader('test', info)
            cookie_args = ['-cookies', TEST_COOKIE_VALUE + '; path=/; domain=.example.com;\r\n']
            self.assertEqual(
                self._args,
                ['ffmpeg', '-y'] + cookie_args + source + output)
if __name__ == '__main__':
unittest.main()

View file

@ -9,7 +9,11 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import http_server_port, try_rm
from test.helper import (
FakeLogger,
http_server_port,
try_rm,
)
from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_http_server
from youtube_dl.downloader.http import HttpFD
@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
assert False
class FakeLogger(object):
def debug(self, msg):
pass
def warning(self, msg):
pass
def error(self, msg):
pass
class TestHttpFD(unittest.TestCase):
def setUp(self):
self.httpd = compat_http_server.HTTPServer(
@ -95,7 +88,7 @@ class TestHttpFD(unittest.TestCase):
self.assertTrue(downloader.real_download(filename, {
'url': 'http://127.0.0.1:%d/%s' % (self.port, ep),
}))
self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE)
self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE, ep)
try_rm(encodeFilename(filename))
def download_all(self, params):

View file

@ -8,46 +8,54 @@ import unittest
import sys
import os
import subprocess
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.utils import encodeArgument
rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, rootDir)
try:
_DEV_NULL = subprocess.DEVNULL
except AttributeError:
_DEV_NULL = open(os.devnull, 'wb')
from youtube_dl.compat import compat_register_utf8, compat_subprocess_get_DEVNULL
from youtube_dl.utils import encodeArgument
compat_register_utf8()
_DEV_NULL = compat_subprocess_get_DEVNULL()
class TestExecution(unittest.TestCase):
def setUp(self):
self.module = 'youtube_dl'
if sys.version_info < (2, 7):
self.module += '.__main__'
def test_import(self):
subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir)
def test_module_exec(self):
if sys.version_info >= (2, 7): # Python 2.6 doesn't support package execution
subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, '-m', self.module, '--version'], cwd=rootDir, stdout=_DEV_NULL)
def test_main_exec(self):
subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, os.path.normpath('youtube_dl/__main__.py'), '--version'], cwd=rootDir, stdout=_DEV_NULL)
def test_cmdline_umlauts(self):
os.environ['PYTHONIOENCODING'] = 'utf-8'
p = subprocess.Popen(
[sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'],
[sys.executable, '-m', self.module, encodeArgument('ä'), '--version'],
cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE)
_, stderr = p.communicate()
self.assertFalse(stderr)
def test_lazy_extractors(self):
lazy_extractors = os.path.normpath('youtube_dl/extractor/lazy_extractors.py')
try:
subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, os.path.normpath('devscripts/make_lazy_extractors.py'), lazy_extractors], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, os.path.normpath('test/test_all_urls.py')], cwd=rootDir, stdout=_DEV_NULL)
finally:
try:
os.remove('youtube_dl/extractor/lazy_extractors.py')
except (IOError, OSError):
pass
for x in ('', 'c') if sys.version_info[0] < 3 else ('',):
try:
os.remove(lazy_extractors + x)
except OSError:
pass
if __name__ == '__main__':

View file

@ -8,30 +8,163 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import http_server_port
from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_http_server, compat_urllib_request
import contextlib
import gzip
import io
import ssl
import tempfile
import threading
import zlib
# avoid deprecated alias assertRaisesRegexp
if hasattr(unittest.TestCase, 'assertRaisesRegex'):
unittest.TestCase.assertRaisesRegexp = unittest.TestCase.assertRaisesRegex
try:
import brotli
except ImportError:
brotli = None
try:
from urllib.request import pathname2url
except ImportError:
from urllib import pathname2url
from youtube_dl.compat import (
compat_http_cookiejar_Cookie,
compat_http_server,
compat_str as str,
compat_urllib_error,
compat_urllib_HTTPError,
compat_urllib_parse,
compat_urllib_request,
)
from youtube_dl.utils import (
sanitized_Request,
update_Request,
urlencode_postdata,
)
from test.helper import (
expectedFailureIf,
FakeYDL,
FakeLogger,
http_server_port,
)
from youtube_dl import YoutubeDL
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'
# work-around old/new -style class inheritance
# Calls the base-class method named `meth_name` with the given arguments.
# On first use it picks a dispatcher, stores it on the instance as
# `self.super` (shadowing this method), and then dispatches through it.
def super(self, meth_name, *args, **kwargs):
from types import MethodType
try:
# probe: if this raises TypeError, the except branch selects the fallback
super()
fn = lambda s, m, *a, **k: getattr(super(), m)(*a, **k)
except TypeError:
# fallback: look the method up on BaseHTTPRequestHandler directly and
# pass the instance explicitly
fn = lambda s, m, *a, **k: getattr(compat_http_server.BaseHTTPRequestHandler, m)(s, *a, **k)
# bind the chosen dispatcher to this instance so later calls skip the probe
self.super = MethodType(fn, self)
return self.super(meth_name, *args, **kwargs)
# Discard all log messages: the body is intentionally a no-op.
def log_message(self, format, *args):
pass
# Respond 200 with the string form of the received request headers
# (UTF-8 encoded) as the body, so a client can inspect what it sent.
# NOTE(review): the body is labelled application/json although it is just
# str(self.headers) - confirm that no caller parses it as JSON.
def _headers(self):
payload = str(self.headers).encode('utf-8')
self.send_response(200)
self.send_header('Content-Type', 'application/json')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)
# Respond with an empty-bodied redirect to /method. The status code is the
# integer that follows the '/redirect_' prefix in the request path; int()
# raises ValueError if that suffix is not numeric.
def _redirect(self):
self.send_response(int(self.path[len('/redirect_'):]))
self.send_header('Location', '/method')
self.send_header('Content-Length', '0')
self.end_headers()
# Respond 200, reporting `method` in a 'Method' response header and echoing
# `payload` (if any) as the response body.
def _method(self, method, payload=None):
self.send_response(200)
# `payload or ''` makes a missing or empty payload report length 0
self.send_header('Content-Length', str(len(payload or '')))
self.send_header('Method', method)
self.end_headers()
if payload:
self.wfile.write(payload)
# Respond with the given status code and a small HTML body.
# The body text always reads '<status> NOT FOUND', whatever the code is.
def _status(self, status):
payload = '<html>{0} NOT FOUND</html>'.format(status).encode('utf-8')
self.send_response(int(status))
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)
# Read and return the request body, using the Content-Length request header
# for the number of bytes. Returns None (implicitly) when the header is absent.
def _read_data(self):
if 'Content-Length' in self.headers:
return self.rfile.read(int(self.headers['Content-Length']))
# Build '<scheme>://<host>:<port>/<path>'. When `port` is None, the port is
# obtained from http_server_port(self.server).
def _test_url(self, path, host='127.0.0.1', scheme='http', port=None):
return '{0}://{1}:{2}/{3}'.format(
scheme, host,
port if port is not None
else http_server_port(self.server), path)
# Handle POST: dispatch on the path prefix to the redirect, method-echo or
# header-echo responder; anything else gets a 404.
def do_POST(self):
# read the body first, whichever branch is taken below
data = self._read_data()
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('POST', data)
elif self.path.startswith('/headers'):
self._headers()
else:
self._status(404)
# Handle HEAD: supports the '/redirect_*' and '/method' paths only;
# anything else gets a 404.
def do_HEAD(self):
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('HEAD')
else:
self._status(404)
# Handle PUT: like do_POST but without the '/headers' endpoint. The request
# body is echoed back by the '/method' path.
def do_PUT(self):
data = self._read_data()
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('PUT', data)
else:
self._status(404)
def do_GET(self):
def respond(payload=b'<html><video src="/vid.mp4" /></html>',
payload_type='text/html; charset=utf-8',
payload_encoding=None,
resp_code=200):
self.send_response(resp_code)
self.send_header('Content-Type', payload_type)
if payload_encoding:
self.send_header('Content-Encoding', payload_encoding)
self.send_header('Content-Length', str(len(payload))) # required for persistent connections
self.end_headers()
self.wfile.write(payload)
def gzip_compress(p):
buf = io.BytesIO()
with contextlib.closing(gzip.GzipFile(fileobj=buf, mode='wb')) as f:
f.write(p)
return buf.getvalue()
if self.path == '/video.html':
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.end_headers()
self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
respond()
elif self.path == '/vid.mp4':
self.send_response(200)
self.send_header('Content-Type', 'video/mp4')
self.end_headers()
self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]')
respond(b'\x00\x00\x00\x00\x20\x66\x74[video]', 'video/mp4')
elif self.path == '/302':
if sys.version_info[0] == 3:
# XXX: Python 3 http server does not allow non-ASCII header values
@ -39,71 +172,336 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
self.end_headers()
return
new_url = 'http://127.0.0.1:%d/中文.html' % http_server_port(self.server)
new_url = self._test_url('中文.html')
self.send_response(302)
self.send_header(b'Location', new_url.encode('utf-8'))
self.end_headers()
elif self.path == '/%E4%B8%AD%E6%96%87.html':
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
respond()
elif self.path == '/%c7%9f':
respond()
elif self.path == '/redirect_dotsegments':
self.send_response(301)
# redirect to /headers but with dot segments before
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('GET')
elif self.path.startswith('/headers'):
self._headers()
elif self.path.startswith('/308-to-headers'):
self.send_response(308)
self.send_header('Location', '/headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path == '/trailing_garbage':
payload = b'<html><video src="/vid.mp4" /></html>'
compressed = gzip_compress(payload) + b'trailing garbage'
respond(compressed, payload_encoding='gzip')
elif self.path == '/302-non-ascii-redirect':
new_url = self._test_url('中文.html')
# actually respond with permanent redirect
self.send_response(301)
self.send_header('Location', new_url)
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path == '/content-encoding':
encodings = self.headers.get('ytdl-encoding', '')
payload = b'<html><video src="/vid.mp4" /></html>'
for encoding in filter(None, (e.strip() for e in encodings.split(','))):
if encoding == 'br' and brotli:
payload = brotli.compress(payload)
elif encoding == 'gzip':
payload = gzip_compress(payload)
elif encoding == 'deflate':
payload = zlib.compress(payload)
elif encoding == 'unsupported':
payload = b'raw'
break
else:
self._status(415)
return
respond(payload, payload_encoding=encodings)
else:
assert False
self._status(404)
def send_header(self, keyword, value):
"""
Forcibly allow HTTP server to send non percent-encoded non-ASCII characters in headers.
This is against what is defined in RFC 3986: but we need to test that we support this
since some sites incorrectly do this.
"""
if keyword.lower() == 'connection':
return self.super('send_header', keyword, value)
class FakeLogger(object):
def debug(self, msg):
pass
if not hasattr(self, '_headers_buffer'):
self._headers_buffer = []
def warning(self, msg):
pass
self._headers_buffer.append('{0}: {1}\r\n'.format(keyword, value).encode('utf-8'))
def error(self, msg):
pass
# Write any header lines collected in self._headers_buffer straight to the
# socket file as raw bytes, clear the buffer, then let the base class finish
# the header section via self.super('end_headers').
def end_headers(self):
# the attribute only exists once a header has been buffered
if hasattr(self, '_headers_buffer'):
self.wfile.write(b''.join(self._headers_buffer))
self._headers_buffer = []
self.super('end_headers')
class TestHTTP(unittest.TestCase):
# when does it make sense to check the SSL certificate?
_check_cert = (
sys.version_info >= (3, 2)
or (sys.version_info[0] == 2 and sys.version_info[1:] >= (7, 19)))
def setUp(self):
self.httpd = compat_http_server.HTTPServer(
# HTTP server
self.http_httpd = compat_http_server.HTTPServer(
('127.0.0.1', 0), HTTPTestRequestHandler)
self.port = http_server_port(self.httpd)
self.server_thread = threading.Thread(target=self.httpd.serve_forever)
self.server_thread.daemon = True
self.server_thread.start()
self.http_port = http_server_port(self.http_httpd)
self.http_server_thread = threading.Thread(target=self.http_httpd.serve_forever)
self.http_server_thread.daemon = True
self.http_server_thread.start()
try:
from http.server import ThreadingHTTPServer
except ImportError:
try:
from socketserver import ThreadingMixIn
except ImportError:
from SocketServer import ThreadingMixIn
class ThreadingHTTPServer(ThreadingMixIn, compat_http_server.HTTPServer):
pass
# HTTPS server
certfn = os.path.join(TEST_DIR, 'testcert.pem')
self.https_httpd = ThreadingHTTPServer(
('127.0.0.1', 0), HTTPTestRequestHandler)
try:
sslctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
sslctx.verify_mode = ssl.CERT_NONE
sslctx.check_hostname = False
sslctx.load_cert_chain(certfn, None)
self.https_httpd.socket = sslctx.wrap_socket(
self.https_httpd.socket, server_side=True)
except AttributeError:
self.https_httpd.socket = ssl.wrap_socket(
self.https_httpd.socket, certfile=certfn, server_side=True)
self.https_port = http_server_port(self.https_httpd)
self.https_server_thread = threading.Thread(target=self.https_httpd.serve_forever)
self.https_server_thread.daemon = True
self.https_server_thread.start()
# Stop both test servers. For each server, shutdown() and server_close() are
# run in a helper thread, and the serving thread is then joined with a
# 2.0 second timeout, so this method does not wait indefinitely on it.
def tearDown(self):
# returns a zero-argument callable that shuts down and closes `svr`
def closer(svr):
def _closer():
svr.shutdown()
svr.server_close()
return _closer
# HTTP server
shutdown_thread = threading.Thread(target=closer(self.http_httpd))
shutdown_thread.start()
self.http_server_thread.join(2.0)
# HTTPS server
shutdown_thread = threading.Thread(target=closer(self.https_httpd))
shutdown_thread.start()
self.https_server_thread.join(2.0)
# Build '<scheme>://<host>:<port>/<path>' for one of the test servers.
# When `port` is None, self.https_port is used for scheme 'https' and
# self.http_port for any other scheme.
def _test_url(self, path, host='127.0.0.1', scheme='http', port=None):
return '{0}://{1}:{2}/{3}'.format(
scheme, host,
port if port is not None
else self.https_port if scheme == 'https'
else self.http_port, path)
# Skipped unless _check_cert is true for the running Python version.
# With default options, opening the HTTPS test URL must raise URLError;
# with 'nocheckcertificate' set, the same request must return status 200.
@unittest.skipUnless(_check_cert, 'No support for certificate check in SSL')
def test_nocheckcertificate(self):
with FakeYDL({'logger': FakeLogger()}) as ydl:
with self.assertRaises(compat_urllib_error.URLError):
ydl.urlopen(sanitized_Request(self._test_url('headers', scheme='https')))
with FakeYDL({'logger': FakeLogger(), 'nocheckcertificate': True}) as ydl:
r = ydl.urlopen(sanitized_Request(self._test_url('headers', scheme='https')))
self.assertEqual(r.getcode(), 200)
r.close()
# Requests a non-ASCII path and an already percent-encoded path; both must
# return 200. The test server answers only the exact paths
# '/%E4%B8%AD%E6%96%87.html' and '/%c7%9f', so any other encoding sent by the
# client would produce a 404.
def test_percent_encode(self):
with FakeYDL() as ydl:
# Unicode characters should be encoded with uppercase percent-encoding
res = ydl.urlopen(sanitized_Request(self._test_url('中文.html')))
self.assertEqual(res.getcode(), 200)
res.close()
# don't normalize existing percent encodings
res = ydl.urlopen(sanitized_Request(self._test_url('%c7%9f')))
self.assertEqual(res.getcode(), 200)
res.close()
def test_unicode_path_redirection(self):
# XXX: Python 3 http server does not allow non-ASCII header values
if sys.version_info[0] == 3:
return
with FakeYDL() as ydl:
r = ydl.urlopen(sanitized_Request(self._test_url('302-non-ascii-redirect')))
self.assertEqual(r.url, self._test_url('%E4%B8%AD%E6%96%87.html'))
r.close()
ydl = YoutubeDL({'logger': FakeLogger()})
r = ydl.extract_info('http://127.0.0.1:%d/302' % self.port)
self.assertEqual(r['entries'][0]['url'], 'http://127.0.0.1:%d/vid.mp4' % self.port)
def test_redirect(self):
with FakeYDL() as ydl:
def do_req(redirect_status, method, check_no_content=False):
data = b'testdata' if method in ('POST', 'PUT') else None
res = ydl.urlopen(sanitized_Request(
self._test_url('redirect_{0}'.format(redirect_status)),
method=method, data=data))
if check_no_content:
self.assertNotIn('Content-Type', res.headers)
return res.read().decode('utf-8'), res.headers.get('method', '')
# A 303 must either use GET or HEAD for subsequent request
self.assertEqual(do_req(303, 'POST'), ('', 'GET'))
self.assertEqual(do_req(303, 'HEAD'), ('', 'HEAD'))
self.assertEqual(do_req(303, 'PUT'), ('', 'GET'))
class TestHTTPS(unittest.TestCase):
def setUp(self):
certfn = os.path.join(TEST_DIR, 'testcert.pem')
self.httpd = compat_http_server.HTTPServer(
('127.0.0.1', 0), HTTPTestRequestHandler)
self.httpd.socket = ssl.wrap_socket(
self.httpd.socket, certfile=certfn, server_side=True)
self.port = http_server_port(self.httpd)
self.server_thread = threading.Thread(target=self.httpd.serve_forever)
self.server_thread.daemon = True
self.server_thread.start()
# 301 and 302 turn POST only into a GET, with no Content-Type
self.assertEqual(do_req(301, 'POST', True), ('', 'GET'))
self.assertEqual(do_req(301, 'HEAD'), ('', 'HEAD'))
self.assertEqual(do_req(302, 'POST', True), ('', 'GET'))
self.assertEqual(do_req(302, 'HEAD'), ('', 'HEAD'))
def test_nocheckcertificate(self):
if sys.version_info >= (2, 7, 9): # No certificate checking anyways
ydl = YoutubeDL({'logger': FakeLogger()})
self.assertRaises(
Exception,
ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(do_req(301, 'PUT'), ('testdata', 'PUT'))
self.assertEqual(do_req(302, 'PUT'), ('testdata', 'PUT'))
ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
# 307 and 308 should not change method
for m in ('POST', 'PUT'):
self.assertEqual(do_req(307, m), ('testdata', m))
self.assertEqual(do_req(308, m), ('testdata', m))
self.assertEqual(do_req(307, 'HEAD'), ('', 'HEAD'))
self.assertEqual(do_req(308, 'HEAD'), ('', 'HEAD'))
# These should not redirect and instead raise an HTTPError
for code in (300, 304, 305, 306):
with self.assertRaises(compat_urllib_HTTPError):
do_req(code, 'GET')
# Jython 2.7.1 times out for some reason
# Sends a request carrying URL-encoded form data to the header-echo endpoint,
# first over HTTPS (certificate checking disabled) and then over HTTP, and
# checks that the echoed request headers include the form Content-Type.
@expectedFailureIf(sys.platform.startswith('java') and sys.version_info < (2, 7, 2))
def test_content_type(self):
# https://github.com/yt-dlp/yt-dlp/commit/379a4f161d4ad3e40932dcf5aca6e6fb9715ab28
with FakeYDL({'nocheckcertificate': True}) as ydl:
# method should be auto-detected as POST
r = sanitized_Request(self._test_url('headers', scheme='https'), data=urlencode_postdata({'test': 'test'}))
headers = ydl.urlopen(r).read().decode('utf-8')
self.assertIn('Content-Type: application/x-www-form-urlencoded', headers)
# test http
r = sanitized_Request(self._test_url('headers'), data=urlencode_postdata({'test': 'test'}))
headers = ydl.urlopen(r).read().decode('utf-8')
self.assertIn('Content-Type: application/x-www-form-urlencoded', headers)
def test_update_req(self):
    """Check that update_Request() with an empty payload yields a POST.

    A fresh request has no data, uses GET and has no Content-Type header.
    After update_Request(req, data=b'') the data is the empty byte string
    and the method becomes POST.

    unittest assertions are used instead of bare ``assert`` statements so
    that the checks are not stripped under ``python -O`` and failures report
    the compared values.
    """
    req = sanitized_Request('http://example.com')
    # assertTrue(... is None) rather than assertIsNone, for Python 2.6
    self.assertTrue(req.data is None)
    self.assertEqual(req.get_method(), 'GET')
    self.assertFalse(req.has_header('Content-Type'))
    # Test that zero-byte payloads will be sent
    req = update_Request(req, data=b'')
    self.assertEqual(req.data, b'')
    self.assertEqual(req.get_method(), 'POST')
    # yt-dl expects data to be encoded and Content-Type to be added by sender
    # assert req.get_header('Content-Type') == 'application/x-www-form-urlencoded'
# Puts a cookie named 'test' with value 'ytdl' into the cookiejar and checks
# that a request to the header-echo endpoint carries 'Cookie: test=ytdl'.
def test_cookiejar(self):
with FakeYDL() as ydl:
# positional arguments presumably follow http.cookiejar.Cookie
# (version, name, value, port, ...) - confirm against the compat alias
ydl.cookiejar.set_cookie(compat_http_cookiejar_Cookie(
0, 'test', 'ytdl', None, False, '127.0.0.1', True,
False, '/headers', True, False, None, False, None, None, {}))
data = ydl.urlopen(sanitized_Request(
self._test_url('headers'))).read().decode('utf-8')
self.assertIn('Cookie: test=ytdl', data)
# Exercises an explicit 'Cookie' request header in three situations:
# sent directly, sent to an endpoint that answers with a 308 redirect to
# /headers, and sent while the cookiejar holds a conflicting cookie.
def test_passed_cookie_header(self):
# We should accept a Cookie header being passed as in normal headers and handle it appropriately.
with FakeYDL() as ydl:
# Specified Cookie header should be used
res = ydl.urlopen(sanitized_Request(
self._test_url('headers'), headers={'Cookie': 'test=test'})).read().decode('utf-8')
self.assertIn('Cookie: test=test', res)
# Specified Cookie header should be removed on any redirect
res = ydl.urlopen(sanitized_Request(
self._test_url('308-to-headers'), headers={'Cookie': 'test=test'})).read().decode('utf-8')
self.assertNotIn('Cookie: test=test', res)
# Specified Cookie header should override global cookiejar for that request
ydl.cookiejar.set_cookie(compat_http_cookiejar_Cookie(
0, 'test', 'ytdlp', None, False, '127.0.0.1', True,
False, '/headers', True, False, None, False, None, None, {}))
data = ydl.urlopen(sanitized_Request(
self._test_url('headers'), headers={'Cookie': 'test=test'})).read().decode('utf-8')
self.assertNotIn('Cookie: test=ytdlp', data)
self.assertIn('Cookie: test=test', data)
# Sends the 'Youtubedl-no-compression' request header and checks, from the
# echoed request headers, that the server saw 'Accept-Encoding: identity'
# and did not see the 'Youtubedl-no-compression' header itself.
def test_no_compression_compat_header(self):
with FakeYDL() as ydl:
data = ydl.urlopen(
sanitized_Request(
self._test_url('headers'),
headers={'Youtubedl-no-compression': True})).read()
self.assertIn(b'Accept-Encoding: identity', data)
# compare case-insensitively by lowering the echoed header block
self.assertNotIn(b'youtubedl-no-compression', data.lower())
# The '/trailing_garbage' endpoint serves a gzip stream followed by the extra
# bytes b'trailing garbage'; the decoded response must equal the original
# HTML payload.
def test_gzip_trailing_garbage(self):
# https://github.com/ytdl-org/youtube-dl/commit/aa3e950764337ef9800c936f4de89b31c00dfcf5
# https://github.com/ytdl-org/youtube-dl/commit/6f2ec15cee79d35dba065677cad9da7491ec6e6f
with FakeYDL() as ydl:
data = ydl.urlopen(sanitized_Request(self._test_url('trailing_garbage'))).read().decode('utf-8')
self.assertEqual(data, '<html><video src="/vid.mp4" /></html>')
# Helper for the compression tests: requests '/content-encoding' with
# `encoding` in the 'ytdl-encoding' request header (the test server applies
# the listed encodings to its payload) and checks that the body read back is
# the uncompressed HTML.
# The double-underscore name is mangled by Python within the class, so a
# test runner does not pick this helper up as a test.
def __test_compression(self, encoding):
with FakeYDL() as ydl:
res = ydl.urlopen(
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': encoding}))
# decoded encodings are removed: only check for valid decompressed data
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
# Brotli ('br') round trip; skipped when the optional brotli module could
# not be imported (the module-level name `brotli` is then None).
@unittest.skipUnless(brotli, 'brotli support is not installed')
def test_brotli(self):
self.__test_compression('br')
# 'deflate' round trip; the test server compresses with zlib.compress().
def test_deflate(self):
self.__test_compression('deflate')
# 'gzip' round trip; the test server compresses with gzip.GzipFile.
def test_gzip(self):
self.__test_compression('gzip')
# Stacked content encodings: each comma-separated list is applied in order
# by the test server and must be fully undone by the client.
def test_multiple_encodings(self):
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
self.__test_compression(pair)
# For the encoding name 'unsupported' the test server sends the body b'raw'
# with 'Content-Encoding: unsupported'; the client must leave both the
# header and the body untouched.
def test_unsupported_encoding(self):
# it should return the raw content
with FakeYDL() as ydl:
res = ydl.urlopen(
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': 'unsupported'}))
self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported')
self.assertEqual(res.read(), b'raw')
# Dot segments ('.' and '..') must be resolved so that the final URL path is
# '/headers', both for a directly requested URL and for a redirect whose
# Location header is '/a/b/./../../headers'.
def test_remove_dot_segments(self):
with FakeYDL() as ydl:
# dot segments in the requested URL itself
res = ydl.urlopen(sanitized_Request(self._test_url('a/b/./../../headers')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')
# dot segments in the redirect target sent by the server
res = ydl.urlopen(sanitized_Request(self._test_url('redirect_dotsegments')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')
def _build_proxy_handler(name):
@ -117,7 +515,7 @@ def _build_proxy_handler(name):
self.send_response(200)
self.send_header('Content-Type', 'text/plain; charset=utf-8')
self.end_headers()
self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8'))
self.wfile.write('{0}: {1}'.format(self.proxy_name, self.path).encode('utf-8'))
return HTTPTestRequestHandler
@ -137,10 +535,30 @@ class TestProxy(unittest.TestCase):
self.geo_proxy_thread.daemon = True
self.geo_proxy_thread.start()
# Stop both proxy servers. For each one, shutdown() and server_close() are
# run in a helper thread, and the serving thread is then joined with a
# 2.0 second timeout.
def tearDown(self):
# returns a zero-argument callable that shuts down and closes `svr`
def closer(svr):
def _closer():
svr.shutdown()
svr.server_close()
return _closer
# normal proxy
shutdown_thread = threading.Thread(target=closer(self.proxy))
shutdown_thread.start()
self.proxy_thread.join(2.0)
# geo-verification proxy
shutdown_thread = threading.Thread(target=closer(self.geo_proxy))
shutdown_thread.start()
self.geo_proxy_thread.join(2.0)
# Build a 'host:port' proxy address; `port` defaults to self.port.
def _test_proxy(self, host='127.0.0.1', port=None):
return '{0}:{1}'.format(
host, port if port is not None else self.port)
def test_proxy(self):
geo_proxy = '127.0.0.1:{0}'.format(self.geo_port)
geo_proxy = self._test_proxy(port=self.geo_port)
ydl = YoutubeDL({
'proxy': '127.0.0.1:{0}'.format(self.port),
'proxy': self._test_proxy(),
'geo_verification_proxy': geo_proxy,
})
url = 'http://foo.com/bar'
@ -154,7 +572,7 @@ class TestProxy(unittest.TestCase):
def test_proxy_with_idn(self):
ydl = YoutubeDL({
'proxy': '127.0.0.1:{0}'.format(self.port),
'proxy': self._test_proxy(),
})
url = 'http://中文.tw/'
response = ydl.urlopen(url).read().decode('utf-8')
@ -162,5 +580,25 @@ class TestProxy(unittest.TestCase):
self.assertEqual(response, 'normal: http://xn--fiq228c.tw/')
class TestFileURL(unittest.TestCase):
    """Checks how ydl.urlopen() treats file:// URLs."""

    # See https://github.com/ytdl-org/youtube-dl/issues/8227
    def test_file_urls(self):
        """Opening a file:// URL must raise URLError with the expected message.

        The temporary file is created with delete=False, so it has to be
        removed explicitly. Cleanup runs in a ``finally`` clause so the file
        is not left behind when the assertion (or any earlier step) fails.
        """
        tf = tempfile.NamedTemporaryFile(delete=False)
        try:
            tf.write(b'foobar')
            tf.close()
            url = compat_urllib_parse.urljoin('file://', pathname2url(tf.name))
            with FakeYDL() as ydl:
                self.assertRaisesRegexp(
                    compat_urllib_error.URLError, 'file:// scheme is explicitly disabled in youtube-dl for security reasons', ydl.urlopen, url)
            # not yet implemented
            """
            with FakeYDL({'enable_file_urls': True}) as ydl:
                res = ydl.urlopen(url)
                self.assertEqual(res.read(), b'foobar')
                res.close()
            """
        finally:
            # closing again is harmless; it ensures the handle is released
            # before unlinking even if write() raised
            tf.close()
            os.unlink(tf.name)
if __name__ == '__main__':
unittest.main()

View file

@ -11,8 +11,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import math
import re
from youtube_dl.compat import compat_re_Pattern
from youtube_dl.jsinterp import JS_Undefined, JSInterpreter
@ -20,6 +18,7 @@ class TestJSInterpreter(unittest.TestCase):
def test_basic(self):
jsi = JSInterpreter('function x(){;}')
self.assertEqual(jsi.call_function('x'), None)
self.assertEqual(repr(jsi.extract_function('x')), 'F<x>')
jsi = JSInterpreter('function x3(){return 42;}')
self.assertEqual(jsi.call_function('x3'), 42)
@ -34,6 +33,55 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter('function x4(a){return 2*a+1;}')
self.assertEqual(jsi.call_function('x4', 3), 7)
# Addition: number + number, number + undefined (NaN), number + null
# (null counts as 0).
def test_add(self):
jsi = JSInterpreter('function f(){return 42 + 7;}')
self.assertEqual(jsi.call_function('f'), 49)
jsi = JSInterpreter('function f(){return 42 + undefined;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
jsi = JSInterpreter('function f(){return 42 + null;}')
self.assertEqual(jsi.call_function('f'), 42)
# Subtraction: number - number, number - undefined (NaN), number - null
# (null counts as 0).
def test_sub(self):
jsi = JSInterpreter('function f(){return 42 - 7;}')
self.assertEqual(jsi.call_function('f'), 35)
jsi = JSInterpreter('function f(){return 42 - undefined;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
jsi = JSInterpreter('function f(){return 42 - null;}')
self.assertEqual(jsi.call_function('f'), 42)
# Multiplication: number * number, number * undefined (NaN), number * null
# (gives 0).
def test_mul(self):
jsi = JSInterpreter('function f(){return 42 * 7;}')
self.assertEqual(jsi.call_function('f'), 294)
jsi = JSInterpreter('function f(){return 42 * undefined;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
jsi = JSInterpreter('function f(){return 42 * null;}')
self.assertEqual(jsi.call_function('f'), 0)
# Division with operands passed in from Python: 0/0 and undefined/1 give
# NaN, 2/0 gives an infinity, 0/3 gives 0.
def test_div(self):
jsi = JSInterpreter('function f(a, b){return a / b;}')
self.assertTrue(math.isnan(jsi.call_function('f', 0, 0)))
self.assertTrue(math.isnan(jsi.call_function('f', JS_Undefined, 1)))
self.assertTrue(math.isinf(jsi.call_function('f', 2, 0)))
self.assertEqual(jsi.call_function('f', 0, 3), 0)
# Remainder: 42 % 7 is 0; a zero or undefined divisor gives NaN.
def test_mod(self):
jsi = JSInterpreter('function f(){return 42 % 7;}')
self.assertEqual(jsi.call_function('f'), 0)
jsi = JSInterpreter('function f(){return 42 % 0;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
jsi = JSInterpreter('function f(){return 42 % undefined;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
# Exponentiation (**): 42 ** 2 is 1764; an undefined exponent or base gives
# NaN; a null exponent gives 1.
def test_exp(self):
jsi = JSInterpreter('function f(){return 42 ** 2;}')
self.assertEqual(jsi.call_function('f'), 1764)
jsi = JSInterpreter('function f(){return 42 ** undefined;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
jsi = JSInterpreter('function f(){return 42 ** null;}')
self.assertEqual(jsi.call_function('f'), 1)
jsi = JSInterpreter('function f(){return undefined ** 42;}')
self.assertTrue(math.isnan(jsi.call_function('f')))
def test_empty_return(self):
jsi = JSInterpreter('function f(){return; y()}')
self.assertEqual(jsi.call_function('f'), None)
@ -140,15 +188,18 @@ class TestJSInterpreter(unittest.TestCase):
''')
self.assertTrue(math.isnan(jsi.call_function('x')))
jsi = JSInterpreter('''
function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; }
''')
self.assertEqual(jsi.call_function('x'), 86000)
# Date parsing: `new Date(dt) - 0` yields the timestamp in milliseconds.
def test_Date(self):
jsi = JSInterpreter('''
function x(dt) { return new Date(dt) - 0; }
''')
# 18:01:26 MDT (UTC-6) on 31 Dec 1969 is 00:01:26 UTC on 1 Jan 1970, i.e. 86 s
self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000)
# date format m/d/y
self.assertEqual(jsi.call_function('x', '12/31/1969 18:01:26 MDT'), 86000)
# epoch 0
self.assertEqual(jsi.call_function('x', '1 January 1970 00:00:00 UTC'), 0)
def test_call(self):
jsi = JSInterpreter('''
function x() { return 2; }
@ -158,6 +209,57 @@ class TestJSInterpreter(unittest.TestCase):
self.assertEqual(jsi.call_function('z'), 5)
self.assertEqual(jsi.call_function('y'), 2)
# `if` statements with braced bodies: side effect inside the branch, return
# from the branch, and return from the `else` branch. The final
# triple-quoted block is a disabled case (unbraced `if` body) kept for
# reference; it is a bare string expression and is never executed.
def test_if(self):
jsi = JSInterpreter('''
function x() {
let a = 9;
if (0==0) {a++}
return a
}''')
self.assertEqual(jsi.call_function('x'), 10)
jsi = JSInterpreter('''
function x() {
if (0==0) {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
jsi = JSInterpreter('''
function x() {
if (0!=0) {return 1}
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
""" # Unsupported
jsi = JSInterpreter('''
function x() {
if (0!=0) return 1;
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
"""
# `if` / `else if` / `else` chain with braced bodies: both conditions are
# false, so the final `else` value is returned. The trailing triple-quoted
# block is a disabled case (unbraced `if` body) and is never executed.
def test_elseif(self):
jsi = JSInterpreter('''
function x() {
if (0!=0) {return 1}
else if (1==0) {return 2}
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
""" # Unsupported
jsi = JSInterpreter('''
function x() {
if (0!=0) return 1;
else if (1==0) {return 2}
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
# etc
"""
def test_for_loop(self):
# function x() { a=0; for (i=0; i-10; i++) {a++} a }
jsi = JSInterpreter('''
@ -165,6 +267,13 @@ class TestJSInterpreter(unittest.TestCase):
''')
self.assertEqual(jsi.call_function('x'), 10)
# `while` loop: increments `a` from 0 until the condition a<10 fails, then
# returns it.
def test_while_loop(self):
# function x() { a=0; while (a<10) {a++} a }
jsi = JSInterpreter('''
function x() { a=0; while (a<10) {a++} return a }
''')
self.assertEqual(jsi.call_function('x'), 10)
def test_switch(self):
jsi = JSInterpreter('''
function x(f) { switch(f){
@ -381,15 +490,57 @@ class TestJSInterpreter(unittest.TestCase):
self.assertIs(jsi.call_function('x'), None)
jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/; return a; }
function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; }
''')
self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern)
attrs = set(('findall', 'finditer', 'match', 'scanner', 'search',
'split', 'sub', 'subn'))
if sys.version_info >= (2, 7):
# documented for 2.6 but may not be found
attrs.update(('flags', 'groupindex', 'groups', 'pattern'))
self.assertSetEqual(set(dir(jsi.call_function('x'))) & attrs, attrs)
jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/i; return a; }
''')
self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I)
jsi = JSInterpreter(r'''
function x() { let a="data-name".replace("data-", ""); return a }
''')
self.assertEqual(jsi.call_function('x'), 'name')
jsi = JSInterpreter(r'''
function x() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; }
''')
self.assertEqual(jsi.call_function('x'), 'name')
jsi = JSInterpreter(r'''
function x() { let a="data-name".replace(/^.+-/, ""); return a; }
''')
self.assertEqual(jsi.call_function('x'), 'name')
jsi = JSInterpreter(r'''
function x() { let a="data-name".replace(/a/g, "o"); return a; }
''')
self.assertEqual(jsi.call_function('x'), 'doto-nome')
jsi = JSInterpreter(r'''
function x() { let a="data-name".replaceAll("a", "o"); return a; }
''')
self.assertEqual(jsi.call_function('x'), 'doto-nome')
jsi = JSInterpreter(r'''
function x() { let a=[/[)\\]/]; return a[0]; }
''')
self.assertEqual(jsi.call_function('x').pattern, r'[)\\]')
""" # fails
jsi = JSInterpreter(r'''
function x() { let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; }
''')
self.assertEqual(jsi.call_function('x'), 5)
"""
def test_char_code_at(self):
jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}')
self.assertEqual(jsi.call_function('x', 0), 116)
@ -406,6 +557,36 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter('function x(){return 1236566549 << 5}')
self.assertEqual(jsi.call_function('x'), 915423904)
# Shift operators with non-numeric operands: null and undefined as the left
# operand give 0; NaN and Infinity as the shift count leave 42 unchanged.
def test_bitwise_operators_madness(self):
jsi = JSInterpreter('function x(){return null << 5}')
self.assertEqual(jsi.call_function('x'), 0)
jsi = JSInterpreter('function x(){return undefined >> 5}')
self.assertEqual(jsi.call_function('x'), 0)
jsi = JSInterpreter('function x(){return 42 << NaN}')
self.assertEqual(jsi.call_function('x'), 42)
jsi = JSInterpreter('function x(){return 42 << Infinity}')
self.assertEqual(jsi.call_function('x'), 42)
# Mixed expression: Math.pow, a Date with a UTC offset, division and unary minus.
# NOTE(review): the name presumably refers to issue #32066 - confirm.
# Expected value: the date is 00:01:42 UTC = 102000 ms, so
# 243 + 102000 / 1000 * -239 - -24205 = 243 - 24378 + 24205 = 70.
def test_32066(self):
jsi = JSInterpreter("function x(){return Math.pow(3, 5) + new Date('1970-01-01T08:01:42.000+08:00') / 1000 * -239 - -24205;}")
self.assertEqual(jsi.call_function('x'), 70)
# Chained unary minus: 2 - - - 2 is 2 - (-(-2)) = 0. The commented-out case
# mixing unary + and - is recorded as currently failing.
def test_unary_operators(self):
jsi = JSInterpreter('function f(){return 2 - - - 2;}')
self.assertEqual(jsi.call_function('f'), 0)
# fails
# jsi = JSInterpreter('function f(){return 2 + - + - - 2;}')
# self.assertEqual(jsi.call_function('f'), 0)
""" # fails so far
def test_packed(self):
jsi = JSInterpreter('''function x(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''')
self.assertEqual(jsi.call_function('x', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("<q />").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, 
.3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME
_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|')))
"""
if __name__ == '__main__':
unittest.main()

View file

@ -295,6 +295,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
self.DL.params['format'] = 'best/bestvideo'
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['nb-ttv']))
self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149')

View file

@ -5,16 +5,18 @@ from __future__ import unicode_literals
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
dirn = os.path.dirname
sys.path.insert(0, dirn(dirn(os.path.abspath(__file__))))
import errno
import io
import json
import re
import subprocess
from youtube_dl.swfinterp import SWFInterpreter
from youtube_dl.compat import compat_open as open
TEST_DIR = os.path.join(
@ -43,7 +45,7 @@ def _make_testfunc(testfile):
'-static-link-runtime-shared-libraries', as_file])
except OSError as ose:
if ose.errno == errno.ENOENT:
print('mxmlc not found! Skipping test.')
self.skipTest('mxmlc not found!')
return
raise
@ -51,7 +53,7 @@ def _make_testfunc(testfile):
swf_content = swf_f.read()
swfi = SWFInterpreter(swf_content)
with io.open(as_file, 'r', encoding='utf-8') as as_f:
with open(as_file, 'r', encoding='utf-8') as as_f:
as_content = as_f.read()
def _find_spec(key):

View file

@ -2,19 +2,21 @@ from __future__ import unicode_literals
# Allow direct execution
import os
import re
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import io
import re
dirn = os.path.dirname
rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
rootDir = dirn(dirn(os.path.abspath(__file__)))
sys.path.insert(0, rootDir)
IGNORED_FILES = [
'setup.py', # http://bugs.python.org/issue13943
'conf.py',
'buildserver.py',
'get-pip.py',
]
IGNORED_DIRS = [
@ -23,6 +25,7 @@ IGNORED_DIRS = [
]
from test.helper import assertRegexpMatches
from youtube_dl.compat import compat_open as open
class TestUnicodeLiterals(unittest.TestCase):
@ -40,7 +43,7 @@ class TestUnicodeLiterals(unittest.TestCase):
continue
fn = os.path.join(dirpath, basename)
with io.open(fn, encoding='utf-8') as inf:
with open(fn, encoding='utf-8') as inf:
code = inf.read()
if "'" not in code and '"' not in code:

View file

@ -20,7 +20,7 @@ import xml.etree.ElementTree
from youtube_dl.utils import (
age_restricted,
args_to_str,
encode_base_n,
base_url,
caesar,
clean_html,
clean_podcast_url,
@ -29,10 +29,12 @@ from youtube_dl.utils import (
detect_exe_version,
determine_ext,
dict_get,
encode_base_n,
encode_compat_str,
encodeFilename,
escape_rfc3986,
escape_url,
expand_path,
extract_attributes,
ExtractorError,
find_xpath_attr,
@ -51,6 +53,7 @@ from youtube_dl.utils import (
js_to_json,
LazyList,
limit_length,
lowercase_escape,
merge_dicts,
mimetype2ext,
month_by_name,
@ -59,30 +62,33 @@ from youtube_dl.utils import (
OnDemandPagedList,
orderedSet,
parse_age_limit,
parse_bitrate,
parse_duration,
parse_filesize,
parse_codecs,
parse_count,
parse_iso8601,
parse_resolution,
parse_bitrate,
parse_qs,
pkcs1pad,
read_batch_urls,
sanitize_filename,
sanitize_path,
sanitize_url,
expand_path,
prepend_extension,
replace_extension,
read_batch_urls,
remove_start,
remove_end,
remove_quotes,
replace_extension,
rot47,
sanitize_filename,
sanitize_path,
sanitize_url,
shell_quote,
smuggle_url,
str_or_none,
str_to_int,
strip_jsonp,
strip_or_none,
subtitles_filename,
T,
timeconvert,
traverse_obj,
try_call,
@ -91,10 +97,8 @@ from youtube_dl.utils import (
unified_timestamp,
unsmuggle_url,
uppercase_escape,
lowercase_escape,
url_basename,
url_or_none,
base_url,
urljoin,
urlencode_postdata,
urshift,
@ -112,7 +116,7 @@ from youtube_dl.utils import (
cli_option,
cli_valueless_option,
cli_bool_option,
parse_codecs,
YoutubeDLHandler,
)
from youtube_dl.compat import (
compat_chr,
@ -122,7 +126,6 @@ from youtube_dl.compat import (
compat_setenv,
compat_str,
compat_urlparse,
compat_parse_qs,
)
@ -250,6 +253,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar')
self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar')
self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar')
self.assertEqual(sanitize_url('foo bar'), 'foo bar')
def test_expand_path(self):
def env(var):
@ -679,38 +683,36 @@ class TestUtil(unittest.TestCase):
self.assertTrue(isinstance(data, bytes))
def test_update_url_query(self):
def query_dict(url):
return compat_parse_qs(compat_urlparse.urlparse(url).query)
self.assertEqual(query_dict(update_url_query(
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})),
query_dict('http://example.com/path?quality=HD&format=mp4'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?quality=HD&format=mp4'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})),
query_dict('http://example.com/path?system=LINUX&system=WINDOWS'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?system=LINUX&system=WINDOWS'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'fields': 'id,formats,subtitles'})),
query_dict('http://example.com/path?fields=id,formats,subtitles'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?fields=id,formats,subtitles'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})),
query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path?manifest=f4m', {'manifest': []})),
query_dict('http://example.com/path'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})),
query_dict('http://example.com/path?system=LINUX'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?system=LINUX'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'fields': b'id,formats,subtitles'})),
query_dict('http://example.com/path?fields=id,formats,subtitles'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?fields=id,formats,subtitles'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'width': 1080, 'height': 720})),
query_dict('http://example.com/path?width=1080&height=720'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?width=1080&height=720'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'bitrate': 5020.43})),
query_dict('http://example.com/path?bitrate=5020.43'))
self.assertEqual(query_dict(update_url_query(
parse_qs('http://example.com/path?bitrate=5020.43'))
self.assertEqual(parse_qs(update_url_query(
'http://example.com/path', {'test': '第二行тест'})),
query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
parse_qs('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
def test_multipart_encode(self):
self.assertEqual(
@ -902,6 +904,111 @@ class TestUtil(unittest.TestCase):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
def test_remove_dot_segments(self):
def remove_dot_segments(p):
q = '' if p.startswith('/') else '/'
p = 'http://example.com' + q + p
p = compat_urlparse.urlsplit(YoutubeDLHandler._fix_path(p)).path
return p[1:] if q else p
self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
self.assertEqual(remove_dot_segments('/..'), '/')
self.assertEqual(remove_dot_segments('/./'), '/')
self.assertEqual(remove_dot_segments('/./a'), '/a')
self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
self.assertEqual(remove_dot_segments('/'), '/')
self.assertEqual(remove_dot_segments('/t'), '/t')
self.assertEqual(remove_dot_segments('t'), 't')
self.assertEqual(remove_dot_segments(''), '')
self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
self.assertEqual(remove_dot_segments('../a'), 'a')
self.assertEqual(remove_dot_segments('./a'), 'a')
self.assertEqual(remove_dot_segments('.'), '')
self.assertEqual(remove_dot_segments('////'), '////')
def test_js_to_json_vars_strings(self):
self.assertDictEqual(
json.loads(js_to_json(
'''{
'null': a,
'nullStr': b,
'true': c,
'trueStr': d,
'false': e,
'falseStr': f,
'unresolvedVar': g,
}''',
{
'a': 'null',
'b': '"null"',
'c': 'true',
'd': '"true"',
'e': 'false',
'f': '"false"',
'g': 'var',
}
)),
{
'null': None,
'nullStr': 'null',
'true': True,
'trueStr': 'true',
'false': False,
'falseStr': 'false',
'unresolvedVar': 'var'
}
)
self.assertDictEqual(
json.loads(js_to_json(
'''{
'int': a,
'intStr': b,
'float': c,
'floatStr': d,
}''',
{
'a': '123',
'b': '"123"',
'c': '1.23',
'd': '"1.23"',
}
)),
{
'int': 123,
'intStr': '123',
'float': 1.23,
'floatStr': '1.23',
}
)
self.assertDictEqual(
json.loads(js_to_json(
'''{
'object': a,
'objectStr': b,
'array': c,
'arrayStr': d,
}''',
{
'a': '{}',
'b': '"{}"',
'c': '[]',
'd': '"[]"',
}
)),
{
'object': {},
'objectStr': '{}',
'array': [],
'arrayStr': '[]',
}
)
def test_js_to_json_realworld(self):
inp = '''{
'clip':{'provider':'pseudo'}
@ -972,10 +1079,10 @@ class TestUtil(unittest.TestCase):
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'a': True,
'b': False,
'c': False,
'd': True,
'e': [],
'f': "abc",
'g': "",
@ -1045,10 +1152,26 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
on = js_to_json('[1,//{},\n2]')
self.assertEqual(json.loads(on), [1, 2])
on = js_to_json(r'"\^\$\#"')
self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
on = js_to_json('\'"\\""\'')
self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
def test_js_to_json_template_literal(self):
self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
self.assertEqual(js_to_json('`${name}`', {}), '"name"')
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@ -1562,8 +1685,10 @@ Line 1
self.assertEqual(variadic(None), (None, ))
self.assertEqual(variadic('spam'), ('spam', ))
self.assertEqual(variadic('spam', allowed_types=dict), 'spam')
self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam')
def test_traverse_obj(self):
str = compat_str
_TEST_DATA = {
100: 100,
1.2: 1.2,
@ -1581,6 +1706,11 @@ Line 1
'dict': {},
}
# define a pukka Iterable
def iter_range(stop):
for from_ in range(stop):
yield from_
# Test base functionality
self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str',
msg='allow tuple path')
@ -1596,22 +1726,60 @@ Line 1
# Test Ellipsis behavior
self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis),
(item for item in _TEST_DATA.values() if item is not None),
msg='`...` should give all values except `None`')
(item for item in _TEST_DATA.values() if item not in (None, {})),
msg='`...` should give all non-discarded values')
self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(),
msg='`...` selection for dicts should select all values')
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')),
['https://www.example.com/0', 'https://www.example.com/1'],
msg='nested `...` queries should work')
self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4),
self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), iter_range(4),
msg='`...` query result should be flattened')
self.assertEqual(traverse_obj(iter(range(4)), Ellipsis), list(range(4)),
msg='`...` should accept iterables')
# Test function as key
self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)),
[_TEST_DATA['urls']],
msg='function as query key should perform a filter based on (key, value)')
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), {'str'},
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), set(('str',)),
msg='exceptions in the query function should be caught')
self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
msg='function key should accept iterables')
if __debug__:
with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'):
traverse_obj(_TEST_DATA, lambda a: Ellipsis)
with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'):
traverse_obj(_TEST_DATA, lambda a, b, c: Ellipsis)
# Test set as key (transformation/type, like `expected_type`)
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str.upper), )), ['STR'],
msg='Function in set should be a transformation')
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str))), ['str'],
msg='Type in set should be a type filter')
self.assertEqual(traverse_obj(_TEST_DATA, T(dict)), _TEST_DATA,
msg='A single set should be wrapped into a path')
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str.upper))), ['STR'],
msg='Transformation function should not raise')
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str_or_none))),
[item for item in map(str_or_none, _TEST_DATA.values()) if item is not None],
msg='Function in set should be a transformation')
if __debug__:
with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'):
traverse_obj(_TEST_DATA, set())
with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'):
traverse_obj(_TEST_DATA, set((str.upper, str)))
# Test `slice` as a key
_SLICE_DATA = [0, 1, 2, 3, 4]
self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None,
msg='slice on a dictionary should not throw')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1],
msg='slice key should apply slice to sequence')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2],
msg='slice key should apply slice to sequence')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2],
msg='slice key should apply slice to sequence')
# Test alternative paths
self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str',
@ -1657,15 +1825,23 @@ Line 1
{0: ['https://www.example.com/1', 'https://www.example.com/0']},
msg='triple nesting in dict path should be treated as branches')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {},
msg='remove `None` values when dict key')
msg='remove `None` values when top level dict key fails')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=Ellipsis), {0: Ellipsis},
msg='do not remove `None` values if `default`')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}},
msg='do not remove empty values when dict key')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: {}},
msg='do not remove empty values when dict key and a default')
self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {0: []},
msg='if branch in dict key not successful, return `[]`')
msg='use `default` if key fails and `default`')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {},
msg='remove empty values when dict key')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: Ellipsis},
msg='use `default` when dict key and a default')
self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {},
msg='remove empty values when nested dict key fails')
self.assertEqual(traverse_obj(None, {0: 'fail'}), {},
msg='default to dict if pruned')
self.assertEqual(traverse_obj(None, {0: 'fail'}, default=Ellipsis), {0: Ellipsis},
msg='default to dict if pruned and default is given')
self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=Ellipsis), {0: {0: Ellipsis}},
msg='use nested `default` when nested dict key fails and `default`')
self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {},
msg='remove key if branch in dict key not successful')
# Testing default parameter behavior
_DEFAULT_DATA = {'None': None, 'int': 0, 'list': []}
@ -1689,20 +1865,55 @@ Line 1
msg='if branched but not successful return `[]`, not `default`')
self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', Ellipsis)), [],
msg='if branched but object is empty return `[]`, not `default`')
self.assertEqual(traverse_obj(None, Ellipsis), [],
msg='if branched but object is `None` return `[]`, not `default`')
self.assertEqual(traverse_obj({0: None}, (0, Ellipsis)), [],
msg='if branched but state is `None` return `[]`, not `default`')
branching_paths = [
('fail', Ellipsis),
(Ellipsis, 'fail'),
100 * ('fail',) + (Ellipsis,),
(Ellipsis,) + 100 * ('fail',),
]
for branching_path in branching_paths:
self.assertEqual(traverse_obj({}, branching_path), [],
msg='if branched but state is `None`, return `[]` (not `default`)')
self.assertEqual(traverse_obj({}, 'fail', branching_path), [],
msg='if branching in last alternative and previous did not match, return `[]` (not `default`)')
self.assertEqual(traverse_obj({0: 'x'}, 0, branching_path), 'x',
msg='if branching in last alternative and previous did match, return single value')
self.assertEqual(traverse_obj({0: 'x'}, branching_path, 0), 'x',
msg='if branching in first alternative and non-branching path does match, return single value')
self.assertEqual(traverse_obj({}, branching_path, 'fail'), None,
msg='if branching in first alternative and non-branching path does not match, return `default`')
# Testing expected_type behavior
_EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0}
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=compat_str), 'str',
msg='accept matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None,
msg='reject non matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: compat_str(x)), '0',
msg='transform type using type function')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str',
expected_type=lambda _: 1 / 0), None,
msg='wrap expected_type function in try_call')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=compat_str), ['str'],
msg='eliminate items that expected_type fails on')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str),
'str', msg='accept matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int),
None, msg='reject non-matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)),
'0', msg='transform type using type function')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0),
None, msg='wrap expected_type function in try_call')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=str),
['str'], msg='eliminate items that expected_type fails on')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int),
{0: 100}, msg='type as expected_type should filter dict values')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none),
{0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values')
self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, set((int_or_none,))), expected_type=int),
1, msg='expected_type should not filter non-final dict values')
self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int),
{0: {0: 100}}, msg='expected_type should transform deep dict values')
self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(Ellipsis)),
[{0: Ellipsis}, {0: Ellipsis}], msg='expected_type should transform branched dict values')
self.assertEqual(traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int),
[4], msg='expected_type regression for type matching in tuple branching')
self.assertEqual(traverse_obj(_TEST_DATA, ['data', Ellipsis], expected_type=int),
[], msg='expected_type regression for type matching in dict result')
# Test get_all behavior
_GET_ALL_DATA = {'key': [0, 1, 2]}
@ -1747,14 +1958,23 @@ Line 1
_traverse_string=True), '.',
msg='traverse into converted data if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', Ellipsis),
_traverse_string=True), list('str'),
msg='`...` branching into string should result in list')
_traverse_string=True), 'str',
msg='`...` should result in string (same value) if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)),
_traverse_string=True), 'sr',
msg='`slice` should result in string if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == 's'),
_traverse_string=True), 'str',
msg='function should result in string if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),
_traverse_string=True), ['s', 'r'],
msg='branching into string should result in list')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x),
_traverse_string=True), list('str'),
msg='function branching into string should result in list')
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, Ellipsis), _traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, lambda x, y: True), _traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, slice(1)), _traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
# Test is_user_input behavior
_IS_USER_INPUT_DATA = {'range8': list(range(8))}
@ -1791,6 +2011,8 @@ Line 1
msg='failing str key on a `re.Match` should return `default`')
self.assertEqual(traverse_obj(mobj, 8), None,
msg='failing int key on a `re.Match` should return `default`')
self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'],
msg='function on a `re.Match` should give group name as well')
def test_get_first(self):
self.assertEqual(get_first([{'a': None}, {'a': 'spam'}], 'a'), 'spam')

View file

@ -11,12 +11,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, try_rm
import io
import xml.etree.ElementTree
import youtube_dl.YoutubeDL
import youtube_dl.extractor
from youtube_dl.compat import compat_open as open
class YoutubeDL(youtube_dl.YoutubeDL):
@ -51,7 +50,7 @@ class TestAnnotations(unittest.TestCase):
ydl.download([TEST_ID])
self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
annoxml = None
with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
with open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
annoxml = xml.etree.ElementTree.parse(annof)
self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
root = annoxml.getroot()

View file

@ -8,11 +8,14 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import io
import re
import string
from youtube_dl.compat import compat_str, compat_urlretrieve
from youtube_dl.compat import (
compat_open as open,
compat_str,
compat_urlretrieve,
)
from test.helper import FakeYDL
from youtube_dl.extractor import YoutubeIE
@ -67,6 +70,10 @@ _SIG_TESTS = [
]
_NSIG_TESTS = [
(
'https://www.youtube.com/s/player/7862ca1f/player_ias.vflset/en_US/base.js',
'X_LCxVDjAavgE5t', 'yxJ1dM6iz5ogUg',
),
(
'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js',
'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w',
@ -135,6 +142,22 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js',
'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ',
),
(
'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js',
'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A',
),
(
'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js',
'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw',
),
(
'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js',
'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ',
),
(
'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js',
'_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ',
),
]
@ -188,7 +211,7 @@ def t_factory(name, sig_func, url_pattern):
if not os.path.exists(fn):
compat_urlretrieve(url, fn)
with io.open(fn, encoding='utf-8') as testf:
with open(fn, encoding='utf-8') as testf:
jscode = testf.read()
self.assertEqual(sig_func(jscode, sig_input), expected_sig)

35
test/testdata/mpd/range_only.mpd vendored Normal file
View file

@ -0,0 +1,35 @@
<?xml version="1.0"?>
<!-- MPD file Generated with GPAC version 1.0.1-revrelease at 2021-11-27T20:53:11.690Z -->
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" minBufferTime="PT1.500S" type="static" mediaPresentationDuration="PT0H0M30.196S" maxSegmentDuration="PT0H0M10.027S" profiles="urn:mpeg:dash:profile:full:2011">
<ProgramInformation moreInformationURL="http://gpac.io">
<Title>manifest.mpd generated by GPAC</Title>
</ProgramInformation>
<Period duration="PT0H0M30.196S">
<AdaptationSet segmentAlignment="true" maxWidth="768" maxHeight="432" maxFrameRate="30000/1001" par="16:9" lang="und" startWithSAP="1">
<Representation id="1" mimeType="video/mp4" codecs="avc1.4D401E" width="768" height="432" frameRate="30000/1001" sar="1:1" bandwidth="526987">
<BaseURL>video_dashinit.mp4</BaseURL>
<SegmentList timescale="90000" duration="900000">
<Initialization range="0-881"/>
<SegmentURL mediaRange="882-876094" indexRange="882-925"/>
<SegmentURL mediaRange="876095-1466732" indexRange="876095-876138"/>
<SegmentURL mediaRange="1466733-1953615" indexRange="1466733-1466776"/>
<SegmentURL mediaRange="1953616-1994211" indexRange="1953616-1953659"/>
</SegmentList>
</Representation>
</AdaptationSet>
<AdaptationSet segmentAlignment="true" lang="und" startWithSAP="1">
<Representation id="2" mimeType="audio/mp4" codecs="mp4a.40.2" audioSamplingRate="48000" bandwidth="98096">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/>
<BaseURL>audio_dashinit.mp4</BaseURL>
<SegmentList timescale="48000" duration="480000">
<Initialization range="0-752"/>
<SegmentURL mediaRange="753-124129" indexRange="753-796"/>
<SegmentURL mediaRange="124130-250544" indexRange="124130-124173"/>
<SegmentURL mediaRange="250545-374929" indexRange="250545-250588"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

351
test/testdata/mpd/subtitles.mpd vendored Normal file
View file

@ -0,0 +1,351 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Unified Streaming Platform (version=1.10.18-20255) -->
<MPD
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="urn:mpeg:dash:schema:mpd:2011"
xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
type="static"
mediaPresentationDuration="PT14M48S"
maxSegmentDuration="PT1M"
minBufferTime="PT10S"
profiles="urn:mpeg:dash:profile:isoff-live:2011">
<Period
id="1"
duration="PT14M48S">
<BaseURL>dash/</BaseURL>
<AdaptationSet
id="1"
group="1"
contentType="audio"
segmentAlignment="true"
audioSamplingRate="48000"
mimeType="audio/mp4"
codecs="mp4a.40.2"
startWithSAP="1">
<AudioChannelConfiguration
schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011"
value="2" />
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="48000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="3584" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="audio=128001"
bandwidth="128001">
</Representation>
</AdaptationSet>
<AdaptationSet
id="2"
group="3"
contentType="text"
lang="en"
mimeType="application/mp4"
codecs="stpp"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="subtitle" />
<SegmentTemplate
timescale="1000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="60000" r="9" />
<S d="24000" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="textstream_eng=1000"
bandwidth="1000">
</Representation>
</AdaptationSet>
<AdaptationSet
id="3"
group="2"
contentType="video"
par="960:409"
minBandwidth="100000"
maxBandwidth="4482000"
maxWidth="1689"
maxHeight="720"
segmentAlignment="true"
mimeType="video/mp4"
codecs="avc1.4D401F"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="12288"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="24576" r="443" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="video=100000"
bandwidth="100000"
width="336"
height="144"
sar="2880:2863"
scanType="progressive">
</Representation>
<Representation
id="video=326000"
bandwidth="326000"
width="562"
height="240"
sar="115200:114929"
scanType="progressive">
</Representation>
<Representation
id="video=698000"
bandwidth="698000"
width="844"
height="360"
sar="86400:86299"
scanType="progressive">
</Representation>
<Representation
id="video=1493000"
bandwidth="1493000"
width="1126"
height="480"
sar="230400:230267"
scanType="progressive">
</Representation>
<Representation
id="video=4482000"
bandwidth="4482000"
width="1688"
height="720"
sar="86400:86299"
scanType="progressive">
</Representation>
</AdaptationSet>
</Period>
</MPD>

32
test/testdata/mpd/url_and_range.mpd vendored Normal file
View file

@ -0,0 +1,32 @@
<?xml version="1.0" ?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" profiles="urn:mpeg:dash:profile:isoff-live:2011" minBufferTime="PT10.01S" mediaPresentationDuration="PT30.097S" type="static">
<!-- Created with Bento4 mp4-dash.py, VERSION=2.0.0-639 -->
<Period>
<!-- Video -->
<AdaptationSet mimeType="video/mp4" segmentAlignment="true" startWithSAP="1" maxWidth="768" maxHeight="432">
<Representation id="video-avc1" codecs="avc1.4D401E" width="768" height="432" scanType="progressive" frameRate="30000/1001" bandwidth="699597">
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="video-frag.mp4" range="36-746"/>
<SegmentURL media="video-frag.mp4" mediaRange="747-876117"/>
<SegmentURL media="video-frag.mp4" mediaRange="876118-1466913"/>
<SegmentURL media="video-frag.mp4" mediaRange="1466914-1953954"/>
<SegmentURL media="video-frag.mp4" mediaRange="1953955-1994652"/>
</SegmentList>
</Representation>
</AdaptationSet>
<!-- Audio -->
<AdaptationSet mimeType="audio/mp4" startWithSAP="1" segmentAlignment="true">
<Representation id="audio-und-mp4a.40.2" codecs="mp4a.40.2" bandwidth="98808" audioSamplingRate="48000">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:mpegB:cicp:ChannelConfiguration" value="2"/>
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="audio-frag.mp4" range="32-623"/>
<SegmentURL media="audio-frag.mp4" mediaRange="624-124199"/>
<SegmentURL media="audio-frag.mp4" mediaRange="124200-250303"/>
<SegmentURL media="audio-frag.mp4" mediaRange="250304-374365"/>
<SegmentURL media="audio-frag.mp4" mediaRange="374366-374836"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

View file

@ -4,11 +4,9 @@
from __future__ import absolute_import, unicode_literals
import collections
import contextlib
import copy
import datetime
import errno
import fileinput
import io
import itertools
import json
@ -26,25 +24,38 @@ import tokenize
import traceback
import random
try:
from ssl import OPENSSL_VERSION
except ImportError:
# Must be Python 2.6, should be built against 1.0.2
OPENSSL_VERSION = 'OpenSSL 1.0.2(?)'
from string import ascii_letters
from .compat import (
compat_basestring,
compat_cookiejar,
compat_collections_chain_map as ChainMap,
compat_filter as filter,
compat_get_terminal_size,
compat_http_client,
compat_http_cookiejar_Cookie,
compat_http_cookies_SimpleCookie,
compat_integer_types,
compat_kwargs,
compat_map as map,
compat_numeric_types,
compat_open as open,
compat_os_name,
compat_str,
compat_tokenize_tokenize,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urllib_request_DataHandler,
)
from .utils import (
age_restricted,
args_to_str,
bug_reports_message,
ContentTooShortError,
date_from_str,
DateRange,
@ -62,7 +73,9 @@ from .utils import (
GeoRestrictedError,
int_or_none,
ISO3166Utils,
join_nonempty,
locked_file,
LazyList,
make_HTTPS_handler,
MaxDownloadsReached,
orderedSet,
@ -85,6 +98,7 @@ from .utils import (
std_headers,
str_or_none,
subtitles_filename,
traverse_obj,
UnavailableVideoError,
url_basename,
version_tuple,
@ -94,6 +108,7 @@ from .utils import (
YoutubeDLCookieProcessor,
YoutubeDLHandler,
YoutubeDLRedirectHandler,
ytdl_is_updateable,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
@ -363,6 +378,9 @@ class YoutubeDL(object):
self.params.update(params)
self.cache = Cache(self)
self._header_cookies = []
self._load_cookies_from_headers(self.params.get('http_headers'))
def check_deprecated(param, option, suggestion):
if self.params.get(param) is not None:
self.report_warning(
@ -569,7 +587,7 @@ class YoutubeDL(object):
if self.params.get('cookiefile') is not None:
self.cookiejar.save(ignore_discard=True, ignore_expires=True)
def trouble(self, message=None, tb=None):
def trouble(self, *args, **kwargs):
"""Determine action to take when a download problem appears.
Depending on if the downloader has been configured to ignore
@ -578,6 +596,11 @@ class YoutubeDL(object):
tb, if given, is additional traceback information.
"""
# message=None, tb=None, is_error=True
message = args[0] if len(args) > 0 else kwargs.get('message', None)
tb = args[1] if len(args) > 1 else kwargs.get('tb', None)
is_error = args[2] if len(args) > 2 else kwargs.get('is_error', True)
if message is not None:
self.to_stderr(message)
if self.params.get('verbose'):
@ -590,7 +613,10 @@ class YoutubeDL(object):
else:
tb_data = traceback.format_list(traceback.extract_stack())
tb = ''.join(tb_data)
self.to_stderr(tb)
if tb:
self.to_stderr(tb)
if not is_error:
return
if not self.params.get('ignoreerrors', False):
if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
exc_info = sys.exc_info()[1].exc_info
@ -599,11 +625,18 @@ class YoutubeDL(object):
raise DownloadError(message, exc_info)
self._download_retcode = 1
def report_warning(self, message):
def report_warning(self, message, only_once=False, _cache={}):
'''
Print the message to stderr, it will be prefixed with 'WARNING:'
If stderr is a tty file the 'WARNING:' will be colored
'''
if only_once:
m_hash = hash((self, message))
m_cnt = _cache.setdefault(m_hash, 0)
_cache[m_hash] = m_cnt + 1
if m_cnt > 0:
return
if self.params.get('logger') is not None:
self.params['logger'].warning(message)
else:
@ -616,7 +649,7 @@ class YoutubeDL(object):
warning_message = '%s %s' % (_msg_header, message)
self.to_stderr(warning_message)
def report_error(self, message, tb=None):
def report_error(self, message, *args, **kwargs):
'''
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
@ -625,8 +658,18 @@ class YoutubeDL(object):
_msg_header = '\033[0;31mERROR:\033[0m'
else:
_msg_header = 'ERROR:'
error_message = '%s %s' % (_msg_header, message)
self.trouble(error_message, tb)
kwargs['message'] = '%s %s' % (_msg_header, message)
self.trouble(*args, **kwargs)
def report_unscoped_cookies(self, *args, **kwargs):
# message=None, tb=False, is_error=False
if len(args) <= 2:
kwargs.setdefault('is_error', False)
if len(args) <= 0:
kwargs.setdefault(
'message',
'Unscoped cookies are not allowed: please specify some sort of scoping')
self.report_error(*args, **kwargs)
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
@ -822,7 +865,7 @@ class YoutubeDL(object):
msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
self.report_error(msg)
except ExtractorError as e: # An error we somewhat expected
self.report_error(compat_str(e), e.format_traceback())
self.report_error(compat_str(e), tb=e.format_traceback())
except MaxDownloadsReached:
raise
except Exception as e:
@ -832,8 +875,83 @@ class YoutubeDL(object):
raise
return wrapper
def _remove_cookie_header(self, http_headers):
"""Filters out `Cookie` header from an `http_headers` dict
The `Cookie` header is removed to prevent leaks as a result of unscoped cookies.
See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
@param http_headers An `http_headers` dict from which any `Cookie` header
should be removed, or None
"""
return dict(filter(lambda pair: pair[0].lower() != 'cookie', (http_headers or {}).items()))
def _load_cookies(self, data, **kwargs):
"""Loads cookies from a `Cookie` header
This tries to work around the security vulnerability of passing cookies to every domain.
@param data The Cookie header as a string to load the cookies from
@param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains
If `True`, save cookies for later to be stored in the jar with a limited scope
If a URL, save cookies in the jar with the domain of the URL
"""
# autoscope=True (kw-only)
autoscope = kwargs.get('autoscope', True)
for cookie in compat_http_cookies_SimpleCookie(data).values() if data else []:
if autoscope and any(cookie.values()):
raise ValueError('Invalid syntax in Cookie Header')
domain = cookie.get('domain') or ''
expiry = cookie.get('expires')
if expiry == '': # 0 is valid so we check for `''` explicitly
expiry = None
prepared_cookie = compat_http_cookiejar_Cookie(
cookie.get('version') or 0, cookie.key, cookie.value, None, False,
domain, True, True, cookie.get('path') or '', bool(cookie.get('path')),
bool(cookie.get('secure')), expiry, False, None, None, {})
if domain:
self.cookiejar.set_cookie(prepared_cookie)
elif autoscope is True:
self.report_warning(
'Passing cookies as a header is a potential security risk; '
'they will be scoped to the domain of the downloaded urls. '
'Please consider loading cookies from a file or browser instead.',
only_once=True)
self._header_cookies.append(prepared_cookie)
elif autoscope:
self.report_warning(
'The extractor result contains an unscoped cookie as an HTTP header. '
'If you are specifying an input URL, ' + bug_reports_message(),
only_once=True)
self._apply_header_cookies(autoscope, [prepared_cookie])
else:
self.report_unscoped_cookies()
def _load_cookies_from_headers(self, headers):
self._load_cookies(traverse_obj(headers, 'cookie', casesense=False))
def _apply_header_cookies(self, url, cookies=None):
"""This method applies stray header cookies to the provided url
This loads header cookies and scopes them to the domain provided in `url`.
While this is not ideal, it helps reduce the risk of them being sent to
an unintended destination.
"""
parsed = compat_urllib_parse.urlparse(url)
if not parsed.hostname:
return
for cookie in map(copy.copy, cookies or self._header_cookies):
cookie.domain = '.' + parsed.hostname
self.cookiejar.set_cookie(cookie)
@__handle_extraction_exceptions
def __extract_info(self, url, ie, download, extra_info, process):
# Compat with passing cookies in http headers
self._apply_header_cookies(url)
ie_result = ie.extract(url)
if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
return
@ -859,7 +977,7 @@ class YoutubeDL(object):
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""
Take the result of the ie(may be modified) and resolve all unresolved
Take the result of the ie (may be modified) and resolve all unresolved
references (URLs, playlist items).
It will also download the videos if 'download'.
@ -1386,17 +1504,16 @@ class YoutubeDL(object):
'abr': formats_info[1].get('abr'),
'ext': output_ext,
}
video_selector, audio_selector = map(_build_selector_function, selector.selector)
def selector_function(ctx):
for pair in itertools.product(
video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
selector_fn = lambda x: _build_selector_function(x)(ctx)
for pair in itertools.product(*map(selector_fn, selector.selector)):
yield _merge(pair)
filters = [self._build_format_filter(f) for f in selector.filters]
def final_selector(ctx):
ctx_copy = copy.deepcopy(ctx)
ctx_copy = dict(ctx)
for _filter in filters:
ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
return selector_function(ctx_copy)
@ -1431,23 +1548,45 @@ class YoutubeDL(object):
parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
return _build_selector_function(parsed_selector)
def _calc_headers(self, info_dict):
res = std_headers.copy()
def _calc_headers(self, info_dict, load_cookies=False):
if load_cookies: # For --load-info-json
# load cookies from http_headers in legacy info.json
self._load_cookies(traverse_obj(info_dict, ('http_headers', 'Cookie'), casesense=False),
autoscope=info_dict['url'])
# load scoped cookies from info.json
self._load_cookies(info_dict.get('cookies'), autoscope=False)
add_headers = info_dict.get('http_headers')
if add_headers:
res.update(add_headers)
cookies = self._calc_cookies(info_dict)
cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
if cookies:
res['Cookie'] = cookies
# Make a string like name1=val1; attr1=a_val1; ...name2=val2; ...
# By convention a cookie name can't be a well-known attribute name
# so this syntax is unambiguous and can be parsed by (eg) SimpleCookie
encoder = compat_http_cookies_SimpleCookie()
values = []
attributes = (('Domain', '='), ('Path', '='), ('Secure',), ('Expires', '='), ('Version', '='))
attributes = tuple([x[0].lower()] + list(x) for x in attributes)
for cookie in cookies:
_, value = encoder.value_encode(cookie.value)
# Py 2 '' --> '', Py 3 '' --> '""'
if value == '':
value = '""'
values.append('='.join((cookie.name, value)))
for attr in attributes:
value = getattr(cookie, attr[0], None)
if value:
values.append('%s%s' % (''.join(attr[1:]), value if len(attr) == 3 else ''))
info_dict['cookies'] = '; '.join(values)
res = std_headers.copy()
res.update(info_dict.get('http_headers') or {})
res = self._remove_cookie_header(res)
if 'X-Forwarded-For' not in res:
x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
if x_forwarded_for_ip:
res['X-Forwarded-For'] = x_forwarded_for_ip
return res
return res or None
def _calc_cookies(self, info_dict):
pr = sanitized_Request(info_dict['url'])
@ -1626,10 +1765,13 @@ class YoutubeDL(object):
format['protocol'] = determine_protocol(format)
# Add HTTP headers, so that external programs can use them from the
# json output
full_format_info = info_dict.copy()
full_format_info.update(format)
format['http_headers'] = self._calc_headers(full_format_info)
# Remove private housekeeping stuff
format['http_headers'] = self._calc_headers(ChainMap(format, info_dict), load_cookies=True)
# Safeguard against old/insecure infojson when using --load-info-json
info_dict['http_headers'] = self._remove_cookie_header(
info_dict.get('http_headers') or {}) or None
# Remove private housekeeping stuff (copied to http_headers in _calc_headers())
if '__x_forwarded_for_ip' in info_dict:
del info_dict['__x_forwarded_for_ip']
@ -1772,7 +1914,7 @@ class YoutubeDL(object):
self.to_stdout(formatSeconds(info_dict['duration']))
print_mandatory('format')
if self.params.get('forcejson', False):
self.to_stdout(json.dumps(info_dict))
self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
def process_info(self, info_dict):
"""Process a single resolved IE result."""
@ -1832,7 +1974,7 @@ class YoutubeDL(object):
else:
try:
self.to_screen('[info] Writing video description to: ' + descfn)
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
descfile.write(info_dict['description'])
except (OSError, IOError):
self.report_error('Cannot write description file ' + descfn)
@ -1847,7 +1989,7 @@ class YoutubeDL(object):
else:
try:
self.to_screen('[info] Writing video annotations to: ' + annofn)
with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
annofile.write(info_dict['annotations'])
except (KeyError, TypeError):
self.report_warning('There are no annotations to write.')
@ -1874,7 +2016,7 @@ class YoutubeDL(object):
try:
# Use newline='' to prevent conversion of newline characters
# See https://github.com/ytdl-org/youtube-dl/issues/10268
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
with open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
subfile.write(sub_info['data'])
except (OSError, IOError):
self.report_error('Cannot write subtitles file ' + sub_filename)
@ -1883,24 +2025,16 @@ class YoutubeDL(object):
try:
sub_data = ie._request_webpage(
sub_info['url'], info_dict['id'], note=False).read()
with io.open(encodeFilename(sub_filename), 'wb') as subfile:
with open(encodeFilename(sub_filename), 'wb') as subfile:
subfile.write(sub_data)
except (ExtractorError, IOError, OSError, ValueError) as err:
self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, error_to_compat_str(err)))
continue
if self.params.get('writeinfojson', False):
infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
self.to_screen('[info] Video description metadata is already present')
else:
self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
try:
write_json_file(self.filter_requested_info(info_dict), infofn)
except (OSError, IOError):
self.report_error('Cannot write metadata to JSON file ' + infofn)
return
self._write_info_json(
'video description', info_dict,
replace_extension(filename, 'info.json', info_dict.get('ext')))
self._write_thumbnails(info_dict, filename)
@ -1921,7 +2055,11 @@ class YoutubeDL(object):
fd.add_progress_hook(ph)
if self.params.get('verbose'):
self.to_screen('[debug] Invoking downloader on %r' % info.get('url'))
return fd.download(name, info)
new_info = dict((k, v) for k, v in info.items() if not k.startswith('__p'))
new_info['http_headers'] = self._calc_headers(new_info)
return fd.download(name, new_info)
if info_dict.get('requested_formats') is not None:
downloaded = []
@ -2086,16 +2224,13 @@ class YoutubeDL(object):
raise
else:
if self.params.get('dump_single_json', False):
self.to_stdout(json.dumps(res))
self.to_stdout(json.dumps(self.sanitize_info(res)))
return self._download_retcode
def download_with_info_file(self, info_filename):
with contextlib.closing(fileinput.FileInput(
[info_filename], mode='r',
openhook=fileinput.hook_encoded('utf-8'))) as f:
# FileInput doesn't have a read method, we can't call json.load
info = self.filter_requested_info(json.loads('\n'.join(f)))
with open(info_filename, encoding='utf-8') as f:
info = self.filter_requested_info(json.load(f))
try:
self.process_ie_result(info, download=True)
except DownloadError:
@ -2108,10 +2243,36 @@ class YoutubeDL(object):
return self._download_retcode
@staticmethod
def filter_requested_info(info_dict):
return dict(
(k, v) for k, v in info_dict.items()
if k not in ['requested_formats', 'requested_subtitles'])
def sanitize_info(info_dict, remove_private_keys=False):
''' Sanitize the infodict for converting to json '''
if info_dict is None:
return info_dict
if remove_private_keys:
reject = lambda k, v: (v is None
or k.startswith('__')
or k in ('requested_formats',
'requested_subtitles'))
else:
reject = lambda k, v: False
def filter_fn(obj):
if isinstance(obj, dict):
return dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v))
elif isinstance(obj, (list, tuple, set, LazyList)):
return list(map(filter_fn, obj))
elif obj is None or any(isinstance(obj, c)
for c in (compat_integer_types,
(compat_str, float, bool))):
return obj
else:
return repr(obj)
return filter_fn(info_dict)
@classmethod
def filter_requested_info(cls, info_dict):
return cls.sanitize_info(info_dict, True)
def post_process(self, filename, ie_info):
"""Run all the postprocessors on the given file."""
@ -2318,9 +2479,12 @@ class YoutubeDL(object):
self.get_encoding()))
write_string(encoding_str, encoding=None)
self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), ))
writeln_debug('youtube-dl version ', __version__)
if _LAZY_LOADER:
self._write_string('[debug] Lazy loading extractors enabled' + '\n')
writeln_debug('Lazy loading extractors enabled')
if ytdl_is_updateable():
writeln_debug('Single file build')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
@ -2329,7 +2493,7 @@ class YoutubeDL(object):
out, err = process_communicate_or_kill(sp)
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
self._write_string('[debug] Git HEAD: ' + out + '\n')
writeln_debug('Git HEAD: ', out)
except Exception:
try:
sys.exc_clear()
@ -2342,9 +2506,22 @@ class YoutubeDL(object):
return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
return impl_name
self._write_string('[debug] Python version %s (%s) - %s\n' % (
platform.python_version(), python_implementation(),
platform_name()))
def libc_ver():
try:
return platform.libc_ver()
except OSError: # We may not have access to the executable
return []
libc = join_nonempty(*libc_ver(), delim=' ')
writeln_debug('Python %s (%s %s %s) - %s - %s%s' % (
platform.python_version(),
python_implementation(),
platform.machine(),
platform.architecture()[0],
platform_name(),
OPENSSL_VERSION,
(' - %s' % (libc, )) if libc else ''
))
exe_versions = FFmpegPostProcessor.get_versions(self)
exe_versions['rtmpdump'] = rtmpdump_version()
@ -2356,17 +2533,17 @@ class YoutubeDL(object):
)
if not exe_str:
exe_str = 'none'
self._write_string('[debug] exe versions: %s\n' % exe_str)
writeln_debug('exe versions: %s' % (exe_str, ))
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
writeln_debug('Proxy map: ', compat_str(proxy_map))
if self.params.get('call_home', False):
ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
self._write_string('[debug] Public IP address: %s\n' % ipaddr)
writeln_debug('Public IP address: %s' % (ipaddr, ))
latest_version = self.urlopen(
'https://yt-dl.org/latest/version').read().decode('utf-8')
if version_tuple(latest_version) > version_tuple(__version__):
@ -2383,7 +2560,7 @@ class YoutubeDL(object):
opts_proxy = self.params.get('proxy')
if opts_cookiefile is None:
self.cookiejar = compat_cookiejar.CookieJar()
self.cookiejar = YoutubeDLCookieJar()
else:
opts_cookiefile = expand_path(opts_cookiefile)
self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
@ -2444,6 +2621,28 @@ class YoutubeDL(object):
encoding = preferredencoding()
return encoding
def _write_info_json(self, label, info_dict, infofn, overwrite=None):
if not self.params.get('writeinfojson', False):
return False
def msg(fmt, lbl):
return fmt % (lbl + ' metadata',)
if overwrite is None:
overwrite = not self.params.get('nooverwrites', False)
if not overwrite and os.path.exists(encodeFilename(infofn)):
self.to_screen(msg('[info] %s is already present', label.title()))
return 'exists'
else:
self.to_screen(msg('[info] Writing %s as JSON to: ', label) + infofn)
try:
write_json_file(self.filter_requested_info(info_dict), infofn)
return True
except (OSError, IOError):
self.report_error(msg('Cannot write %s to JSON file ', label) + infofn)
return
def _write_thumbnails(self, info_dict, filename):
if self.params.get('writethumbnail', False):
thumbnails = info_dict.get('thumbnails')

View file

@ -5,7 +5,6 @@ from __future__ import unicode_literals
__license__ = 'Public Domain'
import codecs
import io
import os
import random
@ -17,6 +16,7 @@ from .options import (
)
from .compat import (
compat_getpass,
compat_register_utf8,
compat_shlex_split,
workaround_optparse_bug9161,
)
@ -46,10 +46,8 @@ from .YoutubeDL import YoutubeDL
def _real_main(argv=None):
# Compatibility fixes for Windows
if sys.platform == 'win32':
# https://github.com/ytdl-org/youtube-dl/issues/820
codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
# Compatibility fix for Windows
compat_register_utf8()
workaround_optparse_bug9161()

View file

@ -1,14 +1,16 @@
from __future__ import unicode_literals
import errno
import io
import json
import os
import re
import shutil
import traceback
from .compat import compat_getenv
from .compat import (
compat_getenv,
compat_open as open,
)
from .utils import (
error_to_compat_str,
expand_path,
@ -83,7 +85,7 @@ class Cache(object):
cache_fn = self._get_cache_fn(section, key, dtype)
try:
try:
with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
with open(cache_fn, 'r', encoding='utf-8') as cachef:
return self._validate(json.load(cachef), min_ver)
except ValueError:
try:

View file

@ -1663,5 +1663,5 @@ def casefold(s):
__all__ = [
casefold
'casefold',
]

View file

@ -1,10 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
from __future__ import division
import base64
import binascii
import collections
import ctypes
import datetime
import email
import getpass
import io
@ -19,20 +21,30 @@ import socket
import struct
import subprocess
import sys
import types
import xml.etree.ElementTree
# naming convention
# 'compat_' + Python3_name.replace('.', '_')
# other aliases exist for convenience and/or legacy
# deal with critical unicode/str things first
try:
# Python 2
compat_str, compat_basestring, compat_chr = (
unicode, basestring, unichr
)
from .casefold import casefold as compat_casefold
except NameError:
compat_str, compat_basestring, compat_chr = (
str, str, chr
str, (str, bytes), chr
)
# casefold
try:
compat_str.casefold
compat_casefold = lambda s: s.casefold()
except AttributeError:
from .casefold import casefold as compat_casefold
try:
import collections.abc as compat_collections_abc
@ -44,6 +56,29 @@ try:
except ImportError: # Python 2
import urllib2 as compat_urllib_request
# Also fix up lack of method arg in old Pythons
try:
type(compat_urllib_request.Request('http://127.0.0.1', method='GET'))
except TypeError:
def _add_init_method_arg(cls):
# Patch cls.__init__ in place with a wrapper that accepts an optional
# 'method' keyword argument, for Pythons whose Request() rejects it
# (this helper is only defined after Request(..., method='GET') raised TypeError).
# keep a reference to the unpatched initialiser for the wrapper to call
init = cls.__init__
def wrapped_init(self, *args, **kwargs):
# strip 'method' so the original __init__ never sees it; default is 'GET'
method = kwargs.pop('method', 'GET')
init(self, *args, **kwargs)
# look for a callable get_method in the __dict__ of the instance's class
# (when that is not cls itself) or of the instance
if any(callable(x.__dict__.get('get_method')) for x in (self.__class__, self) if x != cls):
# allow instance or its subclass to override get_method()
return
# a request that carries data but was left at 'GET' is treated as 'POST'
if self.has_data() and method == 'GET':
method = 'POST'
# bind a per-instance get_method() that returns the chosen method
self.get_method = types.MethodType(lambda _: method, self)
cls.__init__ = wrapped_init
_add_init_method_arg(compat_urllib_request.Request)
del _add_init_method_arg
try:
import urllib.error as compat_urllib_error
except ImportError: # Python 2
@ -53,26 +88,32 @@ try:
import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
import urllib as compat_urllib_parse
import urlparse as _urlparse
for a in dir(_urlparse):
if not hasattr(compat_urllib_parse, a):
setattr(compat_urllib_parse, a, getattr(_urlparse, a))
del _urlparse
try:
from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
from urlparse import urlparse as compat_urllib_parse_urlparse
try:
import urllib.parse as compat_urlparse
except ImportError: # Python 2
import urlparse as compat_urlparse
# unfavoured aliases
compat_urlparse = compat_urllib_parse
compat_urllib_parse_urlparse = compat_urllib_parse.urlparse
try:
import urllib.response as compat_urllib_response
except ImportError: # Python 2
import urllib as compat_urllib_response
try:
compat_urllib_response.addinfourl.status
except AttributeError:
# .getcode() is deprecated in Py 3.
compat_urllib_response.addinfourl.status = property(lambda self: self.getcode())
try:
import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
import cookielib as compat_cookiejar
compat_http_cookiejar = compat_cookiejar
if sys.version_info[0] == 2:
class compat_cookiejar_Cookie(compat_cookiejar.Cookie):
@ -84,20 +125,35 @@ if sys.version_info[0] == 2:
compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs)
else:
compat_cookiejar_Cookie = compat_cookiejar.Cookie
compat_http_cookiejar_Cookie = compat_cookiejar_Cookie
try:
import http.cookies as compat_cookies
except ImportError: # Python 2
import Cookie as compat_cookies
compat_http_cookies = compat_cookies
if sys.version_info[0] == 2:
if sys.version_info[0] == 2 or sys.version_info < (3, 3):
class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):
def load(self, rawdata):
if isinstance(rawdata, compat_str):
rawdata = str(rawdata)
return super(compat_cookies_SimpleCookie, self).load(rawdata)
must_have_value = 0
if not isinstance(rawdata, dict):
if sys.version_info[:2] != (2, 7) or sys.platform.startswith('java'):
# attribute must have value for parsing
rawdata, must_have_value = re.subn(
r'(?i)(;\s*)(secure|httponly)(\s*(?:;|$))', r'\1\2=\2\3', rawdata)
if sys.version_info[0] == 2:
if isinstance(rawdata, compat_str):
rawdata = str(rawdata)
super(compat_cookies_SimpleCookie, self).load(rawdata)
if must_have_value > 0:
for morsel in self.values():
for attr in ('secure', 'httponly'):
if morsel.get(attr):
morsel[attr] = True
else:
compat_cookies_SimpleCookie = compat_cookies.SimpleCookie
compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie
try:
import html.entities as compat_html_entities
@ -2346,21 +2402,29 @@ try:
import http.client as compat_http_client
except ImportError: # Python 2
import httplib as compat_http_client
try:
compat_http_client.HTTPResponse.getcode
except AttributeError:
# Py < 3.1
compat_http_client.HTTPResponse.getcode = lambda self: self.status
try:
from urllib.error import HTTPError as compat_HTTPError
except ImportError: # Python 2
from urllib2 import HTTPError as compat_HTTPError
compat_urllib_HTTPError = compat_HTTPError
try:
from urllib.request import urlretrieve as compat_urlretrieve
except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve
compat_urllib_request_urlretrieve = compat_urlretrieve
try:
from html.parser import HTMLParser as compat_HTMLParser
except ImportError: # Python 2
from HTMLParser import HTMLParser as compat_HTMLParser
compat_html_parser_HTMLParser = compat_HTMLParser
try: # Python 2
from HTMLParser import HTMLParseError as compat_HTMLParseError
@ -2374,6 +2438,7 @@ except ImportError: # Python <3.4
# and uniform cross-version exception handling
class compat_HTMLParseError(Exception):
pass
compat_html_parser_HTMLParseError = compat_HTMLParseError
try:
from subprocess import DEVNULL
@ -2390,6 +2455,8 @@ try:
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
from urllib.parse import unquote as compat_urllib_parse_unquote
from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
from urllib.parse import urlencode as compat_urllib_parse_urlencode
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
_asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
else re.compile(r'([\x00-\x7f]+)'))
@ -2456,9 +2523,6 @@ except ImportError: # Python 2
string = string.replace('+', ' ')
return compat_urllib_parse_unquote(string, encoding, errors)
try:
from urllib.parse import urlencode as compat_urllib_parse_urlencode
except ImportError: # Python 2
# Python 2 will choke in urlencode on mixture of byte and unicode strings.
# Possible solutions are to either port it from python 3 with all
# the friends or manually ensure input query contains only byte strings.
@ -2480,7 +2544,62 @@ except ImportError: # Python 2
def encode_list(l):
return [encode_elem(e) for e in l]
return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq)
return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq)
# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
# Python 2's version is apparently totally broken
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
# Split the query string qs into a list of (name, value) pairs.
# keep_blank_values: also keep fields whose value is empty or missing.
# strict_parsing: raise ValueError on a field without '=' instead of skipping it.
# encoding, errors: passed through to compat_urllib_parse_unquote().
# Both name and value are passed through compat_str before being returned.
qs, _coerce_result = qs, compat_str
# fields may be separated by either '&' or ';'
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
r = []
for name_value in pairs:
# an empty field is skipped unless strict_parsing is set
if not name_value and not strict_parsing:
continue
# split on the first '=' only, so the value may itself contain '='
nv = name_value.split('=', 1)
if len(nv) != 2:
if strict_parsing:
raise ValueError('bad query field: %r' % (name_value,))
# Handle case of a control-name with no equal sign
if keep_blank_values:
nv.append('')
else:
continue
# a field with an empty value is only processed when keep_blank_values is set
if len(nv[1]) or keep_blank_values:
# '+' stands for a space; percent-escapes are then decoded
name = nv[0].replace('+', ' ')
name = compat_urllib_parse_unquote(
name, encoding=encoding, errors=errors)
name = _coerce_result(name)
value = nv[1].replace('+', ' ')
value = compat_urllib_parse_unquote(
value, encoding=encoding, errors=errors)
value = _coerce_result(value)
r.append((name, value))
return r
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
# Parse the query string qs into a dict mapping each field name to the
# list of its values, in the order _parse_qsl() returned them.
# All arguments are forwarded unchanged to _parse_qsl().
parsed_result = {}
pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors)
for name, value in pairs:
# repeated names accumulate their values in a single list
if name in parsed_result:
parsed_result[name].append(value)
else:
parsed_result[name] = [value]
return parsed_result
setattr(compat_urllib_parse, '_urlencode',
getattr(compat_urllib_parse, 'urlencode'))
for name, fix in (
('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes),
('parse_unquote', compat_urllib_parse_unquote),
('unquote_plus', compat_urllib_parse_unquote_plus),
('urlencode', compat_urllib_parse_urlencode),
('parse_qs', compat_parse_qs)):
setattr(compat_urllib_parse, name, fix)
compat_urllib_parse_parse_qs = compat_parse_qs
try:
from urllib.request import DataHandler as compat_urllib_request_DataHandler
@ -2520,6 +2639,7 @@ try:
from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error
etree = xml.etree.ElementTree
@ -2533,10 +2653,11 @@ try:
# xml.etree.ElementTree.Element is a method in Python <=2.6 and
# the following will crash with:
# TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types
isinstance(None, xml.etree.ElementTree.Element)
isinstance(None, etree.Element)
from xml.etree.ElementTree import Element as compat_etree_Element
except TypeError: # Python <=2.6
from xml.etree.ElementTree import _ElementInterface as compat_etree_Element
compat_xml_etree_ElementTree_Element = compat_etree_Element
if sys.version_info[0] >= 3:
def compat_etree_fromstring(text):
@ -2592,6 +2713,7 @@ else:
if k == uri or v == prefix:
del etree._namespace_map[k]
etree._namespace_map[uri] = prefix
compat_xml_etree_register_namespace = compat_etree_register_namespace
if sys.version_info < (2, 7):
# Here comes the crazy part: In 2.6, if the xpath is a unicode,
@ -2603,53 +2725,6 @@ if sys.version_info < (2, 7):
else:
compat_xpath = lambda xpath: xpath
try:
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
# Python 2's version is apparently totally broken
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
qs, _coerce_result = qs, compat_str
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
r = []
for name_value in pairs:
if not name_value and not strict_parsing:
continue
nv = name_value.split('=', 1)
if len(nv) != 2:
if strict_parsing:
raise ValueError('bad query field: %r' % (name_value,))
# Handle case of a control-name with no equal sign
if keep_blank_values:
nv.append('')
else:
continue
if len(nv[1]) or keep_blank_values:
name = nv[0].replace('+', ' ')
name = compat_urllib_parse_unquote(
name, encoding=encoding, errors=errors)
name = _coerce_result(name)
value = nv[1].replace('+', ' ')
value = compat_urllib_parse_unquote(
value, encoding=encoding, errors=errors)
value = _coerce_result(value)
r.append((name, value))
return r
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
parsed_result = {}
pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors)
for name, value in pairs:
if name in parsed_result:
parsed_result[name].append(value)
else:
parsed_result[name] = [value]
return parsed_result
compat_os_name = os._name if os.name == 'java' else os.name
@ -2774,6 +2849,8 @@ else:
else:
compat_expanduser = os.path.expanduser
compat_os_path_expanduser = compat_expanduser
if compat_os_name == 'nt' and sys.version_info < (3, 8):
# os.path.realpath on Windows does not follow symbolic links
@ -2785,6 +2862,8 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8):
else:
compat_realpath = os.path.realpath
compat_os_path_realpath = compat_realpath
if sys.version_info < (3, 0):
def compat_print(s):
@ -2805,11 +2884,15 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':
else:
compat_getpass = getpass.getpass
compat_getpass_getpass = compat_getpass
try:
compat_input = raw_input
except NameError: # Python 3
compat_input = input
# Python < 2.6.5 require kwargs to be bytes
try:
def _testfunc(x):
@ -2915,15 +2998,16 @@ else:
lines = _lines
return _terminal_size(columns, lines)
try:
itertools.count(start=0, step=1)
compat_itertools_count = itertools.count
except TypeError: # Python 2.6
def compat_itertools_count(start=0, step=1):
n = start
while True:
yield n
n += step
yield start
start += step
if sys.version_info >= (3, 0):
from tokenize import tokenize as compat_tokenize_tokenize
@ -3075,6 +3159,8 @@ if sys.version_info < (3, 3):
else:
compat_b64decode = base64.b64decode
compat_base64_b64decode = compat_b64decode
if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0):
# PyPy2 prior to version 5.4.0 expects byte strings as Windows function
@ -3094,30 +3180,95 @@ else:
return ctypes.WINFUNCTYPE(*args, **kwargs)
__all__ = [
if sys.version_info < (3, 0):
# Python 2: provide compat_open() as a wrapper around io.open().
# Supported signature:
# open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True) not: opener=None
def compat_open(file_, *args, **kwargs):
# 'opener' would be the 7th argument after file_, or the 'opener' keyword;
# either form is rejected with ValueError instead of being passed to io.open()
if len(args) > 6 or 'opener' in kwargs:
raise ValueError('open: unsupported argument "opener"')
return io.open(file_, *args, **kwargs)
else:
# Python 3: compat_open is the builtin open
compat_open = open
# compat_register_utf8
def compat_register_utf8():
# On win32, register a codec search function that resolves the codec
# name 'cp65001' to the UTF-8 codec. On any other platform this function
# does nothing.
if sys.platform == 'win32':
# https://github.com/ytdl-org/youtube-dl/issues/820
# imported here, inside the function, so only the win32 path performs the import
from codecs import register, lookup
# the search function returns None for every name other than 'cp65001'
register(
lambda name: lookup('utf-8') if name == 'cp65001' else None)
# compat_datetime_timedelta_total_seconds
try:
# use the stdlib timedelta.total_seconds when the attribute exists
compat_datetime_timedelta_total_seconds = datetime.timedelta.total_seconds
except AttributeError:
# Py 2.6
def compat_datetime_timedelta_total_seconds(td):
# Fallback: combine days, seconds and microseconds of td into microseconds,
# then scale back to seconds.
# NOTE(review): the '/' presumably relies on true division being enabled by
# the module's `from __future__ import division` - confirm.
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
# optional decompression packages
# PyPI brotli package implements 'br' Content-Encoding
try:
import brotli as compat_brotli
except ImportError:
compat_brotli = None
# PyPI ncompress package implements 'compress' Content-Encoding
try:
import ncompress as compat_ncompress
except ImportError:
compat_ncompress = None
legacy = [
'compat_HTMLParseError',
'compat_HTMLParser',
'compat_HTTPError',
'compat_Struct',
'compat_b64decode',
'compat_basestring',
'compat_casefold',
'compat_chr',
'compat_collections_abc',
'compat_collections_chain_map',
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
'compat_cookies_SimpleCookie',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_Element',
'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',
'compat_getpass',
'compat_parse_qs',
'compat_realpath',
'compat_urllib_parse_parse_qs',
'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus',
'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlencode',
'compat_urllib_parse_urlparse',
'compat_urlparse',
'compat_urlretrieve',
'compat_xml_parse_error',
]
__all__ = [
'compat_html_parser_HTMLParseError',
'compat_html_parser_HTMLParser',
'compat_Struct',
'compat_base64_b64decode',
'compat_basestring',
'compat_brotli',
'compat_casefold',
'compat_chr',
'compat_collections_abc',
'compat_collections_chain_map',
'compat_datetime_timedelta_total_seconds',
'compat_http_cookiejar',
'compat_http_cookiejar_Cookie',
'compat_http_cookies',
'compat_http_cookies_SimpleCookie',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_fromstring',
'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
'compat_getpass',
'compat_getpass_getpass',
'compat_html_entities',
'compat_html_entities_html5',
'compat_http_client',
@ -3128,14 +3279,17 @@ __all__ = [
'compat_itertools_zip_longest',
'compat_kwargs',
'compat_map',
'compat_ncompress',
'compat_numeric_types',
'compat_open',
'compat_ord',
'compat_os_name',
'compat_parse_qs',
'compat_os_path_expanduser',
'compat_os_path_realpath',
'compat_print',
'compat_re_Match',
'compat_re_Pattern',
'compat_realpath',
'compat_register_utf8',
'compat_setenv',
'compat_shlex_quote',
'compat_shlex_split',
@ -3147,17 +3301,14 @@ __all__ = [
'compat_tokenize_tokenize',
'compat_urllib_error',
'compat_urllib_parse',
'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus',
'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlencode',
'compat_urllib_parse_urlparse',
'compat_urllib_request',
'compat_urllib_request_DataHandler',
'compat_urllib_response',
'compat_urlparse',
'compat_urlretrieve',
'compat_xml_parse_error',
'compat_urllib_request_urlretrieve',
'compat_urllib_HTTPError',
'compat_xml_etree_ElementTree_Element',
'compat_xml_etree_ElementTree_ParseError',
'compat_xml_etree_register_namespace',
'compat_xpath',
'compat_zip',
'workaround_optparse_bug9161',

View file

@ -88,17 +88,21 @@ class FileDownloader(object):
return '---.-%'
return '%6s' % ('%3.1f%%' % percent)
@staticmethod
def calc_eta(start, now, total, current):
@classmethod
def calc_eta(cls, start_or_rate, now_or_remaining, *args):
if len(args) < 2:
rate, remaining = (start_or_rate, now_or_remaining)
if None in (rate, remaining):
return None
return int(float(remaining) / rate)
start, now = (start_or_rate, now_or_remaining)
total, current = args[:2]
if total is None:
return None
if now is None:
now = time.time()
dif = now - start
if current == 0 or dif < 0.001: # One millisecond
return None
rate = float(current) / dif
return int((float(total) - float(current)) / rate)
rate = cls.calc_speed(start, now, current)
return rate and int((float(total) - float(current)) / rate)
@staticmethod
def format_eta(eta):
@ -123,6 +127,12 @@ class FileDownloader(object):
def format_retries(retries):
return 'inf' if retries == float('inf') else '%.0f' % retries
@staticmethod
def filesize_or_none(unencoded_filename):
# Return the size reported by os.path.getsize() for the named file when
# it exists as a regular file.
# Otherwise execution falls off the end, so the result is None.
# the name is passed through encodeFilename() before touching the filesystem
fn = encodeFilename(unencoded_filename)
if os.path.isfile(fn):
return os.path.getsize(fn)
@staticmethod
def best_block_size(elapsed_time, bytes):
new_min = max(bytes / 2.0, 1.0)
@ -329,6 +339,10 @@ class FileDownloader(object):
def download(self, filename, info_dict):
"""Download to a filename using the info from info_dict
Return True on success and False otherwise
This method filters the `Cookie` header from the info_dict to prevent leaks.
Downloaders have their own way of handling cookies.
See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
"""
nooverwrites_and_exists = (

View file

@ -1,5 +1,7 @@
from __future__ import unicode_literals
import itertools
from .fragment import FragmentFD
from ..compat import compat_urllib_error
from ..utils import (
@ -30,26 +32,28 @@ class DashSegmentsFD(FragmentFD):
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
frag_index = 0
for i, fragment in enumerate(fragments):
frag_index += 1
for frag_index, fragment in enumerate(fragments, 1):
if frag_index <= ctx['fragment_index']:
continue
success = False
# In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment
fatal = i == 0 or not skip_unavailable_fragments
count = 0
while count <= fragment_retries:
fatal = frag_index == 1 or not skip_unavailable_fragments
fragment_url = fragment.get('url')
if not fragment_url:
assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path'])
headers = info_dict.get('http_headers')
fragment_range = fragment.get('range')
if fragment_range:
headers = headers.copy() if headers else {}
headers['Range'] = 'bytes=%s' % (fragment_range,)
for count in itertools.count():
try:
fragment_url = fragment.get('url')
if not fragment_url:
assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path'])
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict, headers)
if not success:
return False
self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
# YouTube may often return 404 HTTP error for a fragment causing the
# whole download to fail. However if the same fragment is immediately
@ -57,22 +61,21 @@ class DashSegmentsFD(FragmentFD):
# is usually enough) thus allowing to download the whole file successfully.
# To be future-proof we will retry all fragments that fail with any
# HTTP error.
count += 1
if count <= fragment_retries:
self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count < fragment_retries:
self.report_retry_fragment(err, frag_index, count + 1, fragment_retries)
continue
except DownloadError:
# Don't retry fragment if error occurred during HTTP downloading
# itself since it has own retry settings
if not fatal:
self.report_skip_fragment(frag_index)
break
raise
# itself since it has its own retry settings
if fatal:
raise
break
if count > fragment_retries:
if not success:
if not fatal:
self.report_skip_fragment(frag_index)
continue
self.report_error('giving up after %s fragment retries' % fragment_retries)
self.report_error('giving up after %s fragment retries' % count)
return False
self._finish_frag_download(ctx)

View file

@ -1,9 +1,10 @@
from __future__ import unicode_literals
import os.path
import os
import re
import subprocess
import sys
import tempfile
import time
from .common import FileDownloader
@ -23,6 +24,8 @@ from ..utils import (
check_executable,
is_outdated_version,
process_communicate_or_kill,
T,
traverse_obj,
)
@ -30,6 +33,7 @@ class ExternalFD(FileDownloader):
def real_download(self, filename, info_dict):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
self._cookies_tempfile = None
try:
started = time.time()
@ -42,6 +46,13 @@ class ExternalFD(FileDownloader):
# should take place
retval = 0
self.to_screen('[%s] Interrupted by user' % self.get_basename())
finally:
if self._cookies_tempfile and os.path.isfile(self._cookies_tempfile):
try:
os.remove(self._cookies_tempfile)
except OSError:
self.report_warning(
'Unable to delete temporary cookies file "{0}"'.format(self._cookies_tempfile))
if retval == 0:
status = {
@ -97,6 +108,16 @@ class ExternalFD(FileDownloader):
def _configuration_args(self, default=[]):
return cli_configuration_args(self.params, 'external_downloader_args', default)
def _write_cookies(self):
# Save self.ydl.cookiejar to a file and return the path to give to the
# external downloader: the jar's own filename when it has one, otherwise
# a newly created temporary file recorded in self._cookies_tempfile.
if not self.ydl.cookiejar.filename:
# delete=False keeps the file on disk after close(); only its name is kept
tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False)
tmp_cookies.close()
self._cookies_tempfile = tmp_cookies.name
self.to_screen('[download] Writing temporary cookies file to "{0}"'.format(self._cookies_tempfile))
# real_download resets _cookies_tempfile; if it's None, save() will write to cookiejar.filename
# the save is requested with both ignore_discard and ignore_expires set
self.ydl.cookiejar.save(self._cookies_tempfile, ignore_discard=True, ignore_expires=True)
return self.ydl.cookiejar.filename or self._cookies_tempfile
def _call_downloader(self, tmpfilename, info_dict):
""" Either overwrite this or implement _make_cmd """
cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
@ -110,13 +131,21 @@ class ExternalFD(FileDownloader):
self.to_stderr(stderr.decode('utf-8', 'replace'))
return p.returncode
@staticmethod
def _header_items(info_dict):
# Return the HTTP header entries of info_dict for the callers' `for key, val in ...` loops.
# NOTE(review): presumably yields the (key, value) items of info_dict['http_headers'] and
# nothing when that key is absent - confirm against traverse_obj()/T() in ..utils.
return traverse_obj(
info_dict, ('http_headers', T(dict.items), Ellipsis))
class CurlFD(ExternalFD):
AVAILABLE_OPT = '-V'
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '--location', '-o', tmpfilename]
for key, val in info_dict['http_headers'].items():
cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed']
cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
if cookie_header:
cmd += ['--cookie', cookie_header]
for key, val in self._header_items(info_dict):
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
cmd += self._valueless_option('--silent', 'noprogress')
@ -151,8 +180,11 @@ class AxelFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-o', tmpfilename]
for key, val in info_dict['http_headers'].items():
for key, val in self._header_items(info_dict):
cmd += ['-H', '%s: %s' % (key, val)]
cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
if cookie_header:
cmd += ['-H', 'Cookie: {0}'.format(cookie_header), '--max-redirect=0']
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
return cmd
@ -162,8 +194,10 @@ class WgetFD(ExternalFD):
AVAILABLE_OPT = '--version'
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto']
if self.ydl.cookiejar.get_cookie_header(info_dict['url']):
cmd += ['--load-cookies', self._write_cookies()]
for key, val in self._header_items(info_dict):
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries')
@ -182,24 +216,121 @@ class WgetFD(ExternalFD):
class Aria2cFD(ExternalFD):
AVAILABLE_OPT = '-v'
@staticmethod
def _aria2c_filename(fn):
# Return fn unchanged when it is an absolute path; otherwise return it
# joined onto '.', so that a relative name gets an explicit './' prefix.
return fn if os.path.isabs(fn) else os.path.join('.', fn)
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-c']
cmd += self._configuration_args([
'--min-split-size', '1M', '--max-connection-per-server', '4'])
dn = os.path.dirname(tmpfilename)
if dn:
cmd += ['--dir', dn]
cmd += ['--out', os.path.basename(tmpfilename)]
for key, val in info_dict['http_headers'].items():
cmd = [self.exe, '-c',
'--console-log-level=warn', '--summary-interval=0', '--download-result=hide',
'--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16']
if 'fragments' in info_dict:
cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true']
else:
cmd += ['--min-split-size', '1M']
if self.ydl.cookiejar.get_cookie_header(info_dict['url']):
cmd += ['--load-cookies={0}'.format(self._write_cookies())]
for key, val in self._header_items(info_dict):
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._configuration_args(['--max-connection-per-server', '4'])
cmd += ['--out', os.path.basename(tmpfilename)]
cmd += self._option('--max-overall-download-limit', 'ratelimit')
cmd += self._option('--interface', 'source_address')
cmd += self._option('--all-proxy', 'proxy')
cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=')
cmd += ['--', info_dict['url']]
cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=')
cmd += self._configuration_args()
# aria2c strips out spaces from the beginning/end of filenames and paths.
# We work around this issue by adding a "./" to the beginning of the
# filename and relative path, and adding a "/" at the end of the path.
# See: https://github.com/yt-dlp/yt-dlp/issues/276
# https://github.com/ytdl-org/youtube-dl/issues/20312
# https://github.com/aria2/aria2/issues/1373
dn = os.path.dirname(tmpfilename)
if dn:
cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep]
if 'fragments' not in info_dict:
cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))]
cmd += ['--auto-file-renaming=false']
if 'fragments' in info_dict:
cmd += ['--file-allocation=none', '--uri-selector=inorder']
url_list_file = '%s.frag.urls' % (tmpfilename, )
url_list = []
for frag_index, fragment in enumerate(info_dict['fragments']):
fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index)
url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename)))
stream, _ = self.sanitize_open(url_list_file, 'wb')
stream.write('\n'.join(url_list).encode())
stream.close()
cmd += ['-i', self._aria2c_filename(url_list_file)]
else:
cmd += ['--', info_dict['url']]
return cmd
class Aria2pFD(ExternalFD):
''' Aria2pFD class
This class supports using aria2p as the downloader.
(aria2p is a command-line tool and Python library for interacting with an aria2c daemon
process through JSON-RPC.)
It makes it easier to obtain download progress.
To use aria2p as the downloader, install both aria2c and aria2p (aria2p can be installed with pip),
then run aria2c in the background with the --enable-rpc option enabled.
'''
# The import runs in the class body, so the module becomes the class
# attribute `aria2p` (used below as self.aria2p); __avail records whether
# the import succeeded.
try:
import aria2p
__avail = True
except ImportError:
__avail = False
@classmethod
def available(cls):
# True only when the aria2p package could be imported
return cls.__avail
def _call_downloader(self, tmpfilename, info_dict):
# Queue info_dict['url'] on an aria2c daemon through aria2p, poll it
# until it is no longer active/waiting while reporting progress, and
# return whether the final status differs from 'complete'.
# the RPC endpoint is hard-coded: http://localhost:6800 with an empty secret
aria2 = self.aria2p.API(
self.aria2p.Client(
host='http://localhost',
port=6800,
secret=''
)
)
options = {
'min-split-size': '1M',
'max-connection-per-server': 4,
'auto-file-renaming': 'false',
}
# the target is passed as directory + file name; the current directory
# is used when tmpfilename has no directory part
options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.')
options['out'] = os.path.basename(tmpfilename)
# a cookies file is only written when there is a Cookie header for this URL
if self.ydl.cookiejar.get_cookie_header(info_dict['url']):
options['load-cookies'] = self._write_cookies()
options['header'] = []
for key, val in self._header_items(info_dict):
options['header'].append('{0}: {1}'.format(key, val))
download = aria2.add_uris([info_dict['url']], options)
status = {
'status': 'downloading',
'tmpfilename': tmpfilename,
}
started = time.time()
# poll the daemon every half second and forward progress to the hooks
while download.status in ['active', 'waiting']:
# re-fetch the download by gid to obtain fresh values
download = aria2.get_download(download.gid)
status.update({
'downloaded_bytes': download.completed_length,
'total_bytes': download.total_length,
'elapsed': time.time() - started,
'eta': download.eta.total_seconds(),
'speed': download.download_speed,
})
self._hook_progress(status)
time.sleep(.5)
# False (which compares equal to 0) when the download completed, True otherwise
# NOTE(review): presumably consumed like an exit code by the caller - confirm.
return download.status != 'complete'
class HttpieFD(ExternalFD):
@classmethod
def available(cls):
@ -207,15 +338,23 @@ class HttpieFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
for key, val in info_dict['http_headers'].items():
for key, val in self._header_items(info_dict):
cmd += ['%s:%s' % (key, val)]
# httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1]
# If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2]
# 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq
# 2: https://httpie.io/docs/cli/sessions
cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
if cookie_header:
cmd += ['Cookie:%s' % cookie_header]
return cmd
class FFmpegFD(ExternalFD):
@classmethod
def supports(cls, info_dict):
return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms')
return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms', 'http_dash_segments')
@classmethod
def available(cls):
@ -254,7 +393,14 @@ class FFmpegFD(ExternalFD):
# if end_time:
# args += ['-t', compat_str(end_time - start_time)]
if info_dict['http_headers'] and re.match(r'^https?://', url):
cookies = self.ydl.cookiejar.get_cookies_for_url(url)
if cookies:
args.extend(['-cookies', ''.join(
'{0}={1}; path={2}; domain={3};\r\n'.format(
cookie.name, cookie.value, cookie.path, cookie.domain)
for cookie in cookies)])
if info_dict.get('http_headers') and re.match(r'^https?://', url):
# Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
# [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
headers = handle_youtubedl_headers(info_dict['http_headers'])

View file

@ -71,7 +71,7 @@ class FragmentFD(FileDownloader):
@staticmethod
def __do_ytdl_file(ctx):
return not ctx['live'] and not ctx['tmpfilename'] == '-'
return ctx['live'] is not True and ctx['tmpfilename'] != '-'
def _read_ytdl_file(self, ctx):
assert 'ytdl_corrupt' not in ctx
@ -101,6 +101,13 @@ class FragmentFD(FileDownloader):
'url': frag_url,
'http_headers': headers or info_dict.get('http_headers'),
}
frag_resume_len = 0
if ctx['dl'].params.get('continuedl', True):
frag_resume_len = self.filesize_or_none(
self.temp_name(fragment_filename))
fragment_info_dict['frag_resume_len'] = frag_resume_len
ctx['frag_resume_len'] = frag_resume_len or 0
success = ctx['dl'].download(fragment_filename, fragment_info_dict)
if not success:
return False, None
@ -124,9 +131,7 @@ class FragmentFD(FileDownloader):
del ctx['fragment_filename_sanitized']
def _prepare_frag_download(self, ctx):
if 'live' not in ctx:
ctx['live'] = False
if not ctx['live']:
if not ctx.setdefault('live', False):
total_frags_str = '%d' % ctx['total_frags']
ad_frags = ctx.get('ad_frags', 0)
if ad_frags:
@ -136,10 +141,11 @@ class FragmentFD(FileDownloader):
self.to_screen(
'[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str))
self.report_destination(ctx['filename'])
continuedl = self.params.get('continuedl', True)
dl = HttpQuietDownloader(
self.ydl,
{
'continuedl': True,
'continuedl': continuedl,
'quiet': True,
'noprogress': True,
'ratelimit': self.params.get('ratelimit'),
@ -150,12 +156,11 @@ class FragmentFD(FileDownloader):
)
tmpfilename = self.temp_name(ctx['filename'])
open_mode = 'wb'
resume_len = 0
# Establish possible resume length
if os.path.isfile(encodeFilename(tmpfilename)):
resume_len = self.filesize_or_none(tmpfilename) or 0
if resume_len > 0:
open_mode = 'ab'
resume_len = os.path.getsize(encodeFilename(tmpfilename))
# Should be initialized before ytdl file check
ctx.update({
@ -164,7 +169,8 @@ class FragmentFD(FileDownloader):
})
if self.__do_ytdl_file(ctx):
if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename'])))
if continuedl and ytdl_file_exists:
self._read_ytdl_file(ctx)
is_corrupt = ctx.get('ytdl_corrupt') is True
is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0
@ -178,7 +184,12 @@ class FragmentFD(FileDownloader):
if 'ytdl_corrupt' in ctx:
del ctx['ytdl_corrupt']
self._write_ytdl_file(ctx)
else:
if not continuedl:
if ytdl_file_exists:
self._read_ytdl_file(ctx)
ctx['fragment_index'] = resume_len = 0
self._write_ytdl_file(ctx)
assert ctx['fragment_index'] == 0
@ -209,6 +220,7 @@ class FragmentFD(FileDownloader):
start = time.time()
ctx.update({
'started': start,
'fragment_started': start,
# Amount of fragment's bytes downloaded by the time of the previous
# frag progress hook invocation
'prev_frag_downloaded_bytes': 0,
@ -218,6 +230,9 @@ class FragmentFD(FileDownloader):
if s['status'] not in ('downloading', 'finished'):
return
if not total_frags and ctx.get('fragment_count'):
state['fragment_count'] = ctx['fragment_count']
time_now = time.time()
state['elapsed'] = time_now - start
frag_total_bytes = s.get('total_bytes') or 0
@ -232,16 +247,17 @@ class FragmentFD(FileDownloader):
ctx['fragment_index'] = state['fragment_index']
state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
ctx['speed'] = state['speed'] = self.calc_speed(
ctx['fragment_started'], time_now, frag_total_bytes)
ctx['fragment_started'] = time.time()
ctx['prev_frag_downloaded_bytes'] = 0
else:
frag_downloaded_bytes = s['downloaded_bytes']
state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes']
ctx['speed'] = state['speed'] = self.calc_speed(
ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len'])
if not ctx['live']:
state['eta'] = self.calc_eta(
start, time_now, estimated_size - resume_len,
state['downloaded_bytes'] - resume_len)
state['speed'] = s.get('speed') or ctx.get('speed')
ctx['speed'] = state['speed']
state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes'])
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
self._hook_progress(state)
@ -268,7 +284,7 @@ class FragmentFD(FileDownloader):
os.utime(ctx['filename'], (time.time(), filetime))
except Exception:
pass
downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))
downloaded_bytes = self.filesize_or_none(ctx['filename']) or 0
self._hook_progress({
'downloaded_bytes': downloaded_bytes,

View file

@ -58,9 +58,9 @@ class HttpFD(FileDownloader):
if self.params.get('continuedl', True):
# Establish possible resume length
if os.path.isfile(encodeFilename(ctx.tmpfilename)):
ctx.resume_len = os.path.getsize(
encodeFilename(ctx.tmpfilename))
ctx.resume_len = info_dict.get('frag_resume_len')
if ctx.resume_len is None:
ctx.resume_len = self.filesize_or_none(ctx.tmpfilename) or 0
ctx.is_resume = ctx.resume_len > 0
@ -115,9 +115,9 @@ class HttpFD(FileDownloader):
raise RetryDownload(err)
raise err
# When trying to resume, Content-Range HTTP header of response has to be checked
# to match the value of requested Range HTTP header. This is due to a webservers
# to match the value of requested Range HTTP header. This is due to webservers
# that don't support resuming and serve a whole file with no Content-Range
# set in response despite of requested Range (see
# set in response despite requested Range (see
# https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
if has_range:
content_range = ctx.data.headers.get('Content-Range')
@ -141,7 +141,8 @@ class HttpFD(FileDownloader):
# Content-Range is either not present or invalid. Assuming remote webserver is
# trying to send the whole file, resume is not possible, so wiping the local file
# and performing entire redownload
self.report_unable_to_resume()
if range_start > 0:
self.report_unable_to_resume()
ctx.resume_len = 0
ctx.open_mode = 'wb'
ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None))
@ -293,10 +294,7 @@ class HttpFD(FileDownloader):
# Progress message
speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
if ctx.data_len is None:
eta = None
else:
eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)
eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - byte_counter))
self._hook_progress({
'status': 'downloading',

View file

@ -8,6 +8,8 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
remove_start,
traverse_obj,
update_url_query,
urlencode_postdata,
)
@ -33,14 +35,17 @@ class AENetworksBaseIE(ThePlatformIE):
}
def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {'mbr': 'true'}
query = {
'mbr': 'true',
'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
}
if auth:
query['auth'] = auth
TP_SMIL_QUERY = [{
'assetTypes': 'high_video_ak',
'switch': 'hls_high_ak'
'switch': 'hls_high_ak',
}, {
'assetTypes': 'high_video_s3'
'assetTypes': 'high_video_s3',
}, {
'assetTypes': 'high_video_s3',
'switch': 'hls_high_fastly',
@ -75,7 +80,14 @@ class AENetworksBaseIE(ThePlatformIE):
requestor_id, brand = self._DOMAIN_MAP[domain]
result = self._download_json(
'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
filter_value, query={'filter[%s]' % filter_key: filter_value})
result = traverse_obj(
result, ('results',
lambda k, v: k == 0 and v[filter_key] == filter_value),
get_all=False)
if not result:
raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
video_id=remove_start(filter_value, '/'))
title = result['title']
video_id = result['id']
media_url = result['publicUrl']
@ -126,7 +138,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
'skip': 'This video is only available for users of participating TV providers.',
'skip': 'Geo-restricted - This content is not available in your location.'
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'info_dict': {
@ -143,6 +155,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
'skip': 'This video is only available for users of participating TV providers.',
}, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True

View file

@ -15,7 +15,7 @@ from ..utils import (
class AmericasTestKitchenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f',
@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5b400b9ee338f922cb06450c',
'title': 'Japanese Suppers',
'ext': 'mp4',
'display_id': 'weeknight-japanese-suppers',
'description': 'md5:64e606bfee910627efc4b5f050de92b3',
'thumbnail': r're:^https?://',
'timestamp': 1523318400,
'upload_date': '20180410',
'release_date': '20180410',
'timestamp': 1523304000,
'upload_date': '20180409',
'release_date': '20180409',
'series': "America's Test Kitchen",
'season': 'Season 18',
'season_number': 18,
'episode': 'Japanese Suppers',
'episode_number': 15,
'duration': 1376,
'thumbnail': r're:^https?://',
'average_rating': 0,
'view_count': int,
},
'params': {
'skip_download': True,
@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5fbe8c61bda2010001c6763b',
'title': 'Simple Chicken Dinner',
'ext': 'mp4',
'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4',
'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
'thumbnail': r're:^https?://',
'timestamp': 1610755200,
'upload_date': '20210116',
'release_date': '20210116',
'timestamp': 1610737200,
'upload_date': '20210115',
'release_date': '20210115',
'series': "America's Test Kitchen",
'season': 'Season 21',
'season_number': 21,
'episode': 'Simple Chicken Dinner',
'episode_number': 3,
'duration': 1397,
'thumbnail': r're:^https?://',
'view_count': int,
'average_rating': 0,
},
'params': {
'skip_download': True,
@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor):
}, {
'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
'only_matching': True,
@ -94,7 +110,7 @@ class AmericasTestKitchenIE(InfoExtractor):
class AmericasTestKitchenSeasonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|(?P<cooks>cooks(?:country|illustrated)))\.com(?:(?:/(?P<show2>cooks(?:country|illustrated)))?(?:/?$|(?<!ated)(?<!ated\.com)/episodes/browse/season_(?P<season>\d+)))'
_TESTS = [{
# ATK Season
'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
'playlist_count': 13,
}, {
# Cooks Country Season
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12',
'info_dict': {
'id': 'season_12',
'title': 'Season 12',
},
'playlist_count': 13,
}, {
# America's Test Kitchen Series
'url': 'https://www.americastestkitchen.com/',
'info_dict': {
'id': 'americastestkitchen',
'title': 'America\'s Test Kitchen',
},
'playlist_count': 558,
}, {
# Cooks Country Series
'url': 'https://www.americastestkitchen.com/cookscountry',
'info_dict': {
'id': 'cookscountry',
'title': 'Cook\'s Country',
},
'playlist_count': 199,
}, {
'url': 'https://www.americastestkitchen.com/cookscountry/',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/',
'only_matching': True,
}, {
'url': 'https://www.cooksillustrated.com',
'only_matching': True,
}]
def _real_extract(self, url):
show_name, season_number = re.match(self._VALID_URL, url).groups()
season_number = int(season_number)
match = re.match(self._VALID_URL, url).groupdict()
show = match.get('show2')
show_path = ('/' + show) if show else ''
show = show or match['show']
season_number = int_or_none(match.get('season'))
slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
slug, title = {
'americastestkitchen': ('atk', 'America\'s Test Kitchen'),
'cookscountry': ('cco', 'Cook\'s Country'),
'cooksillustrated': ('cio', 'Cook\'s Illustrated'),
}[show]
season = 'Season %d' % season_number
facet_filters = [
'search_document_klass:episode',
'search_show_slug:' + slug,
]
if season_number:
playlist_id = 'season_%d' % season_number
playlist_title = 'Season %d' % season_number
facet_filters.append('search_season_list:' + playlist_title)
else:
playlist_id = show
playlist_title = title
season_search = self._download_json(
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
season, headers={
'Origin': 'https://www.%s.com' % show_name,
playlist_id, headers={
'Origin': 'https://www.americastestkitchen.com',
'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
'X-Algolia-Application-Id': 'Y1FNZXUI30',
}, query={
'facetFilters': json.dumps([
'search_season_list:' + season,
'search_document_klass:episode',
'search_show_slug:' + slug,
]),
'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
'facetFilters': json.dumps(facet_filters),
'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug,
'attributesToHighlight': '',
'hitsPerPage': 1000,
})
def entries():
for episode in (season_search.get('hits') or []):
search_url = episode.get('search_url')
search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode'
if not search_url:
continue
yield {
'_type': 'url',
'url': 'https://www.%s.com%s' % (show_name, search_url),
'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]),
'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url),
'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]),
'title': episode.get('title'),
'description': episode.get('description'),
'timestamp': unified_timestamp(episode.get('search_document_date')),
@ -156,4 +217,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
}
return self.playlist_result(
entries(), 'season_%d' % season_number, season)
entries(), playlist_id, playlist_title)

View file

@ -0,0 +1,173 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from ..utils import (
strip_or_none,
traverse_obj,
)
from .common import InfoExtractor
class BlerpIE(InfoExtractor):
    # Extractor for a single Blerp soundbite page (blerp.com/soundbites/<id>).
    # The metadata and the MP3 URL are fetched with one GraphQL POST to
    # api.blerp.com, using the operation name and query defined below.
    IE_NAME = 'blerp'
    _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P<id>[0-9a-zA-Z]+)'
    _TESTS = [{
        'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a',
        'info_dict': {
            'id': '6320fe8745636cb4dd677a5a',
            'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016',
            'uploader': 'luminousaj',
            'uploader_id': '5fb81e51aa66ae000c395478',
            'ext': 'mp3',
            'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'],
        }
    }, {
        'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f',
        'info_dict': {
            'id': '5bc94ef4796001000498429f',
            'title': 'Yee',
            'uploader': '179617322678353920',
            'uploader_id': '5ba99cf71386730004552c42',
            'ext': 'mp3',
            'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee']
        }
    }]

    # Sent verbatim as the 'operationName' / 'query' members of the request body
    _GRAPHQL_OPERATIONNAME = "webBitePageGetBite"
    _GRAPHQL_QUERY = (
        '''query webBitePageGetBite($_id: MongoID!) {
web {
biteById(_id: $_id) {
...bitePageFrag
__typename
}
__typename
}
}
fragment bitePageFrag on Bite {
_id
title
userKeywords
keywords
color
visibility
isPremium
owned
price
extraReview
isAudioExists
image {
filename
original {
url
__typename
}
__typename
}
userReactions {
_id
reactions
createdAt
__typename
}
topReactions
totalSaveCount
saved
blerpLibraryType
license
licenseMetaData
playCount
totalShareCount
totalFavoriteCount
totalAddedToBoardCount
userCategory
userAudioQuality
audioCreationState
transcription
userTranscription
description
createdAt
updatedAt
author
listingType
ownerObject {
_id
username
profileImage {
filename
original {
url
__typename
}
__typename
}
__typename
}
transcription
favorited
visibility
isCurated
sourceUrl
audienceRating
strictAudienceRating
ownerId
reportObject {
reportedContentStatus
__typename
}
giphy {
mp4
gif
__typename
}
audio {
filename
original {
url
__typename
}
mp3 {
url
__typename
}
__typename
}
__typename
}
''')

    def _real_extract(self, url):
        """Return the info dict for the soundbite addressed by `url`.

        'id', 'url' and 'title' are read with plain subscripting, so a
        response lacking any of them raises instead of yielding a partial
        result; uploader fields and tags are optional.
        """
        audio_id = self._match_id(url)

        data = {
            'operationName': self._GRAPHQL_OPERATIONNAME,
            'query': self._GRAPHQL_QUERY,
            'variables': {
                '_id': audio_id
            }
        }

        headers = {
            'Content-Type': 'application/json'
        }

        json_result = self._download_json(
            'https://api.blerp.com/graphql', audio_id,
            data=json.dumps(data).encode('utf-8'), headers=headers)

        bite_json = json_result['data']['web']['biteById']

        # Build the tag list eagerly and apply the `or None` fallback to the
        # finished list. Previously the fallback sat inside list(...): with a
        # list-returning filter() (Python 2) an empty result became
        # list(None) -> TypeError, and with a lazy filter() (Python 3) the
        # fallback could never apply, so an empty list was returned.
        keywords = traverse_obj(bite_json, 'userKeywords', expected_type=list) or []
        tags = [tag for tag in map(strip_or_none, keywords) if tag] or None

        return {
            'id': bite_json['_id'],
            'url': bite_json['audio']['mp3']['url'],
            'title': bite_json['title'],
            'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none),
            'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none),
            'ext': 'mp3',
            'tags': tags,
        }

View file

@ -0,0 +1,74 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
traverse_obj,
try_get,
)
class CallinIE(InfoExtractor):
    # Extractor for callin.com episode pages. The episode metadata is read
    # from the Next.js `__NEXT_DATA__` JSON blob embedded in the page; the
    # media is an HLS manifest referenced by the episode's 'm3u8' member.
    _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?:[^/#?-]+-)*(?P<id>[^/#?-]+)'
    _TESTS = [{
        'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
        'md5': '14ede27ee2c957b7e4db93140fc0745c',
        'info_dict': {
            'id': 'PrumRdSQJW',
            'ext': 'mp4',
            'title': 'FCC Commissioner Brendan Carr on Elons Starlink',
            'description': 'Or, why the government doesnt like SpaceX',
            'channel': 'The Pull Request',
            'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
        }
    }, {
        'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
        'md5': '16f704ddbf82a27e3930533b12062f07',
        'info_dict': {
            'id': 'lzxMidUnjA',
            'ext': 'mp4',
            'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
            'description': 'Lets talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
            'channel': 'The DEBRIEF With Briahna Joy Gray',
            'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
        }
    }]

    def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw):
        """Locate and parse the `__NEXT_DATA__` JSON script in `webpage`.

        Extra keyword arguments are forwarded to _search_regex(). Returns
        the parsed JSON, or an empty dict when the search yields nothing
        (which can only happen for a non-fatal search or a falsy `default`).
        """
        next_data = self._search_regex(
            r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
            webpage, 'next.js data', fatal=fatal, **kw)
        if not next_data:
            # Nothing to parse: do not hand None to the JSON parser
            return {}
        return self._parse_json(
            next_data, video_id, transform_source=transform_source, fatal=fatal)

    def _real_extract(self, url):
        """Return the info dict for the episode addressed by `url`."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        next_data = self._search_nextjs_data(webpage, video_id)
        episode = traverse_obj(next_data, ('props', 'pageProps', 'episode'), expected_type=dict)
        if not episode:
            raise ExtractorError('Failed to find episode data')

        title = episode.get('title') or self._og_search_title(webpage)
        description = episode.get('description') or self._og_search_description(webpage)

        formats = []
        m3u8_url = episode.get('m3u8')
        # Only request the manifest when the episode actually names one;
        # otherwise None would be passed on as the manifest URL.
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4',
                entry_protocol='m3u8_native', fatal=False))
        self._sort_formats(formats)

        channel = try_get(episode, lambda x: x['show']['title'], compat_str)
        channel_url = try_get(episode, lambda x: x['show']['linkObj']['resourceUrl'], compat_str)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'channel': channel,
            'channel_url': channel_url,
        }

View file

@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)
@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor):
def _real_extract(self, url):
user_id = self._match_id(url)
webpage = self._download_webpage(
url, user_id, headers=self.geo_verification_headers())
manifest_root = self._html_search_regex(
r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
if not manifest_root:
ERRORS = (
("I'm offline, but let's stay connected", 'This user is currently offline'),
('in a private show', 'This user is in a private show'),
('is currently performing LIVE', 'This model is currently performing live'),
)
for pattern, message in ERRORS:
if pattern in webpage:
error = message
expected = True
break
else:
error = 'Unable to find manifest URL root'
expected = False
raise ExtractorError(error, expected=expected)
manifest = self._download_json(
'%s%s.json' % (manifest_root, user_id), user_id)
'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)
formats = []
thumbnails = []
for format_id, format_dict in manifest['formats'].items():
if not isinstance(format_dict, dict):
continue
@ -85,6 +63,13 @@ class CamModelsIE(InfoExtractor):
'preference': -1,
})
else:
if format_id == 'jpeg':
thumbnails.append({
'url': f['url'],
'width': f['width'],
'height': f['height'],
'format_id': f['format_id'],
})
continue
formats.append(f)
self._sort_formats(formats)
@ -92,6 +77,7 @@ class CamModelsIE(InfoExtractor):
return {
'id': user_id,
'title': self._live_title(user_id),
'thumbnails': thumbnails,
'is_live': True,
'formats': formats,
'age_limit': 18

View file

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
merge_dicts,
T,
traverse_obj,
unified_timestamp,
url_or_none,
)
class ClipchampIE(InfoExtractor):
    # Extractor for clipchamp.com/watch/<id> pages. The page's Next.js data
    # names a Cloudflare Stream asset; DASH and HLS manifests for it are
    # fetched from the customer sub-domain advertised by the player iframe.
    _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
        'info_dict': {
            'id': 'gRXZ4ZhdDaU',
            'ext': 'mp4',
            'title': 'Untitled video',
            'uploader': 'Alexander Schwartz',
            'timestamp': 1680805580,
            'upload_date': '20230406',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {
            'skip_download': 'm3u8',
            'format': 'bestvideo',
        },
    }]

    # Filled with (sub-domain, asset path, manifest extension)
    _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
    _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}

    def _real_extract(self, url):
        """Return the info dict for the clip addressed by `url`.

        Raises ExtractorError when the clip is not stored in Cloudflare
        Stream ('cf_stream'), the only storage location handled here.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']

        storage_location = data.get('storage_location')
        if storage_location != 'cf_stream':
            raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))

        path = data['download_url']
        iframe = self._download_webpage(
            'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe')
        # Fall back to a fixed customer prefix when the iframe does not
        # advertise one
        subdomain = self._search_regex(
            r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''', iframe,
            'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'

        formats = self._extract_mpd_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
            query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
        formats.extend(self._extract_m3u8_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
            query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))
        # The DASH and HLS lists are merely concatenated above; order the
        # combined list the way the other extractors do before returning it.
        self._sort_formats(formats)

        return merge_dicts({
            'id': video_id,
            'formats': formats,
            'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None,
        }, traverse_obj(data, {
            'title': ('project', 'project_name', T(compat_str)),
            'timestamp': ('created_at', T(unified_timestamp)),
            'thumbnail': ('thumbnail_url', T(url_or_none)),
        }), rev=True)

View file

@ -2,7 +2,9 @@
from __future__ import unicode_literals
import base64
import collections
import datetime
import functools
import hashlib
import json
import netrc
@ -23,6 +25,8 @@ from ..compat import (
compat_getpass,
compat_integer_types,
compat_http_client,
compat_map as map,
compat_open as open,
compat_os_name,
compat_str,
compat_urllib_error,
@ -31,6 +35,7 @@ from ..compat import (
compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
compat_zip as zip,
)
from ..downloader.f4m import (
get_base_url,
@ -54,6 +59,7 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
int_or_none,
join_nonempty,
js_to_json,
JSON_LD_RE,
mimetype2ext,
@ -70,6 +76,8 @@ from ..utils import (
str_or_none,
str_to_int,
strip_or_none,
T,
traverse_obj,
try_get,
unescapeHTML,
unified_strdate,
@ -79,6 +87,7 @@ from ..utils import (
urljoin,
url_basename,
url_or_none,
variadic,
xpath_element,
xpath_text,
xpath_with_ns,
@ -174,6 +183,8 @@ class InfoExtractor(object):
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
* "range" (optional, str of the form "start-end"
to use in HTTP Range header)
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@ -367,9 +378,22 @@ class InfoExtractor(object):
title, description etc.
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
A subclass of InfoExtractor must be defined to handle each specific site (or
several sites). Such a concrete subclass should be added to the list of
extractors. It should also:
* define its _VALID_URL attribute as a regexp, or a Sequence of alternative
regexps (but see below)
* re-define the _real_extract() method
* optionally re-define the _real_initialize() method.
An extractor subclass may also override suitable() if necessary, but the
function signature must be preserved and the function must import everything
it needs (except other extractors), so that lazy_extractors works correctly.
If the subclass's suitable() and _real_extract() functions avoid using
_VALID_URL, the subclass need not set that class attribute.
An abstract subclass of InfoExtractor may be used to simplify implementation
within an extractor module; it should not be added to the list of extractors.
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
@ -404,22 +428,33 @@ class InfoExtractor(object):
self._x_forwarded_for_ip = None
self.set_downloader(downloader)
@classmethod
def __match_valid_url(cls, url):
    """Return the match object of the first _VALID_URL pattern matching
    `url`, or None if no pattern matches.

    The compiled patterns are cached on the class as _VALID_URL_RE.
    """
    # Deliberately inspect the class's own namespace instead of using
    # has/getattr: the question is whether *this* class has cached its
    # patterns, and attribute lookup would also find a superclass's cache.
    if '_VALID_URL_RE' not in cls.__dict__:
        # _VALID_URL may be a single pattern or a list/tuple of patterns
        cls._VALID_URL_RE = tuple(
            re.compile(pattern) for pattern in variadic(cls._VALID_URL))
    # A plain loop measured 20% faster in 2.7 than
    # next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None)
    for regex in cls._VALID_URL_RE:
        mobj = regex.match(url)
        if mobj:
            return mobj
    return None
# The public alias can safely be overridden, as in some back-ports
_match_valid_url = __match_valid_url
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url) is not None
# This function must import everything it needs (except other extractors),
# so that lazy_extractors works correctly
return cls.__match_valid_url(url) is not None
@classmethod
def _match_id(cls, url):
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
m = cls._VALID_URL_RE.match(url)
m = cls.__match_valid_url(url)
assert m
return compat_str(m.group('id'))
@ -566,6 +601,14 @@ class InfoExtractor(object):
"""Sets the downloader for this IE."""
self._downloader = downloader
@property
def cache(self):
return self._downloader.cache
@property
def cookiejar(self):
return self._downloader.cookiejar
def _real_initialize(self):
"""Real initialization process. Redefine in subclasses."""
pass
@ -912,14 +955,47 @@ class InfoExtractor(object):
else:
self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
def __ie_msg(self, *msg):
return '[{0}] {1}'.format(self.IE_NAME, ''.join(msg))
# msg, video_id=None, *args, only_once=False, **kwargs
def report_warning(self, msg, *args, **kwargs):
if len(args) > 0:
video_id = args[0]
args = args[1:]
else:
video_id = kwargs.pop('video_id', None)
idstr = '' if video_id is None else '%s: ' % video_id
self._downloader.report_warning(
'[%s] %s%s' % (self.IE_NAME, idstr, msg))
self.__ie_msg(idstr, msg), *args, **kwargs)
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
self._downloader.to_screen(self.__ie_msg(msg))
def write_debug(self, msg, only_once=False, _cache=[]):
    """Emit `msg` as a '[debug] '-prefixed message, but only in verbose mode.

    The message goes to the 'logger' param's debug() if one is configured,
    otherwise to the downloader's stderr. With only_once set, a message
    already written to stderr is not written again; `_cache` is the
    intentionally shared default list remembering the hashes of messages
    written so far.
    """
    if not self.get_param('verbose', False):
        return
    message = '[debug] ' + self.__ie_msg(msg)
    logger = self.get_param('logger')
    if logger:
        # only_once is not applied on the logger path
        logger.debug(message)
        return
    digest = hash(message)
    if only_once and digest in _cache:
        return
    self._downloader.to_stderr(message)
    _cache.append(digest)
# name, default=None, *args, **kwargs
def get_param(self, name, *args, **kwargs):
    """Look up the option `name` in the downloader's params.

    The default may be given as the second positional argument or as the
    `default` keyword (None if neither is given); it is returned as-is
    when no downloader is attached. Any remaining arguments are passed
    through to params.get().
    """
    if args:
        default, args = args[0], args[1:]
    else:
        default = kwargs.pop('default', None)
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
def report_drm(self, video_id):
self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
def report_extraction(self, id_or_name):
"""Report information extraction."""
@ -947,6 +1023,15 @@ class InfoExtractor(object):
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
raise GeoRestrictedError(msg, countries=countries)
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Report that no formats are available for `video_id`.

    An expected failure is downgraded to a warning when either the
    'ignore_no_formats_error' or the 'wait_for_video' param is set.
    Otherwise `msg` is raised: as-is if it already is an ExtractorError,
    else wrapped in a new ExtractorError.
    """
    tolerated = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if tolerated:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
@ -1005,6 +1090,8 @@ class InfoExtractor(object):
if group is None:
# return the first matching group
return next(g for g in mobj.groups() if g is not None)
elif isinstance(group, (list, tuple)):
return tuple(mobj.group(g) for g in group)
else:
return mobj.group(group)
elif default is not NO_DEFAULT:
@ -1020,10 +1107,9 @@ class InfoExtractor(object):
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
return res
if isinstance(res, tuple):
return tuple(map(clean_html, res))
return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
username = None
@ -1087,7 +1173,7 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
% {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
@ -1348,6 +1434,44 @@ class InfoExtractor(object):
break
return dict((k, v) for k, v in info.items() if v is not None)
def _search_nextjs_data(self, webpage, video_id, **kw):
nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
kw.pop('transform_source', None)
next_data = self._search_regex(
r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
webpage, 'next.js data', group='nd', **kw)
if not next_data:
return {}
return self._parse_json(next_data, video_id, **nkw)
def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    The object literal returned by that function is converted to JSON with
    the function's call arguments substituted for its parameters, and the
    `traverse` path of the result is returned. An empty dict is returned
    when the function cannot be found (non-fatal search) or the path
    yields nothing.
    """
    # Effective signature:
    # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
    if args:
        context_name = args[0]
    else:
        context_name = kwargs.get('context_name', '__NUXT__')
    fatal = kwargs.get('fatal', True)
    traverse = kwargs.get('traverse', ('data', 0))

    escaped_ctx = re.escape(context_name)
    # (function(<arg_keys>){return <js>}(<arg_vals>)
    FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
                   r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
    templates = (
        r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
        r'{0}\s*\([\s\S]*?{1}',
    )
    not_found = NO_DEFAULT if fatal else (None, None, None)

    js, arg_keys, arg_vals = self._search_regex(
        (tmpl.format(escaped_ctx, FUNCTION_RE) for tmpl in templates),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
        default=not_found)
    if js is None:
        return {}

    # Map each parameter name to the JSON text of the matching call argument
    arg_values = self._parse_json(
        '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()
    js_vars = dict(zip(arg_keys.split(','), map(json.dumps, arg_values)))

    ret = self._parse_json(
        js, video_id,
        transform_source=functools.partial(js_to_json, vars=js_vars), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@ -1632,6 +1756,12 @@ class InfoExtractor(object):
'format_note': 'Quality selection URL',
}
def _report_ignoring_subs(self, name):
    """Warn (with only_once=True) that subtitle tracks of a `name` manifest are dropped."""
    text = ('Ignoring subtitle tracks found in the {0} manifest; '
            'if any subtitle tracks are missing,').format(name)
    self.report_warning(bug_reports_message(text), only_once=True)
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
@ -2072,23 +2202,46 @@ class InfoExtractor(object):
})
return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
def _extract_mpd_formats(self, *args, **kwargs):
fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers=None, query=None):
# TODO: or not? param not yet implemented
if self.get_param('ignore_no_formats_error'):
fatal = False
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal, data=data, headers=headers, query=query)
note='Downloading MPD manifest' if note is None else note,
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers or {}, query=query or {})
if res is False:
return []
return [], {}
mpd_doc, urlh = res
if mpd_doc is None:
return []
mpd_base_url = base_url(urlh.geturl())
return [], {}
return self._parse_mpd_formats(
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.geturl()
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@ -2096,8 +2249,10 @@ class InfoExtractor(object):
http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
"""
if mpd_doc.get('type') == 'dynamic':
return []
# TODO: param not yet implemented: default like previous yt-dl logic
if not self.get_param('dynamic_mpd', False):
if mpd_doc.get('type') == 'dynamic':
return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@ -2107,8 +2262,24 @@ class InfoExtractor(object):
def is_drm_protected(element):
return element.find(_add_ns('ContentProtection')) is not None
from ..utils import YoutubeDLHandler
fix_path = YoutubeDLHandler._fix_path
def resolve_base_url(element, parent_base_url=None):
# TODO: use native XML traversal when ready
b_url = traverse_obj(element, (
T(lambda e: e.find(_add_ns('BaseURL')).text)))
if parent_base_url and b_url:
if not parent_base_url[-1] in ('/', ':'):
parent_base_url += '/'
b_url = compat_urlparse.urljoin(parent_base_url, b_url)
if b_url:
b_url = fix_path(b_url)
return b_url or parent_base_url
def extract_multisegment_info(element, ms_parent_info):
ms_info = ms_parent_info.copy()
base_url = ms_info['base_url'] = resolve_base_url(element, ms_info.get('base_url'))
# As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
# common attributes and elements. We will only extract relevant
@ -2142,15 +2313,27 @@ class InfoExtractor(object):
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
if initialization is not None:
ms_info['initialization_url'] = initialization.attrib['sourceURL']
ms_info['initialization_url'] = initialization.get('sourceURL') or base_url
initialization_url_range = initialization.get('range')
if initialization_url_range:
ms_info['initialization_url_range'] = initialization_url_range
segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None:
extract_common(segment_list)
extract_Initialization(segment_list)
segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
if segment_urls_e:
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
segment_urls = traverse_obj(segment_urls_e, (
Ellipsis, T(lambda e: e.attrib), 'media'))
if segment_urls:
ms_info['segment_urls'] = segment_urls
segment_urls_range = traverse_obj(segment_urls_e, (
Ellipsis, T(lambda e: e.attrib), 'mediaRange',
T(lambda r: re.findall(r'^\d+-\d+$', r)), 0))
if segment_urls_range:
ms_info['segment_urls_range'] = segment_urls_range
if not segment_urls:
ms_info['segment_urls'] = [base_url for _ in segment_urls_range]
else:
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
@ -2166,17 +2349,20 @@ class InfoExtractor(object):
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = []
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
mpd_base_url = resolve_base_url(mpd_doc, mpd_base_url or mpd_url)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
'timescale': 1,
'base_url': mpd_base_url,
})
for adaptation_set in period.findall(_add_ns('AdaptationSet')):
if is_drm_protected(adaptation_set):
continue
adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
for representation in adaptation_set.findall(_add_ns('Representation')):
if is_drm_protected(representation):
continue
@ -2184,27 +2370,35 @@ class InfoExtractor(object):
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
elif content_type in ('video', 'audio'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
if base_url_e is not None:
base_url = base_url_e.text + base_url
if re.match(r'^https?://', base_url):
break
if mpd_base_url and not re.match(r'^https?://', base_url):
if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
content_type = representation_attrib.get('contentType') or mime_type.split('/')[0]
codec_str = representation_attrib.get('codecs', '')
# Some kind of binary subtitle found in some youtube livestreams
if mime_type == 'application/x-rawcc':
codecs = {'scodec': codec_str}
else:
codecs = parse_codecs(codec_str)
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
elif codecs.get('vcodec', 'none') != 'none':
content_type = 'video'
elif codecs.get('acodec', 'none') != 'none':
content_type = 'audio'
elif codecs.get('scodec', 'none') != 'none':
content_type = 'text'
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
continue
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
format_id = join_nonempty(representation_id or content_type, mpd_id)
if content_type in ('video', 'audio'):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'manifest_url': mpd_url,
@ -2219,104 +2413,130 @@ class InfoExtractor(object):
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(representation_attrib.get('codecs')))
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
f.update(codecs)
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
'manifest_url': mpd_url,
'filesize': filesize,
}
elif content_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
if is_drm_protected(adaptation_set) or is_drm_protected(representation):
f['has_drm'] = True
representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
return t
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
return t
# @initialization is a regular template like @media one
# so it should be handled just the same way (see
# https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info:
initialization_template = prepare_template(
'initialization',
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
# $Time$ shall not be included for @initialization thus
# only $Bandwidth$ remains
('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth,
}
# @initialization is a regular template like @media one
# so it should be handled just the same way (see
# https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info:
initialization_template = prepare_template(
'initialization',
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
# $Time$ shall not be included for @initialization thus
# only $Bandwidth$ remains
('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth,
}
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
def calc_segment_duration():
return float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(
float_or_none(period_duration, segment_duration, default=0)))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_d = s['d']
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_d = s['d']
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url()
segment_number += 1
segment_time += segment_d
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
segment_time += segment_d
elif 'segment_urls' in representation_ms_info:
fragments = []
if 's' in representation_ms_info:
# No media template
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video
fragments = []
segment_index = 0
timescale = representation_ms_info['timescale']
for s in representation_ms_info['s']:
@ -2328,48 +2548,78 @@ class InfoExtractor(object):
'duration': duration,
})
segment_index += 1
representation_ms_info['fragments'] = fragments
elif 'segment_urls' in representation_ms_info:
elif 'segment_urls_range' in representation_ms_info:
# Segment URLs with mediaRange
# Example: https://kinescope.io/200615537/master.mpd
# https://github.com/ytdl-org/youtube-dl/issues/30235
# or any mpd generated with Bento4 `mp4dash --no-split --use-segment-list`
segment_duration = calc_segment_duration()
for segment_url, segment_url_range in zip(
representation_ms_info['segment_urls'], representation_ms_info['segment_urls_range']):
fragments.append({
location_key(segment_url): segment_url,
'range': segment_url_range,
'duration': segment_duration,
})
else:
# Segment URLs with no SegmentTimeline
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
# https://github.com/ytdl-org/youtube-dl/pull/14844
fragments = []
segment_duration = float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
segment_duration = calc_segment_duration()
for segment_url in representation_ms_info['segment_urls']:
fragment = {
fragments.append({
location_key(segment_url): segment_url,
}
if segment_duration:
fragment['duration'] = segment_duration
fragments.append(fragment)
representation_ms_info['fragments'] = fragments
# If there is a fragments key available then we correctly recognized fragmented media.
# Otherwise we will assume unfragmented media with direct access. Technically, such
# assumption is not necessarily correct since we may simply have no support for
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
if 'fragments' in representation_ms_info:
f.update({
# NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
'duration': segment_duration,
})
representation_ms_info['fragments'] = fragments
# If there is a fragments key available then we correctly recognized fragmented media.
# Otherwise we will assume unfragmented media with direct access. Technically, such
# assumption is not necessarily correct since we may simply have no support for
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
if 'fragments' in representation_ms_info:
base_url = representation_ms_info['base_url']
f.update({
# NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
})
if 'initialization_url' in representation_ms_info and 'initialization_url_range' in representation_ms_info:
# Initialization URL with range (accompanied by Segment URLs with mediaRange above)
# https://github.com/ytdl-org/youtube-dl/issues/30235
initialization_url = representation_ms_info['initialization_url']
f['fragments'].append({
location_key(initialization_url): initialization_url,
'range': representation_ms_info['initialization_url_range'],
})
if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
formats.append(f)
elif 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
elif 'initialization_url_range' in representation_ms_info:
# no Initialization URL but range (accompanied by no Segment URLs but mediaRange above)
# https://github.com/ytdl-org/youtube-dl/issues/27575
f['fragments'].append({
location_key(base_url): base_url,
'range': representation_ms_info['initialization_url_range'],
})
f['fragments'].extend(representation_ms_info['fragments'])
if not period_duration:
period_duration = sum(traverse_obj(representation_ms_info, (
'fragments', Ellipsis, 'duration', T(float_or_none))))
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
# Assuming direct URL to unfragmented media.
f['url'] = representation_ms_info['base_url']
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
@ -2495,7 +2745,8 @@ class InfoExtractor(object):
return f
return {}
def _media_formats(src, cur_media_type, type_info={}):
def _media_formats(src, cur_media_type, type_info=None):
type_info = type_info or {}
full_url = absolute_url(src)
ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
@ -2513,6 +2764,7 @@ class InfoExtractor(object):
formats = [{
'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None,
'ext': ext,
}]
return is_plain_url, formats
@ -2521,7 +2773,7 @@ class InfoExtractor(object):
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
media_tags = [(media_tag, media_tag_name, media_type, '')
for media_tag, media_tag_name, media_type
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@ -2539,7 +2791,8 @@ class InfoExtractor(object):
media_attributes = extract_attributes(media_tag)
src = strip_or_none(media_attributes.get('src'))
if src:
_, formats = _media_formats(src, media_type)
f = parse_content_type(media_attributes.get('type'))
_, formats = _media_formats(src, media_type, f)
media_info['formats'].extend(formats)
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:

204
youtube_dl/extractor/dlf.py Normal file
View file

@ -0,0 +1,204 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
merge_dicts,
traverse_obj,
url_or_none,
variadic,
)
class DLFBaseIE(InfoExtractor):
    """Common base of the Deutschlandfunk extractors: turns a player button into an info dict."""
    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
    # Opening tag of an "Anhören" button that carries a data-audio-diraid attribute
    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'

    def _parse_button_attrs(self, button, audio_id=None):
        """Build an info dict from the attributes of a player <button> tag.

        button   -- HTML text of the button's opening tag
        audio_id -- ID to report; defaults to the tag's data-audio-diraid value
        """
        attrs = extract_attributes(button)
        audio_id = audio_id or attrs['data-audio-diraid']

        # candidate attributes for the media URL; only valid URLs are accepted
        url = traverse_obj(
            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
            'data-audio-src', expected_type=url_or_none)
        if not url:
            # No media URL: don't fabricate a format whose 'url' is None,
            # but leave the "no formats" case to _sort_formats()
            formats = []
        else:
            ext = determine_ext(url)
            formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
                       if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
        self._sort_formats(formats)

        def traverse_attrs(path):
            # `path` is an attribute name, or a tuple whose last item may be
            # a callable, which is then used as expected_type
            path = list(variadic(path))
            t = path.pop() if callable(path[-1]) else None
            return traverse_obj(attrs, path, expected_type=t, get_all=False)

        def txt_or_none(v, default=None):
            # stripped text of v, or `default` if v is None or blank
            return default if v is None else (compat_str(v).strip() or default)

        return merge_dicts(*reversed([{
            'id': audio_id,
            # 'extractor_key': DLFIE.ie_key(),
            # 'extractor': DLFIE.IE_NAME,
            'formats': formats,
        }, dict((k, traverse_attrs(v)) for k, v in {
            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
            'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
            'thumbnail': ('data-audioimage', url_or_none),
            'uploader': 'data-audio-producer',
            'series': 'data-audio-series',
            'channel': 'data-audio-origin-site-name',
            'webpage_url': ('data-audio-download-tracking-path', url_or_none),
        }.items())]))
class DLFIE(DLFBaseIE):
    """Extractor for a single Deutschlandfunk audio page (URLs ending in -dlf-<8 hex digits>-100.html)."""
    IE_NAME = 'dlf'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
    _TESTS = [
        # Audio as an HLS stream
        {
            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
            'info_dict': {
                'id': '03a3eb19',
                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
                'ext': 'm4a',
                'duration': 3298,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'On Stage',
                'channel': 'deutschlandfunk'
            },
            'params': {
                'skip_download': 'm3u8'
            },
            'skip': 'This webpage no longer exists'
        }, {
            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
            'info_dict': {
                'id': 'd9cc1856',
                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
                'ext': 'mp3',
                'duration': 291,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'Kommentare und Themen der Woche',
                'channel': 'deutschlandfunk'
            }
        },
    ]

    def _real_extract(self, url):
        """Download the page and extract the item described by its player button."""
        audio_id = self._match_id(url)
        page = self._download_webpage(url, audio_id)
        button = self._search_regex(self._BUTTON_REGEX, page, 'button')
        return self._parse_button_attrs(button, audio_id)
class DLFCorpusIE(DLFBaseIE):
    """Playlist extractor for Deutschlandfunk pages that are not single-item (-dlf-<hex>-) pages."""
    IE_NAME = 'dlf:corpus'
    IE_DESC = 'DLF Multi-feed Archives'
    # the negative lookahead excludes URLs of the single-item form handled by DLFIE
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
    _TESTS = [
        # Recorded news broadcast with referrals to related broadcasts
        {
            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
            'info_dict': {
                'id': 'fechten-russland-belarus-ukraine-protest-100',
                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
            },
            'playlist_mincount': 5,
            'playlist': [{
                'info_dict': {
                    'id': '1fc5d64a',
                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                    'ext': 'mp3',
                    'duration': 252,
                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '2ada145f',
                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
                    'ext': 'mp3',
                    'duration': 336,
                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Deutschlandfunk Nova',
                    'channel': 'deutschlandfunk-nova'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '47e1a096',
                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
                    'ext': 'mp3',
                    'duration': 602,
                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                # NOTE(review): same item as the third entry -- presumably the
                # page lists it twice; confirm
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }]
        },
        # Podcast feed with tag buttons, playlist count fluctuates
        {
            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
            'info_dict': {
                'id': 'kommentare-und-themen-der-woche-100',
                'title': 'Meinung - Kommentare und Themen der Woche',
                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
            },
            'playlist_mincount': 10,
        },
        # Podcast feed with no description
        {
            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
            'info_dict': {
                'id': 'podcast-tolle-idee-100',
                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
            },
            'playlist_mincount': 11,
        },
    ]

    def _real_extract(self, url):
        """Return a playlist with one entry per player button found in the page."""
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)

        # each matched button tag becomes an entry (its ID comes from the tag itself)
        entries = map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, page))
        title = self._html_search_meta(['og:title', 'twitter:title'], page, default=None)
        description = self._html_search_meta(
            ['description', 'og:description', 'twitter:description'], page, default=None)
        return self.playlist_result(entries, playlist_id, title, description)

View file

@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
T,
traverse_obj,
txt_or_none,
unified_timestamp,
url_or_none,
)
class EpidemicSoundIE(InfoExtractor):
    """Extractor for epidemicsound.com track pages, using the site's JSON track endpoint."""
    _VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/track/(?P<id>[0-9a-zA-Z]+)'
    _TESTS = [{
        'url': 'https://www.epidemicsound.com/track/yFfQVRpSPz/',
        'md5': 'd98ff2ddb49e8acab9716541cbc9dfac',
        'info_dict': {
            'id': '45014',
            'display_id': 'yFfQVRpSPz',
            'ext': 'mp3',
            'tags': ['foley', 'door', 'knock', 'glass', 'window', 'glass door knock'],
            'title': 'Door Knock Door 1',
            'duration': 1,
            'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg',
            'timestamp': 1415320353,
            'upload_date': '20141107',
            'age_limit': None,
            # check that the "best" format was found, since test file MD5 doesn't
            # distinguish the formats
            'format': 'full',
        },
    }, {
        'url': 'https://www.epidemicsound.com/track/mj8GTTwsZd/',
        'md5': 'c82b745890f9baf18dc2f8d568ee3830',
        'info_dict': {
            'id': '148700',
            'display_id': 'mj8GTTwsZd',
            'ext': 'mp3',
            'tags': ['liquid drum n bass', 'energetic'],
            'title': 'Noplace',
            'duration': 237,
            'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/11138/3000x3000.jpg',
            'timestamp': 1694426482,
            'release_timestamp': 1700535606,
            'upload_date': '20230911',
            'age_limit': None,
            'format': 'full',
        },
    }]

    def _real_extract(self, url):
        """Fetch the track's JSON description and build the info dict from it."""
        video_id = self._match_id(url)
        json_data = self._download_json('https://www.epidemicsound.com/json/track/' + video_id, video_id)

        def fmt_or_none(f):
            """Complete a candidate format dict, or return None to reject it."""
            # 'format' and 'format_id' stand in for each other when one is missing
            if not f.get('format'):
                f['format'] = f.get('format_id')
            elif not f.get('format_id'):
                f['format_id'] = f['format']
            # The candidate may have no 'url' key at all, so use .get() and
            # reject it explicitly rather than by way of a KeyError
            if not (f.get('url') and f['format']):
                return
            if f.get('format_note'):
                f['format_note'] = 'track ID ' + f['format_note']
            # rank the 'full' format above all others
            f['preference'] = -1 if f['format'] == 'full' else -2
            return f

        # one candidate format per (name, data) item of the 'stems' mapping
        formats = traverse_obj(json_data, (
            'stems', T(dict.items), Ellipsis, {
                'format': (0, T(txt_or_none)),
                'format_note': (1, 's3TrackId', T(txt_or_none)),
                'format_id': (1, 'stemType', T(txt_or_none)),
                'url': (1, 'lqMp3Url', T(url_or_none)),
            }, T(fmt_or_none)))

        self._sort_formats(formats)

        info = traverse_obj(json_data, {
            'id': ('id', T(txt_or_none)),
            'tags': ('metadataTags', Ellipsis, T(txt_or_none)),
            'title': ('title', T(txt_or_none)),
            'duration': ('length', T(float_or_none)),
            'timestamp': ('added', T(unified_timestamp)),
            'thumbnail': (('imageUrl', 'cover'), T(url_or_none)),
            'age_limit': ('isExplicit', T(lambda b: 18 if b else None)),
            'release_timestamp': ('releaseDate', T(unified_timestamp)),
        }, get_all=False)

        # second pass without get_all=False for the list-valued fields
        info.update(traverse_obj(json_data, {
            'categories': ('genres', Ellipsis, 'tag', T(txt_or_none)),
            'tags': ('metadataTags', Ellipsis, T(txt_or_none)),
        }))

        info.update({
            'display_id': video_id,
            'formats': formats,
        })

        return info

View file

@ -138,6 +138,7 @@ from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
)
from .blerp import BlerpIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
@ -158,6 +159,7 @@ from .businessinsider import BusinessInsiderIE
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .callin import CallinIE
from .camdemy import (
CamdemyIE,
CamdemyFolderIE
@ -224,6 +226,7 @@ from .ciscolive import (
CiscoLiveSearchIE,
)
from .cjsw import CJSWIE
from .clipchamp import ClipchampIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .cliprs import ClipRsIE
@ -293,6 +296,10 @@ from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .democracynow import DemocracynowIE
from .dlf import (
DLFCorpusIE,
DLFIE,
)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
@ -350,6 +357,7 @@ from .ellentube import (
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
from .engadget import EngadgetIE
from .epidemicsound import EpidemicSoundIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .escapist import EscapistIE
@ -374,6 +382,8 @@ from .fc2 import (
FC2EmbedIE,
)
from .fczenit import FczenitIE
from .filemoon import FileMoonIE
from .fifa import FifaIE
from .filmon import (
FilmOnIE,
FilmOnChannelIE,
@ -440,6 +450,13 @@ from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
from .globalplayer import (
GlobalPlayerLiveIE,
GlobalPlayerLivePlaylistIE,
GlobalPlayerAudioIE,
GlobalPlayerAudioEpisodeIE,
GlobalPlayerVideoIE
)
from .globo import (
GloboIE,
GloboArticleIE,
@ -553,6 +570,7 @@ from .khanacademy import (
from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
from .kommunetv import KommunetvIE
from .konserthusetplay import KonserthusetPlayIE
from .krasview import KrasViewIE
from .kth import KTHIE
@ -725,6 +743,7 @@ from .myvi import (
MyviIE,
MyviEmbedIE,
)
from .myvideoge import MyVideoGeIE
from .myvidster import MyVidsterIE
from .nationalgeographic import (
NationalGeographicVideoIE,
@ -969,6 +988,10 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
from .pr0gramm import (
Pr0grammIE,
Pr0grammStaticIE,
)
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
@ -1006,6 +1029,10 @@ from .raywenderlich import (
RayWenderlichIE,
RayWenderlichCourseIE,
)
from .rbgtum import (
RbgTumIE,
RbgTumCourseIE,
)
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import (
@ -1061,6 +1088,10 @@ from .rutube import (
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE
from .s4c import (
S4CIE,
S4CSeriesIE,
)
from .safari import (
SafariIE,
SafariApiIE,
@ -1196,6 +1227,7 @@ from .storyfire import (
from .streamable import StreamableIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streamsb import StreamsbIE
from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE
from .stv import STVPlayerIE
@ -1554,6 +1586,7 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
from .whyp import WhypIE
from .wistia import (
WistiaIE,
WistiaPlaylistIE,

View file

@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
traverse_obj,
unified_timestamp,
)
if not callable(getattr(InfoExtractor, '_match_valid_url', None)):
    # Compatibility shim: supply _match_valid_url() when the imported
    # InfoExtractor does not provide it, by shadowing the name with a thin
    # subclass that the extractor below then derives from.

    BaseInfoExtractor = InfoExtractor

    import re

    class InfoExtractor(BaseInfoExtractor):
        @classmethod
        def _match_valid_url(cls, url):
            return re.match(cls._VALID_URL, url)


class FifaIE(InfoExtractor):
    """Extractor for FIFA+ watch pages (www.fifa.com/fifaplus/<locale>/watch/...)."""
    # NB: the dots of the host name must be escaped, otherwise `.` matches
    # any character (e.g. 'wwwxfifa.com')
    _VALID_URL = r'https?://www\.fifa\.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y',
        'info_dict': {
            'id': '7on10qPcnyLajDDU3ntg6y',
            'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay',
            'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b',
            'ext': 'mp4',
            'categories': ['FIFA Tournaments'],
            'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero',
            'duration': 8165,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV',
        'info_dict': {
            'id': '1cg5r5Qt6Qt12ilkDgb1sV',
            'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights',
            'description': 'md5:d908c74ee66322b804ae2e521b02a855',
            'ext': 'mp4',
            'categories': ['FIFA Tournaments', 'Highlights'],
            'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB',
            'duration': 902,
            'release_timestamp': 1404777600,
            'release_date': '20140708',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp',
        'info_dict': {
            'id': '3C6gQH9C2DLwzNx7BMRQdp',
            'title': 'Josimar goal against Northern Ireland | Classic Goals',
            'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b',
            'ext': 'mp4',
            'categories': ['FIFA Tournaments', 'Goal'],
            'duration': 28,
            'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Resolve the API host from the page, then fetch details and formats.

        The video details request is non-fatal: when it fails, the optional
        metadata is simply omitted and the title falls back to the page's
        OpenGraph title.
        """
        video_id = self._match_valid_url(url).group('id')
        webpage = self._download_webpage(url, video_id)

        # The API base URL is taken from the page's rel="preconnect" link
        preconnect_link = self._search_regex(
            r'<link\b[^>]+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link')

        # With fatal=False a failed download gives a falsy result rather
        # than raising, so normalise it to an empty dict before use
        video_details = self._download_json(
            '%s/sections/videoDetails/%s' % (preconnect_link, video_id),
            video_id, 'Downloading Video Details', fatal=False) or {}

        preplay_parameters = self._download_json(
            '%s/videoPlayerData/%s' % (preconnect_link, video_id),
            video_id, 'Downloading Preplay Parameters')['preplayParameters']

        content_data = self._download_json(
            # 1. query string is expected to be sent as-is
            # 2. `sig` must be appended
            # 3. if absent, the call appears to work but the manifest is bad (404)
            'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters),
            video_id, 'Downloading Content Data')

        # formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id)
        formats = self._extract_m3u8_formats(
            content_data['playURL'], video_id, ext='mp4', entry_protocol='m3u8_native')
        subtitles = None
        self._sort_formats(formats)

        return {
            'id': video_id,
            # the title is mandatory: if the details are unavailable use the
            # page's OpenGraph title (raises a regular extractor error when
            # that is missing too)
            'title': video_details.get('title') or self._og_search_title(webpage),
            'description': video_details.get('description'),
            'duration': int_or_none(video_details.get('duration')),
            'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')),
            'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)),
            'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')),
            'formats': formats,
            'subtitles': subtitles,
        }

View file

@ -0,0 +1,43 @@
# coding: utf-8
from __future__ import unicode_literals
import re

from .common import InfoExtractor
from ..utils import (
    decode_packed_codes,
    ExtractorError,
    js_to_json,
)
class FileMoonIE(InfoExtractor):
    """Extractor for filemoon.sx pages whose player setup is in a packed script."""
    # the one-character path segment before the ID is matched by `.`
    # (e.g. /e/ in the test URL)
    _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P<id>\w+)'
    _TEST = {
        'url': 'https://filemoon.sx/e/dw40rxrzruqz',
        'md5': '5a713742f57ac4aef29b74733e8dda01',
        'info_dict': {
            'id': 'dw40rxrzruqz',
            'title': 'dw40rxrzruqz',
            'ext': 'mp4'
        }
    }

    def _real_extract(self, url):
        """Unpack the last `eval(...)` script and parse its jwplayer sources.

        Raises ExtractorError when the page contains no packed script.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        packed_scripts = re.findall(r'(?s)(eval.*?)</script>', webpage)
        if not packed_scripts:
            # report a proper extractor error instead of a bare IndexError
            raise ExtractorError('Unable to find packed player code')
        # the player setup is taken from the last packed script on the page
        unpacked = decode_packed_codes(packed_scripts[-1])
        jwplayer_sources = self._parse_json(
            self._search_regex(
                r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'),
            video_id, transform_source=js_to_json)

        formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)

        return {
            'id': video_id,
            'title': self._generic_title(url) or video_id,
            'formats': formats
        }

View file

@ -2320,6 +2320,25 @@ class GenericIE(InfoExtractor):
'height': 720,
'age_limit': 18,
},
}, {
# would like to use the yt-dl test video but searching for
# '"\'/\\ä↭𝕐' fails, so using an old vid from YouTube Korea
'note': 'Test default search',
'url': 'Shorts로 허락 필요없이 놀자! (BTS편)',
'info_dict': {
'id': 'usDGO4Zb-dc',
'ext': 'mp4',
'title': 'YouTube Shorts로 허락 필요없이 놀자! (BTS편)',
'description': 'md5:96e31607eba81ab441567b5e289f4716',
'upload_date': '20211107',
'uploader': 'YouTube Korea',
'location': '대한민국',
},
'params': {
'default_search': 'ytsearch',
'skip_download': True,
},
'expected_warnings': ['uploader id'],
},
]

View file

@ -0,0 +1,273 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
join_nonempty,
merge_dicts,
parse_duration,
str_or_none,
T,
traverse_obj,
unified_strdate,
unified_timestamp,
urlhandle_detect_ext,
)
class GlobalPlayerBaseIE(InfoExtractor):
    """Helpers shared by the globalplayer.com extractors."""

    def _get_page_props(self, url, video_id):
        """Download `url` and return `props.pageProps` from its Next.js data."""
        nextjs_data = self._search_nextjs_data(
            self._download_webpage(url, video_id), video_id)
        return nextjs_data['props']['pageProps']

    def _request_ext(self, url, video_id):
        """Request `url` and detect the media extension from the response."""
        # Server rejects HEAD requests
        urlh = self._request_webpage(
            url, video_id, note='Determining source extension')
        return urlhandle_detect_ext(urlh)

    @staticmethod
    def _clean_desc(x):
        """Run clean_html() on `x`, then turn non-breaking spaces into spaces."""
        text = clean_html(x)
        return text.replace('\xa0', ' ') if text else text

    def _extract_audio(self, episode, series):
        """Build the info dict for one audio episode plus its series metadata."""
        series_info = traverse_obj(series, {
            'series': 'title',
            'series_id': 'id',
            'thumbnail': 'imageUrl',
            'uploader': 'itunesAuthor',  # podcasts only
        })
        episode_info = traverse_obj(episode, {
            'id': 'id',
            'description': ('description', T(self._clean_desc)),
            'duration': ('duration', T(parse_duration)),
            'thumbnail': 'imageUrl',
            'url': 'streamUrl',
            'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
            'title': 'title',
        }, get_all=False)
        # NOTE(review): rev=True presumably lets the later (episode) values
        # win over the series ones - confirm against merge_dicts()
        return merge_dicts(
            {'vcodec': 'none'}, series_info, episode_info, rev=True)
class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
    """Extractor for live radio station pages (/live/<brand>/<region>)."""
    _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
        'info_dict': {
            'id': '2mx1E',
            'ext': 'aac',
            'display_id': 'smoothchill-uk',
            'title': 're:^Smooth Chill.+$',
            'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
            'description': 'Music To Chill To',
            # 'live_status': 'is_live',
            'is_live': True,
        },
    }, {
        # national station
        'url': 'https://www.globalplayer.com/live/heart/uk/',
        'info_dict': {
            'id': '2mwx4',
            'ext': 'aac',
            'description': 'turn up the feel good!',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            # 'live_status': 'is_live',
            'is_live': True,
            'title': 're:^Heart UK.+$',
            'display_id': 'heart-uk',
        },
    }, {
        # regional variation
        'url': 'https://www.globalplayer.com/live/heart/london/',
        'info_dict': {
            'id': 'AMqg',
            'ext': 'aac',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'title': 're:^Heart London.+$',
            # 'live_status': 'is_live',
            'is_live': True,
            'display_id': 'heart-london',
            'description': 'turn up the feel good!',
        },
    }]

    def _real_extract(self, url):
        """Read the `station` page props and return a live audio stream entry."""
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['station']
        stream_url = station['streamUrl']

        stream_info = {
            'id': station['id'],
            'display_id': (
                join_nonempty('brandSlug', 'slug', from_dict=station)
                or station.get('legacyStationPrefix')),
            'url': stream_url,
            # the extension is detected by actually requesting the stream
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }

        # first usable value of 'name', then 'brandName'
        station_name = traverse_obj(
            station, (('name', 'brandName'), T(str_or_none)),
            get_all=False)
        title_info = {'title': self._live_title(station_name)}

        station_meta = traverse_obj(station, {
            'description': 'tagline',
            'thumbnail': 'brandLogo',
        })

        return merge_dicts(stream_info, title_info, station_meta, rev=True)
class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
    """Extractor for "live playlist" pages (/playlists/<id>)."""
    _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
    _TESTS = [{
        # "live playlist"
        'url': 'https://www.globalplayer.com/playlists/8bLk/',
        'info_dict': {
            'id': '8bLk',
            'ext': 'aac',
            # 'live_status': 'is_live',
            'is_live': True,
            'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
            'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
            'title': 're:Classic FM Hall of Fame.+$'
        },
    }]

    def _real_extract(self, url):
        """Read the `playlistData` page props and return a live stream entry."""
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['playlistData']
        stream_url = station['streamUrl']

        stream_info = {
            'id': video_id,
            'url': stream_url,
            # the extension is detected by actually requesting the stream
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }
        playlist_meta = traverse_obj(station, {
            'title': 'title',
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'image',
        })

        return merge_dicts(stream_info, playlist_meta, rev=True)
class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
    """Playlist extractor for a whole podcast or a radio catch-up show."""
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '42KuaM',
            'title': 'Filthy Ritual',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'categories': ['Society & Culture', 'True Crime'],
            'uploader': 'Global',
            'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
        'playlist_mincount': 2,
        'info_dict': {
            'id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist of all episodes that have an ID and a stream URL."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        # the props key depends on which URL form matched
        series = props['podcastInfo' if podcast else 'catchupInfo']

        # only episodes with a truthy 'id' and 'streamUrl' are kept
        playable_episodes = traverse_obj(
            series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))
        entries = [self._extract_audio(ep, series) for ep in playable_episodes]
        categories = traverse_obj(series, ('categories', Ellipsis, 'name')) or None

        series_meta = traverse_obj(series, {
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'imageUrl',
            'title': 'title',
            'uploader': 'itunesAuthor',  # podcasts only
        })

        return merge_dicts({
            '_type': 'playlist',
            'id': video_id,
            'entries': entries,
            'categories': categories,
        }, series_meta, rev=True)
class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
    """Extractor for a single podcast or radio catch-up episode."""
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
        'info_dict': {
            'id': '7DrfNnE',
            'ext': 'mp3',
            'title': 'Filthy Ritual - Trailer',
            'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'duration': 225.0,
            'timestamp': 1681254900,
            'series': 'Filthy Ritual',
            'series_id': '42KuaM',
            'upload_date': '20230411',
            'uploader': 'Global',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
        'only_matching': True,
        # expired: refresh the details with a current show for a full test
        'info_dict': {
            'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
            'ext': 'm4a',
            'timestamp': 1682056800,
            'series': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
            'upload_date': '20230421',
            'series_id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'duration': 10800.0,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode found in the page props."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        # the props key depends on which URL form matched
        episode = props['podcastEpisode' if podcast else 'catchupEpisode']

        # the parent series is nested under 'podcast' or 'show'; fall back
        # to an empty dict when neither holds a dict
        series = traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}
        return self._extract_audio(episode, series)
class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
    """Extractor for video pages (/videos/<id>)."""
    _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
        'info_dict': {
            'id': '2JsSZ7Gm2uP',
            'ext': 'mp4',
            'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
            'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
            'upload_date': '20230420',
            'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
        },
    }]

    def _real_extract(self, url):
        """Map the `videoData` page props onto an info dict."""
        video_id = self._match_id(url)
        meta = self._get_page_props(url, video_id)['videoData']

        video_meta = traverse_obj(meta, {
            'url': 'url',
            'thumbnail': ('image', 'url'),
            'title': 'title',
            'upload_date': ('publish_date', T(unified_strdate)),
            'description': 'description',
        })
        return merge_dicts({'id': video_id}, video_meta, rev=True)

View file

@ -1,19 +1,29 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_filter as filter,
compat_HTTPError,
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
HEADRequest,
determine_ext,
error_to_compat_str,
extract_attributes,
ExtractorError,
int_or_none,
merge_dicts,
orderedSet,
parse_iso8601,
strip_or_none,
try_get,
traverse_obj,
url_or_none,
urljoin,
)
@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor):
return self._download_json(
'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
def _checked_call_api(self, slug):
    """Call the API for `slug`, reporting HTTP 404 as an expected error.

    Any other failure is re-raised unchanged.
    """
    try:
        return self._call_api(slug)
    except ExtractorError as e:
        cause = e.cause
        if not (isinstance(cause, compat_HTTPError) and cause.code == 404):
            raise
        # make sure the HTTP error carries args before it is re-wrapped
        cause.args = cause.args or [
            cause.geturl(), cause.getcode(), cause.reason]
        raise ExtractorError(
            'Content not found: expired?', cause=cause,
            expected=True)
def _extract_video_info(self, video, fatal=True):
    """Build an info dict from an IGN `video` data dict.

    Formats are collected from the HLS/HDS references, the listed assets
    and the mezzanine URL. When no format is found and `fatal` is false,
    None is returned; otherwise the formats are passed to _sort_formats().
    """
    video_id = video['videoId']

    formats = []
    refs = traverse_obj(video, 'refs', expected_type=dict) or {}

    hls_url = url_or_none(refs.get('m3uUrl'))
    if hls_url:
        formats.extend(self._extract_m3u8_formats(
            hls_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))

    hds_url = url_or_none(refs.get('f4mUrl'))
    if hds_url:
        formats.extend(self._extract_f4m_formats(
            hds_url, video_id, f4m_id='hds', fatal=False))

    for asset in (video.get('assets') or []):
        asset_url = url_or_none(asset.get('url'))
        if asset_url:
            formats.append({
                'url': asset_url,
                'tbr': int_or_none(asset.get('bitrate'), 1000),
                'fps': int_or_none(asset.get('frame_rate')),
                'height': int_or_none(asset.get('height')),
                'width': int_or_none(asset.get('width')),
            })

    mezzanine_url = traverse_obj(
        video, ('system', 'mezzanineUrl'), expected_type=url_or_none)
    if mezzanine_url:
        formats.append({
            'ext': determine_ext(mezzanine_url, 'mp4'),
            'format_id': 'mezzanine',
            'preference': 1,
            'url': mezzanine_url,
        })

    # non-fatal mode: give up quietly when nothing playable was found
    if not (formats or fatal):
        return
    self._sort_formats(formats)

    def stripped_or_none(x):
        return x.strip() or None

    metadata = traverse_obj(video, 'metadata', expected_type=dict) or {}

    return {
        'id': video_id,
        # first non-blank of the candidate title fields
        'title': traverse_obj(
            metadata, 'longTitle', 'title', 'name',
            expected_type=stripped_or_none),
        'description': strip_or_none(metadata.get('description')),
        'timestamp': parse_iso8601(metadata.get('publishDate')),
        'duration': int_or_none(metadata.get('duration')),
        'thumbnails': traverse_obj(
            video, ('thumbnails', Ellipsis, {'url': 'url'}),
            expected_type=url_or_none),
        'formats': formats,
        'tags': traverse_obj(
            video, ('tags', Ellipsis, 'displayName'),
            expected_type=stripped_or_none),
    }
# yt-dlp shim
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for each distinct embed URL found in `webpage`."""
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        # no extractor is named when the class has _VALID_URL set to False
        ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, ie)
class IGNIE(IGNBaseIE):
"""
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported
"""
_VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
_VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
_PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
_VALID_URL = (
r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
% '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
IE_NAME = 'ign.com'
_PAGE_TYPE = 'video'
@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE):
'timestamp': 1370440800,
'upload_date': '20130605',
'tags': 'count:9',
}
},
'params': {
'nocheckcertificate': True,
},
}, {
'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE):
'timestamp': 1420571160,
'upload_date': '20150106',
'tags': 'count:4',
}
},
'skip': '404 Not Found',
}, {
'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
'only_matching': True,
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Return the absolute video URLs linked from the first content feed grid.

    The result is a filtered iterable; it is empty when the page has no
    `content-feed-grid` section.
    """
    grids = re.findall(
        r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
        webpage)
    # only the first grid is searched
    first_grid = grids[0] if grids else ''
    link_re = (
        r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
        % cls._VIDEO_PATH_RE)
    video_urls = (
        urljoin(url, mobj.group('path'))
        for mobj in re.finditer(link_re, first_grid))
    return filter(None, video_urls)
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
display_id = m.group('id')
if display_id:
return self._extract_video(url, display_id)
display_id = m.group('filt') or 'all'
return self._extract_playlist(url, display_id)
def _extract_playlist(self, url, display_id):
webpage = self._download_webpage(url, display_id)
return self.playlist_result(
(self.url_result(u, ie=self.ie_key())
for u in self._extract_embed_urls(url, webpage)),
playlist_id=display_id)
def _extract_video(self, url, display_id):
display_id = self._match_id(url)
video = self._call_api(display_id)
video_id = video['videoId']
metadata = video['metadata']
title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
video = self._checked_call_api(display_id)
formats = []
refs = video.get('refs') or {}
info = self._extract_video_info(video)
m3u8_url = refs.get('m3uUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = refs.get('f4mUrl')
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
asset_url = asset.get('url')
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'preference': 1,
'url': mezzanine_url,
})
self._sort_formats(formats)
thumbnails = []
for thumbnail in (video.get('thumbnails') or []):
thumbnail_url = thumbnail.get('url')
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
})
tags = []
for tag in (video.get('tags') or []):
display_name = tag.get('displayName')
if not display_name:
continue
tags.append(display_name)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
return merge_dicts({
'display_id': display_id,
'thumbnails': thumbnails,
'formats': formats,
'tags': tags,
}
}, info)
class IGNVideoIE(InfoExtractor):
class IGNVideoIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{
'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor):
'description': 'Taking out assassination targets in Hitman has never been more stylish.',
'timestamp': 1444665600,
'upload_date': '20151012',
}
},
'expected_warnings': ['HTTP Error 400: Bad Request'],
}, {
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True,
@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
req = HEADRequest(url.rsplit('/', 1)[0] + '/embed')
url = self._request_webpage(req, video_id).geturl()
parsed_url = compat_urlparse.urlparse(url)
embed_url = compat_urlparse.urlunparse(
parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
webpage, urlh = self._download_webpage_handle(embed_url, video_id)
new_url = urlh.geturl()
ign_url = compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('url', [None])[0]
compat_urlparse.urlparse(new_url).query).get('url', [None])[-1]
if ign_url:
return self.url_result(ign_url, IGNIE.ie_key())
return self.url_result(url)
video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False)
if not video:
if new_url == url:
raise ExtractorError('Redirect loop: ' + url)
return self.url_result(new_url)
video = extract_attributes(video)
video_data = video.get('data-settings') or '{}'
video_data = self._parse_json(video_data, video_id)['video']
info = self._extract_video_info(video_data)
return merge_dicts({
'display_id': video_id,
}, info)
class IGNArticleIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)'
_PAGE_TYPE = 'article'
_TESTS = [{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': {
'id': '524497489e4e8ff5848ece34',
'id': '72113',
'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
},
'playlist': [
@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': {
'id': '5ebbd138523268b93c9141af17bec937',
'ext': 'mp4',
'title': 'GTA 5 Video Review',
'title': 'Grand Theft Auto V Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
'timestamp': 1379339880,
'upload_date': '20130916',
@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': {
'id': '638672ee848ae4ff108df2a296418ee2',
'ext': 'mp4',
'title': '26 Twisted Moments from GTA 5 in Slow Motion',
'title': 'GTA 5 In Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
'timestamp': 1386878820,
'upload_date': '20131212',
@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE):
},
],
'params': {
'playlist_items': '2-3',
'skip_download': True,
},
'expected_warnings': ['Backend fetch failed'],
}, {
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': {
'id': '53ee806780a81ec46e0790f8',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
},
'playlist_count': 2,
'playlist_count': 1,
'expected_warnings': ['Backend fetch failed'],
}, {
# videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE):
'only_matching': True,
}]
def _checked_call_api(self, slug):
    """Call the API for `slug` with special handling of HTTP 404 and 503.

    404 is raised as an expected "not found" error; 503 only produces a
    warning and returns None; anything else is re-raised unchanged.
    """
    try:
        return self._call_api(slug)
    except ExtractorError as e:
        cause = e.cause
        if not isinstance(cause, compat_HTTPError):
            raise
        # make sure the HTTP error carries args before it is reported
        cause.args = cause.args or [
            cause.geturl(), cause.getcode(), cause.reason]
        if cause.code == 404:
            raise ExtractorError(
                'Content not found: expired?', cause=cause,
                expected=True)
        if cause.code == 503:
            self.report_warning(error_to_compat_str(cause))
            return
        raise
def _search_nextjs_data(self, webpage, video_id, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
webpage, 'next.js data', **kw),
video_id, **kw)
def _real_extract(self, url):
display_id = self._match_id(url)
article = self._call_api(display_id)
article = self._checked_call_api(display_id)
def entries():
media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url'])
if media_url:
yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
yield self.url_result(video_url)
if article:
# obsolete ?
def entries():
media_url = traverse_obj(
article, ('mediaRelations', 0, 'media', 'metadata', 'url'),
expected_type=url_or_none)
if media_url:
yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
if url_or_none(video_url):
yield self.url_result(video_url)
return self.playlist_result(
entries(), article.get('articleId'),
traverse_obj(
article, ('metadata', 'headline'),
expected_type=lambda x: x.strip() or None))
webpage = self._download_webpage(url, display_id)
playlist_id = self._html_search_meta('dable:item_id', webpage, default=None)
if playlist_id:
def entries():
for m in re.finditer(
r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''',
webpage):
flashvars = self._search_regex(
r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''',
m.group('params'), 'flashvars', default='')
flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '')
v_url = url_or_none((flashvars.get('url') or [None])[-1])
if v_url:
yield self.url_result(v_url)
else:
playlist_id = self._search_regex(
r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''',
webpage, 'id', group='id', default=None)
nextjs_data = self._search_nextjs_data(webpage, display_id)
def entries():
for player in traverse_obj(
nextjs_data,
('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')):
# skip promo links (which may not always be served, eg GH CI servers)
if traverse_obj(nextjs_data,
('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')),
expected_type=dict):
continue
video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {}
info = self._extract_video_info(video, fatal=False)
if info:
yield merge_dicts({
'display_id': display_id,
}, info)
return self.playlist_result(
entries(), article.get('articleId'),
strip_or_none(try_get(article, lambda x: x['metadata']['headline'])))
entries(), playlist_id or display_id,
re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)

View file

@ -1,101 +1,267 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
ExtractorError,
parse_iso8601,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class ImgurIE(InfoExtractor):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)'
class ImgurBaseIE(InfoExtractor):
    """Helpers shared by the Imgur extractors: API access and result building."""

    # hard-coded value, as also used by ArchiveTeam
    _CLIENT_ID = '546c25a59c58ad7'

    @classmethod
    def _imgur_result(cls, item_id):
        """Return a url_result that hands `item_id` to ImgurIE via an `imgur:` URL."""
        return cls.url_result('imgur:%s' % item_id, ImgurIE.ie_key(), item_id)

    def _call_api(self, endpoint, video_id, **kwargs):
        """Download JSON for `video_id` from the Imgur post API.

        `endpoint` is the API path component placed before the ID; any extra
        keyword arguments are passed through unchanged to `_download_json()`.
        """
        return self._download_json(
            'https://api.imgur.com/post/v1/%s/%s?client_id=%s&include=media,account' % (endpoint, video_id, self._CLIENT_ID),
            video_id, **kwargs)

    @staticmethod
    def get_description(s):
        """Return `s` as cleaned text, or None if it is unusable.

        The site's generic boilerplate description is treated as "no
        description". A missing value (None) is tolerated, so callers may pass
        the result of a non-fatal metadata search directly.
        """
        # normalise first: the substring test below would raise TypeError
        # on None (or any other non-container value)
        # NOTE(review): assumes txt_or_none() maps None/blank to None - confirm
        text = txt_or_none(s)
        if text and 'Discover the magic of the internet at Imgur' in text:
            return None
        return text
class ImgurIE(ImgurBaseIE):
_VALID_URL = r'''(?x)
(?:
https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)|
imgur:
)(?P<id>[a-zA-Z0-9]+)
'''
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
'url': 'https://imgur.com/A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
'timestamp': 1416446068,
'upload_date': '20141120',
},
}, {
'url': 'https://imgur.com/A61SaA1',
'url': 'https://i.imgur.com/A61SaA1.gifv',
'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}, {
# no title
# previously, no title
'url': 'https://i.imgur.com/jxBXAMC.gifv',
'only_matching': True,
'info_dict': {
'id': 'jxBXAMC',
'ext': 'mp4',
'title': 'Fahaka puffer feeding',
'timestamp': 1533835503,
'upload_date': '20180809',
},
}]
def _extract_twitter_formats(self, html, tw_id='twitter', **kwargs):
fatal = kwargs.pop('fatal', False)
tw_stream = self._html_search_meta('twitter:player:stream', html, fatal=fatal, **kwargs)
if not tw_stream:
return []
ext = mimetype2ext(self._html_search_meta(
'twitter:player:stream:content_type', html, default=None))
width, height = (int_or_none(self._html_search_meta('twitter:player:' + v, html, default=None))
for v in ('width', 'height'))
return [{
'format_id': tw_id,
'url': tw_stream,
'ext': ext or determine_ext(tw_stream),
'width': width,
'height': height,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._call_api('media', video_id, fatal=False, expected_status=404)
webpage = self._download_webpage(
'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id)
'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id, fatal=not data) or ''
width = int_or_none(self._og_search_property(
'video:width', webpage, default=None))
height = int_or_none(self._og_search_property(
'video:height', webpage, default=None))
if not traverse_obj(data, ('media', 0, (
('type', T(lambda t: t == 'video' or None)),
('metadata', 'is_animated'))), get_all=False):
raise ExtractorError(
'%s is not a video or animated image' % video_id,
expected=True)
media_fmt = traverse_obj(data, ('media', 0, {
'url': ('url', T(url_or_none)),
'ext': 'ext',
'width': ('width', T(int_or_none)),
'height': ('height', T(int_or_none)),
'filesize': ('size', T(int_or_none)),
'acodec': ('metadata', 'has_sound', T(lambda b: None if b else 'none')),
}))
media_url = traverse_obj(media_fmt, 'url')
if media_url:
if not media_fmt.get('ext'):
media_fmt['ext'] = mimetype2ext(traverse_obj(
data, ('media', 0, 'mime_type'))) or determine_ext(media_url)
if traverse_obj(data, ('media', 0, 'type')) == 'image':
media_fmt['acodec'] = 'none'
media_fmt.setdefault('preference', -10)
tw_formats = self._extract_twitter_formats(webpage)
if traverse_obj(tw_formats, (0, 'url')) == media_url:
tw_formats = []
else:
# maybe this isn't an animated image/video?
self._check_formats(tw_formats, video_id)
video_elements = self._search_regex(
r'(?s)<div class="video-elements">(.*?)</div>',
webpage, 'video elements', default=None)
if not video_elements:
if not (video_elements or tw_formats or media_url):
raise ExtractorError(
'No sources found for video %s. Maybe an image?' % video_id,
'No sources found for video %s. Maybe a plain image?' % video_id,
expected=True)
formats = []
for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
formats.append({
'format_id': m.group('type').partition('/')[2],
'url': self._proto_relative_url(m.group('src')),
'ext': mimetype2ext(m.group('type')),
'width': width,
'height': height,
def mung_format(fmt, *extra):
fmt.update({
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
for d in extra:
fmt.update(d)
return fmt
gif_json = self._search_regex(
r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
webpage, 'GIF code', fatal=False)
if gif_json:
gifd = self._parse_json(
gif_json, video_id, transform_source=js_to_json)
formats.append({
'format_id': 'gif',
'preference': -10,
'width': width,
'height': height,
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
'url': self._proto_relative_url(gifd['gifUrl']),
'filesize': gifd.get('size'),
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
if video_elements:
def og_get_size(media_type):
return dict((p, int_or_none(self._og_search_property(
':'.join((media_type, p)), webpage, default=None)))
for p in ('width', 'height'))
size = og_get_size('video')
if all(v is None for v in size.values()):
size = og_get_size('image')
formats = traverse_obj(
re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements),
(Ellipsis, {
'format_id': ('type', T(lambda s: s.partition('/')[2])),
'url': ('src', T(self._proto_relative_url)),
'ext': ('type', T(mimetype2ext)),
}, T(lambda f: mung_format(f, size))))
gif_json = self._search_regex(
r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
webpage, 'GIF code', fatal=False)
MUST_BRANCH = (None, T(lambda _: None))
formats.extend(traverse_obj(gif_json, (
T(lambda j: self._parse_json(
j, video_id, transform_source=js_to_json, fatal=False)), {
'url': ('gifUrl', T(self._proto_relative_url)),
'filesize': ('size', T(int_or_none)),
}, T(lambda f: mung_format(f, size, {
'format_id': 'gif',
'preference': -10, # gifs are worse than videos
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
})), MUST_BRANCH)))
else:
formats = []
# maybe add formats from JSON or page Twitter metadata
if not any((u == media_url) for u in traverse_obj(formats, (Ellipsis, 'url'))):
formats.append(mung_format(media_fmt))
tw_url = traverse_obj(tw_formats, (0, 'url'))
if not any((u == tw_url) for u in traverse_obj(formats, (Ellipsis, 'url'))):
formats.extend(mung_format(f) for f in tw_formats)
self._sort_formats(formats)
return {
return merge_dicts(traverse_obj(data, {
'uploader_id': ('account_id', T(txt_or_none),
T(lambda a: a if int_or_none(a) != 0 else None)),
'uploader': ('account', 'username', T(txt_or_none)),
'uploader_url': ('account', 'avatar_url', T(url_or_none)),
'like_count': ('upvote_count', T(int_or_none)),
'dislike_count': ('downvote_count', T(int_or_none)),
'comment_count': ('comment_count', T(int_or_none)),
'age_limit': ('is_mature', T(lambda x: 18 if x else None)),
'timestamp': (('updated_at', 'created_at'), T(parse_iso8601)),
'release_timestamp': ('created_at', T(parse_iso8601)),
}, get_all=False), traverse_obj(data, ('media', 0, 'metadata', {
'title': ('title', T(txt_or_none)),
'description': ('description', T(self.get_description)),
'duration': ('duration', T(float_or_none)),
'timestamp': (('updated_at', 'created_at'), T(parse_iso8601)),
'release_timestamp': ('created_at', T(parse_iso8601)),
})), {
'id': video_id,
'formats': formats,
'title': self._og_search_title(webpage, default=video_id),
}
'title': self._og_search_title(webpage, default='Imgur video ' + video_id),
'description': self.get_description(self._og_search_description(webpage)),
'thumbnail': url_or_none(self._html_search_meta('thumbnailUrl', webpage, default=None)),
})
class ImgurGalleryIE(InfoExtractor):
class ImgurGalleryBaseIE(ImgurBaseIE):
    """Common extraction for Imgur gallery and album pages."""

    # when True, an album holding exactly one item is returned as a single
    # video carrying the album's metadata, rather than as a playlist
    _GALLERY = True

    def _real_extract(self, url):
        """Return a playlist of the album's video/animated items, or a
        url_transparent result pointing at a single Imgur item."""
        gallery_id = self._match_id(url)

        # non-fatal: `data` may be empty if the API call fails or returns 404
        data = self._call_api('albums', gallery_id, fatal=False, expected_status=404)

        # a dict template may yield None when nothing matched (or there is
        # no data at all); fall back to {} so the update()/item assignment
        # below cannot fail
        info = traverse_obj(data, {
            'title': ('title', T(txt_or_none)),
            'description': ('description', T(self.get_description)),
        }) or {}

        if traverse_obj(data, 'is_album'):

            def yield_media_ids():
                # only items that are videos or animated images
                for m_id in traverse_obj(data, (
                        'media', lambda _, v: v.get('type') == 'video' or v['metadata']['is_animated'],
                        'id', T(txt_or_none))):
                    yield m_id

            # if a gallery with exactly one video, apply album metadata to video
            media_id = (
                self._GALLERY
                and traverse_obj(data, ('image_count', T(lambda c: c == 1)))
                and next(yield_media_ids(), None))

            if not media_id:
                result = self.playlist_result(
                    map(self._imgur_result, yield_media_ids()), gallery_id)
                result.update(info)
                return result
            gallery_id = media_id

        # single item: delegate to ImgurIE, overlaying any album metadata
        result = self._imgur_result(gallery_id)
        info['_type'] = 'url_transparent'
        result.update(info)
        return result
class ImgurGalleryIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:gallery'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)'
@ -106,49 +272,93 @@ class ImgurGalleryIE(InfoExtractor):
'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
'skip': 'Zoinks! You\'ve taken a wrong turn.',
}, {
# TODO: static images - replace with animated/video gallery
'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
}, {
'url': 'https://imgur.com/gallery/YcAQlkx',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'YcAQlkx',
'ext': 'mp4',
'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
}
'timestamp': 1358554297,
'upload_date': '20130119',
'uploader_id': '1648642',
'uploader': 'wittyusernamehere',
},
}, {
# TODO: static image - replace with animated/video gallery
'url': 'http://imgur.com/topic/Funny/N8rOudd',
'only_matching': True,
}, {
'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True,
'add_ies': ['Imgur'],
'info_dict': {
'id': 'VQcQPhM',
'ext': 'mp4',
'title': 'The boss is here',
'timestamp': 1476494751,
'upload_date': '20161015',
'uploader_id': '19138530',
'uploader': 'thematrixcam',
},
},
# from PR #16674
{
'url': 'https://imgur.com/t/unmuted/6lAn9VQ',
'info_dict': {
'id': '6lAn9VQ',
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/kx2uD3C',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'ZVMv45i',
'ext': 'mp4',
'title': 'Intruder',
'timestamp': 1528129683,
'upload_date': '20180604',
},
}, {
'url': 'https://imgur.com/t/unmuted/wXSK0YH',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'JCAP4io',
'ext': 'mp4',
'title': 're:I got the blues$',
'description': 'Lukas vocal stylings.\n\nFP edit: dont encourage me. Ill never stop posting Luka and friends.',
'timestamp': 1527809525,
'upload_date': '20180531',
},
}]
def _real_extract(self, url):
gallery_id = self._match_id(url)
data = self._download_json(
'https://imgur.com/gallery/%s.json' % gallery_id,
gallery_id)['data']['image']
if data.get('is_album'):
entries = [
self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash'])
for image in data['album_images']['images'] if image.get('hash')]
return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description'))
return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id)
class ImgurAlbumIE(ImgurGalleryIE):
class ImgurAlbumIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:album'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
_GALLERY = False
_TESTS = [{
# TODO: only static images - replace with animated/video gallery
'url': 'http://imgur.com/a/j6Orj',
'only_matching': True,
},
# from PR #21693
{
'url': 'https://imgur.com/a/iX265HX',
'info_dict': {
'id': 'j6Orj',
'title': 'A Literary Analysis of "Star Wars: The Force Awakens"',
'id': 'iX265HX',
'title': 'enen-no-shouboutai'
},
'playlist_count': 12,
'playlist_count': 2,
}, {
'url': 'https://imgur.com/a/8pih2Ed',
'info_dict': {
'id': '8pih2Ed'
},
'playlist_mincount': 1,
}]

View file

@ -3,123 +3,266 @@ from __future__ import unicode_literals
import json
import re
import sys
from .common import InfoExtractor
from .brightcove import BrightcoveNewIE
from ..compat import (
compat_HTTPError,
compat_integer_types,
compat_kwargs,
compat_urlparse,
)
from ..utils import (
clean_html,
determine_ext,
error_to_compat_str,
extract_attributes,
get_element_by_class,
JSON_LD_RE,
ExtractorError,
get_element_by_attribute,
int_or_none,
merge_dicts,
parse_duration,
parse_iso8601,
remove_start,
smuggle_url,
strip_or_none,
traverse_obj,
url_or_none,
urljoin,
)
class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB']
class ITVBaseIE(InfoExtractor):
    """Base for the ITV extractors: Next.js data lookup and a page download
    that uses a plain User-Agent and recognises the site's geo-block reply."""

    def _search_nextjs_data(self, webpage, video_id, **kw):
        """Find and parse the `__NEXT_DATA__` JSON embedded in `webpage`.

        `transform_source` and `fatal` are applied to the JSON parsing
        (`fatal` also to the search); remaining keyword arguments, eg
        `default`, go to `_search_regex()`.
        """
        transform_source = kw.pop('transform_source', None)
        fatal = kw.pop('fatal', True)

        return self._parse_json(
            self._search_regex(
                r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
                webpage, 'next.js data', group='js', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)

    def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
        """Report a page download error.

        Raises ExtractorError if `fatal`; otherwise issues a warning (unless
        `errnote` is False) and returns False.
        """
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err, video_id=video_id)
        else:
            self._downloader.report_warning(errmsg)
            return False

    @staticmethod
    def _vanilla_ua_header():
        """Return the plain User-Agent header used for requests to the site."""
        return {'User-Agent': 'Mozilla/5.0'}

    def _download_webpage_handle(self, url, video_id, *args, **kwargs):
        """As the base method, but specialised to
        (a) use the vanilla UA unless the user or caller set one, and
        (b) accept a 403 response so that a geo-block can be detected.

        Returns (webpage, urlh), or False on a non-fatal failure.
        """
        params = self._downloader.params
        changed = False
        if (
                'user_agent' not in params
                and not any(re.match(r'(?i)user-agent\s*:', h)
                            for h in (params.get('headers') or []))
                and 'User-Agent' not in (kwargs.get('headers') or {})):
            # add the UA to a copy, keeping any other headers that the
            # caller supplied (and leaving the caller's dict untouched)
            headers = dict(kwargs.get('headers') or {})
            headers.update(self._vanilla_ua_header())
            kwargs['headers'] = headers
            changed = True

        # make sure that a 403 response is passed back for inspection
        if kwargs.get('expected_status') is not None:
            exp = kwargs['expected_status']
            if isinstance(exp, compat_integer_types):
                exp = [exp]
            if isinstance(exp, (list, tuple)) and 403 not in exp:
                kwargs['expected_status'] = [403]
                kwargs['expected_status'].extend(exp)
                changed = True
        else:
            kwargs['expected_status'] = 403
            changed = True

        if changed:
            kwargs = compat_kwargs(kwargs)

        ret = super(ITVBaseIE, self)._download_webpage_handle(url, video_id, *args, **kwargs)
        if ret is False:
            return ret
        webpage, urlh = ret

        if urlh.getcode() == 403:
            # geo-block error is like this, with an unnecessary 'Of':
            # '{\n "Message" : "Request Originated Outside Of Allowed Geographic Region",\
            # \n "TransactionId" : "oas-magni-475082-xbYF0W"\n}'
            if '"Request Originated Outside Of Allowed Geographic Region"' in webpage:
                self.raise_geo_restricted(countries=['GB'])
            # any other 403 is an ordinary HTTP error: honour the caller's
            # `fatal`, treating an absent value as fatal
            ret = self.__handle_request_webpage_error(
                compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh),
                video_id=video_id, fatal=kwargs.get('fatal', True))

        return ret
class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
_IE_DESC = 'ITVX'
_TESTS = [{
'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012',
'info_dict': {
'id': '2a4547a0012',
'ext': 'mp4',
'title': 'Liar - Series 2 - Episode 6',
'description': 'md5:d0f91536569dec79ea184f0a44cca089',
'series': 'Liar',
'season_number': 2,
'episode_number': 6,
},
'params': {
# m3u8 download
'skip_download': True,
},
'only_matching': True,
}, {
# unavailable via data-playlist-url
'note': 'Hub page unavailable via data-playlist-url (404 now)',
'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
'only_matching': True,
}, {
# InvalidVodcrid
'note': 'Hub page with InvalidVodcrid (404 now)',
'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
'only_matching': True,
}, {
# ContentUnavailable
'note': 'Hub page with ContentUnavailable (404 now)',
'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
'only_matching': True,
}]
}, {
'note': 'ITVX, or itvX, show',
'url': 'https://www.itv.com/watch/vera/1a7314/1a7314a0014',
'md5': 'bd0ad666b2c058fffe7d036785880064',
'info_dict': {
'id': '1a7314a0014',
'ext': 'mp4',
'title': 'Vera - Series 3 - Episode 4 - Prodigal Son',
'description': 'Vera and her team investigate the fatal stabbing of an ex-Met police officer outside a busy Newcastle nightclub - but there aren\'t many clues.',
'timestamp': 1653591600,
'upload_date': '20220526',
'uploader': 'ITVX',
'thumbnail': r're:https://\w+\.itv\.com/images/(?:\w+/)+\d+x\d+\?',
'duration': 5340.8,
'age_limit': 16,
'series': 'Vera',
'series_number': 3,
'episode': 'Prodigal Son',
'episode_number': 4,
'channel': 'ITV3',
'categories': list,
},
'params': {
# m3u8 download
# 'skip_download': True,
},
'skip': 'only available in UK',
}, {
'note': 'Latest ITV news bulletin: details change daily',
'url': 'https://www.itv.com/watch/news/varies-but-is-not-checked/6js5d0f',
'info_dict': {
'id': '6js5d0f',
'ext': 'mp4',
'title': r're:The latest ITV News headlines - \S.+',
'description': r'''re:.* today's top stories from the ITV News team.$''',
'timestamp': int,
'upload_date': r're:2\d\d\d(?:0[1-9]|1[0-2])(?:[012][1-9]|3[01])',
'uploader': 'ITVX',
'thumbnail': r're:https://images\.ctfassets\.net/(?:\w+/)+[\w.]+\.(?:jpg|png)',
'duration': float,
'age_limit': None,
},
'params': {
# variable download
# 'skip_download': True,
},
'skip': 'only available in UK',
}
]
def _og_extract(self, webpage, require_title=False):
return {
'title': self._og_search_title(webpage, fatal=require_title),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'uploader': self._og_search_property('site_name', webpage, default=None),
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
params = extract_attributes(self._search_regex(
r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
hmac = params['data-video-hmac']
webpage = self._download_webpage(url, video_id)
# now quite different params!
params = extract_attributes(self._search_regex(
r'''(<[^>]+\b(?:class|data-testid)\s*=\s*("|')genie-container\2[^>]*>)''',
webpage, 'params'))
ios_playlist_url = traverse_obj(
params, 'data-video-id', 'data-video-playlist',
get_all=False, expected_type=url_or_none)
headers = self.geo_verification_headers()
headers.update({
'Accept': 'application/vnd.itv.vod.playlist.v2+json',
'Content-Type': 'application/json',
'hmac': hmac.upper(),
})
ios_playlist = self._download_json(
ios_playlist_url, video_id, data=json.dumps({
'user': {
'itvUserId': '',
'entitlements': [],
'token': ''
},
'device': {
'manufacturer': 'Safari',
'model': '5',
'manufacturer': 'Mobile Safari',
'model': '5.1',
'os': {
'name': 'Windows NT',
'version': '6.1',
'type': 'desktop'
'name': 'iOS',
'version': '5.0',
'type': ' mobile'
}
},
'client': {
'version': '4.1',
'id': 'browser'
'id': 'browser',
'supportsAdPods': True,
'service': 'itv.x',
'appversion': '2.43.28',
},
'variantAvailability': {
'player': 'hls',
'featureset': {
'min': ['hls', 'aes', 'outband-webvtt'],
'max': ['hls', 'aes', 'outband-webvtt']
},
'platformTag': 'dotcom'
'platformTag': 'mobile'
}
}).encode(), headers=headers)
video_data = ios_playlist['Playlist']['Video']
ios_base_url = video_data.get('Base')
ios_base_url = traverse_obj(video_data, 'Base', expected_type=url_or_none)
media_url = (
(lambda u: url_or_none(urljoin(ios_base_url, u)))
if ios_base_url else url_or_none)
formats = []
for media_file in (video_data.get('MediaFiles') or []):
href = media_file.get('Href')
for media_file in traverse_obj(video_data, 'MediaFiles', expected_type=list) or []:
href = traverse_obj(media_file, 'Href', expected_type=media_url)
if not href:
continue
if ios_base_url:
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
href, video_id, 'mp4', entry_protocol='m3u8_native',
href, video_id, 'mp4', entry_protocol='m3u8',
m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
})
self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {})
f['http_headers'].update(self._vanilla_ua_header())
subtitles = {}
subs = video_data.get('Subtitles') or []
for sub in subs:
if not isinstance(sub, dict):
continue
href = url_or_none(sub.get('Href'))
for sub in traverse_obj(video_data, 'Subtitles', expected_type=list) or []:
href = traverse_obj(sub, 'Href', expected_type=url_or_none)
if not href:
continue
subtitles.setdefault('en', []).append({
@ -127,59 +270,132 @@ class ITVIE(InfoExtractor):
'ext': determine_ext(href, 'vtt'),
})
info = self._search_json_ld(webpage, video_id, default={})
if not info:
json_ld = self._parse_json(self._search_regex(
JSON_LD_RE, webpage, 'JSON-LD', '{}',
group='json_ld'), video_id, fatal=False)
if json_ld and json_ld.get('@type') == 'BreadcrumbList':
for ile in (json_ld.get('itemListElement:') or []):
item = ile.get('item:') or {}
if item.get('@type') == 'TVEpisode':
item['@context'] = 'http://schema.org'
info = self._json_ld(item, video_id, fatal=False) or {}
break
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
info = self._og_extract(webpage, require_title=not title)
tn = info.pop('thumbnail', None)
if tn:
info['thumbnails'] = [{'url': tn}]
# num. episode title
num_ep_title = video_data.get('numberedEpisodeTitle')
if not num_ep_title:
num_ep_title = clean_html(get_element_by_attribute('data-testid', 'episode-hero-description-strong', webpage))
num_ep_title = num_ep_title and num_ep_title.rstrip(' -')
ep_title = strip_or_none(
video_data.get('episodeTitle')
or (num_ep_title.split('.', 1)[-1] if num_ep_title else None))
title = title or re.sub(r'\s+-\s+ITVX$', '', info['title'])
if ep_title and ep_title != title:
title = title + ' - ' + ep_title
def get_thumbnails():
tns = []
for w, x in (traverse_obj(video_data, ('imagePresets'), expected_type=dict) or {}).items():
if isinstance(x, dict):
for y, z in x.items():
tns.append({'id': w + '_' + y, 'url': z})
return tns or None
video_str = lambda *x: traverse_obj(
video_data, *x, get_all=False, expected_type=strip_or_none)
return merge_dicts({
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'title': title,
'formats': formats,
'subtitles': subtitles,
'duration': parse_duration(video_data.get('Duration')),
'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)),
# parsing hh:mm:ss:nnn not yet patched
'duration': parse_duration(re.sub(r'(\d{2})(:)(\d{3}$)', r'\1.\3', video_data.get('Duration') or '')),
'description': video_str('synopsis'),
'timestamp': traverse_obj(video_data, 'broadcastDateTime', 'dateTime', expected_type=parse_iso8601),
'thumbnails': get_thumbnails(),
'series': video_str('showTitle', 'programmeTitle'),
'series_number': int_or_none(video_data.get('seriesNumber')),
'episode': ep_title,
'episode_number': int_or_none((num_ep_title or '').split('.')[0]),
'channel': video_str('channel'),
'categories': traverse_obj(video_data, ('categories', 'formatted'), expected_type=list),
'age_limit': {False: 16, True: 0}.get(video_data.get('isChildrenCategory')),
}, info)
class ITVBTCCIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
_IE_DESC = 'ITV articles: News, British Touring Car Championship'
_TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',
'info_dict': {
'id': 'btcc-2018-all-the-action-from-brands-hatch',
'title': 'BTCC 2018: All the action from Brands Hatch',
},
'playlist_mincount': 9,
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
}, {
'note': 'redirects to /btcc/articles/...',
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
'only_matching': True,
}, {
'note': 'news article',
'url': 'https://www.itv.com/news/wales/2020-07-23/sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'info_dict': {
'id': 'sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'title': '''Sean Fletcher on why Wales' coastline should be your 'staycation' destination | ITV News''',
},
'playlist_mincount': 1,
}]
# should really be a class var of the BC IE
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
BRIGHTCOVE_ACCOUNT = '1582188683001'
BRIGHTCOVE_PLAYER = 'HkiHLnNRx'
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
webpage, urlh = self._download_webpage_handle(url, playlist_id)
link = compat_urlparse.urlparse(urlh.geturl()).path.strip('/')
entries = [
self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
],
'referrer': url,
}),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
next_data = self._search_nextjs_data(webpage, playlist_id, fatal=False, default='{}')
path_prefix = compat_urlparse.urlparse(next_data.get('assetPrefix') or '').path.strip('/')
link = remove_start(link, path_prefix).strip('/')
content = traverse_obj(
next_data, ('props', 'pageProps', Ellipsis),
expected_type=lambda x: x if x['link'] == link else None,
get_all=False, default={})
content = traverse_obj(
content, ('body', 'content', Ellipsis, 'data'),
expected_type=lambda x: x if x.get('name') == 'Brightcove' or x.get('type') == 'Brightcove' else None)
contraband = {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
],
'referrer': urlh.geturl(),
}
def entries():
for data in content or []:
video_id = data.get('id')
if not video_id:
continue
account = data.get('accountId') or self.BRIGHTCOVE_ACCOUNT
player = data.get('playerId') or self.BRIGHTCOVE_PLAYER
yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account, player, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
# obsolete ?
for video_id in re.findall(r'''data-video-id=["'](\d+)''', webpage):
yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (self.BRIGHTCOVE_ACCOUNT, self.BRIGHTCOVE_PLAYER, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
title = self._og_search_title(webpage, fatal=False)
return self.playlist_result(entries, playlist_id, title)
return self.playlist_result(entries(), playlist_id, title)

View file

@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import update_url
class KommunetvIE(InfoExtractor):
    """Extractor for archived streams on *.kommunetv.no."""

    # dots are escaped so that they only match a literal '.'
    _VALID_URL = r'https?://\w+\.kommunetv\.no/archive/(?P<id>\w+)'
    _TEST = {
        'url': 'https://oslo.kommunetv.no/archive/921',
        'md5': '5f102be308ee759be1e12b63d5da4bbc',
        'info_dict': {
            'id': '921',
            'title': 'Bystyremøte',
            'ext': 'mp4'
        }
    }

    def _real_extract(self, url):
        """Look up the stream in the site's JSON API and return its HLS formats."""
        video_id = self._match_id(url)
        headers = {
            'Accept': 'application/json'
        }
        # NOTE(review): the API host is always oslo.kommunetv.no, whatever
        # subdomain the page URL has - confirm IDs are shared across subdomains
        data = self._download_json(
            'https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id,
            video_id, headers=headers)
        title = data['stream']['title']
        playlist_file = data['playlist'][0]['playlist'][0]['file']
        # drop any query and fragment from the playlist URL
        m3u8_url = update_url(playlist_file, query=None, fragment=None)
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
            m3u8_id='hls', fatal=False)
        self._sort_formats(formats)
        return {
            'id': video_id,
            'formats': formats,
            'title': title
        }

View file

@ -0,0 +1,87 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_id,
get_element_by_class,
int_or_none,
js_to_json,
MONTH_NAMES,
qualities,
unified_strdate,
)
class MyVideoGeIE(InfoExtractor):
    """Extractor for video pages on myvideo.ge."""

    _VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://www.myvideo.ge/v/3941048',
        'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9',
        'info_dict': {
            'id': '3941048',
            'ext': 'mp4',
            'title': 'The best prikol',
            'upload_date': '20200611',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'chixa33',
            'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3',
        },
        # working from local dev system
        'skip': 'site blocks CI servers',
    }

    # Georgian month names, in calendar order
    _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი']

    _quality = staticmethod(qualities(('SD', 'HD')))

    def _ka_to_en_months(self, text):
        """Replace any Georgian month name in `text` with its English name."""
        ka_to_en = dict(zip(self._MONTH_NAMES_KA, MONTH_NAMES['en']))

        def translate(m):
            name = m.group(0)
            # the match may differ in case from the table entry; if it
            # still can't be found, leave the text as it was
            return ka_to_en.get(name) or ka_to_en.get(name.lower(), name)

        # `flags` must be passed by keyword: the 4th positional parameter
        # of re.sub() is `count`
        return re.sub(
            '|'.join(map(re.escape, self._MONTH_NAMES_KA)),
            translate, text, flags=re.I)

    def _real_extract(self, url):
        """Extract formats from the page's JW Player setup and metadata from its HTML."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # title is required: the last alternative is a fatal search
        title = (
            self._og_search_title(webpage, default=None)
            or clean_html(get_element_by_class('my_video_title', webpage))
            or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title'))

        jwplayer_sources = self._parse_json(
            self._search_regex(
                r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False)
            or '',
            video_id, transform_source=js_to_json, fatal=False)

        formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id)
        for f in formats or []:
            f['preference'] = self._quality(f['format_id'])
        self._sort_formats(formats)

        description = (
            self._og_search_description(webpage)
            or get_element_by_id('long_desc_holder', webpage)
            or self._html_search_meta('description', webpage))

        uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)

        upload_date = get_element_by_class('mv_vid_upl_date', webpage)
        # as ka locale may not be present roll a local date conversion
        upload_date = (
            unified_strdate(self._ka_to_en_months(upload_date))
            if upload_date else None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'formats': formats,
            'thumbnail': self._og_search_thumbnail(webpage),
            'upload_date': upload_date,
            'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)),
            'like_count': int_or_none(get_element_by_id('likes_count', webpage)),
            'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)),
        }

View file

@ -7,6 +7,7 @@ import subprocess
import tempfile
from ..compat import (
compat_open as open,
compat_urlparse,
compat_kwargs,
)

View file

@ -0,0 +1,105 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
import re
from ..utils import (
merge_dicts,
)
class Pr0grammStaticIE(InfoExtractor):
    """Extractor for the static pr0gramm pages, whose HTML contains the
    media element directly."""

    # Possible urls:
    # https://pr0gramm.com/static/5466437
    _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://pr0gramm.com/static/5466437',
        'md5': '52fa540d70d3edc286846f8ca85938aa',
        'info_dict': {
            'id': '5466437',
            'ext': 'mp4',
            'title': 'pr0gramm-5466437 by g11st',
            'uploader': 'g11st',
            'upload_date': '20221221',
        }
    }

    def _real_extract(self, url):
        """Return the info dict of the page's first HTML5 media entry,
        completed with id, title, uploader and approximate upload date."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Fetch media sources; with no entry at all, carry on with an empty
        # one so that the formats check below reports the problem
        entries = self._parse_html5_media_entries(url, webpage, video_id)
        media_info = entries[0] if entries else {}

        # this raises if there are no formats
        self._sort_formats(media_info.get('formats') or [])

        # Fetch author: optional, as the title below copes without it
        uploader = self._html_search_regex(
            r'by\W+([\w-]+)\W+', webpage, 'uploader', fatal=False)

        # Fetch approx upload date from the media path, eg
        # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
        # Stays None in case the extraction fails
        upload_date = None
        m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
        if m:
            # Up to a day of accuracy should suffice...
            upload_date = m.group('year') + m.group('mon') + m.group('day')

        return merge_dicts({
            'id': video_id,
            'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
            'uploader': uploader,
            'upload_date': upload_date
        }, media_info)
# This extractor is for the primary URL (used for sharing, and shown in the
# location bar). Since this page builds the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video> element by itself, without requiring
# JS to be run.
class Pr0grammIE(InfoExtractor):
    """Match the regular (JS-rendered) pr0gramm item URLs and hand them to Pr0grammStaticIE."""

    # Possible urls:
    # https://pr0gramm.com/new/546637
    # https://pr0gramm.com/new/video/546637
    # https://pr0gramm.com/top/546637
    # https://pr0gramm.com/top/video/546637
    # https://pr0gramm.com/user/g11st/uploads/5466437
    # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
    # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
    # https://pr0gramm.com/user/froschler/1elf/5232030
    # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
    # https://pr0gramm.com/top/fruher war alles damals/5498175
    _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
    _TEST = {
        'url': 'https://pr0gramm.com/new/video/5466437',
        'info_dict': {
            'id': '5466437',
            'ext': 'mp4',
            'title': 'pr0gramm-5466437 by g11st',
            'uploader': 'g11st',
            'upload_date': '20221221',
        }
    }

    # NOTE: a stray `_generic_title()` debugging stub (declared without
    # `self`, returning a placeholder string) was removed from this class

    def _real_extract(self, url):
        """Redirect to the static page for the same item id."""
        video_id = self._match_id(url)
        return self.url_result(
            'https://pr0gramm.com/static/' + video_id,
            video_id=video_id,
            ie=Pr0grammStaticIE.ie_key())

View file

@ -0,0 +1,97 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class RbgTumIE(InfoExtractor):
    """Single-lecture extractor for live.rbg.tum.de watch pages (/w/...)."""

    _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
    _TESTS = [{
        # Combined view
        'url': 'https://live.rbg.tum.de/w/cpp/22128',
        'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
        'info_dict': {
            'id': 'cpp/22128',
            'ext': 'mp4',
            'title': 'Lecture: October 18. 2022',
            'series': 'Concepts of C++ programming (IN2377)',
        }
    }, {
        # Presentation only
        'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
        'md5': '36c584272179f3e56b0db5d880639cba',
        'info_dict': {
            'id': 'I2DL/12349/PRES',
            'ext': 'mp4',
            'title': 'Lecture 3: Introduction to Neural Networks',
            'series': 'Introduction to Deep Learning (IN2346)',
        }
    }, {
        # Camera only
        'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
        'md5': 'e04189d92ff2f56aedf5cede65d37aad',
        'info_dict': {
            'id': 'fvv-info/16130/CAM',
            'ext': 'mp4',
            'title': 'Fachschaftsvollversammlung',
            'series': 'Fachschaftsvollversammlung Informatik',
        }
    }, ]

    def _real_extract(self, url):
        """Scrape the watch page for the HLS manifest URL, lecture title and series title."""
        lecture_id = self._match_id(url)
        page = self._download_webpage(url, lecture_id)

        # first thing in the page that looks like an HLS manifest URL
        manifest_url = self._html_search_regex(
            r'(https://.+?\.m3u8)', page, 'm3u8')
        title = self._html_search_regex(
            r'(?si)<h1.*?>(.*)</h1>', page, 'title')
        # the series name is taken from the <title> element, up to any colon
        series = self._html_search_regex(
            r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>',
            page, 'series')

        hls_formats = self._extract_m3u8_formats(
            manifest_url, lecture_id, 'mp4',
            entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(hls_formats)

        result = {'id': lecture_id}
        result['title'] = title
        result['series'] = series
        result['formats'] = hls_formats
        return result
class RbgTumCourseIE(InfoExtractor):
    """Course playlist extractor for live.rbg.tum.de (/course/...) pages."""

    _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
    _TESTS = [{
        'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
        'info_dict': {
            'title': 'Funktionale Programmierung und Verifikation (IN0003)',
            'id': '2022/S/fpv',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 13,
    }, {
        'url': 'https://live.rbg.tum.de/course/2022/W/set',
        'info_dict': {
            'title': 'SET FSMPIC',
            'id': '2022/W/set',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 6,
    }, ]

    def _real_extract(self, url):
        """Return a playlist of the lecture pages linked from the course page.

        Links ending in /cam, /pres or /chat (case-insensitively) are
        excluded by the look-behinds in the href pattern.
        """
        course_id = self._match_id(url)
        webpage = self._download_webpage(url, course_id)

        lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')

        # [^"]+ keeps the capture inside the href value: a greedy `.+` could
        # run on to a later double quote on the same line
        lecture_urls = [
            self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key())
            for lecture_url in re.findall(
                r'(?i)href="/w/([^"]+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage)]

        return self.playlist_result(lecture_urls, course_id, lecture_series_title)

124
youtube_dl/extractor/s4c.py Normal file
View file

@ -0,0 +1,124 @@
# coding: utf-8
from __future__ import unicode_literals
from functools import partial as partial_f
from .common import InfoExtractor
from ..utils import (
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class S4CIE(InfoExtractor):
    """Extractor for single programmes on S4C Clic (s4c.cymru/clic/programme/<id>)."""

    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.s4c.cymru/clic/programme/861362209',
        'info_dict': {
            'id': '861362209',
            'ext': 'mp4',
            'title': 'Y Swn',
            'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
            'duration': 5340,
            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg',
        },
    }, {
        'url': 'https://www.s4c.cymru/clic/programme/856636948',
        'info_dict': {
            'id': '856636948',
            'ext': 'mp4',
            'title': 'Am Dro',
            'duration': 2880,
            'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg',
        },
    }]

    def _real_extract(self, url):
        """Combine programme details, player configuration and streaming URL into one info dict."""
        video_id = self._match_id(url)

        # descriptive metadata; requested non-fatally
        details_query = {
            'lang': 'e',
            'programme_id': video_id,
        }
        details = self._download_json(
            'https://www.s4c.cymru/df/full_prog_details',
            video_id, query=details_query, fatal=False)

        # the player configuration supplies the filename, poster and subtitles
        config_query = {
            'programme_id': video_id,
            'signed': '0',
            'lang': 'en',
            'mode': 'od',
            'appId': 'clic',
            'streamName': '',
        }
        player_config = self._download_json(
            'https://player-api.s4c-cdn.co.uk/player-configuration/prod',
            video_id, query=config_query, note='Downloading player config JSON')

        streaming_query = {
            'mode': 'od',
            'application': 'clic',
            'region': 'WW',
            'extra': 'false',
            'thirdParty': 'false',
            'filename': player_config['filename'],
        }
        streaming_urls = self._download_json(
            'https://player-api.s4c-cdn.co.uk/streaming-urls/prod',
            video_id, query=streaming_query, note='Downloading streaming urls JSON')
        m3u8_url = streaming_urls['hls']

        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')
        self._sort_formats(formats)

        # subtitle records are keyed by stringified indices:
        # '0' is read as the URL, '1' as the name and '3' as the language key
        subtitles = {}
        for track in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
            lang_tracks = subtitles.setdefault(track.get('3', 'en'), [])
            lang_tracks.append({
                'url': track['0'],
                'name': track.get('1'),
            })

        stream_info = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': url_or_none(player_config.get('poster')),
        }
        programme_fields = {
            'title': (('programme_title', 'series_title'), T(txt_or_none)),
            'description': ('full_billing', T(txt_or_none)),
            'duration': ('duration', T(partial_f(float_or_none, invscale=60))),
        }
        programme_info = traverse_obj(
            details, ('full_prog_details', 0, programme_fields), get_all=False)

        return merge_dicts(stream_info, programme_info, rev=True)
class S4CSeriesIE(InfoExtractor):
    """Playlist extractor for S4C Clic series pages (s4c.cymru/clic/series/<id>)."""

    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.s4c.cymru/clic/series/864982911',
        'playlist_mincount': 6,
        'info_dict': {
            'id': '864982911',
            'title': 'Iaith ar Daith',
        },
    }, {
        'url': 'https://www.s4c.cymru/clic/series/866852587',
        'playlist_mincount': 8,
        'info_dict': {
            'id': '866852587',
            'title': 'FFIT Cymru',
        },
    }]

    def _real_extract(self, url):
        """Fetch the series details and emit one S4CIE url_result per listed episode id."""
        series_id = self._match_id(url)
        query = {
            'lang': 'e',
            'series_id': series_id,
            'show_prog_in_series': 'Y'
        }
        series_details = self._download_json(
            'https://www.s4c.cymru/df/series_details', series_id,
            query=query, note='Downloading series details JSON')

        episode_ids = traverse_obj(
            series_details, ('other_progs_in_series', Ellipsis, 'id'))
        # entries stay a lazy generator, as playlist_result receives them
        entries = (
            self.url_result(
                'https://www.s4c.cymru/clic/programme/' + episode_id, S4CIE, episode_id)
            for episode_id in episode_ids)
        series_title = traverse_obj(
            series_details, ('full_prog_details', 0, 'series_title', T(txt_or_none)))

        return self.playlist_result(
            entries, playlist_id=series_id, playlist_title=series_title)

View file

@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import binascii
import random
import re
import string
from .common import InfoExtractor
from ..utils import urljoin, url_basename
def to_ascii_hex(str1):
    """Return the lowercase hex digits of the UTF-8 encoding of str1, as text."""
    utf8_bytes = str1.encode('utf-8')
    hex_bytes = binascii.hexlify(utf8_bytes)
    return hex_bytes.decode('ascii')
def generate_random_string(length):
    """Return `length` characters drawn at random from ASCII letters and digits."""
    picked = []
    for _ in range(length):
        # one random.choice() call per character, from the same alphabet
        picked.append(random.choice(string.ascii_letters + string.digits))
    return ''.join(picked)
class StreamsbIE(InfoExtractor):
    """Extractor for StreamSB-hosted videos on the domains listed in _DOMAINS."""

    _DOMAINS = ('viewsb.com', )
    # escape the domains so that their dots only match literal dots
    _VALID_URL = r'https://(?P<domain>%s)/(?P<id>.+)' % '|'.join(map(re.escape, _DOMAINS))
    _TEST = {
        'url': 'https://viewsb.com/dxfvlu4qanjx',
        'md5': '488d111a63415369bf90ea83adc8a325',
        'info_dict': {
            'id': 'dxfvlu4qanjx',
            'ext': 'mp4',
            'title': 'Sintel'
        }
    }

    def _real_extract(self, url):
        """Follow the embedded iframe, build the hex-encoded sources request and return the HLS formats.

        The sources URL is built from the video code (iframe file name without
        its extension) wrapped between two random strings, hex-encoded.
        """
        domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id')
        webpage = self._download_webpage(url, video_id)

        iframe_rel_url = self._search_regex(r'''(?i)<iframe\b[^>]+\bsrc\s*=\s*('|")(?P<path>/.*\.html)\1''', webpage, 'iframe', group='path')
        iframe_url = urljoin('https://' + domain, iframe_rel_url)

        iframe_data = self._download_webpage(iframe_url, video_id)
        # the quote class was ["|'], which also accepted a literal '|'
        app_version = self._search_regex(r'''<script\b[^>]+\bsrc\s*=\s*["'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50'

        # strip only the final extension from the iframe file name
        video_code = url_basename(iframe_url).rsplit('.', 1)[0]

        length = 12
        req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb'))
        ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req))

        video_data = self._download_webpage(ereq, video_id, headers={
            'Referer': iframe_url,
            'watchsb': 'sbstream',
        })
        player_data = self._parse_json(video_data, video_id)
        title = player_data['stream_data']['title']

        formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
        # sort the formats, as the sibling extractors do before returning them
        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
        }

View file

@ -3,17 +3,23 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
url_or_none,
)
class TelewebionIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/(episode|clip)/(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'http://www.telewebion.com/#!/episode/1263668/',
'url': 'http://www.telewebion.com/episode/0x1b3139c/',
'info_dict': {
'id': '1263668',
'id': '0x1b3139c',
'ext': 'mp4',
'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا',
'thumbnail': r're:^https?://.*\.jpg',
'thumbnail': r're:^https?://static\.telewebion\.com/episodeImages/.*/default',
'view_count': int,
},
'params': {
@ -25,31 +31,24 @@ class TelewebionIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
secure_token = self._download_webpage(
'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id)
episode_details = self._download_json(
'http://m.s2.telewebion.com/op/op', video_id,
query={'action': 'getEpisodeDetails', 'episode_id': video_id})
episode_details = self._download_json('https://gateway.telewebion.ir/kandoo/episode/getEpisodeDetail/?EpisodeId={0}'.format(video_id), video_id)
episode_details = episode_details['body']['queryEpisode'][0]
m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % (
video_id, episode_details['file_path'], secure_token)
channel_id = episode_details['channel']['descriptor']
episode_image_id = episode_details.get('image')
episode_image = 'https://static.telewebion.com/episodeImages/{0}/default'.format(episode_image_id) if episode_image_id else None
m3u8_url = 'https://cdna.telewebion.com/{0}/episode/{1}/playlist.m3u8'.format(channel_id, video_id)
formats = self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', m3u8_id='hls')
picture_paths = [
episode_details.get('picture_path'),
episode_details.get('large_picture_path'),
]
thumbnails = [{
'url': picture_path,
'preference': idx,
} for idx, picture_path in enumerate(picture_paths) if picture_path is not None]
m3u8_url, video_id, ext='mp4', m3u8_id='hls',
entry_protocol='m3u8_native')
self._sort_formats(formats)
return {
'id': video_id,
'title': episode_details['title'],
'formats': formats,
'thumbnails': thumbnails,
'view_count': episode_details.get('view_count'),
'thumbnail': url_or_none(episode_image),
'view_count': int_or_none(episode_details.get('view_count')),
'duration': float_or_none(episode_details.get('duration')),
}

View file

@ -2,9 +2,22 @@
from __future__ import unicode_literals
import re
import time
from .common import InfoExtractor
from ..utils import ExtractorError
from ..compat import compat_kwargs
from ..utils import (
base_url,
determine_ext,
ExtractorError,
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_basename,
url_or_none,
)
class Vbox7IE(InfoExtractor):
@ -20,23 +33,27 @@ class Vbox7IE(InfoExtractor):
)
(?P<id>[\da-fA-F]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)']
_GEO_COUNTRIES = ['BG']
_TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
# the http: URL just redirects here
'url': 'https://vbox7.com/play:0946fff23c',
'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': {
'id': '0946fff23c',
'ext': 'mp4',
'title': 'Борисов: Притеснен съм за бъдещето на България',
'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"',
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1470982814,
'upload_date': '20160812',
'uploader': 'zdraveibulgaria',
'thumbnail': r're:^https?://.*\.jpg$',
'view_count': int,
'duration': 2640,
},
'params': {
'proxy': '127.0.0.1:8118',
},
'expected_warnings': [
'Unable to download webpage',
],
}, {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
@ -44,8 +61,15 @@ class Vbox7IE(InfoExtractor):
'id': '249bb972c2',
'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'timestamp': 1360215023,
'upload_date': '20130207',
'uploader': 'svideteliat_ot_varshava',
'thumbnail': 'https://i49.vbox7.com/o/249/249bb972c20.jpg',
'view_count': int,
'duration': 83,
},
'skip': 'georestricted',
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True,
@ -54,52 +78,127 @@ class Vbox7IE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
webpage)
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(cls._EMBED_REGEX[0], webpage)
if mobj:
return mobj.group('url')
# specialisation to transform what looks like ld+json that
# may contain invalid character combinations
# transform_source=None, fatal=True
def _parse_json(self, json_string, video_id, *args, **kwargs):
    """Parse JSON, repairing raw line breaks in what looks like ld+json.

    When '"@context"' occurs within the first 30 characters of json_string
    and the caller supplied no transform_source (neither as the first extra
    positional argument nor as a keyword), a transform is installed that
    replaces raw (CR)LF sequences inside double-quoted spans with an escaped
    newline. Everything else is passed unchanged to the parent class's
    _parse_json(), whose result is returned.
    """
    if '"@context"' in json_string[:30]:
        # this is ld+json, or that's the way to bet
        transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
        if not transform_source:

            def fix_chars(src):
                # fix malformed ld+json: replace raw CRLFs with escaped LFs
                return re.sub(
                    r'"[^"]+"', lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)

            # put the transform wherever the caller would have passed it:
            # positionally if positional extras were used, else as a keyword
            if len(args) > 0:
                args = (fix_chars,) + args[1:]
            else:
                kwargs['transform_source'] = fix_chars
                kwargs = compat_kwargs(kwargs)

    return super(Vbox7IE, self)._parse_json(
        json_string, video_id, *args, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
url = 'https://vbox7.com/play:%s' % (video_id,)
now = time.time()
response = self._download_json(
'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
video_id)
'https://www.vbox7.com/aj/player/item/options', video_id,
query={'vid': video_id}, headers={'Referer': url})
# estimate time to which possible `ago` member is relative
now = now + 0.5 * (time.time() - now)
if 'error' in response:
if traverse_obj(response, 'error'):
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
video = response['options']
src_url = traverse_obj(response, ('options', 'src', T(url_or_none))) or ''
title = video['title']
video_url = video['src']
if '/na.mp4' in video_url:
fmt_base = url_basename(src_url).rsplit('.', 1)[0].rsplit('_', 1)[0]
if fmt_base in ('na', 'vn'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
uploader = video.get('uploader')
ext = determine_ext(src_url)
if ext == 'mpd':
# extract MPD
try:
formats, subtitles = self._extract_mpd_formats_and_subtitles(
src_url, video_id, 'dash', fatal=False)
except KeyError: # fatal doesn't catch this
self.report_warning('Failed to parse MPD manifest')
formats, subtitles = [], {}
elif ext != 'm3u8':
formats = [{
'url': src_url,
}] if src_url else []
subtitles = {}
webpage = self._download_webpage(
'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
if src_url:
# possibly extract HLS, based on https://github.com/yt-dlp/yt-dlp/pull/9100
fmt_base = base_url(src_url) + fmt_base
# prepare for _extract_m3u8_formats_and_subtitles()
# hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
hls_formats = self._extract_m3u8_formats(
'{0}.m3u8'.format(fmt_base), video_id, m3u8_id='hls', fatal=False)
formats.extend(hls_formats)
# self._merge_subtitles(hls_subs, target=subtitles)
info = {}
# In case MPD/HLS cannot be parsed, or anyway, get mp4 combined
# formats usually provided to Safari, iOS, and old Windows
video = response['options']
resolutions = (1080, 720, 480, 240, 144)
highest_res = traverse_obj(video, (
'highestRes', T(int))) or resolutions[0]
resolutions = traverse_obj(video, (
'resolutions', lambda _, r: highest_res >= int(r) > 0)) or resolutions
mp4_formats = traverse_obj(resolutions, (
Ellipsis, T(lambda res: {
'url': '{0}_{1}.mp4'.format(fmt_base, res),
'format_id': 'http-{0}'.format(res),
'height': res,
})))
# if above formats are flaky, enable the line below
# self._check_formats(mp4_formats, video_id)
formats.extend(mp4_formats)
if webpage:
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False)
self._sort_formats(formats)
info.update({
webpage = self._download_webpage(url, video_id, fatal=False) or ''
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False) if webpage else {}
if not info.get('title'):
info['title'] = traverse_obj(response, (
'options', 'title', T(txt_or_none))) or self._og_search_title(webpage)
def if_missing(k):
return lambda x: None if k in info else x
info = merge_dicts(info, {
'id': video_id,
'title': title,
'url': video_url,
'uploader': uploader,
'thumbnail': self._proto_relative_url(
'formats': formats,
'subtitles': subtitles or None,
}, info, traverse_obj(response, ('options', {
'uploader': ('uploader', T(txt_or_none)),
'timestamp': ('ago', T(if_missing('timestamp')), T(lambda t: int(round((now - t) / 60.0)) * 60)),
'duration': ('duration', T(if_missing('duration')), T(float_or_none)),
})))
if 'thumbnail' not in info:
info['thumbnail'] = self._proto_relative_url(
info.get('thumbnail') or self._og_search_thumbnail(webpage),
'http:'),
})
'https:'),
return info

View file

@ -261,27 +261,33 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
https?://
(?:
(?:
www|
player
)
\.
)?
vimeo(?:pro)?\.com/
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)??
(?:
(?:
play_redirect_hls|
moogaloop\.swf)\?clip_id=
)?
(?:videos?/)?
(?P<id>[0-9]+)
(?:/(?P<unlisted_hash>[\da-f]{10}))?
/?(?:[?&].*)?(?:[#].*)?$
'''
https?://
(?:
(?:
www|
player
)
\.
)?
vimeo(?:pro)?\.com/
(?:
(?P<u>user)|
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)??
(?P<q>
(?:
play_redirect_hls|
moogaloop\.swf)\?clip_id=
)?
(?:videos?/)?
)
(?P<id>[0-9]+)
(?(u)
/(?!videos|likes)[^/?#]+/?|
(?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
)
(?:(?(q)[&]|(?(u)|/?)[?]).+?)?(?:[#].*)?$
'''
IE_NAME = 'vimeo'
_TESTS = [
{
@ -539,7 +545,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': {
'skip_download': True,
},
}
},
{
# user playlist alias -> https://vimeo.com/258705797
'url': 'https://vimeo.com/user26785108/newspiritualguide',
'only_matching': True,
},
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
]
@ -663,7 +674,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
if '//player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex(
r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)

View file

@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
merge_dicts,
str_or_none,
T,
traverse_obj,
url_or_none,
)
class WhypIE(InfoExtractor):
    """Extractor for whyp.it track pages."""

    _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
        'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
        'info_dict': {
            'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
            'id': '18337',
            'title': 'Home Page Example Track',
            'description': r're:(?s).+\bexample track\b',
            'ext': 'mp3',
            'duration': 52.82,
            'uploader': 'Brad',
            'uploader_id': '1',
            'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
        },
    }, {
        'url': 'https://www.whyp.it/tracks/18337',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Read the track record ('rawTrack') from the page's Nuxt data and map it to an info dict."""
        unique_id = self._match_id(url)
        webpage = self._download_webpage(url, unique_id)
        track = self._search_nuxt_data(webpage, unique_id)['rawTrack']

        # mandatory fields: a missing 'audio_url' raises KeyError here
        core = {
            'url': track['audio_url'],
            'id': unique_id,
        }
        # optional metadata taken from the track record
        metadata = traverse_obj(track, {
            'title': 'title',
            'description': 'description',
            'duration': ('duration', T(float_or_none)),
            'uploader': ('user', 'username'),
            'uploader_id': ('user', 'id', T(str_or_none)),
            'thumbnail': ('artwork_url', T(url_or_none)),
        })
        # fixed properties applied to every result
        constants = {
            'ext': 'mp3',
            'vcodec': 'none',
            'http_headers': {'Referer': 'https://whyp.it/'},
        }
        return merge_dicts(core, metadata, constants, rev=True)

View file

@ -24,7 +24,7 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
_DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)'
_DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)'
_VALID_URL = r'''(?x)
https?://
(?:.+?\.)?%s/
@ -123,6 +123,9 @@ class XHamsterIE(InfoExtractor):
}, {
'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf',
'only_matching': True,
}, {
'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6',
'only_matching': True,
}]
def _real_extract(self, url):
@ -433,6 +436,9 @@ class XHamsterUserIE(InfoExtractor):
}, {
'url': 'https://xhday.com/users/mobhunter',
'only_matching': True,
}, {
'url': 'https://xhvid.com/users/pelushe21',
'only_matching': True,
}]
def _entries(self, user_id):

File diff suppressed because it is too large Load diff

View file

@ -2,30 +2,68 @@ from __future__ import unicode_literals
import itertools
import json
import math
import operator
import re
from functools import update_wrapper
from .utils import (
error_to_compat_str,
ExtractorError,
js_to_json,
remove_quotes,
unified_timestamp,
variadic,
)
from .compat import (
compat_basestring,
compat_chr,
compat_collections_chain_map as ChainMap,
compat_itertools_zip_longest as zip_longest,
compat_str,
)
# name JS functions
class function_with_repr(object):
    """Callable wrapper that forwards calls to `func` and reports a fixed repr() text."""

    # from yt_dlp/utils.py, but in this module
    # repr_ is always set

    def __init__(self, func, repr_):
        # copy func's metadata (name, doc, ...) onto this wrapper object
        update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        target = self.func
        return target(*args, **kwargs)

    def __repr__(self):
        text = self.__repr
        return text
# name JS operators
def wraps_op(op):
    """Decorator factory: give the wrapper op's metadata, with 'JS_' prefixed to its name."""

    def update_and_rename_wrapper(w):
        renamed = update_wrapper(w, op)
        # fn names are str in both Py 2/3
        prefix = str('JS_')
        renamed.__name__ = prefix + renamed.__name__
        return renamed

    return update_and_rename_wrapper
# NB In principle NaN cannot be checked by membership.
# Here all NaN values are actually this one, so _NaN is _NaN,
# although _NaN != _NaN.
_NaN = float('nan')
def _js_bit_op(op):
def zeroise(x):
return 0 if x in (None, JS_Undefined) else x
return 0 if x in (None, JS_Undefined, _NaN) else x
@wraps_op(op)
def wrapped(a, b):
return op(zeroise(a), zeroise(b)) & 0xffffffff
@ -34,23 +72,24 @@ def _js_bit_op(op):
def _js_arith_op(op):
@wraps_op(op)
def wrapped(a, b):
if JS_Undefined in (a, b):
return float('nan')
return _NaN
return op(a or 0, b or 0)
return wrapped
def _js_div(a, b):
if JS_Undefined in (a, b) or not (a and b):
return float('nan')
if JS_Undefined in (a, b) or not (a or b):
return _NaN
return operator.truediv(a or 0, b) if b else float('inf')
def _js_mod(a, b):
if JS_Undefined in (a, b) or not b:
return float('nan')
return _NaN
return (a or 0) % b
@ -58,12 +97,13 @@ def _js_exp(a, b):
if not b:
return 1 # even 0 ** 0 !!
elif JS_Undefined in (a, b):
return float('nan')
return _NaN
return (a or 0) ** b
def _js_eq_op(op):
@wraps_op(op)
def wrapped(a, b):
if set((a, b)) <= set((None, JS_Undefined)):
return op(a, a)
@ -74,6 +114,7 @@ def _js_eq_op(op):
def _js_comp_op(op):
@wraps_op(op)
def wrapped(a, b):
if JS_Undefined in (a, b):
return False
@ -88,13 +129,8 @@ def _js_comp_op(op):
def _js_ternary(cndn, if_true=True, if_false=False):
"""Simulate JS's ternary operator (cndn?if_true:if_false)"""
if cndn in (False, None, 0, '', JS_Undefined):
if cndn in (False, None, 0, '', JS_Undefined, _NaN):
return if_false
try:
if math.isnan(cndn): # NB: NaN cannot be checked by membership
return if_false
except TypeError:
pass
return if_true
@ -187,19 +223,6 @@ class LocalNameSpace(ChainMap):
class JSInterpreter(object):
__named_object_counter = 0
_RE_FLAGS = {
# special knowledge: Python's re flags are bitmask values, current max 128
# invent new bitmask values well above that for literal parsing
# TODO: new pattern class to execute matches with these flags
'd': 1024, # Generate indices for substring matches
'g': 2048, # Global search
'i': re.I, # Case-insensitive search
'm': re.M, # Multi-line search
's': re.S, # Allows . to match newline characters
'u': re.U, # Treat a pattern as a sequence of unicode code points
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
}
_OBJ_NAME = '__youtube_dl_jsinterp_obj'
OP_CHARS = None
@ -217,9 +240,72 @@ class JSInterpreter(object):
msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr)
super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
class JS_RegExp(object):
    """Lazily compiled regular expression built from a JS RegExp literal.

    The pattern text and flags are stored at construction; `re.compile()`
    is only run on the first access to an attribute that is not yet set on
    the instance, after which the compiled pattern's attributes are copied
    onto the instance.
    """

    RE_FLAGS = {
        # special knowledge: Python's re flags are bitmask values, current max 128
        # invent new bitmask values well above that for literal parsing
        # TODO: execute matches with these flags (remaining: d, y)
        'd': 1024,  # Generate indices for substring matches
        'g': 2048,  # Global search
        'i': re.I,  # Case-insensitive search
        'm': re.M,  # Multi-line search
        's': re.S,  # Allows . to match newline characters
        'u': re.U,  # Treat a pattern as a sequence of unicode code points
        'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
    }

    def __init__(self, pattern_txt, flags=0):
        """Store the pattern; `flags` is a bitmask or a string of JS flag letters."""
        if isinstance(flags, compat_str):
            flags, _ = self.regex_flags(flags)
        # First, avoid https://github.com/python/cpython/issues/74534
        self.__self = None
        self.__pattern_txt = pattern_txt.replace('[[', r'[\[')
        self.__flags = flags

    def __instantiate(self):
        """Compile the stored pattern once and mirror its attributes on self."""
        if self.__self:
            return
        self.__self = re.compile(self.__pattern_txt, self.__flags)
        # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
        for name in dir(self.__self):
            # Only these? Obviously __class__, __init__.
            # PyPy creates a __weakref__ attribute with value None
            # that can't be setattr'd and doesn't need to be copied.
            if name in ('__class__', '__init__', '__weakref__'):
                continue
            setattr(self, name, getattr(self.__self, name))

    def __getattr__(self, name):
        """Compile on demand, then resolve `name` from the compiled pattern.

        Raises AttributeError for names the compiled pattern does not have
        (apart from the fallbacks for `groupindex` and `groups`).
        """
        self.__instantiate()
        # make Py 2.6 conform to its lying documentation
        if name == 'flags':
            self.flags = self.__flags
            return self.flags
        elif name == 'pattern':
            self.pattern = self.__pattern_txt
            return self.pattern
        elif hasattr(self.__self, name):
            v = getattr(self.__self, name)
            setattr(self, name, v)
            return v
        elif name in ('groupindex', 'groups'):
            # fallbacks of the documented types: `groupindex` is a mapping,
            # `groups` a count (these were previously swapped)
            return {} if name == 'groupindex' else 0
        raise AttributeError('{0} has no attribute named {1}'.format(self, name))

    @classmethod
    def regex_flags(cls, expr):
        """Consume leading JS flag letters from `expr`.

        Returns a tuple (flags, rest): the OR of the RE_FLAGS values of the
        leading flag letters, and the text from the first non-flag
        character onwards (that character is kept in `rest`).
        """
        flags = 0
        if not expr:
            return flags, expr
        consumed = 0
        for ch in expr:
            if ch not in cls.RE_FLAGS:
                break
            flags |= cls.RE_FLAGS[ch]
            consumed += 1
        # slice at the number of flag letters actually consumed, so the
        # character that ended the scan is not dropped
        return flags, expr[consumed:]
@classmethod
def __op_chars(cls):
op_chars = set(';,')
op_chars = set(';,[')
for op in cls._all_operators():
for c in op[0]:
op_chars.add(c)
@ -228,39 +314,30 @@ class JSInterpreter(object):
def _named_object(self, namespace, obj):
self.__named_object_counter += 1
name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter)
if callable(obj) and not isinstance(obj, function_with_repr):
obj = function_with_repr(obj, 'F<%s>' % (self.__named_object_counter, ))
namespace[name] = obj
return name
@classmethod
def _regex_flags(cls, expr):
flags = 0
if not expr:
return flags, expr
for idx, ch in enumerate(expr):
if ch not in cls._RE_FLAGS:
break
flags |= cls._RE_FLAGS[ch]
return flags, expr[idx + 1:]
@classmethod
def _separate(cls, expr, delim=',', max_split=None, skip_delims=None):
if not expr:
return
# collections.Counter() is ~10% slower in both 2.7 and 3.9
counters = {k: 0 for k in _MATCHING_PARENS.values()}
counters = dict((k, 0) for k in _MATCHING_PARENS.values())
start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
in_quote, escaping, skipping = None, False, 0
after_op, in_regex_char_group, skip_re = True, False, 0
after_op, in_regex_char_group = True, False
for idx, char in enumerate(expr):
if skip_re > 0:
skip_re -= 1
continue
paren_delta = 0
if not in_quote:
if char in _MATCHING_PARENS:
counters[_MATCHING_PARENS[char]] += 1
paren_delta = 1
elif char in counters:
counters[char] -= 1
paren_delta = -1
if not escaping:
if char in _QUOTES and in_quote in (char, None):
if in_quote or after_op or char != '/':
@ -268,7 +345,7 @@ class JSInterpreter(object):
elif in_quote == '/' and char in '[]':
in_regex_char_group = char == '['
escaping = not escaping and in_quote and char == '\\'
after_op = not in_quote and (char in cls.OP_CHARS or char == '[' or (char.isspace() and after_op))
after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and char.isspace()))
if char != delim[pos] or any(counters.values()) or in_quote:
pos = skipping = 0
@ -278,7 +355,7 @@ class JSInterpreter(object):
continue
elif pos == 0 and skip_delims:
here = expr[idx:]
for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]:
for s in variadic(skip_delims):
if here.startswith(s) and s:
skipping = len(s) - 1
break
@ -301,7 +378,7 @@ class JSInterpreter(object):
separated = list(cls._separate(expr, delim, 1))
if len(separated) < 2:
raise cls.Exception('No terminating paren {delim} in {expr:.100}'.format(**locals()))
raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals()))
return separated[0][1:].strip(), separated[1].strip()
@staticmethod
@ -326,9 +403,10 @@ class JSInterpreter(object):
return right_val
try:
# print('Eval:', opfunc.__name__, left_val, right_val)
return opfunc(left_val, right_val)
except Exception as e:
raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e)
raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e)
def _index(self, obj, idx, allow_undefined=False):
if idx == 'length':
@ -338,7 +416,7 @@ class JSInterpreter(object):
except Exception as e:
if allow_undefined:
return JS_Undefined
raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e)
raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e)
def _dump(self, obj, namespace):
try:
@ -346,12 +424,28 @@ class JSInterpreter(object):
except TypeError:
return self._named_object(namespace, obj)
# Pre-compiled patterns used by interpret_statement() below; compiling them
# once here replaces the inline re.match() calls on literal patterns.

# Matches the start of a statement: a `var`/`const`/`let` declaration
# (group `var`), a `return`, or a `throw` (group `throw`).
_VAR_RET_THROW_RE = re.compile(r'''(?x)
(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+)
''')
# Matches the opening of a compound statement; the named group that matched
# (`try`, `if`, `switch`, `for` or `while`) tells the caller which one it is.
# The match ends just after the opening `{` or `(`.
_COMPOUND_RE = re.compile(r'''(?x)
(?P<try>try)\s*\{|
(?P<if>if)\s*\(|
(?P<switch>switch)\s*\(|
(?P<for>for)\s*\(|
(?P<while>while)\s*\(
''')
# Matches the `finally {` that may follow a try/catch block.
_FINALLY_RE = re.compile(r'finally\s*\{')
# Matches a `switch (` appearing as the un-braced body of a loop.
_SWITCH_RE = re.compile(r'switch\s*\(')
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
if allow_recursion < 0:
raise self.Exception('Recursion limit reached')
allow_recursion -= 1
# print('At: ' + stmt[:60])
should_return = False
# fails on (eg) if (...) stmt1; else stmt2;
sub_statements = list(self._separate(stmt, ';')) or ['']
expr = stmt = sub_statements.pop().strip()
for sub_stmt in sub_statements:
@ -359,7 +453,7 @@ class JSInterpreter(object):
if should_return:
return ret, should_return
m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt)
m = self._VAR_RET_THROW_RE.match(stmt)
if m:
expr = stmt[len(m.group(0)):].strip()
if m.group('throw'):
@ -371,25 +465,30 @@ class JSInterpreter(object):
if expr[0] in _QUOTES:
inner, outer = self._separate(expr, expr[0], 1)
if expr[0] == '/':
flags, outer = self._regex_flags(outer)
inner = re.compile(inner[1:], flags=flags) # , strict=True))
flags, outer = self.JS_RegExp.regex_flags(outer)
inner = self.JS_RegExp(inner[1:], flags=flags)
else:
inner = json.loads(js_to_json(inner + expr[0])) # , strict=True))
if not outer:
return inner, should_return
expr = self._named_object(local_vars, inner) + outer
if expr.startswith('new '):
obj = expr[4:]
if obj.startswith('Date('):
left, right = self._separate_at_paren(obj[4:])
expr = unified_timestamp(
self.interpret_expression(left, local_vars, allow_recursion), False)
if not expr:
raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr)
expr = self._dump(int(expr * 1000), local_vars) + right
new_kw, _, obj = expr.partition('new ')
if not new_kw:
for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)),
('RegExp', self.JS_RegExp),
('Error', self.Exception)):
if not obj.startswith(klass + '('):
continue
left, right = self._separate_at_paren(obj[len(klass):])
argvals = self.interpret_iter(left, local_vars, allow_recursion)
expr = konstr(*argvals)
if expr is None:
raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr)
expr = self._dump(expr, local_vars) + right
break
else:
raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr)
raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr)
if expr.startswith('void '):
left = self.interpret_expression(expr[5:], local_vars, allow_recursion)
@ -412,8 +511,15 @@ class JSInterpreter(object):
expr = self._dump(inner, local_vars) + outer
if expr.startswith('('):
inner, outer = self._separate_at_paren(expr)
inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion)
m = re.match(r'\((?P<d>[a-z])%(?P<e>[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr)
if m:
# short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig`
outer = None
inner, should_abort = self._offset_e_by_d(m.group('d'), m.group('e'), local_vars)
else:
inner, outer = self._separate_at_paren(expr)
inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion)
if not outer or should_abort:
return inner, should_abort or should_return
else:
@ -426,13 +532,43 @@ class JSInterpreter(object):
for item in self._separate(inner)])
expr = name + outer
m = re.match(r'''(?x)
(?P<try>try)\s*\{|
(?P<switch>switch)\s*\(|
(?P<for>for)\s*\(
''', expr)
m = self._COMPOUND_RE.match(expr)
md = m.groupdict() if m else {}
if md.get('try'):
if md.get('if'):
cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
if expr.startswith('{'):
if_expr, expr = self._separate_at_paren(expr)
else:
# may lose ... else ... because of ll.368-374
if_expr, expr = self._separate_at_paren(expr, delim=';')
else_expr = None
m = re.match(r'else\s*(?P<block>\{)?', expr)
if m:
if m.group('block'):
else_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
else:
# handle subset ... else if (...) {...} else ...
# TODO: make interpret_statement do this properly, if possible
exprs = list(self._separate(expr[m.end():], delim='}', max_split=2))
if len(exprs) > 1:
if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]):
else_expr = exprs[0] + '}' + exprs[1]
expr = (exprs[2] + '}') if len(exprs) == 3 else None
else:
else_expr = exprs[0]
exprs.append('')
expr = '}'.join(exprs[1:])
else:
else_expr = exprs[0]
expr = None
else_expr = else_expr.lstrip() + '}'
cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion))
ret, should_abort = self.interpret_statement(
if_expr if cndn else else_expr, local_vars, allow_recursion)
if should_abort:
return ret, True
elif md.get('try'):
try_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
err = None
try:
@ -455,7 +591,7 @@ class JSInterpreter(object):
err = None
pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion)
m = re.match(r'finally\s*\{', expr)
m = self._FINALLY_RE.match(expr)
if m:
sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
@ -469,23 +605,24 @@ class JSInterpreter(object):
if err:
raise err
elif md.get('for'):
constructor, remaining = self._separate_at_paren(expr[m.end() - 1:])
elif md.get('for') or md.get('while'):
init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:])
if remaining.startswith('{'):
body, expr = self._separate_at_paren(remaining)
else:
switch_m = re.match(r'switch\s*\(', remaining) # FIXME
switch_m = self._SWITCH_RE.match(remaining) # FIXME
if switch_m:
switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:])
body, expr = self._separate_at_paren(remaining, '}')
body = 'switch(%s){%s}' % (switch_val, body)
else:
body, expr = remaining, ''
start, cndn, increment = self._separate(constructor, ';')
self.interpret_expression(start, local_vars, allow_recursion)
while True:
if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)):
break
if md.get('for'):
start, cndn, increment = self._separate(init_or_cond, ';')
self.interpret_expression(start, local_vars, allow_recursion)
else:
cndn, increment = init_or_cond, None
while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)):
try:
ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion)
if should_abort:
@ -494,7 +631,8 @@ class JSInterpreter(object):
break
except JS_Continue:
pass
self.interpret_expression(increment, local_vars, allow_recursion)
if increment:
self.interpret_expression(increment, local_vars, allow_recursion)
elif md.get('switch'):
switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:])
@ -593,7 +731,7 @@ class JSInterpreter(object):
elif expr == 'undefined':
return JS_Undefined, should_return
elif expr == 'NaN':
return float('NaN'), should_return
return _NaN, should_return
elif md.get('return'):
return local_vars[m.group('name')], should_return
@ -620,9 +758,24 @@ class JSInterpreter(object):
continue
right_expr = separated.pop()
while op == '-' and len(separated) > 1 and not separated[-1].strip():
right_expr = '-' + right_expr
separated.pop()
# handle operators that are both unary and binary, minimal BODMAS
if op in ('+', '-'):
undone = 0
while len(separated) > 1 and not separated[-1].strip():
undone += 1
separated.pop()
if op == '-' and undone % 2 != 0:
right_expr = op + right_expr
left_val = separated[-1]
for dm_op in ('*', '%', '/', '**'):
bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim))
if len(bodmas) > 1 and not bodmas[-1].strip():
expr = op.join(separated) + op + right_expr
right_expr = None
break
if right_expr is None:
continue
left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion)
return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return
@ -640,7 +793,7 @@ class JSInterpreter(object):
""" assert, but without risk of getting optimized out """
if not cndn:
memb = member
raise self.Exception('{member} {msg}'.format(**locals()), expr=expr)
raise self.Exception('{memb} {msg}'.format(**locals()), expr=expr)
def eval_method():
if (variable, member) == ('console', 'debug'):
@ -676,7 +829,7 @@ class JSInterpreter(object):
if obj == compat_str:
if member == 'fromCharCode':
assertion(argvals, 'takes one or more arguments')
return ''.join(map(chr, argvals))
return ''.join(map(compat_chr, argvals))
raise self.Exception('Unsupported string method ' + member, expr=expr)
elif obj == float:
if member == 'pow':
@ -749,6 +902,17 @@ class JSInterpreter(object):
if idx >= len(obj):
return None
return ord(obj[idx])
elif member in ('replace', 'replaceAll'):
assertion(isinstance(obj, compat_str), 'must be applied on a string')
assertion(len(argvals) == 2, 'takes exactly two arguments')
# TODO: argvals[1] callable, other Py vs JS edge cases
if isinstance(argvals[0], self.JS_RegExp):
count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1
assertion(member != 'replaceAll' or count == 0,
'replaceAll must be called with a global RegExp')
return argvals[0].sub(argvals[1], obj, count=count)
count = ('replaceAll', 'replace').index(member)
return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count)
idx = int(member) if isinstance(obj, list) else member
return obj[idx](argvals, allow_recursion=allow_recursion)
@ -780,19 +944,26 @@ class JSInterpreter(object):
raise self.Exception('Cannot return from an expression', expr)
return ret
def interpret_iter(self, list_txt, local_vars, allow_recursion):
    """Lazily evaluate each item of a delimited expression list.

    `list_txt` is split into items by `self._separate()` using its default
    delimiter; each item is then evaluated with
    `self.interpret_expression()` against `local_vars`, and the resulting
    values are yielded one at a time, in order.
    """
    for item_txt in self._separate(list_txt):
        value = self.interpret_expression(item_txt, local_vars, allow_recursion)
        yield value
def extract_object(self, objname):
_FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
obj = {}
obj_m = re.search(
r'''(?x)
(?<!this\.)%s\s*=\s*{\s*
(?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
}\s*;
''' % (re.escape(objname), _FUNC_NAME_RE),
self.code)
if not obj_m:
fields = None
for obj_m in re.finditer(
r'''(?xs)
{0}\s*\.\s*{1}|{1}\s*=\s*\{{\s*
(?P<fields>({2}\s*:\s*function\s*\(.*?\)\s*\{{.*?}}(?:,\s*)?)*)
}}\s*;
'''.format(_NAME_RE, re.escape(objname), _FUNC_NAME_RE),
self.code):
fields = obj_m.group('fields')
if fields:
break
else:
raise self.Exception('Could not find object ' + objname)
fields = obj_m.group('fields')
# Currently, it only supports function definitions
fields_m = re.finditer(
r'''(?x)
@ -805,6 +976,17 @@ class JSInterpreter(object):
return obj
@staticmethod
def _offset_e_by_d(d, e, local_vars):
    """ Short-cut eval: (d%e.length+e.length)%e.length

    `d` and `e` are variable names looked up in `local_vars`.
    Returns `(value, False)` on success; if anything goes wrong (missing
    variable, value without a length, arithmetic failure) returns
    `(None, True)` instead of raising.
    """
    try:
        offset = local_vars[d]
        length = len(local_vars[e])
        # the inner modulo may be negative; adding the length before the
        # outer modulo maps it back into the range of valid indices
        wrapped = _js_mod(_js_mod(offset, length) + length, length)
    except Exception:
        return None, True
    return wrapped, False
def extract_function_code(self, funcname):
""" @returns argnames, code """
func_m = re.search(
@ -817,13 +999,15 @@ class JSInterpreter(object):
\((?P<args>[^)]*)\)\s*
(?P<code>{.+})''' % {'name': re.escape(funcname)},
self.code)
code, _ = self._separate_at_paren(func_m.group('code')) # refine the match
if func_m is None:
raise self.Exception('Could not find JS function "{funcname}"'.format(**locals()))
code, _ = self._separate_at_paren(func_m.group('code')) # refine the match
return self.build_arglist(func_m.group('args')), code
def extract_function(self, funcname):
return self.extract_function_from_code(*self.extract_function_code(funcname))
return function_with_repr(
self.extract_function_from_code(*self.extract_function_code(funcname)),
'F<%s>' % (funcname, ))
def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {}

View file

@ -11,6 +11,7 @@ from .compat import (
compat_get_terminal_size,
compat_getenv,
compat_kwargs,
compat_open as open,
compat_shlex_split,
)
from .utils import (
@ -41,14 +42,11 @@ def _hide_login_info(opts):
def parseOpts(overrideArguments=None):
def _readOptions(filename_bytes, default=[]):
try:
optionf = open(filename_bytes)
optionf = open(filename_bytes, encoding=preferredencoding())
except IOError:
return default # silently skip if file is not present
try:
# FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
contents = optionf.read()
if sys.version_info < (3,):
contents = contents.decode(preferredencoding())
res = compat_shlex_split(contents, comments=True)
finally:
optionf.close()
@ -546,12 +544,14 @@ def parseOpts(overrideArguments=None):
workarounds.add_option(
'--referer',
metavar='URL', dest='referer', default=None,
help='Specify a custom referer, use if the video access is restricted to one domain',
help='Specify a custom Referer: use if the video access is restricted to one domain',
)
workarounds.add_option(
'--add-header',
metavar='FIELD:VALUE', dest='headers', action='append',
help='Specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times',
help=('Specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times. '
'NB Use --cookies rather than adding a Cookie header if its contents may be sensitive; '
'data from a Cookie header will be sent to all domains, not just the one intended')
)
workarounds.add_option(
'--bidi-workaround',
@ -733,9 +733,13 @@ def parseOpts(overrideArguments=None):
'--no-part',
action='store_true', dest='nopart', default=False,
help='Do not use .part files - write directly into output file')
filesystem.add_option(
'--mtime',
action='store_true', dest='updatetime', default=True,
help='Use the Last-modified header to set the file modification time (default)')
filesystem.add_option(
'--no-mtime',
action='store_false', dest='updatetime', default=True,
action='store_false', dest='updatetime',
help='Do not use the Last-modified header to set the file modification time')
filesystem.add_option(
'--write-description',

View file

@ -18,6 +18,8 @@ from ..utils import (
shell_quote,
)
from ..compat import compat_open as open
class EmbedThumbnailPPError(PostProcessingError):
pass

View file

@ -1,6 +1,5 @@
from __future__ import unicode_literals
import io
import os
import subprocess
import time
@ -9,6 +8,7 @@ import re
from .common import AudioConversionError, PostProcessor
from ..compat import compat_open as open
from ..utils import (
encodeArgument,
encodeFilename,
@ -493,7 +493,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
chapters = info.get('chapters', [])
if chapters:
metadata_filename = replace_extension(filename, 'meta')
with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
with open(metadata_filename, 'w', encoding='utf-8') as f:
def ffmpeg_escape(text):
return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
@ -636,7 +636,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
with open(dfxp_file, 'rb') as f:
srt_data = dfxp2srt(f.read())
with io.open(srt_file, 'wt', encoding='utf-8') as f:
with open(srt_file, 'w', encoding='utf-8') as f:
f.write(srt_data)
old_file = srt_file
@ -652,7 +652,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
self.run_ffmpeg(old_file, new_file, ['-f', new_format])
with io.open(new_file, 'rt', encoding='utf-8') as f:
with open(new_file, 'r', encoding='utf-8') as f:
subs[lang] = {
'ext': new_ext,
'data': f.read(),

View file

@ -727,7 +727,7 @@ class SWFInterpreter(object):
stack.append(res)
continue
assert isinstance(obj, (dict, _ScopeDict)),\
assert isinstance(obj, (dict, _ScopeDict)), \
'Accessing member %r on %r' % (pname, obj)
res = obj.get(pname, undefined)
stack.append(res)

View file

@ -1,6 +1,5 @@
from __future__ import unicode_literals
import io
import json
import traceback
import hashlib
@ -9,7 +8,10 @@ import subprocess
import sys
from zipimport import zipimporter
from .compat import compat_realpath
from .compat import (
compat_open as open,
compat_realpath,
)
from .utils import encode_compat_str
from .version import __version__
@ -127,7 +129,7 @@ def update_self(to_screen, verbose, opener):
try:
bat = os.path.join(directory, 'youtube-dl-updater.bat')
with io.open(bat, 'w') as batfile:
with open(bat, 'w') as batfile:
batfile.write('''
@echo off
echo Waiting for file handle to be closed ...

File diff suppressed because it is too large Load diff