youtube-dl/youtube_dl/extractor/voe.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_base64_b64decode
from ..utils import (
    int_or_none,
    js_to_json,
    url_or_none,
)


class VOEIE(InfoExtractor):
    """Information extractor for videos hosted on voe.sx.

    Accepts both ``https://voe.sx/<id>`` and ``https://voe.sx/e/<id>`` URLs,
    always fetches the ``/e/<id>`` page, optionally follows a redirect to a
    player page on another host, and builds formats from the ``sources``
    object found in the page.
    """
    IE_NAME = 'voe'
    IE_DESC = 'VOE.SX'
    # 'path' captures the whole '/<id>' or '/e/<id>' part; it is reused below
    # to recognise the redirect target, which ends with the same path.
    _VALID_URL = r'https?://voe\.sx(?P<path>/(e/)?(?P<id>[a-z0-9]+))'
    _TESTS = [{
        'url': 'https://voe.sx/e/ng7ja5n5n2y8',
        'info_dict': {
            'id': 'ng7ja5n5n2y8',
            'title': 'md5:8dd774de9b73851151d80ef6baaea7f1',
            'thumbnail': r're:^https?://.*\.jpg$',
            'ext': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Extract video information for a voe.sx URL.

        :param url: a URL matching ``_VALID_URL``.
        :return: an info dict with the keys ``id``, ``title``, ``formats``
            and ``thumbnail`` (``thumbnail`` is None when no poster URL is
            found in the page).

        The download/search helpers called here are inherited from
        InfoExtractor; presumably they raise the extractor's usual errors
        when a mandatory item (page, sources, title) is missing -- confirm
        against the base class.
        """
        video_id, video_path = self._match_valid_url(url).group('id', 'path')

        webpage = self._download_webpage(
            'https://voe.sx/e/%s' % video_id, video_id)

        # The first page may only contain a quoted https URL on a host other
        # than voe.sx, ending with the same path and followed by ';'. If such
        # a URL is present, the real player page is fetched from there.
        player_url = self._search_regex(
            r'''("|')(?P<url>https://(?!voe\.sx/)[^/]+%s)\1\s*;''' % (video_path,),
            webpage, 'redirect', group='url', default=None)
        if player_url:
            webpage = self._download_webpage(
                player_url, video_id, note='Redirecting to player page')

        sources = self._search_json(
            r'\bsources\s*=', webpage, 'sources', video_id, transform_source=js_to_json)

        title = self._search_regex(
            r'<title>(?:Watch\s+)?(?P<title>.+?)(?:-\s+VOE\s+\|.+)?</title>',
            webpage, 'title', group='title')

        # Normalise the 'mp4' and 'hls' entries in place: each value is either
        # a plain URL or a base64-encoded URL. Anything that is neither
        # becomes None so that the format is skipped below.
        for fmt in ('mp4', 'hls'):
            if fmt not in sources:
                continue
            fmt_url = url_or_none(sources[fmt])
            if not fmt_url:
                try:
                    fmt_url = url_or_none(
                        compat_base64_b64decode(sources[fmt]).decode('utf-8'))
                except (TypeError, ValueError):
                    # Not valid base64 / not UTF-8 / not a string at all:
                    # treat the entry as unusable rather than crashing.
                    fmt_url = None
            sources[fmt] = fmt_url

        formats = []

        # Looked up outside the loop above so that the name is always bound,
        # even when the page lists neither 'mp4' nor 'hls'.
        hls_url = sources.get('hls')
        if hls_url:
            formats.extend(self._extract_m3u8_formats(
                hls_url, video_id, entry_protocol='m3u8_native', fatal=False))

        mp4_url = sources.get('mp4')
        if mp4_url:
            formats.append({
                'url': mp4_url,
                'ext': 'mp4',
                'height': int_or_none(sources.get('video_height')),
            })

        self._sort_formats(formats)

        # The poster is given either as a 'VOEPlayer.poster = "..."'
        # assignment or as a 'data-poster="..."' attribute.
        thumbnail = url_or_none(self._search_regex(
            r'''(?:VOEPlayer\s*\.\s*|data-)poster\s*=\s*("|')(?P<thumbnail>(?:(?!\1)\S)+)\1''',
            webpage, 'thumbnail', group='thumbnail', default=None))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
        }
[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`from .common import InfoExtractor`
Linty 2024-11-18 03:36:46 +00:00			`from ..compat import compat_base64_b64decode`
Fix extraction 2022-11-17 19:23:43 +00:00			`from ..utils import (`
			`int_or_none,`
			`js_to_json,`
			`url_or_none,`
			`)`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00
Linty 2022-11-17 19:29:59 +00:00
[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`class VOEIE(InfoExtractor):`
			`IE_NAME = 'voe'`
			`IE_DESC = 'VOE.SX'`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`_VALID_URL = r'https?://voe\.sx(?P<path>/(e/)?(?P<id>[a-z0-9]+))'`
			`_TESTS = [{`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`'url': 'https://voe.sx/e/ng7ja5n5n2y8',`
			`'info_dict': {`
			`'id': 'ng7ja5n5n2y8',`
Make test pass 2024-11-18 03:30:38 +00:00			`'title': 'md5:8dd774de9b73851151d80ef6baaea7f1',`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`'thumbnail': r're:^https?://.*\.jpg$',`
			`'ext': 'm3u8',`
			`},`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`}]`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00
			`def _real_extract(self, url):`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`video_id, video_path = self._match_valid_url(url).group('id', 'path')`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00
			`webpage = self._download_webpage(`
			`'https://voe.sx/e/%s' % video_id, video_id)`

Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`player_url = self._search_regex(`
Linty 2024-11-18 02:48:29 +00:00			`r'''("\|')(?P<url>https://(?!voe\.sx/)[^/]+%s)\1\s*;''' % (video_path,),`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`webpage, 'redirect', group='url', default=None)`
			`if player_url:`
			`webpage = self._download_webpage(`
			`player_url, video_id, note='Redirecting to player page')`

			`sources = self._search_json(`
Linty 2024-11-18 02:48:29 +00:00			`r'\bsources\s*=', webpage, 'sources', video_id, transform_source=js_to_json)`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00
			`title = self._search_regex(`
Fix extraction 2022-11-17 19:23:43 +00:00			`r'<title>(?:Watch\s+)?(?P<title>.+?)(?:-\s+VOE\s+\\|.+)?</title>',`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`webpage, 'title', group='title')`

Fix extraction 2022-11-17 19:23:43 +00:00			`formats = []`
Fix quoting 2024-11-18 02:39:56 +00:00			`for fmt in ('mp4', 'hls'):`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`if fmt not in sources:`
			`continue`
Make test pass 2024-11-18 03:30:38 +00:00			`sources[fmt] = url_or_none(sources[fmt]) or url_or_none(compat_base64_b64decode(sources[fmt]).decode('utf-8'))`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`f_url = sources.get('hls')`
Fix extraction 2022-11-17 19:23:43 +00:00			`if f_url:`
			`formats.extend(self._extract_m3u8_formats(`
			`f_url, video_id, entry_protocol='m3u8_native', fatal=False))`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`f_url = sources.get('mp4')`
Fix extraction 2022-11-17 19:23:43 +00:00			`if f_url:`
			`formats.append({`
			`'url': f_url,`
			`'ext': 'mp4',`
			`'height': int_or_none(sources.get('video_height')),`
			`})`
[voe] Add new extractor 2021-07-22 10:46:23 +00:00
			`self._sort_formats(formats)`

Fix extraction 2022-11-17 19:23:43 +00:00			`thumbnail = url_or_none(self._search_regex(`
Update extractor for current site behaviour 2024-11-18 02:28:15 +00:00			`r'''(?:VOEPlayer\s*\.\s*\|data-)poster\s*=\s*("\|')(?P<thumbnail>(?:(?!\1)\S)+)\1''',`
Fix extraction 2022-11-17 19:23:43 +00:00			`webpage, 'thumbnail', group='thumbnail', default=None))`

[voe] Add new extractor 2021-07-22 10:46:23 +00:00			`return {`
			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
			`'thumbnail': thumbnail,`
			`}`