youtube-dl/youtube_dl/extractor/gamespot.py

from __future__ import unicode_literals

import re

from .once import OnceIE
from ..compat import (
    compat_urllib_parse_unquote,
)
from ..utils import (
    unescapeHTML,
    url_basename,
    dict_get,
)


class GameSpotIE(OnceIE):
    """Information extractor for video pages on gamespot.com.

    The page embeds a JSON blob in a ``data-video`` attribute whose
    ``videoStreams`` mapping may contain an HDS manifest (``f4m_stream``),
    an HLS manifest (``m3u8_stream``), progressive download URLs
    (``progressive_hd`` / ``progressive_high`` / ``progressive_low``) and
    direct FLV links (``f4m_sd`` / ``f4m_hd``).  An optional
    ``data-onceux-options`` attribute supplies a ``metadataUri`` that is
    handed to ``_extract_once_formats``.
    """
    _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
    _TESTS = [{
        'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
        'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
        'info_dict': {
            'id': 'gs-2300-6410818',
            'ext': 'mp4',
            'title': 'Arma 3 - Community Guide: SITREP I',
            'description': 'Check out this video where some of the basics of Arma 3 is explained.',
        },
    }, {
        'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
        'info_dict': {
            'id': 'gs-2300-6424837',
            'ext': 'mp4',
            'title': 'Now Playing - The Witcher 3: Wild Hunt',
            'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
        },
        'params': {
            'skip_download': True,  # m3u8 downloads
        },
    }]

    def _real_extract(self, url):
        """Download the page at ``url`` and build its info dict.

        Returns a dict with ``id``, ``display_id``, ``title``, ``formats``,
        ``description`` and ``thumbnail`` keys.  A missing ``data-video``
        attribute, ``videoStreams``, ``guid`` or ``title`` entry propagates
        as the error raised by the corresponding lookup.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        data_video_json = self._search_regex(
            r'data-video=["\'](.*?)["\']', webpage, 'data video')
        data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
        streams = data_video['videoStreams']

        manifest_url = None
        formats = []
        # Must exist even when the page has no m3u8 stream: the progressive
        # branch below reads it, and leaving it unbound used to raise
        # UnboundLocalError for pages offering only f4m + progressive URLs.
        m3u8_formats = []

        f4m_url = streams.get('f4m_stream')
        if f4m_url:
            manifest_url = f4m_url
            formats.extend(self._extract_f4m_formats(
                f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))

        m3u8_url = streams.get('m3u8_stream')
        if m3u8_url:
            # When both manifests exist the m3u8 URL wins as the source of
            # the quality list used for the progressive formats.
            manifest_url = m3u8_url
            m3u8_formats = self._extract_m3u8_formats(
                m3u8_url, page_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False)
            formats.extend(m3u8_formats)

        progressive_url = dict_get(
            streams, ('progressive_hd', 'progressive_high', 'progressive_low'))
        if progressive_url and manifest_url:
            # The manifest path contains a "<name>,q1,q2,...,.csmil" segment
            # listing the available bitrates.
            qualities_basename = self._search_regex(
                r'/([^/]+)\.csmil/',
                manifest_url, 'qualities basename', default=None)
            if qualities_basename:
                QUALITIES_RE = r'((,\d+)+,?)'
                qualities = self._search_regex(
                    QUALITIES_RE, qualities_basename,
                    'qualities', default=None)
                if qualities:
                    qualities = sorted(
                        int(q) for q in qualities.strip(',').split(','))
                    # Replace the quality list with a single %d placeholder
                    # to obtain the per-bitrate file name template.
                    http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
                    http_url_basename = url_basename(progressive_url)
                    if m3u8_formats:
                        # Sort so that the HLS variants line up with the
                        # ascending quality list, then drop audio-only ones.
                        self._sort_formats(m3u8_formats)
                        m3u8_formats = [
                            fmt for fmt in m3u8_formats
                            if fmt.get('vcodec') != 'none']
                    if len(qualities) == len(m3u8_formats):
                        # One HLS variant per bitrate: reuse its metadata
                        # for the matching progressive HTTP format.
                        for q, m3u8_format in zip(qualities, m3u8_formats):
                            f = m3u8_format.copy()
                            f.update({
                                'url': progressive_url.replace(
                                    http_url_basename, http_template % q),
                                'format_id': f['format_id'].replace('hls', 'http'),
                                'protocol': 'http',
                            })
                            formats.append(f)
                    else:
                        # No usable HLS metadata: emit bare HTTP formats
                        # identified only by their bitrate.
                        for q in qualities:
                            formats.append({
                                'url': progressive_url.replace(
                                    http_url_basename, http_template % q),
                                'ext': 'mp4',
                                'format_id': 'http-%d' % q,
                                'tbr': q,
                            })

        onceux_json = self._search_regex(
            r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None)
        if onceux_json:
            onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
            if onceux_url:
                # Point the metadata URI at the Once host and strip the
                # ad/VMAP path component before extracting formats.
                formats.extend(self._extract_once_formats(re.sub(
                    r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', '')))

        if not formats:
            # Last resort: the f4m_sd / f4m_hd entries.
            for quality in ['sd', 'hd']:
                # It's actually a link to a flv file
                flv_url = streams.get('f4m_{0}'.format(quality))
                if flv_url is not None:
                    formats.append({
                        'url': flv_url,
                        'ext': 'flv',
                        'format_id': quality,
                    })
        self._sort_formats(formats)

        return {
            'id': data_video['guid'],
            'display_id': page_id,
            'title': compat_urllib_parse_unquote(data_video['title']),
            'formats': formats,
            'description': self._html_search_meta('description', webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
[gamespot] Use unicode_literals 2014-01-17 02:13:40 +00:00			`from __future__ import unicode_literals`

Added an IE for gamespot. Although gamespot allows downloading but it is only available to registered users. With this IE no registration is required. 2013-06-28 05:34:01 +00:00			`import re`

[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`from .once import OnceIE`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 11:24:42 +00:00			`from ..compat import (`
[gamespot] Use compat_urllib_parse_unquote 2015-07-17 17:38:30 +00:00			`compat_urllib_parse_unquote,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 11:24:42 +00:00			`)`
			`from ..utils import (`
[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00			`unescapeHTML,`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`url_basename,`
			`dict_get,`
Added an IE for gamespot. Although gamespot allows downloading but it is only available to registered users. With this IE no registration is required. 2013-06-28 05:34:01 +00:00			`)`

[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`class GameSpotIE(OnceIE):`
Add support for https for all extractors as preventive and future-proof measure 2016-03-21 15:36:32 +00:00			`_VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`_TESTS = [{`
[gamespot] Update test URL and modernize 2014-05-14 13:13:34 +00:00			`'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',`
			`'md5': 'b2a30deaa8654fcccd43713a6b6a4825',`
			`'info_dict': {`
			`'id': 'gs-2300-6410818',`
			`'ext': 'mp4',`
			`'title': 'Arma 3 - Community Guide: SITREP I',`
[gamespot] Use unicode_literals 2014-01-17 02:13:40 +00:00			`'description': 'Check out this video where some of the basics of Arma 3 is explained.',`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`},`
			`}, {`
			`'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',`
			`'info_dict': {`
			`'id': 'gs-2300-6424837',`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 21:39:39 +00:00			`'ext': 'mp4',`
			`'title': 'Now Playing - The Witcher 3: Wild Hunt',`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',`
			`},`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 21:39:39 +00:00			`'params': {`
			`'skip_download': True, # m3u8 downloads`
			`},`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`}]`
Added an IE for gamespot. Although gamespot allows downloading but it is only available to registered users. With this IE no registration is required. 2013-06-28 05:34:01 +00:00
			`def _real_extract(self, url):`
[gamespot] Modernize 2014-11-04 22:04:12 +00:00			`page_id = self._match_id(url)`
GameSpotIE: support more urls and download videos in the best quality 2013-07-09 18:07:52 +00:00			`webpage = self._download_webpage(url, page_id)`
[gamespot] Modernize 2014-11-04 22:04:12 +00:00			`data_video_json = self._search_regex(`
			`r'data-video=["\'](.*?)["\']', webpage, 'data video')`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`data_video = self._parse_json(unescapeHTML(data_video_json), page_id)`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`streams = data_video['videoStreams']`
Added an IE for gamespot. Although gamespot allows downloading but it is only available to registered users. With this IE no registration is required. 2013-06-28 05:34:01 +00:00
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`manifest_url = None`
[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00			`formats = []`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`f4m_url = streams.get('f4m_stream')`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`if f4m_url:`
			`manifest_url = f4m_url`
			`formats.extend(self._extract_f4m_formats(`
			`f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))`
			`m3u8_url = streams.get('m3u8_stream')`
			`if m3u8_url:`
			`manifest_url = m3u8_url`
			`m3u8_formats = self._extract_m3u8_formats(`
			`m3u8_url, page_id, 'mp4', 'm3u8_native',`
			`m3u8_id='hls', fatal=False)`
			`formats.extend(m3u8_formats)`
			`progressive_url = dict_get(`
			`streams, ('progressive_hd', 'progressive_high', 'progressive_low'))`
			`if progressive_url and manifest_url:`
			`qualities_basename = self._search_regex(`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`r'/([^/]+)\.csmil/',`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`manifest_url, 'qualities basename', default=None)`
			`if qualities_basename:`
			`QUALITIES_RE = r'((,\d+)+,?)'`
			`qualities = self._search_regex(`
			`QUALITIES_RE, qualities_basename,`
			`'qualities', default=None)`
			`if qualities:`
			`qualities = list(map(lambda q: int(q), qualities.strip(',').split(',')))`
			`qualities.sort()`
			`http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)`
			`http_url_basename = url_basename(progressive_url)`
			`if m3u8_formats:`
			`self._sort_formats(m3u8_formats)`
			`m3u8_formats = list(filter(`
Don't list master m3u8 playlists in format list (closes #12832) 2017-04-25 15:07:10 +00:00			`lambda f: f.get('vcodec') != 'none', m3u8_formats))`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`if len(qualities) == len(m3u8_formats):`
			`for q, m3u8_format in zip(qualities, m3u8_formats):`
			`f = m3u8_format.copy()`
			`f.update({`
			`'url': progressive_url.replace(`
			`http_url_basename, http_template % q),`
			`'format_id': f['format_id'].replace('hls', 'http'),`
			`'protocol': 'http',`
			`})`
			`formats.append(f)`
			`else:`
			`for q in qualities:`
			`formats.append({`
			`'url': progressive_url.replace(`
			`http_url_basename, http_template % q),`
			`'ext': 'mp4',`
			`'format_id': 'http-%d' % q,`
			`'tbr': q,`
			`})`

			`onceux_json = self._search_regex(`
			`r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None)`
			`if onceux_json:`
			`onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')`
			`if onceux_url:`
			`formats.extend(self._extract_once_formats(re.sub(`
			`r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', '')))`

			`if not formats:`
[gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) 2015-05-15 16:42:59 +00:00			`for quality in ['sd', 'hd']:`
			`# It's actually a link to a flv file`
			`flv_url = streams.get('f4m_{0}'.format(quality))`
			`if flv_url is not None:`
			`formats.append({`
			`'url': flv_url,`
			`'ext': 'flv',`
			`'format_id': quality,`
			`})`
[gamespot] extract all formats 2016-06-21 12:36:56 +00:00			`self._sort_formats(formats)`
Added an IE for gamespot. Although gamespot allows downloading but it is only available to registered users. With this IE no registration is required. 2013-06-28 05:34:01 +00:00
Remove the compatibility code used before the new format system was implemented 2013-12-03 13:21:06 +00:00			`return {`
[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00			`'id': data_video['guid'],`
[gamespot] Modernize 2014-11-04 22:04:12 +00:00			`'display_id': page_id,`
[gamespot] Use compat_urllib_parse_unquote 2015-07-17 17:38:30 +00:00			`'title': compat_urllib_parse_unquote(data_video['title']),`
[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00			`'formats': formats,`
[gamespot] Modernize 2014-11-04 22:04:12 +00:00			`'description': self._html_search_meta('description', webpage),`
[gamespot] Fix video extraction (fixes #1587) 2013-10-14 14:25:04 +00:00			`'thumbnail': self._og_search_thumbnail(webpage),`
			`}`