[heise] Add new extractor

2024-11-29 05:32:00 +00:00 · 2014-09-27 19:28:01 +02:00 · 2014-09-27 19:28:01 +02:00 · 0155549d6c
commit 0155549d6c
parent 11b3ce8509
2 changed files with 121 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -141,6 +141,7 @@ from .gorillavid import GorillaVidIE
 from .goshgay import GoshgayIE
 from .grooveshark import GroovesharkIE
 from .hark import HarkIE
 from .heise import HeiseIE
 from .helsinki import HelsinkiIE
 from .hentaistigma import HentaiStigmaIE
 from .hornbunny import HornBunnyIE
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@ -0,0 +1,120 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    compat_urllib_parse,
    get_meta_content,
    parse_iso8601,
 )
 class HeiseIE(InfoExtractor):
    """Information extractor for video articles on heise.de.

    The article HTML is searched for a quoted URL whose query string
    carries the ``sequenz``/``container`` (and optionally ``hd`` and
    ``signature``) parameters.  Those parameters are re-issued against
    ``_PREFIX`` to download a JSON document from which the formats, and
    optionally a title and a poster image, are read.
    """

    _VALID_URL = (
        r'^https?://(?:www\.)?heise\.de/video/artikel/' +
        r'.+?(?P<id>[0-9]+)\.html$'
    )
    _TEST = {
        'url': (
            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
            'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
        ),
        'md5': 'ffed432483e922e88545ad9f2f15d30e',
        'info_dict': {
            'id': '2404147',
            'ext': 'mp4',
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
                "Peilsender Smartphone"
            ),
            'format_id': 'mp4_720',
            'timestamp': 1411812600,
            'upload_date': '20140927',
        }
    }

    # Matches the quoted config URL inside the article HTML and captures
    # its query parameters; ``hd`` and ``signature`` are optional.
    _CONFIG = (
        r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
        r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
    )
    # Endpoint the captured parameters are appended to as a query string.
    _PREFIX = 'http://www.heise.de/videout/info?'

    # Types accepted as "text".  On Python 2 decoded JSON strings are
    # ``unicode``, which is not an instance of ``str``; checking against
    # ``str`` there rejected every URL.  ``basestring`` covers both Python 2
    # string types and does not exist on Python 3, hence the fallback.
    try:
        _TEXT_TYPES = (basestring,)
    except NameError:
        _TEXT_TYPES = (str,)

    def _warn(self, fmt, *args):
        """Report a warning for the video currently being extracted.

        ``fmt`` is a ``str.format`` template that is filled with ``args``.
        The message is reported together with ``self._id``, so
        ``_real_extract`` must have stored the id before this is called.
        """
        self.report_warning(fmt.format(*args), self._id)

    def _is_text(self, value):
        """Return True if ``value`` is a text string on this Python version."""
        return isinstance(value, self._TEXT_TYPES)

    def _parse_config_url(self, html):
        """Build the JSON config URL from the parameters found in ``html``.

        Raises ExtractorError if ``_CONFIG`` does not match anywhere in
        the page.
        """
        m = re.search(self._CONFIG, html)
        if not m:
            raise ExtractorError('No config found')

        # Optional groups that did not take part in the match are None and
        # must not end up in the query string.
        params = dict((key, value) for key, value in m.groupdict().items()
                      if value is not None)
        return self._PREFIX + compat_urllib_parse.urlencode(params)

    def _extract_title(self, html, config):
        """Pick a title: ``fulltitle`` meta tag, then config, then 'heise'."""
        title = get_meta_content('fulltitle', html) or config.get('title')
        if title:
            return title
        self._warn('title: not found')
        return 'heise'

    def _extract_formats(self, config):
        """Turn ``config['formats']`` into a sorted list of format dicts.

        ``config['formats']`` is expected to map a format type to a mapping
        of resolution -> object with a ``url`` entry.  Malformed entries
        are skipped with a warning.

        Raises ExtractorError if ``config`` has no usable ``formats``
        mapping at all.
        """
        format_map = config.get('formats')
        if not format_map or not hasattr(format_map, 'items'):
            raise ExtractorError('No formats found')

        formats = []
        for kind, resolutions in format_map.items():
            if not resolutions or not hasattr(resolutions, 'items'):
                self._warn('formats: {0}: no resolutions', kind)
                continue

            for res, obj in resolutions.items():
                format_id = '{0}_{1}'.format(kind, res)

                # Guard the lookup so a non-mapping entry is skipped with
                # a warning instead of raising AttributeError.
                media_url = obj.get('url') if hasattr(obj, 'get') else None
                if not media_url or not self._is_text(media_url):
                    self._warn('formats: {0}: no url', format_id)
                    continue

                fmt = {
                    'url': media_url,
                    'format_id': format_id,
                }
                try:
                    fmt['height'] = int(res)
                except ValueError as e:
                    # A non-numeric resolution key only costs us the height;
                    # the format itself is still usable.
                    self._warn('formats: {0}: height: {1}', format_id, e)
                formats.append(fmt)

        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        """Extract the info dict (id, title, formats, ...) for ``url``."""
        mobj = re.match(self._VALID_URL, url)
        # _warn() reads self._id, so store it before anything can warn.
        video_id = self._id = mobj.group('id')

        html = self._download_webpage(url, video_id)
        config = self._download_json(self._parse_config_url(html), video_id)

        info = {
            'id': video_id,
            'title': self._extract_title(html, config),
            'formats': self._extract_formats(config),
        }

        poster = config.get('poster')
        if poster and self._is_text(poster):
            info['thumbnail'] = poster

        date = get_meta_content('date', html)
        if date and self._is_text(date):
            try:
                info['timestamp'] = parse_iso8601(date)
            except ValueError as e:
                self._warn('timestamp: {0}', e)

        return info