[DHM] Add new extractor

2024-11-26 20:22:14 +00:00 · 2015-03-28 10:38:52 +01:00 · 2015-03-28 10:38:52 +01:00 · 643fe72717
commit 643fe72717
parent 4747e2183a
2 changed files with 53 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -106,6 +106,7 @@ from .dbtv import DBTVIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
 from .dreisat import DreiSatIE
--- a/youtube_dl/extractor/dhm.py
+++ b/youtube_dl/extractor/dhm.py
@ -0,0 +1,52 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 import urllib2
 import xml.etree.ElementTree as ET
 import re
 class DHMIE(InfoExtractor):
    """Information extractor for film pages under dhm.de/filmarchiv/.

    A film page embeds a player whose configuration points at an XML
    playlist; that playlist in turn carries the media URL and a thumbnail.
    """

    # The id group is the last non-empty path segment.  (The previous
    # pattern ended in a lazy ``.*?`` with nothing after it, so the group
    # always captured the empty string.)
    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
        'md5': '11c475f670209bf6acca0b2b7ef51827',
        'info_dict': {
            'id': 'marshallwg',
            'ext': 'flv',
            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
            'thumbnail': 'http://www.dhm.de/filmarchiv/video/mpworkwg.jpg',
        }
    }

    def _real_extract(self, url):
        """Return the info dict (id, title, url, thumbnail) for *url*.

        The page is downloaded first, then the XML playlist it references.
        The video id is taken from the media file name (``video/<id>.flv``);
        when the media URL does not have that shape, the page slug from
        *url* is used instead so the id is never empty.
        """
        # Page slug from the URL: used for download log messages and as the
        # fallback id.
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'dc:title=\"(.*?)\"', webpage, 'title')
        playlist_url = self._html_search_regex(
            r'file: \'(.*?)\'', webpage, 'playlist URL')

        # Fetch through the extractor's own downloader rather than urllib2:
        # this works on Python 2 and 3, honours the user's network options,
        # and reports failures as extractor errors.  It returns the parsed
        # root element.
        playlist = self._download_xml(playlist_url, display_id)

        # Positional layout: root -> first child -> first grandchild is the
        # entry; its first child holds the media URL and its third child the
        # thumbnail URL.
        # NOTE(review): presumably an XSPF playlist/trackList/track
        # document -- confirm and switch to namespaced lookups if so.
        track = playlist[0][0]
        video_url = track[0].text
        # The thumbnail is optional metadata; do not fail when it is absent.
        thumbnail = track[2].text if len(track) > 2 else None

        # Escape the dot so only a literal ".flv" suffix is accepted.
        m = re.search(r'video/(.+?)\.flv', video_url)
        video_id = m.group(1) if m else display_id

        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
        }