youtube-dl/youtube_dl/extractor/abcnews.py

# coding: utf-8
from __future__ import unicode_literals

import calendar
import re
import time

from .amp import AMPIE
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import compat_urlparse


class AbcNewsVideoIE(AMPIE):
    """Extractor for abcnews.go.com video pages and ``video/embed`` URLs.

    The numeric id taken from the URL is used to build an ``itemfeed`` URL
    that is handed to ``_extract_feed_info`` (inherited from ``AMPIE``).
    """

    IE_NAME = 'abcnews:video'
    # Two URL shapes are accepted: "<section>/video/<slug>-<id>" (which also
    # yields a display_id) and "video/embed?...id=<id>" (no display_id).
    _VALID_URL = r'''(?x)
                    https?://
                        abcnews\.go\.com/
                        (?:
                            [^/]+/video/(?P<display_id>[0-9a-z-]+)-|
                            video/embed\?.*?\bid=
                        )
                        (?P<id>\d+)
                    '''

    _TESTS = [{
        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
        'info_dict': {
            'id': '20411932',
            'ext': 'mp4',
            'display_id': 'week-exclusive-irans-foreign-minister-zarif',
            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
            'duration': 180,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://abcnews.go.com/video/embed?id=46979033',
        'only_matching': True,
    }, {
        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the feed info for ``url`` with ``id``/``display_id`` set.

        ``display_id`` is ``None`` for embed URLs, because that branch of
        ``_VALID_URL`` has no ``display_id`` group.
        """
        url_match = re.match(self._VALID_URL, url)
        display_id, video_id = url_match.group('display_id', 'id')

        feed_url = 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id
        feed_info = self._extract_feed_info(feed_url)

        # The ids parsed from the URL take precedence over anything the
        # feed may have put under the same keys.
        feed_info.update({'id': video_id, 'display_id': display_id})
        return feed_info


class AbcNewsIE(InfoExtractor):
    """Extractor for abcnews.go.com story pages (``.../story?id=<id>``).

    The story page is downloaded, the player URL is read from the
    ``window.abcnvideo.url`` assignment and the result is delegated to
    ``AbcNewsVideoIE`` as a ``url_transparent`` entry. When
    ``YoutubeIE._extract_url`` also finds an embed on the page, a two-entry
    playlist is returned instead.
    """

    IE_NAME = 'abcnews'
    _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
        'info_dict': {
            'id': '10505354',
            'ext': 'flv',
            'display_id': 'dramatic-video-rare-death-job-america',
            'title': 'Occupational Hazards',
            'description': 'Nightline investigates the dangers that lurk at various jobs.',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20100428',
            'timestamp': 1272412800,
        },
        'add_ie': ['AbcNewsVideo'],
    }, {
        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
        'info_dict': {
            'id': '38897857',
            'ext': 'mp4',
            'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
            'title': 'Justin Timberlake Drops Hints For Secret Single',
            'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
            'upload_date': '20160515',
            'timestamp': 1463329500,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
            # The embedded YouTube video is blocked due to copyright issues
            'playlist_items': '1',
        },
        'add_ie': ['AbcNewsVideo'],
    }, {
        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a ``url_transparent`` entry (or a playlist) for a story URL.

        The returned entry carries the story's ``id``, ``display_id`` and,
        when the page's ``timestamp`` span can be parsed, a ``timestamp``.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        display_id = url_match.group('display_id')

        webpage = self._download_webpage(url, video_id)

        # The player URL may be relative, so it is resolved against the
        # story URL.
        player_url = self._search_regex(
            r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
        full_video_url = compat_urlparse.urljoin(url, player_url)

        youtube_url = YoutubeIE._extract_url(webpage)

        raw_date = self._html_search_regex(
            r'<span[^>]+class="timestamp">([^<]+)</span>',
            webpage, 'timestamp', fatal=False)

        timestamp = None
        if raw_date:
            # A trailing " ET" (Eastern Time) is handled as a fixed UTC-5
            # offset; without the suffix the parsed time is used unshifted.
            hours_behind_utc = 0
            if raw_date.endswith(' ET'):
                hours_behind_utc = 5
                raw_date = raw_date[:-3]
            cleaned_date = raw_date.strip()

            # Every known layout is tried; a layout that does not match
            # raises ValueError and leaves the current result untouched.
            for layout in ('%b. %d, %Y', '%b %d, %Y, %I:%M %p'):
                try:
                    parsed = time.strptime(cleaned_date, layout)
                    timestamp = calendar.timegm(parsed)
                except ValueError:
                    pass

            if timestamp is not None:
                timestamp += hours_behind_utc * 3600

        entry = {
            '_type': 'url_transparent',
            'ie_key': AbcNewsVideoIE.ie_key(),
            'url': full_video_url,
            'id': video_id,
            'display_id': display_id,
            'timestamp': timestamp,
        }

        if not youtube_url:
            return entry

        # The YouTube embed becomes the second playlist item, after the
        # ABC News entry.
        youtube_entry = self.url_result(youtube_url, ie=YoutubeIE.ie_key())
        return self.playlist_result([entry, youtube_entry])
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import calendar`
			`import re`
			`import time`

			`from .amp import AMPIE`
			`from .common import InfoExtractor`
[abcnews,chilloutsoze,cracked,vice,vk] Use dedicated YouTube embeds extraction routines 2017-09-06 00:50:25 +07:00			`from .youtube import YoutubeIE`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`from ..compat import compat_urlparse`


			`class AbcNewsVideoIE(AMPIE):`
			`IE_NAME = 'abcnews:video'`
[abcnews] Add support for embed URLs 2017-04-26 21:21:17 +02:00			`_VALID_URL = r'''(?x)`
[abcnews] Improve and remove duplicate test (closes #12851) 2017-05-28 07:05:50 +07:00			`https?://`
			`abcnews\.go\.com/`
			`(?:`
			`[^/]+/video/(?P<display_id>[0-9a-z-]+)-|`
			`video/embed\?.*?\bid=`
			`)`
			`(?P<id>\d+)`
			`'''`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00
			`_TESTS = [{`
			`'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',`
			`'info_dict': {`
			`'id': '20411932',`
			`'ext': 'mp4',`
			`'display_id': 'week-exclusive-irans-foreign-minister-zarif',`
			`'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',`
			`'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',`
			`'duration': 180,`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 20:08:07 +08:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
[abcnews] Add support for embed URLs 2017-04-26 21:21:17 +02:00			`}, {`
			`'url': 'http://abcnews.go.com/video/embed?id=46979033',`
[abcnews] Improve and remove duplicate test (closes #12851) 2017-05-28 07:05:50 +07:00			`'only_matching': True,`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`}, {`
			`'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',`
			`'only_matching': True,`
			`}]`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`display_id = mobj.group('display_id')`
			`video_id = mobj.group('id')`
			`info_dict = self._extract_feed_info(`
			`'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)`
			`info_dict.update({`
			`'id': video_id,`
			`'display_id': display_id,`
			`})`
			`return info_dict`


			`class AbcNewsIE(InfoExtractor):`
			`IE_NAME = 'abcnews'`
Add missing r prefix for _VALID_URLs 2016-09-08 17:04:57 +07:00			`_VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00
			`_TESTS = [{`
			`'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',`
			`'info_dict': {`
[abcnews] Update tests 2018-02-23 23:17:21 +01:00			`'id': '10505354',`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`'ext': 'flv',`
			`'display_id': 'dramatic-video-rare-death-job-america',`
			`'title': 'Occupational Hazards',`
			`'description': 'Nightline investigates the dangers that lurk at various jobs.',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 20:08:07 +08:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`'upload_date': '20100428',`
			`'timestamp': 1272412800,`
			`},`
			`'add_ie': ['AbcNewsVideo'],`
			`}, {`
			`'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',`
			`'info_dict': {`
[abcnews] Update tests 2018-02-23 23:17:21 +01:00			`'id': '38897857',`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`'ext': 'mp4',`
			`'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',`
			`'title': 'Justin Timberlake Drops Hints For Secret Single',`
			`'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',`
			`'upload_date': '20160515',`
			`'timestamp': 1463329500,`
			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`# The embedded YouTube video is blocked due to copyright issues`
			`'playlist_items': '1',`
			`},`
			`'add_ie': ['AbcNewsVideo'],`
			`}, {`
			`'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',`
			`'only_matching': True,`
			`}]`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`display_id = mobj.group('display_id')`
			`video_id = mobj.group('id')`

			`webpage = self._download_webpage(url, video_id)`
			`video_url = self._search_regex(`
			`r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')`
			`full_video_url = compat_urlparse.urljoin(url, video_url)`

[abcnews,chilloutsoze,cracked,vice,vk] Use dedicated YouTube embeds extraction routines 2017-09-06 00:50:25 +07:00			`youtube_url = YoutubeIE._extract_url(webpage)`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00
			`timestamp = None`
			`date_str = self._html_search_regex(`
			`r'<span[^>]+class="timestamp">([^<]+)</span>',`
			`webpage, 'timestamp', fatal=False)`
			`if date_str:`
			`tz_offset = 0`
			`if date_str.endswith(' ET'): # Eastern Time`
			`tz_offset = -5`
			`date_str = date_str[:-3]`
			`date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']`
			`for date_format in date_formats:`
			`try:`
			`timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))`
			`except ValueError:`
			`continue`
			`if timestamp is not None:`
			`timestamp -= tz_offset * 3600`

			`entry = {`
			`'_type': 'url_transparent',`
			`'ie_key': AbcNewsVideoIE.ie_key(),`
			`'url': full_video_url,`
			`'id': video_id,`
			`'display_id': display_id,`
			`'timestamp': timestamp,`
			`}`

			`if youtube_url:`
[abcnews,chilloutsoze,cracked,vice,vk] Use dedicated YouTube embeds extraction routines 2017-09-06 00:50:25 +07:00			`entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 15:38:57 +08:00			`return self.playlist_result(entries)`

			`return entry`