1
0
Fork 0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-11-22 02:01:50 +00:00

[cinchcast] Add new extractor (Fixes #4428)

This commit is contained in:
Philipp Hagemeister 2014-12-12 02:57:36 +01:00
parent 4e40de6e2a
commit 42bdd9d051
5 changed files with 88 additions and 6 deletions

View file

@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
self.assertEqual(
unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
'20141126')
def test_find_xpath_attr(self):
testxml = '''<root>

View file

@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE

View file

@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
xpath_text,
)
class CinchcastIE(InfoExtractor):
    """Extractor for media embedded through the player.cinchcast.com player.

    The numeric ``assetId`` query parameter of the player URL is used as the
    video id; the media locations are read from an XML asset feed.
    """

    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
    _TEST = {
        # Actual test is run in generic, look for undergroundwellness
        'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Build the info dict for the asset referenced by *url*.

        Downloads the asset's XML feed, reads title and upload date from its
        first ``item`` element and collects the main media URL plus, when the
        feed provides one, a backup URL as a second format.
        """
        video_id = self._match_id(url)

        feed_url = (
            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s'
            % video_id)
        feed = self._download_xml(feed_url, video_id)
        entry = feed.find('.//item')

        # Namespace-qualified tag prefixes used inside the feed item.
        jwplayer_ns = './{http://developer.longtailvideo.com/trac/}'
        media_ns = './{http://search.yahoo.com/mrss/}'

        title = xpath_text(entry, './title', fatal=True)
        # The feed date is parsed month-first, hence day_first=False.
        upload_date = unified_strdate(
            xpath_text(entry, jwplayer_ns + 'date'), day_first=False)

        # duration is present but wrong
        main_url = entry.find(media_ns + 'content').attrib['url']
        formats = [{
            'format_id': 'main',
            'url': main_url,
        }]

        backup_url = xpath_text(entry, jwplayer_ns + 'backupContent')
        if backup_url:
            formats.append({
                'preference': 2,  # seems to be more reliable
                'format_id': 'backup',
                'url': backup_url,
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }

View file

@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):
'expected_warnings': [
'URL could be a direct video link, returning it as such.'
]
}
},
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
'info_dict': {
'id': '7141703',
'ext': 'mp3',
'upload_date': '20141126',
'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
}
},
]
def report_following_redirect(self, new_url):
@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
# Look for embedded Cinchcast player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Cinchcast')
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)

View file

@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
xpath = xpath.encode('ascii')
n = node.find(xpath)
if n is None:
if n is None or n.text is None:
if fatal:
name = xpath if name is None else name
raise ExtractorError('Could not find XML element %s' % name)
@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):
return calendar.timegm(dt.timetuple())
def unified_strdate(date_str):
def unified_strdate(date_str, day_first=True):
"""Return a string with the date in the format YYYYMMDD"""
if date_str is None:
return None
upload_date = None
# Replace commas
date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
format_expressions = [
'%d %B %Y',
'%d %b %Y',
@ -669,7 +671,6 @@ def unified_strdate(date_str):
'%d/%m/%Y',
'%d/%m/%y',
'%Y/%m/%d %H:%M:%S',
'%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
@ -681,6 +682,14 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M',
]
if day_first:
format_expressions.extend([
'%d/%m/%Y %H:%M:%S',
])
else:
format_expressions.extend([
'%m/%d/%Y %H:%M:%S',
])
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')