mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2024-11-28 21:22:13 +00:00
[bbc] fix BBC IPlayer Episodes/Group extraction(closes #28360)
This commit is contained in:
parent
5208ae92fc
commit
a40002444e
2 changed files with 192 additions and 16 deletions
|
@ -1,17 +1,22 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import functools
|
||||||
import itertools
|
import itertools
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..compat import (
|
from ..compat import (
|
||||||
compat_etree_Element,
|
compat_etree_Element,
|
||||||
compat_HTTPError,
|
compat_HTTPError,
|
||||||
|
compat_parse_qs,
|
||||||
|
compat_urllib_parse_urlparse,
|
||||||
compat_urlparse,
|
compat_urlparse,
|
||||||
)
|
)
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
|
OnDemandPagedList,
|
||||||
clean_html,
|
clean_html,
|
||||||
dict_get,
|
dict_get,
|
||||||
float_or_none,
|
float_or_none,
|
||||||
|
@ -811,7 +816,7 @@ class BBCIE(BBCCoUkIE):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def suitable(cls, url):
|
def suitable(cls, url):
|
||||||
EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
|
EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
|
||||||
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
|
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
|
||||||
else super(BBCIE, cls).suitable(url))
|
else super(BBCIE, cls).suitable(url))
|
||||||
|
|
||||||
|
@ -1338,21 +1343,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
|
||||||
playlist_id, title, description)
|
playlist_id, title, description)
|
||||||
|
|
||||||
|
|
||||||
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
|
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
|
||||||
IE_NAME = 'bbc.co.uk:iplayer:playlist'
|
_VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
|
||||||
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
|
|
||||||
_URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
|
@staticmethod
|
||||||
_VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
|
def _get_default(episode, key, default_key='default'):
|
||||||
|
return try_get(episode, lambda x: x[key][default_key])
|
||||||
|
|
||||||
|
def _get_description(self, data):
    """Return a synopsis string for an episode or playlist record.

    The synopsis mapping is looked up in ``data`` under the key named by
    the subclass attribute ``_DESCRIPTION_KEY``; a missing or falsy entry
    is replaced by an empty mapping before the lookup.
    """
    synopses = data.get(self._DESCRIPTION_KEY)
    if not synopses:
        synopses = {}
    # Candidate keys are handed to dict_get in this order: 'large' first.
    size_keys = ('large', 'medium', 'small')
    return dict_get(synopses, size_keys)
|
||||||
|
|
||||||
|
def _fetch_page(self, programme_id, per_page, series_id, page):
    """Yield one ``url``-type playlist entry per episode on an API page.

    ``page`` is a zero-based index; the API is called with ``page + 1``.
    Elements whose episode record has no ``id`` are skipped.
    """
    api_data = self._call_api(programme_id, per_page, page + 1, series_id)
    for item in self._get_elements(api_data):
        episode = self._get_episode(item)
        episode_id = episode.get('id')
        if not episode_id:
            # Without an id no episode URL can be built.
            continue
        image_url = self._get_episode_image(episode)
        # The image URL carries a '{recipe}' placeholder; fill it with 'raw'.
        thumbnail = image_url.replace('{recipe}', 'raw') if image_url else None
        category = self._get_default(episode, 'labels', 'category')
        yield {
            '_type': 'url',
            'id': episode_id,
            'title': self._get_episode_field(episode, 'subtitle'),
            'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
            'thumbnail': thumbnail,
            'description': self._get_description(episode),
            'categories': [category] if category else None,
            'series': self._get_episode_field(episode, 'title'),
            'ie_key': BBCCoUkIE.ie_key(),
        }
|
||||||
|
|
||||||
|
def _real_extract(self, url):
    """Build the playlist result for an iPlayer episodes/group URL.

    Two optional query parameters are honoured:

    * ``seriesId`` is passed through to the page fetcher.
    * ``page`` restricts the listing to that single 1-based page.

    When ``page`` is absent, non-numeric or below 1, every page is
    listed lazily through ``OnDemandPagedList`` instead of failing on a
    malformed URL.
    """
    pid = self._match_id(url)
    qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
    series_id = qs.get('seriesId', [None])[0]
    page = qs.get('page', [None])[0]

    # A hand-edited URL can carry e.g. ``page=abc`` or ``page=0``; int()
    # would raise ValueError on the former and the latter would request
    # page 0 from the API, so both fall back to the full listing.
    try:
        page_num = int(page) if page else None
    except ValueError:
        page_num = None
    if page_num is not None and page_num < 1:
        page_num = None

    if page_num is not None:
        # NOTE(review): 36 presumably mirrors the web site's own page
        # size for explicit ``?page=N`` URLs -- confirm.
        entries = self._fetch_page(pid, 36, series_id, page_num - 1)
    else:
        fetch_page = functools.partial(
            self._fetch_page, pid, self._PAGE_SIZE, series_id)
        entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

    # A separate one-item request supplies the playlist-level metadata.
    playlist_data = self._get_playlist_data(self._call_api(pid, 1))
    return self.playlist_result(
        entries, pid, self._get_playlist_title(playlist_data),
        self._get_description(playlist_data))
|
||||||
|
|
||||||
|
|
||||||
|
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
|
||||||
|
IE_NAME = 'bbc.co.uk:iplayer:episodes'
|
||||||
|
_VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
|
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'b05rcz9v',
|
'id': 'b05rcz9v',
|
||||||
'title': 'The Disappearance',
|
'title': 'The Disappearance',
|
||||||
'description': 'French thriller serial about a missing teenager.',
|
'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
|
||||||
},
|
},
|
||||||
'playlist_mincount': 6,
|
'playlist_mincount': 8,
|
||||||
'skip': 'This programme is not currently available on BBC iPlayer',
|
|
||||||
}, {
|
}, {
|
||||||
|
# all seasons
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'b094m5t9',
|
||||||
|
'title': 'Doctor Foster',
|
||||||
|
'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 10,
|
||||||
|
}, {
|
||||||
|
# explicit season
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'b094m5t9',
|
||||||
|
'title': 'Doctor Foster',
|
||||||
|
'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 5,
|
||||||
|
}, {
|
||||||
|
# all pages
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'm0004c4v',
|
||||||
|
'title': 'Beechgrove',
|
||||||
|
'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 37,
|
||||||
|
}, {
|
||||||
|
# explicit page
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'm0004c4v',
|
||||||
|
'title': 'Beechgrove',
|
||||||
|
'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 1,
|
||||||
|
}]
|
||||||
|
_PAGE_SIZE = 100
|
||||||
|
_DESCRIPTION_KEY = 'synopsis'
|
||||||
|
|
||||||
|
def _get_episode_image(self, episode):
    """Return the episode's ``image`` entry as resolved by ``_get_default``."""
    image_url = self._get_default(episode, 'image')
    return image_url
|
||||||
|
|
||||||
|
def _get_episode_field(self, episode, field):
    """Return the value of ``field`` on ``episode`` as resolved by ``_get_default``."""
    field_value = self._get_default(episode, field)
    return field_value
|
||||||
|
|
||||||
|
@staticmethod
def _get_elements(data):
    """Return the value stored at ``data['entities']['results']``."""
    entities = data['entities']
    return entities['results']
|
||||||
|
|
||||||
|
@staticmethod
def _get_episode(element):
    """Return the element's ``episode`` entry, or an empty dict if it is missing or falsy."""
    episode = element.get('episode')
    return episode if episode else {}
|
||||||
|
|
||||||
|
def _call_api(self, pid, per_page, page=1, series_id=None):
    """Query graph.ibl.api.bbc.co.uk and return ``['data']['programme']``.

    The request carries a JSON body with the programme id, the page
    number and the page size; ``sliceId`` is added only when
    ``series_id`` is truthy.
    """
    variables = {'id': pid, 'page': page, 'perPage': per_page}
    if series_id:
        variables['sliceId'] = series_id
    # NOTE(review): the fixed 'id' presumably names a server-side stored
    # query -- confirm against the API.
    payload = json.dumps({
        'id': '5692d93d5aac8d796a0305e895e61551',
        'variables': variables,
    }).encode('utf-8')
    response = self._download_json(
        'https://graph.ibl.api.bbc.co.uk/', pid,
        headers={'Content-Type': 'application/json'}, data=payload)
    return response['data']['programme']
|
||||||
|
|
||||||
|
@staticmethod
def _get_playlist_data(data):
    """Return ``data`` unchanged; this extractor does no unwrapping here."""
    playlist_data = data
    return playlist_data
|
||||||
|
|
||||||
|
def _get_playlist_title(self, data):
    """Return the playlist's ``title`` entry as resolved by ``_get_default``."""
    playlist_title = self._get_default(data, 'title')
    return playlist_title
|
||||||
|
|
||||||
|
|
||||||
|
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
|
||||||
|
IE_NAME = 'bbc.co.uk:iplayer:group'
|
||||||
|
_VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
|
||||||
|
_TESTS = [{
|
||||||
# Available for over a year unlike 30 days for most other programmes
|
# Available for over a year unlike 30 days for most other programmes
|
||||||
'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
|
'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
|
@ -1361,14 +1494,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
|
||||||
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
|
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
|
||||||
},
|
},
|
||||||
'playlist_mincount': 10,
|
'playlist_mincount': 10,
|
||||||
|
}, {
|
||||||
|
# all pages
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'p081d7j7',
|
||||||
|
'title': 'Music in Scotland',
|
||||||
|
'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 47,
|
||||||
|
}, {
|
||||||
|
# explicit page
|
||||||
|
'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'p081d7j7',
|
||||||
|
'title': 'Music in Scotland',
|
||||||
|
'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 11,
|
||||||
}]
|
}]
|
||||||
|
_PAGE_SIZE = 200
|
||||||
|
_DESCRIPTION_KEY = 'synopses'
|
||||||
|
|
||||||
def _extract_title_and_description(self, webpage):
|
def _get_episode_image(self, episode):
|
||||||
title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
|
return self._get_default(episode, 'images', 'standard')
|
||||||
description = self._search_regex(
|
|
||||||
r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
|
def _get_episode_field(self, episode, field):
|
||||||
webpage, 'description', fatal=False, group='value')
|
return episode.get(field)
|
||||||
return title, description
|
|
||||||
|
@staticmethod
def _get_elements(data):
    """Return the value stored under ``data['elements']``."""
    elements = data['elements']
    return elements
|
||||||
|
|
||||||
|
@staticmethod
def _get_episode(element):
    """Return ``element`` itself; no unwrapping is done for group listings."""
    episode = element
    return episode
|
||||||
|
|
||||||
|
def _call_api(self, pid, per_page, page=1, series_id=None):
    """Fetch one page of a group's episodes and return ``['group_episodes']``.

    ``series_id`` is accepted but not used by this implementation.
    """
    endpoint = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
    paging = {
        'page': page,
        'per_page': per_page,
    }
    listing = self._download_json(endpoint, pid, query=paging)
    return listing['group_episodes']
|
||||||
|
|
||||||
|
@staticmethod
def _get_playlist_data(data):
    """Return the ``group`` entry of the API data."""
    group = data['group']
    return group
|
||||||
|
|
||||||
|
def _get_playlist_title(self, data):
    """Return the ``title`` entry of ``data``, or ``None`` when it is absent."""
    playlist_title = data.get('title')
    return playlist_title
|
||||||
|
|
||||||
|
|
||||||
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
|
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
|
||||||
|
|
|
@ -95,7 +95,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
|
||||||
from .bbc import (
|
from .bbc import (
|
||||||
BBCCoUkIE,
|
BBCCoUkIE,
|
||||||
BBCCoUkArticleIE,
|
BBCCoUkArticleIE,
|
||||||
BBCCoUkIPlayerPlaylistIE,
|
BBCCoUkIPlayerEpisodesIE,
|
||||||
|
BBCCoUkIPlayerGroupIE,
|
||||||
BBCCoUkPlaylistIE,
|
BBCCoUkPlaylistIE,
|
||||||
BBCIE,
|
BBCIE,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue