mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2025-01-09 09:26:36 +00:00
[nebula] Relax meta data lookups
This commit is contained in:
parent
61cead3235
commit
af3434b839
1 changed files with 29 additions and 19 deletions
|
@ -4,7 +4,8 @@ from __future__ import unicode_literals
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import parse_iso8601
|
from ..compat import compat_str
|
||||||
|
from ..utils import parse_iso8601, try_get
|
||||||
|
|
||||||
COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
|
COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
|
||||||
|
|
||||||
|
@ -74,9 +75,13 @@ class NebulaIE(InfoExtractor):
|
||||||
"""
|
"""
|
||||||
As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
|
As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
|
||||||
tag. This function is extracting this script tag, parsing it as JSON.
|
tag. This function is extracting this script tag, parsing it as JSON.
|
||||||
|
|
||||||
|
May return None if no state object could be found or it didn't contain valid JSON.
|
||||||
"""
|
"""
|
||||||
initial_state_object = self._search_regex(r'<script id="initial-app-state" type="application/json">(.+?)</script>', webpage, 'initial_state')
|
initial_state_object = self._search_regex(
|
||||||
metadata = self._parse_json(initial_state_object, video_id=display_id) # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
|
r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
|
||||||
|
'initial_state', fatal=False, default=None)
|
||||||
|
metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
|
||||||
|
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
@ -84,9 +89,12 @@ class NebulaIE(InfoExtractor):
|
||||||
"""
|
"""
|
||||||
The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
|
The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
|
||||||
video ID, we can then extract a dictionary with various meta data about the video itself.
|
video ID, we can then extract a dictionary with various meta data about the video itself.
|
||||||
|
|
||||||
|
May return (None, {}) if no state object was given or it didn't contain the expected lookup table or
|
||||||
|
meta data.
|
||||||
"""
|
"""
|
||||||
video_id = state_object['videos']['byURL'][display_id]
|
video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
|
||||||
video_meta = state_object['videos']['byID'][video_id]
|
video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
|
||||||
|
|
||||||
return video_id, video_meta
|
return video_id, video_meta
|
||||||
|
|
||||||
|
@ -100,8 +108,10 @@ class NebulaIE(InfoExtractor):
|
||||||
|
|
||||||
# fallback: reconstruct using video ID and access token from state object
|
# fallback: reconstruct using video ID and access token from state object
|
||||||
if not video_url:
|
if not video_url:
|
||||||
access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
|
access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'],
|
||||||
video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
|
compat_str)
|
||||||
|
video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
|
||||||
|
video_id=video_id, access_token=access_token)
|
||||||
|
|
||||||
return video_url
|
return video_url
|
||||||
|
|
||||||
|
@ -125,12 +135,13 @@ class NebulaIE(InfoExtractor):
|
||||||
kind of ID) via an additional API call.
|
kind of ID) via an additional API call.
|
||||||
|
|
||||||
TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
|
TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
|
||||||
|
|
||||||
|
May return None if no category list could be found or no category had a label ('value').
|
||||||
"""
|
"""
|
||||||
categories = video_meta['categories']
|
categories = video_meta.get('categories', []) if video_meta else []
|
||||||
for category in categories:
|
for category in categories:
|
||||||
if category['value']:
|
if category.get('value'): # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well!
|
||||||
return category['value'][0]
|
return category['value'][0]
|
||||||
return None
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
# FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
|
# FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
|
||||||
|
@ -163,18 +174,17 @@ class NebulaIE(InfoExtractor):
|
||||||
'url': video_url,
|
'url': video_url,
|
||||||
|
|
||||||
# the meta data we were able to extract from Nebula
|
# the meta data we were able to extract from Nebula
|
||||||
'title': video_meta['title'],
|
'title': video_meta.get('title'),
|
||||||
'description': video_meta['description'],
|
'description': video_meta.get('description'),
|
||||||
'timestamp': parse_iso8601(video_meta['published_at']),
|
'timestamp': parse_iso8601(video_meta.get('published_at')),
|
||||||
'thumbnails': [
|
'thumbnails': [
|
||||||
{
|
{
|
||||||
'id': tn['name'], # this appears to be null in all cases I've seen
|
'id': tn.get('name'), # this appears to be null in all cases I've seen
|
||||||
'url': tn['url'],
|
'url': tn['url'],
|
||||||
'width': tn['width'],
|
'width': tn.get('width'),
|
||||||
'height': tn['height'],
|
'height': tn.get('height'),
|
||||||
} for tn in video_meta['thumbnails']
|
} for tn in video_meta.get('thumbnails', [])],
|
||||||
],
|
'duration': video_meta.get('duration'),
|
||||||
'duration': video_meta['duration'],
|
|
||||||
'channel': channel_title,
|
'channel': channel_title,
|
||||||
'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
|
'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
|
||||||
# TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
|
# TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
|
||||||
|
|
Loading…
Reference in a new issue