2020-04-07 20:05:09 +00:00
# coding: utf-8
from __future__ import unicode_literals
2020-10-03 04:08:30 +00:00
import json
2020-04-07 20:05:09 +00:00
import os
from . common import InfoExtractor
2020-05-11 03:58:14 +00:00
from . . compat import compat_urllib_parse_unquote , compat_str
2020-10-03 04:08:30 +00:00
from . . utils import parse_iso8601 , ExtractorError , try_get , urljoin , sanitized_Request
2020-04-07 20:05:09 +00:00
class NebulaIE(InfoExtractor):
    """
    Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos
    off-YouTube from a small hand-picked group of creators.

    All videos require a subscription to watch. There are no known freely available videos. An authentication token
    for an account with a valid subscription can be specified in multiple ways, including credentials in .netrc or a
    cookie jar.
    As neither of these parameters appears to be supported by the unit test runner, it's recommended to set the
    envvar NEBULA_TOKEN to execute the test runs.

    Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
    video extraction to the Zype extractor.

    This description was last updated on 2020-10-22.
    """

    # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id()
    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
            'info_dict': {
                'id': '5c271b40b13fd613090034fd',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'uploader': 'Lindsay Ellis',
            },
        },
        {
            'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': '6d4edd14ce65720fa63aba5c583fb328',
            'info_dict': {
                'id': '5e7e78171aaf320001fbd6be',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'The Logistics of D-Day',
                'uploader': 'The Logistics of D-Day',
            },
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'md5': '8c7d272910eea320f6f8e6d3084eecf5',
            'info_dict': {
                'id': '5e779ebdd157bc0001d1c75a',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r'contains:There’s free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'uploader': 'Tom Scott Presents: Money',
            },
        },
    ]
    # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth
    # cookie of a (paid) subscription?
    _WORKING = True
    # machine name under which credentials are looked up in .netrc
    _NETRC_MACHINE = 'watchnebula'
def _perform_login(self, username, password, video_id):
    """
    Log in to Nebula, authenticating using a given username and password.

    Sends the credentials as a JSON body to the Nebula login endpoint. The
    download is non-fatal, so a failed request yields a falsy response
    rather than an exception.

    :param username: account e-mail address, sent as the 'email' field
    :param password: account password
    :param video_id: identifier passed on to the download helper
    :return: the Nebula token (the 'key' field of the response), as the
             frontend would store it in the nebula-auth cookie, or False
             if authentication fails or the response carries no token
    """
    data = json.dumps({'email': username, 'password': password}).encode('utf8')
    # NOTE(review): the 'method' keyword is forwarded to sanitized_Request as-is;
    # confirm the underlying request class accepts it on every supported interpreter.
    request = sanitized_Request(
        method='POST',
        url='https://api.watchnebula.com/api/v1/auth/login/',
        data=data,
        headers={'content-type': 'application/json'})
    response = self._download_json(
        request, fatal=False, video_id=video_id,
        note='Authenticating to Nebula with supplied credentials',
        errnote='Authentication failed or rejected')
    # Guard against non-dict JSON (e.g. a list or string), which would make
    # the subscript below raise instead of signalling a failed login.
    if not isinstance(response, dict):
        return False
    return response.get('key') or False
def _retrieve_nebula_auth(self, video_id):
    """
    Attempt to find a Nebula API token. Makes multiple attempts in the
    following order:
        a) login credentials used to authenticate to the Nebula login endpoint,
           either from .netrc or specified using --username/--password
        b) the --cookies supplied cookie jar
        c) the NEBULA_TOKEN environment variable
        d) the --video-password command line argument (this isn't documented in
           the error message, because probably highly unpopular)
    If none of these are successful, an end user-intended error message is
    raised, listing some solutions.

    :param video_id: identifier passed on to the login and JSON helpers
    :return: the first non-empty token found
    :raises ExtractorError: if no source yields a token
    """
    nebula_token = None

    # option #1: login credentials via .netrc or --username and --password
    username, password = self._get_login_info()
    if username and password:
        self.to_screen('Authenticating to Nebula using .netrc or command line-supplied credentials')
        nebula_token = self._perform_login(username, password, video_id)

    # option #2: nebula token via cookie jar
    if not nebula_token:
        # TODO: is there a helper to do all this cookie extraction?
        nebula_cookies = self._get_cookies('https://watchnebula.com')
        nebula_cookie = nebula_cookies.get('nebula-auth')
        if nebula_cookie:
            self.to_screen('Authenticating to Nebula with credentials from cookie jar')
            nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
            # A malformed cookie must not abort extraction: parse non-fatally so
            # that the remaining token sources still get their turn.
            cookie_data = self._parse_json(nebula_cookie_value, video_id, fatal=False)
            if isinstance(cookie_data, dict):
                nebula_token = cookie_data.get('apiToken')

    # option #3: nebula token via environment variable
    if not nebula_token:
        nebula_token = os.environ.get('NEBULA_TOKEN')
        if nebula_token:
            self.to_screen('Authenticating to Nebula with token from NEBULA_TOKEN environment variable')

    # option #4: nebula token via --videopassword
    if not nebula_token:
        nebula_token = self._downloader.params.get('videopassword')
        if nebula_token:
            self.to_screen('Authenticating to Nebula with token from --videopassword')

    if not nebula_token:
        # expected=True: this is a user-actionable condition, not an extractor bug
        raise ExtractorError(
            'Nebula requires an account with an active subscription. '
            'You can supply your authentication information by either '
            'a) storing your credentials in .netrc or supplying them via --username and --password, or '
            'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or '
            'c) setting the environment variable NEBULA_TOKEN.',
            expected=True)
    return nebula_token
def _retrieve_zype_api_key(self, page_url, display_id):
    """
    Retrieves the Zype API key required to make calls to the Zype API.

    Unfortunately, the Nebula frontend stores this as a JS object literal in one of its JS chunks,
    looking somewhat like this (but minified):

        return {
            NODE_ENV: "production",
            REACT_APP_NAME: "Nebula",
            REACT_APP_NEBULA_API: "https://api.watchnebula.com/api/v1/",
            REACT_APP_ZYPE_API: "https://api.zype.com/",
            REACT_APP_ZYPE_API_KEY: "<redacted>",
            REACT_APP_ZYPE_APP_KEY: "<redacted>",
            // ...
        }

    So we have to find the reference to the chunk in the video page (as it is hashed and the hash will
    change when they do a new release), then download the chunk and extract the API key from there,
    hoping they won't rename the constant.

    Alternatively, it is currently hardcoded and shared among all users. We haven't seen it
    change so far, so we could also just hardcode it in the extractor as a fallback.

    :param page_url: URL of the video page; also the base for resolving the chunk's relative path
    :param display_id: identifier passed on to the download helpers
    :return: the API key string found in the JS chunk
    """
    # fetch the video page
    webpage = self._download_webpage(page_url, video_id=display_id)

    # find the script tag with a file named 'main.<hash>.chunk.js' in there
    # (dots are escaped so that they only match literal dots in the file name)
    main_script_relpath = self._search_regex(
        r'<script[^>]*src="(?P<script_relpath>[^"]*main\.[0-9a-f]*\.chunk\.js)"[^>]*>', webpage,
        group='script_relpath', name='script relative path', fatal=True)

    # fetch the JS chunk
    main_script_abspath = urljoin(page_url, main_script_relpath)
    main_script = self._download_webpage(
        main_script_abspath, video_id=display_id,
        note='Retrieving Zype API key')

    # find the API key named 'REACT_APP_ZYPE_API_KEY' in there
    api_key = self._search_regex(
        r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
        group='api_key', name='API key', fatal=True)

    return api_key
def _call_zype_api(self, path, params, video_id, api_key, note):
    """
    A helper for making calls to the Zype API.

    :param path: API path appended to 'https://api.zype.com' (e.g. '/videos')
    :param params: extra query parameters; they override the defaults on key clashes
    :param video_id: identifier passed on to the download helper
    :param api_key: Zype API key, sent as the 'api_key' query parameter
    :param note: progress message passed on to the download helper
    :return: the decoded JSON response
    """
    # defaults first, so that caller-supplied params take precedence
    query = {'api_key': api_key, 'per_page': 1}
    query.update(params or {})
    return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
def _fetch_zype_video_data(self, display_id, api_key):
    """
    Fetch video meta data from the Zype API.

    Looks the video up by its 'friendly_title', which is the display ID.

    :param display_id: the video's display ID
    :param api_key: Zype API key
    :return: the single video object found in the 'response' list
    :raises ExtractorError: if the answer does not contain exactly one video
    """
    response = self._call_zype_api(
        '/videos', {'friendly_title': display_id},
        display_id, api_key, note='Retrieving metadata from Zype')
    # Validate the shape before indexing, so that an unexpected payload
    # (non-dict, missing or null 'response') ends in the intended error.
    videos = response.get('response') if isinstance(response, dict) else None
    if not isinstance(videos, list) or len(videos) != 1:
        raise ExtractorError('Unable to find video on Zype API')
    return videos[0]
def _call_nebula_api(self, path, video_id, access_token, note):
    """
    A helper for making calls to the Nebula API.

    :param path: API path appended to 'https://api.watchnebula.com/api/v1'
    :param video_id: identifier passed on to the download helper
    :param access_token: Nebula token, sent in the 'Authorization' header
    :param note: progress message passed on to the download helper
    :return: the decoded JSON response
    """
    headers = {
        'Authorization': 'Token {access_token}'.format(access_token=access_token),
    }
    return self._download_json(
        'https://api.watchnebula.com/api/v1' + path, video_id,
        headers=headers, note=note)
def _fetch_zype_access_token(self, video_id, nebula_token):
    """
    Requests a Zype access token from the Nebula API.

    :param video_id: identifier passed on to the download helper
    :param nebula_token: Nebula token used to authorize the request
    :return: the Zype access token string
    :raises ExtractorError: if the user object carries no access token
    """
    user_object = self._call_nebula_api(
        '/auth/user/', video_id, nebula_token,
        note='Retrieving Zype access token')
    # the token is nested under 'zype_auth_info'; try_get tolerates missing keys
    access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
    if not access_token:
        raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
    return access_token
def _build_video_url ( self , video_id , zype_access_token ) :
"""
Construct a Zype video URL ( as supported by the Zype extractor ) , given a Zype video ID and a Zype access token .
"""
return ' https://player.zype.com/embed/ {video_id} .html?access_token= {access_token} ' . format (
video_id = video_id ,
access_token = zype_access_token )
2020-04-07 20:05:09 +00:00
2020-04-16 02:35:05 +00:00
def _extract_channel ( self , video_meta ) :
2020-04-07 20:05:09 +00:00
"""
2020-04-16 02:35:05 +00:00
Extract the channel title , by going through the list of categories and finding the first value of the
first category that has a value .
I know this look like a terrible approach . But actually , it ' s just reproducing the behavior of the
React code the Nebula frontend uses ( as of 2020 - 04 - 07 ) :
let channel ;
if ( video & & video . categories & & video . categories . length ) {
const channelTitle = video . categories . map ( ( category ) = > ( category . value [ 0 ] ) )
. filter ( ( title ) = > ( ! ! title ) ) [ 0 ] ;
channel = getChannelByTitle ( state , { title : channelTitle } ) ;
}
Basically , it finds the first ( truthy ) value in the category list and that ' s assumed to be the
channel title . And then the channel details ( e . g . the URL ) are looked up by title ( ! ) ( not by any
kind of ID ) via an additional API call .
TODO : Implement the API calls giving us the channel list , so that we can do the title lookup and then figure out the channel URL
2020-04-18 04:15:03 +00:00
May return None of no category list could be found or no category had a label ( ' value ' ) .
2020-04-07 20:05:09 +00:00
"""
2020-04-18 04:15:03 +00:00
categories = video_meta . get ( ' categories ' , [ ] ) if video_meta else [ ]
2020-04-16 02:35:05 +00:00
for category in categories :
2020-04-18 04:15:03 +00:00
if category . get ( ' value ' ) : # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well!
2020-04-16 02:35:05 +00:00
return category [ ' value ' ] [ 0 ]
2020-04-07 20:05:09 +00:00
def _real_extract(self, url):
    """
    Extract the info dict for a Nebula video page.

    Resolves the Nebula token, the Zype API key, the Zype video meta data and a Zype access token,
    then hands the actual media extraction over to the Zype extractor.

    :param url: the Nebula video page URL
    :return: a 'url_transparent' info dict pointing at the Zype player URL
    """
    # extract the video's display ID from the URL (we'll retrieve the video ID later)
    display_id = self._match_id(url)

    # retrieve Nebula authentication information
    nebula_token = self._retrieve_nebula_auth(display_id)

    # fetch video meta data from the Zype API
    api_key = self._retrieve_zype_api_key(url, display_id)
    video_meta = self._fetch_zype_video_data(display_id, api_key)
    video_id = video_meta['_id']

    # extract additional info
    channel_title = self._extract_channel(video_meta)

    # fetch the access token for Zype, then construct the video URL
    zype_access_token = self._fetch_zype_access_token(display_id, nebula_token=nebula_token)
    video_url = self._build_video_url(video_id, zype_access_token)

    # tolerate a null thumbnail list and skip entries lacking the mandatory URL
    thumbnails = [
        {
            'id': tn.get('name'),  # this appears to be null in all cases I've encountered
            'url': tn['url'],
            'width': tn.get('width'),
            'height': tn.get('height'),
        }
        for tn in video_meta.get('thumbnails') or []
        if tn.get('url')
    ]

    return {
        'id': video_id,
        'display_id': display_id,

        # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is
        # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than
        # whatever the Zype extractor is able to identify
        '_type': 'url_transparent',
        'ie_key': 'Zype',
        'url': video_url,

        # the meta data we were able to extract from Nebula
        'title': video_meta.get('title'),
        'description': video_meta.get('description'),
        'timestamp': parse_iso8601(video_meta.get('published_at')),
        'thumbnails': thumbnails,
        'duration': video_meta.get('duration'),
        'channel': channel_title,
        # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes
        # it's more of a series
        'uploader': channel_title,
        # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I
        #       cannot figure out where it gets it from!
        # TODO: channel_id
        # TODO: channel_url
    }