2020-04-06 08:45:20 +00:00
# coding: utf-8
from __future__ import unicode_literals
from . common import InfoExtractor
from . . utils import ExtractorError
class MicrosoftStreamBaseIE(InfoExtractor):
    """Shared helpers for Microsoft Stream extractors.

    Provides the logged-in check, the refusal of username/password log-in
    (only cookie log-in is supported) and the regex helpers that pull the
    API access token and API gateway URI out of a downloaded page.
    """

    _LOGIN_URL = 'https://web.microsoftstream.com/?noSignUpCheck=1'  # expect redirection
    _EXPECTED_TITLE = '<title>Microsoft Stream</title>'

    # Populated by _extract_access_token() / _extract_api_gateway(); declared
    # here so the attributes exist on every subclass before extraction runs.
    _ACCESS_TOKEN = None
    _API_GATEWAY = None

    def is_logged_in(self, webpage):
        """Return True when *webpage* contains the expected Stream <title> tag."""
        return self._EXPECTED_TITLE in webpage

    def _real_initialize(self):
        """Reject username/password log-in, which this extractor cannot perform.

        Raises:
            ExtractorError: if a username or a password was supplied.
        """
        username, password = self._get_login_info()
        if username is None and password is None:
            return
        # expected=True: this is a usage problem, not an extractor bug, so the
        # user should not be asked to file a bug report.
        raise ExtractorError(
            'MicrosoftStream Extractor does not support username/password log-in at the moment. '
            'Please use cookies log-in instead. '
            'See https://github.com/ytdl-org/youtube-dl/blob/master/README.md#how-do-i-pass-cookies-to-youtube-dl '
            'for more information',
            expected=True)

    # Extraction helpers

    def _extract_access_token(self, webpage):
        """Find the "AccessToken" value in *webpage*, store it and return it.

        The result is kept on ``self._ACCESS_TOKEN``.
        """
        self._ACCESS_TOKEN = self._html_search_regex(
            r'"AccessToken":"(?P<AccessToken>.+?)"',
            webpage, 'AccessToken', group='AccessToken')
        return self._ACCESS_TOKEN

    def _extract_api_gateway(self, webpage):
        """Find the "ApiGatewayUri" value in *webpage*, store it and return it.

        The result is kept on ``self._API_GATEWAY``.
        """
        self._API_GATEWAY = self._html_search_regex(
            r'"ApiGatewayUri":"(?P<APIGateway>.+?)"',
            webpage, 'APIGateway', group='APIGateway')
        return self._API_GATEWAY
2020-04-06 08:45:20 +00:00
class MicrosoftStreamIE(MicrosoftStreamBaseIE):
    """Extractor for a single Microsoft Stream video.

    Flow of ``_real_extract``: download the video page, check that the session
    is logged in, scrape the access token and API gateway from the page, then
    query the gateway for the media description and the text tracks.
    """

    IE_NAME = 'microsoftstream'
    _VALID_URL = r'https?://(?:(?:web|www)\.)?microsoftstream\.com/video/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'  # https://regex101.com/r/K1mlgK/1/
    _NETRC_MACHINE = 'microsoftstream'

    _ACCESS_TOKEN = None  # A JWT token
    _API_GATEWAY = None
    _TEXTTRACKS_RESPONSE = None
    _VIDEO_ID = None

    _TEST = {
        'url': 'https://web.microsoftstream.com/video/c883c6a5-9895-4900-9a35-62f4b5d506c9',
        'info_dict': {
            'id': 'c883c6a5-9895-4900-9a35-62f4b5d506c9',
            'ext': 'mp4',
            'title': 'Webinar for Researchers: Use of GitLab',
            'thumbnail': r're:^https?://.*$',
        }
    }

    # Getters
    # Each getter raises with a hint for developers when the value it exposes
    # has not been populated yet.

    @property
    def api_gateway(self):
        """API gateway URI scraped from the page.

        Raises:
            ExtractorError: if ``_extract_api_gateway`` has not stored a value.
        """
        if self._API_GATEWAY is None:
            raise ExtractorError('API gateway is None. Did you forget to call "_extract_api_gateway"?')
        return self._API_GATEWAY

    @property
    def access_token(self):
        """Access token scraped from the page.

        Raises:
            ExtractorError: if ``_extract_access_token`` has not stored a value.
        """
        if self._ACCESS_TOKEN is None:
            raise ExtractorError('Access token is None. Did you forget to call "_extract_access_token"?')
        return self._ACCESS_TOKEN

    @property
    def video_id(self):
        """ID of the video currently being extracted.

        Raises:
            ExtractorError: if ``_real_extract`` has not set ``_VIDEO_ID`` yet.
        """
        if self._VIDEO_ID is None:
            # Must be a real exception object: raising a plain string is
            # itself a TypeError and would hide this message.
            raise ExtractorError('Variable "_VIDEO_ID" is not defined. Did you make the main extraction call?')
        return self._VIDEO_ID

    @property
    def headers(self):
        """A fresh dict holding the ``Authorization: Bearer <token>`` header."""
        return {'Authorization': 'Bearer %s' % self.access_token}

    @property
    def texttrack_info_endpoint(self):
        """URL of the text-track listing for the current video."""
        return '%s/videos/%s/texttracks?api-version=1.3-private' % (self.api_gateway, self.video_id)

    @property
    def media_info_endpoint(self):
        """URL of the media description for the current video."""
        return '%s/videos/%s?$expand=creator,tokens,status,liveEvent,extensions&api-version=1.3-private' % (
            self.api_gateway, self.video_id)

    def _request_texttracks(self):
        """Fetch the text-track listing (subtitles and auto-captions).

        The ``value`` member of the JSON response is cached on
        ``self._TEXTTRACKS_RESPONSE`` and returned; a response without that
        member yields an empty list.
        """
        response = self._download_json(
            self.texttrack_info_endpoint, self.video_id, headers=self.headers)
        self._TEXTTRACKS_RESPONSE = response.get('value') or []
        return self._TEXTTRACKS_RESPONSE

    def _determine_protocol(self, mime):
        """Map a playlist MIME type to a protocol name, or None if unsupported."""
        if mime == 'application/dash+xml':
            return 'http_dash_segments'
        if mime == 'application/vnd.apple.mpegurl':
            return 'm3u8'
        return None

    def _remap_thumbnails(self, thumbnail_dict_list):
        """Convert the API's size-keyed poster mapping into a thumbnails list.

        Args:
            thumbnail_dict_list: mapping of size name to a dict carrying a
                ``url`` entry (despite its name this is a dict, not a list);
                may be None.

        Returns:
            A list of ``{'preference': int, 'url': str}`` dicts. Entries
            without a URL are skipped; size names outside the known ordering
            get preference -1 instead of aborting the extraction.
        """
        size_order = ('extraSmall', 'small', 'medium', 'large')
        thumbnails = []
        for size, image in (thumbnail_dict_list or {}).items():
            image_url = (image or {}).get('url')
            if not image_url:
                continue
            thumbnails.append({
                'preference': size_order.index(size) if size in size_order else -1,
                'url': image_url,
            })
        return thumbnails

    def _remap_playback(self, master_playlist_urls):
        """Expand the HLS and MPEG-DASH master playlists into a formats list.

        Args:
            master_playlist_urls: iterable of dicts with ``mimeType`` and
                ``playbackUrl`` entries.

        Returns:
            A flat list of format dicts, each carrying the Authorization
            header in ``http_headers``. Playlists of any other type (for
            example Microsoft Smooth Streaming) are reported and skipped.
        """
        formats = []
        for playlist in master_playlist_urls:
            mime_type = playlist.get('mimeType')
            protocol = self._determine_protocol(mime_type)
            if protocol == 'm3u8':
                variants = self._extract_m3u8_formats(
                    playlist['playbackUrl'], video_id=self.video_id, headers=self.headers)
            elif protocol == 'http_dash_segments':
                variants = self._extract_mpd_formats(
                    playlist['playbackUrl'], video_id=self.video_id, headers=self.headers)
            else:
                self.to_screen('Found unresolvable stream with format %s' % mime_type)
                continue
            # The media requests need the same Authorization header.
            for variant in variants:
                variant['http_headers'] = self.headers
                formats.append(variant)
        return formats

    def _extract_subtitle(self, tracks, is_auto_generated):
        """Filter text tracks by kind and download their contents.

        Args:
            tracks: iterable of track dicts with ``language``, ``url`` and
                ``autoGenerated`` entries; may be None.
            is_auto_generated: True selects auto-captions, False selects
                regular subtitles.

        Returns:
            A dict mapping language code to a list of
            ``{'data': <track body>, 'ext': 'vtt'}`` entries.

        Raises:
            ExtractorError: if *is_auto_generated* is not a bool.
        """
        if not isinstance(is_auto_generated, bool):
            raise ExtractorError('Unexpected variable "is_auto_generated" type: must be a Boolean')

        subtitles = {}
        for track in tracks or []:
            # Identity comparison on purpose: a track whose flag is missing
            # belongs to neither kind, exactly as before.
            if track.get('autoGenerated') is not is_auto_generated:
                continue
            language = track.get('language')
            track_url = track.get('url')
            if not language or not track_url:
                # Nothing to file the track under, or nothing to download.
                continue
            # The track is fetched with the Authorization header, so its body
            # is downloaded here rather than handing out a bare URL.
            data = self._download_webpage(
                url_or_request=track_url, video_id=self.video_id, headers=self.headers)
            subtitles.setdefault(language, []).append({'data': data, 'ext': 'vtt'})
        return subtitles

    def _get_subtitles(self, tracks=None):  # Fulfill abstract method
        """Return regular subtitles; defaults to the cached text-track response."""
        tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks
        return self._extract_subtitle(tracks, False)

    def _get_automatic_captions(self, tracks=None):  # Fulfill abstract method
        """Return auto-generated captions; defaults to the cached text-track response."""
        tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks
        return self._extract_subtitle(tracks, True)

    def _real_extract(self, url):
        """Extract the info dict for the video at *url*.

        Raises (via the framework helpers) when the page shows that the
        session is not logged in or when a required value cannot be found.
        """
        self._VIDEO_ID = self._match_id(url)
        webpage = self._download_webpage(url, self.video_id)
        if not self.is_logged_in(webpage):
            self.raise_login_required()

        # Both values are needed before any API endpoint can be built.
        self._extract_access_token(webpage)
        self._extract_api_gateway(webpage)

        media_info = self._download_json(
            self.media_info_endpoint, self.video_id, headers=self.headers)
        texttracks = self._request_texttracks()

        formats = self._remap_playback(media_info['playbackUrls'])
        self._sort_formats(formats)

        return {
            'id': self.video_id,
            'title': media_info['name'],
            'description': media_info.get('description'),
            # 'creator' is optional in the response; do not crash without it.
            'uploader': (media_info.get('creator') or {}).get('name'),
            'thumbnails': self._remap_thumbnails(media_info.get('posterImage')),
            'formats': formats,
            # Use the framework wrappers around _get_subtitles() and
            # _get_automatic_captions() so track bodies are only downloaded
            # when the subtitle options ask for them.
            'subtitles': self.extract_subtitles(texttracks),
            'automatic_captions': self.extract_automatic_captions(texttracks),
            'is_live': False,
            # TODO: 'duration' is not populated; a media_info['media']['duration']
            # lookup was left commented out here and is unverified.
        }