Mirror of https://github.com/ytdl-org/youtube-dl.git, synced 2024-06-26 13:49:38 +00:00

Compare commits


13 commits

Author SHA1 Message Date
qkolj c4e47796c8
Merge c8b62b28d6 into 0153b387e5 2024-06-15 01:17:20 +08:00
Paper 0153b387e5
[VidLii] Add 720p support (#30924)
* [VidLii] Add HD support  (yt-dlp backport-ish)

* Also fix a bug with the view count

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-06-11 13:21:39 +01:00
dirkf a48fe7491d [ORF] Skip tests with limited availability 2024-06-11 12:52:13 +01:00
dirkf e20ca543f0 [ORF] Re-factor and update ORFFM4StoryIE
* fix getting media via DASH instead of inaccessible mp4
* also get in-page YT media
2024-06-11 12:52:13 +01:00
dirkf e39466051f [ORF] Support sound.orf.at, updating ORFRadioIE
* maintain support for xx.orf.at/player/... URLs
* add `ORFRadioCollectionIE` to support playlists in ORF Sound
* back-port and re-work `ORFPodcastIE` from https://github.com/yt-dlp/yt-dlp/pull/8486, thx Esokrates
2024-06-11 12:52:13 +01:00
dirkf d95c0d203f [ORF] Support on.orf.at, replacing ORFTVthekIE
* add `ORFONIE`, back-porting yt-dlp PR https://github.com/yt-dlp/yt-dlp/pull/9113 and friends: thx HobbyistDev, TuxCoder, seproDev
* re-factor to support livestreams via new `ORFONliveIE`
2024-06-11 12:52:13 +01:00
dirkf 3bde6a5752 [test] Improve download test
* skip reason can't be unicode in Py2
* remove duplicate assert...Equal functions
2024-06-11 12:52:13 +01:00
dirkf 50f6c5668a [core] Re-factor with _fill_common_fields() as used in yt-dlp 2024-06-11 12:52:13 +01:00
dirkf b4ff08bd2d [core] Safer handling of nested playlist data 2024-06-11 12:52:13 +01:00
kmnx 88bd8b9f87
[mixcloud] updated mixcloud API server address (#32557)
* updated mixcloud API server address
* fix tests
* etc

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-06-11 12:38:24 +01:00
Petar Kukolj c8b62b28d6 Merge branch 'master' of https://github.com/rg3/youtube-dl into librivox 2018-10-02 01:42:48 +02:00
Petar Kukolj d9f4cc8b3e [LibriVox] Added description extraction, added some fallbacks and some regex improvements 2017-10-01 16:58:51 +02:00
Petar Kukolj aa016336a8 [LibriVox] Add new extractor 2017-09-20 03:02:02 +02:00
8 changed files with 843 additions and 570 deletions

test/helper.py

@@ -5,9 +5,9 @@ import hashlib
 import json
 import os.path
 import re
-import types
 import ssl
 import sys
+import types
 import unittest

 import youtube_dl.extractor
@@ -181,18 +181,18 @@ def expect_value(self, got, expected, field):
         op, _, expected_num = expected.partition(':')
         expected_num = int(expected_num)
         if op == 'mincount':
-            assert_func = assertGreaterEqual
+            assert_func = self.assertGreaterEqual
             msg_tmpl = 'Expected %d items in field %s, but only got %d'
         elif op == 'maxcount':
-            assert_func = assertLessEqual
+            assert_func = self.assertLessEqual
             msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
         elif op == 'count':
-            assert_func = assertEqual
+            assert_func = self.assertEqual
             msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
         else:
             assert False
         assert_func(
-            self, len(got), expected_num,
+            len(got), expected_num,
             msg_tmpl % (expected_num, field, len(got)))
         return
     self.assertEqual(
@@ -262,27 +262,6 @@ def assertRegexpMatches(self, text, regexp, msg=None):
     self.assertTrue(m, msg)


-def assertGreaterEqual(self, got, expected, msg=None):
-    if not (got >= expected):
-        if msg is None:
-            msg = '%r not greater than or equal to %r' % (got, expected)
-        self.assertTrue(got >= expected, msg)
-
-
-def assertLessEqual(self, got, expected, msg=None):
-    if not (got <= expected):
-        if msg is None:
-            msg = '%r not less than or equal to %r' % (got, expected)
-        self.assertTrue(got <= expected, msg)
-
-
-def assertEqual(self, got, expected, msg=None):
-    if not (got == expected):
-        if msg is None:
-            msg = '%r not equal to %r' % (got, expected)
-        self.assertTrue(got == expected, msg)
-
-
 def expect_warnings(ydl, warnings_re):
     real_warning = ydl.report_warning

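Taken together, the helper change means the count checks in expect_value() now go through unittest's own assertion methods instead of the removed module-level shims. A standalone sketch of that dispatch (illustrative only; got/expected are placeholder values, not part of the test suite):

import unittest


class CountChecks(unittest.TestCase):
    def check_count(self, got, expected):
        # same dispatch as expect_value() above, now using bound TestCase methods
        op, _, expected_num = expected.partition(':')
        expected_num = int(expected_num)
        assert_func, msg_tmpl = {
            'mincount': (self.assertGreaterEqual,
                         'Expected %d items in field %s, but only got %d'),
            'maxcount': (self.assertLessEqual,
                         'Expected maximum %d items in field %s, but got %d'),
            'count': (self.assertEqual,
                      'Expected exactly %d items in field %s, but got %d'),
        }[op]
        assert_func(len(got), expected_num,
                    msg_tmpl % (expected_num, 'entries', len(got)))

    def test_mincount(self):
        self.check_count(['a', 'b', 'c'], 'mincount:2')  # passes: 3 >= 2


if __name__ == '__main__':
    unittest.main()
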
test/test_download.py

@@ -9,8 +9,6 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from test.helper import (
-    assertGreaterEqual,
-    assertLessEqual,
     expect_warnings,
     get_params,
     gettestcases,
@@ -36,12 +34,20 @@ from youtube_dl.utils import (
     ExtractorError,
     error_to_compat_str,
     format_bytes,
+    IDENTITY,
+    preferredencoding,
     UnavailableVideoError,
 )
 from youtube_dl.extractor import get_info_extractor

 RETRIES = 3
+
+# Some unittest APIs require actual str
+if not isinstance('TEST', str):
+    _encode_str = lambda s: s.encode(preferredencoding())
+else:
+    _encode_str = IDENTITY


 class YoutubeDL(youtube_dl.YoutubeDL):
     def __init__(self, *args, **kwargs):
@@ -102,7 +108,7 @@ def generator(test_case, tname):

         def print_skipping(reason):
             print('Skipping %s: %s' % (test_case['name'], reason))
-            self.skipTest(reason)
+            self.skipTest(_encode_str(reason))

         if not ie.working():
             print_skipping('IE marked as not _WORKING')
@@ -187,16 +193,14 @@ def generator(test_case, tname):
             expect_info_dict(self, res_dict, test_case.get('info_dict', {}))

             if 'playlist_mincount' in test_case:
-                assertGreaterEqual(
-                    self,
+                self.assertGreaterEqual(
                     len(res_dict['entries']),
                     test_case['playlist_mincount'],
                     'Expected at least %d in playlist %s, but got only %d' % (
                         test_case['playlist_mincount'], test_case['url'],
                         len(res_dict['entries'])))
             if 'playlist_maxcount' in test_case:
-                assertLessEqual(
-                    self,
+                self.assertLessEqual(
                     len(res_dict['entries']),
                     test_case['playlist_maxcount'],
                     'Expected at most %d in playlist %s, but got %d' % (
@@ -243,8 +247,8 @@ def generator(test_case, tname):
                        if params.get('test'):
                            expected_minsize = max(expected_minsize, 10000)
                        got_fsize = os.path.getsize(tc_filename)
-                        assertGreaterEqual(
-                            self, got_fsize, expected_minsize,
+                        self.assertGreaterEqual(
+                            got_fsize, expected_minsize,
                             'Expected %s to be at least %s, but it\'s only %s ' %
                             (tc_filename, format_bytes(expected_minsize),
                              format_bytes(got_fsize)))

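A rough standalone sketch of the _encode_str() shim added above: under Python 2 with unicode_literals the skip reason is a unicode string, but some unittest APIs want an actual str, so it gets byte-encoded; on Python 3 it passes through untouched. The two stand-ins below are simplified assumptions for youtube_dl.utils.preferredencoding and IDENTITY, not their exact implementations:

from __future__ import unicode_literals

import locale


def preferredencoding():
    # simplified stand-in for youtube_dl.utils.preferredencoding
    return locale.getpreferredencoding() or 'UTF-8'


IDENTITY = lambda x: x

# On Python 2 the literal 'TEST' is unicode here (unicode_literals), not str,
# so skip reasons must be encoded; on Python 3 str is already unicode.
if not isinstance('TEST', str):
    _encode_str = lambda s: s.encode(preferredencoding())
else:
    _encode_str = IDENTITY

print(_encode_str('IE marked as not _WORKING'))
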
youtube_dl/YoutubeDL.py

@@ -1039,8 +1039,8 @@ class YoutubeDL(object):
         elif result_type in ('playlist', 'multi_video'):
             # Protect from infinite recursion due to recursively nested playlists
             # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
-            webpage_url = ie_result['webpage_url']
-            if webpage_url in self._playlist_urls:
+            webpage_url = ie_result.get('webpage_url')  # not all pl/mv have this
+            if webpage_url and webpage_url in self._playlist_urls:
                 self.to_screen(
                     '[download] Skipping already downloaded playlist: %s'
                     % ie_result.get('title') or ie_result.get('id'))
@@ -1048,6 +1048,10 @@ class YoutubeDL(object):

             self._playlist_level += 1
             self._playlist_urls.add(webpage_url)
+            new_result = dict((k, v) for k, v in extra_info.items() if k not in ie_result)
+            if new_result:
+                new_result.update(ie_result)
+                ie_result = new_result
             try:
                 return self.__process_playlist(ie_result, download)
             finally:
@@ -1593,6 +1597,28 @@ class YoutubeDL(object):
         self.cookiejar.add_cookie_header(pr)
         return pr.get_header('Cookie')

+    def _fill_common_fields(self, info_dict, final=True):
+
+        for ts_key, date_key in (
+                ('timestamp', 'upload_date'),
+                ('release_timestamp', 'release_date'),
+        ):
+            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+                # see http://bugs.python.org/issue1646728)
+                try:
+                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+                    info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
+                except (ValueError, OverflowError, OSError):
+                    pass
+
+        # Auto generate title fields corresponding to the *_number fields when missing
+        # in order to always have clean titles. This is very common for TV series.
+        if final:
+            for field in ('chapter', 'season', 'episode'):
+                if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+                    info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
     def process_video_result(self, info_dict, download=True):
         assert info_dict.get('_type', 'video') == 'video'
@@ -1660,24 +1686,7 @@ class YoutubeDL(object):
         if 'display_id' not in info_dict and 'id' in info_dict:
             info_dict['display_id'] = info_dict['id']

-        for ts_key, date_key in (
-                ('timestamp', 'upload_date'),
-                ('release_timestamp', 'release_date'),
-        ):
-            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
-                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
-                # see http://bugs.python.org/issue1646728)
-                try:
-                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
-                    info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
-                except (ValueError, OverflowError, OSError):
-                    pass
-
-        # Auto generate title fields corresponding to the *_number fields when missing
-        # in order to always have clean titles. This is very common for TV series.
-        for field in ('chapter', 'season', 'episode'):
-            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
-                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+        self._fill_common_fields(info_dict)

         for cc_kind in ('subtitles', 'automatic_captions'):
             cc = info_dict.get(cc_kind)

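To see what the factored-out _fill_common_fields() does to an info_dict, here is a minimal standalone sketch mirroring the hunk above (the sample values are invented; the timestamp reuses the one from the mixcloud test below, so the resulting date is checkable):

import datetime

info_dict = {'timestamp': 1422987057, 'episode_number': 3}

# timestamp -> upload_date (and release_timestamp -> release_date), guarding
# against out-of-range values (http://bugs.python.org/issue1646728)
for ts_key, date_key in (('timestamp', 'upload_date'),
                         ('release_timestamp', 'release_date')):
    if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
        try:
            info_dict[date_key] = datetime.datetime.utcfromtimestamp(
                info_dict[ts_key]).strftime('%Y%m%d')
        except (ValueError, OverflowError, OSError):
            pass

# chapter/season/episode titles auto-generated from their *_number fields
for field in ('chapter', 'season', 'episode'):
    if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
        info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

print(info_dict['upload_date'])  # 20150203
print(info_dict['episode'])      # Episode 3
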
youtube_dl/extractor/extractors.py

@@ -616,6 +616,7 @@ from .lego import LEGOIE
 from .lemonde import LemondeIE
 from .lenta import LentaIE
 from .libraryofcongress import LibraryOfCongressIE
+from .librivox import LibriVoxIE
 from .libsyn import LibsynIE
 from .lifenews import (
     LifeNewsIE,
@@ -898,21 +899,13 @@ from .ooyala import (
 )
 from .ora import OraTVIE
 from .orf import (
-    ORFTVthekIE,
-    ORFFM4IE,
+    ORFONIE,
+    ORFONLiveIE,
     ORFFM4StoryIE,
-    ORFOE1IE,
-    ORFOE3IE,
-    ORFNOEIE,
-    ORFWIEIE,
-    ORFBGLIE,
-    ORFOOEIE,
-    ORFSTMIE,
-    ORFKTNIE,
-    ORFSBGIE,
-    ORFTIRIE,
-    ORFVBGIE,
-    ORFIPTVIE,
+    ORFPodcastIE,
+    ORFRadioIE,
+    ORFRadioCollectionIE,
 )
 from .outsidetv import OutsideTVIE
 from .packtpub import (

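For context: youtube_dl.extractor resolves extractors by these registered names, so the renames above change what get_info_extractor() can return. A tiny sketch, assuming a checkout with these changes applied:

from youtube_dl.extractor import gen_extractors, get_info_extractor

LibriVoxIE = get_info_extractor('LibriVox')  # the newly registered extractor
ORFONIE = get_info_extractor('ORFON')        # replaces the removed ORFTVthekIE

# every registered extractor instance, in match order
print(len(gen_extractors()))
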
youtube_dl/extractor/librivox.py (new file)

@@ -0,0 +1,54 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import orderedSet
+
+
+class LibriVoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P<id>(?P<title>(?:[^\-]*\-)+[^\-]*)\-by\-(?P<author>(-.*\-)*[^/]*))/?'
+    _TESTS = [{
+        'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/',
+        'info_dict': {
+            'id': 'the-art-of-war-by-sun-tzu',
+            'title': 'The Art Of War by Sun Tzu',
+            'description': '"The Art of War is a Chinese military treatise written during the 6th century BC by Sun Tzu. Composed of 13 chapters, each of which is devoted to one aspect of warfare, it has long been praised as the definitive work on military strategies and tactics of its time. The Art of War is one of the oldest and most famous studies of strategy and has had a huge influence on both military planning and beyond. The Art of War has also been applied, with much success, to business and managerial strategies." (summary from Wikipedia)'
+        },
+        'playlist_mincount': 7
+    }, {
+        'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/',
+        'info_dict': {
+            'id': 'alexander-the-great-by-jacob-abbott',
+            'title': 'Alexander The Great by Jacob Abbott',
+            'description': 'Alexander the Great was one of the most successful military commanders in history, and was undefeated in battle. By the time of his death, he had conquered most of the world known to the ancient Greeks.\nAlexander the Great is one of many biographies aimed at young people written by Jacob Abbott and his brother. The biographies are written in such a way that makes them appealing and easily accessible to everyone. - Written by Wikipedia and Lizzie Driver'
+        },
+        'playlist_mincount': 12
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        book_title = self._html_search_regex(
+            r'<h1>(?P<title>.+)</h1>', webpage, 'title', group='title', fatal=False).lower().title() or mobj.group('title').replace('-', ' ').strip().title()
+        author = mobj.group('author').replace('-', ' ').strip().title()
+        description = self._html_search_regex(
+            r'<div class=(["\'])description\1(^>)*>(<p(^>)*>)?(?P<desc>.+)(</p>)?</div>',
+            webpage, 'description', group='desc', fatal=False) or self._og_search_description(webpage)
+
+        info = {
+            'id': video_id,
+            '_type': 'playlist',
+            'title': book_title + ' by ' + author,
+            'description': description
+        }
+
+        links = orderedSet(re.findall(r'<a href=(["\'])(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)\1.*>(.*)</a>', webpage))
+        info['entries'] = [self.url_result(link[1], video_id=link[2], video_title=link[3]) for link in links]
+
+        return info

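A quick standalone check of the named groups the new extractor's _VALID_URL produces (regex copied verbatim from the hunk above):

import re

_VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P<id>(?P<title>(?:[^\-]*\-)+[^\-]*)\-by\-(?P<author>(-.*\-)*[^/]*))/?'

mobj = re.match(_VALID_URL, 'https://librivox.org/the-art-of-war-by-sun-tzu/')
print(mobj.group('id'))      # the-art-of-war-by-sun-tzu
print(mobj.group('title'))   # the-art-of-war
print(mobj.group('author'))  # sun-tzu

# the same slug-to-display-title fallback used in _real_extract
print(mobj.group('title').replace('-', ' ').strip().title())  # The Art Of War
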
youtube_dl/extractor/mixcloud.py

@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import itertools
@@ -10,7 +11,7 @@ from ..compat import (
     compat_ord,
     compat_str,
     compat_urllib_parse_unquote,
-    compat_zip
+    compat_zip as zip,
 )
 from ..utils import (
     int_or_none,
@@ -24,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
     def _call_api(self, object_type, object_fields, display_id, username, slug=None):
         lookup_key = object_type + 'Lookup'
         return self._download_json(
-            'https://www.mixcloud.com/graphql', display_id, query={
+            'https://app.mixcloud.com/graphql', display_id, query={
                 'query': '''{
   %s(lookup: {username: "%s"%s}) {
     %s
@@ -44,7 +45,7 @@ class MixcloudIE(MixcloudBaseIE):
             'ext': 'm4a',
             'title': 'Cryptkeeper',
             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
-            'uploader': 'Daniel Holbach',
+            'uploader': 'dholbach',  # was: 'Daniel Holbach',
             'uploader_id': 'dholbach',
             'thumbnail': r're:https?://.*\.jpg',
             'view_count': int,
@@ -57,7 +58,7 @@ class MixcloudIE(MixcloudBaseIE):
             'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
             'ext': 'mp3',
             'title': 'Caribou 7 inch Vinyl Mix & Chat',
-            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
+            'description': r're:Last week Dan Snaith aka Caribou swung by the Brownswood.{136}',
             'uploader': 'Gilles Peterson Worldwide',
             'uploader_id': 'gillespeterson',
             'thumbnail': 're:https?://.*',
@@ -65,6 +66,23 @@ class MixcloudIE(MixcloudBaseIE):
             'timestamp': 1422987057,
             'upload_date': '20150203',
         },
+        'params': {
+            'skip_download': '404 not found',
+        },
+    }, {
+        'url': 'https://www.mixcloud.com/gillespeterson/carnival-m%C3%BAsica-popular-brasileira-mix/',
+        'info_dict': {
+            'id': 'gillespeterson_carnival-música-popular-brasileira-mix',
+            'ext': 'm4a',
+            'title': 'Carnival Música Popular Brasileira Mix',
+            'description': r're:Gilles was recently in Brazil to play at Boiler Room.{208}',
+            'timestamp': 1454347174,
+            'upload_date': '20160201',
+            'uploader': 'Gilles Peterson Worldwide',
+            'uploader_id': 'gillespeterson',
+            'thumbnail': 're:https?://.*',
+            'view_count': int,
+        },
     }, {
         'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
         'only_matching': True,
@@ -76,10 +94,10 @@ class MixcloudIE(MixcloudBaseIE):
         """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
         return ''.join([
             compat_chr(compat_ord(ch) ^ compat_ord(k))
-            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
+            for ch, k in zip(ciphertext, itertools.cycle(key))])

     def _real_extract(self, url):
-        username, slug = re.match(self._VALID_URL, url).groups()
+        username, slug = self._match_valid_url(url).groups()
         username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)

         track_id = '%s_%s' % (username, slug)

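The XOR helper touched above is its own inverse, which is why a single function covers both encryption and decryption. A plain Python 3 sketch (compat_chr/compat_ord are py2/py3 shims for chr/ord; the key below is a made-up placeholder, not the one the extractor uses):

import itertools


def xor_cipher(text, key):
    # XOR each character with the cycled key; applying it twice restores the input
    return ''.join(
        chr(ord(ch) ^ ord(k))
        for ch, k in zip(text, itertools.cycle(key)))


KEY = 'PLACEHOLDERKEY'  # hypothetical key for illustration only
scrambled = xor_cipher('https://stream.example/track.m4a', KEY)
assert xor_cipher(scrambled, KEY) == 'https://stream.example/track.m4a'
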
youtube_dl/extractor/orf.py (diff suppressed because it is too large)

youtube_dl/extractor/vidlii.py

@@ -4,6 +4,7 @@ from __future__ import unicode_literals

 import re

 from .common import InfoExtractor
 from ..utils import (
     float_or_none,
     get_element_by_id,
@@ -11,6 +12,7 @@ from ..utils import (
     strip_or_none,
     unified_strdate,
     urljoin,
+    str_to_int,
 )
@@ -35,6 +37,26 @@ class VidLiiIE(InfoExtractor):
             'categories': ['News & Politics'],
             'tags': ['Vidlii', 'Jan', 'Videogames'],
         }
+    }, {
+        # HD
+        'url': 'https://www.vidlii.com/watch?v=2Ng8Abj2Fkl',
+        'md5': '450e7da379c884788c3a4fa02a3ce1a4',
+        'info_dict': {
+            'id': '2Ng8Abj2Fkl',
+            'ext': 'mp4',
+            'title': 'test',
+            'description': 'md5:cc55a86032a7b6b3cbfd0f6b155b52e9',
+            'thumbnail': 'https://www.vidlii.com/usfi/thmp/2Ng8Abj2Fkl.jpg',
+            'uploader': 'VidLii',
+            'uploader_url': 'https://www.vidlii.com/user/VidLii',
+            'upload_date': '20200927',
+            'duration': 5,
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Film & Animation'],
+            'tags': list,
+        },
     }, {
         'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
         'only_matching': True,
@@ -46,11 +68,32 @@ class VidLiiIE(InfoExtractor):
         webpage = self._download_webpage(
             'https://www.vidlii.com/watch?v=%s' % video_id, video_id)

-        video_url = self._search_regex(
-            r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
-            'video url', group='url')
+        formats = []

-        title = self._search_regex(
+        def add_format(format_url, height=None):
+            height = int(self._search_regex(r'(\d+)\.mp4',
+                         format_url, 'height', default=360))
+
+            formats.append({
+                'url': format_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+            })
+
+        sources = re.findall(
+            r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
+            webpage)
+
+        formats = []
+        if len(sources) > 1:
+            add_format(sources[1][1])
+            self._check_formats(formats, video_id)
+        if len(sources) > 0:
+            add_format(sources[0][1])
+
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
             (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
             'title')
@@ -82,9 +125,9 @@ class VidLiiIE(InfoExtractor):
             default=None) or self._search_regex(
             r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

-        view_count = int_or_none(self._search_regex(
-            (r'<strong>(\d+)</strong> views',
-             r'Views\s*:\s*<strong>(\d+)</strong>'),
+        view_count = str_to_int(self._html_search_regex(
+            (r'<strong>([\d,.]+)</strong> views',
+             r'Views\s*:\s*<strong>([\d,.]+)</strong>'),
             webpage, 'view count', fatal=False))

         comment_count = int_or_none(self._search_regex(
@@ -109,7 +152,7 @@ class VidLiiIE(InfoExtractor):

         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
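
The view-count switch from int_or_none to str_to_int handles thousands separators, e.g. a page rendering '1,337', which int() alone rejects. A simplified stand-in for youtube_dl.utils.str_to_int (an approximation, not the exact implementation):

import re


def str_to_int(int_str):
    """Strip ',', '.' and '+' before converting, as a rough sketch."""
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str) if int_str.isdigit() else None


print(str_to_int('1,337'))  # 1337
print(str_to_int('2.5'))    # 25 (separators are stripped, not parsed)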