Compare commits

...

7 Commits

Author SHA1 Message Date
TinyToweringTree c8b3e11cda
Merge d7c2b5dac8 into 668332b973 2024-04-25 11:50:49 +02:00
dirkf 668332b973 [YouPorn] Add playlist extractors
* YouPornCategoryIE
* YouPornChannelIE
* YouPornCollectionIE
* YouPornStarIE
* YouPornTagIE
* YouPornVideosIE,
2024-04-22 01:34:26 +01:00
dirkf 0b2ce3685e [YouPorn] Improve extraction
* detect unwatchable videos
* improve duration extraction
* fix count extraction and support large values
* detect and remove SEO spam boilerplate description
2024-04-22 01:34:26 +01:00
dirkf c2766cb80e [test/test_download] Support 'playlist_maxcount:count' expected value
* parallel to `playlist_mincount'
* specify both for a range of playlist lengths
* if max < min the test will always fail!
2024-04-22 01:34:26 +01:00
dirkf eb38665438 [YouPorn] Incorporate yt-dlp PR 8827
* from https://github.com/yt-dlp/yt-dlp/pull/8827
* extract from webpage instead of broken API URL
* thx The-MAGI
2024-04-22 01:34:26 +01:00
TinyToweringTree d7c2b5dac8 [ardaudiothek] Add an extractor for search results 2019-09-09 21:10:04 +02:00
TinyToweringTree cd2c7ab40e [ardaudiothek] Add support for the ARD Audiothek 2019-09-09 21:10:04 +02:00
4 changed files with 1020 additions and 48 deletions

View File

@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
assertGreaterEqual,
assertLessEqual,
expect_warnings,
get_params,
gettestcases,
@ -122,7 +123,10 @@ def generator(test_case, tname):
params['outtmpl'] = tname + '_' + params['outtmpl']
if is_playlist and 'playlist' not in test_case:
params.setdefault('extract_flat', 'in_playlist')
params.setdefault('playlistend', test_case.get('playlist_mincount'))
params.setdefault('playlistend',
test_case['playlist_maxcount'] + 1
if test_case.get('playlist_maxcount')
else test_case.get('playlist_mincount'))
params.setdefault('skip_download', True)
ydl = YoutubeDL(params, auto_init=False)
@ -190,6 +194,14 @@ def generator(test_case, tname):
'Expected at least %d in playlist %s, but got only %d' % (
test_case['playlist_mincount'], test_case['url'],
len(res_dict['entries'])))
if 'playlist_maxcount' in test_case:
assertLessEqual(
self,
len(res_dict['entries']),
test_case['playlist_maxcount'],
'Expected at most %d in playlist %s, but got %d' % (
test_case['playlist_maxcount'], test_case['url'],
len(res_dict['entries'])))
if 'playlist_count' in test_case:
self.assertEqual(
len(res_dict['entries']),

View File

@ -0,0 +1,439 @@
# coding: utf-8
from __future__ import unicode_literals
import re
try:
from urllib.parse import unquote as _unquote_compat
except ImportError:
from urllib import unquote
def _unquote_compat(str):
return unquote(str.encode('utf-8')).decode('utf-8')
from .common import InfoExtractor
from ..utils import (
compat_str,
ExtractorError,
int_or_none,
parse_duration,
str_or_none,
try_get,
unified_strdate,
unified_timestamp,
)
class ARDAudiothekBaseIE(InfoExtractor):
    """Shared helpers for the ARD Audiothek extractors.

    Maps the site's JSON API objects to youtube-dl info dicts; used by both
    the single-episode extractor and the playlist/search extractors.
    """

    def _extract_episode_info(self, title):
        """Try to extract episode data from the title."""
        res = {}
        if not title:
            return res
        # Patterns are tried in order; the first match wins.
        for pattern in [
            # "... (S01/E02) ..."
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # "... (7)", "... (Folge 7)", "... (Teil 7/9)"
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            # 'Folge 7: "Name"' - the quoted part becomes the episode name
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # "Folge 7/9: Name" (unquoted)
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
        ]:
            m = re.match(pattern, title)
            if m:
                groupdict = m.groupdict()
                for int_entry in ['season_number', 'episode_number']:
                    res[int_entry] = int_or_none(groupdict.get(int_entry))
                for str_entry in ['episode']:
                    res[str_entry] = str_or_none(groupdict.get(str_entry))
                # Build the episode title by removing numeric episode
                # information.
                if groupdict.get('ep_info') and not res['episode']:
                    res['episode'] = str_or_none(
                        title.replace(groupdict.get('ep_info'), ''))
                if res['episode']:
                    res['episode'] = res['episode'].strip()
                break
        # As a fallback use the whole title as the episode name
        if not res.get('episode'):
            res['episode'] = title.strip()
        return res

    def _extract_id_title_desc(self, json_data):
        """Extract the metadata common to episode and podcast JSON objects."""
        res = {
            'id': try_get(json_data, lambda x: x['id'], compat_str),
            'display_id': try_get(json_data, lambda x: x['slug'], compat_str),
        }
        res['title'] = try_get(
            json_data, lambda x: x['title'], compat_str)
        res['description'] = try_get(
            json_data, lambda x: x['summary'], compat_str)
        return res

    def _extract_episode(self, ep_data):
        """Map an episode JSON object to an info dict.

        Raises:
            ExtractorError: when no downloadable media URL can be found.
        """
        res = self._extract_id_title_desc(ep_data)
        # Prefer the download URL, then the playback URL, then 'guid'
        res['url'] = try_get(ep_data, [
            lambda x: x['enclosure']['download_url'],
            lambda x: x['enclosure']['playback_url'],
            lambda x: x['guid'],
        ], compat_str)
        if not res['url']:
            raise ExtractorError(msg='Could not find a URL to download',
                                 expected=True)
        res['format_note'] = try_get(
            ep_data, lambda x: x['enclosure']['type'], compat_str)
        res['duration'] = parse_duration(
            try_get(ep_data, lambda x: x['duration'], compat_str))
        res['release_date'] = unified_strdate(
            try_get(ep_data, lambda x: x['publication_date'], compat_str))
        res['timestamp'] = unified_timestamp(
            try_get(ep_data, lambda x: x['publication_date'], compat_str))
        res['channel'] = try_get(ep_data, [
            lambda x: x['podcast']['station'],
            lambda x: x['podcast']['organization_name'],
        ], compat_str)
        # 'sharing_url' might be a redirecting URL. The generic extractor will
        # handle the redirection just fine, so that this extractor here will
        # be used.
        res['webpage_url'] = try_get(
            ep_data, lambda x: x['sharing_url'], compat_str)
        res['categories'] = [
            try_get(ep_data, lambda x: x['podcast']['category'], compat_str),
        ]
        res['is_live'] = False
        res['series'] = try_get(ep_data,
                                lambda x: x['podcast']['title'],
                                compat_str)

        def make_thumbnail(url, id, preference):
            # Note that the images don't necessarily have the advertised
            # aspect ratio! So don't set the height based on the aspect
            # ratio.
            # Also note that the server will not return an image of any given
            # width. Most multiples of 32 (or of 64 for higher numbers) seem to
            # work. When requesting a width of 1080, the server returns an
            # image with a width of 1024, for instance. Requesting 1400 gives
            # us 1344, and so on. So a width of 1920 works best for both 1x1
            # and 16x9 images.
            thumb_width = 1920
            return {
                'id': id,
                # Only set the width if we actually replace the {width}
                # placeholder in the URL.
                'width': thumb_width if '{width}' in url else None,
                'url': url.replace('{width}', str(thumb_width)),
                'preference': preference,
            }

        # We prefer 1x1 images and we prefer episode images. But still provide
        # all available images so that the user can choose. We use the
        # thumbnail's 'preference' entry to sort them (the higher the better).
        # The preferred thumbnail order is:
        # (0) podcast-16x9 < (1) episode-16x9
        # < (2) podcast-1x1 < (3) episode-1x1
        thumbnails = []
        for ar_index, aspect_ratio in enumerate(['16x9', '1x1']):
            image_key = 'image_%s' % aspect_ratio
            # NB: these lambdas capture image_key late, but they are only
            # invoked (by try_get below) within this same loop iteration,
            # so the current value is used.
            image_sources = [
                {'name': 'podcast',
                 'access': lambda x: x['podcast'][image_key]},
                {'name': 'episode',
                 'access': lambda x: x[image_key]},
            ]
            for src_index, src in enumerate(image_sources):
                thumb_url = try_get(ep_data, src['access'], compat_str)
                if thumb_url:
                    thumbnails.append(make_thumbnail(
                        thumb_url,
                        src['name'] + '-' + aspect_ratio,
                        ar_index * len(image_sources) + src_index))
        res['thumbnails'] = thumbnails
        res.update(self._extract_episode_info(res.get('title')))
        return res
class ARDAudiothekIE(ARDAudiothekBaseIE):
    """Extractor for a single ARD Audiothek episode page."""
    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/(?:[^/]+)/(?:[^/]+)/(?P<id>[0-9]+)(?:/.*)?'
    _TESTS = [{
        'url': 'https://www.ardaudiothek.de/hoerspiel-pool/virginia-woolf-zum-leuchtturm-1-3-die-tuer-aus-glas/53728640',
        'md5': 'dc12a86bb46faadbdba7a8c9b5a24246',
        'info_dict': {
            'id': '53728640',
            'ext': 'mp3',
            'title': 'Virginia Woolf: Zum Leuchtturm (1/3) - Die Tür aus Glas',
            'description': r're:^Am Anfang steht die Frage.*',
            'thumbnail': compat_str,
            'timestamp': 1478818860,
            'upload_date': '20161110',
        }
    }, {
        'url': 'https://www.ardaudiothek.de/eine-stunde-talk/soziologe-matthias-quent-nicht-neutral-gegenueber-rechtsradikalismus/65904422',
        'md5': '326065e45e8172124165c3b0addd4553',
        'info_dict': {
            'id': '65904422',
            'ext': 'mp3',
            'title': 'Soziologe Matthias Quent - Nicht neutral gegenüber Rechtsradikalismus',
            'description': r're:^Matthias Quent erforscht die Ziele.*',
            'thumbnail': compat_str,
            'timestamp': 1565809200,
            'upload_date': '20190814',
        }
    }]

    def _real_extract(self, url):
        # Ask the episode API endpoint for the metadata of this episode
        episode_id = self._match_id(url)
        api_response = self._download_json(
            'https://www.ardaudiothek.de/api/episodes/%s' % episode_id,
            episode_id, fatal=False)
        episode = try_get(
            api_response, lambda x: x['result']['episode'], dict)
        if not episode:
            raise ExtractorError(msg="Could not find any episode data",
                                 expected=True)
        return self._extract_episode(episode)
class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
    """Extractor for an ARD Audiothek podcast page (playlist of episodes)."""
    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/(?!kategorie)(?:[^/]+)/(?P<id>[0-9]+)(?:/.*)?'
    _TESTS = [{
        'url': 'https://www.ardaudiothek.de/wirtschaft/62037362',
        'info_dict': {
            'id': '62037362',
            'title': 'Wirtschaft',
            'description': compat_str,
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.ardaudiothek.de/redezeit/7852070',
        'info_dict': {
            'id': '7852070',
            'title': 'Redezeit',
            'description': compat_str,
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.ardaudiothek.de/nur-fuer-starke-nerven-krimis-und-thriller/51581650/alle',
        'info_dict': {
            'id': '51581650',
            'title': r're:^Nur für starke Nerven',
            'description': compat_str,
        },
        'playlist_mincount': 5,
    }]

    def _get_page_str(self, page):
        # The API sometimes returns 404s for page=1. So only add that
        # parameter if we actually are past the first page
        return '&page=' + compat_str(page) if page > 1 else ''

    def _get_episode_from_array_entry(self, array_entry):
        # The array entry already is a an 'episode' dict.
        return array_entry

    def _extract_episodes(
            self, display_id, api_url_template, default_items_per_page):
        """
        Extract episodes by calling a web API end point.

        Sometimes the server does not respond properly when requesting a page.
        This also happens on the website. It sometimes hangs when trying to
        load more search results, for instance. Thus the number of entries
        reported by the API is often wrong and we do not solely rely on that
        number to stop reading episodes.

        This function handles paginated content in a robust way by skipping
        over faulty server responses. In this case it reduces the page size to
        get as many episodes as possible. It also removes duplicate entries
        from the result.

        Args:
            display_id: Only used for user feedback.
            api_url_template: This is the URL of the API to download JSON data
                from. It is a format string expected to have the following
                fields:
                - {items_per_page}
                - {page_str}
            default_items_per_page: The number of items to fetch per page.
                It is best to set this to the same value that is used by the
                website when accessing the API. This function automatically
                reduces the number of items per page when the server responds
                with errors or missing data.

        Returns:
            A list of extracted episode dicts to be used as playlist entries.

        Raises:
            ExtractorError: Might be raised when extracting episode data.
        """
        items_per_page = default_items_per_page
        page = 1
        entries = []
        # The number of entries as reported by the API
        n_entries = None
        # The API sometimes returns an empty page without any episodes. In this
        # case the next page often has episodes. This, however, throws off
        # the total number of entries and it no longer becomes a reliable
        # stopping condition when comparing it with the number of entries
        # reported by the API. So we deal with this by not stopping at the
        # first occurrence of an empty page. We skip over a certain number of
        # empty pages before giving up.
        max_n_skipped_pages = default_items_per_page + 3
        n_skipped_pages = 0
        while True:
            # We need this to check if we actually added any entries
            n_entries_before_this_page = len(entries)
            # Fetch data
            api_url = api_url_template.format(
                page_str=self._get_page_str(page),
                items_per_page=items_per_page)
            result_data = self._download_json(api_url, display_id, fatal=False)
            episodes = try_get(result_data,
                               lambda x: x['result']['episodes'],
                               list)
            # Add entries (deduplicated; pages may overlap after throttling)
            for episode in episodes or []:
                entry = self._extract_episode(
                    self._get_episode_from_array_entry(episode))
                if entry not in entries:
                    entries.append(entry)
            # Fetch how many episodes the API says it has (it's enough to
            # read it once)
            n_entries = n_entries if n_entries is not None else try_get(
                result_data,
                lambda x: x['result']['meta']['episodes']['total'],
                int)
            # Check if we have read the reported number of episodes
            if n_entries is not None and len(entries) >= n_entries:
                break
            # Check if we actually added any entries
            if n_entries_before_this_page == len(entries):
                # This was an empty page so we have to skip it
                n_skipped_pages += 1
                if n_skipped_pages >= max_n_skipped_pages:
                    # Enough skipping, give up
                    break
                # Throttle by reading only half as many entries as before
                if items_per_page > 1:
                    new_items_per_page = int(max(1, items_per_page / 2))
                    # Recompute the page index so that the smaller pages
                    # continue from (roughly) the same episode offset
                    page = int((page - 1) * items_per_page /
                               new_items_per_page)
                    items_per_page = new_items_per_page
            else:
                # This page had episodes, so we're no longer skipping
                n_skipped_pages = 0
                # Try to go back to full speed by going back to the default
                # items_per_page value if possible.
                if items_per_page * page % default_items_per_page == 0:
                    page = int(page * items_per_page /
                               default_items_per_page)
                    items_per_page = default_items_per_page
            page += 1
        # Tell the user if we received less entries than the API reported
        if n_entries is not None and len(entries) < n_entries:
            self.to_screen('Received {} of {} reported episodes'.format(
                len(entries), n_entries))
        return entries

    def _real_extract(self, url):
        podcast_id = self._match_id(url)
        api_url = 'https://www.ardaudiothek.de/api/podcasts/%s' % podcast_id
        result_data = self._download_json(api_url, podcast_id, fatal=False)
        pc_data = try_get(result_data, lambda x: x['result']['podcast'], dict)
        if not pc_data:
            raise ExtractorError(msg="Could not find any playlist data",
                                 expected=True)
        res = self._extract_id_title_desc(pc_data)
        res['_type'] = 'playlist'
        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
        # The website calls the API with items_per_page set to 24. Setting it
        # to 500 or 1000 would download the data of all episodes in one or two
        # pages. Increasing this value might however trigger server errors in
        # the future. So to avoid any problems we will keep using the default
        # value and just download a few more pages.
        res['entries'] = self._extract_episodes(
            podcast_id,
            'https://www.ardaudiothek.de/api/podcasts/%s/episodes?items_per_page={items_per_page}{page_str}' % podcast_id,
            24)
        return res
class ARDAudiothekSearchIE(ARDAudiothekPlaylistIE):
    """Extractor for ARD Audiothek search result pages (?q=...)."""
    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/suche\?(?:(?!q=).*&)?q=(?P<id>[^&]+)(?:&.*)?'
    _TESTS = [{
        'url': 'https://www.ardaudiothek.de/suche?q=Sommer',
        'info_dict': {
            'id': 'Sommer',
            'title': 'Sommer',
            'description': compat_str,
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.ardaudiothek.de/suche?q=Angela%20Merkel',
        'info_dict': {
            'id': 'Angela%20Merkel',
            'title': 'Angela Merkel',
            'description': compat_str,
        },
        'playlist_mincount': 5,
    }]

    def _get_page_str(self, page):
        # Unlike the podcast API, the search API accepts a page parameter
        # on every request, including the first one
        return '&page=' + compat_str(page)

    def _get_episode_from_array_entry(self, array_entry):
        # Search results wrap each episode in a dict that also carries a
        # 'search_meta' entry; unwrap the actual 'episode' dict
        return try_get(array_entry, lambda entry: entry['episode'], dict)

    def _real_extract(self, url):
        query = self._match_id(url)
        query_text = _unquote_compat(query)
        result = {
            '_type': 'playlist',
            'id': query,
            'display_id': query_text,
            'title': query_text,
            'description': 'ARD Audiothek-Suche nach "' + query_text + '"',
        }
        # Searching on the website calls the API with items_per_page set
        # to 8. Other values sometimes cause server errors.
        result['entries'] = self._extract_episodes(
            query_text,
            'https://www.ardaudiothek.de/api/search/%s?focus=episodes{page_str}&items_per_page={items_per_page}' % query,
            8)
        return result

View File

@ -71,6 +71,11 @@ from .ard import (
ARDIE,
ARDMediathekIE,
)
from .ardaudiothek import (
ARDAudiothekIE,
ARDAudiothekPlaylistIE,
ARDAudiothekSearchIE,
)
from .arte import (
ArteTVIE,
ArteTVEmbedIE,
@ -1653,7 +1658,15 @@ from .younow import (
YouNowChannelIE,
YouNowMomentIE,
)
from .youporn import YouPornIE
from .youporn import (
YouPornIE,
YouPornCategoryIE,
YouPornChannelIE,
YouPornCollectionIE,
YouPornStarIE,
YouPornTagIE,
YouPornVideosIE,
)
from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (

View File

@ -1,20 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from time import sleep
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
get_element_by_class,
get_element_by_id,
int_or_none,
str_to_int,
merge_dicts,
parse_count,
parse_qs,
T,
traverse_obj,
unified_strdate,
url_or_none,
urljoin,
)
class YouPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
_VALID_URL = (
r'youporn:(?P<id>\d+)',
r'''(?x)
https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)
(?:/(?:(?P<display_id>[^/?#&]+)/?)?)?(?:[#?]|$)
'''
)
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']
_TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'md5': '3744d24c50438cf5b6f6d59feb5055c2',
@ -34,7 +52,7 @@ class YouPornIE(InfoExtractor):
'tags': list,
'age_limit': 18,
},
'skip': 'This video has been disabled',
'skip': 'This video has been deactivated',
}, {
# Unknown uploader
'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
@ -66,57 +84,104 @@ class YouPornIE(InfoExtractor):
}, {
'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
'only_matching': True,
}, {
'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/',
'info_dict': {
'id': '16290308',
'age_limit': 18,
'categories': [],
'description': None, # SEO spam using title removed
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/m=eaSaaTbWx/202011/23/16290308/original/3.jpg',
'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)',
webpage)
@classmethod
def _extract_urls(cls, webpage):
def yield_urls():
for p in cls._EMBED_REGEX:
for m in re.finditer(p, webpage):
yield m.group('url')
return list(yield_urls())
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
# A different video ID (data-video-id) is hidden in the page but
# never seems to be used
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
url = 'http://www.youporn.com/watch/%s' % (video_id,)
webpage = self._download_webpage(
url, video_id, headers={'Cookie': 'age_verified=1'})
definitions = self._download_json(
'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
display_id)
watchable = self._search_regex(
r'''(<div\s[^>]*\bid\s*=\s*('|")?watch-container(?(2)\2|(?!-)\b)[^>]*>)''',
webpage, 'watchability', default=None)
if not watchable:
msg = re.split(r'\s{4}', clean_html(get_element_by_id(
'mainContent', webpage)) or '')[0]
raise ExtractorError(
('%s says: %s' % (self.IE_NAME, msg))
if msg else 'Video unavailable: no reason found',
expected=True)
# internal ID ?
# video_id = extract_attributes(watchable).get('data-video-id')
playervars = self._search_json(
r'\bplayervars\s*:', webpage, 'playervars', video_id)
def get_fmt(x):
v_url = url_or_none(x.get('videoUrl'))
if v_url:
x['videoUrl'] = v_url
return (x['format'], x)
defs_by_format = dict(traverse_obj(playervars, (
'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt))))
def get_format_data(f):
if f not in defs_by_format:
return []
return self._download_json(
defs_by_format[f]['videoUrl'], video_id, '{0}-formats'.format(f))
formats = []
for definition in definitions:
if not isinstance(definition, dict):
continue
video_url = url_or_none(definition.get('videoUrl'))
if not video_url:
continue
f = {
'url': video_url,
'filesize': int_or_none(definition.get('videoSize')),
}
height = int_or_none(definition.get('quality'))
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
for hls_url in traverse_obj(
get_format_data('hls'),
(lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'),
(Ellipsis, 'videoUrl')):
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls',
entry_protocol='m3u8_native'))
for f in traverse_obj(get_format_data('mp4'), (
lambda _, v: v.get('videoUrl'), {
'url': ('videoUrl', T(url_or_none)),
'filesize': ('videoSize', T(int_or_none)),
'height': ('quality', T(int_or_none)),
}, T(lambda x: x.get('videoUrl') and x))):
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We will benefit from it by extracting some metadata
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', f['videoUrl'])
if mobj:
if not height:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
f.update({
'format_id': '%dp-%dk' % (height, bitrate),
'tbr': bitrate,
})
f['height'] = height
if not f.get('height'):
f['height'] = int(mobj.group('height'))
f['tbr'] = int(mobj.group('bitrate'))
f['format_id'] = '%dp-%dk' % (f['height'], f['tbr'])
formats.append(f)
self._sort_formats(formats)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title(
@ -131,8 +196,10 @@ class YouPornIE(InfoExtractor):
thumbnail = self._search_regex(
r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
duration = traverse_obj(playervars, ('duration', T(int_or_none)))
if duration is None:
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
uploader = self._html_search_regex(
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
@ -148,11 +215,11 @@ class YouPornIE(InfoExtractor):
view_count = None
views = self._search_regex(
r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
'views', default=None)
r'(<div\s[^>]*\bdata-value\s*=[^>]+>)\s*<label>Views:</label>',
webpage, 'views', default=None)
if views:
view_count = str_to_int(extract_attributes(views).get('data-value'))
comment_count = str_to_int(self._search_regex(
view_count = parse_count(extract_attributes(views).get('data-value'))
comment_count = parse_count(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', default=None))
@ -168,7 +235,10 @@ class YouPornIE(InfoExtractor):
r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>',
'tags')
return {
data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) or {}
data.pop('url', None)
result = merge_dicts(data, {
'id': video_id,
'display_id': display_id,
'title': title,
@ -183,4 +253,442 @@ class YouPornIE(InfoExtractor):
'tags': tags,
'age_limit': age_limit,
'formats': formats,
}
})
# Remove promotional non-description
if result.get('description', '').startswith(
'Watch %s online' % (result['title'],)):
del result['description']
return result
class YouPornListBase(InfoExtractor):
    """Shared page-scraping and pagination logic for YouPorn listing pages.

    Subclasses override the class constants and, where needed,
    _get_next_url()/_get_title_from_slug() to adapt to each listing type.
    """
    # pattern in '.title-text' element of page section containing videos
    _PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$'
    # how often to re-request a page that came back duplicated or empty
    _PAGE_RETRY_COUNT = 0  # ie, no retry
    _PAGE_RETRY_DELAY = 2  # seconds

    def _get_next_url(self, url, pl_id, html):
        # Resolve the href of the element with id="next" against the current
        # page URL; None when no next-page link is present.
        return urljoin(url, self._search_regex(
            r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''',
            get_element_by_id('next', html) or '', 'next page',
            group='url', default=None))

    @classmethod
    def _get_title_from_slug(cls, title_slug):
        # Default: turn the URL slug into space-separated words
        return re.sub(r'[_-]', ' ', title_slug)

    def _entries(self, url, pl_id, html=None, page_num=None):
        """Yield url_result() entries for the listing page(s).

        html: optional pre-fetched HTML of the first page.
        page_num: when set, extract only that single page.
        """
        # separates page sections
        PLAYLIST_SECTION_RE = (
            r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>'''
        )
        # contains video link
        VIDEO_URL_RE = r'''(?x)
            <div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s*
            (?:<div\b[\s\S]+?</div>\s*)*
            <a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2
        '''

        def yield_pages(url, html=html, page_num=page_num):
            # Generate (url, html, page_num) triples, following next-page
            # links unless a single explicit page was requested.
            fatal = not html
            for pnum in itertools.count(start=page_num or 1):
                if not html:
                    html = self._download_webpage(
                        url, pl_id, note='Downloading page %d' % pnum,
                        fatal=fatal)
                if not html:
                    break
                fatal = False
                yield (url, html, pnum)
                # explicit page: extract just that page
                if page_num is not None:
                    break
                next_url = self._get_next_url(url, pl_id, html)
                if not next_url or next_url == url:
                    break
                url, html = next_url, None

        def retry_page(msg, tries_left, page_data):
            # Re-fetch the page described by page_data after a short delay;
            # returns the fresh page triple, or None when out of retries
            # (or when the re-download failed).
            if tries_left <= 0:
                return
            self.report_warning(msg, pl_id)
            sleep(self._PAGE_RETRY_DELAY)
            return next(
                yield_pages(page_data[0], page_num=page_data[2]), None)

        def yield_entries(html):
            # Yield a url_result for each video link found in the page
            # sections whose title bar matches _PLAYLIST_TITLEBAR_RE.
            for frag in re.split(PLAYLIST_SECTION_RE, html):
                if not frag:
                    continue
                t_text = get_element_by_class('title-text', frag or '')
                if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)):
                    continue
                for m in re.finditer(VIDEO_URL_RE, frag):
                    video_url = urljoin(url, m.group('url'))
                    if video_url:
                        yield self.url_result(video_url)

        last_first_url = None
        for page_data in yield_pages(url, html=html, page_num=page_num):
            # page_data: url, html, page_num
            first_url = None
            tries_left = self._PAGE_RETRY_COUNT + 1
            while tries_left > 0:
                tries_left -= 1
                for from_ in yield_entries(page_data[1]):
                    # may get the same page twice instead of empty page
                    # or (site bug) instead of actual next page
                    if not first_url:
                        first_url = from_['url']
                        if first_url == last_first_url:
                            # sometimes (/porntags/) the site serves the previous page
                            # instead but may provide the correct page after a delay
                            page_data = retry_page(
                                'Retrying duplicate page...', tries_left, page_data)
                            if page_data:
                                first_url = None
                                break
                            continue
                    yield from_
                else:
                    # for-else: the page was scanned without a retry-break
                    # NB: 'no-result-paragarph1' matches the site's own
                    # (misspelled) CSS class name - do not "fix" it
                    if not first_url and 'no-result-paragarph1' in page_data[1]:
                        page_data = retry_page(
                            'Retrying empty page...', tries_left, page_data)
                        if page_data:
                            continue
                    else:
                        # success/failure
                        break
            # may get an infinite (?) sequence of empty pages
            if not first_url:
                break
            last_first_url = first_url
class YouPornCategoryIE(YouPornListBase):
    """Playlist extractor for YouPorn category pages.

    All listing and pagination logic is inherited from YouPornListBase;
    this class only supplies the URL pattern and tests.
    """
    IE_DESC = 'YouPorn category, with sorting, filtering and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        (?P<type>category)/(?P<id>[^/?#&]+)
        (?:/(?P<sort>popular|views|rating|time|duration))?/?(?:[#?]|$)
    '''
    _TESTS = [{
        'note': 'Full list with pagination',
        'url': 'https://www.youporn.com/category/lingerie/popular/',
        'info_dict': {
            'id': 'lingerie/popular',
            'title': 'Category lingerie videos by popular',
        },
        'playlist_mincount': 39,
    }, {
        'note': 'Filtered paginated list with single page result',
        'url': 'https://www.youporn.com/category/lingerie/duration/?min_minutes=10',
        'info_dict': {
            'id': 'lingerie/duration/min_minutes=10',
            'title': 'Category lingerie videos by duration (min_minutes=10)',
        },
        'playlist_maxcount': 30,
    }, {
        'note': 'Single page of full list',
        'url': 'https://www.youporn.com/category/lingerie/popular?page=1',
        'info_dict': {
            'id': 'lingerie/popular/page=1',
            'title': 'Category lingerie videos by popular (page=1)',
        },
        'playlist_count': 30,
    }]
class YouPornChannelIE(YouPornListBase):
    """Playlist extractor for YouPorn channel pages."""
    IE_DESC = 'YouPorn channel, with sorting and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        (?P<type>channel)/(?P<id>[^/?#&]+)
        (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
    '''
    _TESTS = [{
        'note': 'Full list with pagination',
        'url': 'https://www.youporn.com/channel/x-feeds/',
        'info_dict': {
            'id': 'x-feeds',
            'title': 'Channel X-Feeds videos',
        },
        'playlist_mincount': 37,
    }, {
        'note': 'Single page of full list (no filters here)',
        'url': 'https://www.youporn.com/channel/x-feeds/duration?page=1',
        'info_dict': {
            'id': 'x-feeds/duration/page=1',
            'title': 'Channel X-Feeds videos by duration (page=1)',
        },
        'playlist_count': 24,
    }]

    @staticmethod
    def _get_title_from_slug(title_slug):
        # Channel slugs use '_' as the word separator; channel names are
        # shown title-cased on the site
        return title_slug.replace('_', ' ').title()
class YouPornCollectionIE(YouPornListBase):
    """Playlist extractor for YouPorn collections (user playlists)."""
    IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        (?P<type>collection)s/videos/(?P<id>\d+)
        (?:/(?P<sort>rating|views|time|duration))?/?(?:[#?]|$)
    '''
    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+in\s'
    _TESTS = [{
        'note': 'Full list with pagination',
        'url': 'https://www.youporn.com/collections/videos/33044251/',
        'info_dict': {
            'id': '33044251',
            'title': 'Collection Sexy Lips videos',
            'uploader': 'ph-littlewillyb',
        },
        'playlist_mincount': 50,
    }, {
        'note': 'Single page of full list (no filters here)',
        'url': 'https://www.youporn.com/collections/videos/33044251/time?page=1',
        'info_dict': {
            'id': '33044251/time/page=1',
            'title': 'Collection Sexy Lips videos by time (page=1)',
            'uploader': 'ph-littlewillyb',
        },
        'playlist_count': 20,
    }]

    def _real_extract(self, url):
        # Fetch the page once, reuse it for both the base-class listing
        # extraction and the collection-specific title/uploader scrape
        pl_id = self._match_id(url)
        html = self._download_webpage(url, pl_id)
        playlist = super(YouPornCollectionIE, self)._real_extract(url, html=html)
        # Collapse whitespace in the '.collection-infos' element so the
        # single-line regex below can match across the original markup
        infos = re.sub(r'\s+', ' ', clean_html(get_element_by_class(
            'collection-infos', html)) or '')
        title, uploader = self._search_regex(
            r'^\s*Collection: (?P<title>.+?) \d+ VIDEOS \d+ VIEWS \d+ days LAST UPDATED From: (?P<uploader>[\w_-]+)',
            infos, 'title/uploader', group=('title', 'uploader'), default=(None, None))
        # When the collection name was found, substitute it for the slug in
        # the generated playlist title and add the uploader
        return merge_dicts({
            'title': playlist['title'].replace(playlist['id'].split('/')[0], title),
            'uploader': uploader,
        }, playlist) if title else playlist
class YouPornTagIE(YouPornListBase):
    IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        porn(?P<type>tag)s/(?P<id>[^/?#&]+)
        (?:/(?P<sort>views|rating|time|duration))?/?(?:[#?]|$)
    '''
    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+tagged\s'
    _PAGE_RETRY_COUNT = 1
    _TESTS = [{
        'note': 'Full list with pagination',
        'url': 'https://www.youporn.com/porntags/austrian',
        'info_dict': {
            'id': 'austrian',
            'title': 'Tag austrian videos',
        },
        'playlist_mincount': 35,
        'expected_warnings': ['Retrying duplicate page'],
    }, {
        'note': 'Filtered paginated list with single page result',
        'url': 'https://www.youporn.com/porntags/austrian/duration/?min_minutes=10',
        'info_dict': {
            'id': 'austrian/duration/min_minutes=10',
            'title': 'Tag austrian videos by duration (min_minutes=10)',
        },
        # number of videos per page is (row x col) 2x3 + 6x4 + 2, or + 3,
        # or more, varying with number of ads; let's set max as 9x4
        # NB col 1 may not be shown in non-JS page with site CSS and zoom 100%
        'playlist_maxcount': 32,
        'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
    }, {
        'note': 'Single page of full list',
        'url': 'https://www.youporn.com/porntags/austrian/?page=1',
        'info_dict': {
            'id': 'austrian/page=1',
            'title': 'Tag austrian videos (page=1)',
        },
        'playlist_mincount': 32,
        'playlist_maxcount': 34,
        'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
    }]

    # YP tag navigation is broken, loses sort
    def _get_next_url(self, url, pl_id, html):
        """Return the next-page URL, restoring the sort path segment that
        the site's tag pagination links drop.

        The original logic was inverted: it fired when the *current* URL had
        no sort (`if s and not u`) and then concatenated the unset value
        (`'/' + u` with u == None -> TypeError); it also used `n.end('sort')`,
        which is -1 when the group did not participate in the match.
        """
        next_url = super(YouPornTagIE, self)._get_next_url(url, pl_id, html)
        if not next_url:
            return next_url
        nxt = self._match_valid_url(next_url)
        if nxt and not nxt.groupdict().get('sort'):
            # the paginated link lost the sort: copy it from the current URL
            cur = self._match_valid_url(url)
            sort = cur.groupdict().get('sort') if cur else None
            if sort:
                # re-insert the sort segment just after the tag id, where
                # _VALID_URL expects it
                pos = nxt.end('id')
                next_url = next_url[:pos] + '/' + sort + next_url[pos:]
        return next_url
class YouPornStarIE(YouPornListBase):
    IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        (?P<type>pornstar)/(?P<id>[^/?#&]+)
        (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
    '''
    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+[fF]eaturing\s'
    _TESTS = [{
        'note': 'Full list with pagination',
        'url': 'https://www.youporn.com/pornstar/daynia/',
        'info_dict': {
            'id': 'daynia',
            'title': 'Pornstar Daynia videos',
            'description': r're:Daynia Rank \d+ Videos \d+ Views [\d,.]+ .+ Subscribers \d+',
        },
        'playlist_mincount': 45,
    }, {
        'note': 'Single page of full list (no filters here)',
        'url': 'https://www.youporn.com/pornstar/daynia/?page=1',
        'info_dict': {
            'id': 'daynia/page=1',
            'title': 'Pornstar Daynia videos (page=1)',
            'description': 're:.{180,}',
        },
        'playlist_count': 26,
    }]

    @staticmethod
    def _get_title_from_slug(title_slug):
        # turn an underscore-separated slug into a title-cased name
        return re.sub(r'_', ' ', title_slug).title()

    def _real_extract(self, url):
        # Extract the playlist via the base class, then augment it with a
        # description scraped from the pornstar info panel on the same page.
        pl_id = self._match_id(url)
        html = self._download_webpage(url, pl_id)
        playlist = super(YouPornStarIE, self)._real_extract(url, html=html)
        # Match the <div class="... pornstar-info-wrapper ...">...</div>
        # element; the closing is approximated by a run of 6+ </div> tags
        INFO_ELEMENT_RE = r'''(?x)
            <div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?pornstar-info-wrapper(?:\s+[\w$-]+|\s)*\1[^>]*>
            (?P<info>[\s\S]+?)(?:</div>\s*){6,}
        '''
        infos = self._search_regex(INFO_ELEMENT_RE, html, 'infos', group='info', default='')
        if infos:
            # 1. replace newlines with the marker 'nl=nl' so they survive
            #    clean_html()'s whitespace handling;
            # 2. collapse remaining whitespace runs to single spaces;
            # 3. collapse marker runs back to a single space;
            # 4. drop 'ribe Subsc' — presumably de-duplicates an adjacent
            #    "Subscribe Subscribers" artifact in the panel text
            #    (NOTE(review): confirm against current site markup)
            infos = re.sub(
                r'(?:\s*nl=nl)+\s*', ' ',
                re.sub(r'(?u)\s+', ' ', clean_html(
                    re.sub('\n', 'nl=nl', infos)))).replace('ribe Subsc', '')
        return merge_dicts({
            'description': infos.strip() or None,
        }, playlist)
class YouPornVideosIE(YouPornListBase):
    IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination'
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?youporn\.com/
        (?:(?P<id>browse)/)?
        (?P<sort>(?(id)
            (?:duration|rating|time|views)|
            (?:most_(?:favou?rit|view)ed|recommended|top_rated)?))
        (?:[/#?]|$)
    '''
    _PLAYLIST_TITLEBAR_RE = r'\s+(?:[Vv]ideos|VIDEOS)\s*$'
    _TESTS = [{
        'note': 'Full list with pagination (too long for test)',
        'url': 'https://www.youporn.com/',
        'info_dict': {
            'id': 'youporn',
            'title': 'YouPorn videos',
        },
        'only_matching': True,
    }, {
        'note': 'Full list with pagination (too long for test)',
        'url': 'https://www.youporn.com/recommended',
        'info_dict': {
            'id': 'youporn/recommended',
            'title': 'YouPorn videos by recommended',
        },
        'only_matching': True,
    }, {
        'note': 'Full list with pagination (too long for test)',
        'url': 'https://www.youporn.com/top_rated',
        'info_dict': {
            'id': 'youporn/top_rated',
            'title': 'YouPorn videos by top rated',
        },
        'only_matching': True,
    }, {
        'note': 'Full list with pagination (too long for test)',
        'url': 'https://www.youporn.com/browse/time',
        'info_dict': {
            'id': 'browse/time',
            'title': 'YouPorn videos by time',
        },
        'only_matching': True,
    }, {
        'note': 'Filtered paginated list with single page result',
        'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=2',
        'info_dict': {
            'id': 'youporn/most_favorited/max_minutes=2/res=VR',
            'title': 'YouPorn videos by most favorited (max_minutes=2,res=VR)',
        },
        'playlist_mincount': 10,
        'playlist_maxcount': 28,
    }, {
        'note': 'Filtered paginated list with several pages',
        'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=5',
        'info_dict': {
            'id': 'youporn/most_favorited/max_minutes=5/res=VR',
            'title': 'YouPorn videos by most favorited (max_minutes=5,res=VR)',
        },
        'playlist_mincount': 45,
    }, {
        'note': 'Single page of full list',
        'url': 'https://www.youporn.com/browse/time?page=1',
        'info_dict': {
            'id': 'browse/time/page=1',
            'title': 'YouPorn videos by time (page=1)',
        },
        'playlist_count': 36,
    }]

    @staticmethod
    def _get_title_from_slug(title_slug):
        # The generic "browse" landing page gets the site name as its title;
        # any other slug is already a usable title and is passed through
        if title_slug == 'browse':
            return 'YouPorn'
        return title_slug