Compare commits

...

8 Commits

Author SHA1 Message Date
dirkf c287eb831a
Merge 14a087ec62 into 668332b973 2024-04-26 00:49:39 +09:00
dirkf 668332b973 [YouPorn] Add playlist extractors
* YouPornCategoryIE
* YouPornChannelIE
* YouPornCollectionIE
* YouPornStarIE
* YouPornTagIE
* YouPornVideosIE,
2024-04-22 01:34:26 +01:00
dirkf 0b2ce3685e [YouPorn] Improve extraction
* detect unwatchable videos
* improve duration extraction
* fix count extraction and support large values
* detect and remove SEO spam boilerplate description
2024-04-22 01:34:26 +01:00
dirkf c2766cb80e [test/test_download] Support 'playlist_maxcount:count' expected value
* parallel to `playlist_mincount'
* specify both for a range of playlist lengths
* if max < min the test will always fail!
2024-04-22 01:34:26 +01:00
dirkf eb38665438 [YouPorn] Incorporate yt-dlp PR 8827
* from https://github.com/yt-dlp/yt-dlp/pull/8827
* extract from webpage instead of broken API URL
* thx The-MAGI
2024-04-22 01:34:26 +01:00
dirkf 14a087ec62 [SpankBang] Support category, tag, search, channel, star and profile pages 2022-06-07 19:53:31 +01:00
dirkf 836463013c [SpankBang] Rework SpankBangPlaylistIE with pagination 2022-06-06 14:47:40 +01:00
dirkf 30a954bad9 [SpankBang] Back-port changes to SpankBangIE from yt-dlp
* improve title extraction
* add uploader_id
* update test
* but don't check file md5
2022-06-06 14:18:46 +01:00
4 changed files with 754 additions and 79 deletions

View File

@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
assertGreaterEqual,
assertLessEqual,
expect_warnings,
get_params,
gettestcases,
@ -122,7 +123,10 @@ def generator(test_case, tname):
params['outtmpl'] = tname + '_' + params['outtmpl']
if is_playlist and 'playlist' not in test_case:
params.setdefault('extract_flat', 'in_playlist')
params.setdefault('playlistend', test_case.get('playlist_mincount'))
params.setdefault('playlistend',
test_case['playlist_maxcount'] + 1
if test_case.get('playlist_maxcount')
else test_case.get('playlist_mincount'))
params.setdefault('skip_download', True)
ydl = YoutubeDL(params, auto_init=False)
@ -190,6 +194,14 @@ def generator(test_case, tname):
'Expected at least %d in playlist %s, but got only %d' % (
test_case['playlist_mincount'], test_case['url'],
len(res_dict['entries'])))
if 'playlist_maxcount' in test_case:
assertLessEqual(
self,
len(res_dict['entries']),
test_case['playlist_maxcount'],
'Expected at most %d in playlist %s, but got %d' % (
test_case['playlist_maxcount'], test_case['url'],
len(res_dict['entries'])))
if 'playlist_count' in test_case:
self.assertEqual(
len(res_dict['entries']),

View File

@ -1187,6 +1187,7 @@ from .southpark import (
from .spankbang import (
SpankBangIE,
SpankBangPlaylistIE,
SpankBangListIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE
@ -1653,7 +1654,15 @@ from .younow import (
YouNowChannelIE,
YouNowMomentIE,
)
from .youporn import YouPornIE
from .youporn import (
YouPornIE,
YouPornCategoryIE,
YouPornChannelIE,
YouPornCollectionIE,
YouPornStarIE,
YouPornTagIE,
YouPornVideosIE,
)
from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (

View File

@ -1,11 +1,17 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
get_element_by_class,
get_element_by_id,
merge_dicts,
parse_duration,
parse_resolution,
@ -26,19 +32,24 @@ class SpankBangIE(InfoExtractor):
)
'''
_TESTS = [{
'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
'md5': '1cc433e1d6aa14bc376535b8679302f7',
'url': 'https://spankbang.com/56b3d/video/the+slut+maker+hmv',
'md5': '5039ba9d26f6124a7fdea6df2f21e765',
'info_dict': {
'id': '3vvn',
'id': '56b3d',
'ext': 'mp4',
'title': 'fantasy solo',
'description': 'dillion harper masturbates on a bed',
'title': 'The Slut Maker HMV',
'description': 'Girls getting converted into cock slaves.',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'silly2587',
'timestamp': 1422571989,
'upload_date': '20150129',
'uploader': 'Mindself',
'uploader_id': 'mindself',
'timestamp': 1617109572,
'upload_date': '20210330',
'age_limit': 18,
}
},
'params': {
# adaptive download
'skip_download': True,
},
}, {
# 480p only
'url': 'http://spankbang.com/1vt0/video/solvane+gangbang',
@ -134,15 +145,15 @@ class SpankBangIE(InfoExtractor):
info = self._search_json_ld(webpage, video_id, default={})
title = self._html_search_regex(
r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None)
r'(?s)<h1[^>]+\btitle=["\']([^"]+)["\']>', webpage, 'title', default=None)
description = self._search_regex(
r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)',
webpage, 'description', default=None)
thumbnail = self._og_search_thumbnail(webpage, default=None)
uploader = self._html_search_regex(
(r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>',
r'class="user"[^>]*><img[^>]+>([^<]+)'),
webpage, 'uploader', default=None)
r'<svg[^>]+\bclass="(?:[^"]*?user[^"]*?)">.*?</svg>([^<]+)', webpage, 'uploader', default=None)
uploader_id = self._html_search_regex(
r'<a[^>]+href="/profile/([^"]+)"', webpage, 'uploader_id', default=None)
duration = parse_duration(self._search_regex(
r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)',
webpage, 'duration', default=None))
@ -157,6 +168,7 @@ class SpankBangIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
'formats': formats,
@ -167,32 +179,166 @@ class SpankBangIE(InfoExtractor):
class SpankBangPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)'
_TEST = {
_TESTS = [{
'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
'info_dict': {
'id': 'ug0k',
'title': 'Big Ass Titties',
'description': 'md5:65b01bb13a9276cf172a67a41304bafd',
},
'playlist_mincount': 40,
}
'playlist_mincount': 35,
}, {
# pagination required
'url': 'https://spankbang.com/51wxk/playlist/dance',
'info_dict': {
'id': '51wxk',
'title': 'Dance',
'description': 'md5:7aae6991c65d561a9319ecab31f857e2',
},
'playlist_mincount': 60,
}]
def _entries(self, url, playlist_id, webpage=None):
for ii in itertools.count(1):
if not webpage:
webpage = self._download_webpage(
url, playlist_id,
note='Downloading playlist page %d' % (ii, ),
fatal=False)
if not webpage:
break
# search <main id="container">...</main>.innerHTML
for mobj in re.finditer(
r'''<a\b[^>]*?\bclass\s*=\s*('|")(?:(?:(?!\1).)+?\s)?\s*thumb\b[^>]*>''',
get_element_by_id('container', webpage) or webpage):
item_url = extract_attributes(mobj.group(0)).get('href')
if item_url:
yield urljoin(url, item_url)
next_url = self._search_regex(
r'''\bhref\s*=\s*(["'])(?P<path>(?!\1).+?)/?\1''',
get_element_by_class('next', webpage) or '',
'continuation page', group='path', default=None)
if next_url is None or next_url in url:
break
url, webpage = urljoin(url, next_url), None
p_url = compat_urlparse.urlparse(url)
url = compat_urlparse.urlunparse(p_url._replace(path=p_url.path + '/'))
def _get_title(self, list_id, webpage, url):
return self._html_search_regex(
r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
fatal=False) or re.sub(r'(\w)\+(\w)', r'\1 \2', list_id).title()
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
display_id = mobj.group('display_id')
mobj = re.match(self._VALID_URL, url).groupdict()
playlist_id = mobj['id']
display_id = mobj.get('display_id') or playlist_id
related = mobj.get('related')
webpage = self._download_webpage(
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
webpage = self._download_webpage(url, playlist_id)
entries = [self.url_result(
urljoin(url, mobj.group('path')),
ie=SpankBangIE.ie_key(), video_id=mobj.group('id'))
for mobj in re.finditer(
r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1'
% re.escape(display_id), webpage)]
def get_title():
t = self._get_title(display_id, webpage, url)
if related:
t = '%s of %s' % (related.title(), t, )
return t
title = self._html_search_regex(
r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
fatal=False)
result = self.playlist_from_matches(self._entries(url, playlist_id, webpage), playlist_id, get_title(), ie=SpankBangIE.ie_key())
description = self._html_search_meta(('description', 'og:description'), webpage)
if description:
result['description'] = description
return result
return self.playlist_result(entries, playlist_id, title)
class SpankBangListIE(SpankBangPlaylistIE):
_VALID_URL = r'''(?x)
https?://(?:[^/]+\.)?spankbang\.com/
(?:
category|tag|s|
[^/]+/channel|
[^/]+/pornstar|
(?P<profile>profile)
)/(?P<id>[^/?#]+)(?(profile)/(?P<related>videos|likes|playlists)?|)'''
_TESTS = [{
'url': 'https://spankbang.com/category/striptease/?p=d',
'info_dict': {
'id': 'striptease',
'title': 'Category: Striptease Porn (Jun 2022) - Joi & Strip (today)',
'description': 'md5:d4580b5fc1e8fae9f627178964989959',
},
'playlist_mincount': 1,
}, {
'url': 'https://spankbang.com/tag/monty/',
'info_dict': {
'id': 'monty',
'title': 'Tag: Monty Porn - Jenni Lee & British',
'description': 'md5:93535d53455f10b934034bfb9562de35',
},
'playlist_mincount': 40,
}, {
'url': 'https://spankbang.com/s/silly/?q=fhd&d=10',
'info_dict': {
'id': 'silly',
'title': 'Search: Silly HD Porn - Chubby Teens & Cum Swallow (FHD, 10+ mins)',
'description': 'md5:a241b24847f1efd7cc3cfc4ef2b22429',
},
'playlist_mincount': 20,
}, {
'url': 'https://spankbang.com/er/channel/purexxxfilms/?o=new',
'info_dict': {
'id': 'purexxxfilms',
'title': 'PureXXXFilms Channel (new)',
'description': 'md5:bb61fbf523511f3bd15aea0360bdbdc0',
},
'playlist_mincount': 65,
}, {
'url': 'https://spankbang.com/5pj/pornstar/promise/',
'info_dict': {
'id': 'promise',
'title': 'Promise Pornstar Page',
'description': 'md5:ff08a6ac2d6cd1225f0fae74ac896d63',
},
'playlist_mincount': 90,
}, {
'url': 'https://spankbang.com/profile/migouche/likes',
'info_dict': {
'id': 'migouche',
'title': 'Likes of migouche Profile',
'description': 'md5:b97540209879a62cadf2af6b0934bbc9',
},
'playlist_mincount': 2,
},
]
def _get_title(self, list_id, webpage, url):
title = (
self._og_search_title(webpage, default=None)
or self._html_search_regex(
r'<title\b[^>]*>([^<]+)</title', webpage, 'channel title',
fatal=False))
if title:
title = re.sub(r'\s*(?:&\s+(?:Porn\s+))?(?:Videos\s+)?[@-]\s*SpankBang\s*$', '', title)
title = title or re.sub(r'(\w)\+(\w)', r'\1 \2', list_id).title()
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
quals = []
for q in ('o', 'q', 'd', 'p'):
v = qs.get(q, [None])[-1]
if not v:
continue
if q == 'q':
v = v.upper()
elif q == 'd':
v += '+ mins'
elif q == 'p':
v = {'d': 'today', 'w': 'this week', 'm': 'this month', 'y': 'this year'}.get(v, v)
quals.append(v)
quals = ', '.join(quals)
if quals:
title += ' (%s)' % (quals, )
m = re.search(r'/(category|tag|s)/', url)
if m:
pfx = m.group(1)
if pfx == 's':
pfx = 'search'
title = '%s: %s' % (pfx.title(), title)
return title

View File

@ -1,20 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from time import sleep
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
get_element_by_class,
get_element_by_id,
int_or_none,
str_to_int,
merge_dicts,
parse_count,
parse_qs,
T,
traverse_obj,
unified_strdate,
url_or_none,
urljoin,
)
class YouPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
_VALID_URL = (
r'youporn:(?P<id>\d+)',
r'''(?x)
https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)
(?:/(?:(?P<display_id>[^/?#&]+)/?)?)?(?:[#?]|$)
'''
)
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']
_TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'md5': '3744d24c50438cf5b6f6d59feb5055c2',
@ -34,7 +52,7 @@ class YouPornIE(InfoExtractor):
'tags': list,
'age_limit': 18,
},
'skip': 'This video has been disabled',
'skip': 'This video has been deactivated',
}, {
# Unknown uploader
'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
@ -66,57 +84,104 @@ class YouPornIE(InfoExtractor):
}, {
'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
'only_matching': True,
}, {
'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/',
'info_dict': {
'id': '16290308',
'age_limit': 18,
'categories': [],
'description': None, # SEO spam using title removed
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/m=eaSaaTbWx/202011/23/16290308/original/3.jpg',
'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)',
webpage)
@classmethod
def _extract_urls(cls, webpage):
def yield_urls():
for p in cls._EMBED_REGEX:
for m in re.finditer(p, webpage):
yield m.group('url')
return list(yield_urls())
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
# A different video ID (data-video-id) is hidden in the page but
# never seems to be used
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
url = 'http://www.youporn.com/watch/%s' % (video_id,)
webpage = self._download_webpage(
url, video_id, headers={'Cookie': 'age_verified=1'})
definitions = self._download_json(
'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
display_id)
watchable = self._search_regex(
r'''(<div\s[^>]*\bid\s*=\s*('|")?watch-container(?(2)\2|(?!-)\b)[^>]*>)''',
webpage, 'watchability', default=None)
if not watchable:
msg = re.split(r'\s{4}', clean_html(get_element_by_id(
'mainContent', webpage)) or '')[0]
raise ExtractorError(
('%s says: %s' % (self.IE_NAME, msg))
if msg else 'Video unavailable: no reason found',
expected=True)
# internal ID ?
# video_id = extract_attributes(watchable).get('data-video-id')
playervars = self._search_json(
r'\bplayervars\s*:', webpage, 'playervars', video_id)
def get_fmt(x):
v_url = url_or_none(x.get('videoUrl'))
if v_url:
x['videoUrl'] = v_url
return (x['format'], x)
defs_by_format = dict(traverse_obj(playervars, (
'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt))))
def get_format_data(f):
if f not in defs_by_format:
return []
return self._download_json(
defs_by_format[f]['videoUrl'], video_id, '{0}-formats'.format(f))
formats = []
for definition in definitions:
if not isinstance(definition, dict):
continue
video_url = url_or_none(definition.get('videoUrl'))
if not video_url:
continue
f = {
'url': video_url,
'filesize': int_or_none(definition.get('videoSize')),
}
height = int_or_none(definition.get('quality'))
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
for hls_url in traverse_obj(
get_format_data('hls'),
(lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'),
(Ellipsis, 'videoUrl')):
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls',
entry_protocol='m3u8_native'))
for f in traverse_obj(get_format_data('mp4'), (
lambda _, v: v.get('videoUrl'), {
'url': ('videoUrl', T(url_or_none)),
'filesize': ('videoSize', T(int_or_none)),
'height': ('quality', T(int_or_none)),
}, T(lambda x: x.get('videoUrl') and x))):
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We will benefit from it by extracting some metadata
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', f['videoUrl'])
if mobj:
if not height:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
f.update({
'format_id': '%dp-%dk' % (height, bitrate),
'tbr': bitrate,
})
f['height'] = height
if not f.get('height'):
f['height'] = int(mobj.group('height'))
f['tbr'] = int(mobj.group('bitrate'))
f['format_id'] = '%dp-%dk' % (f['height'], f['tbr'])
formats.append(f)
self._sort_formats(formats)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title(
@ -131,8 +196,10 @@ class YouPornIE(InfoExtractor):
thumbnail = self._search_regex(
r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
duration = traverse_obj(playervars, ('duration', T(int_or_none)))
if duration is None:
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
uploader = self._html_search_regex(
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
@ -148,11 +215,11 @@ class YouPornIE(InfoExtractor):
view_count = None
views = self._search_regex(
r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
'views', default=None)
r'(<div\s[^>]*\bdata-value\s*=[^>]+>)\s*<label>Views:</label>',
webpage, 'views', default=None)
if views:
view_count = str_to_int(extract_attributes(views).get('data-value'))
comment_count = str_to_int(self._search_regex(
view_count = parse_count(extract_attributes(views).get('data-value'))
comment_count = parse_count(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', default=None))
@ -168,7 +235,10 @@ class YouPornIE(InfoExtractor):
r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>',
'tags')
return {
data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) or {}
data.pop('url', None)
result = merge_dicts(data, {
'id': video_id,
'display_id': display_id,
'title': title,
@ -183,4 +253,442 @@ class YouPornIE(InfoExtractor):
'tags': tags,
'age_limit': age_limit,
'formats': formats,
}
})
# Remove promotional non-description
if result.get('description', '').startswith(
'Watch %s online' % (result['title'],)):
del result['description']
return result
class YouPornListBase(InfoExtractor):
# pattern in '.title-text' element of page section containing videos
_PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$'
_PAGE_RETRY_COUNT = 0 # ie, no retry
_PAGE_RETRY_DELAY = 2 # seconds
def _get_next_url(self, url, pl_id, html):
return urljoin(url, self._search_regex(
r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''',
get_element_by_id('next', html) or '', 'next page',
group='url', default=None))
@classmethod
def _get_title_from_slug(cls, title_slug):
return re.sub(r'[_-]', ' ', title_slug)
def _entries(self, url, pl_id, html=None, page_num=None):
# separates page sections
PLAYLIST_SECTION_RE = (
r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>'''
)
# contains video link
VIDEO_URL_RE = r'''(?x)
<div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s*
(?:<div\b[\s\S]+?</div>\s*)*
<a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2
'''
def yield_pages(url, html=html, page_num=page_num):
fatal = not html
for pnum in itertools.count(start=page_num or 1):
if not html:
html = self._download_webpage(
url, pl_id, note='Downloading page %d' % pnum,
fatal=fatal)
if not html:
break
fatal = False
yield (url, html, pnum)
# explicit page: extract just that page
if page_num is not None:
break
next_url = self._get_next_url(url, pl_id, html)
if not next_url or next_url == url:
break
url, html = next_url, None
def retry_page(msg, tries_left, page_data):
if tries_left <= 0:
return
self.report_warning(msg, pl_id)
sleep(self._PAGE_RETRY_DELAY)
return next(
yield_pages(page_data[0], page_num=page_data[2]), None)
def yield_entries(html):
for frag in re.split(PLAYLIST_SECTION_RE, html):
if not frag:
continue
t_text = get_element_by_class('title-text', frag or '')
if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)):
continue
for m in re.finditer(VIDEO_URL_RE, frag):
video_url = urljoin(url, m.group('url'))
if video_url:
yield self.url_result(video_url)
last_first_url = None
for page_data in yield_pages(url, html=html, page_num=page_num):
# page_data: url, html, page_num
first_url = None
tries_left = self._PAGE_RETRY_COUNT + 1
while tries_left > 0:
tries_left -= 1
for from_ in yield_entries(page_data[1]):
# may get the same page twice instead of empty page
# or (site bug) intead of actual next page
if not first_url:
first_url = from_['url']
if first_url == last_first_url:
# sometimes (/porntags/) the site serves the previous page
# instead but may provide the correct page after a delay
page_data = retry_page(
'Retrying duplicate page...', tries_left, page_data)
if page_data:
first_url = None
break
continue
yield from_
else:
if not first_url and 'no-result-paragarph1' in page_data[1]:
page_data = retry_page(
'Retrying empty page...', tries_left, page_data)
if page_data:
continue
else:
# success/failure
break
# may get an infinite (?) sequence of empty pages
if not first_url:
break
last_first_url = first_url
def _real_extract(self, url, html=None):
# exceptionally, id may be None
m_dict = self._match_valid_url(url).groupdict()
pl_id, page_type, sort = (m_dict.get(k) for k in ('id', 'type', 'sort'))
qs = parse_qs(url)
for q, v in qs.items():
if v:
qs[q] = v[-1]
else:
del qs[q]
base_id = pl_id or 'YouPorn'
title = self._get_title_from_slug(base_id)
if page_type:
title = '%s %s' % (page_type.capitalize(), title)
base_id = [base_id.lower()]
if sort is None:
title += ' videos'
else:
title = '%s videos by %s' % (title, re.sub(r'[_-]', ' ', sort))
base_id.append(sort)
if qs:
ps = ['%s=%s' % item for item in sorted(qs.items())]
title += ' (%s)' % ','.join(ps)
base_id.extend(ps)
pl_id = '/'.join(base_id)
return self.playlist_result(
self._entries(url, pl_id, html=html,
page_num=int_or_none(qs.get('page'))),
playlist_id=pl_id, playlist_title=title)
class YouPornCategoryIE(YouPornListBase):
IE_DESC = 'YouPorn category, with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>category)/(?P<id>[^/?#&]+)
(?:/(?P<sort>popular|views|rating|time|duration))?/?(?:[#?]|$)
'''
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/category/lingerie/popular/',
'info_dict': {
'id': 'lingerie/popular',
'title': 'Category lingerie videos by popular',
},
'playlist_mincount': 39,
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/category/lingerie/duration/?min_minutes=10',
'info_dict': {
'id': 'lingerie/duration/min_minutes=10',
'title': 'Category lingerie videos by duration (min_minutes=10)',
},
'playlist_maxcount': 30,
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/category/lingerie/popular?page=1',
'info_dict': {
'id': 'lingerie/popular/page=1',
'title': 'Category lingerie videos by popular (page=1)',
},
'playlist_count': 30,
}]
class YouPornChannelIE(YouPornListBase):
IE_DESC = 'YouPorn channel, with sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>channel)/(?P<id>[^/?#&]+)
(?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
'''
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/channel/x-feeds/',
'info_dict': {
'id': 'x-feeds',
'title': 'Channel X-Feeds videos',
},
'playlist_mincount': 37,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/channel/x-feeds/duration?page=1',
'info_dict': {
'id': 'x-feeds/duration/page=1',
'title': 'Channel X-Feeds videos by duration (page=1)',
},
'playlist_count': 24,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return re.sub(r'_', ' ', title_slug).title()
class YouPornCollectionIE(YouPornListBase):
IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>collection)s/videos/(?P<id>\d+)
(?:/(?P<sort>rating|views|time|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+in\s'
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/collections/videos/33044251/',
'info_dict': {
'id': '33044251',
'title': 'Collection Sexy Lips videos',
'uploader': 'ph-littlewillyb',
},
'playlist_mincount': 50,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/collections/videos/33044251/time?page=1',
'info_dict': {
'id': '33044251/time/page=1',
'title': 'Collection Sexy Lips videos by time (page=1)',
'uploader': 'ph-littlewillyb',
},
'playlist_count': 20,
}]
def _real_extract(self, url):
pl_id = self._match_id(url)
html = self._download_webpage(url, pl_id)
playlist = super(YouPornCollectionIE, self)._real_extract(url, html=html)
infos = re.sub(r'\s+', ' ', clean_html(get_element_by_class(
'collection-infos', html)) or '')
title, uploader = self._search_regex(
r'^\s*Collection: (?P<title>.+?) \d+ VIDEOS \d+ VIEWS \d+ days LAST UPDATED From: (?P<uploader>[\w_-]+)',
infos, 'title/uploader', group=('title', 'uploader'), default=(None, None))
return merge_dicts({
'title': playlist['title'].replace(playlist['id'].split('/')[0], title),
'uploader': uploader,
}, playlist) if title else playlist
class YouPornTagIE(YouPornListBase):
IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
porn(?P<type>tag)s/(?P<id>[^/?#&]+)
(?:/(?P<sort>views|rating|time|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+tagged\s'
_PAGE_RETRY_COUNT = 1
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/porntags/austrian',
'info_dict': {
'id': 'austrian',
'title': 'Tag austrian videos',
},
'playlist_mincount': 35,
'expected_warnings': ['Retrying duplicate page'],
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/porntags/austrian/duration/?min_minutes=10',
'info_dict': {
'id': 'austrian/duration/min_minutes=10',
'title': 'Tag austrian videos by duration (min_minutes=10)',
},
# number of videos per page is (row x col) 2x3 + 6x4 + 2, or + 3,
# or more, varying with number of ads; let's set max as 9x4
# NB col 1 may not be shown in non-JS page with site CSS and zoom 100%
'playlist_maxcount': 32,
'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/porntags/austrian/?page=1',
'info_dict': {
'id': 'austrian/page=1',
'title': 'Tag austrian videos (page=1)',
},
'playlist_mincount': 32,
'playlist_maxcount': 34,
'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
}]
# YP tag navigation is broken, loses sort
def _get_next_url(self, url, pl_id, html):
next_url = super(YouPornTagIE, self)._get_next_url(url, pl_id, html)
if next_url:
n = self._match_valid_url(next_url)
if n:
s = n.groupdict().get('sort')
if s:
u = self._match_valid_url(url)
if u:
u = u.groupdict().get('sort')
if s and not u:
n = n.end('sort')
next_url = next_url[:n] + '/' + u + next_url[n:]
return next_url
class YouPornStarIE(YouPornListBase):
IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>pornstar)/(?P<id>[^/?#&]+)
(?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+[fF]eaturing\s'
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/pornstar/daynia/',
'info_dict': {
'id': 'daynia',
'title': 'Pornstar Daynia videos',
'description': r're:Daynia Rank \d+ Videos \d+ Views [\d,.]+ .+ Subscribers \d+',
},
'playlist_mincount': 45,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/pornstar/daynia/?page=1',
'info_dict': {
'id': 'daynia/page=1',
'title': 'Pornstar Daynia videos (page=1)',
'description': 're:.{180,}',
},
'playlist_count': 26,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return re.sub(r'_', ' ', title_slug).title()
def _real_extract(self, url):
pl_id = self._match_id(url)
html = self._download_webpage(url, pl_id)
playlist = super(YouPornStarIE, self)._real_extract(url, html=html)
INFO_ELEMENT_RE = r'''(?x)
<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?pornstar-info-wrapper(?:\s+[\w$-]+|\s)*\1[^>]*>
(?P<info>[\s\S]+?)(?:</div>\s*){6,}
'''
infos = self._search_regex(INFO_ELEMENT_RE, html, 'infos', group='info', default='')
if infos:
infos = re.sub(
r'(?:\s*nl=nl)+\s*', ' ',
re.sub(r'(?u)\s+', ' ', clean_html(
re.sub('\n', 'nl=nl', infos)))).replace('ribe Subsc', '')
return merge_dicts({
'description': infos.strip() or None,
}, playlist)
class YouPornVideosIE(YouPornListBase):
IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?:(?P<id>browse)/)?
(?P<sort>(?(id)
(?:duration|rating|time|views)|
(?:most_(?:favou?rit|view)ed|recommended|top_rated)?))
(?:[/#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'\s+(?:[Vv]ideos|VIDEOS)\s*$'
_TESTS = [{
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/',
'info_dict': {
'id': 'youporn',
'title': 'YouPorn videos',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/recommended',
'info_dict': {
'id': 'youporn/recommended',
'title': 'YouPorn videos by recommended',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/top_rated',
'info_dict': {
'id': 'youporn/top_rated',
'title': 'YouPorn videos by top rated',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/browse/time',
'info_dict': {
'id': 'browse/time',
'title': 'YouPorn videos by time',
},
'only_matching': True,
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=2',
'info_dict': {
'id': 'youporn/most_favorited/max_minutes=2/res=VR',
'title': 'YouPorn videos by most favorited (max_minutes=2,res=VR)',
},
'playlist_mincount': 10,
'playlist_maxcount': 28,
}, {
'note': 'Filtered paginated list with several pages',
'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=5',
'info_dict': {
'id': 'youporn/most_favorited/max_minutes=5/res=VR',
'title': 'YouPorn videos by most favorited (max_minutes=5,res=VR)',
},
'playlist_mincount': 45,
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/browse/time?page=1',
'info_dict': {
'id': 'browse/time/page=1',
'title': 'YouPorn videos by time (page=1)',
},
'playlist_count': 36,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return 'YouPorn' if title_slug == 'browse' else title_slug