From b04d78068d43a00cb9e5bb4f9ff07698d7d86b39 Mon Sep 17 00:00:00 2001
From: dirkf <fieldhouse@gmx.net>
Date: Mon, 16 Oct 2023 03:34:50 +0100
Subject: [PATCH] [XVideos] Update XVideosIE including features from PR #30689
 * add uploader, tag, performer and view_count extraction (closes #30689) *
 add dis/like_count extraction

---
 youtube_dl/extractor/xvideos.py | 139 +++++++++++++++++++++++++++-----
 1 file changed, 117 insertions(+), 22 deletions(-)

diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index b736e18b5..536e9ae47 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -4,22 +4,39 @@ from __future__ import unicode_literals
 import re
 import itertools
 
-from .common import InfoExtractor
+from math import isinf
+
+from .common import (
+    InfoExtractor,
+    SearchInfoExtractor,
+)
 from ..compat import (
-    compat_parse_qs,
+    compat_kwargs,
     compat_str,
     compat_urlparse,
     compat_urllib_parse_unquote,
-    compat_urllib_parse_urlencode,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     determine_ext,
     extract_attributes,
     ExtractorError,
+    get_element_by_class,
+    get_element_by_id,
+    get_elements_by_class,
     int_or_none,
+    join_nonempty,
+    LazyList,
+    merge_dicts,
+    parse_count,
     parse_duration,
-    try_get,
+    remove_end,
+    remove_start,
+    T,
+    traverse_obj,
+    try_call,
+    txt_or_none,
     url_basename,
     urljoin,
 )
@@ -27,14 +44,18 @@ from ..utils import (
 
 class XVideosIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-                    https?://
-                        (?:
-                            (?:[^/]+\.)?xvideos2?\.com/(?:video|prof-video-click/model/[^/]+/)|
-                            (?:www\.)?xvideos\.es/video|
-                            (?:www|flashservice)\.xvideos\.com/embedframe/|
-                            static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
-                        )
-                        (?P<id>\d+)
+                    (?:
+                        https?://
+                            (?:
+                                # xvideos\d+\.com redirects to xvideos.com
+                                # (?P<country>[a-z]{2})\.xvideos.com too: catch it anyway
+                                (?:[^/]+\.)?xvideos\.com/(?:video|prof-video-click/model/[^/]+/)|
+                                (?:www\.)?xvideos\.es/video|
+                                (?:www|flashservice)\.xvideos\.com/embedframe/|
+                                static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
+                            )|
+                        xvideos:
+                    )(?P<id>\d+)
                  '''
     _TESTS = [{
         'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
@@ -45,19 +66,51 @@ class XVideosIE(InfoExtractor):
             'title': 'Biker Takes his Girl',
             'duration': 108,
             'age_limit': 18,
-        }
+        },
+        'skip': 'Sorry, this video has been deleted',
+    }, {
+        'url': 'https://www.xvideos.com/video78250973/hot_blonde_gets_excited_in_the_middle_of_the_club.',
+        'md5': '0bc6e46ef55907533ffa0542e45958b6',
+        'info_dict': {
+            'id': '78250973',
+            'ext': 'mp4',
+            'title': 'Hot blonde gets excited in the middle of the club.',
+            'uploader': 'Deny Barbie Official',
+            'age_limit': 18,
+            'duration': 302,
+        },
     }, {
         # Broken HLS formats
         'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
-        'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
+        'md5': '18ff7d57d4edc3c908fc5b06166dd63d',
         'info_dict': {
             'id': '65982001',
             'ext': 'mp4',
             'title': 'what\'s her name?',
-            'duration': 120,
+            'uploader': 'Skakdjskdk',
             'age_limit': 18,
-            'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+            'duration': 120,
+            'thumbnail': r're:^https://img-[a-z]+.xvideos-cdn.com/.+\.jpg',
         }
+    }, {
+        # from PR #30689
+        'url': 'https://www.xvideos.com/video50011247/when_girls_play_-_adriana_chechik_abella_danger_-_tradimento_-_twistys',
+        'md5': 'aa54f96311768b3a8bfe54b8c8fda070',
+        'info_dict': {
+            'id': '50011247',
+            'ext': 'mp4',
+            'title': 'When Girls Play - (Adriana Chechik, Abella Danger) - Betrayal - Twistys',
+            'duration': 720,
+            'age_limit': 18,
+            'tags': ['lesbian', 'teen', 'hardcore', 'latina', 'rough', 'squirt', 'big-ass', 'cheater', 'twistys', 'cheat', 'ass-play', 'when-girls-play'],
+            'creator': 'Twistys',
+            'uploader': 'Twistys',
+            'uploader_url': 'https://www.xvideos.com/channels/twistys1',
+            'cast': [{'given_name': 'Adriana Chechik', 'url': 'https://www.xvideos.com/pornstars/adriana-chechik'}, {'given_name': 'Abella Danger', 'url': 'https://www.xvideos.com/pornstars/abella-danger'}],
+            'view_count': 'lambda c: c >= 4038715',
+            'like_count': 'lambda c: c >= 8800',
+            'dislike_count': 'lambda c: c >= 3100',
+        },
     }, {
         'url': 'https://flashservice.xvideos.com/embedframe/4588838',
         'only_matching': True,
@@ -138,7 +191,7 @@ class XVideosIE(InfoExtractor):
         duration = int_or_none(self._og_search_property(
             'duration', webpage, default=None)) or parse_duration(
             self._search_regex(
-                r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+                r'''<span [^>]*\bclass\s*=\s*["']duration\b[^>]+>.*?(\d[^<]+)''',
                 webpage, 'duration', fatal=False))
 
         formats = []
@@ -169,15 +222,57 @@ class XVideosIE(InfoExtractor):
 
         self._sort_formats(formats)
 
-        return {
+        # adapted from PR #30689
+        ignore_tags = set(('xvideos', 'xvideos.com', 'x videos', 'x video', 'porn', 'video', 'videos'))
+        tags = self._html_search_meta('keywords', webpage) or ''
+        tags = [t for t in re.split(r'\s*,\s*', tags) if t not in ignore_tags]
+
+        mobj = re.search(
+            r'''(?sx)
+                (?P<ul><a\b[^>]+\bclass\s*=\s*["'](?:[\w-]+\s+)*uploader-tag(?:\s+[\w-]+)*[^>]+>)
+                \s*<span\s+class\s*=\s*["']name\b[^>]+>\s*(?P<name>.+?)\s*<
+            ''', webpage)
+        creator = None
+        uploader_url = None
+        if mobj:
+            uploader_url = urljoin(url, extract_attributes(mobj.group('ul')).get('href'))
+            creator = mobj.group('name')
+
+        def get_actor_data(mobj):
+            ul_url = extract_attributes(mobj.group('ul')).get('href')
+            if '/pornstars/' in ul_url:
+                return {
+                    'given_name': mobj.group('name'),
+                    'url': urljoin(url, ul_url),
+                }
+
+        actors = traverse_obj(re.finditer(
+            r'''(?sx)
+                (?P<ul><a\b[^>]+\bclass\s*=\s*["'](?:[\w-]+\s+)*profile(?:\s+[\w-]+)*[^>]+>)
+                \s*<span\s+class\s*=\s*["']name\b[^>]+>\s*(?P<name>.+?)\s*<
+            ''', webpage), (Ellipsis, T(get_actor_data)))
+
+        return merge_dicts({
             'id': video_id,
             'formats': formats,
             'title': title,
-            'duration': duration,
-            'thumbnails': thumbnails,
             'age_limit': 18,
-        }
-
+        }, {
+            'duration': duration,
+            'thumbnails': thumbnails or None,
+            'tags': tags or None,
+            'creator': creator,
+            'uploader': creator,
+            'uploader_url': uploader_url,
+            'cast': actors or None,
+            'view_count': parse_count(get_element_by_class(
+                'mobile-hide', get_element_by_id('v-views', webpage))),
+            'like_count': parse_count(get_element_by_class('rating-good-nbr', webpage)),
+            'dislike_count': parse_count(get_element_by_class('rating-bad-nbr', webpage)),
+        }, {
+            'channel': creator,
+            'channel_url': uploader_url,
+        } if '/channels/' in (uploader_url or '') else {})
 
 class XVideosPlaylistIE(InfoExtractor):
     _VALID_URL = r'''(?x)