From 7b53af7f70da81eae41da645cc5af2c777c5c8e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Aug 2014 06:43:36 +0700 Subject: [PATCH] [vporn] Fix issues, extract all formats and metadata --- youtube_dl/extractor/vporn.py | 91 ++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index 645e935ec..426369c51 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -1,52 +1,99 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + parse_duration, + str_to_int, +) + class VpornIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?vporn\.com/[a-z]+/(?P<display_id>[a-z-]+)/(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)' _TEST = { 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', 'md5': 'facf37c1b86546fa0208058546842c55', 'info_dict': { 'id': '497944', + 'display_id': 'violet-on-her-th-birthday', 'ext': 'mp4', - 'title': 'Violet On Her 19th Birthday', + 'title': 'Violet on her 19th birthday', 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'duration': 393, 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'kileyGrope', + 'categories': ['Masturbation', 'Teen'], + 'duration': 393, + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?) - Vporn Video', webpage, 'title') - video_url = self._html_search_regex(r'flashvars.videoUrlMedium = "(.*?)"', webpage, 'video_url') - description = self._html_search_regex(r'
(.*?)
', webpage, 'description') - thumbnail = 'http://www.vporn.com' + self._html_search_regex(r'flashvars.imageUrl = "(.*?)"', webpage, 'description') + webpage = self._download_webpage(url, display_id) - mobj = re.search(r'duration (?P<minutes>\d+) min (?P<seconds>\d+) sec ', webpage) - duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + title = self._html_search_regex( + r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() + description = self._html_search_regex( + r'
(.*?)
', webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None) + if thumbnail: + thumbnail = 'http://www.vporn.com' + thumbnail - mobj = re.search(r'((?P<thousands>\d+),)?(?P<units>\d+) VIEWS', webpage) - try: - view_count = int(mobj.group('units')) - view_count += int(mobj.group('thousands')) * 1000 - except: - pass + uploader = self._html_search_regex( + r'(?s)UPLOADED BY.*?([^<]+)', + webpage, 'uploader', fatal=False) + + categories = re.findall(r'([^<]+)', webpage) + + duration = parse_duration(self._search_regex( + r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False)) + + view_count = str_to_int(self._html_search_regex( + r'([\d,\.]+) VIEWS', webpage, 'view count', fatal=False)) + like_count = str_to_int(self._html_search_regex( + r'([\d,\.]+)', webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._html_search_regex( + r'([\d,\.]+)', webpage, 'dislike count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'

Comments \(([\d,\.]+)\)

', webpage, 'comment count', fatal=False)) + + formats = [] + + for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"([^"]+)"', webpage): + video_url = video[1] + fmt = { + 'url': video_url, + 'format_id': video[0], + } + m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url) + if m: + fmt.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'vbr': int(m.group('vbr')), + }) + formats.append(fmt) + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, - 'thumbnail': thumbnail, + 'display_id': display_id, 'title': title, 'description': description, - 'duration': int_or_none(duration), - 'view_count': int_or_none(view_count), + 'thumbnail': thumbnail, + 'uploader': uploader, + 'categories': categories, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'age_limit': 18, + 'formats': formats, }