[LibriVox] Add new extractor

2024-11-16 23:35:45 +00:00 · 2017-09-20 03:02:02 +02:00 · 2017-09-20 03:02:02 +02:00 · aa016336a8
commit aa016336a8
parent 3b65a6fbf3
2 changed files with 49 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -527,6 +527,7 @@ from .leeco import (
    LetvCloudIE,
 )
 from .libraryofcongress import LibraryOfCongressIE
 from .librivox import LibriVoxIE
 from .libsyn import LibsynIE
 from .lifenews import (
    LifeNewsIE,
--- a/youtube_dl/extractor/librivox.py
+++ b/youtube_dl/extractor/librivox.py
@ -0,0 +1,48 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    orderedSet
 )
 class LibriVoxIE(InfoExtractor):
    """Extract a LibriVox book page as a playlist of its MP3 chapters.

    The book title and author are not scraped from the page; they are
    reconstructed from the ``<title>-by-<author>`` slug in the URL.
    """

    # The slug is split on its LAST "-by-": the title group is greedy, so a
    # title that itself contains "-by-" stays intact. Excluding "/", "?" and
    # "#" keeps the id confined to a single path segment, and single-word
    # titles (e.g. "dracula-by-bram-stoker") are accepted.
    _VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P<id>(?P<title>[^/?#]+)-by-(?P<author>[^/?#]+))'

    # Groups: (1) full MP3 URL, (2) file stem used as the entry id,
    # (3) anchor text used as the entry title.
    # The negative lookbehind skips files whose stem ends in "64kb".
    # The URL parts may not cross the closing quote of the href, and the
    # attribute/title parts stop at the first ">" / "</a>", so several
    # anchors sharing one line are matched one by one instead of being
    # swallowed by a greedy ".*".
    _CHAPTER_RE = (
        r'<a href="(https?://(?:www\.)?archive\.org/download/[^/"]*/'
        r'([^\."]*(?<!(?:64kb)))\.mp3)"[^>]*>(.*?)</a>'
    )

    _TESTS = [{
        'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/',
        'info_dict': {
            'id': 'the-art-of-war-by-sun-tzu',
            'title': 'The Art Of War by Sun Tzu'
        },
        'playlist_mincount': 7
    }, {
        'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/',
        'info_dict': {
            'id': 'alexander-the-great-by-jacob-abbott',
            'title': 'Alexander The Great by Jacob Abbott'
        },
        'playlist_mincount': 12
    }, {
        # Single-word title: has no hyphen before "-by-".
        'url': 'https://librivox.org/dracula-by-bram-stoker/',
        'only_matching': True
    }]

    @staticmethod
    def _slug_to_words(slug):
        """Turn a hyphenated URL slug into a title-cased phrase."""
        return slug.replace('-', ' ').strip().title()

    def _real_extract(self, url):
        """Return a playlist info dict for the book page at *url*.

        The playlist id is the URL slug, the title is
        ``'<Title> by <Author>'`` derived from that slug, and each entry
        is a ``url_result`` pointing at one archive.org MP3 link found in
        the downloaded page.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        book_title = self._slug_to_words(mobj.group('title'))
        author = self._slug_to_words(mobj.group('author'))

        webpage = self._download_webpage(url, video_id)

        # orderedSet drops repeated (url, id, title) tuples; presumably it
        # keeps first-seen order, as its name suggests.
        chapters = orderedSet(re.findall(self._CHAPTER_RE, webpage))
        entries = [
            self.url_result(
                chapter_url, video_id=chapter_id, video_title=chapter_title)
            for chapter_url, chapter_id, chapter_title in chapters
        ]

        return {
            'id': video_id,
            '_type': 'playlist',
            'title': book_title + ' by ' + author,
            'entries': entries,
        }