[br] Allow '/' in URL, allow empty author + broadcastDate fields

* Allow URLs that have a 'subdirectory' before the actual program name, e.g. 'xyz/xyz-episode-1'.
* Allow the author and broadcastDate fields in the XML file to be empty.
* Add a test case for the two problems above.
2024-11-22 18:22:21 +00:00 · 2014-03-13 14:01:20 +01:00 · 2014-03-13 14:01:20 +01:00 · c21215b421
commit c21215b421
parent 98ff9d82d4
1 changed files with 41 additions and 22 deletions
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@ -9,10 +9,11 @@ from ..utils import ExtractorError
 class BRIE(InfoExtractor):
    IE_DESC = "Bayerischer Rundfunk Mediathek"
-    _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P<id>[a-z0-9\-]+)\.html$"
+    _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?:[a-z0-9\-/]+/)?(?P<id>[a-z0-9\-]+)\.html$"
    _BASE_URL = "http://www.br.de"
-    _TEST = {
+    _TESTS = [
        {
            "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html",
            "md5": "c4f83cf0f023ba5875aba0bf46860df2",
            "info_dict": {
@ -23,7 +24,20 @@ class BRIE(InfoExtractor):
                "uploader": "BR/Birgit Baier",
                "upload_date": "20140301"
            }
        },
        {
            "url": "http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html",
            "md5": "ab451b09d861dbed7d7cc9ab0be19ebe",
            "info_dict": {
                "id": "2c060e69-3a27-4e13-b0f0-668fac17d812",
                "ext": "mp4",
                "title": "Über den Pass",
                "description": "Die Eroberung der Alpen: Über den Pass",
                "uploader": None,
                "upload_date": None
            }
        }
    ]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@ -33,16 +47,21 @@ class BRIE(InfoExtractor):
            r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL")
        xml = self._download_xml(self._BASE_URL + xml_url, None)
-        videos = [{
+        videos = []
        for xml_video in xml.findall("video"):
            video = {
                "id": xml_video.get("externalId"),
                "title": xml_video.find("title").text,
                "formats": self._extract_formats(xml_video.find("assets")),
                "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")),
                "description": " ".join(xml_video.find("shareTitle").text.splitlines()),
-            "uploader": xml_video.find("author").text,
+                "webpage_url": xml_video.find("permalink").text
-            "upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))),
+            }
-            "webpage_url": xml_video.find("permalink").text,
+            if xml_video.find("author").text:
-        } for xml_video in xml.findall("video")]
+                video["uploader"] = xml_video.find("author").text
            if xml_video.find("broadcastDate").text:
                video["upload_date"] =  "".join(reversed(xml_video.find("broadcastDate").text.split(".")))
            videos.append(video)
        if len(videos) > 1:
            self._downloader.report_warning(