From 2b1b2d83cacfdce19cae5eea2f9bbfd142efc7f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 31 Oct 2015 22:17:09 +0600 Subject: [PATCH] [mdr] Modernize and include kika.de --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/kika.py | 101 ------------------ youtube_dl/extractor/mdr.py | 174 +++++++++++++++++++++++-------- 3 files changed, 132 insertions(+), 144 deletions(-) delete mode 100644 youtube_dl/extractor/kika.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ad4e9c36..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,7 +274,6 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE -from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py deleted file mode 100644 index 5337ac439..000000000 --- a/youtube_dl/extractor/kika.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' - - _TESTS = [ - { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - ] - - def _real_extract(self, url): - # broadcast_id may be the same as the video_id - broadcast_id = self._match_id(url) - webpage = self._download_webpage(url, broadcast_id) - - xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' - video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) - if not video_id: - err_msg = 'Video %s is not available online' % broadcast_id - raise ExtractorError(err_msg, expected=True) - - xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) - xml_tree = self._download_xml(xml_url, video_id) - - title = xml_tree.find('title').text - webpage_url = xml_tree.find('htmlUrl').text - - # Try to get the description, not available for all videos - try: - broadcast_elem = xml_tree.find('broadcast') - description = broadcast_elem.find('broadcastDescription').text - except AttributeError: - description = None - - # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) - tmp = xml_tree.find('duration').text.split(':') - duration = int(tmp[0]) * 60 + int(tmp[1]) - - formats = [{ - 'url': elem.find('progressiveDownloadUrl').text, - 'ext': elem.find('mediaType').text.lower(), - 'format': elem.find('profileName').text, - 'width': int(elem.find('frameWidth').text), - 'height': int(elem.find('frameHeight').text), - 'abr': int(elem.find('bitrateAudio').text), - 'vbr': int(elem.find('bitrateVideo').text), - 'filesize': int(elem.find('fileSize').text), - } for elem in xml_tree.find('assets')] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': duration, - 'webpage_url': webpage_url, - } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..541ddd909 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,154 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + xpath_text, +) class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' + IE_DESC = 'MDR.DE and KiKA' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' - # No tests, MDR regularily deletes its videos - _TEST = { + _TESTS = [{ + # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - } + }, { + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', + 'info_dict': { + 'id': '19636', + 'ext': 'mp4', + 'title': 'Baumhaus vom 30. 
Oktober 2015', + 'duration': 134, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + 'timestamp': 1419047100, + 'upload_date': '20141220', + 'duration': 4628, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'only_matching': True, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', + 'only_matching': True, + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('video_id') - domain = m.group('domain') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) + data_url = self._search_regex( + r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url') - title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') - xmlurl = self._search_regex( - r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') + doc = self._download_xml( + compat_urlparse.urljoin(url, data_url), video_id) + + title = (xpath_text(doc, './title', 'title', default=None) or + xpath_text(doc, './broadcast/broadcastName', 'title')) - doc = self._download_xml(domain + xmlurl, video_id) formats = [] - for a in doc.findall('./assets/asset'): - url_el = a.find('./progressiveDownloadUrl') - if url_el is None: - continue - abr = int(a.find('bitrateAudio').text) // 1000 - media_type = a.find('mediaType').text - format = { - 'abr': abr, - 'filesize': int(a.find('fileSize').text), - 'url': url_el.text, - } - - vbr_el = a.find('bitrateVideo') - if vbr_el is None: - 
format.update({ - 'vcodec': 'none', - 'format_id': '%s-%d' % (media_type, abr), - }) - else: - vbr = int(vbr_el.text) // 1000 - format.update({ - 'vbr': vbr, - 'width': int(a.find('frameWidth').text), - 'height': int(a.find('frameHeight').text), - 'format_id': '%s-%d' % (media_type, vbr), - }) - formats.append(format) + processed_urls = [] + for asset in doc.findall('./assets/asset'): + for source in ( + 'progressiveDownload', + 'dynamicHttpStreamingRedirector', + 'adaptiveHttpStreamingRedirector'): + url_el = asset.find('./%sUrl' % source) + if url_el is None: + continue + + video_url = url_el.text + if video_url in processed_urls: + continue + + processed_urls.append(video_url) + + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + + url_formats = [] + + ext = determine_ext(url_el.text) + if ext == 'm3u8': + url_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=0, m3u8_id='HLS', fatal=False) + elif ext == 'f4m': + url_formats = self._extract_f4m_formats( + video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + preference=0, f4m_id='HDS', fatal=False) + else: + media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + + f = { + 'url': video_url, + 'format_id': '%s-%d' % (media_type, vbr or abr), + 'filesize': filesize, + 'abr': abr, + 'preference': 1, + } + + if vbr: + width = int_or_none(xpath_text(asset, './frameWidth', 'width')) + height = int_or_none(xpath_text(asset, './frameHeight', 'height')) + f.update({ + 'vbr': vbr, + 'width': width, + 'height': height, + }) + + url_formats.append(f) + + if not vbr: + for f in url_formats: + abr = f.get('tbr') or abr + if 'tbr' in f: + 
del f['tbr'] + f.update({ + 'abr': abr, + 'vcodec': 'none', + }) + + if url_formats: + formats.extend(url_formats) self._sort_formats(formats) + description = xpath_text(doc, './broadcast/broadcastDescription', 'description') + timestamp = parse_iso8601( + xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or + xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + duration = parse_duration(xpath_text(doc, './duration', 'duration')) + uploader = xpath_text(doc, './rights', 'uploader') + return { 'id': video_id, 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'uploader': uploader, 'formats': formats, }