From f870544302f75bee0d96f6a8623c8ff270beca89 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 07:41:38 -0500 Subject: [PATCH 001/415] Add support for democracynow.org Supports downloading clips or entire shows. Subtitle support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/democracynow.py | 100 +++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/democracynow.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..5cc03b875 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,7 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py new file mode 100644 index 000000000..1c9b36052 --- /dev/null +++ b/youtube_dl/extractor/democracynow.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import time +import hmac +import hashlib +import itertools +import re +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + parse_iso8601, +) +from ..compat import compat_urllib_request +from .common import InfoExtractor + + +class DemocracynowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P[^\?]*)' + IE_NAME = 'democracynow' + _TESTS = [{ + 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': 'July 03, 2015 - Democracy Now!', + 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We 
Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + },{ + 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', + 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + base_host = re.search(r'^(.+?://[^/]+)', url).group(1) + if display_id == '': + display_id = 'home' + webpage = self._download_webpage(url, display_id) + re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) + if video_id == None: + video_id = purl.group('fn') + if js.get('start') != None: + url += '&' if purl.group('hasparams') == '?' else '?' + url = url + 'start='+str(js.get('start')) + formats.append({ + 'format_id': purl.group('dir'), + 'ext': purl.group('ext'), + 'url': url, + }) + self._sort_formats(formats) + ret = { + 'id': video_id, + 'title': js.get('title'), + 'description': description, + 'uploader': 'Democracy Now', +# 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + return ret +# \ No newline at end of file From eb08081330f5ef52d66140589137ae1bb05eee5f Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:57:08 -0500 Subject: [PATCH 002/415] democracynow: correct syntax --- youtube_dl/extractor/democracynow.py | 43 +++++++++------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 1c9b36052..973bb437b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -1,19 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import time -import 
hmac -import hashlib -import itertools import re -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) -from ..compat import compat_urllib_request from .common import InfoExtractor @@ -30,7 +18,7 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - },{ + }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', 'info_dict': { 'id': '2015-0703-001', @@ -40,7 +28,6 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - }] def _real_extract(self, url): @@ -49,7 +36,7 @@ def _real_extract(self, url): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) - if video_id == None: + purl = re.search(r'/(?P[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)', url) + if video_id is None: video_id = purl.group('fn') - if js.get('start') != None: + if js.get('start') is not None: url += '&' if purl.group('hasparams') == '?' else '?' 
- url = url + 'start='+str(js.get('start')) + url = url + 'start=' + str(js.get('start')) formats.append({ 'format_id': purl.group('dir'), 'ext': purl.group('ext'), @@ -92,9 +79,7 @@ def _real_extract(self, url): 'title': js.get('title'), 'description': description, 'uploader': 'Democracy Now', -# 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } return ret -# \ No newline at end of file From f57f84f606b246db4f102fc5bc55e64e4f7a3d60 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:38:40 -0500 Subject: [PATCH 003/415] Twitter: get and describe video from status urls --- youtube_dl/extractor/twitter.py | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1aaa06305..a65252cc6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -70,3 +70,47 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, } + + +class TwitterIE(TwitterCardIE): + _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + + _TESTS = [{ + 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', + 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'info_dict': { + 'id': '614301758345490432', + 'ext': 'mp4', + 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 29.5, + 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', + 'uploader': 'banksy', + 'uploader_id': 'thereaIbanksy', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() + name = username + url = re.sub(r'https?://(m|mobile)\.', 'https://', url) + webpage = self._download_webpage(url, 'tweet: ' + url) + description = unescapeHTML(self._search_regex('\s*(.+?)\s*', webpage, 'title')) + title = 
description.replace('\n', ' ') + splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) + if splitdesc: + name, title = splitdesc.groups() + title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'uploader_id': username, + 'uploader': name, + 'url': card_url, + 'webpage_url': url, + 'description': description, + 'title': username + ' - ' + title, + } From c3dea3f878133f3cbdad9e548609d3077572af66 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:45:36 -0500 Subject: [PATCH 004/415] Twittercard: support vmapurl method --- youtube_dl/extractor/twitter.py | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a65252cc6..1dd43ff3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -12,17 +12,30 @@ class TwitterCardIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' - _TEST = { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 30.033, + _TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': 'a74f50b310c83170319ba16de6955192', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + } }, - } + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 
'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 80.155, + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,6 +57,20 @@ def _real_extract(self, url): unescapeHTML(self._search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config')), video_id) + if 'playlist' not in config: + if 'vmapUrl' in config: + webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') + video_url = self._search_regex( + r'\s*', webpage, 'data player config (xml)') + f = { + 'url': video_url, + } + ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) + if ext: + f['ext'] = ext.group(1) + formats.append(f) + break # same video regardless of UA + continue video_url = config['playlist'][0]['source'] From 9e7e0dffd5e3e3c959e8d99a5e236b9099886fe9 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:56:35 -0500 Subject: [PATCH 005/415] Actually add the extractor --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 50da08830..5c03bf8e8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -651,7 +651,7 @@ TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, From 689fb748ee1ba8e61f99d21a3bcb1bc83b708649 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:44:17 +0100 Subject: [PATCH 006/415] [utlis] add extract_attributes for extracting html tags attributes --- youtube_dl/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..bcebf9cc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -248,6 +248,14 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, 
attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [PATCH 007/415] [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ def _extract_video_info(self, video_info): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + TEST = { + 
'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type 
== 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,7 +29,10 @@ url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ def _playlist_from_matches(matches, getter=None, ie=None): 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( 
r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', From 53407e3f383ed80c67db9e06b8c3480257aa3184 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Sep 2015 14:02:13 +0100 Subject: [PATCH 008/415] [brightcove] fix streaming_src extraction --- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a07c0888f..e4a7befee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,7 +413,7 @@ def _real_extract(self, url): if source_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) else: - src = source.get('src') + src = source.get('src') or source.get('streaming_src') if src: formats.append({ 'url': src, @@ -424,8 +424,6 @@ def _real_extract(self, url): 'container': source.get('container'), 'vcodec': source.get('container'), }) - else: - formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) self._sort_formats(formats) From c01e1a96aa964ef6d5f0bf7675dbe34096b1d2c8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 30 Sep 2015 11:20:43 +0100 Subject: [PATCH 009/415] [brightcove] fix test and fields extraction --- youtube_dl/extractor/brightcove.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e4a7befee..b41cee91b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,14 +354,18 @@ def _extract_video_info(self, video_info): class BrightcoveInPageEmbedIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' - TEST = { + _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 
+ 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', 'duration': 165768, + 'uploader_id': '929656772001', } } @@ -403,7 +407,7 @@ def _real_extract(self, url): title = json_data['name'] description = json_data.get('description') - thumbnail = json_data.get('name') + thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) duration = int_or_none(json_data.get('duration')) @@ -417,12 +421,13 @@ def _real_extract(self, url): if src: formats.append({ 'url': src, - 'abr': source.get('avg_bitrate'), + 'tbr': source.get('avg_bitrate'), 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'filesize': source.get('size'), 'container': source.get('container'), - 'vcodec': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), }) self._sort_formats(formats) @@ -435,4 +440,5 @@ def _real_extract(self, url): 'timestamp': timestamp, 'duration': duration, 'formats': formats, + 'uploader_id': account_id, } From 30787f7259c4e6a08f691cc691f14fa0c8fe4b87 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 3 Oct 2015 19:28:48 +0100 Subject: [PATCH 010/415] [cspan] correct the clip info extraction --- youtube_dl/extractor/cspan.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..994e080d5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -18,22 +18,21 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': 
'067803f994e049b455a58b16e5aab442', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { @@ -44,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,36 +56,33 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type 
= 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) - - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - files = data['video']['files'] try: capfile = data['video']['capfile']['#text'] @@ -112,12 +108,12 @@ def _real_extract(self, url): if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } From 41a7b00f183844e93ae2ba46fb4021f257f3ce79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:18:40 +0200 Subject: [PATCH 011/415] [vimeo] Extract config URL from (new?) 
React-based Vimeo's page --- youtube_dl/extractor/vimeo.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa1b22049..88e462a4d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -286,7 +286,14 @@ def _real_extract(self, url): try: try: config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, 'config URL') + r' data-config-url="(.+?)"', webpage, + 'config URL', default=None) + if not config_url: + # New react-based page + vimeo_clip_page_config = self._search_regex( + r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, + 'vimeo clip page config') + config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: From dd8417526b13c541e6db8f4200e717b8922a1620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 22:48:14 +0600 Subject: [PATCH 012/415] [vimeo] Clarify new react+flux website fallback --- youtube_dl/extractor/vimeo.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 88e462a4d..0f84656c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -289,11 +289,14 @@ def _real_extract(self, url): r' data-config-url="(.+?)"', webpage, 'config URL', default=None) if not config_url: - # New react-based page + # Sometimes new react-based page is served instead of old one that require + # different config URL extraction approach (see + # https://github.com/rg3/youtube-dl/pull/7209) vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] + config_url = self._parse_json( + 
vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: From 59fe4824f80b7e266ea9918ae1b2e49a456b869f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:52:25 +0200 Subject: [PATCH 013/415] [vidme] Better error message for suspended vidme videos --- youtube_dl/extractor/vidme.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 078d283b2..81dcaa231 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -114,6 +114,12 @@ def _real_extract(self, url): video = response['video'] + if video.get('state') == 'user-disabled': + raise ExtractorError( + 'Vidme said: This video has been suspended either due to a copyright claim, ' + 'or for violating the terms of use.', + expected=True) + formats = [{ 'format_id': f.get('type'), 'url': f['uri'], From 9eb31b265f65ec6b04a508702af1a6feddafb8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 23:01:24 +0600 Subject: [PATCH 014/415] [vidme] Add user-disabled test --- youtube_dl/extractor/vidme.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 81dcaa231..382517a4a 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # nsfw, user-disabled + 'url': 'https://vid.me/dzGJ', + 'only_matching': True, }] def _real_extract(self, url): From 583882fdce19f8c565402f42523b275f96c91575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 19:26:30 +0200 Subject: [PATCH 015/415] [dailymotion] Report errors from player v5 --- youtube_dl/extractor/dailymotion.py | 17 ++++++++++++++--- 1 file changed, 14 
insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 80a05cfee..ea1edceb1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'HotWaves1012', 'age_limit': 18, } + }, + # geo-restricted, player v5 + { + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, } ] @@ -124,6 +129,9 @@ def _real_extract(self, url): if player_v5: player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + + self._check_error(metadata) + formats = [] for quality, media_list in metadata['qualities'].items(): for media in media_list: @@ -201,9 +209,7 @@ def _real_extract(self, url): 'video info', flags=re.MULTILINE), video_id) - if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + self._check_error(info) formats = [] for (key, format_id) in self._FORMATS: @@ -246,6 +252,11 @@ def _real_extract(self, url): 'duration': info['duration'] } + def _check_error(self, info): + if info.get('error') is not None: + msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] + raise ExtractorError(msg, expected=True) + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( From 648e6a1ffe45ceae2995c3f9ec6a9413aad55640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 00:11:34 +0600 Subject: [PATCH 016/415] [youtube] Generalize playlist entries extraction (Closes #6699, closes #6992) --- youtube_dl/extractor/youtube.py | 121 ++++++++++++++------------------ 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..08e821362 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ 
-178,6 +178,52 @@ def _real_initialize(self): return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): + # Extract the video ids from the playlist pages + def _entries(self, page, playlist_id): + more_widget_html = content_html = page + for page_num in itertools.count(1): + for video_id, video_title in self.extract_videos_from_page(content_html): + yield self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + for mobj in re.finditer(self._VIDEO_RE, page): + # The link with index 0 is not the first video of the playlist (not sure if still actual) + if 'index' in mobj.groupdict() and mobj.group('id') == '0': + continue + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + if video_title: + video_title = video_title.strip() + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) + + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ @@ -1419,7 +1465,7 @@ def _map_to_format_list(urlmap): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? 
@@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1603,11 @@ def _extract_playlist(self, playlist_id): else: self.report_warning('Youtube gives an alert message: ' + match) - # Extract the video ids from the playlist pages - def _entries(): - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - for vid_id in new_ids: - yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - return self.playlist_result(_entries(), playlist_id, playlist_title) + return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id @@ -1613,10 +1633,11 @@ def _real_extract(self, url): return 
self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' + _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', @@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): } }] - @staticmethod - def extract_videos_from_page(page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): channel_id = self._match_id(url) @@ -1685,29 +1690,7 @@ def _real_extract(self, url): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) - def _entries(): - more_widget_html = content_html = channel_page - for pagenum in itertools.count(1): - - for video_id, video_title in self.extract_videos_from_page(content_html): - yield self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - - mobj = re.search( - r'data-uix-load-more-href="/?(?P<more>[^"]+)"', - more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), channel_id, - 'Downloading page #%s' % (pagenum + 1), - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] 
- - return self.playlist_result(_entries(), channel_id) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): From 8e5b1219489be399de55566090e145c89007fa48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 00:27:06 +0600 Subject: [PATCH 017/415] [test_youtube_lists] Add test flat playlist entries' titles --- test/test_youtube_lists.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c889b6f15..26aadb34f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -57,5 +57,14 @@ def test_youtube_toptracks(self): entries = result['entries'] self.assertEqual(len(entries), 100) + def test_youtube_flat_playlist_titles(self): + dl = FakeYDL() + dl.params['extract_flat'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertIsPlaylist(result) + for entry in result['entries']: + self.assertTrue(entry.get('title')) + if __name__ == '__main__': unittest.main() From 7593fbaa126f8bf14eecff7f103cb497e3d31de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:00:37 +0600 Subject: [PATCH 018/415] [dailymotion] Error spelling --- youtube_dl/extractor/dailymotion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ea1edceb1..9cd9ff17d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -254,8 +254,8 @@ def _real_extract(self, url): def _check_error(self, info): if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, 
info['error']['title']), expected=True) def _get_subtitles(self, video_id, webpage): try: From 5a11b793fe70beb6b0c7a74a489db9e52c4a742b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:36:03 +0600 Subject: [PATCH 019/415] [lynda] Extract all prioritized streams --- youtube_dl/extractor/lynda.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 378117270..5c973e75c 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -140,13 +140,14 @@ def _real_extract(self, url): prioritized_streams = video_json.get('PrioritizedStreams') if prioritized_streams: - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': format_id, - } for format_id, video_url in prioritized_streams['0'].items() - ]) + for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): + formats.extend([ + { + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items() + ]) self._check_formats(formats, video_id) self._sort_formats(formats) From 355c7ad361aa3c8a57ff83e3f702a496dce59e65 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 21:30:38 +0100 Subject: [PATCH 020/415] [cspan] handle error massages and extract qualities --- youtube_dl/extractor/cspan.py | 67 +++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 994e080d5..c74b35fd9 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,16 +9,21 @@ find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE +def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') + + class 
CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '067803f994e049b455a58b16e5aab442', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', @@ -28,7 +33,7 @@ class CSpanIE(InfoExtractor): 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -37,7 +42,7 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', @@ -71,8 +76,10 @@ def _real_extract(self, url): return self.url_result(surl, 'SenateISVP', video_id, title) data = self._download_json( - 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), - video_id) + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -83,28 +90,36 @@ def _real_extract(self, url): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 
'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) From 80f48920c8a909ba55d13932524e55ed970f1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 06:57:57 +0600 Subject: [PATCH 021/415] [crunchyroll] Bypass maturity wall (Closes #7202) --- youtube_dl/extractor/crunchyroll.py | 59 ++++++++++++++++++----------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 95952bc29..aa258bbc2 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -32,6 +32,26 @@ class CrunchyrollBaseIE(InfoExtractor): + _NETRC_MACHINE = 'crunchyroll' + + def _login(self): + (username, password) = 
self._get_login_info() + if username is None: + return + self.report_login() + login_url = 'https://www.crunchyroll.com/?a=formhandler' + data = urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'name': username, + 'password': password, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else compat_urllib_request.Request(url_or_request)) @@ -46,10 +66,22 @@ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, f return super(CrunchyrollBaseIE, self)._download_webpage( request, video_id, note, errnote, fatal, tries, timeout, encoding) + @staticmethod + def _add_skip_wall(url): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: + # > This content may be inappropriate for some people. + # > Are you sure you want to continue? + # since it's not disabled by default in crunchyroll account's settings. + # See https://github.com/rg3/youtube-dl/issues/7202. 
+ qs['skip_wall'] = ['1'] + return compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' - _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -81,7 +113,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, - }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -94,24 +125,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): '1080': ('80', '108'), } - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - self.report_login() - login_url = 'https://www.crunchyroll.com/?a=formhandler' - data = urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'name': username, - 'password': password, - }) - login_request = compat_urllib_request.Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) @@ -254,7 +267,7 @@ def _real_extract(self, url): else: webpage_url = 'http://www.' 
+ mobj.group('url') - webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') + webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -352,7 +365,7 @@ def _real_extract(self, url): class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', @@ -366,7 +379,7 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + webpage = self._download_webpage(self._add_skip_wall(url), show_id) title = self._html_search_regex( r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', webpage, 'title') From 49941c4e4f6e33785a3be1e0d103bd81657d8a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 07:06:47 +0600 Subject: [PATCH 022/415] [crunchyroll] Add maturity wall reference tests (#7202) --- youtube_dl/extractor/crunchyroll.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index aa258bbc2..cecd0c784 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -116,6 +116,10 @@ class CrunchyrollIE(CrunchyrollBaseIE): }, { 'url': 
'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', + 'only_matching': True, }] _FORMAT_IDS = { @@ -374,6 +378,19 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' }, 'playlist_count': 13, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', + 'info_dict': { + 'id': 'cosplay-complex-ova', + 'title': 'Cosplay Complex OVA' + }, + 'playlist_count': 3, + 'skip': 'Georestricted', + }, { + # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 + 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', + 'only_matching': True, }] def _real_extract(self, url): From 448ef1f31c8bcc1550cf907fd46e31026ec981b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:11:02 +0600 Subject: [PATCH 023/415] [extractor/common] Allow angle brackets in attributes in _og_regexes (#7215) --- test/test_InfoExtractor.py | 4 ++++ youtube_dl/extractor/common.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 2a00d09a5..938466a80 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -37,12 +37,16 @@ def test_opengraph(self): <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&key2=val2'/> <meta content='application/x-shockwave-flash' property='og:video:type'> <meta content='Foo' property=og:foobar> + <meta name="og:test1" content='foo > < bar'/> + <meta name="og:test2" content="foo >//< bar"/> ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 
'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') self.assertEqual(ie._og_search_video_url(html, default=None), None) self.assertEqual(ie._og_search_property('foobar', html), 'Foo') + self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') + self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') def test_html_search_meta(self): ie = self.ie diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a0c4af92f..4365077f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -645,7 +645,7 @@ def _get_tfa_info(self, note='two-factor verification code'): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' % {'prop': re.escape(prop)}) template = r'<meta[^>]+?%s[^>]+?%s' From 94a773feb94a20be66526348a57ebe20495eba3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:25:08 +0200 Subject: [PATCH 024/415] [vine] Use JS data to get title/alt_title --- youtube_dl/extractor/vine.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c733a48fa..d80b580a0 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -51,6 +51,21 @@ class VineIE(InfoExtractor): }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', 'only_matching': True, + }, { + 'url': 'https://vine.co/v/e192BnZnZ9V', + 'info_dict': { + 'id': 'e192BnZnZ9V', + 'ext': 'mp4', + 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ 
\u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'alt_title': 'Vine by Pimry_zaa', + 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'upload_date': '20150705', + 'uploader': 'Pimry_zaa', + 'uploader_id': '1135760698325307392', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -74,8 +89,8 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'alt_title': self._og_search_description(webpage, default=None), + 'title': data['description'], + 'alt_title': 'Vine by %s' % data['username'], 'description': data['description'], 'thumbnail': data['thumbnailUrl'], 'upload_date': unified_strdate(data['created']), From 10c38c7ca248d06c2c0f069c5a810e27e207c61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:29:49 +0200 Subject: [PATCH 025/415] [vine] Fix download tests --- youtube_dl/extractor/vine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d80b580a0..d1dbec893 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -29,10 +29,10 @@ class VineIE(InfoExtractor): 'id': 'MYxVapFvz2z', 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Luna', + 'alt_title': 'Vine by Mars Ruiz', 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', - 'uploader': 'Luna', + 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', }, }, { From 91816e8f16408a3a2753fb254a9e963ad9429ced Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:32:08 +0600 Subject: [PATCH 026/415] [vine] Remove duplicate metadata, make more robust and modernize (Closes #7215) --- youtube_dl/extractor/vine.py | 39 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d1dbec893..6e72cc253 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,10 +1,14 @@ +# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + unified_strdate, +) class VineIE(InfoExtractor): @@ -17,7 +21,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Chicken.', 'alt_title': 'Vine by Jack Dorsey', - 'description': 'Chicken.', 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', @@ -30,7 +33,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'alt_title': 'Vine by Mars Ruiz', - 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', @@ -43,7 +45,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': '#mw3 #ac130 #killcam #angelofdeath', 'alt_title': 'Vine by Z3k3', - 'description': '#mw3 #ac130 #killcam #angelofdeath', 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', @@ -56,9 +57,8 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'e192BnZnZ9V', 'ext': 'mp4', - 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< 
@n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', - 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -80,25 +80,26 @@ def _real_extract(self, url): formats = [{ 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f['format'], - 'quality': f['rate'], + 'vcodec': f.get('format'), + 'quality': f.get('rate'), 'url': f['videoUrl'], - } for f in data['videoUrls']] + } for f in data['videoUrls'] if f.get('videoUrl')] self._sort_formats(formats) + username = data.get('username') + return { 'id': video_id, - 'title': data['description'], - 'alt_title': 'Vine by %s' % data['username'], - 'description': data['description'], - 'thumbnail': data['thumbnailUrl'], - 'upload_date': unified_strdate(data['created']), - 'uploader': data['username'], - 'uploader_id': data['userIdStr'], - 'like_count': data['likes']['count'], - 'comment_count': data['comments']['count'], - 'repost_count': data['reposts']['count'], + 'title': data.get('description') or self._og_search_title(webpage), + 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'thumbnail': data.get('thumbnailUrl'), + 'upload_date': unified_strdate(data.get('created')), + 'uploader': username, + 'uploader_id': data.get('userIdStr'), + 'like_count': int_or_none(data.get('likes', {}).get('count')), + 'comment_count': int_or_none(data.get('comments', {}).get('count')), + 'repost_count': int_or_none(data.get('reposts', {}).get('count')), 'formats': formats, } From 02835c6bf4403a907c058d43220a83b3b427e181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:34:54 +0600 Subject: [PATCH 027/415] [extractor/common] Document 
repost_count --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4365077f1..6169fbbeb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -172,6 +172,7 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following From 2e022397c45fbcfd2ef6da43d14b0770221aabd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:36:19 +0600 Subject: [PATCH 028/415] [vine] Add counters to tests --- youtube_dl/extractor/vine.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 6e72cc253..be72f3147 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -24,6 +24,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/MYxVapFvz2z', @@ -36,6 +39,9 @@ class VineIE(InfoExtractor): 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/bxVjBbZlPUH', @@ -48,6 +54,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', @@ -62,6 +71,9 @@ class 
VineIE(InfoExtractor): 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, From 1e399778ee870ee583135e65458268cd7c0fb923 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 22 Jul 2015 20:03:05 +0800 Subject: [PATCH 029/415] [letv] Fix extraction Using data URIs for passing the decrypted M3U8 manifest, which is supported by ffmpeg only. --- youtube_dl/extractor/letv.py | 70 ++++++++++++++++++++++++++---------- youtube_dl/utils.py | 5 +++ 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index a28abb0f0..9ebbc8089 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -9,13 +9,14 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, - compat_urlparse, + compat_ord, ) from ..utils import ( determine_ext, ExtractorError, parse_iso8601, int_or_none, + encode_data_uri, ) @@ -25,15 +26,16 @@ class LetvIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', 'ext': 'mp4', 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'timestamp': 1424747397, - 'upload_date': '20150224', 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - } + }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { @@ -42,16 +44,22 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'md5': '2424c74948a62e5f31988438979c5ad1', 
'info_dict': { 'id': '1118082', 'ext': 'mp4', 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, + 'params': { + 'hls_prefer_native': True, + }, 'skip': 'Only available in China', }] @@ -74,6 +82,27 @@ def calc_time_key(self, param1): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray() + while encrypted_data: + b = compat_ord(encrypted_data[0]) + _loc4_.extend([b // 16, b & 0x0f]) + encrypted_data = encrypted_data[1:] + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray() + while _loc4_: + _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) + _loc4_ = _loc4_[2:] + + return bytes(_loc7_) + def _real_extract(self, url): media_id = self._match_id(url) page = self._download_webpage(url, media_id) @@ -115,23 +144,28 @@ def _real_extract(self, url): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - - # Mimic what flvxz.com do - url_parts = list(compat_urlparse.urlparse(media_url)) - qs = dict(compat_urlparse.parse_qs(url_parts[4])) - qs.update({ - 'platid': '14', - 'splatid': '1401', - 'tss': 'no', - 'retry': 1 + media_url += '&' + compat_urllib_parse.urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, }) - url_parts[4] = compat_urllib_parse.urlencode(qs) - media_url = compat_urlparse.urlunparse(url_parts) + + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': media_url, + 'url': encode_data_uri(m3u8_data, 
'application/x-mpegURL'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, + 'protocol': 'm3u8', } if format_id[-1:] == 'p': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dbe25661..db5b3698e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -1795,6 +1796,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ From 985e4fdc07f00a3fdc8e7b7b4119471ee97f3890 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 22:49:05 +0800 Subject: [PATCH 030/415] [downloader/hls] Add headers only for http(s) URLs ffmpeg 2.8.1 raises an error with -headers and non-http input files. --- youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a62d2047b..9a83a73dd 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,7 +30,7 @@ def real_download(self, filename, info_dict): args = [ffpp.executable, '-y'] - if info_dict['http_headers']: + if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
args += [ From 0a67a3632bb9cf76f64658986defc1947090ef50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 23:15:01 +0800 Subject: [PATCH 031/415] [compat] Add compat_urllib_request_DataHandler --- youtube_dl/compat.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 192e1c515..d103ab9ad 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals +import binascii import collections +import email import getpass +import io import optparse import os import re @@ -38,6 +41,11 @@ except ImportError: # Python 2 import urlparse as compat_urlparse +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -155,6 +163,40 @@ def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace') string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. 
+ # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(":", 1) + mediatype, data = data.split(",", 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string( + "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + try: compat_basestring = basestring # Python 2 except NameError: @@ -489,6 +531,8 @@ def compat_itertools_count(start=0, step=1): 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', From 8b172c2e10fb38c62c213673304c7e8dcd17b768 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 23:16:40 +0800 Subject: [PATCH 032/415] [YoutubeDL] Use DataHandler --- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index adf70d658..12977bf80 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -37,6 +37,7 @@ compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( ContentTooShortError, @@ -1967,8 +1968,9 @@ def _setup_opener(self): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, 
debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play From 48aae2d2cf49843d0efa227fa393a0c783fc3c1e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:07:48 +0800 Subject: [PATCH 033/415] [twitter] Update tests --- youtube_dl/extractor/twitter.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1dd43ff3c..b2fff73b9 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,7 +16,7 @@ class TwitterCardIE(InfoExtractor): _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', + 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', @@ -103,17 +104,17 @@ class TwitterIE(TwitterCardIE): _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ - 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', - 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'url': 'https://twitter.com/freethenipple/status/643211948184596480', + 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { - 'id': '614301758345490432', + 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 29.5, - 'description': 'banksy on Twitter: "This time lapse is so pretty 
\U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', - 'uploader': 'banksy', - 'uploader_id': 'thereaIbanksy', + 'duration': 12.922, + 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'uploader': 'FREE THE NIPPLE', + 'uploader_id': 'freethenipple', }, }] From 01d22d47039dedace1c5414c83e9fecfca41b5a5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:11:55 +0800 Subject: [PATCH 034/415] [twitter] Use _download_xml --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b2fff73b9..37a9fd5fd 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -8,6 +8,7 @@ from ..utils import ( float_or_none, unescapeHTML, + xpath_text, ) @@ -60,9 +61,8 @@ def _real_extract(self, url): video_id) if 'playlist' not in config: if 'vmapUrl' in config: - webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') - video_url = self._search_regex( - r'<MediaFile>\s*<!\[CDATA\[(https?://.+?)\]\]>', webpage, 'data player config (xml)') + vmap_data = self._download_xml(config['vmapUrl'], video_id) + video_url = xpath_text(vmap_data, './/MediaFile').strip() f = { 'url': video_url, } From 014e880372e896cdd63f9075864d2a3bba60e706 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:13:58 +0800 Subject: [PATCH 035/415] [twitter] Add IE_NAMEs --- youtube_dl/extractor/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 37a9fd5fd..5f697782e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -13,6 +13,7 @@ class TwitterCardIE(InfoExtractor): + IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' _TESTS = [ { @@ -101,6 +102,7 @@ def 
_real_extract(self, url): class TwitterIE(TwitterCardIE): + IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ From f322bfb0638aeeb527459ebcf00f8a3dde26280c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:15:47 +0800 Subject: [PATCH 036/415] [twitter:card] Remove unneeded 'ext' --- youtube_dl/extractor/twitter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5f697782e..48bef5d80 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -64,13 +64,9 @@ def _real_extract(self, url): if 'vmapUrl' in config: vmap_data = self._download_xml(config['vmapUrl'], video_id) video_url = xpath_text(vmap_data, './/MediaFile').strip() - f = { + formats.append({ 'url': video_url, - } - ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) - if ext: - f['ext'] = ext.group(1) - formats.append(f) + }) break # same video regardless of UA continue From e04edad621efe56347e155b6dc59a0c3d589b3bd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:16:57 +0800 Subject: [PATCH 037/415] [twitter] Inherit from InfoExtractor directly --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 48bef5d80..c9b783745 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -97,11 +97,11 @@ def _real_extract(self, url): } -class TwitterIE(TwitterCardIE): +class TwitterIE(InfoExtractor): IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' - _TESTS = [{ + _TEST = { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { @@ -114,7 +114,7 @@ class 
TwitterIE(TwitterCardIE): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, - }] + } def _real_extract(self, url): id = self._match_id(url) From f6dfd6603a9e9bb88ebcdcd52490974a34d1bd11 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:18:01 +0800 Subject: [PATCH 038/415] [twitter] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index c9b783745..6ff15369c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -122,7 +122,7 @@ def _real_extract(self, url): name = username url = re.sub(r'https?://(m|mobile)\.', 'https://', url) webpage = self._download_webpage(url, 'tweet: ' + url) - description = unescapeHTML(self._search_regex('<title>\s*(.+?)\s*', webpage, 'title')) + description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') title = description.replace('\n', ' ') splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) if splitdesc: From 575036b40504bc921b18f05bde64e0e7dceacec6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:04:13 +0800 Subject: [PATCH 039/415] [twitter] Simplify and improve --- youtube_dl/extractor/twitter.py | 41 +++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6ff15369c..6b3b39aee 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -9,6 +9,7 @@ float_or_none, unescapeHTML, xpath_text, + remove_end, ) @@ -99,7 +100,8 @@ def _real_extract(self, url): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' _TEST 
= { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -107,7 +109,7 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', + 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', @@ -117,26 +119,31 @@ class TwitterIE(InfoExtractor): } def _real_extract(self, url): - id = self._match_id(url) - username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() - name = username - url = re.sub(r'https?://(m|mobile)\.', 'https://', url) - webpage = self._download_webpage(url, 'tweet: ' + url) - description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') - title = description.replace('\n', ' ') - splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) - if splitdesc: - name, title = splitdesc.groups() - title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + twid = mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + + username = remove_end(self._og_search_title(webpage), ' on Twitter') + + title = self._og_search_description(webpage).strip('').replace('\n', ' ') + + # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + title, short_url = mobj.groups() + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { '_type': 'url_transparent', 'ie_key': 'TwitterCard', - 'uploader_id': username, - 'uploader': name, + 'uploader_id': user_id, + 
'uploader': username, 'url': card_url, 'webpage_url': url, - 'description': description, + 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), 'title': username + ' - ' + title, } From 77a54b6a658059a11de415d793588fdbfec14194 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:08:24 +0800 Subject: [PATCH 040/415] [twitter:card] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6b3b39aee..1cdca544c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -7,7 +7,6 @@ from ..compat import compat_urllib_request from ..utils import ( float_or_none, - unescapeHTML, xpath_text, remove_end, ) @@ -57,9 +56,8 @@ def _real_extract(self, url): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) - config = self._parse_json( - unescapeHTML(self._search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config')), + config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) if 'playlist' not in config: if 'vmapUrl' in config: From c88aec845a680ef9404b637b3dbcf706dcf00b68 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:23:56 +0800 Subject: [PATCH 041/415] [twitter] Fix short URL extraction --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1cdca544c..1472f22a7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -128,7 +128,7 @@ def _real_extract(self, url): title = self._og_search_description(webpage).strip('').replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + mobj = 
re.match(r'“(.*)\s+(https?://[^ ]+)”', title) title, short_url = mobj.groups() card_id = self._search_regex( From 4a7b79038425f614af49116edab7897f0db13e5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:07:37 +0800 Subject: [PATCH 042/415] [twitter:card] Support YouTube embeds --- youtube_dl/extractor/twitter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1472f22a7..9d3e46b94 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -37,6 +37,19 @@ class TwitterCardIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! 
Ubuntu!', + 'uploader_id': 'omgubuntu', + }, } ] @@ -56,6 +69,12 @@ def _real_extract(self, url): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) + youtube_url = self._html_search_regex( + r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', + webpage, 'youtube iframe', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) From 05a3879f1c142cc2bf0287cde4690d8ccadcdc8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:19:46 +0800 Subject: [PATCH 043/415] [letv] Update M3U8's MIME type The new MIME type appears in the following places: https://www.iana.org/assignments/media-types/media-types.xhtml#application https://hg.python.org/cpython/file/tip/Lib/mimetypes.py --- youtube_dl/extractor/letv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 9ebbc8089..effd9eb92 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -162,7 +162,7 @@ def _real_extract(self, url): m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), + 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, 'protocol': 'm3u8', From dd67702a3ea007369109ee8e4b67043064e1f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 18 Oct 2015 14:13:06 +0200 Subject: [PATCH 044/415] [imdb] Fix extraction (fixes #7220) --- youtube_dl/extractor/imdb.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bb574cf3..02e1e428e 100644 --- a/youtube_dl/extractor/imdb.py +++ 
b/youtube_dl/extractor/imdb.py @@ -4,8 +4,8 @@ import json from .common import InfoExtractor -from ..compat import ( - compat_urlparse, +from ..utils import ( + qualities, ) @@ -30,24 +30,33 @@ def _real_extract(self, url): descr = self._html_search_regex( r'(?s)(.*?)', webpage, 'description', fatal=False) - available_formats = re.findall( - r'case \'(?P.*?)\' :$\s+url = \'(?P.*?)\'', webpage, - flags=re.MULTILINE) + player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id + player_page = self._download_webpage( + player_url, video_id, 'Downloading player page') + # the player page contains the info for the default format, we have to + # fetch other pages for the rest of the formats + extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) + format_pages = [ + self._download_webpage( + f_url, video_id, 'Downloading info for %s format' % f_name) + for f_url, f_name in extra_formats] + format_pages.append(player_page) + + quality = qualities(['SD', '480p', '720p']) formats = [] - for f_id, f_path in available_formats: - f_path = f_path.strip() - format_page = self._download_webpage( - compat_urlparse.urljoin(url, f_path), - 'Downloading info for %s format' % f_id) + for format_page in format_pages: json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] + f_id = format_info['ffname'] formats.append({ 'format_id': f_id, 'url': format_info['videoInfoList'][0]['videoUrl'], + 'quality': quality(f_id), }) + self._sort_formats(formats) return { 'id': video_id, From b0f001a6cbd220c8b10c0ce359f17072d6347a8f Mon Sep 17 00:00:00 2001 From: remitamine Date: Mon, 21 Sep 2015 15:52:36 +0100 Subject: [PATCH 045/415] [canalc2] fix info extraction --- youtube_dl/extractor/canalc2.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git 
a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index c4fefefe4..66a9ff093 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,34 +8,40 @@ class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P\d+)' + _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { - 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', 'ext': 'mp4', 'title': 'Terrasses du Numérique' + }, + 'params': { + 'skip_download': True, # Requires rtmpdump } } def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group('id') - # We need to set the voir field for getting the file name - url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - file_name = self._search_regex( - r"so\.addVariable\('file','(.*?)'\);", - webpage, 'file name') - video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + video_url = self._search_regex( + r'jwplayer\("Player"\).setup\({[^}]*file: "([^"]+)"', + webpage, 'video_url') + formats = [{'url': video_url}] + if video_url.startswith('rtmp://'): + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + formats[0].update({ + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + }) title = self._html_search_regex( - r'class="evenement8">(.*?)
', webpage, 'title') + r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') return { 'id': video_id, - 'ext': 'mp4', - 'url': video_url, + 'formats': formats, 'title': title, } From 6682049dee5e73b98e99e1359b959240d0920d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:19:43 +0600 Subject: [PATCH 046/415] [canalc2] Improve rtmp extraction --- youtube_dl/extractor/canalc2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 66a9ff093..648af2e18 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -31,10 +31,12 @@ def _real_extract(self, url): webpage, 'video_url') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): - rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ + 'url': rtmp.group('url'), 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), + 'page_url': url, }) title = self._html_search_regex( From ef6c868f23f2fe0d493831e0d4cba71c735bd160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:31 +0600 Subject: [PATCH 047/415] [canalc2] Improve some regexes --- youtube_dl/extractor/canalc2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 648af2e18..d9137e2ef 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,7 +8,7 @@ class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { 'url': 'http://www.canalc2.tv/video/12163', @@ -27,8 +27,8 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'jwplayer\("Player"\).setup\({[^}]*file: 
"([^"]+)"', - webpage, 'video_url') + r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P.+?)\2', + webpage, 'video_url', group='file') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) From 14bddf35fbe8253e283042630e24b134996b2575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:52 +0600 Subject: [PATCH 048/415] [canalc2] Add ext --- youtube_dl/extractor/canalc2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d9137e2ef..ba82bb2b7 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -34,6 +34,7 @@ def _real_extract(self, url): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ 'url': rtmp.group('url'), + 'ext': 'flv', 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), 'page_url': url, From b1bf063503893192637f95e929d1a9147de59a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:05 +0600 Subject: [PATCH 049/415] [canalc2] Extract duration --- youtube_dl/extractor/canalc2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index ba82bb2b7..e326b8fbd 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor +from ..utils import parse_duration class Canalc2IE(InfoExtractor): @@ -42,9 +43,13 @@ def _real_extract(self, url): title = self._html_search_regex( r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') + duration = parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': video_id, - 'formats': formats, 'title': title, + 'duration': duration, + 'formats': formats, } From 608945d44a7e47fa5115295839c993af545936eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:22 +0600 Subject: [PATCH 050/415] [canalc2] Fix test --- youtube_dl/extractor/canalc2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index e326b8fbd..f6a1ff381 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,8 +16,9 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'mp4', - 'title': 'Terrasses du Numérique' + 'ext': 'flv', + 'title': 'Terrasses du Numérique', + 'duration': 122, }, 'params': { 'skip_download': True, # Requires rtmpdump From dedd35c6bc33eb88f19b16eeb37498cee076c47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:59:18 +0600 Subject: [PATCH 051/415] [viewster] Fix failing m3u8 --- youtube_dl/extractor/viewster.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 632e57fb4..7cf930d69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -131,10 +131,11 @@ def _real_extract(self, url): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False # m3u8 sometimes fail - )) + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) else: format_id = media.get('Bitrate') f = { From 
e36963e0eb57294f156a98c38df891dec41ebaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 20:24:33 +0600 Subject: [PATCH 052/415] [eagleplatform] Identify hls formats --- youtube_dl/extractor/eagleplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index e529b9b96..7bbf617d4 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -87,7 +87,7 @@ def _real_extract(self, url): m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( m3u8_url, video_id, - 'mp4', entry_protocol='m3u8_native') + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') mp4_url = self._get_video_url( # Secure mp4 URL is constructed according to Player.prototype.mp4 from From a6e0afa2bbc93d145b31911b8ce40c502994e2a1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 18 Oct 2015 19:23:40 +0200 Subject: [PATCH 053/415] release 2015.10.18 --- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 47f7da86d..cfa665d88 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -588,7 +588,8 @@ # Supported sites - **twitch:stream** - **twitch:video** - **twitch:vod** - - **TwitterCard** + - **twitter** + - **twitter:card** - **Ubu** - **udemy** - **udemy:course** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 31d2a9dc0..660b0050b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.16' +__version__ = '2015.10.18' From 264b23e1a42378d52f8774a07c1d906cd1cff96c Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 19:56:22 +0200 Subject: [PATCH 054/415] adds thumbnail support for ZDF Mediathek extractor --- 
youtube_dl/extractor/zdf.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 98f15177b..f376025e1 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -70,6 +70,23 @@ def xml_to_format(fnode): '_available': is_available, } + def xml_to_thumbnails(fnode): + thumbnails = list() + for node in fnode: + width_x_height = node.attrib['key'] + thumbnail = { + 'url': node.text, + 'width': int(width_x_height.split('x')[0]), + 'height': int(width_x_height.split('x')[1]) + } + thumbnails.append(thumbnail) + return thumbnails + + + thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') + thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnail = thumbnails[-1]['url'] + format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( lambda f: f['_available'], @@ -81,6 +98,8 @@ def xml_to_format(fnode): 'title': title, 'description': description, 'duration': duration, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, 'upload_date': upload_date, From d762f86e940ad656e8f7e7b93636292e4cf36de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 00:11:16 +0600 Subject: [PATCH 055/415] [ok] Extend _VALID_URL --- youtube_dl/extractor/odnoklassniki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index ccc88cfb1..184c7a323 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ 
-66,6 +66,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/video/20648036891', 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/videoembed/20648036891', + 'only_matching': True, }] def _real_extract(self, url): From 8cc83d301dd0e8029aff804e362860d36e3d7e7a Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 20:47:42 +0200 Subject: [PATCH 056/415] use int_or_none, check if attrib exists, remove thumbnail --- youtube_dl/extractor/zdf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index f376025e1..d41c4e712 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,19 +73,17 @@ def xml_to_format(fnode): def xml_to_thumbnails(fnode): thumbnails = list() for node in fnode: - width_x_height = node.attrib['key'] - thumbnail = { - 'url': node.text, - 'width': int(width_x_height.split('x')[0]), - 'height': int(width_x_height.split('x')[1]) - } + thumbnail = {'url': node.text} + if 'key' in node.attrib: + width_x_height = node.attrib['key'] + thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) + thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) thumbnails.append(thumbnail) return thumbnails thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') thumbnails = xml_to_thumbnails(thumbnail_nodes) - thumbnail = thumbnails[-1]['url'] format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( @@ -98,7 +96,6 @@ def xml_to_thumbnails(fnode): 'title': title, 'description': description, 'duration': duration, - 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, From b243340f0ce311443a15a2dfd4356a9504e18c04 Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:07:52 +0200 Subject: [PATCH 057/415] check if key attrib matches resolution pattern --- youtube_dl/extractor/zdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index d41c4e712..ed385450c 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,9 @@ def xml_to_thumbnails(fnode): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - width_x_height = node.attrib['key'] - thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) - thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) + if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): + thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) + thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) thumbnails.append(thumbnail) return thumbnails From 2038ad6ee71c842420b83cb6c5ce3c6898e8e380 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 19 Oct 2015 01:12:41 +0600 Subject: [PATCH 058/415] [README.md] Add uploader extraction sample in example extractor --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cf4aebf3d..a6ec9619c 100644 --- a/README.md +++ b/README.md @@ -710,12 +710,13 @@ ### Adding support for a new site webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` From b7cedb16043c60d4032b206a83539acbd39f994f Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:25:26 +0200 Subject: [PATCH 059/415] simplify thumbnail dict building --- youtube_dl/extractor/zdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index ed385450c..c2b196504 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,10 @@ def xml_to_thumbnails(fnode): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): - thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) - thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) thumbnails.append(thumbnail) return thumbnails From 7b091c370c0f187545df8b1b1cc990fcf95df108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 01:48:05 +0600 Subject: [PATCH 060/415] [zdf] Modernize and PEP 8 --- youtube_dl/extractor/zdf.py | 43 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c2b196504..a795f56b3 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -9,6 +9,7 @@ int_or_none, unified_strdate, OnDemandPagedList, + xpath_text, ) @@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url): errnote='Failed to download video info') title = doc.find('.//information/title').text - description = 
doc.find('.//information/detail').text - duration = int(doc.find('.//details/lengthSec').text) - uploader_node = doc.find('.//details/originChannelTitle') - uploader = None if uploader_node is None else uploader_node.text - uploader_id_node = doc.find('.//details/originChannelId') - uploader_id = None if uploader_id_node is None else uploader_id_node.text - upload_date = unified_strdate(doc.find('.//details/airtime').text) + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) def xml_to_format(fnode): video_url = fnode.find('url').text @@ -40,15 +39,14 @@ def xml_to_format(fnode): ext = format_m.group('container') proto = format_m.group('proto').lower() - quality = fnode.find('./quality').text - abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr_node = fnode.find('./videoBitrate') - vbr = None if vbr_node is None else int(vbr_node.text) // 1000 + quality = xpath_text(fnode, './quality', 'quality') + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - width_node = fnode.find('./width') - width = None if width_node is None else int_or_none(width_node.text) - height_node = fnode.find('./height') - height = None if height_node is None else int_or_none(height_node.text) + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) format_note = '' if not format_note: @@ -64,16 +62,21 @@ def xml_to_format(fnode): 'vbr': vbr, 'width': width, 'height': height, - 'filesize': int_or_none(fnode.find('./filesize').text), + 
'filesize': filesize, 'format_note': format_note, 'protocol': proto, '_available': is_available, } def xml_to_thumbnails(fnode): - thumbnails = list() + thumbnails = [] for node in fnode: - thumbnail = {'url': node.text} + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } if 'key' in node.attrib: m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) if m: @@ -82,9 +85,7 @@ def xml_to_thumbnails(fnode): thumbnails.append(thumbnail) return thumbnails - - thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') - thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( From 0be30bafa42dbfa99644a9eb7fefa5cebb70f121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Mon, 19 Oct 2015 20:53:27 +0200 Subject: [PATCH 061/415] [vidme] Stream URL fallback, better error message for suspended videos --- youtube_dl/extractor/vidme.py | 37 +++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 382517a4a..393970a12 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -97,6 +97,31 @@ class VidmeIE(InfoExtractor): # nsfw, user-disabled 'url': 'https://vid.me/dzGJ', 'only_matching': True, + }, { + # suspended + 'url': 'https://vid.me/Ox3G', + 'only_matching': True, + }, { + # no formats in the API response + 'url': 'https://vid.me/e5g', + 'info_dict': { + 'id': 'e5g', + 'ext': 'mp4', + 'title': 'e5g', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1401480195, + 'upload_date': '20140530', + 'uploader': None, + 'uploader_id': None, + 'age_limit': 0, + 'duration': 483, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, 
url): @@ -118,7 +143,7 @@ def _real_extract(self, url): video = response['video'] - if video.get('state') == 'user-disabled': + if video.get('state') in ('user-disabled', 'suspended'): raise ExtractorError( 'Vidme said: This video has been suspended either due to a copyright claim, ' 'or for violating the terms of use.', @@ -131,6 +156,14 @@ def _real_extract(self, url): 'height': int_or_none(f.get('height')), 'preference': 0 if f.get('type', '').endswith('clip') else 1, } for f in video.get('formats', []) if f.get('uri')] + + if not formats and video.get('complete_url'): + formats.append({ + 'url': video.get('complete_url'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + }) + self._sort_formats(formats) title = video['title'] @@ -147,7 +180,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': title, + 'title': title or video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, From 4bf56141950f3c24000381403417d20095f04460 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 20 Oct 2015 07:43:39 +0100 Subject: [PATCH 062/415] [cspan] move get_text_attr to CSpanIE --- youtube_dl/extractor/cspan.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index c74b35fd9..388460a32 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -14,10 +14,6 @@ from .senateisvp import SenateISVPIE -def get_text_attr(d, attr): - return d.get(attr, {}).get('#text') - - class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' @@ -60,6 +56,9 @@ class CSpanIE(InfoExtractor): } }] + def get_text_attr(self, d, attr): + return d.get(attr, {}).get('#text') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -79,7 +78,7 @@ def _real_extract(self, url): 
'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) + raise ExtractorError('%s said: %s' % (self.IE_NAME, self.get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -91,17 +90,17 @@ def _real_extract(self, url): thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] - capfile = get_text_attr(data, 'capfile') + capfile = self.get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: formats.append({ - 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), - 'url': unescapeHTML(get_text_attr(quality, 'file')), - 'height': int_or_none(get_text_attr(quality, 'height')), - 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + 'format_id': '%s-%sp' % (self.get_text_attr(quality, 'bitrate'), self.get_text_attr(quality, 'height')), + 'url': unescapeHTML(self.get_text_attr(quality, 'file')), + 'height': int_or_none(self.get_text_attr(quality, 'height')), + 'tbr': int_or_none(self.get_text_attr(quality, 'bitrate')), }) self._sort_formats(formats) entries.append({ @@ -112,7 +111,7 @@ def _real_extract(self, url): 'formats': formats, 'description': description, 'thumbnail': thumbnail, - 'duration': int_or_none(get_text_attr(f, 'length')), + 'duration': int_or_none(self.get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile, From b6aa99aff8278142fed94e37e500f1cfb62defd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Tue, 20 Oct 2015 10:30:31 +0200 Subject: [PATCH 063/415] [vimeo] Fix error parsing --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0f84656c0..bdec79341 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -273,13 +273,13 @@ def _real_extract(self, url): self.report_extraction(video_id) vimeo_config = self._search_regex( - r'vimeo\.config\s*=\s*({.+?});', webpage, + r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage, 'vimeo config', default=None) if vimeo_config: seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) if seed_status.get('state') == 'failed': raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, seed_status['title']), + '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) # Extract the config JSON From 4a8963770e37568c484841338cbb6761cf3cb5c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 20:17:54 +0600 Subject: [PATCH 064/415] [vidme] Use original vid.me title template for untitled videos --- youtube_dl/extractor/vidme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 393970a12..296e00423 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -107,7 +107,7 @@ class VidmeIE(InfoExtractor): 'info_dict': { 'id': 'e5g', 'ext': 'mp4', - 'title': 'e5g', + 'title': 'Video upload (e5g)', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1401480195, 'upload_date': '20140530', @@ -180,7 +180,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': title or video_id, + 'title': title or 'Video upload (%s)' % video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, From d65889bbc0a6b4a1eafe6a8c0e0e26170dc75586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 20:18:23 +0600 Subject: [PATCH 065/415] [vidme] Update test --- youtube_dl/extractor/vidme.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 296e00423..eb5cde761 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -14,7 +14,7 @@ class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://vid.me/QNB', - 'md5': 'c62f1156138dc3323902188c5b5a8bd6', + 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', From 8bea039b8329074af9a95fe51e7622c8074f6218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Tue, 20 Oct 2015 16:38:44 +0200 Subject: [PATCH 066/415] [vimeo] New test, fixed one older test --- youtube_dl/extractor/vimeo.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bdec79341..2437ae1eb 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -133,7 +133,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', }, 'params': { 'videopassword': 'youtube-dl', @@ -181,6 +181,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user28849593', }, }, + { + 'url': 'https://vimeo.com/109815029', + 'note': 'Video not completely processed, "failed" seed status', + 'only_matching': True, + }, ] @staticmethod From d01949dc89feb2441f251e42e8a6bfa4711b9715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 23:09:51 +0600 Subject: [PATCH 067/415] [utils:js_to_json] Fix bad escape in double quoted 
strings --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a5f164c49..918a7a9ef 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -495,6 +495,9 @@ def test_js_to_json_realworld(self): "playlist":[{"controls":{"all":null}}] }''') + inp = '''"The CW\\'s \\'Crazy Ex-Girlfriend\\'"''' + self.assertEqual(js_to_json(inp), '''"The CW's 'Crazy Ex-Girlfriend'"''') + inp = '"SAND Number: SAND 2013-7800P\\nPresenter: Tom Russo\\nHabanero Software Training - Xyce Software\\nXyce, Sandia\\u0027s"' json_code = js_to_json(inp) self.assertEqual(json.loads(json_code), json.loads(inp)) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index db5b3698e..a61e47646 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1701,8 +1701,8 @@ def fix_kv(m): if v in ('true', 'false', 'null'): return v if v.startswith('"'): - return v - if v.startswith("'"): + v = re.sub(r"\\'", "'", v[1:-1]) + elif v.startswith("'"): v = v[1:-1] v = re.sub(r"\\\\|\\'|\"", lambda m: { '\\\\': '\\\\', From 4211c83aa4dec0cf9874a6a485665360570e2a89 Mon Sep 17 00:00:00 2001 From: mjdubell Date: Mon, 19 Oct 2015 03:36:07 +0200 Subject: [PATCH 068/415] [stitcher] Add extractor Stitcher review updates Removed re import Stitcher review updates --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/stitcher.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/stitcher.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd6eb6ae0..eac5e7d5e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -586,6 +586,7 @@ from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import ( SportBoxIE, diff --git 
a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py new file mode 100644 index 000000000..a547debbd --- /dev/null +++ b/youtube_dl/extractor/stitcher.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import int_or_none + + +class StitcherIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/[\/a-z\-]+(?P\d+)' + _TEST = { + 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', + 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'info_dict': { + 'id': '40789481', + 'ext': 'mp3', + 'title': 'Machine Learning Mastery and Cancer Clusters from Talking Machines', + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._og_search_title(webpage) + url = self._search_regex(r'episodeURL: "(.+?)"', webpage, 'url') + episode_image = self._search_regex(r'episodeImage: "(.+?)"', webpage, 'episode_image', fatal=False) + duration = int_or_none(self._search_regex(r'duration: (\d+?),', webpage, 'duration', fatal=False)) + + return { + 'id': audio_id, + 'url': url, + 'title': title, + 'duration': duration, + 'thumbnail': episode_image, + 'ext': 'mp3', + 'vcodec': 'none', + } From 7308b8cb3df5a2df0a86e8050c83b951004a0aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 23:12:13 +0600 Subject: [PATCH 069/415] [stitcher] Improve (Closes #7162, closes #7228) --- youtube_dl/extractor/stitcher.py | 78 +++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index a547debbd..971a1c466 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -1,37 +1,81 @@ -# coding: utf-8 from __future__ import unicode_literals + +import re + from .common import InfoExtractor -from ..utils import 
int_or_none +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + unescapeHTML, +) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/[\/a-z\-]+(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', 'info_dict': { 'id': '40789481', 'ext': 'mp3', - 'title': 'Machine Learning Mastery and Cancer Clusters from Talking Machines', - } - } + 'title': 'Machine Learning Mastery and Cancer Clusters', + 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'duration': 1604, + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', + 'info_dict': { + 'id': '40846275', + 'display_id': 'the-rare-hourlong-comedy-plus', + 'ext': 'mp3', + 'title': "The CW's 'Crazy Ex-Girlfriend'", + 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', + 'duration': 2235, + 'thumbnail': 're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # escaped title + 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', + 'only_matching': True, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', + 'only_matching': True, + }] def _real_extract(self, url): - audio_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + audio_id = mobj.group('id') + display_id = mobj.group('display_id') or audio_id - webpage = self._download_webpage(url, audio_id) + webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage) - url = self._search_regex(r'episodeURL: "(.+?)"', webpage, 'url') - episode_image = self._search_regex(r'episodeImage: 
"(.+?)"', webpage, 'episode_image', fatal=False) - duration = int_or_none(self._search_regex(r'duration: (\d+?),', webpage, 'duration', fatal=False)) + episode = self._parse_json( + js_to_json(self._search_regex( + r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + display_id)['config']['episode'] + + title = unescapeHTML(episode['title']) + formats = [{ + 'url': episode[episode_key], + 'ext': determine_ext(episode[episode_key]) or 'mp3', + 'vcodec': 'none', + } for episode_key in ('origEpisodeURL', 'episodeURL') if episode.get(episode_key)] + description = self._search_regex( + r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) + duration = int_or_none(episode.get('duration')) + thumbnail = episode.get('episodeImage') return { 'id': audio_id, - 'url': url, + 'display_id': display_id, 'title': title, + 'description': description, 'duration': duration, - 'thumbnail': episode_image, - 'ext': 'mp3', - 'vcodec': 'none', + 'thumbnail': thumbnail, + 'formats': formats, } From cc449417c4b3835c31f89e47a5e08e0f0c42ac5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Oct 2015 20:35:22 +0600 Subject: [PATCH 070/415] [vine] Use _search_regex for JSON data (Closes #7254, closes #7255) --- youtube_dl/extractor/vine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index be72f3147..cb2a4b0b5 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -85,8 +85,8 @@ def _real_extract(self, url): webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) data = self._parse_json( - self._html_search_regex( - r'window\.POST_DATA = { %s: ({.+?}) };\s*' % video_id, + self._search_regex( + r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*' % video_id, webpage, 'vine data'), video_id) From 44d6dd08b299ccf17eb04901cf09a8d333769783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 
21 Oct 2015 21:35:57 +0600 Subject: [PATCH 071/415] [facebook] Fix extraction (Closes #7252) --- youtube_dl/extractor/facebook.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 178a7ca4c..f53c51615 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -14,7 +14,6 @@ ) from ..utils import ( ExtractorError, - int_or_none, limit_length, urlencode_postdata, get_element_by_id, @@ -142,16 +141,20 @@ def _real_extract(self, url): data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse_unquote(data['params']) params = json.loads(params_raw) - video_data = params['video_data'][0] formats = [] - for quality in ['sd', 'hd']: - src = video_data.get('%s_src' % quality) - if src is not None: - formats.append({ - 'format_id': quality, - 'url': src, - }) + for format_id, f in params['video_data'].items(): + if not f or not isinstance(f, list): + continue + for quality in ('sd', 'hd'): + for src_type in ('src', 'src_no_ratelimit'): + src = f[0].get('%s_%s' % (quality, src_type)) + if src: + formats.append({ + 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'url': src, + 'preference': -10 if format_id == 'progressive' else 0, + }) if not formats: raise ExtractorError('Cannot find video formats') @@ -171,7 +174,5 @@ def _real_extract(self, url): 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': int_or_none(video_data.get('video_duration')), - 'thumbnail': video_data.get('thumbnail_src'), 'uploader': uploader, } From 8c3533ba976af15ca9fac8acd68547b195dc4e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 21 Oct 2015 23:57:23 +0200 Subject: [PATCH 072/415] [adultswim] Don't default to the native m3u8 downloader (closes #7243) Some of the streams are encrypted, which is not supported . 
--- youtube_dl/extractor/adultswim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 130afe791..3ae618e71 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -183,7 +183,7 @@ def _real_extract(self, url): media_url = file_el.text if determine_ext(media_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', 'm3u8_native', preference=0, m3u8_id='hls')) + media_url, segment_title, 'mp4', preference=0, m3u8_id='hls')) else: formats.append({ 'format_id': '%s_%s' % (bitrate, ftype), From 89d5fbf354cf0b49098582a19f76cef67358d375 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 17:47:11 +0800 Subject: [PATCH 073/415] [iqiyi] Update key --- youtube_dl/extractor/iqiyi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 0e53cb154..2df1da3f0 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -205,9 +205,9 @@ def get_raw_data(self, tvid, video_id, enc_key, _uuid): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-10-10 for Zombie::bite - # '7239670519b6ac209a0bee4ef0446a6b24894b8ac2751506e42116212a0d0272e505'[2:66][1::2] - enc_key = '97596c0abee04ab49ba25564161ad225' + # last update at 2015-10-22 for Zombie::bite + # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2] + enc_key = '2c76de15dcb44bd28ff0927d50d31620' return enc_key def _real_extract(self, url): From 7033bc1a5117068c493931cb736d53e68d50f9a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 21:12:29 +0800 Subject: [PATCH 074/415] [bbc] Fix test_BBC_9 --- youtube_dl/extractor/bbc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 
1b3a33e4e..ea67e3f2d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -625,6 +625,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'duration': 140, }, 'params': { From a65402ef42c42477f78469f0a6c4af1583d97a31 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 21:13:03 +0800 Subject: [PATCH 075/415] [bbc.co.uk:article] Add new extractor (#7257) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbc.py | 34 ++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eac5e7d5e..6318ac4a2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -45,6 +45,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, + BBCCoUkArticleIE, BBCIE, ) from .beeg import BeegIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index ea67e3f2d..2cdce1eb9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -20,7 +20,7 @@ class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P[\da-z]{8})' _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -652,7 +652,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + return False if BBCCoUkIE.suitable(url) 
or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -903,3 +903,33 @@ def extract_all(pattern): }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) From 769078755318896fa1a6c5c8aba6f76d6aeddf78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 22 Oct 2015 20:34:11 +0600 Subject: [PATCH 076/415] [crunchyroll] Improve subtitle regex (Closes #7262) --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index cecd0c784..f8ce10111 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -245,7 +245,7 @@ def 
_extract_subtitles(self, subtitle): def _get_subtitles(self, video_id, webpage): subtitles = {} - for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): sub_page = self._download_webpage( 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, video_id, note='Downloading subtitles for ' + sub_name) From ab03c0b47c142bfb649eacb3c72ce9cb67184535 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 23 Oct 2015 09:33:05 +0200 Subject: [PATCH 077/415] release 2015.10.23 --- CONTRIBUTING.md | 3 ++- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 32c2fd84c..aebded4ce 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -114,12 +114,13 @@ ### Adding support for a new site webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` diff --git a/docs/supportedsites.md b/docs/supportedsites.md index cfa665d88..03561b87d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -53,6 +53,7 @@ # Supported sites - **Bandcamp:album** - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer + - **bbc.co.uk:article**: BBC articles - **BeatportPro** - **Beeg** - **BehindKink** @@ -515,6 +516,7 @@ # Supported sites - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** + - **Stitcher** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 660b0050b..d5c7f338d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.18' +__version__ = '2015.10.23' From 65d49afa48086b568364bbcbab29feef71031178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 23 Oct 2015 14:12:46 +0200 Subject: [PATCH 078/415] [test/test_download] Use extract_flat = 'in_playlist' for playlist items Some playlist extractors return a 'url' result, which wouldn't be resolved. 
--- test/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 284418834..a3f1c0644 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -102,7 +102,7 @@ def print_skipping(reason): params = get_params(test_case.get('params', {})) if is_playlist and 'playlist' not in test_case: - params.setdefault('extract_flat', True) + params.setdefault('extract_flat', 'in_playlist') params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) From 9170ca5b16f3420892ff06bbe5cccf1679eb75e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 23 Oct 2015 14:16:08 +0200 Subject: [PATCH 079/415] [youtube:channel] Fix test --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 08e821362..bae1b1117 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1644,7 +1644,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'playlist_mincount': 91, 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'Uploads from lex will', } }] From 5c43afd40f8ba101e0cf90b8fcb5713b378a62c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 23 Oct 2015 14:23:45 +0200 Subject: [PATCH 080/415] [youtube:channel] Support age restricted channels (fixes #7277) --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bae1b1117..d7eda7aa7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1647,6 +1647,15 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 
'UUKfVa3S1e4PHvxWcwyMMg8w', 'title': 'Uploads from lex will', } + }, { + 'note': 'Age restricted channel', + # from https://www.youtube.com/user/DeusExOfficial + 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', + 'playlist_mincount': 64, + 'info_dict': { + 'id': 'UUs0ifCMCm1icqRbqhUINa0w', + 'title': 'Uploads from Deus Ex', + }, }] def _real_extract(self, url): @@ -1667,7 +1676,7 @@ def _real_extract(self, url): 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: channel_playlist_id = self._search_regex( - r'data-channel-external-id="([^"]+)"', + r'data-(?:channel-external-|yt)id="([^"]+)"', channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] From edeb3e7cb1ab2d82ff7c712a7cc1e338a9dcd8f8 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Fri, 23 Oct 2015 15:58:24 +0000 Subject: [PATCH 081/415] [README.md] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6ec9619c..38db97c59 100644 --- a/README.md +++ b/README.md @@ -795,7 +795,7 @@ # BUGS **Please include the full output of youtube-dl when run with `-v`**. -The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. 
Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): From dae69640d086ca1e2683ca81b60f48a0c6c83eac Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 24 Oct 2015 00:10:28 +0200 Subject: [PATCH 082/415] Fix py2exe build (#7276) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4686260e0..bfe931f5b 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ "compressed": 1, "optimize": 2, "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe'], + "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], } py2exe_console = [{ From ab9c7214ee6c831a68216caf1cd1f9f3c183e4fd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 24 Oct 2015 00:10:41 +0200 Subject: [PATCH 083/415] release 2015.10.24 --- CONTRIBUTING.md | 2 +- youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aebded4ce..09ce98ca2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ **Please include the full output of youtube-dl when run with `-v`**. -The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. 
Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d5c7f338d..125e8ccf5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.23' +__version__ = '2015.10.24' From c93153852f342ef26005b37649d58fa944c53fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:10:53 +0200 Subject: [PATCH 084/415] [mitele] Don't encode the URL query (closes #7280) This seems to produce sporadic errors when trying to access the URL, because on python 3.x when you do '%s' % b'somedata' you get "b'somedata'". --- youtube_dl/extractor/mitele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 54993e2c9..ccb5c1467 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -56,7 +56,7 @@ def _real_extract(self, url): 'sta': '0', } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), + '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))), display_id, 'Downloading %s JSON' % location['loc']) file_ = media.get('file') if not file_: From 6856139705aea86ab1f950c08e605dd47f839be0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:13:26 +0200 Subject: [PATCH 085/415] [mitele] Fix test checksum --- youtube_dl/extractor/mitele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index ccb5c1467..3142fcde2 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -15,7 +15,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 
'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a', + 'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', From 0198807ef95338bec69cfdcd67249c007e4d4141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20D=C3=B6pmann?= Date: Sat, 24 Oct 2015 11:35:41 +0200 Subject: [PATCH 086/415] [spiegeltv] Fix Accept-Encoding issue (server chokes on gzip) --- youtube_dl/extractor/spiegeltv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 27f4033c5..a85305281 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -83,6 +83,10 @@ def _real_extract(self, url): preference=1, # Prefer hls since it allows to workaround georestriction m3u8_id='hls', fatal=False) if m3u8_formats is not False: + for m3u8_format in m3u8_formats: + m3u8_format['http_headers'] = { + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + } formats.extend(m3u8_formats) else: formats.append({ From 50f01302d347738647262f9442bc4f5d06f013c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:24:08 +0600 Subject: [PATCH 087/415] [spiegeltv] Do not extract m3u8 formats since it's already a format --- youtube_dl/extractor/spiegeltv.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index a85305281..d976bf33c 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -77,17 +77,16 @@ def _real_extract(self, url): 'rtmp_live': True, }) elif determine_ext(endpoint) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - endpoint.replace('[video]', play_path), - video_id, 'm4v', - preference=1, # Prefer hls since it allows to workaround georestriction - m3u8_id='hls', 
fatal=False) - if m3u8_formats is not False: - for m3u8_format in m3u8_formats: - m3u8_format['http_headers'] = { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side - } - formats.extend(m3u8_formats) + formats.append({ + 'url': endpoint.replace('[video]', play_path), + 'ext': 'm4v', + 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction + 'protocol': 'm3u8', + 'preference': 1, + 'http_headers': { + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + }, + }) else: formats.append({ 'url': endpoint, From 943a1e24b896a64c869bbd302f32fe5bd2afec96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:25:04 +0600 Subject: [PATCH 088/415] [extractor/common] Use more generic URLError in _is_valid_url --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6169fbbeb..720033ddf 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -842,7 +842,7 @@ def _is_valid_url(self, url, video_id, item='video'): self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, compat_urllib_error.URLError): self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, item)) return False From ac21e7196856d7b689f74ed3f9953cbcbe90bee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:25:44 +0600 Subject: [PATCH 089/415] [spiegeltv] Check formats --- youtube_dl/extractor/spiegeltv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index d976bf33c..0981e325a 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -91,6 +91,7 @@ def _real_extract(self, url): formats.append({ 'url': endpoint, }) + 
self._check_formats(formats, video_id) thumbnails = [] for image in media_json['images']: From 865d1fbafc671815904b2ba3da76544d66c593c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:39:23 +0200 Subject: [PATCH 090/415] [extractor/common] Remove unused import --- youtube_dl/extractor/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 720033ddf..04b699972 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -16,7 +16,6 @@ compat_cookiejar, compat_cookies, compat_getpass, - compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse, From 36d72810374ef2dba0232706a461d6dc4aa292d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:41:41 +0200 Subject: [PATCH 091/415] [spiegeltv] Fix style issue Use two spaces before comment. --- youtube_dl/extractor/spiegeltv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 0981e325a..034bd47ff 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -84,7 +84,7 @@ def _real_extract(self, url): 'protocol': 'm3u8', 'preference': 1, 'http_headers': { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side }, }) else: From 7687b354c59efea076fae762206c00de273fbe04 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 23 Oct 2015 07:09:41 +0100 Subject: [PATCH 092/415] [abc] add support for audio extraction --- youtube_dl/extractor/abc.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index f9a389f67..ae80dc529 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ 
-36,6 +36,15 @@ class ABCIE(InfoExtractor): 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', }, 'add_ie': ['Youtube'], + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, }] def _real_extract(self, url): @@ -43,7 +52,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) mobj = re.search( - r'inline(?PVideo|YouTube)Data\.push\((?P[^)]+)\);', + r'inline(?PVideo|Audio|YouTube)Data\.push\((?P[^)]+)\);', webpage) if mobj is None: raise ExtractorError('Unable to extract video urls') @@ -60,11 +69,13 @@ def _real_extract(self, url): formats = [{ 'url': url_info['url'], + 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none', 'width': int_or_none(url_info.get('width')), 'height': int_or_none(url_info.get('height')), 'tbr': int_or_none(url_info.get('bitrate')), 'filesize': int_or_none(url_info.get('filesize')), } for url_info in urls_info] + self._sort_formats(formats) return { From d97da29da26c920ae31fde94bba5e3b4e1f5a36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:31:42 +0200 Subject: [PATCH 093/415] [abc] Support more URL formats --- youtube_dl/extractor/abc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index ae80dc529..c0e5d1abf 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P\d+)' + _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 
'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', @@ -45,6 +45,9 @@ class ABCIE(InfoExtractor): 'title': 'NAB lifts interest rates, following Westpac and CBA', 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, }] def _real_extract(self, url): From 50b936936dcf53b448557c35a90e4678239aaf81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 14:22:47 +0200 Subject: [PATCH 094/415] [tutv] Fix test --- youtube_dl/extractor/tutv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fad720b68..822372ea1 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -10,10 +10,10 @@ class TutvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P[^/?]+)' _TEST = { 'url': 'http://tu.tv/videos/robots-futbolistas', - 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7', + 'md5': '0cd9e28ad270488911b0d2a72323395d', 'info_dict': { 'id': '2973058', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Robots futbolistas', }, } From 3711304510d3be6a5f9b2b18084aad8687e78001 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 8 Sep 2015 19:35:41 +0100 Subject: [PATCH 095/415] [extractor/common] get the redirected m3u8_url in _extract_m3u8_formats --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 04b699972..10c0d5d1f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -943,13 +943,14 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + m3u8_doc, urlh = self._download_webpage_handle( m3u8_url, video_id, 
note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) if m3u8_doc is False: return m3u8_doc + m3u8_url = urlh.geturl() last_info = None last_media = None kv_rex = re.compile( From 324ac0a243c14340f7e4cd909e2e7c62828a2425 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 10 Sep 2015 20:49:43 +0100 Subject: [PATCH 096/415] [downloader/f4m] get the redirected f4m_url and handle url query string properly --- youtube_dl/downloader/f4m.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 174180db5..b8db6bf9b 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -11,6 +11,7 @@ from ..compat import ( compat_urlparse, compat_urllib_error, + compat_urllib_parse_urlparse, ) from ..utils import ( encodeFilename, @@ -285,7 +286,9 @@ def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(man_url).read() + urlh = self.ydl.urlopen(man_url) + man_url = urlh.geturl() + manifest = urlh.read() doc = etree.fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) @@ -329,20 +332,22 @@ def real_download(self, filename, info_dict): if not live: write_metadata_tag(dest_stream, metadata) + base_url_parsed = compat_urllib_parse_urlparse(base_url) + self._start_frag_download(ctx) frags_filenames = [] while fragments_list: seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) - url = base_url + name + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name) if akamai_pv: - url += '?' 
+ akamai_pv.strip(';') + url_parsed = url_parsed._replace(query=url_parsed.query + akamai_pv.strip(';')) if info_dict.get('extra_param_to_segment_url'): - url += info_dict.get('extra_param_to_segment_url') + url_parsed = url_parsed._replace(query=url_parsed.query + info_dict.get('extra_param_to_segment_url')) frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = ctx['dl'].download(frag_filename, {'url': url}) + success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) if not success: return False (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') From 8cd9614abf81cb41055142d87158b5eda4353a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 21:02:31 +0600 Subject: [PATCH 097/415] [downloader/f4m] More accurate fragment URL construction --- youtube_dl/downloader/f4m.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b8db6bf9b..7f6143954 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -340,11 +340,14 @@ def real_download(self, filename, info_dict): while fragments_list: seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) - url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name) + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) if akamai_pv: - url_parsed = url_parsed._replace(query=url_parsed.query + akamai_pv.strip(';')) + query.append(akamai_pv.strip(';')) if info_dict.get('extra_param_to_segment_url'): - url_parsed = url_parsed._replace(query=url_parsed.query + info_dict.get('extra_param_to_segment_url')) + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) From 
ec29539e06e156a2bb589af774a80d156b2c2f76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 21:03:45 +0600 Subject: [PATCH 098/415] [senateisvp] Pass extra param as query segment without `?` --- youtube_dl/extractor/senateisvp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 9c53704ea..474ebb49b 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -121,9 +121,9 @@ def _real_extract(self, url): 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', }] else: - hdcore_sign = '?hdcore=3.1.0' + hdcore_sign = 'hdcore=3.1.0' url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign + f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): # URLs without the extra param induce an 404 error From 8e82ecfe8f0dc2b9dfb6a2cda68e7b5f7926b0e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 21:04:09 +0600 Subject: [PATCH 099/415] [dailymotion] Extract f4m formats --- youtube_dl/extractor/dailymotion.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9cd9ff17d..bc7823931 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -141,9 +141,17 @@ def _real_extract(self, url): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls')) + ext = determine_ext(media_url) + if type_ == 'application/x-mpegURL' or ext == 'm3u8': + 
m3u8_formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif type_ == 'application/f4m' or ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + media_url, video_id, preference=-1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) else: f = { 'url': media_url, From 7e0dc61334cfe9c79e92fd79d9996d191990a80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Oct 2015 20:48:29 +0600 Subject: [PATCH 100/415] [njoy] Add support for URLs without display id --- youtube_dl/extractor/ndr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index e3cc6fde8..ba06d8a98 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -14,7 +14,8 @@ class NDRBaseIE(InfoExtractor): def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = next(group for group in mobj.groups() if group) webpage = self._download_webpage(url, display_id) return self._extract_embed(webpage, display_id) @@ -101,7 +102,7 @@ def _extract_embed(self, webpage, display_id): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', @@ -136,6 +137,9 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html', + 'only_matching': True, }] def _extract_embed(self, webpage, display_id): From e572a1010b81f5864e610808c86848db4d6ed8e4 Mon Sep 17 00:00:00 2001 From: Erik Date: Sat, 17 Oct 2015 19:22:47 +0200 
Subject: [PATCH 101/415] [youporn] Fix extraction [youporn] Added description and thumbnail [youporn] Added uploader and date [youporn] Removed Try and Except lines [youporn] Fixed date, fatal, formats and /s* [youporn] Undid removing comment about video url components & Undid and fixed removal of encrypted URL detection [youporn] Fix: Add encrypted link to links array only if not already in it [youporn] Fix: Add encrypted link to links array only if not already in it [youporn] Fix: cleanup --- youtube_dl/extractor/youporn.py | 55 ++++++++++++++------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 4ba7c36db..546985f3a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -4,6 +4,7 @@ import json import re import sys +import datetime from .common import InfoExtractor from ..compat import ( @@ -27,10 +28,11 @@ class YouPornIE(InfoExtractor): 'info_dict': { 'id': '505835', 'ext': 'mp4', - 'upload_date': '20101221', - 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'uploader': 'Ask Dan And Jennifer', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', + 'description': 'Watch Sex Ed: Is It Safe To Masturbate Daily? 
at YouPorn.com - YouPorn is the biggest free porn tube site on the net!', + 'uploader': 'Ask Dan And Jennifer', + 'thumbnail': 'http://cdn5.image.youporn.phncdn.com/201012/17/505835/640x480/8/sex-ed-is-it-safe-to-masturbate-daily-8.jpg', + 'date': '20101221', 'age_limit': 18, } } @@ -45,45 +47,34 @@ def _real_extract(self, url): webpage = self._download_webpage(req, video_id) age_limit = self._rta_search(webpage) - # Get JSON parameters - json_params = self._search_regex( - [r'videoJa?son\s*=\s*({.+})', - r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'], - webpage, 'JSON parameters') - try: - params = json.loads(json_params) - except ValueError: - raise ExtractorError('Invalid JSON') - self.report_extraction(video_id) - try: - video_title = params['title'] - upload_date = unified_strdate(params['release_date_f']) - video_description = params['description'] - video_uploader = params['submitted_by'] - thumbnail = params['thumbnails'][0]['image'] - except KeyError: - raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) + video_title = self._html_search_regex(r'page_params.video_title = \'(.+?)\';', webpage, 'video URL', fatal=False) + video_description = self._html_search_meta('description', webpage, 'video DESC', fatal=False) + video_thumbnail = self._html_search_regex(r'page_params.imageurl\t=\t"(.+?)";', webpage, 'video THUMB', fatal=False) + video_uploader = self._html_search_regex(r"
By:
\n]+\">(.+?)", webpage, 'video UPLOADER', fatal=False) + video_date = self._html_search_regex(r"
\n (.+?)\n
", webpage, 'video DATE', fatal=False) + video_date = datetime.datetime.strptime(video_date, '%B %d, %Y').strftime('%Y%m%d') # Get all of the links from the page - DOWNLOAD_LIST_RE = r'(?s)
    (?P.*?)
' - download_list_html = self._search_regex(DOWNLOAD_LIST_RE, - webpage, 'download list').strip() - LINK_RE = r'' + DOWNLOAD_LIST_RE = r'(?s)sources: {\n(?P.*?)}' + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, 'download list').strip() + LINK_RE = r': \'(.+?)\',' links = re.findall(LINK_RE, download_list_html) # Get all encrypted links - encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage) + encrypted_links = re.findall(r'page_params.encryptedQuality[0-9]{3,4}URL\s=\s\'([a-zA-Z0-9+/]+={0,2})\';', webpage) for encrypted_link in encrypted_links: link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') - links.append(link) + # it's unclear if encryted links still differ from normal ones, so only include in links array if it's unique + if link not in links: + links.append(link) formats = [] for link in links: # A link looks like this: - # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 + # http://cdn2b.public.youporn.phncdn.com/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4?rs=200&ri=2500&s=1445599900&e=1445773500&h=5345d19ce9944ec52eb167abf24af248 # A path looks like this: - # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 + # 201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 video_url = unescapeHTML(link) path = compat_urllib_parse_urlparse(video_url).path format_parts = path.split('/')[4].split('_')[:2] @@ -111,11 +102,11 @@ def _real_extract(self, url): return { 'id': video_id, - 'uploader': video_uploader, - 'upload_date': upload_date, 'title': video_title, - 'thumbnail': thumbnail, 'description': video_description, + 'thumbnail': video_thumbnail, + 
'uploader': video_uploader, + 'date': video_date, 'age_limit': age_limit, 'formats': formats, } From 589c33dadeec18a9d50713a4a200e3e2d9e297bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Oct 2015 22:56:35 +0600 Subject: [PATCH 102/415] [youporn] Improve and make more robust (Closes #6888, closes #7214) --- youtube_dl/extractor/youporn.py | 173 +++++++++++++++++++------------- 1 file changed, 102 insertions(+), 71 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 546985f3a..d10ebb0bf 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,112 +1,143 @@ from __future__ import unicode_literals - -import json import re -import sys -import datetime from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - ExtractorError, + int_or_none, + str_to_int, unescapeHTML, unified_strdate, ) -from ..aes import ( - aes_decrypt_text -) +from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?Phttps?://)(?:www\.)?(?Pyouporn\.com/watch/(?P[0-9]+)/(?P[^/]+))' + _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' _TEST = { 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'info_dict': { 'id': '505835', + 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', 'ext': 'mp4', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', - 'description': 'Watch Sex Ed: Is It Safe To Masturbate Daily? 
at YouPorn.com - YouPorn is the biggest free porn tube site on the net!', + 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', + 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', - 'thumbnail': 'http://cdn5.image.youporn.phncdn.com/201012/17/505835/640x480/8/sex-ed-is-it-safe-to-masturbate-daily-8.jpg', - 'date': '20101221', + 'upload_date': '20101221', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - url = mobj.group('proto') + 'www.' + mobj.group('url') + video_id = mobj.group('id') + display_id = mobj.group('display_id') - req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - age_limit = self._rta_search(webpage) + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(request, display_id) - self.report_extraction(video_id) - video_title = self._html_search_regex(r'page_params.video_title = \'(.+?)\';', webpage, 'video URL', fatal=False) - video_description = self._html_search_meta('description', webpage, 'video DESC', fatal=False) - video_thumbnail = self._html_search_regex(r'page_params.imageurl\t=\t"(.+?)";', webpage, 'video THUMB', fatal=False) - video_uploader = self._html_search_regex(r"<div class=\'videoInfoBy\'>By:</div>\n<a href=\"[^>]+\">(.+?)</a>", webpage, 'video UPLOADER', fatal=False) - video_date = self._html_search_regex(r"<div class='videoInfoTime'>\n<i class='icon-clock'></i> (.+?)\n</div>", webpage, 'video DATE', fatal=False) - video_date = datetime.datetime.strptime(video_date, '%B %d, %Y').strftime('%Y%m%d') + title = self._search_regex( + [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1', + 
r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], + webpage, 'title', group='title') - # Get all of the links from the page - DOWNLOAD_LIST_RE = r'(?s)sources: {\n(?P<download_list>.*?)}' - download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, 'download list').strip() - LINK_RE = r': \'(.+?)\',' - links = re.findall(LINK_RE, download_list_html) + links = [] - # Get all encrypted links - encrypted_links = re.findall(r'page_params.encryptedQuality[0-9]{3,4}URL\s=\s\'([a-zA-Z0-9+/]+={0,2})\';', webpage) - for encrypted_link in encrypted_links: - link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') - # it's unclear if encryted links still differ from normal ones, so only include in links array if it's unique - if link not in links: + sources = self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) + if sources: + for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) + # Fallback #1 + for _, link in re.findall( + r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): + links.append(link) + + # Fallback #2, this also contains extra low quality 180p format + for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + links.append(link) + + # Fallback #3, encrypted links + for _, encrypted_link in re.findall( + r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): + links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) + formats = [] - for link in links: - # A link looks like this: - # http://cdn2b.public.youporn.phncdn.com/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4?rs=200&ri=2500&s=1445599900&e=1445773500&h=5345d19ce9944ec52eb167abf24af248 - # A path looks like this: - # 201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - video_url = 
unescapeHTML(link) - path = compat_urllib_parse_urlparse(video_url).path - format_parts = path.split('/')[4].split('_')[:2] - - dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0] - - resolution = format_parts[0] - height = int(resolution[:-len('p')]) - bitrate = int(format_parts[1][:-len('k')]) - format = '-'.join(format_parts) + '-' + dn - - formats.append({ + for video_url in set(unescapeHTML(link) for link in links): + f = { 'url': video_url, - 'format': format, - 'format_id': format, - 'height': height, - 'tbr': bitrate, - 'resolution': resolution, - }) - + } + # Video URL's path looks like this: + # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # We will benefit from it by extracting some metadata + mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'height': height, + 'tbr': bitrate, + }) + formats.append(f) self._sort_formats(formats) - if not formats: - raise ExtractorError('ERROR: no known formats available for video') + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._search_regex( + r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', + webpage, 'thumbnail', fatal=False, group='thumbnail') + + uploader = self._search_regex( + r'<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>\s*<a[^>]+href="[^"]*">([^<]+)</a>', + webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', + webpage, 'upload date', fatal=False)) + + age_limit = self._rta_search(webpage) + + average_rating = int_or_none(self._search_regex( + 
r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>', + webpage, 'average rating', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>', + webpage, 'view count', fatal=False)) + + def extract_tag_box(title): + tag_box = self._search_regex( + (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*' + '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title), + webpage, '%s tag box' % title, default=None) + if not tag_box: + return [] + return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) + + categories = extract_tag_box('Category') + tags = extract_tag_box('Tags') return { 'id': video_id, - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - 'date': video_date, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'average_rating': average_rating, + 'view_count': view_count, + 'categories': categories, + 'tags': tags, 'age_limit': age_limit, 'formats': formats, } From feb7711cf58863a19cae770a878d22a8424e3c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:01:12 +0600 Subject: [PATCH 103/415] [youporn] Make description optional Some videos do not contain any description --- youtube_dl/extractor/youporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d10ebb0bf..db5d049d2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -92,7 +92,7 @@ def _real_extract(self, url): description = self._html_search_regex( r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', - webpage, 'description', fatal=False) + webpage, 'description', 
default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') From 4f13f8f798be06bc2b3c0c42818bb0785e4cde64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:12:12 +0600 Subject: [PATCH 104/415] [youporn] Improve uploader extraction --- youtube_dl/extractor/youporn.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index db5d049d2..b39fbb5fc 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -15,8 +15,9 @@ class YouPornIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89', 'info_dict': { 'id': '505835', 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', @@ -31,8 +32,29 @@ class YouPornIE(InfoExtractor): 'categories': list, 'tags': list, 'age_limit': 18, - } - } + }, + }, { + # Anonymous User uploader + 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', + 'info_dict': { + 'id': '561726', + 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show', + 'ext': 'mp4', + 'title': 'Big Tits Awesome Brunette On amazing webcam show', + 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Anonymous User', + 'upload_date': '20111125', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -97,8 +119,8 @@ 
def _real_extract(self, url): r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') - uploader = self._search_regex( - r'<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>\s*<a[^>]+href="[^"]*">([^<]+)</a>', + uploader = self._html_search_regex( + r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', From 7b3a19e5339344037a872574780c39f334cea90e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:17:23 +0600 Subject: [PATCH 105/415] [stitcher] Remove origEpisodeURL It's always 404 --- youtube_dl/extractor/stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 971a1c466..d5c852f52 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -64,7 +64,7 @@ def _real_extract(self, url): 'url': episode[episode_key], 'ext': determine_ext(episode[episode_key]) or 'mp3', 'vcodec': 'none', - } for episode_key in ('origEpisodeURL', 'episodeURL') if episode.get(episode_key)] + } for episode_key in ('episodeURL',) if episode.get(episode_key)] description = self._search_regex( r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) duration = int_or_none(episode.get('duration')) From 755ff8d22ca5607400c1232b194e20a004e4e9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:41:10 +0600 Subject: [PATCH 106/415] [youporn] Extract comment count --- youtube_dl/extractor/youporn.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b39fbb5fc..9bf8d1eeb 100644 --- a/youtube_dl/extractor/youporn.py +++ 
b/youtube_dl/extractor/youporn.py @@ -29,6 +29,7 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101221', 'average_rating': int, 'view_count': int, + 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -47,6 +48,7 @@ class YouPornIE(InfoExtractor): 'upload_date': '20111125', 'average_rating': int, 'view_count': int, + 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -135,6 +137,9 @@ def _real_extract(self, url): view_count = str_to_int(self._search_regex( r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>', webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._search_regex( + r'>All [Cc]omments? \(([\d,.]+)\)', + webpage, 'comment count', fatal=False)) def extract_tag_box(title): tag_box = self._search_regex( @@ -158,6 +163,7 @@ def extract_tag_box(title): 'upload_date': upload_date, 'average_rating': average_rating, 'view_count': view_count, + 'comment_count': comment_count, 'categories': categories, 'tags': tags, 'age_limit': age_limit, From 36e6f62cd0883f0f486d1666d010e5d9e6d515bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 25 Oct 2015 20:04:55 +0100 Subject: [PATCH 107/415] Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178) Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes). 
--- test/test_compat.py | 7 +++++++ test/test_utils.py | 11 +++++++---- youtube_dl/compat.py | 25 +++++++++++++++++++++++++ youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/bbc.py | 8 +++++--- youtube_dl/extractor/bilibili.py | 6 ++++-- youtube_dl/extractor/brightcove.py | 4 ++-- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/crunchyroll.py | 4 ++-- youtube_dl/extractor/vevo.py | 6 +++--- youtube_dl/utils.py | 3 ++- 11 files changed, 61 insertions(+), 21 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 4ee0dc99d..2b0860479 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,8 +13,10 @@ from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_etree_fromstring, compat_expanduser, compat_shlex_split, + compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, ) @@ -71,5 +73,10 @@ def test_compat_urllib_parse_unquote_plus(self): def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + def test_compat_etree_fromstring(self): + xml = '<el foo="bar"></el>' + doc = compat_etree_fromstring(xml.encode('utf-8')) + self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 918a7a9ef..a9e0fed7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -68,6 +68,9 @@ cli_valueless_option, cli_bool_option, ) +from youtube_dl.compat import ( + compat_etree_fromstring, +) class TestUtil(unittest.TestCase): @@ -242,7 +245,7 @@ def test_find_xpath_attr(self): <node x="b" y="d" /> <node x="" /> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) @@ -263,7 +266,7 @@ def test_xpath_with_ns(self): 
<url>http://server.com/download.mp3</url> </media:song> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) self.assertTrue(find('media:song') is not None) self.assertEqual(find('media:song/media:author').text, 'The Author') @@ -285,7 +288,7 @@ def test_xpath_text(self): <p>Foo</p> </div> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertTrue(xpath_text(doc, 'div/bar') is None) @@ -297,7 +300,7 @@ def test_xpath_attr(self): <p x="a">Foo</p> </div> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d103ab9ad..cf10835ca 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -14,6 +14,7 @@ import subprocess import sys import itertools +import xml.etree.ElementTree try: @@ -212,6 +213,29 @@ def data_open(self, req): except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: + compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: + # on python 2.x the the attributes of a node are str objects instead of + # unicode + etree = xml.etree.ElementTree + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=etree.TreeBuilder()) + parser.feed(text) + return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + el.set(k, 
v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) try: from urllib.parse import parse_qs as compat_parse_qs @@ -507,6 +531,7 @@ def compat_itertools_count(start=0, step=1): 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_etree_fromstring', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 7f6143954..6170cc155 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,10 +5,10 @@ import itertools import os import time -import xml.etree.ElementTree as etree from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, @@ -290,7 +290,7 @@ def real_download(self, filename, info_dict): man_url = urlh.geturl() manifest = urlh.read() - doc = etree.fromstring(manifest) + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2cdce1eb9..a55a6dbc9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -14,7 +13,10 @@ remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): @@ -344,7 +346,7 @@ def _download_media_selector_url(self, url, programme_id=None): url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = 
xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebeb..6c66a1236 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,11 @@ import re import itertools import json -import xml.etree.ElementTree as ET from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, +) from ..utils import ( int_or_none, unified_strdate, @@ -88,7 +90,7 @@ def _real_extract(self, url): except ValueError: pass - lq_doc = ET.fromstring(lq_page) + lq_doc = compat_etree_fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..1686cdde1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,10 +3,10 @@ import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, @@ -119,7 +119,7 @@ def _build_brighcove_url(cls, object_str): object_str = fix_xml_ampersands(object_str) try: - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..52523d7b2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,7 +10,6 @@ import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, @@ -23,6 +22,7 @@ compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( 
NO_DEFAULT, @@ -461,7 +461,7 @@ def _download_xml(self, url_or_request, video_id, return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111..0c9b8ca02 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import json import base64 import zlib -import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, @@ -234,7 +234,7 @@ def ass_bool(strvalue): return output def _extract_subtitles(self, subtitle): - sub_root = xml.etree.ElementTree.fromstring(subtitle) + sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c17094f81..4c0de354f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_request, ) from ..utils import ( @@ -97,7 +97,7 @@ def _formats_from_json(self, video_info): if last_version['version'] == -1: raise ExtractorError('Unable to extract last version of the video') - renditions = xml.etree.ElementTree.fromstring(last_version['data']) + renditions = compat_etree_fromstring(last_version['data']) formats = [] # Already sorted from worst to best quality for rend in renditions.findall('rendition'): @@ -114,7 +114,7 @@ def 
_formats_from_json(self, video_info): def _formats_from_smil(self, smil_xml): formats = [] - smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8')) els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e47646..7d846d680 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -36,6 +36,7 @@ from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -1974,7 +1975,7 @@ def parse_node(node): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') From 387db16a789fea25795433538d80513c18d0f699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 25 Oct 2015 20:30:54 +0100 Subject: [PATCH 108/415] [compat] compat_etree_fromstring: only decode bytes objects --- test/test_compat.py | 3 ++- youtube_dl/compat.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 2b0860479..834f4bc55 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,9 +74,10 @@ def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '<el foo="bar"></el>' + xml = '<el foo="bar" spam="中文"></el>' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 
cf10835ca..f39d4e9a9 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,7 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node are str objects instead of - # unicode + # on python 2.x the the attributes of a node aren't always unicode objects etree = xml.etree.ElementTree # on 2.6 XML doesn't have a parser argument, function copied from CPython @@ -231,7 +230,8 @@ def _XML(text, parser=None): def _element_factory(*args, **kwargs): el = etree.Element(*args, **kwargs) for k, v in el.items(): - el.set(k, v.decode('utf-8')) + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) return el def compat_etree_fromstring(text): From 5f9f87c06fa819c75e59f5d0d491d191c229abbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Mon, 26 Oct 2015 14:42:17 +0100 Subject: [PATCH 109/415] [vidme] Check for deleted videos --- youtube_dl/extractor/vidme.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index eb5cde761..3d63ed4f0 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -101,6 +101,10 @@ class VidmeIE(InfoExtractor): # suspended 'url': 'https://vid.me/Ox3G', 'only_matching': True, + }, { + # deleted + 'url': 'https://vid.me/KTPm', + 'only_matching': True, }, { # no formats in the API response 'url': 'https://vid.me/e5g', @@ -143,6 +147,11 @@ def _real_extract(self, url): video = response['video'] + if video.get('state') == 'deleted': + raise ExtractorError( + 'Vidme said: Sorry, this video has been deleted.', + expected=True) + if video.get('state') in ('user-disabled', 'suspended'): raise ExtractorError( 'Vidme said: This video has been suspended either due to a copyright claim, ' From 5dadae079bd053c822353b081e94d9daff333208 Mon Sep 17 00:00:00 2001 From: Frans de Jonge 
<fransdejonge@gmail.com> Date: Mon, 26 Oct 2015 15:11:09 +0100 Subject: [PATCH 110/415] [francetv] Add subtitles support --- youtube_dl/extractor/francetv.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 129984a5f..eaaa43958 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -83,6 +83,16 @@ def _extract_video(self, video_id, catalogue): if subtitle: title += ' - %s' % subtitle + subtitles = {} + for subtitle_accessibilite in info['subtitles']: + if subtitle_accessibilite['url'] is not '': + if not subtitles: + subtitles['fr'] = [] + subtitles['fr'].append({ + 'ext': subtitle_accessibilite['format'], + 'url': subtitle_accessibilite['url'], + }) + return { 'id': video_id, 'title': title, @@ -91,6 +101,7 @@ def _extract_video(self, video_id, catalogue): 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, + 'subtitles': subtitles, } From 6e4b8b28916aaafc6d1b4b4d69a6f667e35d413f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 26 Oct 2015 20:35:28 +0600 Subject: [PATCH 111/415] [francetv] Make subtitles more robust (Closes #7298) --- youtube_dl/extractor/francetv.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index eaaa43958..07115b9d4 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -84,14 +84,12 @@ def _extract_video(self, video_id, catalogue): title += ' - %s' % subtitle subtitles = {} - for subtitle_accessibilite in info['subtitles']: - if subtitle_accessibilite['url'] is not '': - if not subtitles: - subtitles['fr'] = [] - subtitles['fr'].append({ - 'ext': subtitle_accessibilite['format'], - 'url': subtitle_accessibilite['url'], - }) + 
subtitles_list = [{ + 'url': subtitle['url'], + 'ext': subtitle.get('format'), + } for subtitle in info.get('subtitles', []) if subtitle.get('url')] + if subtitles_list: + subtitles['fr'] = subtitles_list return { 'id': video_id, From c137cc0d33fee8369b857c8f12a9116379248127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 26 Oct 2015 20:35:45 +0600 Subject: [PATCH 112/415] [francetv] Add subtitles test --- youtube_dl/extractor/francetv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 07115b9d4..a31cc3c97 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -129,6 +129,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, + 'subtitles': { + 'fr': 'mincount:2', + }, }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [PATCH 113/415] [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because it also does it. 
--- test/test_compat.py | 11 ++++++++++- youtube_dl/compat.py | 18 ++++++++++++++++-- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/utils.py | 23 ----------------------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 834f4bc55..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,10 +74,19 @@ def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '<el foo="bar" spam="中文"></el>' + xml = ''' + <root foo="bar" spam="中文"> + <normal>foo</normal> + <chinese>中文</chinese> + <foo><bar>spam</bar></foo> + </root> + ''' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) + self.assertTrue(isinstance(doc.find('normal').text, compat_str)) + self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f39d4e9a9..2d43ec852 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,9 +216,19 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node aren't always unicode objects + # on python 2.x the attributes and text of a node aren't always unicode + # objects etree = xml.etree.ElementTree + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + # on 2.6 XML doesn't have a parser argument, function copied from CPython # 2.7 source def _XML(text, parser=None): @@ -235,7 +245,11 @@ def 
_element_factory(*args, **kwargs): return el def compat_etree_fromstring(text): - return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ def _real_extract(self, url): raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..1de96b268 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -21,7 +22,6 @@ HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@ -1237,7 +1237,7 @@ def _real_extract(self, url): # Is it an RSS feed, a SMIL file or a XSPF playlist? 
try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d680..c761ea22a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'): return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree - - US_RATINGS = { 'G': 0, 'PG': 10, From a526167d40983e47231d10c09c9f9064e0298604 Mon Sep 17 00:00:00 2001 From: Pierre Fenoll <pierrefenoll@gmail.com> Date: Tue, 27 Oct 2015 11:58:59 +0100 Subject: [PATCH 114/415] [francetv] Accept mobile URLs --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index a31cc3c97..d63dc4d7c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -105,7 +105,7 @@ def _extract_video(self, video_id, catalogue): class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' + _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(.*?)\.html' # Can't use 
tests, videos expire in 7 days From 0a192fbea798c843ad6fef37106901d431f39b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 27 Oct 2015 21:43:29 +0600 Subject: [PATCH 115/415] [pluzz] Fix mobile support and modernize (Closes #7305) --- youtube_dl/extractor/francetv.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index d63dc4d7c..00a80ba61 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -105,15 +105,21 @@ def _extract_video(self, video_id, catalogue): class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(.*?)\.html' + _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' # Can't use tests, videos expire in 7 days def _real_extract(self, url): - title = re.match(self._VALID_URL, url).group(1) - webpage = self._download_webpage(url, title) - video_id = self._search_regex( - r'data-diffusion="(\d+)"', webpage, 'ID') + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._html_search_meta( + 'id_video', webpage, 'video id', default=None) + if not video_id: + video_id = self._search_regex( + r'data-diffusion=["\'](\d+)', webpage, 'video id') + return self._extract_video(video_id, 'Pluzz') From 7ccb2b84ddb65f41d01037b5b62301886be9d22c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Oct 2015 08:22:04 +0100 Subject: [PATCH 116/415] [francetv] fix style issues reported by flake8 * Don't redefine variable in list comprehension * Line missing indentation --- youtube_dl/extractor/francetv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 
00a80ba61..8e60cf60f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -85,9 +85,9 @@ def _extract_video(self, video_id, catalogue): subtitles = {} subtitles_list = [{ - 'url': subtitle['url'], - 'ext': subtitle.get('format'), - } for subtitle in info.get('subtitles', []) if subtitle.get('url')] + 'url': subformat['url'], + 'ext': subformat.get('format'), + } for subformat in info.get('subtitles', []) if subformat.get('url')] if subtitles_list: subtitles['fr'] = subtitles_list @@ -118,7 +118,7 @@ def _real_extract(self, url): 'id_video', webpage, 'video id', default=None) if not video_id: video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + r'data-diffusion=["\'](\d+)', webpage, 'video id') return self._extract_video(video_id, 'Pluzz') From 4e16c1f80b009001aaea6f8baca5dfbfa9b58c11 Mon Sep 17 00:00:00 2001 From: Cian Ruane <CianLR@users.noreply.github.com> Date: Fri, 16 Oct 2015 01:23:09 +0100 Subject: [PATCH 117/415] [clyp] Add extractor Update __init__.py [clyp.it] Extract ID idiomatically and make duration and description optional --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clyp.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/clyp.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6318ac4a2..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -90,6 +90,7 @@ from .clipsyndicate import ClipsyndicateIE from .cloudy import CloudyIE from .clubic import ClubicIE +from .clyp import ClypIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py new file mode 100644 index 000000000..906729b30 --- /dev/null +++ b/youtube_dl/extractor/clyp.py @@ -0,0 +1,57 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import re + +from .common import 
InfoExtractor + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + + _TESTS = [{ + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + }, + }, { + 'url': 'https://clyp.it/ojz2wfah', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + api_url = 'https://api.clyp.it/' + audio_id + metadata = self._download_json(api_url, audio_id) + + title = metadata['Title'] + + description = None + if metadata['Description']: description = metadata['Description'] + + duration = None + if metadata['Duration']: duration = int(metadata['Duration']) + + formats = [ + { + 'url': metadata['OggUrl'], + 'format_id': 'ogg', + 'preference': -2 + },{ + 'url': metadata['Mp3Url'], + 'format_id': 'mp3', + 'preference': -1 + }] + + return { + 'id': audio_id, + 'title': title, + 'formats': formats, + 'description': description, + 'duration': duration + } From 52c3a6e49d2cbc1932992d816d28bbed629daadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 28 Oct 2015 21:40:22 +0600 Subject: [PATCH 118/415] [utils] Improve parse_iso8601 --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 918a7a9ef..0c34f0e55 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -425,6 +425,8 @@ def test_parse_iso8601(self): self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) + self.assertEqual(parse_iso8601('2015-09-29T08:27:31.727'), 1443515251) + self.assertEqual(parse_iso8601('2015-09-29T08-27-31.727'), None) def 
test_strip_jsonp(self): stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e47646..558c9c7d5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -814,9 +814,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): if date_str is None: return None + date_str = re.sub(r'\.[0-9]+', '', date_str) + if timezone is None: m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -829,9 +831,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass def unified_strdate(date_str, day_first=True): From 03c2c162f9eaac6b474a1be9e985621f5b7b8c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 28 Oct 2015 21:42:01 +0600 Subject: [PATCH 119/415] [clyp] Improve and cleanup (Closes #7194) --- youtube_dl/extractor/clyp.py | 62 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py index 906729b30..57e643799 100644 --- a/youtube_dl/extractor/clyp.py +++ b/youtube_dl/extractor/clyp.py @@ -1,16 +1,15 @@ -# coding: utf-8 - from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, +) class ClypIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' - - _TESTS = [{ + _TEST = { 'url': 'https://clyp.it/ojz2wfah', 'md5': '1d4961036c41247ecfdcc439c0cddcbb', 'info_dict': { @@ -18,40 +17,41 @@ class ClypIE(InfoExtractor): 'ext': 'mp3', 'title': 'Krisson80 - bits wip wip', 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', }, - }, { - 'url': 'https://clyp.it/ojz2wfah', - 'only_matching': True, - }] + } def _real_extract(self, url): audio_id = self._match_id(url) - api_url = 'https://api.clyp.it/' + audio_id - metadata = self._download_json(api_url, audio_id) + + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none', + }) + self._sort_formats(formats) title = metadata['Title'] - - description = None - if metadata['Description']: description = metadata['Description'] - - duration = None - if metadata['Duration']: duration = int(metadata['Duration']) - - formats = [ - { - 'url': metadata['OggUrl'], - 'format_id': 'ogg', - 'preference': -2 - },{ - 'url': metadata['Mp3Url'], - 'format_id': 'mp3', - 'preference': -1 - }] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = parse_iso8601(metadata.get('DateCreated')) return { 'id': audio_id, 'title': title, - 'formats': formats, 'description': description, - 'duration': duration + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } From ae37338e681319a28d98dc551253d9fa1830969a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 29 Oct 2015 13:58:40 +0100 Subject: [PATCH 120/415] [compat] 
compat_etree_fromstring: clarify comment --- youtube_dl/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2d43ec852..a3e85264a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,8 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the attributes and text of a node aren't always unicode - # objects + # python 2.x tries to encode unicode strings with ascii (see the + # XMLParser._fixtext method) etree = xml.etree.ElementTree try: From 6fb8ace671db2f2bdcc9cd7ac6b9f81fbd356791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 29 Oct 2015 22:44:01 +0600 Subject: [PATCH 121/415] [moniker] Add support for builtin embedded videos (Closes #7244) --- youtube_dl/extractor/moniker.py | 35 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 69e4bcd1a..204c03c4a 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -17,7 +17,7 @@ class MonikerIE(InfoExtractor): IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)' _TESTS = [{ 'url': 'http://allmyvideos.net/jih3nce3x6wn', @@ -64,18 +64,30 @@ def _real_extract(self, url): raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error), expected=True) - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = dict(fields) + builtin_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1', + orig_webpage, 'builtin URL', default=None, group='url') - post = compat_urllib_parse.urlencode(data) - headers = { - 
b'Content-Type': b'application/x-www-form-urlencoded', - } - req = compat_urllib_request.Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') + if builtin_url: + req = compat_urllib_request.Request(builtin_url) + req.add_header('Referer', url) + webpage = self._download_webpage(req, video_id, 'Downloading builtin page') + title = self._og_search_title(orig_webpage).strip() + description = self._og_search_description(orig_webpage).strip() + else: + fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + data = dict(fields) - title = os.path.splitext(data['fname'])[0] + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') + + title = os.path.splitext(data['fname'])[0] + description = None # Could be several links with different quality links = re.findall(r'"file" : "?(.+?)",', webpage) @@ -89,5 +101,6 @@ def _real_extract(self, url): return { 'id': video_id, 'title': title, + 'description': description, 'formats': formats, } From 721f5a277ca0012ee72c9d4b3e5550e52a0a596d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 29 Oct 2015 22:47:18 +0600 Subject: [PATCH 122/415] [moniker] Add tests for #7244 --- youtube_dl/extractor/moniker.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 204c03c4a..7c0c4e50e 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -46,6 +46,18 @@ class MonikerIE(InfoExtractor): }, { 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', 'only_matching': True, + }, { + 'url': 'http://vidspot.net/2/v-ywDf99', + 'md5': '5f8254ce12df30479428b0152fb8e7ba', + 'info_dict': { + 'id': 'ywDf99', + 
'ext': 'mp4', + 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', + 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', + }, + }, { + 'url': 'http://allmyvideos.net/v/v-HXZm5t', + 'only_matching': True, }] def _real_extract(self, url): From 6722ebd43720e836af8217fa078fa1a604b98229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 30 Oct 2015 21:00:36 +0600 Subject: [PATCH 123/415] [anitube] Relax key regex (Closes #7303) Another variant seen http://anitubebr.xpg.uol.com.br/embed/ --- youtube_dl/extractor/anitube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 31f0d417c..23f942ae2 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -26,8 +26,8 @@ def _real_extract(self, url): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = self._html_search_regex( - r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') + key = self._search_regex( + r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') config_xml = self._download_xml( 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) From 47f2d01a5ac074f6959aa12e8bc00310f18a54e8 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Thu, 24 Sep 2015 22:19:09 +0200 Subject: [PATCH 124/415] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/kika.py | 115 +++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/kika.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..5ad4e9c36 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,6 +274,7 @@ from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .keek 
import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py new file mode 100644 index 000000000..db0f333ff --- /dev/null +++ b/youtube_dl/extractor/kika.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class KikaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + + _TESTS = [ + { + 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + }, + { + 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. 
Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + } + ] + + def _real_extract(self, url): + # broadcast_id may be the same as the video_id + broadcast_id = self._match_id(url) + webpage = self._download_webpage(url, broadcast_id) + + xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' + video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) + if not video_id: + # Video is not available online + err_msg = 'Video %s is not available online' % broadcast_id + raise ExtractorError(err_msg, expected=True) + + xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) + xml_tree = self._download_xml(xml_url, video_id) + + title = xml_tree.find('title').text + webpage_url = xml_tree.find('htmlUrl').text + + # Try to get the description, not available for all videos + try: + broadcast_elem = xml_tree.find('broadcast') + description = broadcast_elem.find('broadcastDescription').text + except AttributeError: + # No description available + description = None + + # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) + tmp = xml_tree.find('duration').text.split(':') + duration = int(tmp[0]) * 60 + int(tmp[1]) + + formats_list = [] + for elem in xml_tree.find('assets'): + format_dict = {} + format_dict['url'] = elem.find('progressiveDownloadUrl').text + format_dict['ext'] = elem.find('mediaType').text.lower() + format_dict['format'] = elem.find('profileName').text + width = int(elem.find('frameWidth').text) + height = int(elem.find('frameHeight').text) + format_dict['width'] = width + format_dict['height'] = height + format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['abr'] = int(elem.find('bitrateAudio').text) + format_dict['vbr'] = int(elem.find('bitrateVideo').text) + format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] + format_dict['filesize'] = int(elem.find('fileSize').text) + + # append resolution and dict for sorting by resolution + formats_list.append((width * height, format_dict)) + + # Sort by resolution (=quality) + formats_list.sort() + + out_list = [x[1] for x in formats_list] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': out_list, + 'duration': duration, + 'webpage_url': webpage_url + } From 892015b088fa21915270b0a05937fcc7063ccdd2 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Mon, 28 Sep 2015 22:00:56 +0200 Subject: [PATCH 125/415] replaced inefficient code --- youtube_dl/extractor/kika.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index db0f333ff..871e4ea44 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -87,29 +87,25 @@ def _real_extract(self, url): format_dict['url'] = elem.find('progressiveDownloadUrl').text format_dict['ext'] = elem.find('mediaType').text.lower() format_dict['format'] = elem.find('profileName').text - width = int(elem.find('frameWidth').text) - height = int(elem.find('frameHeight').text) - format_dict['width'] = 
width - format_dict['height'] = height - format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['width'] = int(elem.find('frameWidth').text) + format_dict['height'] = int(elem.find('frameHeight').text) + format_dict['resolution'] = '%dx%d' % (format_dict['width'], + format_dict['height']) format_dict['abr'] = int(elem.find('bitrateAudio').text) format_dict['vbr'] = int(elem.find('bitrateVideo').text) format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] format_dict['filesize'] = int(elem.find('fileSize').text) - # append resolution and dict for sorting by resolution - formats_list.append((width * height, format_dict)) + formats_list.append(format_dict) # Sort by resolution (=quality) - formats_list.sort() - - out_list = [x[1] for x in formats_list] + formats_list.sort(key=lambda x: x['width'] * x['height']) return { 'id': video_id, 'title': title, 'description': description, - 'formats': out_list, + 'formats': formats_list, 'duration': duration, 'webpage_url': webpage_url } From 78d7ee19dc417b16b26fe2fa1101124866727a85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 31 Oct 2015 22:21:52 +0800 Subject: [PATCH 126/415] [democracynow] Fix _TESTS --- youtube_dl/extractor/democracynow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 973bb437b..05cfc7502 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -36,10 +36,9 @@ def _real_extract(self, url): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'<meta property=.og:description. 
content=(["\'])(.+?)\1', webpage, re.DOTALL) - description = re_desc.group(2) if re_desc else '' + description = self._og_search_description(webpage) - jstr = self._search_regex(r'({.+?"related_video_xml".+?})', webpage, 'json', default=None) + jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') js = self._parse_json(jstr, display_id) video_id = None formats = [] @@ -56,7 +55,7 @@ def _real_extract(self, url): 'ext': ext, 'url': url, }] - for key in ('file', 'audio'): + for key in ('file', 'audio', 'video'): url = js.get(key, '') if url == '' or url is None: continue From 8c1aa28c27af204c4996260cdc70359e83c2c3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:14:36 +0100 Subject: [PATCH 127/415] [kika] Replace non working tests and recognize 'einzelsendung' urls. --- youtube_dl/extractor/kika.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index 871e4ea44..c9169076a 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -6,16 +6,16 @@ class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' _TESTS = [ { - 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. 
Oktober 2015', 'description': None } }, @@ -30,17 +30,17 @@ class KikaIE(InfoExtractor): } }, { - 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. Oktober 2015', 'description': None } }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', 'info_dict': { 'id': '8182', From c3040bd00a43e111dab0d1ab903df03ac7d19a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:32:35 +0100 Subject: [PATCH 128/415] [kika] Cleanup Closes #6957. --- youtube_dl/extractor/kika.py | 54 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index c9169076a..5337ac439 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -16,8 +16,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', @@ -26,8 +26,8 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', @@ -36,8 +36,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', @@ -46,9 +46,9 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, + }, ] def _real_extract(self, url): @@ -59,7 +59,6 @@ def _real_extract(self, url): xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) if not video_id: - # Video is not available online err_msg = 'Video %s is not available online' % broadcast_id raise ExtractorError(err_msg, expected=True) @@ -74,38 +73,29 @@ def _real_extract(self, url): broadcast_elem = xml_tree.find('broadcast') description = broadcast_elem.find('broadcastDescription').text except AttributeError: - # No description available description = None # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) tmp = xml_tree.find('duration').text.split(':') duration = int(tmp[0]) * 60 + int(tmp[1]) - formats_list = [] - for elem in xml_tree.find('assets'): - format_dict = {} - format_dict['url'] = elem.find('progressiveDownloadUrl').text - format_dict['ext'] = elem.find('mediaType').text.lower() - format_dict['format'] = elem.find('profileName').text - format_dict['width'] = int(elem.find('frameWidth').text) - format_dict['height'] = int(elem.find('frameHeight').text) - format_dict['resolution'] = '%dx%d' % (format_dict['width'], - format_dict['height']) - format_dict['abr'] = int(elem.find('bitrateAudio').text) - format_dict['vbr'] = int(elem.find('bitrateVideo').text) - format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] - format_dict['filesize'] = int(elem.find('fileSize').text) - - formats_list.append(format_dict) - - # Sort by resolution (=quality) - formats_list.sort(key=lambda x: x['width'] * x['height']) + formats = [{ + 'url': elem.find('progressiveDownloadUrl').text, + 'ext': elem.find('mediaType').text.lower(), + 'format': elem.find('profileName').text, + 'width': int(elem.find('frameWidth').text), + 'height': int(elem.find('frameHeight').text), + 'abr': int(elem.find('bitrateAudio').text), + 'vbr': int(elem.find('bitrateVideo').text), + 'filesize': int(elem.find('fileSize').text), + } for elem in xml_tree.find('assets')] + self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, - 'formats': formats_list, + 'formats': formats, 'duration': duration, - 'webpage_url': webpage_url + 'webpage_url': webpage_url, } From 2b1b2d83cacfdce19cae5eea2f9bbfd142efc7f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:17:09 +0600 Subject: [PATCH 129/415] [mdr] Modernize and include kika.de --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/kika.py | 101 ------------------ youtube_dl/extractor/mdr.py | 172 +++++++++++++++++++++++-------- 3 files 
changed, 131 insertions(+), 143 deletions(-) delete mode 100644 youtube_dl/extractor/kika.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ad4e9c36..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,7 +274,6 @@ from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE -from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py deleted file mode 100644 index 5337ac439..000000000 --- a/youtube_dl/extractor/kika.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' - - _TESTS = [ - { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - ] - - def _real_extract(self, url): - # broadcast_id may be the same as the video_id - broadcast_id = self._match_id(url) - webpage = self._download_webpage(url, broadcast_id) - - xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' - video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) - if not video_id: - err_msg = 'Video %s is not available online' % broadcast_id - raise ExtractorError(err_msg, expected=True) - - xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) - xml_tree = self._download_xml(xml_url, video_id) - - title = xml_tree.find('title').text - webpage_url = xml_tree.find('htmlUrl').text - - # Try to get the description, not available for all videos - try: - broadcast_elem = xml_tree.find('broadcast') - description = broadcast_elem.find('broadcastDescription').text - except AttributeError: - description = None - - # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) - tmp = xml_tree.find('duration').text.split(':') - duration = int(tmp[0]) * 60 + int(tmp[1]) - - formats = [{ - 'url': elem.find('progressiveDownloadUrl').text, - 'ext': elem.find('mediaType').text.lower(), - 'format': elem.find('profileName').text, - 'width': int(elem.find('frameWidth').text), - 'height': int(elem.find('frameHeight').text), - 'abr': int(elem.find('bitrateAudio').text), - 'vbr': int(elem.find('bitrateVideo').text), - 'filesize': int(elem.find('fileSize').text), - } for elem in xml_tree.find('assets')] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': duration, - 'webpage_url': webpage_url, - } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..541ddd909 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,154 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + xpath_text, +) class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' + IE_DESC = 'MDR.DE and KiKA' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' - # No tests, MDR regularily deletes its videos - _TEST = { + _TESTS = [{ + # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - } + }, { + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', + 'info_dict': { + 'id': '19636', + 'ext': 'mp4', + 'title': 'Baumhaus vom 30. 
Oktober 2015', + 'duration': 134, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + 'timestamp': 1419047100, + 'upload_date': '20141220', + 'duration': 4628, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'only_matching': True, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', + 'only_matching': True, + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('video_id') - domain = m.group('domain') + video_id = self._match_id(url) - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') - xmlurl = self._search_regex( - r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') + data_url = self._search_regex( + r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url') + + doc = self._download_xml( + compat_urlparse.urljoin(url, data_url), video_id) + + title = (xpath_text(doc, './title', 'title', default=None) or + xpath_text(doc, './broadcast/broadcastName', 'title')) - doc = self._download_xml(domain + xmlurl, video_id) formats = [] - for a in doc.findall('./assets/asset'): - url_el = a.find('./progressiveDownloadUrl') - if url_el is None: - continue - abr = int(a.find('bitrateAudio').text) // 1000 - media_type = a.find('mediaType').text - format = { - 'abr': abr, - 'filesize': int(a.find('fileSize').text), - 'url': url_el.text, - } + processed_urls = [] + for asset in 
doc.findall('./assets/asset'): + for source in ( + 'progressiveDownload', + 'dynamicHttpStreamingRedirector', + 'adaptiveHttpStreamingRedirector'): + url_el = asset.find('./%sUrl' % source) + if url_el is None: + continue - vbr_el = a.find('bitrateVideo') - if vbr_el is None: - format.update({ - 'vcodec': 'none', - 'format_id': '%s-%d' % (media_type, abr), - }) - else: - vbr = int(vbr_el.text) // 1000 - format.update({ - 'vbr': vbr, - 'width': int(a.find('frameWidth').text), - 'height': int(a.find('frameHeight').text), - 'format_id': '%s-%d' % (media_type, vbr), - }) - formats.append(format) + video_url = url_el.text + if video_url in processed_urls: + continue + + processed_urls.append(video_url) + + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + + url_formats = [] + + ext = determine_ext(url_el.text) + if ext == 'm3u8': + url_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=0, m3u8_id='HLS', fatal=False) + elif ext == 'f4m': + url_formats = self._extract_f4m_formats( + video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + preference=0, f4m_id='HDS', fatal=False) + else: + media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + + f = { + 'url': video_url, + 'format_id': '%s-%d' % (media_type, vbr or abr), + 'filesize': filesize, + 'abr': abr, + 'preference': 1, + } + + if vbr: + width = int_or_none(xpath_text(asset, './frameWidth', 'width')) + height = int_or_none(xpath_text(asset, './frameHeight', 'height')) + f.update({ + 'vbr': vbr, + 'width': width, + 'height': height, + }) + + url_formats.append(f) + + if not vbr: + for f in url_formats: + abr = f.get('tbr') or abr + 
if 'tbr' in f: + del f['tbr'] + f.update({ + 'abr': abr, + 'vcodec': 'none', + }) + + if url_formats: + formats.extend(url_formats) self._sort_formats(formats) + description = xpath_text(doc, './broadcast/broadcastDescription', 'description') + timestamp = parse_iso8601( + xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or + xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + duration = parse_duration(xpath_text(doc, './duration', 'duration')) + uploader = xpath_text(doc, './rights', 'uploader') + return { 'id': video_id, 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'uploader': uploader, 'formats': formats, } From 8cdb5c845336ad3dc48c85a0558a38bd42972b00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:24:21 +0600 Subject: [PATCH 130/415] [mdr] Add audio test --- youtube_dl/extractor/mdr.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 541ddd909..e05577496 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,6 +20,17 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, + }, { + # audio + 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', + 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', + 'info_dict': { + 'id': '1312272', + 'ext': 'mp3', + 'title': 'Feuilleton vom 30. 
Oktober 2015', + 'duration': 250, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, }, { 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', 'md5': '4930515e36b06c111213e80d1e4aad0e', From 578c074575f45ffdfd032d7b84f6fe449614f511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:39:44 +0600 Subject: [PATCH 131/415] [utils] Support list of xpath in xpath_element --- test/test_utils.py | 7 +++++++ youtube_dl/utils.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0c34f0e55..5a56ad776 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -275,9 +275,16 @@ def test_xpath_element(self): p = xml.etree.ElementTree.SubElement(div, 'p') p.text = 'Foo' self.assertEqual(xpath_element(doc, 'div/p'), p) + self.assertEqual(xpath_element(doc, ['div/p']), p) + self.assertEqual(xpath_element(doc, ['div/bar', 'div/p']), p) self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') + self.assertEqual(xpath_element(doc, ['div/bar'], default='default'), 'default') self.assertTrue(xpath_element(doc, 'div/bar') is None) + self.assertTrue(xpath_element(doc, ['div/bar']) is None) + self.assertTrue(xpath_element(doc, ['div/bar'], 'div/baz') is None) self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar'], fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar', 'div/baz'], fatal=True) def test_xpath_text(self): testxml = '''<root> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 558c9c7d5..89c88a4d3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -178,10 +178,19 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') + def _find_xpath(xpath): + if 
sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break - n = node.find(xpath) if n is None: if default is not NO_DEFAULT: return default From 11465da70257663ee52c7be50debe1c1e825ec67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:45:45 +0600 Subject: [PATCH 132/415] [mdr] Simplify xpath --- youtube_dl/extractor/mdr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index e05577496..a63257c56 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -74,8 +74,7 @@ def _real_extract(self, url): doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) - title = (xpath_text(doc, './title', 'title', default=None) or - xpath_text(doc, './broadcast/broadcastName', 'title')) + title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) formats = [] processed_urls = [] @@ -149,8 +148,12 @@ def _real_extract(self, url): description = xpath_text(doc, './broadcast/broadcastDescription', 'description') timestamp = parse_iso8601( - xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or - xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + xpath_text( + doc, [ + './broadcast/broadcastDate', + './broadcast/broadcastStartDate', + './broadcast/broadcastEndDate'], + 'timestamp', default=None)) duration = parse_duration(xpath_text(doc, './duration', 'duration')) uploader = xpath_text(doc, './rights', 'uploader') From 82b69a5cbb1455d31916be4f19ab327ae63f313c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:00:36 +0600 Subject: [PATCH 133/415] [mdr] PEP 8 --- 
youtube_dl/extractor/mdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a63257c56..a566c6a2c 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,7 +20,7 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - }, { + }, { # audio 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', From e327b736ca6a6a1c880b93e09a3b310c354c2c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:05:30 +0600 Subject: [PATCH 134/415] [generic] Update test --- youtube_dl/extractor/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..a84135032 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -141,6 +141,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, From ae12bc3ebb4cb377c2b4337ec255e652b36f5143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:07:37 +0600 Subject: [PATCH 135/415] [utils] Make unified_strdate always return unicode string --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 89c88a4d3..764a89cca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -910,7 +910,7 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + return compat_str(upload_date) 
def determine_ext(url, default_ext='unknown_video'): From dc519b5421366a8cac681455a817ae25f7f4aa83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:12:57 +0600 Subject: [PATCH 136/415] [extractor/common] Make ie_key and IE_NAME return unicode string --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..59c3fa8dc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -310,11 +310,11 @@ def _real_extract(self, url): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ From 76f0c50d3d3e2eb5903b61da08829699e902916d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:08 +0600 Subject: [PATCH 137/415] [mdr] Fix failed formats processing --- youtube_dl/extractor/mdr.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a566c6a2c..88334889e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -96,8 +96,6 @@ def _real_extract(self, url): vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - url_formats = [] - ext = determine_ext(url_el.text) if ext == 'm3u8': url_formats = self._extract_m3u8_formats( @@ -130,7 +128,10 @@ def _real_extract(self, url): 'height': height, }) - url_formats.append(f) + url_formats = [f] + + if not url_formats: + continue if not vbr: for 
f in url_formats: @@ -142,8 +143,8 @@ def _real_extract(self, url): 'vcodec': 'none', }) - if url_formats: - formats.extend(url_formats) + formats.extend(url_formats) + self._sort_formats(formats) description = xpath_text(doc, './broadcast/broadcastDescription', 'description') From dbd82a1d4fff1655920e111cc25a7fd526d7bf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:34 +0600 Subject: [PATCH 138/415] [extractor/common] Fix m3u8 extraction on failure --- youtube_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 59c3fa8dc..1f09fbb47 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -943,13 +943,14 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc, urlh = self._download_webpage_handle( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return res + m3u8_doc, urlh = res m3u8_url = urlh.geturl() last_info = None last_media = None From 9550ca506fccf9c9d795816cc0a7817ff262ef45 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 19:36:04 +0100 Subject: [PATCH 139/415] [utils] change extract_attributes to work in python 2 --- youtube_dl/extractor/brightcove.py | 3 +-- youtube_dl/utils.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b41cee91b..c6ad1d065 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -383,8 +383,7 @@ def _extract_url(webpage): return None def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - account_id, player_id, embed, video_id = mobj.groups() + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bcebf9cc5..518cea98b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -252,7 +252,8 @@ def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s attributes = re.findall(attributes_regex, attributes_str) attributes_dict = {} if attributes: - attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + for (attribute_name, attribute_value) in attributes: + attributes_dict[attribute_name] = attribute_value return attributes_dict From 80dcee5cd5cbe623a53e0c582e3e3ae170c63e8d Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 04:02:49 +0100 Subject: [PATCH 140/415] [eitb] fix info extraction --- youtube_dl/extractor/eitb.py | 65 ++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 2cba82532..fc8f15544 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,39 +1,62 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .brightcove import BrightcoveIE -from ..utils import ExtractorError +from ..compat import compat_urllib_request +from ..utils import ( + int_or_none, + unified_strdate, +) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { - 'add_ie': ['Brightcove'], - 'url': 
'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', 'md5': 'edf4436247185adee3ea18ce64c47998', 'info_dict': { - 'id': '2743577154001', + 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - # All videos from eitb has this description in the brightcove info - 'description': '.', - 'uploader': 'Euskal Telebista', + 'description': '', + 'duration': 3996760, + 'upload_date': '20131014', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - chapter_id = mobj.group('chapter_id') - webpage = self._download_webpage(url, chapter_id) - bc_url = BrightcoveIE._extract_brightcove_url(webpage) - if bc_url is None: - raise ExtractorError('Could not extract the Brightcove url') - # The BrightcoveExperience object doesn't contain the video id, we set - # it manually - bc_url += '&%40videoPlayer={0}'.format(chapter_id) - return self.url_result(bc_url, BrightcoveIE.ie_key()) + video_id = self._match_id(url) + video_data = self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, video_id)['web_media'][0] + + formats = [] + for rendition in video_data['RENDITIONS']: + formats.append({ + 'url': rendition['PMD_URL'], + 'width': int_or_none(rendition.get('FRAME_WIDTH')), + 'height': int_or_none(rendition.get('FRAME_HEIGHT')), + 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + }) + + # TODO: parse f4m manifest + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json(request, video_id, fatal=False) + if token_data: + m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) + if 
m3u8_formats: + formats.extend(m3u8_formats) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['NAME_ES'], + 'description': video_data.get('SHORT_DESC_ES'), + 'thumbnail': video_data.get('STILL_URL'), + 'duration': int_or_none(video_data.get('LENGTH')), + 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'formats': formats, + } From 8a06999ba0f9c948f8d2a1ef89c73eedbfb09cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 01:52:33 +0600 Subject: [PATCH 141/415] [eitb] Improve, make more robust and extract f4m formats (Closes #7328) --- youtube_dl/extractor/eitb.py | 71 +++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index fc8f15544..0de8d3dc6 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -4,14 +4,15 @@ from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( + float_or_none, int_or_none, - unified_strdate, + parse_iso8601, ) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', @@ -20,43 +21,71 @@ class EitbIE(InfoExtractor): 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - 'description': '', - 'duration': 3996760, + 'description': 'Programa de reportajes de actualidad.', + 'duration': 3996.76, + 'timestamp': 1381789200, 'upload_date': '20131014', + 'tags': list, }, } def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, 
video_id)['web_media'][0] + + video = self._download_json( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, + video_id, 'Downloading video JSON') + + media = video['web_media'][0] formats = [] - for rendition in video_data['RENDITIONS']: + for rendition in media['RENDITIONS']: + video_url = rendition.get('PMD_URL') + if not video_url: + continue + tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % int(tbr) formats.append({ 'url': rendition['PMD_URL'], + 'format_id': format_id, 'width': int_or_none(rendition.get('FRAME_WIDTH')), 'height': int_or_none(rendition.get('FRAME_HEIGHT')), - 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + 'tbr': tbr, }) - # TODO: parse f4m manifest - request = compat_urllib_request.Request( - 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', - headers={'Referer': url}) - token_data = self._download_json(request, video_id, fatal=False) - if token_data: - m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + hls_url = media.get('HLS_SURL') + if hls_url: + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json( + request, video_id, 'Downloading auth token', fatal=False) + if token_data: + token = token_data.get('token') + if token: + m3u8_formats = self._extract_m3u8_formats( + '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + if hds_url: + f4m_formats = self._extract_f4m_formats( + '%s?hdcore=3.7.0' % hds_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) 
self._sort_formats(formats) return { 'id': video_id, - 'title': video_data['NAME_ES'], - 'description': video_data.get('SHORT_DESC_ES'), - 'thumbnail': video_data.get('STILL_URL'), - 'duration': int_or_none(video_data.get('LENGTH')), - 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], + 'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'), + 'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'), + 'duration': float_or_none(media.get('LENGTH'), 1000), + 'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '), + 'tags': media.get('TAGS'), 'formats': formats, } From 999079b4543b4cd5e71a235865fbfefd349eb064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 15:49:11 +0600 Subject: [PATCH 142/415] [eitb] Improve hds extraction --- youtube_dl/extractor/eitb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 0de8d3dc6..357a2196c 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -70,10 +70,11 @@ def _real_extract(self, url): if m3u8_formats: formats.extend(m3u8_formats) - hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + hds_url = media.get('HDS_SURL') if hds_url: f4m_formats = self._extract_f4m_formats( - '%s?hdcore=3.7.0' % hds_url, video_id, f4m_id='hds', fatal=False) + '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), + video_id, f4m_id='hds', fatal=False) if f4m_formats: formats.extend(f4m_formats) From ab6ca0480280abb2a35a54e1b380bbae07a48863 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Nov 2015 14:20:10 +0100 Subject: [PATCH 143/415] release 2015.11.01 --- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/docs/supportedsites.md b/docs/supportedsites.md index 03561b87d..805af14a0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -93,6 +93,7 @@ # Supported sites - **Clipsyndicate** - **Cloudy** - **Clubic** + - **Clyp** - **cmt.com** - **CNET** - **CNN** @@ -281,7 +282,7 @@ # Supported sites - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **Malemotion** - - **MDR** + - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **metacafe** - **Metacritic** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 125e8ccf5..006b973c0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.24' +__version__ = '2015.11.01' From c90d16cf36d8edf03f4dc923ee9dbeadca910844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 2 Nov 2015 04:26:20 +0600 Subject: [PATCH 144/415] [utils:sanitize_path] Disallow trailing whitespace in path segment (Closes #7332) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index efd5f4ae1..7b3f79141 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -366,7 +366,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) From eb97f46e8bd9cb04f0fe5f8a5c13aeeaabeefef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 2 Nov 2015 12:46:10 +0100 Subject: [PATCH 145/415] [mitele] Fix extraction and update test checksum (fixes #7343) --- youtube_dl/extractor/mitele.py | 8 ++++++-- 1 file changed, 6 
insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3142fcde2..c595f2077 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( encode_dict, get_element_by_attribute, @@ -15,7 +18,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', + 'md5': '0ff1a13aebb35d9bc14081ff633dd324', 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', @@ -34,6 +37,7 @@ def _real_extract(self, url): config_url = self._search_regex( r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') + config_url = compat_urlparse.urljoin(url, config_url) config = self._download_json( config_url, display_id, 'Downloading config JSON') From c514b0ec655b23e7804eb18df04daa863d973f32 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 22:12:20 +0100 Subject: [PATCH 146/415] [videofy.me] fix info extraction Closes #7339. 
--- youtube_dl/extractor/videofyme.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f9e9be9..cd3f50a63 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -2,8 +2,8 @@ from .common import InfoExtractor from ..utils import ( - find_xpath_attr, int_or_none, + parse_iso8601, ) @@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor): 'id': '1100701', 'ext': 'mp4', 'title': 'This is VideofyMe', - 'description': None, + 'description': '', + 'upload_date': '20130326', + 'timestamp': 1364288959, 'uploader': 'VideofyMe', 'uploader_id': 'thisisvideofyme', 'view_count': int, + 'likes': int, + 'comment_count': int, }, - } def _real_extract(self, url): video_id = self._match_id(url) - config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, - video_id) - video = config.find('video') - sources = video.find('sources') - url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) - for key in ['on', 'av', 'off']] if node is not None) - video_url = url_node.find('url').text - view_count = int_or_none(self._search_regex( - r'([0-9]+)', video.find('views').text, 'view count', fatal=False)) + + config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo'] + + video = config.get('video') + blog = config.get('blog', {}) return { 'id': video_id, - 'title': video.find('title').text, - 'url': video_url, - 'thumbnail': video.find('thumb').text, - 'description': video.find('description').text, - 'uploader': config.find('blog/name').text, - 'uploader_id': video.find('identifier').text, - 'view_count': view_count, + 'title': video['title'], + 'url': video['sources']['source']['url'], + 'thumbnail': video.get('thumb'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'uploader': 
blog.get('name'), + 'uploader_id': blog.get('identifier'), + 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)), + 'likes': int_or_none(video.get('likes')), + 'comment_count': int_or_none(video.get('nrOfComments')), } From 6a750402787dfc1f39a9ad347f2d78ae1c94c52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 2 Nov 2015 14:08:38 +0100 Subject: [PATCH 147/415] [utils] unified_strdate: Return None if the date format can't be recognized (fixes #7340) This issue was introduced with ae12bc3ebb4cb377c2b4337ec255e652b36f5143, it returned 'None'. --- test/test_utils.py | 1 + youtube_dl/utils.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3298315d2..01829f71e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -236,6 +236,7 @@ def test_unified_dates(self): unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) def test_find_xpath_attr(self): testxml = '''<root> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b3f79141..d39f313a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -911,7 +911,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return compat_str(upload_date) + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): From a230068ff7427c19e29331fc0f2bb17d50003bca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Nov 2015 16:18:54 +0100 Subject: [PATCH 148/415] release 2015.11.02 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/version.py b/youtube_dl/version.py index 006b973c0..6ef482b78 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.11.01' +__version__ = '2015.11.02' From dde9fe9788f23f168e0bddaf8ab0470f469165fa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 Nov 2015 18:36:54 +0800 Subject: [PATCH 149/415] [democracynow] Simplify --- youtube_dl/extractor/democracynow.py | 86 ++++++++++++++-------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 05cfc7502..824b8e2c5 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -2,11 +2,18 @@ from __future__ import unicode_literals import re +import os.path + from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + url_basename, + remove_start, +) class DemocracynowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P<id>[^\?]*)' + _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)' IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', @@ -14,9 +21,7 @@ class DemocracynowIE(InfoExtractor): 'id': '2015-0703-001', 'ext': 'mp4', 'title': 'July 03, 2015 - Democracy Now!', - 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', - 'uploader': 'Democracy Now', - 'upload_date': None, + 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick 
Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', @@ -25,60 +30,57 @@ class DemocracynowIE(InfoExtractor): 'ext': 'mp4', 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', - 'uploader': 'Democracy Now', - 'upload_date': None, }, }] def _real_extract(self, url): display_id = self._match_id(url) - base_host = re.search(r'^(.+?://[^/]+)', url).group(1) - if display_id == '': - display_id = 'home' webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') - js = self._parse_json(jstr, display_id) + js = self._parse_json(self._search_regex( + r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), + display_id) video_id = None formats = [] + + default_lang = 'en' + subtitles = {} - for key in ('caption_file', '.......'): - # ....... = pending vtt support that doesn't clobber srt 'chapter_file': - url = js.get(key, '') - if url == '' or url is None: - continue - if not re.match(r'^https?://', url): - url = base_host + url - ext = re.search(r'\.([^\.]+)$', url).group(1) - subtitles['eng'] = [{ - 'ext': ext, - 'url': url, - }] - for key in ('file', 'audio', 'video'): - url = js.get(key, '') - if url == '' or url is None: - continue - if not re.match(r'^https?://', url): - url = base_host + url - purl = re.search(r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)', url) - if video_id is None: - video_id = purl.group('fn') - if js.get('start') is not None: - url += '&' if purl.group('hasparams') == '?' else '?' 
- url = url + 'start=' + str(js.get('start')) - formats.append({ - 'format_id': purl.group('dir'), - 'ext': purl.group('ext'), - 'url': url, + + def add_subtitle_item(lang, info_dict): + if lang not in subtitles: + subtitles[lang] = [] + subtitles[lang].append(info_dict) + + # chapter_file are not subtitles + if 'caption_file' in js: + add_subtitle_item(default_lang, { + 'url': compat_urlparse.urljoin(url, js['caption_file']), }) + + for subtitle_item in js.get('captions', []): + lang = subtitle_item.get('language', '').lower() or default_lang + add_subtitle_item(lang, { + 'url': compat_urlparse.urljoin(url, subtitle_item['url']), + }) + + for key in ('file', 'audio', 'video'): + media_url = js.get(key, '') + if not media_url: + continue + media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') + formats.append({ + 'url': media_url, + }) + self._sort_formats(formats) - ret = { + + return { 'id': video_id, 'title': js.get('title'), 'description': description, - 'uploader': 'Democracy Now', 'subtitles': subtitles, 'formats': formats, } - return ret From fc68d52bb95dc81ed3d05a5c5397cd3f35ee093a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 Nov 2015 21:24:10 +0800 Subject: [PATCH 150/415] [democracynow] Add MD5 sums --- youtube_dl/extractor/democracynow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 824b8e2c5..70c364e8b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -17,6 +17,7 @@ class DemocracynowIE(InfoExtractor): IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', @@ -25,6 +26,7 @@ class DemocracynowIE(InfoExtractor): }, }, { 'url': 
'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', From 852fad922ffa931b3c90b0b9fdb2fa1c7f965ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 20:53:17 +0600 Subject: [PATCH 151/415] [vimeo] Fix non-ASCII video passwords (Closes #7352) --- youtube_dl/extractor/vimeo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2437ae1eb..cc0d337e8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -13,6 +13,7 @@ compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, InAdvancePagedList, int_or_none, @@ -208,10 +209,10 @@ def _verify_video_password(self, url, video_id, webpage): if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'password': password, 'token': token, - }) + })) if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') From 0a0110fc6bbd21850e25541fd0bd4b602ce194e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:01:09 +0600 Subject: [PATCH 152/415] [vimeo] Fix non-ASCII video passwords (2) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cc0d337e8..fa07bd59c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -228,7 +228,7 @@ def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword', None) if password is None: raise 
ExtractorError('This video is protected by a password, use the --video-password option') - data = compat_urllib_parse.urlencode({'password': password}) + data = urlencode_postdata(encode_dict({'password': password})) pass_url = url + '/check-password' password_request = compat_urllib_request.Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') From 3fa3ff1bc36aaf82ac0a5e880304cb7aae217b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:06:36 +0600 Subject: [PATCH 153/415] [vimeo] Fix non-ASCII login --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa07bd59c..46fb36f21 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -41,13 +41,13 @@ def _login(self): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - }) + })) login_request = compat_urllib_request.Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Cookie', 'vuid=%s' % vuid) From bfdf891fd36811909aa5d83dc0614eacbb634fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:09:24 +0600 Subject: [PATCH 154/415] [vimeo] Fix non-ASCII album passwords --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 46fb36f21..b608740b8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -489,7 +489,7 @@ def _login_list_password(self, page_url, 
list_id, webpage): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(fields) + post = urlencode_postdata(encode_dict(fields)) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) From fd8102820c4d14fdb1ff7e090553211717012f67 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:09:55 +0800 Subject: [PATCH 155/415] [democracynow] Rename js to json_data --- youtube_dl/extractor/democracynow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 70c364e8b..72fc75d80 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -40,7 +40,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - js = self._parse_json(self._search_regex( + json_data = self._parse_json(self._search_regex( r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), display_id) video_id = None @@ -56,19 +56,19 @@ def add_subtitle_item(lang, info_dict): subtitles[lang].append(info_dict) # chapter_file are not subtitles - if 'caption_file' in js: + if 'caption_file' in json_data: add_subtitle_item(default_lang, { - 'url': compat_urlparse.urljoin(url, js['caption_file']), + 'url': compat_urlparse.urljoin(url, json_data['caption_file']), }) - for subtitle_item in js.get('captions', []): + for subtitle_item in json_data.get('captions', []): lang = subtitle_item.get('language', '').lower() or default_lang add_subtitle_item(lang, { 'url': compat_urlparse.urljoin(url, subtitle_item['url']), }) for key in ('file', 'audio', 'video'): - media_url = js.get(key, '') + media_url = json_data.get(key, '') if not media_url: continue media_url = re.sub(r'\?.*', '', 
compat_urlparse.urljoin(url, media_url)) @@ -81,7 +81,7 @@ def add_subtitle_item(lang, info_dict): return { 'id': video_id, - 'title': js.get('title'), + 'title': json_data.get('title'), 'description': description, 'subtitles': subtitles, 'formats': formats, From 0aeb9a106e1aad37967e0ee666ed816a7d5eb7c2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:13:00 +0800 Subject: [PATCH 156/415] [democracynow] Prevent required fields to be None --- youtube_dl/extractor/democracynow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 72fc75d80..6cd395e11 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -80,8 +80,8 @@ def add_subtitle_item(lang, info_dict): self._sort_formats(formats) return { - 'id': video_id, - 'title': json_data.get('title'), + 'id': video_id or display_id, + 'title': json_data['title'], 'description': description, 'subtitles': subtitles, 'formats': formats, From 66d041f250f7d3e0c4d501e3b98721f2c6588c35 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:53:30 +0800 Subject: [PATCH 157/415] [test/subtitles] Add test for DemocracynowIE --- test/test_subtitles.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 0343967d9..75f0ea75f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -28,6 +28,7 @@ ThePlatformFeedIE, RTVEALaCartaIE, FunnyOrDieIE, + DemocracynowIE, ) @@ -346,5 +347,25 @@ def test_allsubtitles(self): self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') +class TestDemocracynowSubtitles(BaseTestSubtitles): + url = 'http://www.democracynow.org/shows/2015/7/3' + IE = DemocracynowIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles 
= self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + + def test_subtitles_in_page(self): + self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + + if __name__ == '__main__': unittest.main() From ad607563a2fbb5275ea39f7a052c09ffa232e271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:46:26 +0600 Subject: [PATCH 158/415] [globo] Separate article extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/globo.py | 140 +++++++++++++++++-------------- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 10286aa88..94150a28f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -212,7 +212,10 @@ from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 33d6432a6..828e40d76 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -18,75 +18,52 @@ class GloboIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 
'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - _VIDEOID_REGEXES = [ - r'\bdata-video-id="(\d+)"', - r'\bdata-player-videosids="(\d+)"', - r'<div[^>]+\bid="(\d+)"', - ] - _RESIGN_EXPIRATION = 86400 - _TESTS = [ - { - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, - { - 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', - 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', - 'info_dict': { - 'id': '3607726', - 'ext': 'mp4', - 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', - 'duration': 103.204, - 'uploader': 'Globo.com', - 'uploader_id': 265, - 'like_count': int, - } - }, - { - 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', - 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', - 'info_dict': { - 'id': '3652183', - 'ext': 'mp4', - 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', - 'duration': 110.711, - 'uploader': 'Rede Globo', - 'uploader_id': 196, - 'like_count': int, - } - }, - { - 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', - 'info_dict': { - 'id': '3928201', - 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - 'like_count': int, - } - }, - ] 
+ _TESTS = [{ + 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', + 'md5': '03ebf41cb7ade43581608b7d9b71fab0', + 'info_dict': { + 'id': '3654973', + 'ext': 'mp4', + 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', + 'duration': 251.585, + 'uploader': 'SporTV', + 'uploader_id': 698, + 'like_count': int, + } + }, { + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'Globo.com', + 'uploader_id': 265, + 'like_count': int, + } + }, { + 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', + 'md5': 'c1defca721ce25b2354e927d3e4b3dec', + 'info_dict': { + 'id': '3928201', + 'ext': 'mp4', + 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', + 'duration': 1472.906, + 'uploader': 'Canal Brasil', + 'uploader_id': 705, + 'like_count': int, + } + }] - class MD5(): + class MD5: HEX_FORMAT_LOWERCASE = 0 HEX_FORMAT_UPPERCASE = 1 BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' @@ -353,9 +330,6 @@ def lshift(value, count): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') - video = self._download_json( self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] @@ -417,3 +391,39 @@ def _real_extract(self, url): 'like_count': like_count, 'formats': formats } + + +class GloboArticleIE(InfoExtractor): + _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html' + + _VIDEOID_REGEXES = [ + 
r'\bdata-video-id=["\'](\d{7,})', + r'\bdata-player-videosids=["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bdata-id=["\'](\d{7,})', + r'<div[^>]+\bid=["\'](\d{7,})', + ] + + _TEST = { + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', + 'info_dict': { + 'id': '3652183', + 'ext': 'mp4', + 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', + 'duration': 110.711, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + 'like_count': int, + } + } + + @classmethod + def suitable(cls, url): + return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + return self.url_result('globo:%s' % video_id, 'Globo') From e3778cce0e912f803ea10cb806406f7fcafe840f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:51:19 +0600 Subject: [PATCH 159/415] [globo] Improve m3u8 extraction --- youtube_dl/extractor/globo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 828e40d76..c28899011 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -367,7 +367,10 @@ def _real_extract(self, url): resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(signed_url, resource_id, 'mp4')) + m3u8_formats = self._extract_m3u8_formats( + signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': 
signed_url, From c3459d24f16056e8ae8f982db2a10871ef18e80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:53:21 +0600 Subject: [PATCH 160/415] [globo] Skip unsupported smooth streaming --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c28899011..ec451bb07 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -338,7 +338,7 @@ def _real_extract(self, url): formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id: + if not resource_id or resource_id.endswith('manifest'): continue security = self._download_json( From 5d235ca7f66af1f82c1a4d753d238f48fc3afa40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:55:39 +0600 Subject: [PATCH 161/415] [globo] Prefer native m3u8 --- youtube_dl/extractor/globo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index ec451bb07..2a805cbb2 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -368,7 +368,8 @@ def _real_extract(self, url): signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): m3u8_formats = self._extract_m3u8_formats( - signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) else: From b4ef6a0038657c1adde565df947e42ad1e1b4195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:27 +0600 Subject: [PATCH 162/415] [globo] Remove non available test --- youtube_dl/extractor/globo.py | 12 ------------ 1 file changed, 12 deletions(-) 
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 2a805cbb2..8aada01dc 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -26,18 +26,6 @@ class GloboIE(InfoExtractor): _RESIGN_EXPIRATION = 86400 _TESTS = [{ - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, { 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', 'info_dict': { From aebb42d32b608eaffb424e5e7c22f1b68a491e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:55 +0600 Subject: [PATCH 163/415] [globo] Remove like count It's no longer provided --- youtube_dl/extractor/globo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8aada01dc..dc89e46ac 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,7 +35,6 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - 'like_count': int, } }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', @@ -47,7 +46,6 @@ class GloboIE(InfoExtractor): 'duration': 1472.906, 'uploader': 'Canal Brasil', 'uploader_id': 705, - 'like_count': int, } }] @@ -370,7 +368,6 @@ def _real_extract(self, url): self._sort_formats(formats) duration = float_or_none(video.get('duration'), 1000) - like_count = 
int_or_none(video.get('likes')) uploader = video.get('channel') uploader_id = video.get('channel_id') @@ -380,7 +377,6 @@ def _real_extract(self, url): 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'like_count': like_count, 'formats': formats } @@ -406,7 +402,6 @@ class GloboArticleIE(InfoExtractor): 'duration': 110.711, 'uploader': 'Rede Globo', 'uploader_id': 196, - 'like_count': int, } } From a4a6b7b80f18680ee0a8bba50a24c58edd3f2a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:03:45 +0600 Subject: [PATCH 164/415] [globo] Improve http formats --- youtube_dl/extractor/globo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc89e46ac..64622aa5c 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -361,8 +361,8 @@ def _real_extract(self, url): else: formats.append({ 'url': signed_url, - 'format_id': resource_id, - 'height': resource.get('height'), + 'format_id': 'http-%s' % resource_id, + 'height': int_or_none(resource.get('height')), }) self._sort_formats(formats) From 264cd00fff4f6d7063d43e1d476de46901bd9c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:10:45 +0600 Subject: [PATCH 165/415] [globo] Update tests --- youtube_dl/extractor/globo.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 64622aa5c..0337256ed 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,18 +35,30 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - } + }, + }, { + 'url': 'http://globoplay.globo.com/v/4581987/', + 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', + 'info_dict': { + 'id': '4581987', + 'ext': 'mp4', + 'title': 
'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', + 'duration': 137.973, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + }, + }, { + 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', + 'only_matching': True, + }, { + 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', + 'only_matching': True, }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', - 'info_dict': { - 'id': '3928201', - 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - } + 'only_matching': True, }] class MD5: From e7d34c03f200e178e9d6dfe4ae3f6856e382a4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:12:42 +0600 Subject: [PATCH 166/415] [globo] Force uploader id to be string --- youtube_dl/extractor/globo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 0337256ed..6c0fc54de 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -14,6 +14,7 @@ ExtractorError, float_or_none, int_or_none, + str_or_none, ) @@ -34,7 +35,7 @@ class GloboIE(InfoExtractor): 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'duration': 103.204, 'uploader': 'Globo.com', - 'uploader_id': 265, + 'uploader_id': '265', }, }, { 'url': 'http://globoplay.globo.com/v/4581987/', @@ -45,7 +46,7 @@ class GloboIE(InfoExtractor): 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', 'duration': 137.973, 'uploader': 'Rede Globo', - 
'uploader_id': 196, + 'uploader_id': '196', }, }, { 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', @@ -381,7 +382,7 @@ def _real_extract(self, url): duration = float_or_none(video.get('duration'), 1000) uploader = video.get('channel') - uploader_id = video.get('channel_id') + uploader_id = str_or_none(video.get('channel_id')) return { 'id': video_id, From c13722480bebfb1fc33169516790df2e99b3e499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:13:35 +0600 Subject: [PATCH 167/415] [globo:article] Fix test --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 6c0fc54de..5883be704 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -414,7 +414,7 @@ class GloboArticleIE(InfoExtractor): 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', 'duration': 110.711, 'uploader': 'Rede Globo', - 'uploader_id': 196, + 'uploader_id': '196', } } From 5d501a0901c36695c9d6ca3958ac4ccfdea90954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:42:11 +0600 Subject: [PATCH 168/415] [globo] Add more tests --- youtube_dl/extractor/globo.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 5883be704..c65ef6bcf 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -60,6 +60,9 @@ class GloboIE(InfoExtractor): }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', 'only_matching': True, + }, { + 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', + 'only_matching': True, }] class MD5: @@ 
-405,7 +408,7 @@ class GloboArticleIE(InfoExtractor): r'<div[^>]+\bid=["\'](\d{7,})', ] - _TEST = { + _TESTS = [{ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', 'info_dict': { @@ -416,7 +419,13 @@ class GloboArticleIE(InfoExtractor): 'uploader': 'Rede Globo', 'uploader_id': '196', } - } + }, { + 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', + 'only_matching': True, + }, { + 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', + 'only_matching': True, + }] @classmethod def suitable(cls, url): From 17d1900581ffd12866e56640080ce340d99149a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:57:46 +0600 Subject: [PATCH 169/415] [vk] Fix view count extraction (Closes #7353) --- youtube_dl/extractor/vk.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 765e9e6fd..01960b827 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -281,9 +281,13 @@ def _real_extract(self, url): mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - view_count = str_to_int(self._search_regex( - r'"mv_views_count_number"[^>]*>([\d,.]+) views<', - info_page, 'view count', fatal=False)) + view_count = None + views = self._html_search_regex( + r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', + info_page, 'view count', fatal=False) + if views: + view_count = str_to_int(self._search_regex( + r'([\d,.]+)', views, 'view count', fatal=False)) formats = [{ 'format_id': k, From cb5a470635ea2ad91f18d33e391979aabb0755fb Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Nov 2015 16:18:51 +0100 Subject: [PATCH 170/415] [vimeo] Remove unused import --- youtube_dl/extractor/vimeo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b608740b8..ca716c8f5 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) From 44b2264feae331eeb34e83eed1387def3d61a437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:12:24 +0600 Subject: [PATCH 171/415] [youtube] Prefer video_info with token available --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d7eda7aa7..5eeb3c663 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,8 @@ def add_dash_mpd(video_info): if not video_info: video_info = get_video_info if 'token' in get_video_info: + if 'token' not in video_info: + video_info = get_video_info break if 'token' not in video_info: if 'reason' in video_info: From 89ea063eebae84792a7ccb968533ff8bf6a41d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:49:23 +0600 Subject: [PATCH 172/415] [youtube] Clarify rationale for preferring a video info with token (#7362) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5eeb3c663..e2a43299f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,15 @@ def add_dash_mpd(video_info): if not video_info: video_info = get_video_info if 'token' in 
get_video_info: + # Different get_video_info requests may report different results, e.g. + # some may report video unavailability, but some may serve it without + # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, + # the original webpage as well as el=info and el=embedded get_video_info + # requests report video unavailability due to geo restriction while + # el=detailpage succeeds and returns valid data). This is probably + # due to YouTube measures against IP ranges of hosting providers. + # Working around by preferring the first succeeded video_info containing + # the token if no such video_info yet was found. if 'token' not in video_info: video_info = get_video_info break From f93ded98522cc1272a8d2210738937132292afc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 01:54:49 +0600 Subject: [PATCH 173/415] [prosiebensat1] Add support for .ch domains (Closes #7365) --- youtube_dl/extractor/prosiebensat1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index effcf1db3..baa54a3af 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -20,7 +20,7 @@ class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { From b15c44cd36831f175e9dd4081b82beb8075790b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:51:30 +0600 Subject: [PATCH 174/415] [periscope] Add support for videos with broadcast_id (Closes #7359) --- youtube_dl/extractor/periscope.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 8ad936758..0f9d7576f 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -27,9 +27,10 @@ class PeriscopeIE(InfoExtractor): 'skip': 'Expires in 24 hours', } - def _call_api(self, method, token): + def _call_api(self, method, value): + attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) def _real_extract(self, url): token = self._match_id(url) From 2549e113b8750a493917436d4fd15ed74a1a4983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:55:53 +0600 Subject: [PATCH 175/415] [periscope] Add test for broadcast_id based URL --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0f9d7576f..7621d9e99 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,7 +12,7 @@ class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', 'info_dict': { @@ -25,7 +25,10 @@ class PeriscopeIE(InfoExtractor): 'uploader_id': '1465763', }, 'skip': 'Expires in 24 hours', - } + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }] def _call_api(self, method, value): attribute = 'token' if len(value) > 13 else 'broadcast_id' From 53472df85793cc89deb779c2ffc3ae1f47292fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:56:44 +0600 Subject: [PATCH 176/415] [periscope] Add note on where to find alive example URLs --- youtube_dl/extractor/periscope.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 7621d9e99..887c8020d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,6 +12,7 @@ class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' + # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', From b3613d36da14ab527166326707c0f911d192144d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 04:37:51 +0600 Subject: [PATCH 177/415] [YoutubeDL] Sanitize path after output template substitution (Closes #7367) --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 12977bf80..1783ce01b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -572,7 +572,7 @@ def prepare_filename(self, info_dict): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 @@ -580,7 +580,7 @@ def prepare_filename(self, info_dict): # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = encodeFilename(filename, True).decode(preferredencoding()) - return filename + return sanitize_path(filename) except ValueError as err: 
self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None From 6953d8e95a78e83f087693b7353baab96b09fbdd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 02:09:55 +0100 Subject: [PATCH 178/415] [miomio] fix info extraction (fixes #7366) --- youtube_dl/extractor/miomio.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index a784fc5fb..3f812e005 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -4,6 +4,7 @@ import random from .common import InfoExtractor +from ..compat import compat_urllib_request from ..utils import ( xpath_text, int_or_none, @@ -60,10 +61,12 @@ def _real_extract(self, url): 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), video_id) - # the following xml contains the actual configuration information on the video file(s) - vid_config = self._download_xml( + vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - video_id) + headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + + # the following xml contains the actual configuration information on the video file(s) + vid_config = self._download_xml(vid_config_request, video_id) http_headers = { 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, From e68dd1921ad7528d225a8571066f99b9934b6a06 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 06:33:05 +0100 Subject: [PATCH 179/415] [miomio] use the formats urls headers for downloading xml --- youtube_dl/extractor/miomio.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 3f812e005..6f40bf1b9 100644 --- a/youtube_dl/extractor/miomio.py +++ 
b/youtube_dl/extractor/miomio.py @@ -52,6 +52,8 @@ def _real_extract(self, url): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path,} + xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', webpage, 'xml config') @@ -63,15 +65,11 @@ def _real_extract(self, url): vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + headers=http_headers) # the following xml contains the actual configuration information on the video file(s) vid_config = self._download_xml(vid_config_request, video_id) - http_headers = { - 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, - } - if not int_or_none(xpath_text(vid_config, 'timelength')): raise ExtractorError('Unable to load videos!', expected=True) From 5003e4283b35acb82ea9793d91bc3cd1ee679f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:06:44 +0600 Subject: [PATCH 180/415] [ndr] Relax _VALID_URL (Closes #7383) --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ba06d8a98..a2b51ccb3 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -23,7 +23,7 @@ def _real_extract(self, url): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', From 01003d072c20c2ed095930d87c5ce3d5610e66b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:07:52 +0600 Subject: [PATCH 181/415] [ndr] Add test for #7383 --- youtube_dl/extractor/ndr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index a2b51ccb3..0be866681 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -78,6 +78,9 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', + 'only_matching': True, }] def _extract_embed(self, webpage, display_id): From 1e2eb4b40d46f39d15c067657ecad16fa3b2121d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:21 +0600 Subject: [PATCH 182/415] [njoy] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0be866681..7043c7e0f 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -105,7 +105,7 @@ def _extract_embed(self, webpage, display_id): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', From 81413c01651eddcc5180af379f2ce3689a376051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:52 +0600 Subject: [PATCH 183/415] [ndr:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 7043c7e0f..477ce4e6b 
100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -238,7 +238,7 @@ def _real_extract(self, url): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From 92366d189ef280b8ba0057930c54aa14b0ecdd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:09:17 +0600 Subject: [PATCH 184/415] [njoy:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 477ce4e6b..16213eed9 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -332,7 +332,7 @@ class NDREmbedIE(NDREmbedBaseIE): class NJoyEmbedIE(NDREmbedBaseIE): IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ # httpVideo 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', From deb85c32bbd32e8d280e1919432a11c0bdaa26bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:56:31 +0600 Subject: [PATCH 185/415] [postprocessor/ffmpeg] Use ffmpeg as prefix since it's used all over the places (Closes #7371) --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 4f320e124..5ed723bc6 100644 --- 
a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -272,7 +272,7 @@ def run(self, information): return [], information try: - self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path) + self._downloader.to_screen('[ffmpeg] Destination: ' + new_path) self.run_ffmpeg(path, new_path, acodec, more_opts) except AudioConversionError as e: raise PostProcessingError( From 179ffab69c3359ab7d0a7b0a2b63c94d8c70af67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:06:13 +0600 Subject: [PATCH 186/415] [lynda:course] Force log out (Closes #7361) --- youtube_dl/extractor/lynda.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5c973e75c..67f2025de 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -82,6 +82,11 @@ def _login(self): expected=True) raise ExtractorError('Unable to log in') + def _logout(self): + self._download_webpage( + 'http://www.lynda.com/ajax/logout.aspx', None, + 'Logging out', 'Unable to log out', fatal=False) + class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' @@ -210,6 +215,8 @@ def _real_extract(self, url): course_id, 'Downloading course JSON') course_json = json.loads(page) + self._logout() + if 'Status' in course_json and course_json['Status'] == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) From 71bb016160744a80fecaadf5b75b0dc2b1e8089b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:10:07 +0600 Subject: [PATCH 187/415] [lynda:course] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 67f2025de..98474ded9 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py 
@@ -210,14 +210,13 @@ def _real_extract(self, url): course_path = mobj.group('coursepath') course_id = mobj.group('courseid') - page = self._download_webpage( + course = self._download_json( 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, course_id, 'Downloading course JSON') - course_json = json.loads(page) self._logout() - if 'Status' in course_json and course_json['Status'] == 'NotFound': + if course.get('Status') == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) @@ -227,12 +226,14 @@ def _real_extract(self, url): # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore - for chapter in course_json['Chapters']: - for video in chapter['Videos']: - if video['HasAccess'] is False: + for chapter in course['Chapters']: + for video in chapter.get('Videos', []): + if video.get('HasAccess') is False: unaccessible_videos += 1 continue - videos.append(video['ID']) + video_id = video.get('ID') + if video_id: + videos.append(video_id) if unaccessible_videos > 0: self._downloader.report_warning( @@ -245,6 +246,6 @@ def _real_extract(self, url): 'Lynda') for video_id in videos] - course_title = course_json['Title'] + course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) From ea8ed40b2fb70fc2f01aba475128821078873d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:24:39 +0600 Subject: [PATCH 188/415] [lynda] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 52 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 98474ded9..c8a16842e 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -113,51 +113,47 @@ class LyndaIE(LyndaBaseIE): def _real_extract(self, url): video_id = 
self._match_id(url) - page = self._download_webpage( + video = self._download_json( 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, 'Downloading video JSON') - video_json = json.loads(page) - if 'Status' in video_json: + if 'Status' in video: raise ExtractorError( - 'lynda returned error: %s' % video_json['Message'], expected=True) + 'lynda returned error: %s' % video['Message'], expected=True) - if video_json['HasAccess'] is False: + if video.get('HasAccess') is False: self.raise_login_required('Video %s is only available for members' % video_id) - video_id = compat_str(video_json['ID']) - duration = video_json['DurationInSeconds'] - title = video_json['Title'] + video_id = compat_str(video.get('ID') or video_id) + duration = int_or_none(video.get('DurationInSeconds')) + title = video['Title'] formats = [] - fmts = video_json.get('Formats') + fmts = video.get('Formats') if fmts: - formats.extend([ - { - 'url': fmt['Url'], - 'ext': fmt['Extension'], - 'width': fmt['Width'], - 'height': fmt['Height'], - 'filesize': fmt['FileSize'], - 'format_id': str(fmt['Resolution']) - } for fmt in fmts]) + formats.extend([{ + 'url': f['Url'], + 'ext': f.get('Extension'), + 'width': int_or_none(f.get('Width')), + 'height': int_or_none(f.get('Height')), + 'filesize': int_or_none(f.get('FileSize')), + 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, + } for f in fmts if f.get('Url')]) - prioritized_streams = video_json.get('PrioritizedStreams') + prioritized_streams = video.get('PrioritizedStreams') if prioritized_streams: for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': '%s-%s' % (prioritized_stream_id, format_id), - } for format_id, video_url in prioritized_stream.items() - ]) + formats.extend([{ + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, 
format_id), + } for format_id, video_url in prioritized_stream.items()]) self._check_formats(formats, video_id) self._sort_formats(formats) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id) return { 'id': video_id, @@ -188,7 +184,7 @@ def _fix_subtitles(self, subs): if srt: return srt - def _get_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) if subs: From ae4ddf9efae816f4d52fc584c93e4f0e3c79c410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:27:38 +0600 Subject: [PATCH 189/415] [lynda] PEP 8 --- youtube_dl/extractor/lynda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index c8a16842e..9a207b2cd 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -227,9 +227,8 @@ def _real_extract(self, url): if video.get('HasAccess') is False: unaccessible_videos += 1 continue - video_id = video.get('ID') - if video_id: - videos.append(video_id) + if video.get('ID'): + videos.append(video['ID']) if unaccessible_videos > 0: self._downloader.report_warning( From 472404953a22811cc8156da110ea872a924f1f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:28:14 +0600 Subject: [PATCH 190/415] [miomio] PEP 8 --- youtube_dl/extractor/miomio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 6f40bf1b9..ce391c759 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -52,7 +52,7 @@ def _real_extract(self, url): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') - http_headers = {'Referer': 'http://www.miomio.tv%s' 
% mioplayer_path,} + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', From 0fa6b17dccd2347cb0611651fc04e36839d33a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:45:26 +0600 Subject: [PATCH 191/415] [pbs] Simplify and speed up player URL search --- youtube_dl/extractor/pbs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3448736a2..7b868d057 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -191,9 +191,13 @@ def _extract_webpage(self, url): if media_id: return media_id, presumptive_id, upload_date - url = self._search_regex( - r'(?s)<iframe[^>]+?(?:[a-z-]+?=["\'].*?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', - webpage, 'player URL') + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): + url = self._search_regex( + r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') From 686f98816ecbbcb224d1336682688b05cdb051a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:39:16 +0600 Subject: [PATCH 192/415] [pbs] Add support for flp frontlines (Closes #7369) --- youtube_dl/extractor/pbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 7b868d057..3169e9c3f 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ ExtractorError, determine_ext, int_or_none, + strip_jsonp, unified_strdate, US_RATINGS, ) @@ -191,6 +192,23 @@ def _extract_webpage(self, url): if media_id: return media_id, presumptive_id, upload_date + # Fronline video embedded via flp + video_id = 
self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + if video_id: + # pkg_id calculation is reverse engineered from + # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js + prg_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] + if 'q' in prg_id: + prg_id = prg_id.split('q')[1] + prg_id = int(prg_id, 16) + getdir = self._download_json( + 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, + presumptive_id, 'Downloading getdir JSON', + transform_source=strip_jsonp) + return getdir['mid'], presumptive_id, upload_date + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): url = self._search_regex( r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, From 8b6d9406db1d3361b006016e6aace54b05cb6fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:42:30 +0600 Subject: [PATCH 193/415] [pbs] Add test for flp frontline embeds --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3169e9c3f..a690f9c29 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -154,6 +154,22 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + # Frontline video embedded via flp2012.js + 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists', + 'info_dict': { + 'id': '2070868960', + 'display_id': 'the-atomic-artists', + 'ext': 'mp4', + 'title': 'FRONTLINE - The Atomic Artists', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 723, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, } ] _ERRORS = { From 21d0c33ecde573db961b97f5f0c37ba9d3c02ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 01:08:40 +0600 Subject: [PATCH 
194/415] [pbs] Make flp embed lookup non fatal --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a690f9c29..8fb9b1849 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -210,7 +210,7 @@ def _extract_webpage(self, url): # Fronline video embedded via flp video_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: # pkg_id calculation is reverse engineered from # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js From ee223abb88263bdda2d92c4b2139d1dca60ba3ae Mon Sep 17 00:00:00 2001 From: Mister Hat <misterhat144@gmail.com> Date: Tue, 3 Nov 2015 19:13:27 -0600 Subject: [PATCH 195/415] [vidzi] fixed. finds url from hash and host in script Closes #7386. --- youtube_dl/extractor/vidzi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 08a5a7b8d..2ba9f31df 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -20,8 +20,14 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + video_host = self._html_search_regex( + r'id=\'vplayer\'><img src="http://(.*?)/i', webpage, + 'video host') + video_hash = self._html_search_regex( + r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash') + ext = self._html_search_regex( + r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext') + video_url = 'http://' + video_host + '/' + video_hash + '/v.' 
+ ext title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') From 5d0f84d32cc038dd71673987cb6efaa85e953474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 06:23:00 +0600 Subject: [PATCH 196/415] [beeg] Skip empty URLs (Closes #7392) --- youtube_dl/extractor/beeg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e6c928699..61bc2f744 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,6 +33,8 @@ def _real_extract(self, url): formats = [] for format_id, video_url in video.items(): + if not video_url: + continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) if not height: From 5214f1e31d5e5ba692fb1ed4803ff71ef4e480e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:25:59 +0600 Subject: [PATCH 197/415] [crunchyroll] Fix title extraction (Closes #7396) --- youtube_dl/extractor/crunchyroll.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 0c9b8ca02..4243f3e2e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -287,7 +287,9 @@ def _real_extract(self, url): if 'To view this, please log in to verify you are 18 or older.' 
in webpage: self.raise_login_required() - video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: From 2c740cf28d257d2a915195e7cc60f83e6690d2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:29:09 +0600 Subject: [PATCH 198/415] [crunchyroll] Simplify description extraction --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4243f3e2e..9aa5d58b4 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -291,9 +291,8 @@ def _real_extract(self, url): r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') - if not video_description: - video_description = None + video_description = self._html_search_regex( + r'"description":"([^"]+)', webpage, 'video_description', default=None) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From 6d02b9a392d39c114d3fb58bf7965f62196ccecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 20:02:39 +0600 Subject: [PATCH 199/415] [crunchyroll] Fix description extraction --- 
youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9aa5d58b4..6e5999c72 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -21,6 +21,7 @@ bytes_to_intlist, intlist_to_bytes, int_or_none, + lowercase_escape, remove_end, unified_strdate, urlencode_postdata, @@ -104,7 +105,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', @@ -292,7 +293,10 @@ def _real_extract(self, url): webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex( - r'"description":"([^"]+)', webpage, 'video_description', default=None) + r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, + webpage, 'description', default=None) + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From cff551c0b0ed8eb55c1ab63ec669c07a51aa4998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 7 Nov 2015 18:43:22 +0100 Subject: [PATCH 200/415] [googleplus] Fix extraction of formats --- youtube_dl/extractor/googleplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index fcefe54cd..731bacd67 100644 --- 
a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -61,7 +61,7 @@ def unicode_escape(s): 'width': int(width), 'height': int(height), } for width, height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] + r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)] self._sort_formats(formats) return { From ee4337d100f68bbb2ae795101d4c391b522ec753 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 20:16:14 +0100 Subject: [PATCH 201/415] [videolecture] add support for multi part videos --- youtube_dl/extractor/videolecturesnet.py | 95 +++++++++++++++++------- 1 file changed, 70 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 649ac9433..351706362 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -10,17 +10,19 @@ from ..utils import ( ExtractorError, parse_duration, + js_to_json, + parse_iso8601, ) class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' 
IE_NAME = 'videolectures.net' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': 'promogram_igor_mekjavic_eng', + 'id': '20171_part1', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', @@ -32,7 +34,7 @@ class VideoLecturesNetIE(InfoExtractor): # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': 'russir2010_filippova_nlp', + 'id': '14891_part1', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', @@ -46,37 +48,80 @@ class VideoLecturesNetIE(InfoExtractor): }, { 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { - 'id': 'deeplearning2015_montreal', + 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'timestamp': 1438560000, }, 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'title': 'Introduction To Bayesian Inference', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }], + 'playlist_count': 2, }] def _real_extract(self, url): - video_id = self._match_id(url) + lecture_slug, part = re.match(self._VALID_URL, url).groups() - smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + webpage = self._download_webpage(url, lecture_slug) - try: - smil = self._download_smil(smil_url, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - # Probably a playlist - webpage = 
self._download_webpage(url, video_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) - playlist_description = self._html_search_meta('description', webpage, 'description') - return self.playlist_result(entries, video_id, playlist_title, playlist_description) + cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) - info = self._parse_smil(smil, smil_url, video_id) + lecture_id = str(cfg['obj_id']) - info['id'] = video_id + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } - return info + entries = [] + parts = cfg.get('videos') + if parts: + if len(parts) == 1: + part = str(parts[0]) + if part: + smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + info['id'] = '%s_part%s' % (lecture_id, part) + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + return info + else: + for part in parts: + entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + lecture_info['_type'] = 'multi_video' + else: + # Probably a playlist + entries = [ + 
self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + lecture_info['_type'] = 'playlist' + + lecture_info['entries'] = entries + return lecture_info From a06bf87a2c6009d82ec28afe566f653b3deb11bf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 21:23:41 +0100 Subject: [PATCH 202/415] [viidea] add support for sites using viidea service --- youtube_dl/extractor/__init__.py | 2 +- .../{videolecturesnet.py => viidea.py} | 33 ++++++++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) rename youtube_dl/extractor/{videolecturesnet.py => viidea.py} (77%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..0a90da73c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -724,7 +724,6 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -734,6 +733,7 @@ from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/viidea.py similarity index 77% rename from youtube_dl/extractor/videolecturesnet.py rename to youtube_dl/extractor/viidea.py index 351706362..71fb298e6 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/viidea.py @@ -15,9 +15,23 @@ ) -class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' 
- IE_NAME = 'videolectures.net' +class ViideaIE(InfoExtractor): + _VALID_URL = r'''(?x)http://(?:www\.)?(?: + videolectures\.net| + flexilearn\.viidea\.net| + presentations\.ocwconsortium\.org| + video\.travel-zoom\.si| + video\.pomp-forum\.si| + tv\.nil\.si| + video\.hekovnik.com| + video\.szko\.si| + kpk\.viidea\.com| + inside\.viidea\.net| + video\.kiberpipa\.org| + bvvideo\.si| + kongres\.viidea\.net| + edemokracija\.viidea\.com + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -87,7 +101,9 @@ def _real_extract(self, url): lecture_id = str(cfg['obj_id']) - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] + base_url = self._proto_relative_url(cfg['livepipe'], 'http:') + + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -104,7 +120,7 @@ def _real_extract(self, url): if len(parts) == 1: part = str(parts[0]) if part: - smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) info['id'] = '%s_part%s' % (lecture_id, part) @@ -114,13 +130,14 @@ def _real_extract(self, url): return info else: for part in parts: - entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist + playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 
'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] lecture_info['_type'] = 'playlist' lecture_info['entries'] = entries From 8e3a2bd6200660f9fb9d485b1c924fa5462bd566 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 17:43:23 +0100 Subject: [PATCH 203/415] [viidea] fix _VALID_URL regex and tests --- youtube_dl/extractor/viidea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 71fb298e6..ae9a42737 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -31,7 +31,7 @@ class ViideaIE(InfoExtractor): bvvideo\.si| kongres\.viidea\.net| edemokracija\.viidea\.com - )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -130,7 +130,7 @@ def _real_extract(self, url): return info else: for part in parts: - entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) + entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist From 6fdb39ded15c6276b49fa67cb517bf1fed63af35 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 20:38:33 +0100 Subject: [PATCH 204/415] [viidia] Cleaup [viidea] extract playlist if lecture is an event [viidia] use compat_str --- youtube_dl/extractor/viidea.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index ae9a42737..2541a36ed 100644 --- 
a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,11 +4,10 @@ from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_urlparse, + compat_str, ) from ..utils import ( - ExtractorError, parse_duration, js_to_json, parse_iso8601, @@ -97,9 +96,9 @@ def _real_extract(self, url): webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) - lecture_id = str(cfg['obj_id']) + lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') @@ -118,7 +117,7 @@ def _real_extract(self, url): parts = cfg.get('videos') if parts: if len(parts) == 1: - part = str(parts[0]) + part = compat_str(parts[0]) if part: smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) @@ -132,7 +131,7 @@ def _real_extract(self, url): for part in parts: entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' - else: + if not parts or lecture_data.get('type') == 'evt': # Probably a playlist playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ From e8ce2375e0851e65c4882002297404825fe1045e Mon Sep 17 00:00:00 2001 From: Sergey M? 
<dstftw@gmail.com> Date: Sun, 8 Nov 2015 06:54:27 +0600 Subject: [PATCH 205/415] [viidea] Improve and cleanup (Closes #7390) * Optimize requests for multipart videos * Fix cfg regex * Improve titles and identifiers --- youtube_dl/extractor/viidea.py | 99 ++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 2541a36ed..525e303d4 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -35,35 +35,42 @@ class ViideaIE(InfoExtractor): _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': '20171_part1', + 'id': '20171', + 'display_id': 'promogram_igor_mekjavic_eng', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, - 'thumbnail': 're:http://.*\.jpg', }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': '14891_part1', + 'id': '14891', + 'display_id': 'russir2010_filippova_nlp', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'duration': 5352, 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1284375600, + 'upload_date': '20100913', + 'duration': 5352, }, 'params': { # rtmp download 'skip_download': True, }, }, { + # event playlist 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -72,37 +79,54 @@ class ViideaIE(InfoExtractor): 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', 'info_dict': { 'id': '9737', + 'display_id': 
'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ 'info_dict': { 'id': '9737_part1', + 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 1)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 4622, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }, { 'info_dict': { 'id': '9737_part2', + 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 2)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 5641, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }], 'playlist_count': 2, }] def _real_extract(self, url): - lecture_slug, part = re.match(self._VALID_URL, url).groups() + lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex( + [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', + r'cfg\s*:\s*({[^}]+})'], + webpage, 'cfg'), lecture_slug, js_to_json) lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -113,31 +137,52 @@ def _real_extract(self, url): 'thumbnail': lecture_data.get('thumb'), } - entries = [] - parts = cfg.get('videos') + playlist_entries = [] + lecture_type = lecture_data.get('type') + parts = [compat_str(video) 
for video in cfg.get('videos', [])] if parts: - if len(parts) == 1: - part = compat_str(parts[0]) - if part: - smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) + multipart = len(parts) > 1 + + def extract_part(part_id): + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) - info['id'] = '%s_part%s' % (lecture_id, part) + info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) + info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) + if multipart: + info['title'] += ' (Part %s)' % part_id switch = smil.find('.//switch') if switch is not None: info['duration'] = parse_duration(switch.attrib.get('dur')) - return info + item_info = lecture_info.copy() + item_info.update(info) + return item_info + + if explicit_part_id or not multipart: + result = extract_part(explicit_part_id or parts[0]) else: - for part in parts: - entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) - lecture_info['_type'] = 'multi_video' - if not parts or lecture_data.get('type') == 'evt': - # Probably a playlist - playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + result = { + '_type': 'multi_video', + 'entries': [extract_part(part) for part in parts], + } + result.update(lecture_info) + + # Immediately return explicitly requested part or non event item + if explicit_part_id or lecture_type != 'evt': + return result + + playlist_entries.append(result) + + # It's probably a playlist + if not parts or lecture_type == 'evt': + playlist_webpage = self._download_webpage( + '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') - for _, video_url in 
re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] - lecture_info['_type'] = 'playlist' + for _, video_url in re.findall( + r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + playlist_entries.extend(entries) - lecture_info['entries'] = entries - return lecture_info + playlist = self.playlist_result(playlist_entries, lecture_id) + playlist.update(lecture_info) + return playlist From d5c181a14e08198e400932d591b47683a630c8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 8 Nov 2015 11:49:51 +0100 Subject: [PATCH 206/415] [movieclips] Fix extraction (fixes #7404) They use theplatform now. Changed the test, because the old one seems to be georestricted. --- youtube_dl/extractor/movieclips.py | 77 ++++++++---------------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 04e17d055..e06828b55 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,80 +1,41 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - clean_html, + compat_urllib_request, ) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?' 
+ _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)' _TEST = { - 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', 'info_dict': { - 'id': 'Wy7ZU', - 'display_id': 'my-week-with-marilyn-movie-do-you-love-me', + 'id': 'pKIGmG83AqD9', + 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', + 'title': 'Warcraft Trailer 1', + 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - show_id = display_id or video_id + display_id = self._match_id(url) - config = self._download_xml( - 'http://config.movieclips.com/player/config/%s' % video_id, - show_id, 'Downloading player config') - - if config.find('./country-region').text == 'false': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) - - properties = config.find('./video/properties') - smil_file = properties.attrib['smil_file'] - - smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') - base_url = smil.find('./head/meta').attrib['base'] - - formats = [] - for video in smil.findall('./body/switch/video'): - vbr = int(video.attrib['system-bitrate']) / 1000 - src = video.attrib['src'] - formats.append({ - 'url': base_url, - 'play_path': src, - 'ext': src.split(':')[0], - 'vbr': vbr, - 'format_id': '%dk' % vbr, - }) - - self._sort_formats(formats) - - title = '%s - %s' % (properties.attrib['clip_movie_title'], 
properties.attrib['clip_title']) - description = clean_html(compat_str(properties.attrib['clip_description'])) - thumbnail = properties.attrib['image'] - categories = properties.attrib['clip_categories'].split(',') + req = compat_urllib_request.Request(url) + # it doesn't work if it thinks the browser it's too old + req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') + webpage = self._download_webpage(req, display_id) + theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') + title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') + description = self._html_search_meta('description', webpage) return { - 'id': video_id, - 'display_id': display_id, + '_type': 'url_transparent', + 'url': theplatform_link, 'title': title, + 'display_id': display_id, 'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, } From 937511dfc01c3d00c35a00f78c2b6f989b4d46e3 Mon Sep 17 00:00:00 2001 From: Frans de Jonge Date: Sat, 7 Nov 2015 22:55:02 +0100 Subject: [PATCH 207/415] Added support for the RTBF OUFtivi subpage --- youtube_dl/extractor/rtbf.py | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 04a66df90..e75b45112 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,17 +9,36 @@ class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P\d+)' - _TEST = { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - } + _VALID_URL = r'''(?x) + https?://www\.rtbf\.be/ + (?: + video/[^\?]+\?id=| + 
ouftivi/heros/[^&]+&videoId= + ) + (?P<id>\d+) + ''' + _TESTS = [ + { + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, + { + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'md5': '25aea17e949e1e0c7c41270d60d25f22', + 'info_dict': { + 'id': '2057442', + 'ext': 'mp4', + 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', + 'duration': 1279, + } + }, + ] _QUALITIES = [ ('mobile', 'mobile'), From fda2717ef9d429358d5816582590d15d18f9109f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 16:56:20 +0600 Subject: [PATCH 208/415] [movieclips] Add coding cookie --- youtube_dl/extractor/movieclips.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index e06828b55..b8c43a163 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor From 114e6025b09e12bd01b5ce22bd2c43a3ef0ba460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:01:45 +0600 Subject: [PATCH 209/415] [rtbf] Expand _VALID_URL (Closes #7402) --- youtube_dl/extractor/rtbf.py | 48 ++++++++++++++---------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index e75b45112..acf10e253 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,36 +9,24 @@ class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://www\.rtbf\.be/ - (?: - video/[^\?]+\?id=| - ouftivi/heros/[^&]+&videoId= - ) - (?P<id>\d+) - ''' - _TESTS = [ - { - 'url': 
'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - }, - { - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'md5': '25aea17e949e1e0c7c41270d60d25f22', - 'info_dict': { - 'id': '2057442', - 'ext': 'mp4', - 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', - 'duration': 1279, - } - }, - ] + _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }] _QUALITIES = [ ('mobile', 'mobile'), From aa8d2d5be6a99542b85a85af3310fab1bf641e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:03:21 +0600 Subject: [PATCH 210/415] [rtbf] Make www optional in _VALID_URL --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acf10e253..e42b319a3 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,7 +9,7 @@ class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' _TESTS = [{ 'url': 
'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', From 50506cb60798fe4d2ebb9603798b3fb1cb81e55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:01:37 +0600 Subject: [PATCH 211/415] [extremetube] Fix extraction (Closes #7163) --- youtube_dl/extractor/extremetube.py | 45 +++++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index c826a5404..3e11e3299 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -3,12 +3,9 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - qualities, + int_or_none, str_to_int, ) @@ -49,20 +46,36 @@ def _real_extract(self, url): r'Views:\s*\s*([\d,\.]+)', webpage, 'view count', fatal=False)) - flash_vars = compat_parse_qs(self._search_regex( - r']+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + flash_vars = self._parse_json( + self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), + video_id) formats = [] - quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) - for k, vals in flash_vars.items(): - m = re.match(r'quality_(?P<quality>[0-9]+p)$', k) - if m is not None: - formats.append({ - 'format_id': m.group('quality'), - 'quality': quality(m.group('quality')), - 'url': vals[0], + for quality_key, video_url in flash_vars.items(): + height = int_or_none(self._search_regex( + r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) + if not height: + continue + f = { + 'url': video_url, + } + mobj = re.search( + r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, 
bitrate), + 'height': height, + 'tbr': bitrate, }) - + else: + f.update({ + 'format_id': '%dp' % height, + 'height': height, + }) + formats.append(f) self._sort_formats(formats) return { From cc8034cc4c52fcbfaf9f8edf34d562c481860193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:14:39 +0600 Subject: [PATCH 212/415] [extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3e11e3299..c5677c82b 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -11,12 +11,12 @@ class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', 'info_dict': { - 'id': '652431', + 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -26,12 +26,16 @@ class ExtremeTubeIE(InfoExtractor): }, { 'url': 'http://www.extremetube.com/gay/video/abcde-1234', 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', + 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/652431', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' 
+ mobj.group('url') + video_id = self._match_id(url) req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') From f09a767d3198823e5c0ac187a91284c8d2736eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:19:13 +0600 Subject: [PATCH 213/415] [mit] Allow external embeds (Closes #7406) --- youtube_dl/extractor/mit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index f088ab9e2..29ca45778 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -86,7 +86,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, page_title) embed_url = self._search_regex( r'