[wistia] Add extractor

2024-11-03 09:40:30 +00:00 · 2013-12-06 09:15:04 +01:00 · 2013-12-06 09:15:04 +01:00 · ef4fd84857
commit ef4fd84857
parent 72135030d1
4 changed files with 80 additions and 10 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -488,7 +488,8 @@ def make_result(embedded_info):
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'urlhandle', 'ie_key', 'duration',
-                          'subtitles', 'annotations', 'format'):
+                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -178,6 +178,7 @@
 from .websurg import WeBSurgIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .xhamster import XHamsterIE
 from .xnxx import XNXXIE
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -169,8 +169,13 @@ def _real_extract(self, url):
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
+        video_title = self._html_search_regex(
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
+            r'(?s)<title>(.*?)</title>', webpage, u'video title',
            default=u'video')
        # video uploader is domain name
        video_uploader = self._search_regex(
            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
        # Look for BrightCove:
        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@ -188,7 +193,7 @@ def _real_extract(self, url):
        # Look for embedded YouTube player
        matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                     for tuppl in matches]
@ -197,13 +202,26 @@ def _real_extract(self, url):
        # Look for embedded Dailymotion player
        matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)
        # Look for embedded Wistia player
        match = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
        if match:
            return {
                '_type': 'url_transparent',
                'url': unescapeHTML(match.group('url')),
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': video_id,
            }
        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
@ -247,14 +265,9 @@ def _real_extract(self, url):
        # here's a fun little line of code for you:
        video_id = os.path.splitext(video_id)[0]
        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')
        return {
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
        }
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@ -0,0 +1,55 @@
 import json
 import re
 from .common import InfoExtractor
 class WistiaIE(InfoExtractor):
    """Information extractor for Wistia iframe embed pages.

    Matches URLs of the form
    ``http(s)://[fast.]wistia.net/embed/iframe/<id>`` (see ``_VALID_URL``).
    """
    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
    _TEST = {
        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
        u"file": u"sh7fpupwlt.mov",
        u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
        u"info_dict": {
            u"title": u"cfh_resourceful_zdkh_final_1"
        },
    }
    def _real_extract(self, url):
        """Build the info dict for the Wistia embed at *url*.

        The embed page is downloaded and the first argument of its
        ``Wistia.iframeInit(...)`` call is parsed as JSON.  Every entry of
        the ``assets`` mapping is then classified by its key:

        * ``still``   -> appended to ``thumbnails``
        * ``preview`` -> ignored
        * otherwise   -> appended to ``formats`` (key used as ``format_id``)

        Returns a dict with ``id``, ``title``, ``formats`` (sorted by
        ascending file size) and ``thumbnails``.

        Raises ``KeyError`` if the JSON lacks ``assets``/``name`` or an asset
        lacks ``url`` (or ``ext`` for a format); ``json.loads`` errors
        propagate unchanged.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The dot is escaped so that only a literal "Wistia.iframeInit(" call
        # is accepted, not any character between the two identifiers.
        data_json = self._html_search_regex(
            r'Wistia\.iframeInit\((.*?), {}\);', webpage, u'video data')
        data = json.loads(data_json)
        formats = []
        thumbnails = []
        for atype, a in data['assets'].items():
            if atype == 'preview':
                continue
            width = a.get('width')
            height = a.get('height')
            if atype == 'still':
                thumbnail = {'url': a['url']}
                # A resolution string only makes sense with both dimensions.
                if width is not None and height is not None:
                    thumbnail['resolution'] = '%dx%d' % (width, height)
                thumbnails.append(thumbnail)
                continue
            fmt = {
                'format_id': atype,
                'url': a['url'],
                'ext': a['ext'],
            }
            # Dimensions and size are optional metadata: leave the key out
            # entirely instead of failing when the asset does not report it.
            if width is not None:
                fmt['width'] = width
            if height is not None:
                fmt['height'] = height
            if a.get('size') is not None:
                fmt['filesize'] = a['size']
            formats.append(fmt)
        # Smallest file first; formats without a known size are treated as 0
        # so the comparison never mixes None with integers.
        formats.sort(key=lambda f: f.get('filesize') or 0)
        return {
            'id': video_id,
            'title': data['name'],
            'formats': formats,
            'thumbnails': thumbnails,
        }