WatIE: support videos divided in multiple parts (closes #222 and #659)

The id for the videos is now the full id, not the one in the webpage url.
Also extract more information: description, view_count and upload_date.
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-06-29 18:22:03 +02:00
parent 6ffe72835a
commit 8244288dfe
2 changed files with 51 additions and 14 deletions

View File

@ -8,15 +8,16 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor): class TF1IE(InfoExtractor):
""" """
TF1 uses the wat.tv player, currently it can only download videos with the TF1 uses the wat.tv player, currently it can only download videos with the
html5 player enabled, it cannot download HD videos or the news. html5 player enabled, it cannot download HD videos.
""" """
_VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
_TEST = { _TEST = {
u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
u'file': u'6bysb.mp4', u'file': u'10635995.mp4',
u'md5': u'66789d3e91278d332f75e1feb7aea327', u'md5': u'66789d3e91278d332f75e1feb7aea327',
u'info_dict': { u'info_dict': {
u"title": u"Citroën Grand C4 Picasso 2013 : présentation officielle" u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
} }
} }

View File

@ -1,3 +1,5 @@
# coding: utf-8
import json import json
import re import re
@ -5,6 +7,7 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_urllib_parse, compat_urllib_parse,
unified_strdate,
) )
@ -13,36 +16,69 @@ class WatIE(InfoExtractor):
IE_NAME = 'wat.tv' IE_NAME = 'wat.tv'
_TEST = { _TEST = {
u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
u'file': u'6bv55.mp4', u'file': u'10631273.mp4',
u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a', u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
u'info_dict': { u'info_dict': {
u"title": u"World War Z - Philadelphia VOST" u'title': u'World War Z - Philadelphia VOST',
u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
} }
} }
def download_video_info(self, real_id):
# 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them
info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
info = json.loads(info)
return info['media']
def _real_extract(self, url): def _real_extract(self, url):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
short_id = mobj.group('shortID') short_id = mobj.group('shortID')
webpage = self._download_webpage(url, short_id)
real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
video_info = self.download_video_info(real_id)
chapters = video_info['chapters']
first_chapter = chapters[0]
if real_id_for_chapter(first_chapter) != real_id:
self.to_screen('Multipart video detected')
chapter_urls = []
for chapter in chapters:
chapter_id = real_id_for_chapter(chapter)
# Yes, when this chapter is processed by WatIE,
# it will download the info again
chapter_info = self.download_video_info(chapter_id)
chapter_urls.append(chapter_info['url'])
entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
return self.playlist_result(entries, real_id, video_info['title'])
# Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url
player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id, player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
'html5': '1'}) 'html5': '1'})
player_info = self._download_webpage('http://www.wat.tv/player?' + player_data, player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
short_id, u'Downloading player info') real_id, u'Downloading player info')
player = json.loads(player_info)['player'] player = json.loads(player_info)['player']
html5_player = self._html_search_regex(r'iframe src="(.*?)"', player, html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
'html5 player') 'html5 player')
player_webpage = self._download_webpage(html5_player, short_id, player_webpage = self._download_webpage(html5_player, real_id,
u'Downloading player webpage') u'Downloading player webpage')
video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage, video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
'video url') 'video url')
title = self._search_regex(r'contentTitle : "(.*?)"', player_webpage, info = {'id': real_id,
'title')
thumbnail = self._search_regex(r'previewMedia : "(.*?)"', player_webpage,
'thumbnail')
return {'id': short_id,
'url': video_url, 'url': video_url,
'ext': 'mp4', 'ext': 'mp4',
'title': title, 'title': first_chapter['title'],
'thumbnail': thumbnail, 'thumbnail': first_chapter['preview'],
'description': first_chapter['description'],
'view_count': video_info['views'],
} }
if 'date_diffusion' in first_chapter:
info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
return info