[tvp] Fix embeds (#1401)

Authored by: selfisekai
pull/1613/head
Lauren Liberda 3 years ago committed by pukkandan
parent c0599d4fe4
commit 56bb56f3cf
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698

@ -135,6 +135,7 @@ from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE from .medialaan import MedialaanIE
from .simplecast import SimplecastIE from .simplecast import SimplecastIE
from .wimtv import WimTVIE from .wimtv import WimTVIE
from .tvp import TVPEmbedIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -3508,6 +3509,10 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
tvp_urls = TVPEmbedIE._extract_urls(webpage)
if tvp_urls:
return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:

@ -89,114 +89,162 @@ class TVPIE(InfoExtractor):
# Extractor for a single Telewizja Polska video addressed by its numeric
# object id, either through the internal ``tvp:<id>`` scheme or through one of
# the player/embed URLs matched by _VALID_URL.
class TVPEmbedIE(InfoExtractor):
    IE_NAME = 'tvp:embed'
    IE_DESC = 'Telewizja Polska'
    # NOTE: _extract_urls() slices off the leading 4-character '(?x)' flag,
    # so the pattern must keep starting with exactly that flag.
    _VALID_URL = r'''(?x)
        (?:
            tvp:
            |https?://
                (?:[^/]+\.)?
                (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
                (?:sess/
                        (?:tvplayer\.php\?.*?object_id
                        |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
                    |shared/details\.php\?.*?object_id)
                =)
        (?P<id>\d+)
    '''

    _TESTS = [{
        'url': 'tvp:194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 Władek',
            'description': 'md5:76649d2014f65c99477be17f23a4dead',
            'age_limit': 12,
        },
    }, {
        'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
        'info_dict': {
            'id': '51247504',
            'ext': 'mp4',
            'title': 'Razmova 091220',
        },
    }, {
        # TVPlayer2 embed URL
        'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
        'only_matching': True,
    }, {
        'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
        'only_matching': True,
    }, {
        # pulsembed on dziennik.pl
        'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage, **kw):
        """Return the TVP player URLs found in <iframe src=...> attributes.

        Each returned string ends at the numeric object id, because that is
        where _VALID_URL stops matching; any trailing query parameters of the
        original src are not included. Extra keyword arguments are accepted
        and ignored.
        """
        # Drop the '(?x)' prefix of _VALID_URL: the combined pattern already
        # sets the verbose flag once, at its very start.
        return [m.group('embed') for m in re.finditer(
            r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
            webpage)]

    def _real_extract(self, url):
        """Build the info dict for one video from the TVPlayer2 JSONP API.

        Raises ExtractorError (marked as expected) when the API answers with
        an error description instead of a player configuration.
        """
        video_id = self._match_id(url)

        # it could be anything that is a valid JS function name
        callback = random.choice((
            'jebac_pis',
            'jebacpis',
            'ziobro',
            'sasin70',
            'sasin_przejebal_70_milionow_PLN',
            'tvp_is_a_state_propaganda_service',
        ))

        webpage = self._download_webpage(
            ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
             + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)

        # stripping JSONP padding
        # NOTE(review): the fixed offsets (15 + callback length in front, 3
        # behind) mirror the wrapper the API emitted when this was written --
        # confirm against a live response if parsing starts failing.
        datastr = webpage[15 + len(callback):-3]
        if datastr.startswith('null,'):
            # The payload is "null,<json list of errors>": report the
            # service's own message as an expected failure, not as a bug.
            error = self._parse_json(datastr[5:], video_id)
            raise ExtractorError(error[0]['desc'], expected=True)

        content = self._parse_json(datastr, video_id)['content']
        info = content['info']
        is_live = try_get(info, lambda x: x['isLive'], bool)

        formats = []
        for media_file in content['files']:
            video_url = media_file.get('url')
            if not video_url:
                continue
            if video_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
            elif video_url.endswith('.mpd'):
                if is_live:
                    # doesn't work with either ffmpeg or native downloader
                    continue
                formats.extend(self._extract_mpd_formats(
                    video_url, video_id, mpd_id='dash', fatal=False))
            elif video_url.endswith('.f4m'):
                formats.extend(self._extract_f4m_formats(
                    video_url, video_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.ism/manifest'):
                formats.extend(self._extract_ism_formats(
                    video_url, video_id, ism_id='mss', fatal=False))
            else:
                # mp4, wmv or something
                # 'quality' may be missing or null; both must yield an empty
                # mapping so the lookups below cannot fail.
                quality = media_file.get('quality') or {}
                formats.append({
                    'format_id': 'direct',
                    'url': video_url,
                    # The declared type is only a fallback extension, so its
                    # absence must not abort the extraction.
                    'ext': determine_ext(video_url, media_file.get('type')),
                    'fps': int_or_none(quality.get('fps')),
                    'tbr': int_or_none(quality.get('bitrate')),
                    'width': int_or_none(quality.get('width')),
                    'height': int_or_none(quality.get('height')),
                })

        self._sort_formats(formats)

        title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
        description = dict_get(info, ('description', 'seoDescription'))

        thumbnails = []
        for thumb in content.get('posters') or ():
            thumb_url = thumb.get('src')
            # Skip URL templates that still contain unfilled size placeholders.
            if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
                continue
            thumbnails.append({
                'url': thumb_url,
                'width': thumb.get('width'),
                'height': thumb.get('height'),
            })

        age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
        # NOTE(review): a minAge of 1 is mapped to 0, presumably meaning
        # "no restriction" -- confirm against the API.
        if age_limit == 1:
            age_limit = 0

        duration = try_get(info, lambda x: x['duration'], int) if not is_live else None

        subtitles = {}
        for sub in content.get('subtitles') or []:
            if not sub.get('url'):
                continue
            subtitles.setdefault(sub['lang'], []).append({
                'url': sub['url'],
                'ext': sub.get('type'),
            })

        info_dict = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'age_limit': age_limit,
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }

        # vod.tvp.pl
        if info.get('vortalName') == 'vod':
            series_title = info.get('title')
            episode_title = info.get('subtitle')
            info_dict.update({
                'series': series_title,
                'season': info.get('season'),
                'episode_number': info.get('episode'),
            })
            # Only compose "series, episode" when both parts exist; otherwise
            # keep the title chosen above instead of embedding "None".
            if series_title and episode_title:
                info_dict['title'] = '%s, %s' % (series_title, episode_title)

        return info_dict
class TVPWebsiteIE(InfoExtractor): class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series' IE_NAME = 'tvp:series'

Loading…
Cancel
Save