From 07256b9fee23960799024b95d5972abc7174aa81 Mon Sep 17 00:00:00 2001 From: SirElderling <148036781+SirElderling@users.noreply.github.com> Date: Mon, 5 Feb 2024 00:35:52 +0000 Subject: [PATCH] [ie/nytimes] Overhaul extractors (#9075) Closes #2899, Closes #8605 Authored by: SirElderling --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/nytimes.py | 450 +++++++++++++++++++++----------- 2 files changed, 303 insertions(+), 148 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 04318a716..36335286c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1352,6 +1352,7 @@ from .nytimes import ( NYTimesIE, NYTimesArticleIE, NYTimesCookingIE, + NYTimesCookingRecipeIE, ) from .nuvid import NuvidIE from .nzherald import NZHeraldIE diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 2e21edbb4..354eb02c3 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -1,50 +1,92 @@ -import hmac -import hashlib -import base64 +import json +import uuid from .common import InfoExtractor from ..utils import ( + ExtractorError, + clean_html, determine_ext, + extract_attributes, float_or_none, + get_elements_html_by_class, int_or_none, - js_to_json, + merge_dicts, mimetype2ext, parse_iso8601, + remove_end, remove_start, + str_or_none, + traverse_obj, + url_or_none, ) class NYTimesBaseIE(InfoExtractor): - _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' - - def _extract_video_from_id(self, video_id): - # Authorization generation algorithm is reverse engineered from `signer` in - # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js - path = '/svc/video/api/v3/video/' + video_id - hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() - video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ - 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), - 'X-NYTV': 'vhs', - }, fatal=False) - if not video_data: - video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, - video_id, 'Downloading video JSON') - - title = video_data['headline'] - - def get_file_size(file_size): - if isinstance(file_size, int): - return file_size - elif isinstance(file_size, dict): - return int(file_size.get('value', 0)) - else: - return None - + _DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d') + _TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB' + _GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2' + _GRAPHQL_QUERY = '''query VideoQuery($id: String!) { + video(id: $id) { + ... on Video { + bylines { + renderedRepresentation + } + duration + promotionalHeadline + promotionalMedia { + ... on Image { + crops { + name + renditions { + name + width + height + url + } + } + } + } + renditions { + type + width + height + url + bitrate + } + summary + } + } +}''' + + def _call_api(self, media_id): + # reference: `id-to-uri.js` + video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video') + media_uuid = uuid.uuid5(video_uuid, media_id) + + return traverse_obj(self._download_json( + self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({ + 'query': self._GRAPHQL_QUERY, + 'variables': {'id': f'nyt://video/{media_uuid}'}, + }, separators=(',', ':')).encode(), headers={ + 'Content-Type': 'application/json', + 'Nyt-App-Type': 'vhs', + 'Nyt-App-Version': 'v3.52.21', + 'Nyt-Token': self._TOKEN, + 'Origin': 'https://nytimes.com', + }, fatal=False), ('data', 'video', {dict})) or {} + + def _extract_thumbnails(self, thumbs): + return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), default=None) + + def _extract_formats_and_subtitles(self, video_id, content_media_json): urls = [] formats = [] subtitles = {} - for video in video_data.get('renditions', []): + for video in traverse_obj(content_media_json, ('renditions', ..., {dict})): video_url = video.get('url') format_id = video.get('type') if not video_url or format_id == 'thumbs' or video_url in urls: @@ -56,11 +98,9 @@ class NYTimesBaseIE(InfoExtractor): video_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id or 'hls', fatal=False) formats.extend(m3u8_fmts) - subtitles = self._merge_subtitles(subtitles, m3u8_subs) + self._merge_subtitles(m3u8_subs, target=subtitles) elif ext == 'mpd': - continue - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, format_id or 'dash', fatal=False)) + continue # all mpd urls give 404 errors else: formats.append({ 'url': video_url, @@ -68,55 +108,49 @@ class NYTimesBaseIE(InfoExtractor): 'vcodec': video.get('videoencoding') or video.get('video_codec'), 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), - 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), + 'filesize': traverse_obj(video, ( + ('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False), 'tbr': int_or_none(video.get('bitrate'), 1000) or None, 'ext': ext, }) - thumbnails = [] - for image in video_data.get('images', []): - image_url = image.get('url') - if not image_url: - continue - thumbnails.append({ - 'url': 'http://www.nytimes.com/' + image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) + return formats, subtitles - publication_date = video_data.get('publication_date') - timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None + def _extract_video(self, media_id): + data = self._call_api(media_id) + formats, subtitles = self._extract_formats_and_subtitles(media_id, data) return { - 'id': video_id, - 'title': title, - 'description': video_data.get('summary'), - 'timestamp': timestamp, - 'uploader': video_data.get('byline'), - 'duration': float_or_none(video_data.get('duration'), 1000), + 'id': media_id, + 'title': data.get('promotionalHeadline'), + 'description': data.get('summary'), + 'duration': float_or_none(data.get('duration'), scale=1000), + 'creator': ', '.join(traverse_obj(data, ( # TODO: change to 'creators' + 'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))), 'formats': formats, 'subtitles': subtitles, - 'thumbnails': thumbnails, + 'thumbnails': self._extract_thumbnails( + traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))), } class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P\d+)' _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>'] - _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', - 'md5': 'd665342765db043f7e225cff19df0f2d', + 'md5': 'a553aa344014e3723d33893d89d4defc', 'info_dict': { 'id': '100000002847155', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Verbatim: What Is a Photocopier?', 'description': 'md5:93603dada88ddbda9395632fdc5da260', - 'timestamp': 1398631707, - 'upload_date': '20140427', - 'uploader': 'Brett Weiner', + 'timestamp': 1398631707, # FIXME + 'upload_date': '20140427', # FIXME + 'creator': 'Brett Weiner', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg', 'duration': 419, - } + }, }, { 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', 'only_matching': True, @@ -125,138 +159,258 @@ class NYTimesIE(NYTimesBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_video_from_id(video_id) + return self._extract_video(video_id) class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P[^./?#]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', - 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', + 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { 'id': '100000003628438', - 'ext': 'mov', - 'title': 'New Minimum Wage: $70,000 a Year', - 'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.', - 'timestamp': 1429033037, + 'ext': 'mp4', + 'title': 'One Company’s New Minimum Wage: $70,000 a Year', + 'description': 'md5:89ba9ab67ca767bb92bf823d1f138433', + 'timestamp': 1429047468, 'upload_date': '20150414', 'uploader': 'Matthew Williams', - } + 'creator': 'Patricia Cohen', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 119.0, + }, }, { - 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', - 'md5': 'e0d52040cafb07662acf3c9132db3575', + # article with audio and no video + 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', + 'md5': '2365b3555c8aa7f4dd34ca735ad02e6a', 'info_dict': { - 'id': '100000004709062', - 'title': 'The Run-Up: ‘He Was Like an Octopus’', + 'id': '100000009110381', 'ext': 'mp3', - 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', - 'series': 'The Run-Up', - 'episode': '‘He Was Like an Octopus’', - 'episode_number': 20, - 'duration': 2130, - } + 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', + 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', + 'timestamp': 1695960700, + 'upload_date': '20230929', + 'creator': 'Stephanie Nolen, Natalija Gormalova', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 1322, + }, }, { - 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', + 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', + 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { - 'id': '100000004709479', - 'title': 'The Rise of Hitler', - 'ext': 'mp3', - 'description': 'md5:bce877fd9e3444990cb141875fab0028', - 'creator': 'Pamela Paul', - 'duration': 3475, + 'id': '100000009202270', + 'ext': 'mp4', + 'title': 'Kamala Harris Defends Biden Policies, but Says ‘More Work’ Needed to Reach Voters', + 'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f', + 'timestamp': 1701290997, + 'upload_date': '20231129', + 'uploader': 'By The New York Times', + 'creator': 'Katie Rogers', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 97.631, }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', + }, + }, { + # multiple videos in the same article + 'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html', + 'info_dict': { + 'id': 'air-traffic-controllers-safety', + 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', + 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', + 'upload_date': '20231202', + 'creator': 'Emily Steel, Sydney Ember', + 'timestamp': 1701511264, }, + 'playlist_count': 3, }, { - 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', + 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'only_matching': True, }] - def _extract_podcast_from_json(self, json, page_id, webpage): - podcast_audio = self._parse_json( - json, page_id, transform_source=js_to_json) + def _extract_content_from_block(self, block): + details = traverse_obj(block, { + 'id': ('sourceId', {str}), + 'uploader': ('bylines', ..., 'renderedRepresentation', {str}), + 'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))), + 'timestamp': ('firstPublished', {parse_iso8601}), + 'series': ('podcastSeries', {str}), + }, get_all=False) + + formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block) + # audio articles will have an url and no formats + url = traverse_obj(block, ('fileUrl', {url_or_none})) + if not formats and url: + formats.append({'url': url, 'vcodec': 'none'}) - audio_data = podcast_audio['data'] - track = audio_data['track'] - - episode_title = track['title'] - video_url = track['source'] + return { + **details, + 'thumbnails': self._extract_thumbnails(traverse_obj( + block, ('promotionalMedia', 'crops', ..., 'renditions', ...))), + 'formats': formats, + 'subtitles': subtitles + } - description = track.get('description') or self._html_search_meta( - ['og:description', 'twitter:description'], webpage) + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + art_json = self._search_json( + r'window\.__preloadedData\s*=', webpage, 'media details', page_id, + transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] + + blocks = traverse_obj(art_json, ( + 'sprinkledBody', 'content', ..., ('ledeMedia', None), + lambda _, v: v['__typename'] in ('Video', 'Audio'))) + if not blocks: + raise ExtractorError('Unable to extract any media blocks from webpage') + + common_info = { + 'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'), + 'description': traverse_obj(art_json, ( + 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), + get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), + 'creator': ', '.join( + traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list) + 'thumbnails': self._extract_thumbnails(traverse_obj( + art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), + } - podcast_title = audio_data.get('podcast', {}).get('title') - title = ('%s: %s' % (podcast_title, episode_title) - if podcast_title else episode_title) + entries = [] + for block in blocks: + entries.append(merge_dicts(self._extract_content_from_block(block), common_info)) - episode = audio_data.get('podcast', {}).get('episode') or '' - episode_number = int_or_none(self._search_regex( - r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) + if len(entries) > 1: + return self.playlist_result(entries, page_id, **common_info) return { - 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, - 'url': video_url, - 'title': title, - 'description': description, - 'creator': track.get('credit'), - 'series': podcast_title, - 'episode': episode_title, - 'episode_number': episode_number, - 'duration': int_or_none(track.get('duration')), + 'id': page_id, + **entries[0], } + +class NYTimesCookingIE(NYTimesBaseIE): + IE_NAME = 'NYTimesCookingGuide' + _VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'info_dict': { + 'id': '13-how-to-cook-a-turkey', + 'title': 'How to Cook a Turkey', + 'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0', + }, + 'playlist_count': 2, + }, { + # single video example + 'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese', + 'md5': '64415805fe0b8640fce6b0b9def5989a', + 'info_dict': { + 'id': '100000005835845', + 'ext': 'mp4', + 'title': 'How to Make Mac and Cheese', + 'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1', + 'duration': 9.51, + 'creator': 'Alison Roman', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake', + 'md5': '64415805fe0b8640fce6b0b9def5989a', + 'info_dict': { + 'id': '20-how-to-frost-a-cake', + 'title': 'How to Frost a Cake', + 'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd', + }, + 'playlist_count': 8, + }] + def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - video_id = self._search_regex( - r'data-videoid=["\'](\d+)', webpage, 'video id', - default=None, fatal=False) - if video_id is not None: - return self._extract_video_from_id(video_id) + lead_video_id = self._search_regex( + r'data-video-player-id="(\d+)">', webpage, 'lead video') + media_ids = traverse_obj( + get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id')) - podcast_data = self._search_regex( - (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*([^<]+)

', webpage, 'author', default=None), + } -class NYTimesCookingIE(NYTimesBaseIE): - _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P\d+)' + +class NYTimesCookingRecipeIE(InfoExtractor): + _VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P\d+)' _TESTS = [{ 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', - 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'md5': '579e83bbe8e61e9de67f80edba8a78a8', 'info_dict': { - 'id': '100000004756089', - 'ext': 'mov', - 'timestamp': 1479383008, - 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', - 'title': 'Cranberry Tart', - 'upload_date': '20161117', - 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + 'id': '1017817', + 'ext': 'mp4', + 'title': 'Cranberry Curd Tart', + 'description': 'md5:ad77a3fc321db636256d4343c5742152', + 'timestamp': 1447804800, + 'upload_date': '20151118', + 'creator': 'David Tanis', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', }, }, { - 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', - 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies', + 'md5': '58df35998241dcf0620e99e646331b42', 'info_dict': { - 'id': '100000003951728', - 'ext': 'mov', - 'timestamp': 1445509539, - 'description': 'Turkey guide', - 'upload_date': '20151022', - 'title': 'Turkey', - } + 'id': '1024781', + 'ext': 'mp4', + 'title': 'Neapolitan Checkerboard Cookies', + 'description': 'md5:ba12394c585ababea951cb6d2fcc6631', + 'timestamp': 1701302400, + 'upload_date': '20231130', + 'creator': 'Sue Li', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, + }, { + 'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats', + 'md5': '2fe7965a3adc899913b8e25ada360823', + 'info_dict': { + 'id': '1019516', + 'ext': 'mp4', + 'timestamp': 1546387200, + 'description': 'md5:8856ce10239161bd2596ac335b9f9bfb', + 'upload_date': '20190102', + 'title': 'Overnight Oats', + 'creator': 'Genevieve Ko', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, }] def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe'] - video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls') - return self._extract_video_from_id(video_id) + return { + **traverse_obj(recipe_data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'description': ('topnote', {clean_html}), + 'timestamp': ('publishedAt', {int_or_none}), + 'creator': ('contentAttribution', 'cardByline', {str}), + }), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj( + recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))], + }