diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 113609a6b..9ee598b8f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1078,6 +1078,7 @@ MicrosoftEmbedIE, MicrosoftMediusIE, MicrosoftLearnIE, + MicrosoftBuildIE, ) from .mildom import ( MildomIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index d03578c74..5e640d0fb 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -149,10 +149,7 @@ def _real_extract(self, url): class MicrosoftLearnIE(MicrosoftMediusBaseIE): - _VALID_URL = [ - r'https://learn\.microsoft\.com/[\w\-]+/(events|shows)/[\w\-]+/(?P[^?#/]+)', - r'https://build\.microsoft\.com/[\w\-]+/sessions/(?P[0-9a-f\-]+)', - ] + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w\-]+/)?(?Pevents|shows)/(?P[\w\-]+)(?:/(?P[^?#/]+))?' _TESTS = [{ 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', @@ -165,6 +162,14 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'upload_date': '20220524', 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', }, + }, { + 'url': 'https://learn.microsoft.com/en-us/events/build-2022', + 'info_dict': { + 'id': 'build-2022', + 'title': 'Microsoft Build 2022 - Events', + 'description': 'md5:c16b43848027df837b22c6fbac7648d3', + }, + 'playlist_count': 201, }, { 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', 'info_dict': { @@ -174,24 +179,34 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', 'timestamp': 1676339547, 'upload_date': '20230214', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', 'subtitles': 'count:14', }, 'params': {'listsubtitles': True}, }, { - 'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions', + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners', 'info_dict': { - 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', - 'ext': 'ismv', - 'title': 'Microsoft Build: Highlights from 2023', - 'description': 'md5:24fb8410b48256bb42dfca37eb936583', - 'timestamp': 1684857600, - 'upload_date': '20230523', - 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'id': 'bash-for-beginners', + 'title': 'Bash for Beginners', + 'description': 'md5:16a91c07222117d1e00912f0dbc02c2c', }, + 'playlist_count': 20, }] + def _entries(self, url_base, video_id): + skip = 0 + while True: + playlist_info = self._download_json(f'{url_base}&$skip={skip}', video_id, f'Downloading entries {skip}') + items = traverse_obj(playlist_info, ( + 'results', ..., 'url', {lambda x: self.url_result(f'https://learn.microsoft.com/en-us{x}')})) + yield from items + skip += len(items) + if skip >= playlist_info['count'] or not items: + break + def _real_extract(self, url): - video_id = self._match_id(url) + video_type, series, slug = self._match_valid_url(url).groups() + video_id = slug or series webpage = self._download_webpage(url, video_id) metainfo = { @@ -199,34 +214,74 @@ def _real_extract(self, url): 'description': self._og_search_description(webpage), } - video_url = self._search_regex( - r'[0-9a-f\-]+)', + r'https?://build\.microsoft\.com/[\w\-]+/(?Psessions)/?(?:[?#]|$)', + ] - if self._og_search_url(webpage) == 'https://build.microsoft.com': - video_info = self._download_json( - f'https://api.build.microsoft.com/api/session/en-US-{video_id}', video_id) - return self.url_result(video_info['onDemand'], url_transparent=True, **metainfo, **{ - 'timestamp': traverse_obj(video_info, ('startDateTime', {parse_iso8601})), - }) + _TESTS = [{ + 'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:756ab1fb60bdc6923d627803694e9cc5', + 'timestamp': 1684857600, + 'upload_date': '20230523', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions', + 'info_dict': { + 'id': 'sessions', + }, + 'playlist_mincount': 418, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + entries = [ + self.url_result(video_info['onDemand'], url_transparent=True, **traverse_obj(video_info, { + 'id': ('sessionId', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('startDateTime', {parse_iso8601}), + })) + for video_info in self._download_json( + 'https://api.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info') + ] + if video_id == 'sessions': + return self.playlist_result(entries, video_id) + else: + return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False)