From 954e57e405f79188450eb30103a9308732cd318f Mon Sep 17 00:00:00 2001
From: bytedream <63594396+bytedream@users.noreply.github.com>
Date: Sat, 6 Apr 2024 12:53:20 +0200
Subject: [PATCH] [ie/crunchyroll] Fix extractor (#9615)

Authored by: bytedream
---
 README.md                       |   3 +-
 yt_dlp/extractor/crunchyroll.py | 143 +++++++++++++++++---------------
 2 files changed, 75 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index d4dd2c7be..ee1b59990 100644
--- a/README.md
+++ b/README.md
@@ -1784,8 +1784,7 @@ The following extractors use this feature:
 * `version`: The video version to extract - `uncut` or `simulcast`
 
 #### crunchyrollbeta (Crunchyroll)
-* `format`: Which stream type(s) to extract (default: `adaptive_hls`). Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2`
-* `hardsub`: Preference order for which hardsub versions to extract, or `all` (default: `None` = no hardsubs), e.g. `crunchyrollbeta:hardsub=en-US,None`
+* `hardsub`: One or more hardsub versions to extract (in order of preference), or `all` (default: `None` = no hardsubs will be extracted), e.g. `crunchyrollbeta:hardsub=en-US,de-DE`
 
 #### vikichannel
 * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers`
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index d35e9995a..118b575ab 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -1,4 +1,5 @@
 import base64
+import uuid
 
 from .common import InfoExtractor
 from ..networking.exceptions import HTTPError
@@ -7,12 +8,11 @@ from ..utils import (
     float_or_none,
     format_field,
     int_or_none,
-    join_nonempty,
+    jwt_decode_hs256,
     parse_age_limit,
     parse_count,
     parse_iso8601,
     qualities,
-    remove_start,
     time_seconds,
     traverse_obj,
     url_or_none,
@@ -27,6 +27,7 @@ class CrunchyrollBaseIE(InfoExtractor):
     _AUTH_HEADERS = None
     _API_ENDPOINT = None
     _BASIC_AUTH = None
+    _IS_PREMIUM = None
     _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
     _LOCALE_LOOKUP = {
         'ar': 'ar-SA',
@@ -84,11 +85,16 @@ class CrunchyrollBaseIE(InfoExtractor):
             self.write_debug(f'Using cxApiParam={cx_api_param}')
             CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
 
-        grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id'
+        auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
+        if self.is_logged_in:
+            grant_type = 'etp_rt_cookie'
+        else:
+            grant_type = 'client_id'
+            auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
         try:
             auth_response = self._download_json(
                 f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
-                headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode())
+                headers=auth_headers, data=f'grant_type={grant_type}'.encode())
         except ExtractorError as error:
             if isinstance(error.cause, HTTPError) and error.cause.status == 403:
                 raise ExtractorError(
@@ -97,6 +103,7 @@ class CrunchyrollBaseIE(InfoExtractor):
                     'and your browser\'s User-Agent (with --user-agent)', expected=True)
             raise
 
+        CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
         CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
         CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
 
@@ -135,62 +142,72 @@ class CrunchyrollBaseIE(InfoExtractor):
             raise ExtractorError(f'Unexpected response when downloading {note} JSON')
         return result
 
-    def _extract_formats(self, stream_response, display_id=None):
-        requested_formats = self._configuration_arg('format') or ['vo_adaptive_hls']
-        available_formats = {}
-        for stream_type, streams in traverse_obj(
-                stream_response, (('streams', ('data', 0)), {dict.items}, ...)):
-            if stream_type not in requested_formats:
+    def _extract_chapters(self, internal_id):
+        # if no skip events are available, a 403 xml error is returned
+        skip_events = self._download_json(
+            f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json',
+            internal_id, note='Downloading chapter info', fatal=False, errnote=False)
+        if not skip_events:
+            return None
+
+        chapters = []
+        for event in ('recap', 'intro', 'credits', 'preview'):
+            start = traverse_obj(skip_events, (event, 'start', {float_or_none}))
+            end = traverse_obj(skip_events, (event, 'end', {float_or_none}))
+            # some chapters have no start and/or ending time, they will just be ignored
+            if start is None or end is None:
                 continue
-            for stream in traverse_obj(streams, lambda _, v: v['url']):
-                hardsub_lang = stream.get('hardsub_locale') or ''
-                format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
-                available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
+            chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end})
+
+        return chapters
+
+    def _extract_stream(self, identifier, display_id=None):
+        if not display_id:
+            display_id = identifier
+
+        self._update_auth()
+        stream_response = self._download_json(
+            f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
+            display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
+
+        available_formats = {'': ('', '', stream_response['url'])}
+        for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
+            available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url'])
 
         requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
-        if '' in available_formats and 'all' not in requested_hardsubs:
+        hardsub_langs = [lang for lang in available_formats if lang]
+        if hardsub_langs and 'all' not in requested_hardsubs:
             full_format_langs = set(requested_hardsubs)
+            self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}')
             self.to_screen(
-                'To get all formats of a hardsub language, use '
+                'To extract formats of a hardsub language, use '
                 '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
                 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info',
                 only_once=True)
         else:
             full_format_langs = set(map(str.lower, available_formats))
 
-        audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False)
+        audio_locale = traverse_obj(stream_response, ('audioLocale', {str}))
         hardsub_preference = qualities(requested_hardsubs[::-1])
-        formats = []
-        for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
-            if stream_type.endswith('hls'):
-                if hardsub_lang.lower() in full_format_langs:
-                    adaptive_formats = self._extract_m3u8_formats(
-                        stream_url, display_id, 'mp4', m3u8_id=format_id,
-                        fatal=False, note=f'Downloading {format_id} HLS manifest')
-                else:
-                    adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
-            elif stream_type.endswith('dash'):
-                adaptive_formats = self._extract_mpd_formats(
-                    stream_url, display_id, mpd_id=format_id,
-                    fatal=False, note=f'Downloading {format_id} MPD manifest')
+        formats, subtitles = [], {}
+        for format_id, hardsub_lang, stream_url in available_formats.values():
+            if hardsub_lang.lower() in full_format_langs:
+                adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
+                    stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS,
+                    fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest')
+                self._merge_subtitles(dash_subs, target=subtitles)
             else:
-                self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
-                continue
+                continue  # XXX: Update this if/when meta mpd formats are working
             for f in adaptive_formats:
                 if f.get('acodec') != 'none':
                     f['language'] = audio_locale
                 f['quality'] = hardsub_preference(hardsub_lang.lower())
             formats.extend(adaptive_formats)
 
-        return formats
-
-    def _extract_subtitles(self, data):
-        subtitles = {}
+        for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)):
+            subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'}))
 
-        for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)):
-            subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})]
-
-        return subtitles
+        return formats, subtitles
 
 
 class CrunchyrollCmsBaseIE(CrunchyrollBaseIE):
@@ -245,7 +262,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
             'like_count': int,
             'dislike_count': int,
         },
-        'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
+        'params': {
+            'skip_download': 'm3u8',
+            'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}},
+            'format': 'bv[format_id~=hardsub]',
+        },
     }, {
         # Premium only
         'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
@@ -306,6 +327,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
             'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
         },
         'params': {'skip_download': 'm3u8'},
+        'skip': 'no longer exists',
     }, {
         'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6',
         'info_dict': {
@@ -359,31 +381,15 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
         else:
             raise ExtractorError(f'Unknown object type {object_type}')
 
-        # There might be multiple audio languages for one object (`<object>_metadata.versions`),
-        # so we need to get the id from `streams_link` instead or we dont know which language to choose
-        streams_link = response.get('streams_link')
-        if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
+        if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
             message = f'This {object_type} is for premium members only'
             if self.is_logged_in:
                 raise ExtractorError(message, expected=True)
             self.raise_login_required(message)
 
-        # We need go from unsigned to signed api to avoid getting soft banned
-        stream_response = self._call_cms_api_signed(remove_start(
-            streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info')
-        result['formats'] = self._extract_formats(stream_response, internal_id)
-        result['subtitles'] = self._extract_subtitles(stream_response)
+        result['formats'], result['subtitles'] = self._extract_stream(internal_id)
 
-        # if no intro chapter is available, a 403 without usable data is returned
-        intro_chapter = self._download_json(
-            f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
-            internal_id, note='Downloading chapter info', fatal=False, errnote=False)
-        if isinstance(intro_chapter, dict):
-            result['chapters'] = [{
-                'title': 'Intro',
-                'start_time': float_or_none(intro_chapter.get('startTime')),
-                'end_time': float_or_none(intro_chapter.get('endTime')),
-            }]
+        result['chapters'] = self._extract_chapters(internal_id)
 
         def calculate_count(item):
             return parse_count(''.join((item['displayed'], item.get('unit') or '')))
@@ -512,7 +518,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
             'display_id': 'egaono-hana',
             'title': 'Egaono Hana',
             'track': 'Egaono Hana',
-            'artist': 'Goose house',
+            'artists': ['Goose house'],
             'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
             'genres': ['J-Pop'],
         },
@@ -525,11 +531,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
             'display_id': 'crossing-field',
             'title': 'Crossing Field',
             'track': 'Crossing Field',
-            'artist': 'LiSA',
+            'artists': ['LiSA'],
             'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
             'genres': ['Anime'],
         },
         'params': {'skip_download': 'm3u8'},
+        'skip': 'no longer exists',
     }, {
         'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135',
         'info_dict': {
@@ -538,7 +545,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
             'display_id': 'live-is-smile-always-364joker-at-yokohama-arena',
             'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
             'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
-            'artist': 'LiSA',
+            'artists': ['LiSA'],
             'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
             'description': 'md5:747444e7e6300907b7a43f0a0503072e',
             'genres': ['J-Pop'],
@@ -566,16 +573,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
         if not response:
             raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
 
-        streams_link = response.get('streams_link')
-        if not streams_link and response.get('isPremiumOnly'):
+        if not self._IS_PREMIUM and response.get('isPremiumOnly'):
             message = f'This {response.get("type") or "media"} is for premium members only'
             if self.is_logged_in:
                 raise ExtractorError(message, expected=True)
             self.raise_login_required(message)
 
         result = self._transform_music_response(response)
-        stream_response = self._call_api(streams_link, internal_id, lang, 'stream info')
-        result['formats'] = self._extract_formats(stream_response, internal_id)
+        result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
 
         return result
 
@@ -587,7 +592,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
                 'display_id': 'slug',
                 'title': 'title',
                 'track': 'title',
-                'artist': ('artist', 'name'),
+                'artists': ('artist', 'name', all),
                 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}),
                 'thumbnails': ('images', ..., ..., {
                     'url': ('source', {url_or_none}),
@@ -611,7 +616,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE):
         'info_dict': {
             'id': 'MA179CB50D',
             'title': 'LiSA',
-            'genres': ['J-Pop', 'Anime', 'Rock'],
+            'genres': ['Anime', 'J-Pop', 'Rock'],
             'description': 'md5:16d87de61a55c3f7d6c454b73285938e',
         },
         'playlist_mincount': 83,