diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cd7ead796..4bed5af6a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -164,7 +164,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ github.job }} + name: build-bin-${{ github.job }} path: | yt-dlp yt-dlp.tar.gz @@ -227,7 +227,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-linux_${{ matrix.architecture }} + name: build-bin-linux_${{ matrix.architecture }} path: | # run-on-arch-action designates armv7l as armv7 repo/dist/yt-dlp_linux_${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }} compression-level: 0 @@ -271,7 +271,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ github.job }} + name: build-bin-${{ github.job }} path: | dist/yt-dlp_macos dist/yt-dlp_macos.zip @@ -324,7 +324,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ github.job }} + name: build-bin-${{ github.job }} path: | dist/yt-dlp_macos_legacy compression-level: 0 @@ -373,7 +373,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ github.job }} + name: build-bin-${{ github.job }} path: | dist/yt-dlp.exe dist/yt-dlp_min.exe @@ -421,7 +421,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ github.job }} + name: build-bin-${{ github.job }} path: | dist/yt-dlp_x86.exe compression-level: 0 @@ -441,7 +441,7 @@ jobs: - uses: actions/download-artifact@v4 with: path: artifact - pattern: build-* + pattern: build-bin-* merge-multiple: true - name: Make SHA2-SUMS files @@ -484,3 +484,4 @@ jobs: _update_spec SHA*SUMS* compression-level: 0 + overwrite: true diff --git a/README.md b/README.md index 2fcb09917..99235220a 100644 --- a/README.md +++ b/README.md @@ -167,8 +167,8 @@ For ease of use, a few more compat options are available: * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress` -* `--compat-options 2023`: Same as `--compat-options prefer-legacy-http-handler,manifest-filesize-approx`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` +* `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options # INSTALLATION @@ -218,7 +218,7 @@ Example usage: yt-dlp --update-to nightly # To install nightly with pip: -python -m pip install -U --pre yt-dlp +python -m pip install -U --pre yt-dlp[default] ``` @@ -1310,8 +1310,11 @@ The available fields are: - `description` (string): The description of the video - `display_id` (string): An alternative identifier for the video - `uploader` (string): Full name of the video uploader + - `uploader_id` (string): Nickname or id of the video uploader + - `uploader_url` (string): URL to the video uploader's profile - `license` (string): License name the video is licensed under - - `creator` (string): The creator of the video + - `creators` (list): The creators of the video + - `creator` (string): The creators of the video; comma-separated - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date in UTC (YYYYMMDD) - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released @@ -1319,9 +1322,9 @@ The available fields are: - `release_year` (numeric): Year (YYYY) when the video or album was released - `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified - `modified_date` (string): The date (YYYYMMDD) when the video was last modified in UTC - - `uploader_id` (string): Nickname or id of the video uploader - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel + - `channel_url` (string): URL of the channel - `channel_follower_count` (numeric): Number of followers of the channel - `channel_is_verified` (boolean): Whether the channel is verified on the platform - `location` (string): Physical location where the video was filmed @@ -1361,7 +1364,10 @@ The available fields are: - `webpage_url_basename` (string): The basename of the webpage URL - `webpage_url_domain` (string): The domain of the webpage URL - `original_url` (string): The URL given by the user (or same as `webpage_url` for playlist entries) - + - `categories` (list): List of categories the video belongs to + - `tags` (list): List of tags assigned to the video + - `cast` (list): List of cast members + All the fields in [Filtering Formats](#filtering-formats) can also be used Available for the video that belongs to some logical chapter or section: @@ -1373,6 +1379,7 @@ Available for the video that belongs to some logical chapter or section: Available for the video that is an episode of some series or programme: - `series` (string): Title of the series or programme the video episode belongs to + - `series_id` (string): Id of the series or programme the video episode belongs to - `season` (string): Title of the season the video episode belongs to - `season_number` (numeric): Number of the season the video episode belongs to - `season_id` (string): Id of the season the video episode belongs to @@ -1385,11 +1392,16 @@ Available for the media that is a track or a part of a music album: - `track` (string): Title of the track - `track_number` (numeric): Number of the track within an album or a disc - `track_id` (string): Id of the track - - `artist` (string): Artist(s) of the track - - `genre` (string): Genre(s) of the track + - `artists` (list): Artist(s) of the track + - `artist` (string): Artist(s) of the track; comma-separated + - `genres` (list): Genre(s) of the track + - `genre` (string): Genre(s) of the track; comma-separated + - `composers` (list): Composer(s) of the piece + - `composer` (string): Composer(s) of the piece; comma-separated - `album` (string): Title of the album the track belongs to - `album_type` (string): Type of the album - - `album_artist` (string): List of all artists appeared on the album + - `album_artists` (list): All artists appeared on the album + - `album_artist` (string): All artists appeared on the album; comma-separated - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to Available only when using `--download-sections` and for `chapter:` prefix when using `--split-chapters` for videos with internal chapters: @@ -1767,10 +1779,11 @@ Metadata fields | From `description`, `synopsis` | `description` `purl`, `comment` | `webpage_url` `track` | `track_number` -`artist` | `artist`, `creator`, `uploader` or `uploader_id` -`genre` | `genre` +`artist` | `artist`, `artists`, `creator`, `creators`, `uploader` or `uploader_id` +`composer` | `composer` or `composers` +`genre` | `genre` or `genres` `album` | `album` -`album_artist` | `album_artist` +`album_artist` | `album_artist` or `album_artists` `disc` | `disc_number` `show` | `series` `season_number` | `season_number` diff --git a/devscripts/install_deps.py b/devscripts/install_deps.py index 715e5b044..889d9abeb 100755 --- a/devscripts/install_deps.py +++ b/devscripts/install_deps.py @@ -19,7 +19,7 @@ def parse_args(): parser.add_argument( 'input', nargs='?', metavar='TOMLFILE', default='pyproject.toml', help='Input file (default: %(default)s)') parser.add_argument( - '-e', '--exclude', metavar='REQUIREMENT', action='append', help='Exclude a required dependency') + '-e', '--exclude', metavar='DEPENDENCY', action='append', help='Exclude a dependency') parser.add_argument( '-i', '--include', metavar='GROUP', action='append', help='Include an optional dependency group') parser.add_argument( @@ -33,21 +33,28 @@ def parse_args(): def main(): args = parse_args() - toml_data = parse_toml(read_file(args.input)) - deps = toml_data['project']['dependencies'] - targets = deps.copy() if not args.only_optional else [] - - for exclude in args.exclude or []: - for dep in deps: - simplified_dep = re.match(r'[\w-]+', dep)[0] - if dep in targets and (exclude.lower() == simplified_dep.lower() or exclude == dep): - targets.remove(dep) - - optional_deps = toml_data['project']['optional-dependencies'] - for include in args.include or []: - group = optional_deps.get(include) - if group: - targets.extend(group) + project_table = parse_toml(read_file(args.input))['project'] + optional_groups = project_table['optional-dependencies'] + excludes = args.exclude or [] + + deps = [] + if not args.only_optional: # `-o` should exclude 'dependencies' and the 'default' group + deps.extend(project_table['dependencies']) + if 'default' not in excludes: # `--exclude default` should exclude entire 'default' group + deps.extend(optional_groups['default']) + + def name(dependency): + return re.match(r'[\w-]+', dependency)[0].lower() + + target_map = {name(dep): dep for dep in deps} + + for include in filter(None, map(optional_groups.get, args.include or [])): + target_map.update(zip(map(name, include), include)) + + for exclude in map(name, excludes): + target_map.pop(exclude, None) + + targets = list(target_map.values()) if args.print: for target in targets: diff --git a/pyproject.toml b/pyproject.toml index 0c9c5fc01..dda43288f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ ] [project.optional-dependencies] +default = [] secretstorage = [ "cffi", "secretstorage", diff --git a/test/helper.py b/test/helper.py index 4aca47025..7760fd8d7 100644 --- a/test/helper.py +++ b/test/helper.py @@ -223,6 +223,10 @@ def sanitize_got_info_dict(got_dict): if test_info_dict.get('display_id') == test_info_dict.get('id'): test_info_dict.pop('display_id') + # Remove deprecated fields + for old in YoutubeDL._deprecated_multivalue_fields.keys(): + test_info_dict.pop(old, None) + # release_year may be generated from release_date if try_call(lambda: test_info_dict['release_year'] == int(test_info_dict['release_date'][:4])): test_info_dict.pop('release_year') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 0087cbc94..6be47af97 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -941,7 +941,7 @@ class TestYoutubeDL(unittest.TestCase): def get_videos(filter_=None): ydl = YDL({'match_filter': filter_, 'simulate': True}) for v in videos: - ydl.process_ie_result(v, download=True) + ydl.process_ie_result(v.copy(), download=True) return [v['id'] for v in ydl.downloaded_info_dicts] res = get_videos() diff --git a/test/test_websockets.py b/test/test_websockets.py index 91bac3442..13b3a1e76 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -192,8 +192,8 @@ class TestWebsSocketRequestHandlerConformance: @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) @pytest.mark.parametrize('params,extensions', [ - ({'timeout': 0.00001}, {}), - ({}, {'timeout': 0.00001}), + ({'timeout': sys.float_info.min}, {}), + ({}, {'timeout': sys.float_info.min}), ]) def test_timeout(self, handler, params, extensions): with handler(**params) as rh: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e7d654d0f..ef66306b1 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -580,6 +580,13 @@ class YoutubeDL: 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' } + _deprecated_multivalue_fields = { + 'album_artist': 'album_artists', + 'artist': 'artists', + 'composer': 'composers', + 'creator': 'creators', + 'genre': 'genres', + } _format_selection_exts = { 'audio': set(MEDIA_EXTENSIONS.common_audio), 'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )), @@ -683,7 +690,6 @@ class YoutubeDL: self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) self._load_cookies(self.params['http_headers'].get('Cookie')) # compat self.params['http_headers'].pop('Cookie', None) - self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -957,6 +963,7 @@ class YoutubeDL: def close(self): self.save_cookies() self._request_director.close() + del self._request_director def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. @@ -2640,6 +2647,14 @@ class YoutubeDL: if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for old_key, new_key in self._deprecated_multivalue_fields.items(): + if new_key in info_dict and old_key in info_dict: + self.deprecation_warning(f'Do not return {old_key!r} when {new_key!r} is present') + elif old_value := info_dict.get(old_key): + info_dict[new_key] = old_value.split(', ') + elif new_value := info_dict.get(new_key): + info_dict[old_key] = ', '.join(v.replace(',', '\N{FULLWIDTH COMMA}') for v in new_value) + def _raise_pending_errors(self, info): err = info.pop('__pending_error', None) if err: @@ -3483,7 +3498,8 @@ class YoutubeDL: or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) - ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments', + ffmpeg_fixup(downloader == 'dashsegments' + and (info_dict.get('is_live') or info_dict.get('is_dash_periods')), 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) @@ -4144,6 +4160,10 @@ class YoutubeDL: director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0) return director + @functools.cached_property + def _request_director(self): + return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES) + def encode(self, s): if isinstance(s, bytes): return s # Already encoded diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 57a487157..4380b888d 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -14,7 +14,7 @@ import os import re import traceback -from .compat import compat_shlex_quote +from .compat import compat_os_name, compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .downloader.external import get_external_downloader from .extractor import list_extractor_classes @@ -984,7 +984,28 @@ def _real_main(argv=None): if pre_process: return ydl._download_retcode - ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) + args = sys.argv[1:] if argv is None else argv + ydl.warn_if_short_id(args) + + # Show a useful error message and wait for keypress if not launched from shell on Windows + if not args and compat_os_name == 'nt' and getattr(sys, 'frozen', False): + import ctypes.wintypes + import msvcrt + + kernel32 = ctypes.WinDLL('Kernel32') + + buffer = (1 * ctypes.wintypes.DWORD)() + attached_processes = kernel32.GetConsoleProcessList(buffer, 1) + # If we only have a single process attached, then the executable was double clicked + # When using `pyinstaller` with `--onefile`, two processes get attached + is_onefile = hasattr(sys, '_MEIPASS') and os.path.basename(sys._MEIPASS).startswith('_MEI') + if attached_processes == 1 or is_onefile and attached_processes == 2: + print(parser._generate_error_message( + 'Do not double-click the executable, instead call it from a command line.\n' + 'Please read the README for further information on how to use yt-dlp: ' + 'https://github.com/yt-dlp/yt-dlp#readme')) + msvcrt.getch() + _exit(2) parser.error( 'You must provide at least one URL.\n' 'Type yt-dlp --help to see a list of all options.') diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4cfa5b442..026d85fbb 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -320,7 +320,6 @@ from .cbs import ( CBSIE, ParamountPressExpressIE, ) -from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, CBSNewsIE, @@ -348,10 +347,6 @@ from .cgtn import CGTNIE from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE -from .chingari import ( - ChingariIE, - ChingariUserIE, -) from .chzzk import ( CHZZKLiveIE, CHZZKVideoIE, @@ -369,7 +364,6 @@ from .ciscolive import ( from .ciscowebex import CiscoWebexIE from .cjsw import CJSWIE from .clipchamp import ClipchampIE -from .cliphunter import CliphunterIE from .clippit import ClippitIE from .cliprs import ClipRsIE from .closertotruth import CloserToTruthIE @@ -379,7 +373,6 @@ from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE from .cnbc import ( - CNBCIE, CNBCVideoIE, ) from .cnn import ( @@ -445,6 +438,7 @@ from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, + DailymotionSearchIE, DailymotionUserIE, ) from .dailywire import ( @@ -476,7 +470,6 @@ from .dlf import ( ) from .dfb import DFBIE from .dhm import DHMIE -from .digg import DiggIE from .douyutv import ( DouyuShowIE, DouyuTVIE, @@ -610,7 +603,6 @@ from .fc2 import ( ) from .fczenit import FczenitIE from .fifa import FifaIE -from .filmmodu import FilmmoduIE from .filmon import ( FilmOnIE, FilmOnChannelIE, @@ -676,7 +668,6 @@ from .gab import ( GabIE, ) from .gaia import GaiaIE -from .gameinformer import GameInformerIE from .gamejolt import ( GameJoltIE, GameJoltUserIE, @@ -705,7 +696,6 @@ from .gettr import ( GettrStreamingIE, ) from .giantbomb import GiantBombIE -from .giga import GigaIE from .glide import GlideIE from .globalplayer import ( GlobalPlayerLiveIE, @@ -896,10 +886,8 @@ from .jtbc import ( from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kanal2 import Kanal2IE from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE from .kelbyone import KelbyOneIE from .khanacademy import ( KhanAcademyIE, @@ -915,13 +903,11 @@ from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE from .kompas import KompasVideoIE -from .konserthusetplay import KonserthusetPlayIE from .koo import KooIE from .kth import KTHIE from .krasview import KrasViewIE from .ku6 import Ku6IE from .kukululive import KukuluLiveIE -from .kusi import KUSIIE from .kuwo import ( KuwoIE, KuwoAlbumIE, @@ -1003,7 +989,6 @@ from .lnkgo import ( LnkGoIE, LnkIE, ) -from .localnews8 import LocalNews8IE from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, @@ -1030,7 +1015,6 @@ from .mailru import ( MailRuMusicSearchIE, ) from .mainstreaming import MainStreamingIE -from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, @@ -1074,7 +1058,6 @@ from .meipai import MeipaiIE from .melonvod import MelonVODIE from .metacritic import MetacriticIE from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE from .microsoftstream import MicrosoftStreamIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, @@ -1092,7 +1075,6 @@ from .minds import ( MindsChannelIE, MindsGroupIE, ) -from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .mirrativ import ( MirrativIE, @@ -1120,7 +1102,6 @@ from .mlssoccer import MLSSoccerIE from .mocha import MochaVideoIE from .mojvideo import MojvideoIE from .monstercat import MonstercatIE -from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, MotherlessGroupIE, @@ -1365,7 +1346,6 @@ from .nuvid import NuvidIE from .nzherald import NZHeraldIE from .nzonscreen import NZOnScreenIE from .nzz import NZZIE -from .odatv import OdaTVIE from .odkmedia import OnDemandChinaEpisodeIE from .odnoklassniki import OdnoklassnikiIE from .oftv import ( @@ -1477,7 +1457,6 @@ from .platzi import ( PlatziCourseIE, ) from .playplustv import PlayPlusTVIE -from .playstuff import PlayStuffIE from .playsuisse import PlaySuisseIE from .playtvak import PlaytvakIE from .playwire import PlaywireIE @@ -1599,7 +1578,6 @@ from .raywenderlich import ( RayWenderlichIE, RayWenderlichCourseIE, ) -from .rbmaradio import RBMARadioIE from .rbgtum import ( RbgTumIE, RbgTumCourseIE, @@ -1631,7 +1609,6 @@ from .redgifs import ( RedGifsUserIE, ) from .redtube import RedTubeIE -from .regiotv import RegioTVIE from .rentv import ( RENTVIE, RENTVArticleIE, @@ -1640,6 +1617,7 @@ from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rheinmaintv import RheinMainTVIE +from .ridehome import RideHomeIE from .rinsefm import ( RinseFMIE, RinseFMArtistPlaylistIE, @@ -1738,7 +1716,6 @@ from .safari import ( from .saitosan import SaitosanIE from .samplefocus import SampleFocusIE from .sapo import SapoIE -from .savefrom import SaveFromIE from .sbs import SBSIE from .sbscokr import ( SBSCoKrIE, @@ -1758,7 +1735,6 @@ from .scte import ( SCTECourseIE, ) from .scrolller import ScrolllerIE -from .seeker import SeekerIE from .sejmpl import SejmIE from .senalcolombia import SenalColombiaLiveIE from .senategov import SenateISVPIE, SenateGovIE @@ -1901,7 +1877,6 @@ from .storyfire import ( ) from .streamable import StreamableIE from .streamcz import StreamCZIE -from .streamff import StreamFFIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stripchat import StripchatIE @@ -1930,7 +1905,6 @@ from .tbsjp import ( TBSJPProgramIE, TBSJPPlaylistIE, ) -from .tdslifeway import TDSLifewayIE from .teachable import ( TeachableIE, TeachableCourseIE, @@ -2500,6 +2474,7 @@ from .zee5 import ( Zee5SeriesIE, ) from .zeenews import ZeeNewsIE +from .zenporn import ZenPornIE from .zetland import ZetlandDKArticleIE from .zhihu import ZhihuIE from .zingmp3 import ( diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index a7b614ca1..b21742281 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -245,7 +245,6 @@ class ABCIViewIE(InfoExtractor): 'episode_id': 'NC2203H039S00', 'season_number': 2022, 'season': 'Season 2022', - 'episode_number': None, 'episode': 'Locking Up Kids', 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/nc/NC2203H039S00_636d8a0944a22_1920.jpg', 'timestamp': 1668460497, @@ -271,8 +270,6 @@ class ABCIViewIE(InfoExtractor): 'episode_id': 'RF2004Q043S00', 'season_number': 2021, 'season': 'Season 2021', - 'episode_number': None, - 'episode': None, 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/rf/RF2004Q043S00_61a950639dbc0_1920.jpg', 'timestamp': 1638710705, diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 6453dde97..6742f75d5 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -259,7 +259,7 @@ class AbemaTVIE(AbemaTVBaseIE): 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', 'series': 'ゆるキャン△ SEASON2', 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', - 'series_number': 2, + 'season_number': 2, 'episode_number': 1, 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', }, diff --git a/yt_dlp/extractor/acfun.py b/yt_dlp/extractor/acfun.py index dc5792944..c3b4f432e 100644 --- a/yt_dlp/extractor/acfun.py +++ b/yt_dlp/extractor/acfun.py @@ -3,6 +3,7 @@ from ..utils import ( float_or_none, format_field, int_or_none, + str_or_none, traverse_obj, parse_codecs, parse_qs, @@ -129,7 +130,7 @@ class AcFunBangumiIE(AcFunVideoBaseIE): 'title': '红孩儿之趴趴蛙寻石记 第5话 ', 'duration': 760.0, 'season': '红孩儿之趴趴蛙寻石记', - 'season_id': 5023171, + 'season_id': '5023171', 'season_number': 1, # series has only 1 season 'episode': 'Episode 5', 'episode_number': 5, @@ -146,7 +147,7 @@ class AcFunBangumiIE(AcFunVideoBaseIE): 'title': '叽歪老表(第二季) 第5话 坚不可摧', 'season': '叽歪老表(第二季)', 'season_number': 2, - 'season_id': 6065485, + 'season_id': '6065485', 'episode': '坚不可摧', 'episode_number': 5, 'upload_date': '20220324', @@ -191,7 +192,7 @@ class AcFunBangumiIE(AcFunVideoBaseIE): 'title': json_bangumi_data.get('showTitle'), 'thumbnail': json_bangumi_data.get('image'), 'season': json_bangumi_data.get('bangumiTitle'), - 'season_id': season_id, + 'season_id': str_or_none(season_id), 'season_number': season_number, 'episode': json_bangumi_data.get('title'), 'episode_number': episode_number, diff --git a/yt_dlp/extractor/altcensored.py b/yt_dlp/extractor/altcensored.py index 0e1627bfd..a8428ce2e 100644 --- a/yt_dlp/extractor/altcensored.py +++ b/yt_dlp/extractor/altcensored.py @@ -22,7 +22,7 @@ class AltCensoredIE(InfoExtractor): 'title': "QUELLES SONT LES CONSÉQUENCES DE L'HYPERSEXUALISATION DE LA SOCIÉTÉ ?", 'display_id': 'k0srjLSkga8.webm', 'release_date': '20180403', - 'creator': 'Virginie Vota', + 'creators': ['Virginie Vota'], 'release_year': 2018, 'upload_date': '20230318', 'uploader': 'admin@altcensored.com', @@ -32,7 +32,7 @@ class AltCensoredIE(InfoExtractor): 'duration': 926.09, 'thumbnail': 'https://archive.org/download/youtube-k0srjLSkga8/youtube-k0srjLSkga8.thumbs/k0srjLSkga8_000925.jpg', 'view_count': int, - 'categories': ['News & Politics'], + 'categories': ['News & Politics'], # FIXME } }] @@ -62,14 +62,21 @@ class AltCensoredChannelIE(InfoExtractor): 'title': 'Virginie Vota', 'id': 'UCFPTO55xxHqFqkzRZHu4kcw', }, - 'playlist_count': 91 + 'playlist_count': 85, }, { 'url': 'https://altcensored.com/channel/UC9CcJ96HKMWn0LZlcxlpFTw', 'info_dict': { 'title': 'yukikaze775', 'id': 'UC9CcJ96HKMWn0LZlcxlpFTw', }, - 'playlist_count': 4 + 'playlist_count': 4, + }, { + 'url': 'https://altcensored.com/channel/UCfYbb7nga6-icsFWWgS-kWw', + 'info_dict': { + 'title': 'Mister Metokur', + 'id': 'UCfYbb7nga6-icsFWWgS-kWw', + }, + 'playlist_count': 121, }] def _real_extract(self, url): @@ -78,7 +85,7 @@ class AltCensoredChannelIE(InfoExtractor): url, channel_id, 'Download channel webpage', 'Unable to get channel webpage') title = self._html_search_meta('altcen_title', webpage, 'title', fatal=False) page_count = int_or_none(self._html_search_regex( - r']+href="/channel/\w+/page/(\d+)">(?:\1)', + r']+href="/channel/[\w-]+/page/(\d+)">(?:\1)', webpage, 'page count', default='1')) def page_func(page_num): diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 3bb6f2e31..41f3a4ff2 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -31,6 +31,7 @@ from ..utils import ( unified_timestamp, url_or_none, urlhandle_detect_ext, + variadic, ) @@ -49,7 +50,7 @@ class ArchiveOrgIE(InfoExtractor): 'release_date': '19681210', 'timestamp': 1268695290, 'upload_date': '20100315', - 'creator': 'SRI International', + 'creators': ['SRI International'], 'uploader': 'laura@archive.org', 'thumbnail': r're:https://archive\.org/download/.*\.jpg', 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr', @@ -109,7 +110,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': 'Turning', 'ext': 'flac', 'track': 'Turning', - 'creator': 'Grateful Dead', + 'creators': ['Grateful Dead'], 'display_id': 'gd1977-05-08d01t01.flac', 'track_number': 1, 'album': '1977-05-08 - Barton Hall - Cornell University', @@ -129,7 +130,7 @@ class ArchiveOrgIE(InfoExtractor): 'location': 'Barton Hall - Cornell University', 'duration': 438.68, 'track': 'Deal', - 'creator': 'Grateful Dead', + 'creators': ['Grateful Dead'], 'album': '1977-05-08 - Barton Hall - Cornell University', 'release_date': '19770508', 'display_id': 'gd1977-05-08d01t07.flac', @@ -167,7 +168,7 @@ class ArchiveOrgIE(InfoExtractor): 'upload_date': '20160610', 'description': 'md5:f70956a156645a658a0dc9513d9e78b7', 'uploader': 'dimitrios@archive.org', - 'creator': ['British Broadcasting Corporation', 'Time-Life Films'], + 'creators': ['British Broadcasting Corporation', 'Time-Life Films'], 'timestamp': 1465594947, }, 'playlist': [ @@ -257,7 +258,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': m['title'], 'description': clean_html(m.get('description')), 'uploader': dict_get(m, ['uploader', 'adder']), - 'creator': m.get('creator'), + 'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})), 'license': m.get('licenseurl'), 'release_date': unified_strdate(m.get('date')), 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])), @@ -272,7 +273,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': f.get('title') or f['name'], 'display_id': f['name'], 'description': clean_html(f.get('description')), - 'creator': f.get('creator'), + 'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})), 'duration': parse_duration(f.get('length')), 'track_number': int_or_none(f.get('track')), 'album': f.get('album'), @@ -300,7 +301,7 @@ class ArchiveOrgIE(InfoExtractor): is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig')) if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in): entry['formats'].append({ - 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], + 'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']), 'format': f.get('format'), 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), diff --git a/yt_dlp/extractor/axs.py b/yt_dlp/extractor/axs.py index 4b263725f..7e9166771 100644 --- a/yt_dlp/extractor/axs.py +++ b/yt_dlp/extractor/axs.py @@ -24,7 +24,8 @@ class AxsIE(InfoExtractor): 'timestamp': 1685729564, 'duration': 1284.216, 'series': 'Rock & Roll Road Trip with Sammy Hagar', - 'season': 2, + 'season': 'Season 2', + 'season_number': 2, 'episode': '3', 'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394', }, @@ -41,7 +42,8 @@ class AxsIE(InfoExtractor): 'timestamp': 1676403615, 'duration': 2570.668, 'series': 'The Big Interview with Dan Rather', - 'season': 3, + 'season': 'Season 3', + 'season_number': 3, 'episode': '5', 'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32', }, @@ -77,7 +79,7 @@ class AxsIE(InfoExtractor): 'title': ('title', {str}), 'description': ('description', {str}), 'series': ('seriestitle', {str}), - 'season': ('season', {int}), + 'season_number': ('season', {int}), 'episode': ('episode', {str}), 'duration': ('duration', {float_or_none}), 'timestamp': ('updated_at', {parse_iso8601}), diff --git a/yt_dlp/extractor/beeg.py b/yt_dlp/extractor/beeg.py index 52ee68eca..042b3220b 100644 --- a/yt_dlp/extractor/beeg.py +++ b/yt_dlp/extractor/beeg.py @@ -2,6 +2,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + str_or_none, traverse_obj, try_get, unified_timestamp, @@ -22,7 +23,7 @@ class BeegIE(InfoExtractor): 'age_limit': 18, 'upload_date': '20220131', 'timestamp': 1643656455, - 'display_id': 2540839, + 'display_id': '2540839', } }, { 'url': 'https://beeg.com/-0599050563103750?t=4-861', @@ -36,7 +37,7 @@ class BeegIE(InfoExtractor): 'age_limit': 18, 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9', 'timestamp': 1643623200, - 'display_id': 2569965, + 'display_id': '2569965', 'upload_date': '20220131', } }, { @@ -78,7 +79,7 @@ class BeegIE(InfoExtractor): return { 'id': video_id, - 'display_id': first_fact.get('id'), + 'display_id': str_or_none(first_fact.get('id')), 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')), 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')), 'timestamp': unified_timestamp(first_fact.get('fc_created')), diff --git a/yt_dlp/extractor/bellmedia.py b/yt_dlp/extractor/bellmedia.py index 5ae4b917a..677680b42 100644 --- a/yt_dlp/extractor/bellmedia.py +++ b/yt_dlp/extractor/bellmedia.py @@ -32,7 +32,7 @@ class BellMediaIE(InfoExtractor): 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', 'upload_date': '20180525', 'timestamp': 1527288600, - 'season_id': 73997, + 'season_id': '73997', 'season': '2018', 'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg', 'tags': [], diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index 5d0c73ff3..c4621ca82 100644 --- a/yt_dlp/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py @@ -93,7 +93,6 @@ class BFMTVArticleIE(BFMTVBaseIE): 'id': '6318445464112', 'ext': 'mp4', 'title': 'Le plein de bioéthanol fait de plus en plus mal à la pompe', - 'description': None, 'uploader_id': '876630703001', 'upload_date': '20230110', 'timestamp': 1673341692, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c138bde3a..f4e1c91a8 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1996,7 +1996,7 @@ class BiliIntlIE(BiliIntlBaseIE): 'title': get_element_by_class( 'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage), 'description': get_element_by_class( - 'bstar-meta__desc', webpage) or self._html_search_meta('og:description'), + 'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage), }, self._search_json_ld(webpage, video_id, default={})) def _get_comments_reply(self, root_id, next_id=0, display_id=None): diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 41367c5b9..194bf1f46 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -185,7 +185,6 @@ class BitChuteChannelIE(InfoExtractor): 'info_dict': { 'id': 'UGlrF9o9b-Q', 'ext': 'mp4', - 'filesize': None, 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', 'thumbnail': r're:^https?://.*\.jpg$', diff --git a/yt_dlp/extractor/bleacherreport.py b/yt_dlp/extractor/bleacherreport.py index 5e5155af2..e875957cf 100644 --- a/yt_dlp/extractor/bleacherreport.py +++ b/yt_dlp/extractor/bleacherreport.py @@ -4,10 +4,12 @@ from ..utils import ( ExtractorError, int_or_none, parse_iso8601, + str_or_none, ) class BleacherReportIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P\d+)' _TESTS = [{ 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', @@ -16,7 +18,7 @@ class BleacherReportIE(InfoExtractor): 'id': '2496438', 'ext': 'mp4', 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', - 'uploader_id': 3992341, + 'uploader_id': '3992341', 'description': 'CFB, ACC, Florida State', 'timestamp': 1434380212, 'upload_date': '20150615', @@ -33,7 +35,7 @@ class BleacherReportIE(InfoExtractor): 'timestamp': 1446839961, 'uploader': 'Sean Fay', 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', - 'uploader_id': 6466954, + 'uploader_id': '6466954', 'upload_date': '20151011', }, 'add_ie': ['Youtube'], @@ -58,7 +60,7 @@ class BleacherReportIE(InfoExtractor): 'id': article_id, 'title': article_data['title'], 'uploader': article_data.get('author', {}).get('name'), - 'uploader_id': article_data.get('authorId'), + 'uploader_id': str_or_none(article_data.get('authorId')), 'timestamp': parse_iso8601(article_data.get('createdAt')), 'thumbnails': thumbnails, 'comment_count': int_or_none(article_data.get('commentsCount')), @@ -82,6 +84,7 @@ class BleacherReportIE(InfoExtractor): class BleacherReportCMSIE(AMPIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index d97fbd758..cf830210f 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -76,6 +76,7 @@ class CBSBaseIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE class CBSIE(CBSBaseIE): + _WORKING = False _VALID_URL = r'''(?x) (?: cbs:| diff --git a/yt_dlp/extractor/cbsinteractive.py b/yt_dlp/extractor/cbsinteractive.py deleted file mode 100644 index b09e9823e..000000000 --- a/yt_dlp/extractor/cbsinteractive.py +++ /dev/null @@ -1,98 +0,0 @@ -from .cbs import CBSIE -from ..utils import int_or_none - - -class CBSInteractiveIE(CBSIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P[^/?]+)' - _TESTS = [{ - 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', - 'info_dict': { - 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', - 'display_id': 'hands-on-with-microsofts-windows-8-1-update', - 'ext': 'mp4', - 'title': 'Hands-on with Microsoft Windows 8.1 Update', - 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', - 'uploader': 'Sarah Mitroff', - 'duration': 70, - 'timestamp': 1396479627, - 'upload_date': '20140402', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', - 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', - 'info_dict': { - 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', - 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', - 'ext': 'mp4', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', - 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', - 'uploader': 'Ashley Esqueda', - 'duration': 1482, - 'timestamp': 1433289889, - 'upload_date': '20150603', - }, - }, { - 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', - 'info_dict': { - 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', - 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', - 'ext': 'mp4', - 'title': 'Video: Keeping Android smartphones and tablets secure', - 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', - 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', - 'uploader': 'Adrian Kingsley-Hughes', - 'duration': 731, - 'timestamp': 1449129925, - 'upload_date': '20151203', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', - 'only_matching': True, - }] - - MPX_ACCOUNTS = { - 'cnet': 2198311517, - 'zdnet': 2387448114, - } - - def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, display_id) - - data_json = self._html_search_regex( - r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", - webpage, 'data json') - data = self._parse_json(data_json, display_id) - vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] - - video_id = vdata['mpxRefId'] - - title = vdata['title'] - author = vdata.get('author') - if author: - uploader = '%s %s' % (author['firstName'], author['lastName']) - uploader_id = author.get('id') - else: - uploader = None - uploader_id = None - - info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) - info.update({ - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'duration': int_or_none(vdata.get('duration')), - 'uploader': uploader, - 'uploader_id': uploader_id, - }) - return info diff --git a/yt_dlp/extractor/cbssports.py b/yt_dlp/extractor/cbssports.py index b5d85af12..b9c82dab6 100644 --- a/yt_dlp/extractor/cbssports.py +++ b/yt_dlp/extractor/cbssports.py @@ -8,6 +8,7 @@ from ..utils import ( # class CBSSportsEmbedIE(CBSBaseIE): class CBSSportsEmbedIE(InfoExtractor): + _WORKING = False IE_NAME = 'cbssports:embed' _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? (?: @@ -75,6 +76,7 @@ class CBSSportsBaseIE(InfoExtractor): class CBSSportsIE(CBSSportsBaseIE): + _WORKING = False IE_NAME = 'cbssports' _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' _TESTS = [{ @@ -92,6 +94,7 @@ class CBSSportsIE(CBSSportsBaseIE): class TwentyFourSevenSportsIE(CBSSportsBaseIE): + _WORKING = False IE_NAME = '247sports' _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' _TESTS = [{ diff --git a/yt_dlp/extractor/cctv.py b/yt_dlp/extractor/cctv.py index 466bdfb7c..8552ee511 100644 --- a/yt_dlp/extractor/cctv.py +++ b/yt_dlp/extractor/cctv.py @@ -88,6 +88,20 @@ class CCTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # videoCenterId: "id" + 'url': 'http://news.cctv.com/2024/02/21/ARTIcU5tKIOIF2myEGCATkLo240221.shtml', + 'info_dict': { + 'id': '5c846c0518444308ba32c4159df3b3e0', + 'ext': 'mp4', + 'title': '《平“语”近人——习近平喜欢的典故》第三季 第5集:风物长宜放眼量', + 'uploader': 'yangjuan', + 'timestamp': 1708554940, + 'upload_date': '20240221', + }, + 'params': { + 'skip_download': True, + }, }, { # var ids = ["id"] 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', @@ -128,7 +142,7 @@ class CCTVIE(InfoExtractor): video_id = self._search_regex( [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', - r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'videoCenterId(?:["\']\s*,|:)\s*["\']([\da-fA-F]+)', r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 8390160a0..156b6a324 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -51,7 +51,7 @@ class CeskaTelevizeIE(InfoExtractor): 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'only_matching': True, 'info_dict': { - 'id': 402, + 'id': '402', 'ext': 'mp4', 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, diff --git a/yt_dlp/extractor/cgtn.py b/yt_dlp/extractor/cgtn.py index aaafa02d1..5d9d9bcde 100644 --- a/yt_dlp/extractor/cgtn.py +++ b/yt_dlp/extractor/cgtn.py @@ -17,6 +17,7 @@ class CGTNIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1615295940, 'upload_date': '20210309', + 'categories': ['Video'], }, 'params': { 'skip_download': True @@ -29,8 +30,8 @@ class CGTNIE(InfoExtractor): 'title': 'China, Indonesia vow to further deepen maritime cooperation', 'thumbnail': r're:^https?://.*\.png$', 'description': 'China and Indonesia vowed to upgrade their cooperation into the maritime sector and also for political security, economy, and cultural and people-to-people exchanges.', - 'author': 'CGTN', - 'category': 'China', + 'creators': ['CGTN'], + 'categories': ['China'], 'timestamp': 1622950200, 'upload_date': '20210606', }, @@ -45,7 +46,12 @@ class CGTNIE(InfoExtractor): webpage = self._download_webpage(url, video_id) download_url = self._html_search_regex(r'data-video ="(?P.+m3u8)"', webpage, 'download_url') - datetime_str = self._html_search_regex(r'\s*(.+?)\s*', webpage, 'datetime_str', fatal=False) + datetime_str = self._html_search_regex( + r'\s*(.+?)\s*', webpage, 'datetime_str', fatal=False) + category = self._html_search_regex( + r'\s*(.+?)\s*', webpage, 'category', fatal=False) + author = self._search_regex( + r'
\s*(.+?)\s*
', webpage, 'author', default=None) return { 'id': video_id, @@ -53,9 +59,7 @@ class CGTNIE(InfoExtractor): 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': self._extract_m3u8_formats(download_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'), - 'category': self._html_search_regex(r'\s*(.+?)\s*', - webpage, 'category', fatal=False), - 'author': self._html_search_regex(r'
\s*(.+?)\s*
', - webpage, 'author', default=None, fatal=False), + 'categories': [category] if category else None, + 'creators': [author] if author else None, 'timestamp': try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600), } diff --git a/yt_dlp/extractor/chingari.py b/yt_dlp/extractor/chingari.py deleted file mode 100644 index 48091dd65..000000000 --- a/yt_dlp/extractor/chingari.py +++ /dev/null @@ -1,207 +0,0 @@ -import itertools -import json -import urllib.parse - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - int_or_none, - str_to_int, - url_or_none, -) - - -class ChingariBaseIE(InfoExtractor): - def _get_post(self, id, post_data): - media_data = post_data['mediaLocation'] - base_url = media_data['base'] - author_data = post_data.get('authorData', {}) - song_data = post_data.get('song', {}) # revist this in future for differentiating b/w 'art' and 'author' - - formats = [{ - 'format_id': frmt, - 'width': str_to_int(frmt[1:]), - 'url': base_url + frmt_path, - } for frmt, frmt_path in media_data.get('transcoded', {}).items()] - - if media_data.get('path'): - formats.append({ - 'format_id': 'original', - 'format_note': 'Direct video.', - 'url': base_url + '/apipublic' + media_data['path'], - 'quality': 10, - }) - timestamp = str_to_int(post_data.get('created_at')) - if timestamp: - timestamp = int_or_none(timestamp, 1000) - - thumbnail, uploader_url = None, None - if media_data.get('thumbnail'): - thumbnail = base_url + media_data.get('thumbnail') - if author_data.get('username'): - uploader_url = 'https://chingari.io/' + author_data.get('username') - - return { - 'id': id, - 'extractor_key': ChingariIE.ie_key(), - 'extractor': 'Chingari', - 'title': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), - 'description': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), - 'duration': media_data.get('duration'), - 'thumbnail': url_or_none(thumbnail), - 'like_count': post_data.get('likeCount'), - 'view_count': post_data.get('viewsCount'), - 'comment_count': post_data.get('commentCount'), - 'repost_count': post_data.get('shareCount'), - 'timestamp': timestamp, - 'uploader_id': post_data.get('userId') or author_data.get('_id'), - 'uploader': author_data.get('name'), - 'uploader_url': url_or_none(uploader_url), - 'track': song_data.get('title'), - 'artist': song_data.get('author'), - 'formats': formats, - } - - -class ChingariIE(ChingariBaseIE): - _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P[^&/#?]+)' - _TESTS = [{ - 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', - 'info_dict': { - 'id': '612f8f4ce1dc57090e8a7beb', - 'ext': 'mp4', - 'title': 'Happy birthday Srila Prabhupada', - 'description': 'md5:c7080ebfdfeb06016e638c286d6bc3fa', - 'duration': 0, - 'thumbnail': 'https://media.chingari.io/uploads/c41d30e2-06b6-4e3b-9b4b-edbb929cec06-1630506826911/thumbnail/198f993f-ce87-4623-82c6-cd071bd6d4f4-1630506828016.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1630506828, - 'upload_date': '20210901', - 'uploader_id': '5f0403982c8bd344f4813f8c', - 'uploader': 'ISKCON,Inc.', - 'uploader_url': 'https://chingari.io/iskcon,inc', - 'track': None, - 'artist': None, - }, - 'params': {'skip_download': True} - }] - - def _real_extract(self, url): - id = self._match_id(url) - post_json = self._download_json(f'https://api.chingari.io/post/post_details/{id}', id) - if post_json['code'] != 200: - raise ExtractorError(post_json['message'], expected=True) - post_data = post_json['data'] - return self._get_post(id, post_data) - - -class ChingariUserIE(ChingariBaseIE): - _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P[^/?]+)' - _TESTS = [{ - 'url': 'https://chingari.io/dada1023', - 'info_dict': { - 'id': 'dada1023', - }, - 'params': {'playlistend': 3}, - 'playlist': [{ - 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a', - 'info_dict': { - 'id': '614781f3ade60b3a0bfff42a', - 'ext': 'mp4', - 'title': '#chingaribappa ', - 'description': 'md5:d1df21d84088770468fa63afe3b17857', - 'duration': 7, - 'thumbnail': 'https://media.chingari.io/uploads/346d86d4-abb2-474e-a164-ffccf2bbcb72-1632076273717/thumbnail/b0b3aac2-2b86-4dd1-909d-9ed6e57cf77c-1632076275552.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1632076275, - 'upload_date': '20210919', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }, { - 'url': 'https://chingari.io/share/post?id=6146b132bcbf860959e12cba', - 'info_dict': { - 'id': '6146b132bcbf860959e12cba', - 'ext': 'mp4', - 'title': 'Tactor harvesting', - 'description': 'md5:8403f12dce68828b77ecee7eb7e887b7', - 'duration': 59.3, - 'thumbnail': 'https://media.chingari.io/uploads/b353ca70-7a87-400d-93a6-fa561afaec86-1632022814584/thumbnail/c09302e3-2043-41b1-a2fe-77d97e5bd676-1632022834260.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1632022834, - 'upload_date': '20210919', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }, { - 'url': 'https://chingari.io/share/post?id=6145651b74cb030a64c40b82', - 'info_dict': { - 'id': '6145651b74cb030a64c40b82', - 'ext': 'mp4', - 'title': '#odiabhajan ', - 'description': 'md5:687ea36835b9276cf2af90f25e7654cb', - 'duration': 56.67, - 'thumbnail': 'https://media.chingari.io/uploads/6cbf216b-babc-4cce-87fe-ceaac8d706ac-1631937782708/thumbnail/8855754f-6669-48ce-b269-8cc0699ed6da-1631937819522.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1631937819, - 'upload_date': '20210918', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }], - }, { - 'url': 'https://chingari.io/iskcon%2Cinc', - 'playlist_mincount': 1025, - 'info_dict': { - 'id': 'iskcon%2Cinc', - }, - }] - - def _entries(self, id): - skip = 0 - has_more = True - for page in itertools.count(): - posts = self._download_json('https://api.chingari.io/users/getPosts', id, - data=json.dumps({'userId': id, 'ownerId': id, 'skip': skip, 'limit': 20}).encode(), - headers={'content-type': 'application/json;charset=UTF-8'}, - note='Downloading page %s' % page) - for post in posts.get('data', []): - post_data = post['post'] - yield self._get_post(post_data['_id'], post_data) - skip += 20 - has_more = posts['hasMoreData'] - if not has_more: - break - - def _real_extract(self, url): - alt_id = self._match_id(url) - post_json = self._download_json(f'https://api.chingari.io/user/{alt_id}', alt_id) - if post_json['code'] != 200: - raise ExtractorError(post_json['message'], expected=True) - id = post_json['data']['_id'] - return self.playlist_result(self._entries(id), playlist_id=alt_id) diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py index 6894baea5..420fe0514 100644 --- a/yt_dlp/extractor/chzzk.py +++ b/yt_dlp/extractor/chzzk.py @@ -2,7 +2,7 @@ import functools from .common import InfoExtractor from ..utils import ( - ExtractorError, + UserNotLive, float_or_none, int_or_none, parse_iso8601, @@ -40,7 +40,7 @@ class CHZZKLiveIE(InfoExtractor): note='Downloading channel info', errnote='Unable to download channel info')['content'] if live_detail.get('status') == 'CLOSE': - raise ExtractorError('The channel is not currently live', expected=True) + raise UserNotLive(video_id=channel_id) live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id) diff --git a/yt_dlp/extractor/cinemax.py b/yt_dlp/extractor/cinemax.py index 54cab2285..706ec8553 100644 --- a/yt_dlp/extractor/cinemax.py +++ b/yt_dlp/extractor/cinemax.py @@ -2,6 +2,7 @@ from .hbo import HBOBaseIE class CinemaxIE(HBOBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P[^/]+/video/[0-9a-z-]+-(?P\d+))' _TESTS = [{ 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', diff --git a/yt_dlp/extractor/cliphunter.py b/yt_dlp/extractor/cliphunter.py deleted file mode 100644 index 2b907dc80..000000000 --- a/yt_dlp/extractor/cliphunter.py +++ /dev/null @@ -1,76 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - url_or_none, -) - - -class CliphunterIE(InfoExtractor): - IE_NAME = 'cliphunter' - - _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ - (?P[0-9]+)/ - (?P.+?)(?:$|[#\?]) - ''' - _TESTS = [{ - 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', - 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', - 'info_dict': { - 'id': '1012420', - 'ext': 'flv', - 'title': 'Fun Jynx Maze solo', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', - 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', - 'info_dict': { - 'id': '2019449', - 'ext': 'mp4', - 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_title = self._search_regex( - r'mediaTitle = "([^"]+)"', webpage, 'title') - - gexo_files = self._parse_json( - self._search_regex( - r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), - video_id) - - formats = [] - for format_id, f in gexo_files.items(): - video_url = url_or_none(f.get('url')) - if not video_url: - continue - fmt = f.get('fmt') - height = f.get('h') - format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'width': int_or_none(f.get('w')), - 'height': int_or_none(height), - 'tbr': int_or_none(f.get('br')), - }) - - thumbnail = self._search_regex( - r"var\s+mov_thumb\s*=\s*'([^']+)';", - webpage, 'thumbnail', fatal=False) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, - 'age_limit': self._rta_search(webpage), - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/cliprs.py b/yt_dlp/extractor/cliprs.py index 567f77b94..c2add02da 100644 --- a/yt_dlp/extractor/cliprs.py +++ b/yt_dlp/extractor/cliprs.py @@ -2,6 +2,7 @@ from .onet import OnetBaseIE class ClipRsIE(OnetBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' _TEST = { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index e78e26a11..1f9a5f611 100644 --- a/yt_dlp/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py @@ -4,6 +4,7 @@ from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index c4c7d66a5..a812c24af 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -4,27 +4,25 @@ from .common import InfoExtractor class CloudflareStreamIE(InfoExtractor): + _SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?' _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' - _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE + _EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo=' _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:watch\.)?%s/| - %s - ) - (?P%s) - ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) - _EMBED_REGEX = [fr']+\bsrc=(["\'])(?P(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1'] + _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P{_ID_RE})' + _EMBED_REGEX = [ + rf']+\bsrc=(["\'])(?P(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1', + rf']+\bsrc=["\'](?Phttps?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})', + ] _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { 'id': '31c9291ab41fac05471db4e73aa11717', 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', + 'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg', }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', @@ -35,6 +33,21 @@ class CloudflareStreamIE(InfoExtractor): }, { 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', 'only_matching': True, + }, { + 'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://upride.cc/incident/shoulder-pass-at-light/', + 'info_dict': { + 'id': 'eaef9dea5159cf968be84241b5cedfe7', + 'ext': 'mp4', + 'title': 'eaef9dea5159cf968be84241b5cedfe7', + 'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py index 7d209b6d9..cedfd3ef9 100644 --- a/yt_dlp/extractor/cnbc.py +++ b/yt_dlp/extractor/cnbc.py @@ -1,68 +1,97 @@ from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import int_or_none, parse_iso8601, str_or_none, url_or_none +from ..utils.traversal import traverse_obj -class CNBCIE(InfoExtractor): - _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P[0-9]+)' - _TEST = { - 'url': 'http://video.cnbc.com/gallery/?video=3000503714', +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/?#]+/)+(?P[^./?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.cnbc.com/video/2023/12/07/mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand.html', 'info_dict': { - 'id': '3000503714', 'ext': 'mp4', - 'title': 'Fighting zombies is big business', - 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', - 'timestamp': 1459332000, - 'upload_date': '20160330', - 'uploader': 'NBCU-CNBC', + 'id': '107344774', + 'display_id': 'mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand', + 'modified_timestamp': 1702053483, + 'timestamp': 1701977810, + 'channel': 'News Videos', + 'upload_date': '20231207', + 'description': 'md5:882c001d85cb43d7579b514307b3e78b', + 'release_timestamp': 1701977375, + 'modified_date': '20231208', + 'release_date': '20231207', + 'duration': 65, + 'creators': ['Sean Conlon'], + 'title': 'Here\'s a first look at McDonald\'s new spinoff brand, CosMc\'s', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107344192-1701894812493-CosMcsskyHero_2336x1040_hero-desktop.jpg?v=1701894855', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Dead link', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, - {'force_smil_url': True}), - 'id': video_id, - } - - -class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P/video/(?:[^/]+/)+(?P[^./?#&]+)\.html)' - _TEST = { - 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.cnbc.com/video/2023/12/08/jim-cramer-shares-his-take-on-seattles-tech-scene.html', 'info_dict': { - 'id': '7000031301', + 'creators': ['Jim Cramer'], + 'channel': 'Mad Money with Jim Cramer', + 'description': 'md5:72925be21b952e95eba51178dddf4e3e', + 'duration': 299.0, 'ext': 'mp4', - 'title': "Trump: I don't necessarily agree with raising rates", - 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', - 'timestamp': 1531958400, - 'upload_date': '20180719', - 'uploader': 'NBCU-CNBC', + 'id': '107345451', + 'display_id': 'jim-cramer-shares-his-take-on-seattles-tech-scene', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345481-1702079431MM-B-120823.jpg?v=1702079430', + 'timestamp': 1702080139, + 'title': 'Jim Cramer shares his take on Seattle\'s tech scene', + 'release_date': '20231208', + 'upload_date': '20231209', + 'modified_timestamp': 1702080139, + 'modified_date': '20231209', + 'release_timestamp': 1702073551, }, - 'params': { - 'skip_download': True, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.cnbc.com/video/2023/12/08/the-epicenter-of-ai-is-in-seattle-says-jim-cramer.html', + 'info_dict': { + 'creators': ['Jim Cramer'], + 'channel': 'Mad Money with Jim Cramer', + 'description': 'md5:72925be21b952e95eba51178dddf4e3e', + 'duration': 113.0, + 'ext': 'mp4', + 'id': '107345474', + 'display_id': 'the-epicenter-of-ai-is-in-seattle-says-jim-cramer', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345486-Screenshot_2023-12-08_at_70339_PM.png?v=1702080248', + 'timestamp': 1702080535, + 'title': 'The epicenter of AI is in Seattle, says Jim Cramer', + 'release_timestamp': 1702077347, + 'modified_timestamp': 1702080535, + 'release_date': '20231208', + 'upload_date': '20231209', + 'modified_date': '20231209', }, - 'skip': 'Dead link', - } + 'expected_warnings': ['Unable to download f4m manifest'], + }] def _real_extract(self, url): - path, display_id = self._match_valid_url(url).groups() - video_id = self._download_json( - 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ - 'query': '''{ - page(path: "%s") { - vcpsId - } -}''' % path, - })['data']['page']['vcpsId'] - return self.url_result( - 'http://video.cnbc.com/gallery/?video=%d' % video_id, - CNBCIE.ie_key()) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_json(r'window\.__s_data=', webpage, 'video data', display_id) + + player_data = traverse_obj(data, ( + 'page', 'page', 'layout', ..., 'columns', ..., 'modules', + lambda _, v: v['name'] == 'clipPlayer', 'data', {dict}), get_all=False) + + return { + 'id': display_id, + 'display_id': display_id, + 'formats': self._extract_akamai_formats(player_data['playbackURL'], display_id), + **self._search_json_ld(webpage, display_id, fatal=False), + **traverse_obj(player_data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'creators': ('author', ..., 'name', {str}), + 'timestamp': ('datePublished', {parse_iso8601}), + 'release_timestamp': ('uploadDate', {parse_iso8601}), + 'modified_timestamp': ('dateLastPublished', {parse_iso8601}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'channel': ('section', 'title', {str}), + }), + } diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index af534775f..f57963da2 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -247,6 +247,8 @@ class InfoExtractor: (For internal use only) * http_chunk_size Chunk size for HTTP downloads * ffmpeg_args Extra arguments for ffmpeg downloader + * is_dash_periods Whether the format is a result of merging + multiple DASH periods. RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -260,7 +262,7 @@ class InfoExtractor: direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily + display_id: An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" @@ -278,7 +280,7 @@ class InfoExtractor: description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The creator of the video. + creators: List of creators of the video. timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date in UTC (YYYYMMDD). If not explicitly set, calculated from timestamp @@ -422,16 +424,16 @@ class InfoExtractor: track_number: Number of the track within an album or a disc, as an integer. track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. + artists: List of artists of the track. + composers: List of composers of the piece. + genres: List of genres of the track. album: Title of the album the track belongs to. album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). + album_artists: List of all artists appeared on the album. + E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"]. + Useful for splits and compilations. disc_number: Number of the disc or other physical medium the track belongs to, as an integer. - composer: Composer of the piece The following fields should only be set for clips that should be cut from the original video: @@ -442,6 +444,18 @@ class InfoExtractor: rows: Number of rows in each storyboard fragment, as an integer columns: Number of columns in each storyboard fragment, as an integer + The following fields are deprecated and should not be set by new code: + composer: Use "composers" instead. + Composer(s) of the piece, comma-separated. + artist: Use "artists" instead. + Artist(s) of the track, comma-separated. + genre: Use "genres" instead. + Genre(s) of the track, comma-separated. + album_artist: Use "album_artists" instead. + All artists appeared on the album, comma-separated. + creator: Use "creators" instead. + The creator of the video. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -2530,7 +2544,11 @@ class InfoExtractor: self._report_ignoring_subs('DASH') return fmts - def _extract_mpd_formats_and_subtitles( + def _extract_mpd_formats_and_subtitles(self, *args, **kwargs): + periods = self._extract_mpd_periods(*args, **kwargs) + return self._merge_mpd_periods(periods) + + def _extract_mpd_periods( self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): @@ -2543,17 +2561,16 @@ class InfoExtractor: errnote='Failed to download MPD manifest' if errnote is None else errnote, fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [], {} + return [] mpd_doc, urlh = res if mpd_doc is None: - return [], {} + return [] # We could have been redirected to a new url when we retrieved our mpd file. mpd_url = urlh.url mpd_base_url = base_url(mpd_url) - return self._parse_mpd_formats_and_subtitles( - mpd_doc, mpd_id, mpd_base_url, mpd_url) + return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url) def _parse_mpd_formats(self, *args, **kwargs): fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) @@ -2561,8 +2578,39 @@ class InfoExtractor: self._report_ignoring_subs('DASH') return fmts - def _parse_mpd_formats_and_subtitles( - self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + def _parse_mpd_formats_and_subtitles(self, *args, **kwargs): + periods = self._parse_mpd_periods(*args, **kwargs) + return self._merge_mpd_periods(periods) + + def _merge_mpd_periods(self, periods): + """ + Combine all formats and subtitles from an MPD manifest into a single list, + by concatenate streams with similar formats. + """ + formats, subtitles = {}, {} + for period in periods: + for f in period['formats']: + assert 'is_dash_periods' not in f, 'format already processed' + f['is_dash_periods'] = True + format_key = tuple(v for k, v in f.items() if k not in ( + ('format_id', 'fragments', 'manifest_stream_number'))) + if format_key not in formats: + formats[format_key] = f + elif 'fragments' in f: + formats[format_key].setdefault('fragments', []).extend(f['fragments']) + + if subtitles and period['subtitles']: + self.report_warning(bug_reports_message( + 'Found subtitles in multiple periods in the DASH manifest; ' + 'if part of the subtitles are missing,' + ), only_once=True) + + for sub_lang, sub_info in period['subtitles'].items(): + subtitles.setdefault(sub_lang, []).extend(sub_info) + + return list(formats.values()), subtitles + + def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2641,9 +2689,13 @@ class InfoExtractor: return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats, subtitles = [], {} stream_numbers = collections.defaultdict(int) - for period in mpd_doc.findall(_add_ns('Period')): + for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))): + period_entry = { + 'id': period.get('id', f'period-{period_idx}'), + 'formats': [], + 'subtitles': collections.defaultdict(list), + } period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { 'start_number': 1, @@ -2893,11 +2945,10 @@ class InfoExtractor: if content_type in ('video', 'audio', 'image/jpeg'): f['manifest_stream_number'] = stream_numbers[f['url']] stream_numbers[f['url']] += 1 - formats.append(f) + period_entry['formats'].append(f) elif content_type == 'text': - subtitles.setdefault(lang or 'und', []).append(f) - - return formats, subtitles + period_entry['subtitles'][lang or 'und'].append(f) + yield period_entry def _extract_ism_formats(self, *args, **kwargs): fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) diff --git a/yt_dlp/extractor/cpac.py b/yt_dlp/extractor/cpac.py index 0f23f2be2..32bba1e5a 100644 --- a/yt_dlp/extractor/cpac.py +++ b/yt_dlp/extractor/cpac.py @@ -65,7 +65,7 @@ class CPACIE(InfoExtractor): 'title': title, 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), - 'category': [category] if category else None, + 'categories': [category] if category else None, 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), 'is_live': is_live(content['details'].get('type')), } diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py index 5d3733143..3a05ed48a 100644 --- a/yt_dlp/extractor/craftsy.py +++ b/yt_dlp/extractor/craftsy.py @@ -1,12 +1,13 @@ +import json + from .brightcove import BrightcoveNewIE from .common import InfoExtractor - from ..utils import ( - dict_get, - get_element_by_id, - js_to_json, - traverse_obj, + extract_attributes, + get_element_html_by_class, + get_element_text_and_html_by_tag, ) +from ..utils.traversal import traverse_obj class CraftsyIE(InfoExtractor): @@ -41,28 +42,34 @@ class CraftsyIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_data = self._parse_json(self._search_regex( - r'class_video_player_vars\s*=\s*({.*})\s*;', - get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), - 'video data'), video_id, transform_source=js_to_json) + video_player = get_element_html_by_class('class-video-player', webpage) + video_data = traverse_obj(video_player, ( + {extract_attributes}, 'wire:snapshot', {json.loads}, 'data', {dict})) or {} + video_js = traverse_obj(video_player, ( + {lambda x: get_element_text_and_html_by_tag('video-js', x)}, 1, {extract_attributes})) or {} + + has_access = video_data.get('userHasAccess') + lessons = traverse_obj(video_data, ('lessons', ..., ..., lambda _, v: v['video_id'])) + + preview_id = video_js.get('data-video-id') + if preview_id and preview_id not in traverse_obj(lessons, (..., 'video_id')): + if not lessons and not has_access: + self.report_warning( + 'Only extracting preview. For the full class, pass cookies ' + + f'from an account that has access. {self._login_hint()}') + lessons.append({'video_id': preview_id}) - account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + if not lessons and not has_access: + self.raise_login_required('You do not have access to this class') - entries = [] - class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) - if class_preview: - v_id = class_preview.get('video_id') - entries.append(self.url_result( - f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', - BrightcoveNewIE, v_id, class_preview.get('title'))) + account_id = video_data.get('accountId') or video_js['data-account'] - if dict_get(video_data, ('is_free', 'user_has_access')): - entries += [ - self.url_result( + def entries(lessons): + for lesson in lessons: + yield self.url_result( f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', BrightcoveNewIE, lesson['video_id'], lesson.get('title')) - for lesson in video_data['lessons']] return self.playlist_result( - entries, video_id, video_data.get('class_title'), + entries(lessons), video_id, self._html_search_meta(('og:title', 'twitter:title'), webpage), self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index ee34aced5..8d997debf 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -514,7 +514,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'track': 'Egaono Hana', 'artist': 'Goose house', 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', - 'genre': ['J-Pop'], + 'genres': ['J-Pop'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -527,7 +527,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'track': 'Crossing Field', 'artist': 'LiSA', 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', - 'genre': ['Anime'], + 'genres': ['Anime'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -541,7 +541,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'artist': 'LiSA', 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'description': 'md5:747444e7e6300907b7a43f0a0503072e', - 'genre': ['J-Pop'], + 'genres': ['J-Pop'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -594,7 +594,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'width': ('width', {int_or_none}), 'height': ('height', {int_or_none}), }), - 'genre': ('genres', ..., 'displayValue'), + 'genres': ('genres', ..., 'displayValue'), 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), }), } @@ -611,7 +611,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE): 'info_dict': { 'id': 'MA179CB50D', 'title': 'LiSA', - 'genre': ['J-Pop', 'Anime', 'Rock'], + 'genres': ['J-Pop', 'Anime', 'Rock'], 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', }, 'playlist_mincount': 83, @@ -645,6 +645,6 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE): 'width': ('width', {int_or_none}), 'height': ('height', {int_or_none}), }), - 'genre': ('genres', ..., 'displayValue'), + 'genres': ('genres', ..., 'displayValue'), }), } diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index c4c78ee1b..614d0cd9e 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -114,7 +114,7 @@ class CybraryCourseIE(CybraryBaseIE): _TESTS = [{ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', 'info_dict': { - 'id': 898, + 'id': '898', 'title': 'AZ-500: Microsoft Azure Security Technologies', 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' }, @@ -122,7 +122,7 @@ class CybraryCourseIE(CybraryBaseIE): }, { 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', 'info_dict': { - 'id': 1245, + 'id': '1245', 'title': 'Cybrary Orientation', 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' }, diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 708d6fed2..c570a4f52 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -1,6 +1,7 @@ import functools import json import re +import urllib.parse from .common import InfoExtractor from ..networking.exceptions import HTTPError @@ -44,36 +45,41 @@ class DailymotionBaseInfoExtractor(InfoExtractor): self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self.get_param('age_limit')) self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') + def _get_token(self, xid): + cookies = self._get_dailymotion_cookies() + token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') + if token: + return token + + data = { + 'client_id': 'f1a362d288c1b98099c7', + 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', + } + username, password = self._get_login_info() + if username: + data.update({ + 'grant_type': 'password', + 'password': password, + 'username': username, + }) + else: + data['grant_type'] = 'client_credentials' + try: + token = self._download_json( + 'https://graphql.api.dailymotion.com/oauth/token', + None, 'Downloading Access Token', + data=urlencode_postdata(data))['access_token'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError(self._parse_json( + e.cause.response.read().decode(), xid)['error_description'], expected=True) + raise + self._set_dailymotion_cookie('access_token' if username else 'client_token', token) + return token + def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): if not self._HEADERS.get('Authorization'): - cookies = self._get_dailymotion_cookies() - token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') - if not token: - data = { - 'client_id': 'f1a362d288c1b98099c7', - 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', - } - username, password = self._get_login_info() - if username: - data.update({ - 'grant_type': 'password', - 'password': password, - 'username': username, - }) - else: - data['grant_type'] = 'client_credentials' - try: - token = self._download_json( - 'https://graphql.api.dailymotion.com/oauth/token', - None, 'Downloading Access Token', - data=urlencode_postdata(data))['access_token'] - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 400: - raise ExtractorError(self._parse_json( - e.cause.response.read().decode(), xid)['error_description'], expected=True) - raise - self._set_dailymotion_cookie('access_token' if username else 'client_token', token) - self._HEADERS['Authorization'] = 'Bearer ' + token + self._HEADERS['Authorization'] = f'Bearer {self._get_token(xid)}' resp = self._download_json( 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ @@ -393,9 +399,55 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): yield '//dailymotion.com/playlist/%s' % p +class DailymotionSearchIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:search' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/search/(?P[^/?#]+)/videos' + _PAGE_SIZE = 20 + _TESTS = [{ + 'url': 'http://www.dailymotion.com/search/king of turtles/videos', + 'info_dict': { + 'id': 'king of turtles', + 'title': 'king of turtles', + }, + 'playlist_mincount': 90, + }] + _SEARCH_QUERY = 'query SEARCH_QUERY( $query: String! $page: Int $limit: Int ) { search { videos( query: $query first: $limit page: $page ) { edges { node { xid } } } } } ' + + def _call_search_api(self, term, page, note): + if not self._HEADERS.get('Authorization'): + self._HEADERS['Authorization'] = f'Bearer {self._get_token(term)}' + resp = self._download_json( + 'https://graphql.api.dailymotion.com/', None, note, data=json.dumps({ + 'operationName': 'SEARCH_QUERY', + 'query': self._SEARCH_QUERY, + 'variables': { + 'limit': 20, + 'page': page, + 'query': term, + } + }).encode(), headers=self._HEADERS) + obj = traverse_obj(resp, ('data', 'search', {dict})) + if not obj: + raise ExtractorError( + traverse_obj(resp, ('errors', 0, 'message', {str})) or 'Could not fetch search data') + + return obj + + def _fetch_page(self, term, page): + page += 1 + response = self._call_search_api(term, page, f'Searching "{term}" page {page}') + for xid in traverse_obj(response, ('videos', 'edges', ..., 'node', 'xid')): + yield self.url_result(f'https://www.dailymotion.com/video/{xid}', DailymotionIE, xid) + + def _real_extract(self, url): + term = urllib.parse.unquote_plus(self._match_id(url)) + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, term), self._PAGE_SIZE), term, term) + + class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { diff --git a/yt_dlp/extractor/damtomo.py b/yt_dlp/extractor/damtomo.py index 0e08e4f65..5e14d6aff 100644 --- a/yt_dlp/extractor/damtomo.py +++ b/yt_dlp/extractor/damtomo.py @@ -83,7 +83,6 @@ class DamtomoRecordIE(DamtomoBaseIE): 'info_dict': { 'id': '27376862', 'title': 'イカSUMMER [良音]', - 'description': None, 'uploader': 'NANA', 'uploader_id': 'MzAyMDExNTY', 'upload_date': '20210721', diff --git a/yt_dlp/extractor/daum.py b/yt_dlp/extractor/daum.py index 3ef514065..24c520855 100644 --- a/yt_dlp/extractor/daum.py +++ b/yt_dlp/extractor/daum.py @@ -27,7 +27,7 @@ class DaumIE(DaumBaseIE): 'duration': 2117, 'view_count': int, 'comment_count': int, - 'uploader_id': 186139, + 'uploader_id': '186139', 'uploader': '콘간지', 'timestamp': 1387310323, }, @@ -44,7 +44,7 @@ class DaumIE(DaumBaseIE): 'view_count': int, 'comment_count': int, 'uploader': 'MBC 예능', - 'uploader_id': 132251, + 'uploader_id': '132251', 'timestamp': 1421604228, }, }, { @@ -63,7 +63,7 @@ class DaumIE(DaumBaseIE): 'view_count': int, 'comment_count': int, 'uploader': '까칠한 墮落始祖 황비홍님의', - 'uploader_id': 560824, + 'uploader_id': '560824', 'timestamp': 1203770745, }, }, { @@ -77,7 +77,7 @@ class DaumIE(DaumBaseIE): 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'upload_date': '20170129', 'uploader': '쇼! 음악중심', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'timestamp': 1485684628, }, }] @@ -107,7 +107,7 @@ class DaumClipIE(DaumBaseIE): 'duration': 3868, 'view_count': int, 'uploader': 'GOMeXP', - 'uploader_id': 6667, + 'uploader_id': '6667', 'timestamp': 1377911092, }, }, { diff --git a/yt_dlp/extractor/digg.py b/yt_dlp/extractor/digg.py deleted file mode 100644 index 86e8a6fac..000000000 --- a/yt_dlp/extractor/digg.py +++ /dev/null @@ -1,54 +0,0 @@ -from .common import InfoExtractor -from ..utils import js_to_json - - -class DiggIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' - _TESTS = [{ - # JWPlatform via provider - 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', - 'info_dict': { - 'id': 'LcqvmS0b', - 'ext': 'mp4', - 'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'", - 'description': 'md5:541bb847648b6ee3d6514bc84b82efda', - 'upload_date': '20180109', - 'timestamp': 1515530551, - }, - 'params': { - 'skip_download': True, - }, - }, { - # Youtube via provider - 'url': 'http://digg.com/video/dog-boat-seal-play', - 'only_matching': True, - }, { - # vimeo as regular embed - 'url': 'http://digg.com/video/dream-girl-short-film', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - info = self._parse_json( - self._search_regex( - r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info', - default='{}'), display_id, transform_source=js_to_json, - fatal=False) - - video_id = info.get('video_id') - - if video_id: - provider = info.get('provider_name') - if provider == 'youtube': - return self.url_result( - video_id, ie='Youtube', video_id=video_id) - elif provider == 'jwplayer': - return self.url_result( - 'jwplatform:%s' % video_id, ie='JWPlatform', - video_id=video_id) - - return self.url_result(url, 'Generic') diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py index 25a98f625..bb06c42be 100644 --- a/yt_dlp/extractor/dtube.py +++ b/yt_dlp/extractor/dtube.py @@ -9,6 +9,7 @@ from ..utils import ( class DTubeIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P[0-9a-z.-]+)/(?P[0-9a-z]{8})' _TEST = { 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1', diff --git a/yt_dlp/extractor/dumpert.py b/yt_dlp/extractor/dumpert.py index 0cf84263c..5e7aef0c5 100644 --- a/yt_dlp/extractor/dumpert.py +++ b/yt_dlp/extractor/dumpert.py @@ -8,9 +8,9 @@ from ..utils import ( class DumpertIE(InfoExtractor): _VALID_URL = r'''(?x) - (?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: - /(?:mediabase|embed|item)/| - (?:/toppers|/latest|/?)\?selectedId= + (?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?: + (?:mediabase|embed|item)/| + [^#]*[?&]selectedId= )(?P[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', @@ -56,6 +56,9 @@ class DumpertIE(InfoExtractor): }, { 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/toppers/dag?selectedId=100086074_f5cef3ac', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/duoplay.py b/yt_dlp/extractor/duoplay.py index 7d3f39942..ebce0b5f2 100644 --- a/yt_dlp/extractor/duoplay.py +++ b/yt_dlp/extractor/duoplay.py @@ -32,7 +32,7 @@ class DuoplayIE(InfoExtractor): 'season_number': 2, 'episode': 'Operatsioon "Öö"', 'episode_number': 12, - 'episode_id': 24, + 'episode_id': '24', }, }, { 'note': 'Empty title', @@ -50,7 +50,7 @@ class DuoplayIE(InfoExtractor): 'series_id': '17', 'season': 'Season 2', 'season_number': 2, - 'episode_id': 14, + 'episode_id': '14', 'release_year': 2010, }, }, { @@ -99,6 +99,6 @@ class DuoplayIE(InfoExtractor): 'season_number': ('season_id', {int_or_none}), 'episode': 'subtitle', 'episode_number': ('episode_nr', {int_or_none}), - 'episode_id': ('episode_id', {int_or_none}), + 'episode_id': ('episode_id', {str_or_none}), }, get_all=False) if episode_attr.get('category') != 'movies' else {}), } diff --git a/yt_dlp/extractor/dw.py b/yt_dlp/extractor/dw.py index 9c4a08e54..f7b852076 100644 --- a/yt_dlp/extractor/dw.py +++ b/yt_dlp/extractor/dw.py @@ -8,6 +8,8 @@ from ..compat import compat_urlparse class DWIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE IE_NAME = 'dw' _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P\d+)' _TESTS = [{ @@ -82,6 +84,8 @@ class DWIE(InfoExtractor): class DWArticleIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE IE_NAME = 'dw:article' _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P\d+)' _TEST = { diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py index 6383691a1..88a8d5a94 100644 --- a/yt_dlp/extractor/eplus.py +++ b/yt_dlp/extractor/eplus.py @@ -42,7 +42,6 @@ class EplusIbIE(InfoExtractor): 'live_status': 'was_live', 'release_date': '20210719', 'release_timestamp': 1626703200, - 'description': None, }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index f3da95f5c..191a4361a 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -13,6 +13,7 @@ from ..utils import ( class EuropaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' _TESTS = [{ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index 1b5db818a..cddf25497 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -10,6 +10,7 @@ from ..utils import ( class FancodeVodIE(InfoExtractor): + _WORKING = False IE_NAME = 'fancode:vod' _VALID_URL = r'https?://(?:www\.)?fancode\.com/video/(?P[0-9]+)\b' @@ -126,6 +127,7 @@ class FancodeVodIE(InfoExtractor): class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_NAME = 'fancode:live' _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P[0-9]+).+' diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py deleted file mode 100644 index 1e793560d..000000000 --- a/yt_dlp/extractor/filmmodu.py +++ /dev/null @@ -1,69 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none - - -class FilmmoduIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmmodu\.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' - _TESTS = [{ - 'url': 'https://www.filmmodu.org/f9-altyazili-izle', - 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', - 'info_dict': { - 'id': '10804', - 'ext': 'mp4', - 'title': 'F9', - 'description': 'md5:2713f584a4d65afa2611e2948d0b953c', - 'subtitles': { - 'tr': [{ - 'ext': 'vtt', - }], - }, - 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/10804/xXHZeb1yhJvnSHPzZDqee0zfMb6.jpg', - }, - }, { - 'url': 'https://www.filmmodu.org/the-godfather-turkce-dublaj-izle', - 'md5': '109f2fcb9c941330eed133971c035c00', - 'info_dict': { - 'id': '3646', - 'ext': 'mp4', - 'title': 'Baba', - 'description': 'md5:d43fd651937cd75cc650883ebd8d8461', - 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/3646/6xKCYgH16UuwEGAyroLU6p8HLIn.jpg', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage, fatal=True) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - real_video_id = self._search_regex(r'var\s*videoId\s*=\s*\'([0-9]+)\'', webpage, 'video_id') - video_type = self._search_regex(r'var\s*videoType\s*=\s*\'([a-z]+)\'', webpage, 'video_type') - data = self._download_json('https://www.filmmodu.org/get-source', real_video_id, query={ - 'movie_id': real_video_id, - 'type': video_type, - }) - formats = [{ - 'url': source['src'], - 'ext': 'mp4', - 'format_id': source['label'], - 'height': int_or_none(source.get('res')), - 'protocol': 'm3u8_native', - } for source in data['sources']] - - subtitles = {} - - if data.get('subtitle'): - subtitles['tr'] = [{ - 'url': data['subtitle'], - }] - - return { - 'id': real_video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 0ceecde74..7b8f7dd04 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -1,60 +1,49 @@ +import re +import urllib.parse + from .common import InfoExtractor from .dailymotion import DailymotionIE +from ..networking import HEADRequest from ..utils import ( - ExtractorError, determine_ext, + filter_dict, format_field, int_or_none, join_nonempty, parse_iso8601, - parse_qs, + smuggle_url, + unsmuggle_url, + url_or_none, ) +from ..utils.traversal import traverse_obj class FranceTVBaseInfoExtractor(InfoExtractor): - def _make_url_result(self, video_or_full_id, catalog=None): - full_id = 'francetv:%s' % video_or_full_id - if '@' not in video_or_full_id and catalog: - full_id += '@%s' % catalog - return self.url_result( - full_id, ie=FranceTVIE.ie_key(), - video_id=video_or_full_id.split('@')[0]) + def _make_url_result(self, video_id, url=None): + video_id = video_id.split('@')[0] # for compat with old @catalog IDs + full_id = f'francetv:{video_id}' + if url: + full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname}) + return self.url_result(full_id, FranceTVIE, video_id) class FranceTVIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?:// - sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? - .*?\bidDiffusion=[^&]+| - (?: - https?://videos\.francetv\.fr/video/| - francetv: - ) - (?P[^@]+)(?:@(?P.+))? - ) - ''' - _EMBED_REGEX = [r']+?src=(["\'])(?P(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] + _VALID_URL = r'francetv:(?P[^@#]+)' + _GEO_COUNTRIES = ['FR'] + _GEO_BYPASS = False _TESTS = [{ - # without catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', - 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + 'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1', 'info_dict': { - 'id': '162311093', + 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', 'timestamp': 1502623500, + 'duration': 2580, + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20170813', }, - }, { - # with catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', - 'only_matching': True, - }, { - 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'francetv:162311093', 'only_matching': True, @@ -76,10 +65,7 @@ class FranceTVIE(InfoExtractor): 'only_matching': True, }] - def _extract_video(self, video_id, catalogue=None): - # Videos are identified by idDiffusion so catalogue part is optional. - # However when provided, some extra formats may be returned so we pass - # it if available. + def _extract_video(self, video_id, hostname=None): is_live = None videos = [] title = None @@ -91,18 +77,20 @@ class FranceTVIE(InfoExtractor): timestamp = None spritesheets = None - for device_type in ('desktop', 'mobile'): + # desktop+chrome returns dash; mobile+safari returns hls + for device_type, browser in [('desktop', 'chrome'), ('mobile', 'safari')]: dinfo = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading %s video JSON' % device_type, query={ + f'https://k7.ftven.fr/videos/{video_id}', video_id, + f'Downloading {device_type} {browser} video JSON', query=filter_dict({ 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) + 'browser': browser, + 'domain': hostname, + }), fatal=False) if not dinfo: continue - video = dinfo.get('video') + video = traverse_obj(dinfo, ('video', {dict})) if video: videos.append(video) if duration is None: @@ -112,7 +100,7 @@ class FranceTVIE(InfoExtractor): if spritesheets is None: spritesheets = video.get('spritesheets') - meta = dinfo.get('meta') + meta = traverse_obj(dinfo, ('meta', {dict})) if meta: if title is None: title = meta.get('title') @@ -126,43 +114,46 @@ class FranceTVIE(InfoExtractor): if timestamp is None: timestamp = parse_iso8601(meta.get('broadcasted_at')) - formats = [] - subtitles = {} - for video in videos: + formats, subtitles, video_url = [], {}, None + for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])): + video_url = video['url'] format_id = video.get('format') - video_url = None - if video.get('workflow') == 'token-akamai': - token_url = video.get('token') - if token_url: - token_json = self._download_json( - token_url, video_id, - 'Downloading signed %s manifest URL' % format_id) - if token_json: - video_url = token_json.get('url') - if not video_url: - video_url = video.get('url') + if token_url := url_or_none(video.get('token')): + tokenized_url = traverse_obj(self._download_json( + token_url, video_id, f'Downloading signed {format_id} manifest URL', + fatal=False, query={ + 'format': 'json', + 'url': video_url, + }), ('url', {url_or_none})) + if tokenized_url: + video_url = tokenized_url ext = determine_ext(video_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) + video_url, video_id, f4m_id=format_id or ext, fatal=False)) elif ext == 'm3u8': + format_id = format_id or 'hls' fmts, subs = self._extract_m3u8_formats_and_subtitles( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False) + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None): + if mobj := re.match(rf'{format_id}-[Aa]udio-\w+-(?P\d+)', f['format_id']): + f.update({ + 'tbr': int_or_none(mobj.group('bitrate')), + 'acodec': 'mp4a', + }) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( - video_url, video_id, mpd_id=format_id, fatal=False) + video_url, video_id, mpd_id=format_id or 'dash', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, - 'format_id': 'rtmp-%s' % format_id, + 'format_id': join_nonempty('rtmp', format_id), 'ext': 'flv', }) else: @@ -174,6 +165,13 @@ class FranceTVIE(InfoExtractor): # XXX: what is video['captions']? + if not formats and video_url: + urlh = self._request_webpage( + HEADRequest(video_url), video_id, 'Checking for geo-restriction', + fatal=False, expected_status=403) + if urlh and urlh.headers.get('x-errortype') == 'geo': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + for f in formats: if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'): f['language_preference'] = -10 @@ -194,7 +192,7 @@ class FranceTVIE(InfoExtractor): # a 10×10 grid of thumbnails corresponding to approximately # 2 seconds of the video; the last spritesheet may be shorter 'duration': 200, - } for sheet in spritesheets] + } for sheet in traverse_obj(spritesheets, (..., {url_or_none}))] }) return { @@ -210,21 +208,15 @@ class FranceTVIE(InfoExtractor): 'series': title if episode_number else None, 'episode_number': int_or_none(episode_number), 'season_number': int_or_none(season_number), + '_format_sort_fields': ('res', 'tbr', 'proto'), # prioritize m3u8 over dash } def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - catalog = mobj.group('catalog') + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + hostname = smuggled_data.get('hostname') or 'www.france.tv' - if not video_id: - qs = parse_qs(url) - video_id = qs.get('idDiffusion', [None])[0] - catalog = qs.get('catalogue', [None])[0] - if not video_id: - raise ExtractorError('Invalid URL', expected=True) - - return self._extract_video(video_id, catalog) + return self._extract_video(video_id, hostname=hostname) class FranceTVSiteIE(FranceTVBaseInfoExtractor): @@ -246,6 +238,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): }, 'add_ie': [FranceTVIE.ie_key()], }, { + # geo-restricted 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', 'info_dict': { 'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44', @@ -261,6 +254,26 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1441, }, + }, { + # geo-restricted livestream (workflow == 'token-akamai') + 'url': 'https://www.france.tv/france-4/direct.html', + 'info_dict': { + 'id': '9a6a7670-dde9-4264-adbc-55b89558594b', + 'ext': 'mp4', + 'title': r're:France 4 en direct .+', + 'live_status': 'is_live', + }, + 'skip': 'geo-restricted livestream', + }, { + # livestream (workflow == 'dai') + 'url': 'https://www.france.tv/france-2/direct.html', + 'info_dict': { + 'id': '006194ea-117d-4bcf-94a9-153d999c59ae', + 'ext': 'mp4', + 'title': r're:France 2 en direct .+', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'livestream'}, }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', @@ -277,10 +290,6 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): # franceo 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', 'only_matching': True, - }, { - # france2 live - 'url': 'https://www.france.tv/france-2/direct.html', - 'only_matching': True, }, { 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', 'only_matching': True, @@ -304,17 +313,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, display_id) - catalogue = None video_id = self._search_regex( r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', default=None, group='id') if not video_id: - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') + video_id = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@"]+@[^"]+)"', + webpage, 'video ID') - return self._make_url_result(video_id, catalogue) + return self._make_url_result(video_id, url=url) class FranceTVInfoIE(FranceTVBaseInfoExtractor): @@ -328,8 +336,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20190822', - 'timestamp': 1566510900, - 'description': 'md5:72d167097237701d6e8452ff03b83c00', + 'timestamp': 1566510730, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'duration': 1637, 'subtitles': { 'fr': 'mincount:2', }, @@ -344,8 +353,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'info_dict': { 'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482', 'ext': 'mp4', - 'title': 'Covid-19 : une situation catastrophique à New Dehli', - 'thumbnail': str, + 'title': 'Covid-19 : une situation catastrophique à New Dehli - Édition du mercredi 21 avril 2021', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'duration': 76, 'timestamp': 1619028518, 'upload_date': '20210421', @@ -371,11 +380,17 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'id': 'x4iiko0', 'ext': 'mp4', 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', - 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'description': 'md5:fdcb582c370756293a65cdfbc6ecd90e', 'timestamp': 1467011958, - 'upload_date': '20160627', 'uploader': 'France Inter', 'uploader_id': 'x2q2ez', + 'upload_date': '20160627', + 'view_count': int, + 'tags': ['Politique', 'France Inter', '27 juin 2016', 'Linvité de 8h20', 'Cécile Duflot', 'Patrick Cohen'], + 'age_limit': 0, + 'duration': 640, + 'like_count': int, + 'thumbnail': r're:https://[^/?#]+/v/[^/?#]+/x1080', }, 'add_ie': ['Dailymotion'], }, { @@ -405,4 +420,4 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): r'(?:data-id|[^.?&#]+)' - _TESTS = [{ - # normal Brightcove embed code extracted with BrightcoveNewIE._extract_url - 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', - 'md5': '292f26da1ab4beb4c9099f1304d2b071', - 'info_dict': { - 'id': '4515472681001', - 'ext': 'mp4', - 'title': 'Replay - Animal Crossing', - 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', - 'timestamp': 1443457610, - 'upload_date': '20150928', - 'uploader_id': '694940074001', - }, - }, { - # Brightcove id inside unique element with field--name-field-brightcove-video-id class - 'url': 'https://www.gameinformer.com/video-feature/new-gameplay-today/2019/07/09/new-gameplay-today-streets-of-rogue', - 'info_dict': { - 'id': '6057111913001', - 'ext': 'mp4', - 'title': 'New Gameplay Today – Streets Of Rogue', - 'timestamp': 1562699001, - 'upload_date': '20190709', - 'uploader_id': '694940074001', - - }, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage( - url, display_id, headers=self.geo_verification_headers()) - brightcove_id = clean_html(get_element_by_class('field--name-field-brightcove-video-id', webpage) or get_element_by_id('video-source-content', webpage)) - brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id if brightcove_id else BrightcoveNewIE._extract_url(self, webpage) - return self.url_result(brightcove_url, 'BrightcoveNew', brightcove_id) diff --git a/yt_dlp/extractor/gamejolt.py b/yt_dlp/extractor/gamejolt.py index 8ec046bb3..4d57391ac 100644 --- a/yt_dlp/extractor/gamejolt.py +++ b/yt_dlp/extractor/gamejolt.py @@ -267,9 +267,9 @@ class GameJoltIE(GameJoltBaseIE): 'id': 'dszyjnwi', 'ext': 'webm', 'title': 'gif-presentacion-mejorado-dszyjnwi', - 'n_entries': 1, } - }] + }], + 'playlist_count': 1, }, { # Multiple GIFs 'url': 'https://gamejolt.com/p/gif-yhsqkumq', @@ -374,7 +374,6 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'info_dict': { 'id': '657899', 'title': 'Friday Night Funkin\': Vs Oswald', - 'n_entries': None, }, 'playlist': [{ 'info_dict': { @@ -384,7 +383,6 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+vs-oswald-menu-music\.mp3$', 'release_timestamp': 1635190816, 'release_date': '20211025', - 'n_entries': 3, } }, { 'info_dict': { @@ -394,7 +392,6 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$', 'release_timestamp': 1635190841, 'release_date': '20211025', - 'n_entries': 3, } }, { 'info_dict': { @@ -404,9 +401,9 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+last-straw\.mp3$', 'release_timestamp': 1635881104, 'release_date': '20211102', - 'n_entries': 3, } - }] + }], + 'playlist_count': 3, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/gaskrank.py b/yt_dlp/extractor/gaskrank.py index e0bbdae0a..bc56b03e3 100644 --- a/yt_dlp/extractor/gaskrank.py +++ b/yt_dlp/extractor/gaskrank.py @@ -21,7 +21,6 @@ class GaskrankIE(InfoExtractor): 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', 'uploader_id': 'Bikefun', 'upload_date': '20170110', - 'uploader_url': None, } }, { 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', diff --git a/yt_dlp/extractor/gazeta.py b/yt_dlp/extractor/gazeta.py index c6868a672..8925b69fd 100644 --- a/yt_dlp/extractor/gazeta.py +++ b/yt_dlp/extractor/gazeta.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class GazetaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?Phttps?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py index 4265feb61..b4d81b2e8 100644 --- a/yt_dlp/extractor/gdcvault.py +++ b/yt_dlp/extractor/gdcvault.py @@ -7,6 +7,7 @@ from ..utils import remove_start, smuggle_url, urlencode_postdata class GDCVaultIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P\d+)(?:/(?P[\w-]+))?' _NETRC_MACHINE = 'gdcvault' _TESTS = [ diff --git a/yt_dlp/extractor/giga.py b/yt_dlp/extractor/giga.py deleted file mode 100644 index b59c129ab..000000000 --- a/yt_dlp/extractor/giga.py +++ /dev/null @@ -1,93 +0,0 @@ -import itertools - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import parse_duration, parse_iso8601, qualities, str_to_int - - -class GigaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P[^/]+)' - _TESTS = [{ - 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/', - 'md5': '6bc5535e945e724640664632055a584f', - 'info_dict': { - 'id': '2622086', - 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss', - 'ext': 'mp4', - 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', - 'description': 'md5:afdf5862241aded4718a30dff6a57baf', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 578, - 'timestamp': 1414749706, - 'upload_date': '20141031', - 'uploader': 'Robin Schweiger', - 'view_count': int, - }, - }, { - 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/', - 'only_matching': True, - }, { - 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/', - 'only_matching': True, - }, { - 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'], - webpage, 'video id') - - playlist = self._download_json( - 'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' - % video_id, video_id)[0] - - quality = qualities(['normal', 'hd720']) - - formats = [] - for format_id in itertools.count(0): - fmt = playlist.get(compat_str(format_id)) - if not fmt: - break - formats.append({ - 'url': fmt['src'], - 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), - 'quality': quality(fmt['quality']), - }) - - title = self._html_search_meta( - 'title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - - duration = parse_duration(self._search_regex( - r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?([^<]+)'.format(video_id), - webpage, 'duration', fatal=False)) - - timestamp = parse_iso8601(self._search_regex( - r'datetime="([^"]+)"', webpage, 'upload date', fatal=False)) - uploader = self._search_regex( - r'class="author">([^<]+)', webpage, 'uploader', fatal=False) - - view_count = str_to_int(self._search_regex( - r'([\d.,]+)', - webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'view_count': view_count, - 'formats': formats, - } diff --git a/yt_dlp/extractor/godtube.py b/yt_dlp/extractor/godtube.py index 697540155..35fb7a9c9 100644 --- a/yt_dlp/extractor/godtube.py +++ b/yt_dlp/extractor/godtube.py @@ -6,6 +6,7 @@ from ..utils import ( class GodTubeIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P[\da-zA-Z]+)' _TESTS = [ { diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 0a3c8340f..74aad1192 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -40,6 +40,22 @@ class GoPlayIE(InfoExtractor): 'title': 'A Family for the Holidays', }, 'skip': 'This video is only available for registered users' + }, { + 'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay', + 'info_dict': { + 'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656', + 'ext': 'mp4', + 'title': 'S11 - Aflevering 1', + 'episode': 'Episode 1', + 'series': 'De Mol', + 'season_number': 11, + 'episode_number': 1, + 'season': 'Season 11' + }, + 'params': { + 'skip_download': True + }, + 'skip': 'This video is only available for registered users' }] _id_token = None @@ -77,16 +93,39 @@ class GoPlayIE(InfoExtractor): api = self._download_json( f'https://api.goplay.be/web/v1/videos/long-form/{video_id}', - video_id, headers={'Authorization': 'Bearer %s' % self._id_token}) + video_id, headers={ + 'Authorization': 'Bearer %s' % self._id_token, + **self.geo_verification_headers(), + }) + + if 'manifestUrls' in api: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS') - formats, subs = self._extract_m3u8_formats_and_subtitles( - api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS') + else: + if 'ssai' not in api: + raise ExtractorError('expecting Google SSAI stream') + + ssai_content_source_id = api['ssai']['contentSourceID'] + ssai_video_id = api['ssai']['videoID'] + + dai = self._download_json( + f'https://dai.google.com/ondemand/dash/content/{ssai_content_source_id}/vid/{ssai_video_id}/streams', + video_id, data=b'{"api-key":"null"}', + headers={'content-type': 'application/json'}) + + periods = self._extract_mpd_periods(dai['stream_manifest'], video_id) + + # skip pre-roll and mid-roll ads + periods = [p for p in periods if '-ad-' not in p['id']] + + formats, subtitles = self._merge_mpd_periods(periods) info_dict.update({ 'id': video_id, 'formats': formats, + 'subtitles': subtitles, }) - return info_dict diff --git a/yt_dlp/extractor/hotnewhiphop.py b/yt_dlp/extractor/hotnewhiphop.py index 3007fbb53..4f506cde7 100644 --- a/yt_dlp/extractor/hotnewhiphop.py +++ b/yt_dlp/extractor/hotnewhiphop.py @@ -5,6 +5,7 @@ from ..utils import ExtractorError, urlencode_postdata class HotNewHipHopIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?hotnewhiphop\.com/.*\.(?P.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 541792b90..a3a3c20c9 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -115,11 +115,11 @@ class HotStarIE(HotStarBaseIE): 'upload_date': '20190501', 'duration': 1219, 'channel': 'StarPlus', - 'channel_id': 3, + 'channel_id': '3', 'series': 'Ek Bhram - Sarvagun Sampanna', 'season': 'Chapter 1', 'season_number': 1, - 'season_id': 6771, + 'season_id': '6771', 'episode': 'Janhvi Targets Suman', 'episode_number': 8, } @@ -135,12 +135,12 @@ class HotStarIE(HotStarBaseIE): 'channel': 'StarPlus', 'series': 'Anupama', 'season_number': 1, - 'season_id': 7399, + 'season_id': '7399', 'upload_date': '20230307', 'episode': 'Anupama, Anuj Share a Moment', 'episode_number': 853, 'duration': 1272, - 'channel_id': 3, + 'channel_id': '3', }, 'skip': 'HTTP Error 504: Gateway Time-out', # XXX: Investigate 504 errors on some episodes }, { @@ -155,12 +155,12 @@ class HotStarIE(HotStarBaseIE): 'channel': 'Hotstar Specials', 'series': 'Kana Kaanum Kaalangal', 'season_number': 1, - 'season_id': 9441, + 'season_id': '9441', 'upload_date': '20220421', 'episode': 'Back To School', 'episode_number': 1, 'duration': 1810, - 'channel_id': 54, + 'channel_id': '54', }, }, { 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286', @@ -325,11 +325,11 @@ class HotStarIE(HotStarBaseIE): 'formats': formats, 'subtitles': subs, 'channel': video_data.get('channelName'), - 'channel_id': video_data.get('channelId'), + 'channel_id': str_or_none(video_data.get('channelId')), 'series': video_data.get('showName'), 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': video_data.get('seasonId'), + 'season_id': str_or_none(video_data.get('seasonId')), 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), } diff --git a/yt_dlp/extractor/hungama.py b/yt_dlp/extractor/hungama.py index cdec36838..7da8aad7a 100644 --- a/yt_dlp/extractor/hungama.py +++ b/yt_dlp/extractor/hungama.py @@ -114,7 +114,6 @@ class HungamaSongIE(InfoExtractor): 'title': 'Lucky Ali - Kitni Haseen Zindagi', 'track': 'Kitni Haseen Zindagi', 'artist': 'Lucky Ali', - 'album': None, 'release_year': 2000, 'thumbnail': 'https://stat2.hungama.ind.in/assets/images/default_images/da-200x200.png', }, diff --git a/yt_dlp/extractor/hypergryph.py b/yt_dlp/extractor/hypergryph.py index 9ca6caebc..96e452a51 100644 --- a/yt_dlp/extractor/hypergryph.py +++ b/yt_dlp/extractor/hypergryph.py @@ -9,7 +9,7 @@ class MonsterSirenHypergryphMusicIE(InfoExtractor): 'info_dict': { 'id': '514562', 'ext': 'wav', - 'artist': ['塞壬唱片-MSR'], + 'artists': ['塞壬唱片-MSR'], 'album': 'Flame Shadow', 'title': 'Flame Shadow', } @@ -27,6 +27,6 @@ class MonsterSirenHypergryphMusicIE(InfoExtractor): 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')), 'ext': 'wav', 'vcodec': 'none', - 'artist': traverse_obj(json_data, ('player', 'songDetail', 'artists')), + 'artists': traverse_obj(json_data, ('player', 'songDetail', 'artists', ...)), 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')) } diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index dbaa332c2..f7f21505e 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -617,6 +617,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE): class InstagramUserIE(InstagramPlaylistBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' diff --git a/yt_dlp/extractor/jeuxvideo.py b/yt_dlp/extractor/jeuxvideo.py index 56ea15cf9..793820600 100644 --- a/yt_dlp/extractor/jeuxvideo.py +++ b/yt_dlp/extractor/jeuxvideo.py @@ -2,6 +2,8 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ diff --git a/yt_dlp/extractor/kanal2.py b/yt_dlp/extractor/kanal2.py deleted file mode 100644 index 3c0efe598..000000000 --- a/yt_dlp/extractor/kanal2.py +++ /dev/null @@ -1,66 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - join_nonempty, - traverse_obj, - unified_timestamp, - update_url_query, -) - - -class Kanal2IE(InfoExtractor): - _VALID_URL = r'https?://kanal2\.postimees\.ee/[^?#]+\?([^#]+&)?id=(?P\d+)' - _TESTS = [{ - 'note': 'Test standard url (#5575)', - 'url': 'https://kanal2.postimees.ee/pluss/video/?id=40792', - 'md5': '7ea7b16266ec1798743777df241883dd', - 'info_dict': { - 'id': '40792', - 'ext': 'mp4', - 'title': 'Aedniku aabits / Osa 53 (05.08.2016 20:00)', - 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:53cabf3c5d73150d594747f727431248', - 'upload_date': '20160805', - 'timestamp': 1470420000, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - playlist = self._download_json( - f'https://kanal2.postimees.ee/player/playlist/{video_id}', - video_id, query={'type': 'episodes'}, - headers={'X-Requested-With': 'XMLHttpRequest'}) - - return { - 'id': video_id, - 'title': join_nonempty(*traverse_obj(playlist, ('info', ('title', 'subtitle'))), delim=' / '), - 'description': traverse_obj(playlist, ('info', 'description')), - 'thumbnail': traverse_obj(playlist, ('data', 'image')), - 'formats': self.get_formats(playlist, video_id), - 'timestamp': unified_timestamp(self._search_regex( - r'\((\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2})\)$', - traverse_obj(playlist, ('info', 'subtitle')), 'timestamp', default='') + ' +0200'), - } - - def get_formats(self, playlist, video_id): - path = traverse_obj(playlist, ('data', 'path')) - if not path: - raise ExtractorError('Path value not found in playlist JSON response') - session = self._download_json( - 'https://sts.postimees.ee/session/register', - video_id, note='Creating session', errnote='Error creating session', - headers={ - 'X-Original-URI': path, - 'Accept': 'application/json', - }) - if session.get('reason') != 'OK' or not session.get('session'): - reason = session.get('reason', 'unknown error') - raise ExtractorError(f'Unable to obtain session: {reason}') - - formats = [] - for stream in traverse_obj(playlist, ('data', 'streams', ..., 'file')): - formats.extend(self._extract_m3u8_formats( - update_url_query(stream, {'s': session['session']}), video_id, 'mp4')) - - return formats diff --git a/yt_dlp/extractor/kankanews.py b/yt_dlp/extractor/kankanews.py index 46e239bd6..8f247b305 100644 --- a/yt_dlp/extractor/kankanews.py +++ b/yt_dlp/extractor/kankanews.py @@ -8,6 +8,7 @@ from .common import InfoExtractor class KankaNewsIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P\d+)\.shtml' _TESTS = [{ 'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227', diff --git a/yt_dlp/extractor/karrierevideos.py b/yt_dlp/extractor/karrierevideos.py deleted file mode 100644 index 28d4841aa..000000000 --- a/yt_dlp/extractor/karrierevideos.py +++ /dev/null @@ -1,96 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - fix_xml_ampersands, - float_or_none, - xpath_with_ns, - xpath_text, -) - - -class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P[^/]+)' - _TESTS = [{ - 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', - 'info_dict': { - 'id': '32c91', - 'ext': 'flv', - 'title': 'AltenpflegerIn', - 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', - 'thumbnail': r're:^http://.*\.png', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # broken ampersands - 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', - 'info_dict': { - 'id': '5sniu', - 'ext': 'flv', - 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', - 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', - 'thumbnail': r're:^http://.*\.png', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = (self._html_search_meta('title', webpage, default=None) - or self._search_regex(r'

([^<]+)

', webpage, 'video title')) - - video_id = self._search_regex( - r'/config/video/(.+?)\.xml', webpage, 'video id') - # Server returns malformed headers - # Force Accept-Encoding: * to prevent gzipped results - playlist = self._download_xml( - 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, - video_id, transform_source=fix_xml_ampersands, - headers={'Accept-Encoding': '*'}) - - NS_MAP = { - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' - } - - def ns(path): - return xpath_with_ns(path, NS_MAP) - - item = playlist.find('./tracklist/item') - video_file = xpath_text( - item, ns('./jwplayer:file'), 'video url', fatal=True) - streamer = xpath_text( - item, ns('./jwplayer:streamer'), 'streamer', fatal=True) - - uploader = xpath_text( - item, ns('./jwplayer:author'), 'uploader') - duration = float_or_none( - xpath_text(item, ns('./jwplayer:duration'), 'duration')) - - description = self._html_search_regex( - r'(?s)
(.+?)
', - webpage, 'description') - - thumbnail = self._html_search_meta( - 'thumbnail', webpage, 'thumbnail') - if thumbnail: - thumbnail = compat_urlparse.urljoin(url, thumbnail) - - return { - 'id': video_id, - 'url': streamer.replace('rtmpt', 'rtmp'), - 'play_path': 'mp4:%s' % video_file, - 'ext': 'flv', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - } diff --git a/yt_dlp/extractor/kelbyone.py b/yt_dlp/extractor/kelbyone.py index 2ca9ad426..bba527e29 100644 --- a/yt_dlp/extractor/kelbyone.py +++ b/yt_dlp/extractor/kelbyone.py @@ -3,6 +3,7 @@ from ..utils import int_or_none class KelbyOneIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://members\.kelbyone\.com/course/(?P[^$&?#/]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/konserthusetplay.py b/yt_dlp/extractor/konserthusetplay.py deleted file mode 100644 index 10767f1b6..000000000 --- a/yt_dlp/extractor/konserthusetplay.py +++ /dev/null @@ -1,119 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - url_or_none, -) - - -class KonserthusetPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:konserthusetplay|rspoplay)\.se/\?.*\bm=(?P[^&]+)' - _TESTS = [{ - 'url': 'http://www.konserthusetplay.se/?m=CKDDnlCY-dhWAAqiMERd-A', - 'md5': 'e3fd47bf44e864bd23c08e487abe1967', - 'info_dict': { - 'id': 'CKDDnlCY-dhWAAqiMERd-A', - 'ext': 'mp4', - 'title': 'Orkesterns instrument: Valthornen', - 'description': 'md5:f10e1f0030202020396a4d712d2fa827', - 'thumbnail': 're:^https?://.*$', - 'duration': 398.76, - }, - }, { - 'url': 'http://rspoplay.se/?m=elWuEH34SMKvaO4wO_cHBw', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - e = self._search_regex( - r'https?://csp\.picsearch\.com/rest\?.*\be=(.+?)[&"\']', webpage, 'e') - - rest = self._download_json( - 'http://csp.picsearch.com/rest?e=%s&containerId=mediaplayer&i=object' % e, - video_id, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) - - media = rest['media'] - player_config = media['playerconfig'] - playlist = player_config['playlist'] - - source = next(f for f in playlist if f.get('bitrates') or f.get('provider')) - - FORMAT_ID_REGEX = r'_([^_]+)_h264m\.mp4' - - formats = [] - - m3u8_url = source.get('url') - if m3u8_url and determine_ext(m3u8_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - - fallback_url = source.get('fallbackUrl') - fallback_format_id = None - if fallback_url: - fallback_format_id = self._search_regex( - FORMAT_ID_REGEX, fallback_url, 'format id', default=None) - - connection_url = (player_config.get('rtmp', {}).get( - 'netConnectionUrl') or player_config.get( - 'plugins', {}).get('bwcheck', {}).get('netConnectionUrl')) - if connection_url: - for f in source['bitrates']: - video_url = f.get('url') - if not video_url: - continue - format_id = self._search_regex( - FORMAT_ID_REGEX, video_url, 'format id', default=None) - f_common = { - 'vbr': int_or_none(f.get('bitrate')), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - } - f = f_common.copy() - f.update({ - 'url': connection_url, - 'play_path': video_url, - 'format_id': 'rtmp-%s' % format_id if format_id else 'rtmp', - 'ext': 'flv', - }) - formats.append(f) - if format_id and format_id == fallback_format_id: - f = f_common.copy() - f.update({ - 'url': fallback_url, - 'format_id': 'http-%s' % format_id if format_id else 'http', - }) - formats.append(f) - - if not formats and fallback_url: - formats.append({ - 'url': fallback_url, - }) - - title = player_config.get('title') or media['title'] - description = player_config.get('mediaInfo', {}).get('description') - thumbnail = media.get('image') - duration = float_or_none(media.get('duration'), 1000) - - subtitles = {} - captions = source.get('captionsAvailableLanguages') - if isinstance(captions, dict): - for lang, subtitle_url in captions.items(): - subtitle_url = url_or_none(subtitle_url) - if lang != 'none' and subtitle_url: - subtitles.setdefault(lang, []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/yt_dlp/extractor/koo.py b/yt_dlp/extractor/koo.py index 9cfec5eb9..c78a7b9ca 100644 --- a/yt_dlp/extractor/koo.py +++ b/yt_dlp/extractor/koo.py @@ -6,6 +6,7 @@ from ..utils import ( class KooIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?kooapp\.com/koo/[^/]+/(?P[^/&#$?]+)' _TESTS = [{ # Test for video in the comments 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde', diff --git a/yt_dlp/extractor/krasview.py b/yt_dlp/extractor/krasview.py index 4323aa429..0febf759b 100644 --- a/yt_dlp/extractor/krasview.py +++ b/yt_dlp/extractor/krasview.py @@ -8,6 +8,7 @@ from ..utils import ( class KrasViewIE(InfoExtractor): + _WORKING = False IE_DESC = 'Красвью' _VALID_URL = r'https?://krasview\.ru/(?:video|embed)/(?P\d+)' diff --git a/yt_dlp/extractor/kusi.py b/yt_dlp/extractor/kusi.py deleted file mode 100644 index a23ad8945..000000000 --- a/yt_dlp/extractor/kusi.py +++ /dev/null @@ -1,83 +0,0 @@ -import random -import urllib.parse - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - timeconvert, - update_url_query, - xpath_text, -) - - -class KUSIIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?Pstory/.+|video\?clipId=(?P\d+))' - _TESTS = [{ - 'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', - 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', - 'info_dict': { - 'id': '12689020', - 'ext': 'mp4', - 'title': "Turko Files: Refused to Help, It Ain't Right!", - 'duration': 223.586, - 'upload_date': '20160826', - 'timestamp': 1472233118, - 'thumbnail': r're:^https?://.*\.jpg$' - }, - }, { - 'url': 'http://kusi.com/video?clipId=12203019', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - clip_id = mobj.group('clipId') - video_id = clip_id or mobj.group('path') - - webpage = self._download_webpage(url, video_id) - - if clip_id is None: - video_id = clip_id = self._html_search_regex( - r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') - - affiliate_id = self._search_regex( - r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') - - # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf - xml_url = update_url_query('http://www.kusi.com/build.asp', { - 'buildtype': 'buildfeaturexmlrequest', - 'featureType': 'Clip', - 'featureid': clip_id, - 'affiliateno': affiliate_id, - 'clientgroupid': '1', - 'rnd': int(round(random.random() * 1000000)), - }) - - doc = self._download_xml(xml_url, video_id) - - video_title = xpath_text(doc, 'HEADLINE', fatal=True) - duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) - description = xpath_text(doc, 'ABSTRACT') - thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) - - quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') - formats = [] - for quality in quality_options: - formats.append({ - 'url': urllib.parse.unquote_plus(quality.attrib['url']), - 'height': int_or_none(quality.attrib.get('height')), - 'width': int_or_none(quality.attrib.get('width')), - 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), - }) - - return { - 'id': video_id, - 'title': video_title, - 'description': description, - 'duration': duration, - 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': creation_time, - } diff --git a/yt_dlp/extractor/kuwo.py b/yt_dlp/extractor/kuwo.py index e8a061a10..3c93dedac 100644 --- a/yt_dlp/extractor/kuwo.py +++ b/yt_dlp/extractor/kuwo.py @@ -54,6 +54,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): + _WORKING = False IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P\d+)' @@ -133,6 +134,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P\d+?)/' @@ -169,6 +171,7 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' @@ -194,6 +197,7 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P[^/]+)' @@ -251,6 +255,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' @@ -290,6 +295,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): + _WORKING = False IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P\d+?)/' diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index cc37c41e8..dcb44d07f 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -231,7 +231,6 @@ class LBRYIE(LBRYBaseIE): 'release_timestamp': int, 'release_date': str, 'tags': list, - 'duration': None, 'channel': 'RT', 'channel_id': 'fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', 'channel_url': 'https://odysee.com/@RT:fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', diff --git a/yt_dlp/extractor/lecture2go.py b/yt_dlp/extractor/lecture2go.py index 3a9b30a3c..10fb5d479 100644 --- a/yt_dlp/extractor/lecture2go.py +++ b/yt_dlp/extractor/lecture2go.py @@ -10,6 +10,7 @@ from ..utils import ( class Lecture2GoIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', diff --git a/yt_dlp/extractor/lenta.py b/yt_dlp/extractor/lenta.py index 10aac984e..fe01bda1c 100644 --- a/yt_dlp/extractor/lenta.py +++ b/yt_dlp/extractor/lenta.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class LentaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', diff --git a/yt_dlp/extractor/likee.py b/yt_dlp/extractor/likee.py index 74ee2bea9..324463136 100644 --- a/yt_dlp/extractor/likee.py +++ b/yt_dlp/extractor/likee.py @@ -22,8 +22,6 @@ class LikeeIE(InfoExtractor): 'description': 'md5:9a7ebe816f0e78722ee5ed76f75983b4', 'thumbnail': r're:^https?://.+\.jpg', 'uploader': 'Huỳnh Hồng Quân ', - 'play_count': int, - 'download_count': int, 'artist': 'Huỳnh Hồng Quân ', 'timestamp': 1651571320, 'upload_date': '20220503', @@ -44,11 +42,9 @@ class LikeeIE(InfoExtractor): 'comment_count': int, 'like_count': int, 'uploader': 'Vương Phước Nhi', - 'download_count': int, 'timestamp': 1651506835, 'upload_date': '20220502', 'duration': 60024, - 'play_count': int, 'artist': 'Vương Phước Nhi', 'uploader_id': '649222262', 'view_count': int, @@ -65,9 +61,7 @@ class LikeeIE(InfoExtractor): 'duration': 9684, 'uploader_id': 'fernanda_rivasg', 'view_count': int, - 'play_count': int, 'artist': 'La Cami La✨', - 'download_count': int, 'like_count': int, 'uploader': 'Fernanda Rivas🎶', 'timestamp': 1614034308, @@ -83,13 +77,11 @@ class LikeeIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'comment_count': int, 'duration': 18014, - 'play_count': int, 'view_count': int, 'timestamp': 1611694774, 'like_count': int, 'uploader': 'Fernanda Rivas🎶', 'uploader_id': 'fernanda_rivasg', - 'download_count': int, 'artist': 'ʟᴇʀɪᴋ_ᴜɴɪᴄᴏʀɴ♡︎', 'upload_date': '20210126', }, @@ -128,8 +120,6 @@ class LikeeIE(InfoExtractor): 'description': info.get('share_desc'), 'view_count': int_or_none(info.get('video_count')), 'like_count': int_or_none(info.get('likeCount')), - 'play_count': int_or_none(info.get('play_count')), - 'download_count': int_or_none(info.get('download_count')), 'comment_count': int_or_none(info.get('comment_count')), 'uploader': str_or_none(info.get('nick_name')), 'uploader_id': str_or_none(info.get('likeeId')), diff --git a/yt_dlp/extractor/localnews8.py b/yt_dlp/extractor/localnews8.py deleted file mode 100644 index 6f3f02c70..000000000 --- a/yt_dlp/extractor/localnews8.py +++ /dev/null @@ -1,42 +0,0 @@ -from .common import InfoExtractor - - -class LocalNews8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P[^/]+)/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304', - 'md5': 'be4d48aea61aa2bde7be2ee47691ad20', - 'info_dict': { - 'id': '35183304', - 'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings', - 'ext': 'mp4', - 'title': 'Rexburg business turns carbon fiber scraps into wedding ring', - 'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.', - 'duration': 153, - 'timestamp': 1441844822, - 'upload_date': '20150910', - 'uploader_id': 'api', - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - partner_id = self._search_regex( - r'partnerId\s*[:=]\s*(["\'])(?P\d+)\1', - webpage, 'partner id', group='id') - kaltura_id = self._search_regex( - r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P[0-9a-z_]+)\1', - webpage, 'videl id', group='id') - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': 'Kaltura', - 'id': video_id, - 'display_id': display_id, - } diff --git a/yt_dlp/extractor/lumni.py b/yt_dlp/extractor/lumni.py index 5810da0c8..5a9538336 100644 --- a/yt_dlp/extractor/lumni.py +++ b/yt_dlp/extractor/lumni.py @@ -1,8 +1,7 @@ -from .common import InfoExtractor -from .francetv import FranceTVIE +from .francetv import FranceTVBaseInfoExtractor -class LumniIE(InfoExtractor): +class LumniIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P[\w-]+)' _TESTS = [{ 'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle', @@ -21,4 +20,4 @@ class LumniIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_id = self._html_search_regex( r']+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id') - return self.url_result(f'francetv:{video_id}', FranceTVIE, video_id) + return self._make_url_result(video_id, url=url) diff --git a/yt_dlp/extractor/malltv.py b/yt_dlp/extractor/malltv.py deleted file mode 100644 index e1031d8da..000000000 --- a/yt_dlp/extractor/malltv.py +++ /dev/null @@ -1,107 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - dict_get, - float_or_none, - int_or_none, - merge_dicts, - parse_duration, - try_get, -) - - -class MallTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', - 'md5': 'cd69ce29176f6533b65bff69ed9a5f2a', - 'info_dict': { - 'id': 't0zzt0', - 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', - 'ext': 'mp4', - 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', - 'duration': 216, - 'timestamp': 1538870400, - 'upload_date': '20181007', - 'view_count': int, - 'comment_count': int, - 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnigfq/thumbnails/retina.jpg', - 'average_rating': 9.060869565217391, - 'dislike_count': int, - 'like_count': int, - } - }, { - 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', - 'only_matching': True, - }, { - 'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka', - 'only_matching': True, - }, { - 'url': 'https://www.mall.tv/zivoty-slavnych/nadeje-vychodu-i-zapadu-jak-michail-gorbacov-zmenil-politickou-mapu-sveta-a-ziskal-za-to-nobelovu-cenu-miru', - 'info_dict': { - 'id': 'yx010y', - 'ext': 'mp4', - 'dislike_count': int, - 'description': 'md5:aee02bee5a8d072c6a8207b91d1905a9', - 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnjdeu/thumbnails/retina.jpg', - 'comment_count': int, - 'display_id': 'md5:0ec2afa94d2e2b7091c019cef2a43a9b', - 'like_count': int, - 'duration': 752, - 'timestamp': 1646956800, - 'title': 'md5:fe79385daaf16d74c12c1ec4a26687af', - 'view_count': int, - 'upload_date': '20220311', - 'average_rating': 9.685714285714285, - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage( - url, display_id, headers=self.geo_verification_headers()) - - video = self._parse_json(self._search_regex( - r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', - webpage, 'video object'), display_id) - - video_id = self._search_regex( - r']+value\s*=\s*(\w+)', webpage, 'video id') - - formats = self._extract_m3u8_formats( - video['VideoSource'], video_id, 'mp4', 'm3u8_native') - - subtitles = {} - for s in (video.get('Subtitles') or {}): - s_url = s.get('Url') - if not s_url: - continue - subtitles.setdefault(s.get('Language') or 'cz', []).append({ - 'url': s_url, - }) - - entity_counts = video.get('EntityCounts') or {} - - def get_count(k): - v = entity_counts.get(k + 's') or {} - return int_or_none(dict_get(v, ('Count', 'StrCount'))) - - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ - 'id': str(video_id), - 'display_id': display_id, - 'title': video.get('Title'), - 'description': clean_html(video.get('Description')), - 'thumbnail': video.get('ThumbnailUrl'), - 'formats': formats, - 'subtitles': subtitles, - 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), - 'view_count': get_count('View'), - 'like_count': get_count('Like'), - 'dislike_count': get_count('Dislike'), - 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), - 'comment_count': get_count('Comment'), - }, info) diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 741745378..2aa3a3c93 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -12,6 +12,7 @@ from ..utils import ( class ManyVidsIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' _TESTS = [{ # preview video diff --git a/yt_dlp/extractor/markiza.py b/yt_dlp/extractor/markiza.py index 53ed79158..ca465eae9 100644 --- a/yt_dlp/extractor/markiza.py +++ b/yt_dlp/extractor/markiza.py @@ -10,6 +10,7 @@ from ..utils import ( class MarkizaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P\d+)(?:[_/]|$)' _TESTS = [{ 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', @@ -68,6 +69,7 @@ class MarkizaIE(InfoExtractor): class MarkizaPageIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P\d+)_' _TESTS = [{ 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', diff --git a/yt_dlp/extractor/megaphone.py b/yt_dlp/extractor/megaphone.py index af80523e3..eb790e691 100644 --- a/yt_dlp/extractor/megaphone.py +++ b/yt_dlp/extractor/megaphone.py @@ -8,15 +8,15 @@ class MegaphoneIE(InfoExtractor): _VALID_URL = r'https://player\.megaphone\.fm/(?P[A-Z0-9]+)' _EMBED_REGEX = [rf']*?\ssrc=["\'](?P{_VALID_URL})'] _TEST = { - 'url': 'https://player.megaphone.fm/GLT9749789991?"', + 'url': 'https://player.megaphone.fm/GLT9749789991', 'md5': '4816a0de523eb3e972dc0dda2c191f96', 'info_dict': { 'id': 'GLT9749789991', 'ext': 'mp3', 'title': '#97 What Kind Of Idiot Gets Phished?', 'thumbnail': r're:^https://.*\.png.*$', - 'duration': 1776.26375, - 'author': 'Reply All', + 'duration': 1998.36, + 'creators': ['Reply All'], }, } @@ -40,7 +40,7 @@ class MegaphoneIE(InfoExtractor): 'id': video_id, 'thumbnail': thumbnail, 'title': title, - 'author': author, + 'creators': [author] if author else None, 'duration': episode_data['duration'], 'formats': formats, } diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py deleted file mode 100644 index 329ce3658..000000000 --- a/yt_dlp/extractor/miaopai.py +++ /dev/null @@ -1,36 +0,0 @@ -from .common import InfoExtractor - - -class MiaoPaiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+)' - _TEST = { - 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', - 'md5': '095ed3f1cd96b821add957bdc29f845b', - 'info_dict': { - 'id': 'n~0hO7sfV1nBEw4Y29-Hqg__', - 'ext': 'mp4', - 'title': '西游记音乐会的秒拍视频', - 'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg', - } - } - - _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - - title = self._html_extract_title(webpage) - thumbnail = self._html_search_regex( - r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', - webpage, 'thumbnail', fatal=False, group='url') - videos = self._parse_html5_media_entries(url, webpage, video_id) - info = videos[0] - - info.update({ - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - }) - return info diff --git a/yt_dlp/extractor/ministrygrid.py b/yt_dlp/extractor/ministrygrid.py deleted file mode 100644 index 053c6726c..000000000 --- a/yt_dlp/extractor/ministrygrid.py +++ /dev/null @@ -1,55 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - smuggle_url, -) - - -class MinistryGridIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ministrygrid\.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' - - _TEST = { - 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', - 'md5': '844be0d2a1340422759c2a9101bab017', - 'info_dict': { - 'id': '3453494717001', - 'ext': 'mp4', - 'title': 'The Gospel by Numbers', - 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20140410', - 'description': 'Coming soon from T4G 2014!', - 'uploader_id': '2034960640001', - 'timestamp': 1397145591, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['TDSLifeway'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - portlets = self._parse_json(self._search_regex( - r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'), - video_id) - pl_id = self._search_regex( - r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id') - - for i, portlet in enumerate(portlets): - portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) - portlet_code = self._download_webpage( - portlet_url, video_id, - note='Looking in portlet %s (%d/%d)' % (portlet, i + 1, len(portlets)), - fatal=False) - video_iframe_url = self._search_regex( - r'[0-9]+)' - _TESTS = [{ - 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', - 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', - 'info_dict': { - 'id': '615869', - 'ext': 'mp4', - 'title': 'Get Ahead of the Curve on 2013 Taxes', - 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", - 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' - } - }, { - 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'

(.*?)

', webpage, 'title') - video_url = self._html_search_regex( - r'(.*?)', - webpage, 'description', fatal=False) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/yt_dlp/extractor/motorsport.py b/yt_dlp/extractor/motorsport.py index efb087d03..167d85fa9 100644 --- a/yt_dlp/extractor/motorsport.py +++ b/yt_dlp/extractor/motorsport.py @@ -5,6 +5,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): + _WORKING = False IE_DESC = 'motorsport.com' _VALID_URL = r'https?://(?:www\.)?motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P[^/]+)/?(?:$|[?#])' _TEST = { diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index e192453c7..404e431bc 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -451,6 +451,7 @@ class MTVVideoIE(MTVServicesInfoExtractor): class MTVDEIE(MTVServicesInfoExtractor): + _WORKING = False IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P[0-9a-z]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/muenchentv.py b/yt_dlp/extractor/muenchentv.py index 36a2d4688..934cd4fbc 100644 --- a/yt_dlp/extractor/muenchentv.py +++ b/yt_dlp/extractor/muenchentv.py @@ -9,6 +9,7 @@ from ..utils import ( class MuenchenTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream' IE_DESC = 'münchen.tv' _TEST = { diff --git a/yt_dlp/extractor/musicdex.py b/yt_dlp/extractor/musicdex.py index 48f29702c..a86351458 100644 --- a/yt_dlp/extractor/musicdex.py +++ b/yt_dlp/extractor/musicdex.py @@ -17,11 +17,11 @@ class MusicdexBaseIE(InfoExtractor): 'track_number': track_json.get('number'), 'url': format_field(track_json, 'url', 'https://www.musicdex.org/%s'), 'duration': track_json.get('duration'), - 'genre': [genre.get('name') for genre in track_json.get('genres') or []], + 'genres': [genre.get('name') for genre in track_json.get('genres') or []], 'like_count': track_json.get('likes_count'), 'view_count': track_json.get('plays'), - 'artist': [artist.get('name') for artist in track_json.get('artists') or []], - 'album_artist': [artist.get('name') for artist in album_json.get('artists') or []], + 'artists': [artist.get('name') for artist in track_json.get('artists') or []], + 'album_artists': [artist.get('name') for artist in album_json.get('artists') or []], 'thumbnail': format_field(album_json, 'image', 'https://www.musicdex.org/%s'), 'album': album_json.get('name'), 'release_year': try_get(album_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year), @@ -43,11 +43,11 @@ class MusicdexSongIE(MusicdexBaseIE): 'track': 'dual existence', 'track_number': 1, 'duration': 266000, - 'genre': ['Anime'], + 'genres': ['Anime'], 'like_count': int, 'view_count': int, - 'artist': ['fripSide'], - 'album_artist': ['fripSide'], + 'artists': ['fripSide'], + 'album_artists': ['fripSide'], 'thumbnail': 'https://www.musicdex.org/storage/album/9iDIam1DHTVqUG4UclFIEq1WAFGXfPW4y0TtZa91.png', 'album': 'To Aru Kagaku no Railgun T OP2 Single - dual existence', 'release_year': 2020 @@ -69,9 +69,9 @@ class MusicdexAlbumIE(MusicdexBaseIE): 'playlist_mincount': 28, 'info_dict': { 'id': '56', - 'genre': ['OST'], + 'genres': ['OST'], 'view_count': int, - 'artist': ['TENMON & Eiichiro Yanagi / minori'], + 'artists': ['TENMON & Eiichiro Yanagi / minori'], 'title': 'ef - a tale of memories Original Soundtrack 2 ~fortissimo~', 'release_year': 2008, 'thumbnail': 'https://www.musicdex.org/storage/album/2rSHkyYBYfB7sbvElpEyTMcUn6toY7AohOgJuDlE.jpg', @@ -88,9 +88,9 @@ class MusicdexAlbumIE(MusicdexBaseIE): 'id': id, 'title': data_json.get('name'), 'description': data_json.get('description'), - 'genre': [genre.get('name') for genre in data_json.get('genres') or []], + 'genres': [genre.get('name') for genre in data_json.get('genres') or []], 'view_count': data_json.get('plays'), - 'artist': [artist.get('name') for artist in data_json.get('artists') or []], + 'artists': [artist.get('name') for artist in data_json.get('artists') or []], 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'), 'release_year': try_get(data_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year), 'entries': entries, diff --git a/yt_dlp/extractor/ndtv.py b/yt_dlp/extractor/ndtv.py index bfe52f77d..d099db37b 100644 --- a/yt_dlp/extractor/ndtv.py +++ b/yt_dlp/extractor/ndtv.py @@ -5,6 +5,7 @@ from ..utils import parse_duration, remove_end, unified_strdate, urljoin class NDTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:[^/]+\.)?ndtv\.com/(?:[^/]+/)*videos?/?(?:[^/]+/)*[^/?^&]+-(?P\d+)' _TESTS = [ diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 136b0e10a..cb8f6a67d 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,6 +1,7 @@ import itertools import json +from .art19 import Art19IE from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( @@ -112,7 +113,8 @@ class NebulaBaseIE(InfoExtractor): class NebulaIE(NebulaBaseIE): - _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P[-\w]+)' + IE_NAME = 'nebula:video' + _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P[\w-]+)' _TESTS = [{ 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'info_dict': { @@ -236,8 +238,8 @@ class NebulaIE(NebulaBaseIE): class NebulaClassIE(NebulaBaseIE): - IE_NAME = 'nebula:class' - _VALID_URL = rf'{_BASE_URL_RE}/(?P[-\w]+)/(?P\d+)' + IE_NAME = 'nebula:media' + _VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P[\w-]+)/(?P[\w-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://nebula.tv/copyright-for-fun-and-profit/14', 'info_dict': { @@ -253,6 +255,46 @@ class NebulaClassIE(NebulaBaseIE): 'title': 'Photos, Sculpture, and Video', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town', + 'info_dict': { + 'ext': 'mp3', + 'id': '018f65f0-0033-4021-8f87-2d132beb19aa', + 'description': 'md5:05d2b23ab780c955e2511a2b9127acff', + 'series_id': '335e8159-d663-491a-888f-1732285706ac', + 'modified_timestamp': 1599091504, + 'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa', + 'series': 'Extremities', + 'modified_date': '20200903', + 'upload_date': '20200902', + 'title': 'Pyramiden: The High-Arctic Soviet Ghost Town', + 'release_timestamp': 1571237958, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'duration': 1546.05714, + 'timestamp': 1599085608, + 'release_date': '20191016', + }, + }, { + 'url': 'https://nebula.tv/thelayover/the-layover-episode-1', + 'info_dict': { + 'ext': 'mp3', + 'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'episode_number': 1, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'release_date': '20230304', + 'modified_date': '20230403', + 'series': 'The Layover', + 'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'modified_timestamp': 1680554566, + 'duration': 3130.46401, + 'release_timestamp': 1677943800, + 'title': 'The Layover — Episode 1', + 'series_id': '874303a5-4900-4626-a4b6-2aacac34466a', + 'upload_date': '20230303', + 'episode': 'Episode 1', + 'timestamp': 1677883672, + 'description': 'md5:002cca89258e3bc7c268d5b8c24ba482', + }, }] def _real_extract(self, url): @@ -268,16 +310,38 @@ class NebulaClassIE(NebulaBaseIE): metadata = self._call_api( f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons', - slug, note='Fetching video metadata') - return { - **self._extract_video_metadata(metadata), - **self._extract_formats(metadata['id'], slug), - } + slug, note='Fetching class/podcast metadata') + content_type = metadata.get('type') + if content_type == 'lesson': + return { + **self._extract_video_metadata(metadata), + **self._extract_formats(metadata['id'], slug), + } + elif content_type == 'podcast_episode': + episode_url = metadata['episode_url'] + if not episode_url and metadata.get('premium'): + self.raise_login_required() + + if Art19IE.suitable(episode_url): + return self.url_result(episode_url, Art19IE) + return traverse_obj(metadata, { + 'id': ('id', {str}), + 'url': ('episode_url', {url_or_none}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('published_at', {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'channel_id': ('channel_id', {str}), + 'chnanel': ('channel_title', {str}), + 'thumbnail': ('assets', 'regular', {url_or_none}), + }) + + raise ExtractorError(f'Unexpected content type {content_type!r}') class NebulaSubscriptionsIE(NebulaBaseIE): IE_NAME = 'nebula:subscriptions' - _VALID_URL = rf'{_BASE_URL_RE}/(?Pmyshows|library/latest-videos)' + _VALID_URL = rf'{_BASE_URL_RE}/(?Pmyshows|library/latest-videos)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://nebula.tv/myshows', 'playlist_mincount': 1, @@ -310,7 +374,7 @@ class NebulaSubscriptionsIE(NebulaBaseIE): class NebulaChannelIE(NebulaBaseIE): IE_NAME = 'nebula:channel' - _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P[-\w]+)/?(?:$|[?#])' + _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P[\w-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://nebula.tv/tom-scott-presents-money', 'info_dict': { @@ -343,6 +407,14 @@ class NebulaChannelIE(NebulaBaseIE): 'description': 'md5:6690248223eed044a9f11cd5a24f9742', }, 'playlist_count': 23, + }, { + 'url': 'https://nebula.tv/trussissuespodcast', + 'info_dict': { + 'id': 'trussissuespodcast', + 'title': 'The TLDR News Podcast', + 'description': 'md5:a08c4483bc0b705881d3e0199e721385', + }, + 'playlist_mincount': 80, }] def _generate_playlist_entries(self, collection_id, collection_slug): @@ -365,6 +437,17 @@ class NebulaChannelIE(NebulaBaseIE): lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}', {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata) + def _generate_podcast_entries(self, collection_id, collection_slug): + next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true' + for page_num in itertools.count(1): + episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}') + + for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))): + yield self.url_result(episode['share_url'], NebulaClassIE) + next_url = episodes.get('next') + if not next_url: + break + def _real_extract(self, url): collection_slug = self._match_id(url) channel = self._call_api( @@ -373,6 +456,8 @@ class NebulaChannelIE(NebulaBaseIE): if channel.get('type') == 'class': entries = self._generate_class_entries(channel) + elif channel.get('type') == 'podcast_channel': + entries = self._generate_podcast_entries(channel['id'], collection_slug) else: entries = self._generate_playlist_entries(channel['id'], collection_slug) diff --git a/yt_dlp/extractor/nekohacker.py b/yt_dlp/extractor/nekohacker.py index e10ffe925..24b66570e 100644 --- a/yt_dlp/extractor/nekohacker.py +++ b/yt_dlp/extractor/nekohacker.py @@ -118,7 +118,6 @@ class NekoHackerIE(InfoExtractor): 'artist': 'Neko Hacker', 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', 'track_number': 1, - 'duration': None } }, { @@ -136,7 +135,6 @@ class NekoHackerIE(InfoExtractor): 'artist': 'Neko Hacker', 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', 'track_number': 2, - 'duration': None } }, { @@ -154,7 +152,6 @@ class NekoHackerIE(InfoExtractor): 'artist': 'Neko Hacker', 'track': '進め!むじなカンパニー (instrumental)', 'track_number': 3, - 'duration': None } }, { @@ -172,7 +169,6 @@ class NekoHackerIE(InfoExtractor): 'artist': 'Neko Hacker', 'track': 'むじな de なじむ (instrumental)', 'track_number': 4, - 'duration': None } } ] diff --git a/yt_dlp/extractor/nerdcubed.py b/yt_dlp/extractor/nerdcubed.py index 7c801b5d3..5f5607a20 100644 --- a/yt_dlp/extractor/nerdcubed.py +++ b/yt_dlp/extractor/nerdcubed.py @@ -1,33 +1,38 @@ -import datetime - from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj class NerdCubedFeedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json' + _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])' _TEST = { - 'url': 'http://www.nerdcubed.co.uk/feed.json', + 'url': 'http://www.nerdcubed.co.uk/', 'info_dict': { 'id': 'nerdcubed-feed', 'title': 'nerdcubed.co.uk feed', }, - 'playlist_mincount': 1300, + 'playlist_mincount': 5500, } - def _real_extract(self, url): - feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed') + def _extract_video(self, feed_entry): + return self.url_result( + f'https://www.youtube.com/watch?v={feed_entry["id"]}', YoutubeIE, + **traverse_obj(feed_entry, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('publishedAt', {parse_iso8601}), + 'channel': ('source', 'name', {str}), + 'channel_id': ('source', 'id', {str}), + 'channel_url': ('source', 'url', {str}), + 'thumbnail': ('thumbnail', 'source', {url_or_none}), + }), url_transparent=True) - entries = [{ - '_type': 'url', - 'title': feed_entry['title'], - 'uploader': feed_entry['source']['name'] if feed_entry['source'] else None, - 'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'), - 'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'], - } for feed_entry in feed] + def _real_extract(self, url): + video_id = 'nerdcubed-feed' + feed = self._download_json('https://www.nerdcubed.co.uk/_/cdn/videos.json', video_id) - return { - '_type': 'playlist', - 'title': 'nerdcubed.co.uk feed', - 'id': 'nerdcubed-feed', - 'entries': entries, - } + return self.playlist_result( + map(self._extract_video, traverse_obj(feed, ('videos', lambda _, v: v['id']))), + video_id, 'nerdcubed.co.uk feed') diff --git a/yt_dlp/extractor/netzkino.py b/yt_dlp/extractor/netzkino.py index 9c314e223..e9422eebf 100644 --- a/yt_dlp/extractor/netzkino.py +++ b/yt_dlp/extractor/netzkino.py @@ -8,6 +8,7 @@ from ..utils import ( class NetzkinoIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P[^/]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 9601cd10e..67e52efd6 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -2,7 +2,9 @@ import functools import re from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, OnDemandPagedList, clean_html, extract_attributes, @@ -10,12 +12,16 @@ from ..utils import ( int_or_none, parse_count, parse_duration, - traverse_obj, unified_timestamp, + url_or_none, + urlencode_postdata, + urljoin, ) +from ..utils.traversal import traverse_obj class NewgroundsIE(InfoExtractor): + _NETRC_MACHINE = 'newgrounds' _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P\d+)(?:/format/flash)?' _TESTS = [{ 'url': 'https://www.newgrounds.com/audio/listen/549479', @@ -25,11 +31,13 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp3', 'title': 'B7 - BusMode', 'uploader': 'Burn7', - 'timestamp': 1378878540, + 'timestamp': 1378892945, 'upload_date': '20130911', 'duration': 143, 'view_count': int, 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', + 'age_limit': 0, + 'thumbnail': r're:^https://aicon\.ngfiles\.com/549/549479\.png', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -39,11 +47,12 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Scrotum 1', 'uploader': 'Brian-Beaton', - 'timestamp': 955064100, - 'upload_date': '20000406', + 'timestamp': 955078533, + 'upload_date': '20000407', 'view_count': int, 'description': 'Scrotum plays "catch."', 'age_limit': 17, + 'thumbnail': r're:^https://picon\.ngfiles\.com/0/flash_1_card\.png', }, }, { # source format unavailable, additional mp4 formats @@ -53,11 +62,12 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'ZTV News Episode 8', 'uploader': 'ZONE-SAMA', - 'timestamp': 1487965140, - 'upload_date': '20170224', + 'timestamp': 1487983183, + 'upload_date': '20170225', 'view_count': int, 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', 'age_limit': 17, + 'thumbnail': r're:^https://picon\.ngfiles\.com/689000/flash_689400_card\.png', }, 'params': { 'skip_download': True, @@ -70,11 +80,12 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Metal Gear Awesome', 'uploader': 'Egoraptor', - 'timestamp': 1140663240, + 'timestamp': 1140681292, 'upload_date': '20060223', 'view_count': int, 'description': 'md5:9246c181614e23754571995104da92e0', 'age_limit': 13, + 'thumbnail': r're:^https://picon\.ngfiles\.com/297000/flash_297383_card\.png', } }, { 'url': 'https://www.newgrounds.com/portal/view/297383/format/flash', @@ -86,8 +97,24 @@ class NewgroundsIE(InfoExtractor): 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', - 'timestamp': 1140663240, + 'timestamp': 1140681292, + 'view_count': int, 'age_limit': 13, + 'thumbnail': r're:^https://picon\.ngfiles\.com/297000/flash_297383_card\.png', + } + }, { + 'url': 'https://www.newgrounds.com/portal/view/823109', + 'info_dict': { + 'id': '823109', + 'ext': 'mp4', + 'title': 'Rouge Futa Fleshlight Fuck', + 'description': 'I made a fleshlight model and I wanted to use it in an animation. Based on a video by CDNaturally.', + 'uploader': 'DefaultUser12', + 'upload_date': '20211122', + 'timestamp': 1637611540, + 'view_count': int, + 'age_limit': 18, + 'thumbnail': r're:^https://picon\.ngfiles\.com/823000/flash_823109_card\.png', } }] _AGE_LIMIT = { @@ -96,42 +123,59 @@ class NewgroundsIE(InfoExtractor): 'm': 17, 'a': 18, } + _LOGIN_URL = 'https://www.newgrounds.com/passport' + + def _perform_login(self, username, password): + login_webpage = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') + login_url = urljoin(self._LOGIN_URL, self._search_regex( + r'
]+>([^<]+)'), webpage, 'uploader', fatal=False) - age_limit = self._html_search_regex( - r']+>', webpage, 'age_limit', default='e') - age_limit = self._AGE_LIMIT.get(age_limit) - - timestamp = unified_timestamp(self._html_search_regex( - (r'
\s*Uploaded\s*
\s*
([^<]+
\s*
[^<]+)', - r'
\s*Uploaded\s*
\s*
([^<]+)'), webpage, 'timestamp', - default=None)) - - duration = parse_duration(self._html_search_regex( - r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, - 'duration', default=None)) - - description = clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) - - view_count = parse_count(self._html_search_regex( - r'(?s)
\s*(?:Views|Listens)\s*
\s*
([\d\.,]+)
', webpage, - 'view count', default=None)) - - filesize = int_or_none(self._html_search_regex( - r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', - default=None)) - - video_type_description = self._html_search_regex( - r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'filesize', - default=None) - if len(formats) == 1: - formats[0]['filesize'] = filesize + formats[0]['filesize'] = int_or_none(self._html_search_regex( + r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', default=None)) - if video_type_description == 'Audio File': - formats[0]['vcodec'] = 'none' - self._check_formats(formats, media_id) + video_type_description = self._html_search_regex( + r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'media type', default=None) + if video_type_description == 'Audio File': + formats[0]['vcodec'] = 'none' + self._check_formats(formats, media_id) return { 'id': media_id, - 'title': title, + 'title': self._html_extract_title(webpage), 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, + 'timestamp': unified_timestamp(self._search_regex( + r'itemprop="(?:uploadDate|datePublished)"\s+content="([^"]+)"', + webpage, 'timestamp', default=None)), + 'duration': parse_duration(self._html_search_regex( + r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)), 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': description, - 'age_limit': age_limit, - 'view_count': view_count, + 'description': ( + clean_html(get_element_by_id('author_comments', webpage)) + or self._og_search_description(webpage)), + 'age_limit': self._AGE_LIMIT.get(self._html_search_regex( + r'\s*(?:Views|Listens)\s*\s*
([\d\.,]+)
', + webpage, 'view count', default=None)), } diff --git a/yt_dlp/extractor/nextmedia.py b/yt_dlp/extractor/nextmedia.py index 0e47a4d45..871d3e669 100644 --- a/yt_dlp/extractor/nextmedia.py +++ b/yt_dlp/extractor/nextmedia.py @@ -191,6 +191,8 @@ class AppleDailyIE(NextMediaIE): # XXX: Do not subclass from concrete IE class NextTVIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE IE_DESC = '壹電視' _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P\d+)' diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index b889c752c..6a4624602 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -13,13 +13,11 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, - bug_reports_message, clean_html, float_or_none, int_or_none, join_nonempty, parse_duration, - parse_filesize, parse_iso8601, parse_resolution, qualities, @@ -38,6 +36,8 @@ from ..utils import ( class NiconicoIE(InfoExtractor): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', @@ -55,25 +55,31 @@ class NiconicoIE(InfoExtractor): 'duration': 33, 'view_count': int, 'comment_count': int, + 'genres': ['未設定'], + 'tags': [], + 'expected_protocol': str, }, - 'skip': 'Requires an account', }, { # File downloaded with and without credentials are different, so omit # the md5 field 'url': 'http://www.nicovideo.jp/watch/nm14296458', 'info_dict': { 'id': 'nm14296458', - 'ext': 'swf', - 'title': '【鏡音リン】Dance on media【オリジナル】take2!', - 'description': 'md5:689f066d74610b3b22e0f1739add0f58', + 'ext': 'mp4', + 'title': '【Kagamine Rin】Dance on media【Original】take2!', + 'description': 'md5:9368f2b1f4178de64f2602c2f3d6cbf5', 'thumbnail': r're:https?://.*', 'uploader': 'りょうた', 'uploader_id': '18822557', 'upload_date': '20110429', 'timestamp': 1304065916, - 'duration': 209, + 'duration': 208.0, + 'comment_count': int, + 'view_count': int, + 'genres': ['音楽・サウンド'], + 'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'], + 'expected_protocol': str, }, - 'skip': 'Requires an account', }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -107,22 +113,24 @@ class NiconicoIE(InfoExtractor): }, { # video not available via `getflv`; "old" HTML5 video 'url': 'http://www.nicovideo.jp/watch/sm1151009', - 'md5': '8fa81c364eb619d4085354eab075598a', + 'md5': 'f95a3d259172667b293530cc2e41ebda', 'info_dict': { 'id': 'sm1151009', 'ext': 'mp4', 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', - 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', + 'description': 'md5:f95a3d259172667b293530cc2e41ebda', 'thumbnail': r're:https?://.*', 'duration': 184, - 'timestamp': 1190868283, - 'upload_date': '20070927', + 'timestamp': 1190835883, + 'upload_date': '20070926', 'uploader': 'denden2', 'uploader_id': '1392194', 'view_count': int, 'comment_count': int, + 'genres': ['ゲーム'], + 'tags': [], + 'expected_protocol': str, }, - 'skip': 'Requires an account', }, { # "New" HTML5 video # md5 is unstable @@ -132,16 +140,18 @@ class NiconicoIE(InfoExtractor): 'ext': 'mp4', 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', - 'timestamp': 1498514060, + 'timestamp': 1498481660, 'upload_date': '20170626', - 'uploader': 'ゲスト', + 'uploader': 'no-namamae', 'uploader_id': '40826363', 'thumbnail': r're:https?://.*', 'duration': 198, 'view_count': int, 'comment_count': int, + 'genres': ['アニメ'], + 'tags': [], + 'expected_protocol': str, }, - 'skip': 'Requires an account', }, { # Video without owner 'url': 'http://www.nicovideo.jp/watch/sm18238488', @@ -151,16 +161,16 @@ class NiconicoIE(InfoExtractor): 'ext': 'mp4', 'title': '【実写版】ミュータントタートルズ', 'description': 'md5:15df8988e47a86f9e978af2064bf6d8e', - 'timestamp': 1341160408, + 'timestamp': 1341128008, 'upload_date': '20120701', - 'uploader': None, - 'uploader_id': None, 'thumbnail': r're:https?://.*', 'duration': 5271, 'view_count': int, 'comment_count': int, + 'genres': ['エンターテイメント'], + 'tags': [], + 'expected_protocol': str, }, - 'skip': 'Requires an account', }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -353,15 +363,10 @@ class NiconicoIE(InfoExtractor): if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): return None - def extract_video_quality(video_quality): - return parse_filesize('%sB' % self._search_regex( - r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default='')) - format_id = '-'.join( [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) - vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate')) return { 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']), @@ -370,10 +375,15 @@ class NiconicoIE(InfoExtractor): 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 'acodec': 'aac', 'vcodec': 'h264', - 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000), - 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000), - 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')), - 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')), + **traverse_obj(audio_quality, ('metadata', { + 'abr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'asr': ('samplingRate', {int_or_none}), + })), + **traverse_obj(video_quality, ('metadata', { + 'vbr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'height': ('resolution', 'height', {int_or_none}), + 'width': ('resolution', 'width', {int_or_none}), + })), 'quality': -2 if 'low' in video_quality['id'] else None, 'protocol': 'niconico_dmc', 'expected_protocol': dmc_protocol, # XXX: This is not a documented field @@ -383,6 +393,63 @@ class NiconicoIE(InfoExtractor): } } + def _yield_dmc_formats(self, api_data, video_id): + dmc_data = traverse_obj(api_data, ('media', 'delivery', 'movie')) + audios = traverse_obj(dmc_data, ('audios', ..., {dict})) + videos = traverse_obj(dmc_data, ('videos', ..., {dict})) + protocols = traverse_obj(dmc_data, ('session', 'protocols', ..., {str})) + if not all((audios, videos, protocols)): + return + + for audio_quality, video_quality, protocol in itertools.product(audios, videos, protocols): + if fmt := self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol): + yield fmt + + def _yield_dms_formats(self, api_data, video_id): + fmt_filter = lambda _, v: v['isAvailable'] and v['id'] + videos = traverse_obj(api_data, ('media', 'domand', 'videos', fmt_filter)) + audios = traverse_obj(api_data, ('media', 'domand', 'audios', fmt_filter)) + access_key = traverse_obj(api_data, ('media', 'domand', 'accessRightKey', {str})) + track_id = traverse_obj(api_data, ('client', 'watchTrackId', {str})) + if not all((videos, audios, access_key, track_id)): + return + + dms_m3u8_url = self._download_json( + f'https://nvapi.nicovideo.jp/v1/watch/{video_id}/access-rights/hls', video_id, + data=json.dumps({ + 'outputs': list(itertools.product((v['id'] for v in videos), (a['id'] for a in audios))) + }).encode(), query={'actionTrackId': track_id}, headers={ + 'x-access-right-key': access_key, + 'x-frontend-id': 6, + 'x-frontend-version': 0, + 'x-request-with': 'https://www.nicovideo.jp', + })['data']['contentUrl'] + # Getting all audio formats results in duplicate video formats which we filter out later + dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id) + + # m3u8 extraction does not provide audio bitrates, so extract from the API data and fix + for audio_fmt in traverse_obj(dms_fmts, lambda _, v: v['vcodec'] == 'none'): + yield { + **audio_fmt, + **traverse_obj(audios, (lambda _, v: audio_fmt['format_id'].startswith(v['id']), { + 'format_id': ('id', {str}), + 'abr': ('bitRate', {functools.partial(float_or_none, scale=1000)}), + 'asr': ('samplingRate', {int_or_none}), + }), get_all=False), + 'acodec': 'aac', + 'ext': 'm4a', + } + + # Sort before removing dupes to keep the format dicts with the lowest tbr + video_fmts = sorted((fmt for fmt in dms_fmts if fmt['vcodec'] != 'none'), key=lambda f: f['tbr']) + self._remove_duplicate_formats(video_fmts) + # Calculate the true vbr/tbr by subtracting the lowest abr + min_abr = min(traverse_obj(audios, (..., 'bitRate', {float_or_none})), default=0) / 1000 + for video_fmt in video_fmts: + video_fmt['tbr'] -= min_abr + video_fmt['format_id'] = f'video-{video_fmt["tbr"]:.0f}' + yield video_fmt + def _real_extract(self, url): video_id = self._match_id(url) @@ -409,19 +476,29 @@ class NiconicoIE(InfoExtractor): webpage, 'error reason', default=None) if not error_msg: raise - raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True) - - formats = [] - - def get_video_info(*items, get_first=True, **kwargs): - return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) - - quality_info = api_data['media']['delivery']['movie'] - session_api_data = quality_info['session'] - for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']): - fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol) - if fmt: - formats.append(fmt) + raise ExtractorError(clean_html(error_msg), expected=True) + + availability = self._availability(**(traverse_obj(api_data, ('payment', 'video', { + 'needs_premium': ('isPremium', {bool}), + 'needs_subscription': ('isAdmission', {bool}), + })) or {'needs_auth': True})) + formats = [*self._yield_dmc_formats(api_data, video_id), + *self._yield_dms_formats(api_data, video_id)] + if not formats: + fail_msg = clean_html(self._html_search_regex( + r']+\bclass="fail-message"[^>]*>(?P.+?)

', + webpage, 'fail message', default=None, group='msg')) + if fail_msg: + self.to_screen(f'Niconico said: {fail_msg}') + if fail_msg and 'された地域と同じ地域からのみ視聴できます。' in fail_msg: + availability = None + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + elif availability == 'premium_only': + self.raise_login_required('This video requires premium', metadata_available=True) + elif availability == 'subscriber_only': + self.raise_login_required('This video is for members only', metadata_available=True) + elif availability == 'needs_auth': + self.raise_login_required(metadata_available=False) # Start extracting information tags = None @@ -440,11 +517,15 @@ class NiconicoIE(InfoExtractor): thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp']) + def get_video_info(*items, get_first=True, **kwargs): + return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) + return { 'id': video_id, '_api_data': api_data, 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, + 'availability': availability, 'thumbnails': [{ 'id': key, 'url': url, @@ -472,8 +553,11 @@ class NiconicoIE(InfoExtractor): def _get_subtitles(self, video_id, api_data): comments_info = traverse_obj(api_data, ('comment', 'nvComment', {dict})) or {} + if not comments_info.get('server'): + return + danmaku = traverse_obj(self._download_json( - f'{comments_info.get("server")}/v1/threads', video_id, data=json.dumps({ + f'{comments_info["server"]}/v1/threads', video_id, data=json.dumps({ 'additionals': {}, 'params': comments_info.get('params'), 'threadKey': comments_info.get('threadKey'), @@ -489,10 +573,6 @@ class NiconicoIE(InfoExtractor): note='Downloading comments', errnote='Failed to download comments'), ('data', 'threads', ..., 'comments', ...)) - if not danmaku: - self.report_warning(f'Failed to get comments. {bug_reports_message()}') - return - return { 'comments': [{ 'ext': 'json', diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py index 31df42f4f..579370f1b 100644 --- a/yt_dlp/extractor/ninecninemedia.py +++ b/yt_dlp/extractor/ninecninemedia.py @@ -3,6 +3,7 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, + str_or_none, try_get, ) @@ -73,7 +74,7 @@ class NineCNineMediaIE(InfoExtractor): 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), 'season_number': int_or_none(season.get('Number')), - 'season_id': season.get('Id'), + 'season_id': str_or_none(season.get('Id')), 'series': try_get(content, lambda x: x['Media']['Name']), 'tags': tags, 'categories': categories, @@ -109,10 +110,9 @@ class CPTwentyFourIE(InfoExtractor): 'title': 'WATCH: Truck rips ATM from Mississauga business', 'description': 'md5:cf7498480885f080a754389a2b2f7073', 'timestamp': 1637618377, - 'episode_number': None, 'season': 'Season 0', 'season_number': 0, - 'season_id': 57974, + 'season_id': '57974', 'series': 'CTV News Toronto', 'duration': 26.86, 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', diff --git a/yt_dlp/extractor/nobelprize.py b/yt_dlp/extractor/nobelprize.py index 1aa9705be..cddc72f71 100644 --- a/yt_dlp/extractor/nobelprize.py +++ b/yt_dlp/extractor/nobelprize.py @@ -10,6 +10,7 @@ from ..utils import ( class NobelPrizeIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer.*?\bid=(?P\d+)' _TEST = { 'url': 'http://www.nobelprize.org/mediaplayer/?id=2636', diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index d8849cd88..77ae03fd0 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -18,7 +18,6 @@ class NovaPlayIE(InfoExtractor): 'upload_date': '20220722', 'thumbnail': 'https://nbg-img.fite.tv/img/606627_460x260.jpg', 'description': '29 сек', - 'view_count': False }, }, { @@ -34,7 +33,6 @@ class NovaPlayIE(InfoExtractor): 'upload_date': '20220722', 'thumbnail': 'https://nbg-img.fite.tv/img/606609_460x260.jpg', 'description': '29 сек', - 'view_count': False }, } ] diff --git a/yt_dlp/extractor/noz.py b/yt_dlp/extractor/noz.py index 59d259f9d..c7b803803 100644 --- a/yt_dlp/extractor/noz.py +++ b/yt_dlp/extractor/noz.py @@ -9,6 +9,7 @@ from ..compat import compat_urllib_parse_unquote class NozIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?noz\.de/video/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.noz.de/video/25151/32-Deutschland-gewinnt-Badminton-Lnderspiel-in-Melle', diff --git a/yt_dlp/extractor/ntvru.py b/yt_dlp/extractor/ntvru.py index 91b7724eb..fe3965729 100644 --- a/yt_dlp/extractor/ntvru.py +++ b/yt_dlp/extractor/ntvru.py @@ -35,6 +35,7 @@ class NTVRuIE(InfoExtractor): 'duration': 172, 'view_count': int, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', 'md5': '82dbd49b38e3af1d00df16acbeab260c', @@ -78,7 +79,8 @@ class NTVRuIE(InfoExtractor): }] _VIDEO_ID_REGEXES = [ - r'http.+?)\1', webpage, 'video url', - default=None if no_video else NO_DEFAULT, group='url') - - if no_video: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - return { - 'id': video_id, - 'url': video_url, - 'title': remove_start(self._og_search_title(webpage), 'Video: '), - 'thumbnail': self._og_search_thumbnail(webpage), - } diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index 94fcac720..591b4147e 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -11,6 +11,7 @@ from ..utils import ( join_nonempty, parse_age_limit, parse_qs, + str_or_none, unified_strdate, url_or_none, ) @@ -32,7 +33,7 @@ class OnDemandKoreaIE(InfoExtractor): 'duration': 5486.955, 'release_date': '20220924', 'series': 'Ask Us Anything', - 'series_id': 11790, + 'series_id': '11790', 'episode_number': 351, 'episode': 'Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', }, @@ -47,7 +48,7 @@ class OnDemandKoreaIE(InfoExtractor): 'duration': 1586.0, 'release_date': '20231001', 'series': 'Breakup Probation, A Week', - 'series_id': 22912, + 'series_id': '22912', 'episode_number': 8, 'episode': 'E08', }, @@ -117,7 +118,7 @@ class OnDemandKoreaIE(InfoExtractor): 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), 'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}), 'series': ('episode', {if_series(key='program')}, 'title'), - 'series_id': ('episode', {if_series(key='program')}, 'id'), + 'series_id': ('episode', {if_series(key='program')}, 'id', {str_or_none}), 'episode': ('episode', {if_series(key='title')}), 'episode_number': ('episode', {if_series(key='number')}, {int_or_none}), }, get_all=False), diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index 86dc9bb89..82a81c6c2 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -12,6 +12,8 @@ from ..compat import compat_str class OpenRecBaseIE(InfoExtractor): + _M3U8_HEADERS = {'Referer': 'https://www.openrec.tv/'} + def _extract_pagestore(self, webpage, video_id): return self._parse_json( self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) @@ -21,7 +23,7 @@ class OpenRecBaseIE(InfoExtractor): if not m3u8_url: continue yield from self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id=name) + m3u8_url, video_id, ext='mp4', m3u8_id=name, headers=self._M3U8_HEADERS) def _extract_movie(self, webpage, video_id, name, is_live): window_stores = self._extract_pagestore(webpage, video_id) @@ -60,6 +62,7 @@ class OpenRecBaseIE(InfoExtractor): 'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')), 'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')), 'is_live': is_live, + 'http_headers': self._M3U8_HEADERS, } @@ -110,7 +113,7 @@ class OpenRecCaptureIE(OpenRecBaseIE): raise ExtractorError('Cannot extract title') formats = self._extract_m3u8_formats( - capture_data.get('source'), video_id, ext='mp4') + capture_data.get('source'), video_id, ext='mp4', headers=self._M3U8_HEADERS) return { 'id': video_id, @@ -121,6 +124,7 @@ class OpenRecCaptureIE(OpenRecBaseIE): 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str), 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str), 'upload_date': unified_strdate(capture_data.get('createdAt')), + 'http_headers': self._M3U8_HEADERS, } diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index 1b2a79a62..526e9acaf 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -49,7 +49,6 @@ class ORFTVthekIE(InfoExtractor): 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150', 'info_dict': { 'id': '14121079', - 'playlist_count': 1 }, 'playlist': [{ 'info_dict': { diff --git a/yt_dlp/extractor/parlview.py b/yt_dlp/extractor/parlview.py index 0b547917c..777b00889 100644 --- a/yt_dlp/extractor/parlview.py +++ b/yt_dlp/extractor/parlview.py @@ -8,7 +8,7 @@ from ..utils import ( class ParlviewIE(InfoExtractor): - + _WORKING = False _VALID_URL = r'https?://(?:www\.)?parlview\.aph\.gov\.au/(?:[^/]+)?\bvideoID=(?P\d{6})' _TESTS = [{ 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=542661', diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index 41f591b09..939c26dc7 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -157,7 +157,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'display_id': '47iUho33toY', 'ext': 'mp4', 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', - 'description': None, 'timestamp': 1507052209, 'upload_date': '20171003', 'thumbnail': r're:^https?://.*\.jpg$', @@ -176,7 +175,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'display_id': 'z3_7iwWCmqt', 'ext': 'mp4', 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', - 'description': None, 'timestamp': 1607470323, 'upload_date': '20201208', 'thumbnail': r're:^https?://.*\.jpg$', diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py index 00500686f..d67f6005c 100644 --- a/yt_dlp/extractor/pladform.py +++ b/yt_dlp/extractor/pladform.py @@ -35,7 +35,6 @@ class PladformIE(InfoExtractor): 'thumbnail': str, 'view_count': int, 'description': str, - 'category': list, 'uploader_id': '12082', 'uploader': 'Comedy Club', 'duration': 367, diff --git a/yt_dlp/extractor/planetmarathi.py b/yt_dlp/extractor/planetmarathi.py index 25753fe7e..a4b612a6e 100644 --- a/yt_dlp/extractor/planetmarathi.py +++ b/yt_dlp/extractor/planetmarathi.py @@ -20,7 +20,6 @@ class PlanetMarathiIE(InfoExtractor): 'title': 'ek unad divas', 'alt_title': 'चित्रपट', 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881', - 'season_number': None, 'episode_number': 1, 'duration': 5539, 'upload_date': '20210829', diff --git a/yt_dlp/extractor/playstuff.py b/yt_dlp/extractor/playstuff.py deleted file mode 100644 index b424ba187..000000000 --- a/yt_dlp/extractor/playstuff.py +++ /dev/null @@ -1,63 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - smuggle_url, - try_get, -) - - -class PlayStuffIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a', - 'md5': 'c82d3669e5247c64bc382577843e5bd0', - 'info_dict': { - 'id': '6250584958001', - 'ext': 'mp4', - 'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga', - 'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913', - 'uploader_id': '6005208634001', - 'timestamp': 1619491027, - 'upload_date': '20210427', - }, - 'add_ie': ['BrightcoveNew'], - }, { - # geo restricted, bypassable - 'url': 'https://play.stuff.co.nz/details/_6155660351001', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - state = self._parse_json( - self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'), - video_id) - - account_id = try_get( - state, lambda x: x['configurations']['accountId'], - compat_str) or '6005208634001' - player_id = try_get( - state, lambda x: x['configurations']['playerId'], - compat_str) or 'default' - - entries = [] - for item_id, video in state['items'].items(): - if not isinstance(video, dict): - continue - asset_id = try_get( - video, lambda x: x['content']['attributes']['assetId'], - compat_str) - if not asset_id: - continue - entries.append(self.url_result( - smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id), - {'geo_countries': ['NZ']}), - 'BrightcoveNew', video_id)) - - return self.playlist_result(entries, video_id) diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py index caffeb21d..5898d927c 100644 --- a/yt_dlp/extractor/plutotv.py +++ b/yt_dlp/extractor/plutotv.py @@ -16,6 +16,7 @@ from ..utils import ( class PlutoTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'''(?x) https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand /(?Pmovies|series) diff --git a/yt_dlp/extractor/podchaser.py b/yt_dlp/extractor/podchaser.py index 290c48817..fc2d407b1 100644 --- a/yt_dlp/extractor/podchaser.py +++ b/yt_dlp/extractor/podchaser.py @@ -29,7 +29,7 @@ class PodchaserIE(InfoExtractor): 'duration': 3708, 'timestamp': 1636531259, 'upload_date': '20211110', - 'rating': 4.0 + 'average_rating': 4.0 } }, { 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', @@ -59,7 +59,7 @@ class PodchaserIE(InfoExtractor): 'thumbnail': episode.get('image_url'), 'duration': str_to_int(episode.get('length')), 'timestamp': unified_timestamp(episode.get('air_date')), - 'rating': float_or_none(episode.get('rating')), + 'average_rating': float_or_none(episode.get('rating')), 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), 'tags': traverse_obj(podcast, ('tags', ..., 'text')), 'series': podcast.get('title'), diff --git a/yt_dlp/extractor/podomatic.py b/yt_dlp/extractor/podomatic.py index 985bfae9d..37b68694b 100644 --- a/yt_dlp/extractor/podomatic.py +++ b/yt_dlp/extractor/podomatic.py @@ -5,6 +5,7 @@ from ..utils import int_or_none class PodomaticIE(InfoExtractor): + _WORKING = False IE_NAME = 'podomatic' _VALID_URL = r'''(?x) (?Phttps?):// diff --git a/yt_dlp/extractor/pornovoisines.py b/yt_dlp/extractor/pornovoisines.py index aa48da06b..2e51b4f6b 100644 --- a/yt_dlp/extractor/pornovoisines.py +++ b/yt_dlp/extractor/pornovoisines.py @@ -7,6 +7,7 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P\d+)/(?P[^/.]+)' _TEST = { diff --git a/yt_dlp/extractor/pornoxo.py b/yt_dlp/extractor/pornoxo.py index 5104d8a49..049feb4ec 100644 --- a/yt_dlp/extractor/pornoxo.py +++ b/yt_dlp/extractor/pornoxo.py @@ -5,6 +5,7 @@ from ..utils import ( class PornoXOIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P\d+)/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 36e415f4a..66f8a5f44 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,5 +1,4 @@ import json -from datetime import date from urllib.parse import unquote from .common import InfoExtractor @@ -10,6 +9,7 @@ from ..utils import ( int_or_none, make_archive_id, mimetype2ext, + str_or_none, urljoin, ) from ..utils.traversal import traverse_obj @@ -25,8 +25,8 @@ class Pr0grammIE(InfoExtractor): 'title': 'pr0gramm-5466437 by g11st', 'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'], 'uploader': 'g11st', - 'uploader_id': 394718, - 'upload_timestamp': 1671590240, + 'uploader_id': '394718', + 'timestamp': 1671590240, 'upload_date': '20221221', 'like_count': int, 'dislike_count': int, @@ -42,8 +42,8 @@ class Pr0grammIE(InfoExtractor): 'title': 'pr0gramm-3052805 by Hansking1', 'tags': 'count:15', 'uploader': 'Hansking1', - 'uploader_id': 385563, - 'upload_timestamp': 1552930408, + 'uploader_id': '385563', + 'timestamp': 1552930408, 'upload_date': '20190318', 'like_count': int, 'dislike_count': int, @@ -60,8 +60,8 @@ class Pr0grammIE(InfoExtractor): 'title': 'pr0gramm-5848332 by erd0pfel', 'tags': 'count:18', 'uploader': 'erd0pfel', - 'uploader_id': 349094, - 'upload_timestamp': 1694489652, + 'uploader_id': '349094', + 'timestamp': 1694489652, 'upload_date': '20230912', 'like_count': int, 'dislike_count': int, @@ -77,8 +77,8 @@ class Pr0grammIE(InfoExtractor): 'title': 'pr0gramm-5895149 by algoholigSeeManThrower', 'tags': 'count:19', 'uploader': 'algoholigSeeManThrower', - 'uploader_id': 457556, - 'upload_timestamp': 1697580902, + 'uploader_id': '457556', + 'timestamp': 1697580902, 'upload_date': '20231018', 'like_count': int, 'dislike_count': int, @@ -192,11 +192,10 @@ class Pr0grammIE(InfoExtractor): '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)], **traverse_obj(video_info, { 'uploader': ('user', {str}), - 'uploader_id': ('userId', {int}), + 'uploader_id': ('userId', {str_or_none}), 'like_count': ('up', {int}), 'dislike_count': ('down', {int}), - 'upload_timestamp': ('created', {int}), - 'upload_date': ('created', {int}, {date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}), + 'timestamp': ('created', {int}), 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}) }), } diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py index 562aca0ff..56cd40d8a 100644 --- a/yt_dlp/extractor/prankcast.py +++ b/yt_dlp/extractor/prankcast.py @@ -16,7 +16,7 @@ class PrankCastIE(InfoExtractor): 'display_id': 'Beverly-is-back-like-a-heart-attack-', 'timestamp': 1661391575, 'uploader': 'Devonanustart', - 'channel_id': 4, + 'channel_id': '4', 'duration': 7918, 'cast': ['Devonanustart', 'Phonelosers'], 'description': '', @@ -33,7 +33,7 @@ class PrankCastIE(InfoExtractor): 'display_id': 'NOT-COOL', 'timestamp': 1665028364, 'uploader': 'phonelosers', - 'channel_id': 6, + 'channel_id': '6', 'duration': 4044, 'cast': ['phonelosers'], 'description': '', @@ -60,7 +60,7 @@ class PrankCastIE(InfoExtractor): 'url': f'{json_info["broadcast_url"]}{json_info["recording_hash"]}.mp3', 'timestamp': start_date, 'uploader': uploader, - 'channel_id': json_info.get('user_id'), + 'channel_id': str_or_none(json_info.get('user_id')), 'duration': try_call(lambda: parse_iso8601(json_info['end_date']) - start_date), 'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))), 'description': json_info.get('broadcast_description'), diff --git a/yt_dlp/extractor/projectveritas.py b/yt_dlp/extractor/projectveritas.py index 0e029ce8c..daf14054c 100644 --- a/yt_dlp/extractor/projectveritas.py +++ b/yt_dlp/extractor/projectveritas.py @@ -7,6 +7,7 @@ from ..utils import ( class ProjectVeritasIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?projectveritas\.com/(?Pnews|video)/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/', diff --git a/yt_dlp/extractor/r7.py b/yt_dlp/extractor/r7.py index f067a0571..36f0b52bd 100644 --- a/yt_dlp/extractor/r7.py +++ b/yt_dlp/extractor/r7.py @@ -3,6 +3,8 @@ from ..utils import int_or_none class R7IE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE _VALID_URL = r'''(?x) https?:// (?: @@ -86,6 +88,8 @@ class R7IE(InfoExtractor): class R7ArticleIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P\d+)' _TEST = { 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', diff --git a/yt_dlp/extractor/radiocomercial.py b/yt_dlp/extractor/radiocomercial.py index 07891fe41..38f8cf786 100644 --- a/yt_dlp/extractor/radiocomercial.py +++ b/yt_dlp/extractor/radiocomercial.py @@ -30,7 +30,8 @@ class RadioComercialIE(InfoExtractor): 'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.', 'release_date': '20231025', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', - 'season': 6 + 'season': 'Season 6', + 'season_number': 6, } }, { 'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem', @@ -41,7 +42,8 @@ class RadioComercialIE(InfoExtractor): 'title': 'Convença-me num minuto que os lobisomens existem', 'release_date': '20231026', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', - 'season': 3 + 'season': 'Season 3', + 'season_number': 3, } }, { 'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao', @@ -53,7 +55,8 @@ class RadioComercialIE(InfoExtractor): 'description': 'md5:8a82beeb372641614772baab7246245f', 'release_date': '20231101', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', - 'season': 2 + 'season': 'Season 2', + 'season_number': 2, }, 'params': { # inconsistant md5 @@ -68,7 +71,8 @@ class RadioComercialIE(InfoExtractor): 'title': 'T.N.T 29 de outubro', 'release_date': '20231029', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', - 'season': 2023 + 'season': 'Season 2023', + 'season_number': 2023, } }] @@ -82,7 +86,7 @@ class RadioComercialIE(InfoExtractor): 'release_date': unified_strdate(get_element_by_class( 'date', get_element_html_by_class('descriptions', webpage) or '')), 'thumbnail': self._og_search_thumbnail(webpage), - 'season': int_or_none(season), + 'season_number': int_or_none(season), 'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'), } diff --git a/yt_dlp/extractor/radiode.py b/yt_dlp/extractor/radiode.py index 32c36d557..726207825 100644 --- a/yt_dlp/extractor/radiode.py +++ b/yt_dlp/extractor/radiode.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class RadioDeIE(InfoExtractor): + _WORKING = False IE_NAME = 'radio.de' _VALID_URL = r'https?://(?P.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' _TEST = { diff --git a/yt_dlp/extractor/radiojavan.py b/yt_dlp/extractor/radiojavan.py index 6a9139466..b3befaef9 100644 --- a/yt_dlp/extractor/radiojavan.py +++ b/yt_dlp/extractor/radiojavan.py @@ -11,6 +11,7 @@ from ..utils import ( class RadioJavanIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P[^/]+)/?' _TEST = { 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', diff --git a/yt_dlp/extractor/radlive.py b/yt_dlp/extractor/radlive.py index 9bcbb11d5..3c00183be 100644 --- a/yt_dlp/extractor/radlive.py +++ b/yt_dlp/extractor/radlive.py @@ -38,10 +38,6 @@ class RadLiveIE(InfoExtractor): 'language': 'en', 'thumbnail': 'https://lsp.littlstar.com/channels/WHISTLE/BAD_JOKES/SEASON_1/BAD_JOKES_101/poster.jpg', 'description': 'Bad Jokes - Champions, Adam Pally, Super Troopers, Team Edge and 2Hype', - 'release_timestamp': None, - 'channel': None, - 'channel_id': None, - 'channel_url': None, 'episode': 'E01: Bad Jokes 1', 'episode_number': 1, 'episode_id': '336', diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index f6219c2db..c1fc65c81 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -28,6 +28,29 @@ class RaiBaseIE(InfoExtractor): _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False + def _fix_m3u8_formats(self, media_url, video_id): + fmts = self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + # Fix malformed m3u8 manifests by setting audio-only/video-only formats + for f in fmts: + if not f.get('acodec'): + f['acodec'] = 'mp4a' + if not f.get('vcodec'): + f['vcodec'] = 'avc1' + man_url = f['url'] + if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only + f['vcodec'] = 'none' + elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only + f['acodec'] = 'none' + else: # video+audio + if f['acodec'] == 'none': + f['acodec'] = 'mp4a' + if f['vcodec'] == 'none': + f['vcodec'] = 'avc1' + + return fmts + def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): def fix_cdata(s): # remove \r\n\t before and after to avoid @@ -69,8 +92,7 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'https-mp3', }) elif ext == 'm3u8' or 'format=m3u8' in media_url: - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + formats.extend(self._fix_m3u8_formats(media_url, video_id)) elif ext == 'f4m': # very likely no longer needed. Cannot find any url that uses it. manifest_url = update_url_query( @@ -153,10 +175,10 @@ class RaiBaseIE(InfoExtractor): 'format_id': f'https-{tbr}', 'width': format_copy.get('width'), 'height': format_copy.get('height'), - 'tbr': format_copy.get('tbr'), - 'vcodec': format_copy.get('vcodec'), - 'acodec': format_copy.get('acodec'), - 'fps': format_copy.get('fps'), + 'tbr': format_copy.get('tbr') or tbr, + 'vcodec': format_copy.get('vcodec') or 'avc1', + 'acodec': format_copy.get('acodec') or 'mp4a', + 'fps': format_copy.get('fps') or 25, } if format_copy else { 'format_id': f'https-{tbr}', 'width': _QUALITY[tbr][0], @@ -245,7 +267,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'Report', 'season': '2013/14', 'subtitles': {'it': 'count:4'}, - 'release_year': 2022, + 'release_year': 2024, 'episode': 'Espresso nel caffè - 07/04/2014', 'timestamp': 1396919880, 'upload_date': '20140408', @@ -253,7 +275,7 @@ class RaiPlayIE(RaiBaseIE): }, 'params': {'skip_download': True}, }, { - # 1080p direct mp4 url + # 1080p 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', 'md5': 'aeda7243115380b2dd5e881fd42d949a', 'info_dict': { @@ -274,7 +296,7 @@ class RaiPlayIE(RaiBaseIE): 'episode': 'Senza occhi', 'timestamp': 1637318940, 'upload_date': '20211119', - 'formats': 'count:12', + 'formats': 'count:7', }, 'params': {'skip_download': True}, 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] @@ -527,7 +549,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'ilruggitodelconiglio', 'title': 'Il Ruggito del Coniglio', - 'description': 'md5:48cff6972435964284614d70474132e6', + 'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e', }, 'playlist_mincount': 65, }, { @@ -634,19 +656,20 @@ class RaiIE(RaiBaseIE): } -class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE +class RaiNewsIE(RaiBaseIE): _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' _EMBED_REGEX = [rf']+data-src="(?P/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] _TESTS = [{ # new rainews player (#3911) - 'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', + 'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html', 'info_dict': { - 'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf', + 'id': '31e8017c-845c-43f5-9c48-245b43c3a079', 'ext': 'mp4', - 'title': 'Puntata del 29/05/2022', - 'duration': 1589, - 'upload_date': '20220529', + 'title': 'md5:1e81364b09de4a149042bac3c7d36f0b', + 'duration': 196, + 'upload_date': '20240225', 'uploader': 'rainews', + 'formats': 'count:2', }, 'params': {'skip_download': True}, }, { @@ -659,7 +682,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', 'duration': 833, - 'upload_date': '20161103' + 'upload_date': '20161103', + 'formats': 'count:8', }, 'params': {'skip_download': True}, 'expected_warnings': ['unable to extract player_data'], @@ -684,7 +708,7 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE if not relinker_url: # fallback on old implementation for some old content try: - return self._extract_from_content_id(video_id, url) + return RaiIE._real_extract(self, url) except GeoRestrictedError: raise except ExtractorError as e: diff --git a/yt_dlp/extractor/rbmaradio.py b/yt_dlp/extractor/rbmaradio.py deleted file mode 100644 index 86c63dbb7..000000000 --- a/yt_dlp/extractor/rbmaradio.py +++ /dev/null @@ -1,68 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - clean_html, - int_or_none, - unified_timestamp, - update_url_query, -) - - -class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P[^/]+)/episodes/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', - 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', - 'info_dict': { - 'id': 'ford-lopatin-live-at-primavera-sound-2011', - 'ext': 'mp3', - 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2452, - 'timestamp': 1307103164, - 'upload_date': '20110603', - }, - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - show_id = mobj.group('show_id') - episode_id = mobj.group('id') - - webpage = self._download_webpage(url, episode_id) - - episode = self._parse_json( - self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*', - webpage, 'json data'), - episode_id)['episodes'][show_id][episode_id] - - title = episode['title'] - - show_title = episode.get('showTitle') - if show_title: - title = '%s - %s' % (show_title, title) - - formats = [{ - 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), - 'format_id': compat_str(abr), - 'abr': abr, - 'vcodec': 'none', - } for abr in (96, 128, 192, 256)] - self._check_formats(formats, episode_id) - - description = clean_html(episode.get('longTeaser')) - thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) - duration = int_or_none(episode.get('duration')) - timestamp = unified_timestamp(episode.get('publishedAt')) - - return { - 'id': episode_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py index 79d9c8e31..2f50efeda 100644 --- a/yt_dlp/extractor/rcti.py +++ b/yt_dlp/extractor/rcti.py @@ -229,7 +229,7 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE): 'age_limit': 2, 'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'], 'display_id': 'putri-untuk-pangeran', - 'tag': 'count:18', + 'tags': 'count:18', }, }, { # No episodes 'url': 'https://www.rctiplus.com/programs/615/inews-pagi', @@ -239,7 +239,7 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE): 'title': 'iNews Pagi', 'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04', 'age_limit': 2, - 'tag': 'count:11', + 'tags': 'count:11', 'display_id': 'inews-pagi', } }] @@ -327,8 +327,8 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE): 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]), 'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'), expected_type=lambda x: strip_or_none(x) or None), - 'tag': traverse_obj(series_meta, ('tag', ..., 'name'), - expected_type=lambda x: strip_or_none(x) or None), + 'tags': traverse_obj(series_meta, ('tag', ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), } return self.playlist_result( self._series_entries(series_id, display_id, video_type, metadata), series_id, diff --git a/yt_dlp/extractor/rds.py b/yt_dlp/extractor/rds.py index 9a2e0d985..1a1c6634e 100644 --- a/yt_dlp/extractor/rds.py +++ b/yt_dlp/extractor/rds.py @@ -8,6 +8,7 @@ from ..compat import compat_str class RDSIE(InfoExtractor): + _WORKING = False IE_DESC = 'RDS.ca' _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' diff --git a/yt_dlp/extractor/redbee.py b/yt_dlp/extractor/redbee.py index b59b518b1..4d71133b3 100644 --- a/yt_dlp/extractor/redbee.py +++ b/yt_dlp/extractor/redbee.py @@ -134,6 +134,7 @@ class ParliamentLiveUKIE(RedBeeBaseIE): class RTBFIE(RedBeeBaseIE): + _WORKING = False _VALID_URL = r'''(?x) https?://(?:www\.)?rtbf\.be/ (?: diff --git a/yt_dlp/extractor/regiotv.py b/yt_dlp/extractor/regiotv.py deleted file mode 100644 index edb6ae5bc..000000000 --- a/yt_dlp/extractor/regiotv.py +++ /dev/null @@ -1,55 +0,0 @@ -from .common import InfoExtractor -from ..networking import Request -from ..utils import xpath_text, xpath_with_ns - - -class RegioTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://www.regio-tv.de/video/395808.html', - 'info_dict': { - 'id': '395808', - 'ext': 'mp4', - 'title': 'Wir in Ludwigsburg', - 'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!', - } - }, { - 'url': 'http://www.regio-tv.de/video/395808', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - key = self._search_regex( - r'key\s*:\s*(["\'])(?P.+?)\1', webpage, 'key', group='key') - title = self._og_search_title(webpage) - - SOAP_TEMPLATE = '<{0} xmlns="http://v.telvi.de/">{1}' - - request = Request( - 'http://v.telvi.de/', - SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8')) - video_data = self._download_xml(request, video_id, 'Downloading video XML') - - NS_MAP = { - 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', - 'soap': 'http://schemas.xmlsoap.org/soap/envelope/', - } - - video_url = xpath_text( - video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True) - thumbnail = xpath_text( - video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail') - description = self._og_search_description( - webpage) or self._html_search_meta('description', webpage) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/rentv.py b/yt_dlp/extractor/rentv.py index fdde31704..abb537cf3 100644 --- a/yt_dlp/extractor/rentv.py +++ b/yt_dlp/extractor/rentv.py @@ -8,6 +8,7 @@ from ..utils import ( class RENTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P\d+)' _TESTS = [{ 'url': 'http://ren.tv/video/epizod/118577', @@ -59,6 +60,7 @@ class RENTVIE(InfoExtractor): class RENTVArticleIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', diff --git a/yt_dlp/extractor/restudy.py b/yt_dlp/extractor/restudy.py index 6d032564d..f49262a65 100644 --- a/yt_dlp/extractor/restudy.py +++ b/yt_dlp/extractor/restudy.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class RestudyIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.restudy.dk/video/play/id/1637', diff --git a/yt_dlp/extractor/reuters.py b/yt_dlp/extractor/reuters.py index 6919425f3..0a8f13b9f 100644 --- a/yt_dlp/extractor/reuters.py +++ b/yt_dlp/extractor/reuters.py @@ -9,6 +9,7 @@ from ..utils import ( class ReutersIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P[0-9]+)' _TEST = { 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', diff --git a/yt_dlp/extractor/ridehome.py b/yt_dlp/extractor/ridehome.py new file mode 100644 index 000000000..78f838ac1 --- /dev/null +++ b/yt_dlp/extractor/ridehome.py @@ -0,0 +1,96 @@ +from .art19 import Art19IE +from .common import InfoExtractor +from ..utils import extract_attributes, get_elements_html_by_class +from ..utils.traversal import traverse_obj + + +class RideHomeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ridehome\.info/show/[\w-]+/(?P[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.ridehome.info/show/techmeme-ride-home/thu-1228-will-2024-be-the-year-apple-gets-serious-about-gaming-on-macs/', + 'info_dict': { + 'id': 'thu-1228-will-2024-be-the-year-apple-gets-serious-about-gaming-on-macs', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'c84ea3cc96950a9ab86fe540f3edc588', + 'info_dict': { + 'id': '540e5493-9fe6-4c14-a488-dc508d8794b2', + 'ext': 'mp3', + 'title': 'Thu. 12/28 – Will 2024 Be The Year Apple Gets Serious About Gaming On Macs?', + 'description': 'md5:9dba86ae9b5047a8150eceddeeb629c2', + 'series': 'Techmeme Ride Home', + 'series_id': '3c30e8f4-ab48-415b-9421-1ae06cd4058b', + 'upload_date': '20231228', + 'timestamp': 1703780995, + 'modified_date': '20231230', + 'episode_id': '540e5493-9fe6-4c14-a488-dc508d8794b2', + 'modified_timestamp': 1703912404, + 'release_date': '20231228', + 'release_timestamp': 1703782800, + 'duration': 1000.1502, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$', + }, + }], + }, { + 'url': 'https://www.ridehome.info/show/techmeme-ride-home/portfolio-profile-sensel-with-ilyarosenberg/', + 'info_dict': { + 'id': 'portfolio-profile-sensel-with-ilyarosenberg', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'bf9d6efad221008ce71aea09d5533cf6', + 'info_dict': { + 'id': '6beed803-b1ef-4536-9fef-c23cf6b4dcac', + 'ext': 'mp3', + 'title': '(Portfolio Profile) Sensel - With @IlyaRosenberg', + 'description': 'md5:e1e4a970bce04290e0ba6f030b0125db', + 'series': 'Techmeme Ride Home', + 'series_id': '3c30e8f4-ab48-415b-9421-1ae06cd4058b', + 'upload_date': '20220108', + 'timestamp': 1641656064, + 'modified_date': '20230418', + 'episode_id': '6beed803-b1ef-4536-9fef-c23cf6b4dcac', + 'modified_timestamp': 1681843318, + 'release_date': '20220108', + 'release_timestamp': 1641672000, + 'duration': 2789.38122, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$' + }, + }], + }, { + 'url': 'https://www.ridehome.info/show/spacecasts/big-tech-news-apples-macbook-pro-event/', + 'info_dict': { + 'id': 'big-tech-news-apples-macbook-pro-event', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'b1428530c6e03904a8271e978007fc05', + 'info_dict': { + 'id': 'f4780044-6c4b-4ce0-8215-8a86cc66bff7', + 'ext': 'mp3', + 'title': 'md5:e6c05d44d59b6577a4145ac339de5040', + 'description': 'md5:14152f7228c8a301a77e3d6bc891b145', + 'series': 'SpaceCasts', + 'series_id': '8e3e837d-7fe0-4a23-8e11-894917e07e17', + 'upload_date': '20211026', + 'timestamp': 1635271450, + 'modified_date': '20230502', + 'episode_id': 'f4780044-6c4b-4ce0-8215-8a86cc66bff7', + 'modified_timestamp': 1683057500, + 'release_date': '20211026', + 'release_timestamp': 1635272124, + 'duration': 2266.30531, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$' + }, + }], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + urls = traverse_obj( + get_elements_html_by_class('iframeContainer', webpage), + (..., {extract_attributes}, lambda k, v: k == 'data-src' and Art19IE.suitable(v))) + return self.playlist_from_matches(urls, article_id, ie=Art19IE) diff --git a/yt_dlp/extractor/rockstargames.py b/yt_dlp/extractor/rockstargames.py index c491aaf53..b0b92e642 100644 --- a/yt_dlp/extractor/rockstargames.py +++ b/yt_dlp/extractor/rockstargames.py @@ -6,6 +6,7 @@ from ..utils import ( class RockstarGamesIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P\d+)' _TESTS = [{ 'url': 'https://www.rockstargames.com/videos/video/11544/', diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index cad76f0c9..5099f3ae4 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -38,7 +38,7 @@ class RokfinIE(InfoExtractor): 'upload_date': '20211023', 'timestamp': 1634998029, 'channel': 'Jimmy Dore', - 'channel_id': 65429, + 'channel_id': '65429', 'channel_url': 'https://rokfin.com/TheJimmyDoreShow', 'availability': 'public', 'live_status': 'not_live', @@ -56,7 +56,7 @@ class RokfinIE(InfoExtractor): 'upload_date': '20190412', 'timestamp': 1555052644, 'channel': 'Ron Placone', - 'channel_id': 10, + 'channel_id': '10', 'channel_url': 'https://rokfin.com/RonPlacone', 'availability': 'public', 'live_status': 'not_live', @@ -73,7 +73,7 @@ class RokfinIE(InfoExtractor): 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', 'description': 'md5:324ce2d3e3b62e659506409e458b9d8e', 'channel': 'TLAVagabond', - 'channel_id': 53856, + 'channel_id': '53856', 'channel_url': 'https://rokfin.com/TLAVagabond', 'availability': 'public', 'is_live': False, @@ -86,7 +86,6 @@ class RokfinIE(InfoExtractor): 'dislike_count': int, 'like_count': int, 'tags': ['FreeThinkingMedia^'], - 'duration': None, } }, { 'url': 'https://rokfin.com/post/126703/Brave-New-World--Aldous-Huxley-DEEPDIVE--Chpts-13--Quite-Frankly--Jay-Dyer', @@ -96,7 +95,7 @@ class RokfinIE(InfoExtractor): 'title': 'Brave New World - Aldous Huxley DEEPDIVE! (Chpts 1-3) - Quite Frankly & Jay Dyer', 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', 'channel': 'Jay Dyer', - 'channel_id': 186881, + 'channel_id': '186881', 'channel_url': 'https://rokfin.com/jaydyer', 'availability': 'premium_only', 'live_status': 'not_live', @@ -116,7 +115,7 @@ class RokfinIE(InfoExtractor): 'title': 'The Grayzone live on Nordstream blame game', 'thumbnail': r're:https://image\.v\.rokfin\.com/.+', 'channel': 'Max Blumenthal', - 'channel_id': 248902, + 'channel_id': '248902', 'channel_url': 'https://rokfin.com/MaxBlumenthal', 'availability': 'premium_only', 'live_status': 'was_live', @@ -174,7 +173,7 @@ class RokfinIE(InfoExtractor): 'like_count': int_or_none(metadata.get('likeCount')), 'dislike_count': int_or_none(metadata.get('dislikeCount')), 'channel': str_or_none(traverse_obj(metadata, ('createdBy', 'name'), ('creator', 'name'))), - 'channel_id': traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id')), + 'channel_id': str_or_none(traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id'))), 'channel_url': url_or_none(f'https://rokfin.com/{uploader}') if uploader else None, 'timestamp': timestamp, 'release_timestamp': timestamp if live_status != 'not_live' else None, diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py index 94e673b13..e19a85d06 100644 --- a/yt_dlp/extractor/roosterteeth.py +++ b/yt_dlp/extractor/roosterteeth.py @@ -2,16 +2,17 @@ from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, + LazyList, int_or_none, join_nonempty, - LazyList, + parse_iso8601, parse_qs, str_or_none, traverse_obj, + update_url_query, url_or_none, urlencode_postdata, urljoin, - update_url_query, ) @@ -70,6 +71,7 @@ class RoosterTeethBaseIE(InfoExtractor): 'episode_id': str_or_none(data.get('uuid')), 'channel_id': attributes.get('channel_id'), 'duration': int_or_none(attributes.get('length')), + 'release_timestamp': parse_iso8601(attributes.get('original_air_date')), 'thumbnails': thumbnails, 'availability': self._availability( needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only, @@ -91,6 +93,17 @@ class RoosterTeethIE(RoosterTeethBaseIE): 'thumbnail': r're:^https?://.*\.png$', 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... The Game Announcement', + 'tags': ['Game Show', 'Sketch'], + 'season_number': 2, + 'availability': 'public', + 'episode_number': 10, + 'episode_id': '00374575-464e-11e7-a302-065410f210c4', + 'season': 'Season 2', + 'season_id': 'ffa27d48-464d-11e7-a302-065410f210c4', + 'channel_id': '92b6bb21-91d2-4b1b-bf95-3268fa0d9939', + 'duration': 145, + 'release_timestamp': 1462982400, + 'release_date': '20160511', }, 'params': {'skip_download': True}, }, { @@ -104,6 +117,42 @@ class RoosterTeethIE(RoosterTeethBaseIE): 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', 'ext': 'mp4', + 'availability': 'public', + 'episode_id': 'f8117b13-f068-499e-803e-eec9ea2dec8c', + 'episode_number': 3, + 'tags': ['Animation'], + 'season_id': '4b8f0a9e-12c4-41ed-8caa-fed15a85bab8', + 'season': 'Season 1', + 'series': 'RWBY: World of Remnant', + 'season_number': 1, + 'duration': 216, + 'release_timestamp': 1413489600, + 'release_date': '20141016', + }, + 'params': {'skip_download': True}, + }, { + # only works with video_data['attributes']['url'] m3u8 url + 'url': 'https://www.roosterteeth.com/watch/achievement-hunter-achievement-hunter-fatality-walkthrough-deathstroke-lex-luthor-captain-marvel-green-lantern-and-wonder-woman', + 'info_dict': { + 'id': '25394', + 'ext': 'mp4', + 'title': 'Fatality Walkthrough: Deathstroke, Lex Luthor, Captain Marvel, Green Lantern, and Wonder Woman', + 'description': 'md5:91bb934698344fb9647b1c7351f16964', + 'availability': 'public', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'episode': 'Fatality Walkthrough: Deathstroke, Lex Luthor, Captain Marvel, Green Lantern, and Wonder Woman', + 'episode_number': 71, + 'episode_id': 'ffaec998-464d-11e7-a302-065410f210c4', + 'season': 'Season 2008', + 'tags': ['Gaming'], + 'series': 'Achievement Hunter', + 'display_id': 'md5:4465ce4f001735f9d7a2ae529a543d31', + 'season_id': 'ffa13340-464d-11e7-a302-065410f210c4', + 'season_number': 2008, + 'channel_id': '2cb2a70c-be50-46f5-93d7-84a1baabb4f7', + 'duration': 189, + 'release_timestamp': 1228317300, + 'release_date': '20081203', }, 'params': {'skip_download': True}, }, { @@ -133,10 +182,10 @@ class RoosterTeethIE(RoosterTeethBaseIE): try: video_data = self._download_json( - api_episode_url + '/videos', display_id, - 'Downloading video JSON metadata')['data'][0] + api_episode_url + '/videos', display_id, 'Downloading video JSON metadata', + headers={'Client-Type': 'web'})['data'][0] # web client-type yields ad-free streams m3u8_url = video_data['attributes']['url'] - # XXX: additional URL at video_data['links']['download'] + # XXX: additional ad-free URL at video_data['links']['download'] but often gives 403 errors except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: if self._parse_json(e.cause.response.read().decode(), display_id).get('access') is False: diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py index 63134322d..411a62519 100644 --- a/yt_dlp/extractor/rozhlas.py +++ b/yt_dlp/extractor/rozhlas.py @@ -247,17 +247,17 @@ class MujRozhlasIE(RozhlasBaseIE): 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', 'md5': '6f8fd68663e64936623e67c152a669e0', 'info_dict': { - 'id': '10739193', + 'id': '10787730', 'ext': 'mp3', 'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli', 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', 'timestamp': 1684915200, - 'modified_timestamp': 1684922446, + 'modified_timestamp': 1687550432, 'series': 'Vykopávky', 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', 'channel_id': 'radio-wave', 'upload_date': '20230524', - 'modified_date': '20230524', + 'modified_date': '20230623', }, }, { # serial extraction @@ -277,6 +277,26 @@ class MujRozhlasIE(RozhlasBaseIE): 'title': 'Nespavci', 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', }, + }, { + # serialPart + 'url': 'https://www.mujrozhlas.cz/povidka/gustavo-adolfo-becquer-hora-duchu', + 'info_dict': { + 'id': '8889035', + 'ext': 'm4a', + 'title': 'Gustavo Adolfo Bécquer: Hora duchů', + 'description': 'md5:343a15257b376c276e210b78e900ffea', + 'chapter': 'Hora duchů a Polibek – dva tajemné příběhy Gustava Adolfa Bécquera', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/2adfe1387fb140634be725c1ccf26214.jpg', + 'timestamp': 1708173000, + 'episode': 'Episode 1', + 'episode_number': 1, + 'series': 'Povídka', + 'modified_date': '20240217', + 'upload_date': '20240217', + 'modified_timestamp': 1708173198, + 'channel_id': 'vltava', + }, + 'params': {'skip_download': 'dash'}, }] def _call_api(self, path, item_id, msg='API JSON'): @@ -322,7 +342,7 @@ class MujRozhlasIE(RozhlasBaseIE): entity = info['siteEntityBundle'] - if entity == 'episode': + if entity in ('episode', 'serialPart'): return self._extract_audio_entry(self._call_api( 'episodes', info['contentId'], 'episode info API JSON')) diff --git a/yt_dlp/extractor/rts.py b/yt_dlp/extractor/rts.py index 9f73d1811..bce5cba82 100644 --- a/yt_dlp/extractor/rts.py +++ b/yt_dlp/extractor/rts.py @@ -13,6 +13,7 @@ from ..utils import ( class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_DESC = 'RTS.ch' _VALID_URL = r'rts:(?P\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html' diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 85ad7e2ff..11095b262 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -9,7 +9,6 @@ from ..utils import ( get_element_html_by_class, get_elements_by_class, int_or_none, - join_nonempty, parse_count, parse_duration, unescapeHTML, @@ -57,7 +56,7 @@ class Rule34VideoIE(InfoExtractor): 'comment_count': int, 'timestamp': 1640131200, 'description': '', - 'creator': 'WildeerStudio', + 'creators': ['WildeerStudio'], 'upload_date': '20211222', 'uploader': 'CerZule', 'uploader_url': 'https://rule34video.com/members/36281/', @@ -81,13 +80,13 @@ class Rule34VideoIE(InfoExtractor): 'quality': quality, }) - categories, creator, uploader, uploader_url = [None] * 4 + categories, creators, uploader, uploader_url = [None] * 4 for col in get_elements_by_class('col', webpage): label = clean_html(get_element_by_class('label', col)) if label == 'Categories:': categories = list(map(clean_html, get_elements_by_class('item', col))) elif label == 'Artist:': - creator = join_nonempty(*map(clean_html, get_elements_by_class('item', col)), delim=', ') + creators = list(map(clean_html, get_elements_by_class('item', col))) elif label == 'Uploaded By:': uploader = clean_html(get_element_by_class('name', col)) uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href') @@ -115,7 +114,7 @@ class Rule34VideoIE(InfoExtractor): 'comment_count': int_or_none(self._search_regex( r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)), 'age_limit': 18, - 'creator': creator, + 'creators': creators, 'uploader': uploader, 'uploader_url': uploader_url, 'categories': categories, diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 1dc049ac8..837a324e6 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -90,7 +90,6 @@ class RumbleEmbedIE(InfoExtractor): 'channel_url': 'https://rumble.com/c/LofiGirl', 'channel': 'Lofi Girl', 'thumbnail': r're:https://.+\.jpg', - 'duration': None, 'uploader': 'Lofi Girl', 'live_status': 'is_live', }, diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index 08d9b9257..287824d08 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -46,7 +46,7 @@ class RutubeBaseIE(InfoExtractor): 'uploader': try_get(video, lambda x: x['author']['name']), 'uploader_id': compat_str(uploader_id) if uploader_id else None, 'timestamp': unified_timestamp(video.get('created_ts')), - 'category': [category] if category else None, + 'categories': [category] if category else None, 'age_limit': age_limit, 'view_count': int_or_none(video.get('hits')), 'comment_count': int_or_none(video.get('comments_count')), @@ -112,7 +112,7 @@ class RutubeIE(RutubeBaseIE): 'age_limit': 0, 'view_count': int, 'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', - 'category': ['Новости и СМИ'], + 'categories': ['Новости и СМИ'], 'chapters': [], }, 'expected_warnings': ['Unable to download f4m'], @@ -144,7 +144,7 @@ class RutubeIE(RutubeBaseIE): 'age_limit': 0, 'view_count': int, 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', - 'category': ['Видеоигры'], + 'categories': ['Видеоигры'], 'chapters': [], }, 'expected_warnings': ['Unable to download f4m'], @@ -154,7 +154,7 @@ class RutubeIE(RutubeBaseIE): 'id': 'c65b465ad0c98c89f3b25cb03dcc87c6', 'ext': 'mp4', 'chapters': 'count:4', - 'category': ['Бизнес и предпринимательство'], + 'categories': ['Бизнес и предпринимательство'], 'description': 'md5:252feac1305257d8c1bab215cedde75d', 'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', 'duration': 782, diff --git a/yt_dlp/extractor/saitosan.py b/yt_dlp/extractor/saitosan.py index d2f60e92f..a5f05e1d0 100644 --- a/yt_dlp/extractor/saitosan.py +++ b/yt_dlp/extractor/saitosan.py @@ -3,6 +3,7 @@ from ..utils import ExtractorError, try_get class SaitosanIE(InfoExtractor): + _WORKING = False IE_NAME = 'Saitosan' _VALID_URL = r'https?://(?:www\.)?saitosan\.net/bview.html\?id=(?P[0-9]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/savefrom.py b/yt_dlp/extractor/savefrom.py deleted file mode 100644 index 9c9e74b6d..000000000 --- a/yt_dlp/extractor/savefrom.py +++ /dev/null @@ -1,30 +0,0 @@ -import os.path - -from .common import InfoExtractor - - -class SaveFromIE(InfoExtractor): - IE_NAME = 'savefrom.net' - _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P.*)$' - - _TEST = { - 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', - 'info_dict': { - 'id': 'UlVRAPW2WJY', - 'ext': 'mp4', - 'title': 'About Team Radical MMA | MMA Fighting', - 'upload_date': '20120816', - 'uploader': 'Howcast', - 'uploader_id': 'Howcast', - 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*', - }, - 'params': { - 'skip_download': True - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = os.path.splitext(url.split('/')[-1])[0] - - return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 7a9115047..8d61e22fc 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -44,8 +44,6 @@ class SBSIE(InfoExtractor): 'timestamp': 1408613220, 'upload_date': '20140821', 'uploader': 'SBSC', - 'tags': None, - 'categories': None, }, 'expected_warnings': ['Unable to download JSON metadata'], }, { diff --git a/yt_dlp/extractor/seeker.py b/yt_dlp/extractor/seeker.py deleted file mode 100644 index 65eb16a09..000000000 --- a/yt_dlp/extractor/seeker.py +++ /dev/null @@ -1,55 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - get_element_by_class, - strip_or_none, -) - - -class SeekerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' - _TESTS = [{ - 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', - 'md5': '897d44bbe0d8986a2ead96de565a92db', - 'info_dict': { - 'id': 'Elrn3gnY', - 'ext': 'mp4', - 'title': 'Should Trump Be Required To Release His Tax Returns?', - 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', - 'timestamp': 1490090165, - 'upload_date': '20170321', - } - }, { - 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', - 'playlist': [ - { - 'md5': '0497b9f20495174be73ae136949707d2', - 'info_dict': { - 'id': 'FihYQ8AE', - 'ext': 'mp4', - 'title': 'The Pros & Cons Of Zoos', - 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', - 'timestamp': 1490039133, - 'upload_date': '20170320', - }, - } - ], - 'info_dict': { - 'id': '1834116536', - 'title': 'After Gorilla Killing, Changes Ahead for Zoos', - 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', - }, - }] - - def _real_extract(self, url): - display_id, article_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, display_id) - entries = [] - for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): - entries.append(self.url_result( - 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) - return self.playlist_result( - entries, article_id, - self._og_search_title(webpage), - strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) diff --git a/yt_dlp/extractor/senalcolombia.py b/yt_dlp/extractor/senalcolombia.py index f3c066da7..b2f354fae 100644 --- a/yt_dlp/extractor/senalcolombia.py +++ b/yt_dlp/extractor/senalcolombia.py @@ -3,6 +3,7 @@ from .rtvcplay import RTVCKalturaIE class SenalColombiaLiveIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?Psenal-en-vivo)' _TESTS = [{ diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py index 3600e2e74..1ecea71fc 100644 --- a/yt_dlp/extractor/sendtonews.py +++ b/yt_dlp/extractor/sendtonews.py @@ -12,6 +12,7 @@ from ..utils import ( class SendtoNewsIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' _TEST = { diff --git a/yt_dlp/extractor/sexu.py b/yt_dlp/extractor/sexu.py index 3117f81e3..989b63c72 100644 --- a/yt_dlp/extractor/sexu.py +++ b/yt_dlp/extractor/sexu.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class SexuIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P\d+)' _TEST = { 'url': 'http://sexu.com/961791/', diff --git a/yt_dlp/extractor/skeb.py b/yt_dlp/extractor/skeb.py index e02f8cef0..54dfdc441 100644 --- a/yt_dlp/extractor/skeb.py +++ b/yt_dlp/extractor/skeb.py @@ -10,7 +10,7 @@ class SkebIE(InfoExtractor): 'info_dict': { 'id': '466853', 'title': '内容はおまかせします! by 姫ノ森りぃる@一周年', - 'descripion': 'md5:1ec50901efc3437cfbfe3790468d532d', + 'description': 'md5:1ec50901efc3437cfbfe3790468d532d', 'uploader': '姫ノ森りぃる@一周年', 'uploader_id': 'riiru_wm', 'age_limit': 0, @@ -34,7 +34,7 @@ class SkebIE(InfoExtractor): 'info_dict': { 'id': '489408', 'title': 'いつもお世話になってお... by 古川ノブ@音楽とVlo...', - 'descripion': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2', + 'description': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2', 'uploader': '古川ノブ@音楽とVlogのVtuber', 'uploader_id': 'furukawa_nob', 'age_limit': 0, @@ -61,12 +61,12 @@ class SkebIE(InfoExtractor): 'info_dict': { 'id': '6', 'title': 'ヒロ。\n\n私のキャラク... by 諸々', - 'descripion': 'md5:aa6cbf2ba320b50bce219632de195f07', + 'description': 'md5:aa6cbf2ba320b50bce219632de195f07', '_type': 'playlist', 'entries': [{ 'id': '486430', 'title': 'ヒロ。\n\n私のキャラク... by 諸々', - 'descripion': 'md5:aa6cbf2ba320b50bce219632de195f07', + 'description': 'md5:aa6cbf2ba320b50bce219632de195f07', }, { 'id': '486431', 'title': 'ヒロ。\n\n私のキャラク... by 諸々', @@ -81,7 +81,7 @@ class SkebIE(InfoExtractor): parent = { 'id': video_id, 'title': nuxt_data.get('title'), - 'descripion': nuxt_data.get('description'), + 'description': nuxt_data.get('description'), 'uploader': traverse_obj(nuxt_data, ('creator', 'name')), 'uploader_id': traverse_obj(nuxt_data, ('creator', 'screen_name')), 'age_limit': 18 if nuxt_data.get('nsfw') else 0, diff --git a/yt_dlp/extractor/skylinewebcams.py b/yt_dlp/extractor/skylinewebcams.py index 4292bb2ae..197407c18 100644 --- a/yt_dlp/extractor/skylinewebcams.py +++ b/yt_dlp/extractor/skylinewebcams.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class SkylineWebcamsIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P[^/]+)\.html' _TEST = { 'url': 'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html', diff --git a/yt_dlp/extractor/skynewsarabia.py b/yt_dlp/extractor/skynewsarabia.py index 6264b04bb..867782778 100644 --- a/yt_dlp/extractor/skynewsarabia.py +++ b/yt_dlp/extractor/skynewsarabia.py @@ -38,6 +38,7 @@ class SkyNewsArabiaBaseIE(InfoExtractor): class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): + _WORKING = False IE_NAME = 'skynewsarabia:video' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P[0-9]+)' _TEST = { @@ -64,6 +65,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): + _WORKING = False IE_NAME = 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P[0-9]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/stageplus.py b/yt_dlp/extractor/stageplus.py index 4bed4d646..77e4362fc 100644 --- a/yt_dlp/extractor/stageplus.py +++ b/yt_dlp/extractor/stageplus.py @@ -21,7 +21,7 @@ class StagePlusVODConcertIE(InfoExtractor): 'id': 'vod_concert_APNM8GRFDPHMASJKBSPJACG', 'title': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', 'description': 'md5:50f78ec180518c9bdb876bac550996fc', - 'artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'artists': ['Yuja Wang', 'Lorenzo Viotti'], 'upload_date': '20230331', 'timestamp': 1680249600, 'release_date': '20210709', @@ -40,10 +40,10 @@ class StagePlusVODConcertIE(InfoExtractor): 'release_timestamp': 1625788800, 'duration': 2207, 'chapters': 'count:5', - 'artist': ['Yuja Wang'], - 'composer': ['Sergei Rachmaninoff'], + 'artists': ['Yuja Wang'], + 'composers': ['Sergei Rachmaninoff'], 'album': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', - 'album_artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'album_artists': ['Yuja Wang', 'Lorenzo Viotti'], 'track': 'Piano Concerto No. 2 in C Minor, Op. 18', 'track_number': 1, 'genre': 'Instrumental Concerto', @@ -474,7 +474,7 @@ fragment BannerFields on Banner { metadata = traverse_obj(data, { 'title': 'title', 'description': ('shortDescription', {str}), - 'artist': ('artists', 'edges', ..., 'node', 'name'), + 'artists': ('artists', 'edges', ..., 'node', 'name'), 'timestamp': ('archiveReleaseDate', {unified_timestamp}), 'release_timestamp': ('productionDate', {unified_timestamp}), }) @@ -494,7 +494,7 @@ fragment BannerFields on Banner { 'formats': formats, 'subtitles': subtitles, 'album': metadata.get('title'), - 'album_artist': metadata.get('artist'), + 'album_artists': metadata.get('artist'), 'track_number': idx, **metadata, **traverse_obj(video, { @@ -506,8 +506,8 @@ fragment BannerFields on Banner { 'title': 'title', 'start_time': ('mark', {float_or_none}), }), - 'artist': ('artists', 'edges', ..., 'node', 'name'), - 'composer': ('work', 'composers', ..., 'name'), + 'artists': ('artists', 'edges', ..., 'node', 'name'), + 'composers': ('work', 'composers', ..., 'name'), 'genre': ('work', 'genre', 'title'), }), }) diff --git a/yt_dlp/extractor/startrek.py b/yt_dlp/extractor/startrek.py index e92122f9b..94efb589c 100644 --- a/yt_dlp/extractor/startrek.py +++ b/yt_dlp/extractor/startrek.py @@ -3,6 +3,7 @@ from ..utils import int_or_none, urljoin class StarTrekIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?Phttps?://(?:intl|www)\.startrek\.com)/videos/(?P[^/]+)' _TESTS = [{ 'url': 'https://intl.startrek.com/videos/watch-welcoming-jess-bush-to-the-ready-room', diff --git a/yt_dlp/extractor/steam.py b/yt_dlp/extractor/steam.py index 7daee2fe0..63da9662a 100644 --- a/yt_dlp/extractor/steam.py +++ b/yt_dlp/extractor/steam.py @@ -2,9 +2,10 @@ import re from .common import InfoExtractor from ..utils import ( - extract_attributes, ExtractorError, + extract_attributes, get_element_by_class, + str_or_none, ) @@ -30,7 +31,6 @@ class SteamIE(InfoExtractor): 'ext': 'mp4', 'title': 'Terraria video 256785003', 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', - 'n_entries': 2, } }, { @@ -39,9 +39,7 @@ class SteamIE(InfoExtractor): 'id': '2040428', 'ext': 'mp4', 'title': 'Terraria video 2040428', - 'playlist_index': 2, 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', - 'n_entries': 2, } } ], @@ -55,12 +53,10 @@ class SteamIE(InfoExtractor): }, { 'url': 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/', 'info_dict': { - 'id': '256757115', - 'title': 'Grand Theft Auto V video 256757115', - 'ext': 'mp4', - 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', - 'n_entries': 20, + 'id': '271590', + 'title': 'Grand Theft Auto V', }, + 'playlist_count': 23, }] def _real_extract(self, url): @@ -136,7 +132,7 @@ class SteamCommunityBroadcastIE(InfoExtractor): 'id': '76561199073851486', 'title': r're:Steam Community :: pepperm!nt :: Broadcast 2022-06-26 \d{2}:\d{2}', 'ext': 'mp4', - 'uploader_id': 1113585758, + 'uploader_id': '1113585758', 'uploader': 'pepperm!nt', 'live_status': 'is_live', }, @@ -169,6 +165,6 @@ class SteamCommunityBroadcastIE(InfoExtractor): 'live_status': 'is_live', 'view_count': json_data.get('num_view'), 'uploader': uploader_json.get('persona_name'), - 'uploader_id': uploader_json.get('accountid'), + 'uploader_id': str_or_none(uploader_json.get('accountid')), 'subtitles': subs, } diff --git a/yt_dlp/extractor/streamff.py b/yt_dlp/extractor/streamff.py deleted file mode 100644 index 93c42942c..000000000 --- a/yt_dlp/extractor/streamff.py +++ /dev/null @@ -1,30 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none, parse_iso8601 - - -class StreamFFIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamff\.com/v/(?P[a-zA-Z0-9]+)' - - _TESTS = [{ - 'url': 'https://streamff.com/v/55cc94', - 'md5': '8745a67bb5e5c570738efe7983826370', - 'info_dict': { - 'id': '55cc94', - 'ext': 'mp4', - 'title': '55cc94', - 'timestamp': 1634764643, - 'upload_date': '20211020', - 'view_count': int, - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json(f'https://streamff.com/api/videos/{video_id}', video_id) - return { - 'id': video_id, - 'title': json_data.get('name') or video_id, - 'url': 'https://streamff.com/%s' % json_data['videoLink'], - 'view_count': int_or_none(json_data.get('views')), - 'timestamp': parse_iso8601(json_data.get('date')), - } diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py index 6e216a2a5..aeaff28f2 100644 --- a/yt_dlp/extractor/swearnet.py +++ b/yt_dlp/extractor/swearnet.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj +from ..utils import ExtractorError, int_or_none, traverse_obj class SwearnetEpisodeIE(InfoExtractor): @@ -51,7 +51,13 @@ class SwearnetEpisodeIE(InfoExtractor): display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') webpage = self._download_webpage(url, display_id) - external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') + try: + external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') + except ExtractorError: + if 'Upgrade Now' in webpage: + self.raise_login_required() + raise + json_data = self._download_json( f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0] diff --git a/yt_dlp/extractor/syfy.py b/yt_dlp/extractor/syfy.py index afcdbf780..bd2d73842 100644 --- a/yt_dlp/extractor/syfy.py +++ b/yt_dlp/extractor/syfy.py @@ -6,6 +6,7 @@ from ..utils import ( class SyfyIE(AdobePassIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index e23b490b0..c69c13d0b 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -12,6 +12,7 @@ from ..utils import ( class TagesschauIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?(?P[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ diff --git a/yt_dlp/extractor/tass.py b/yt_dlp/extractor/tass.py index 67e544a6a..d4c5b41a7 100644 --- a/yt_dlp/extractor/tass.py +++ b/yt_dlp/extractor/tass.py @@ -8,6 +8,7 @@ from ..utils import ( class TassIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:tass\.ru|itar-tass\.com)/[^/]+/(?P\d+)' _TESTS = [ { diff --git a/yt_dlp/extractor/tdslifeway.py b/yt_dlp/extractor/tdslifeway.py deleted file mode 100644 index 3623a68c8..000000000 --- a/yt_dlp/extractor/tdslifeway.py +++ /dev/null @@ -1,31 +0,0 @@ -from .common import InfoExtractor - - -class TDSLifewayIE(InfoExtractor): - _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P\d+)/index\.html' - - _TEST = { - # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers - 'url': 'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb®istration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F', - 'info_dict': { - 'id': '3453494717001', - 'ext': 'mp4', - 'title': 'The Gospel by Numbers', - 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20140410', - 'description': 'Coming soon from T4G 2014!', - 'uploader_id': '2034960640001', - 'timestamp': 1397145591, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['BrightcoveNew'], - } - - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - brightcove_id = self._match_id(url) - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py index 01906bda9..5eac9aa3f 100644 --- a/yt_dlp/extractor/teachable.py +++ b/yt_dlp/extractor/teachable.py @@ -99,6 +99,7 @@ class TeachableBaseIE(InfoExtractor): class TeachableIE(TeachableBaseIE): + _WORKING = False _VALID_URL = r'''(?x) (?: %shttps?://(?P[^/]+)| diff --git a/yt_dlp/extractor/teachertube.py b/yt_dlp/extractor/teachertube.py index c3eec2784..90a976297 100644 --- a/yt_dlp/extractor/teachertube.py +++ b/yt_dlp/extractor/teachertube.py @@ -9,6 +9,7 @@ from ..utils import ( class TeacherTubeIE(InfoExtractor): + _WORKING = False IE_NAME = 'teachertube' IE_DESC = 'teachertube.com videos' @@ -87,6 +88,7 @@ class TeacherTubeIE(InfoExtractor): class TeacherTubeUserIE(InfoExtractor): + _WORKING = False IE_NAME = 'teachertube:user:collection' IE_DESC = 'teachertube.com user and collection videos' diff --git a/yt_dlp/extractor/teachingchannel.py b/yt_dlp/extractor/teachingchannel.py index 275f6d1f9..5791292a9 100644 --- a/yt_dlp/extractor/teachingchannel.py +++ b/yt_dlp/extractor/teachingchannel.py @@ -2,6 +2,7 @@ from .common import InfoExtractor class TeachingChannelIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos?/(?P[^/?&#]+)' _TEST = { diff --git a/yt_dlp/extractor/tele5.py b/yt_dlp/extractor/tele5.py index 9260db2b4..72f67e402 100644 --- a/yt_dlp/extractor/tele5.py +++ b/yt_dlp/extractor/tele5.py @@ -7,6 +7,7 @@ from ..utils import ( class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE + _WORKING = False _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ diff --git a/yt_dlp/extractor/telemb.py b/yt_dlp/extractor/telemb.py index 3d29dace3..a71b14c27 100644 --- a/yt_dlp/extractor/telemb.py +++ b/yt_dlp/extractor/telemb.py @@ -5,6 +5,7 @@ from ..utils import remove_start class TeleMBIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P.+?)_d_(?P\d+)\.html' _TESTS = [ { diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py index 54e74a6c0..84b24dead 100644 --- a/yt_dlp/extractor/telemundo.py +++ b/yt_dlp/extractor/telemundo.py @@ -4,7 +4,7 @@ from ..utils import try_get, unified_timestamp class TelemundoIE(InfoExtractor): - + _WORKING = False _VALID_URL = r'https?:\/\/(?:www\.)?telemundo\.com\/.+?video\/[^\/]+(?Ptmvo\d{7})' _TESTS = [{ 'url': 'https://www.telemundo.com/noticias/noticias-telemundo-en-la-noche/empleo/video/esta-aplicacion-gratuita-esta-ayudando-los-latinos-encontrar-trabajo-en-estados-unidos-tmvo9829325', diff --git a/yt_dlp/extractor/teletask.py b/yt_dlp/extractor/teletask.py index a73dd68fb..fd831f580 100644 --- a/yt_dlp/extractor/teletask.py +++ b/yt_dlp/extractor/teletask.py @@ -5,6 +5,7 @@ from ..utils import unified_strdate class TeleTaskIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P[0-9]+)' _TEST = { 'url': 'http://www.tele-task.de/archive/video/html5/26168/', diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index 7ce7cbf84..a98275d86 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -20,7 +20,8 @@ class TenPlayIE(InfoExtractor): 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', 'duration': 186, - 'season': 39, + 'season': 'Season 39', + 'season_number': 39, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', @@ -108,7 +109,7 @@ class TenPlayIE(InfoExtractor): 'description': data.get('description'), 'age_limit': self._AUS_AGES.get(data.get('classification')), 'series': data.get('tvShow'), - 'season': int_or_none(data.get('season')), + 'season_number': int_or_none(data.get('season')), 'episode_number': int_or_none(data.get('episode')), 'timestamp': data.get('published'), 'thumbnail': data.get('imageUrl'), diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index f26972cff..aa8356796 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -6,7 +6,7 @@ import string import time from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse +from ..compat import compat_urllib_parse_urlparse from ..networking import HEADRequest from ..utils import ( ExtractorError, @@ -15,7 +15,6 @@ from ..utils import ( UserNotLive, determine_ext, format_field, - get_first, int_or_none, join_nonempty, merge_dicts, @@ -51,7 +50,13 @@ class TikTokBaseIE(InfoExtractor): def _get_sigi_state(self, webpage, display_id): return self._search_json( r']+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage, - 'sigi state', display_id, end_pattern=r'') + 'sigi state', display_id, end_pattern=r'', default={}) + + def _get_universal_data(self, webpage, display_id): + return traverse_obj(self._search_json( + r']+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage, + 'universal data', display_id, end_pattern=r'', default={}), + ('__DEFAULT_SCOPE__', {dict})) or {} def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): @@ -219,8 +224,8 @@ class TikTokBaseIE(InfoExtractor): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: - known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) - known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) + known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height'))) + known_resolutions[res].setdefault('width', int_or_none(addr.get('width'))) parsed_meta.update(known_resolutions.get(res, {})) add_meta.setdefault('height', int_or_none(res[:-1])) return [{ @@ -237,22 +242,26 @@ class TikTokBaseIE(InfoExtractor): # Hack: Add direct video links first to prioritize them when removing duplicate formats formats = [] + width = int_or_none(video_info.get('width')) + height = int_or_none(video_info.get('height')) if video_info.get('play_addr'): formats.extend(extract_addr(video_info['play_addr'], { 'format_id': 'play_addr', 'format_note': 'Direct video', 'vcodec': 'h265' if traverse_obj( video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002 - 'width': video_info.get('width'), - 'height': video_info.get('height'), + 'width': width, + 'height': height, })) if video_info.get('download_addr'): - formats.extend(extract_addr(video_info['download_addr'], { + download_addr = video_info['download_addr'] + dl_width = int_or_none(download_addr.get('width')) + formats.extend(extract_addr(download_addr, { 'format_id': 'download_addr', 'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''), 'vcodec': 'h264', - 'width': video_info.get('width'), - 'height': video_info.get('height'), + 'width': dl_width or width, + 'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong 'preference': -2 if video_info.get('has_watermark') else -1, })) if video_info.get('play_addr_h264'): @@ -311,13 +320,10 @@ class TikTokBaseIE(InfoExtractor): if is_generic_og_trackname: music_track, music_author = contained_music_track or 'original sound', contained_music_author else: - music_track, music_author = music_info.get('title'), music_info.get('author') + music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str})) return { 'id': aweme_id, - 'extractor_key': TikTokIE.ie_key(), - 'extractor': TikTokIE.IE_NAME, - 'webpage_url': self._create_url(author_info.get('uid'), aweme_id), **traverse_obj(aweme_detail, { 'title': ('desc', {str}), 'description': ('desc', {str}), @@ -330,15 +336,16 @@ class TikTokBaseIE(InfoExtractor): 'comment_count': 'comment_count', }, expected_type=int_or_none), **traverse_obj(author_info, { - 'uploader': 'unique_id', - 'uploader_id': 'uid', - 'creator': 'nickname', - 'channel_id': 'sec_uid', - }, expected_type=str_or_none), + 'uploader': ('unique_id', {str}), + 'uploader_id': ('uid', {str_or_none}), + 'creators': ('nickname', {str}, {lambda x: [x] if x else None}), # for compat + 'channel': ('nickname', {str}), + 'channel_id': ('sec_uid', {str}), + }), 'uploader_url': user_url, 'track': music_track, 'album': str_or_none(music_info.get('album')) or None, - 'artist': music_author or None, + 'artists': re.split(r'(?:, | & )', music_author) if music_author else None, 'formats': formats, 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), 'thumbnails': thumbnails, @@ -399,7 +406,8 @@ class TikTokBaseIE(InfoExtractor): 'timestamp': ('createTime', {int_or_none}), }), **traverse_obj(author_info or aweme_detail, { - 'creator': ('nickname', {str}), + 'creators': ('nickname', {str}, {lambda x: [x] if x else None}), # for compat + 'channel': ('nickname', {str}), 'uploader': (('uniqueId', 'author'), {str}), 'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}), }, get_all=False), @@ -410,10 +418,10 @@ class TikTokBaseIE(InfoExtractor): 'comment_count': 'commentCount', }, expected_type=int_or_none), **traverse_obj(music_info, { - 'track': 'title', - 'album': ('album', {lambda x: x or None}), - 'artist': 'authorName', - }, expected_type=str), + 'track': ('title', {str}), + 'album': ('album', {str}, {lambda x: x or None}), + 'artists': ('authorName', {str}, {lambda x: [x] if x else None}), + }), 'channel_id': channel_id, 'uploader_url': user_url, 'formats': formats, @@ -470,7 +478,8 @@ class TikTokIE(TikTokBaseIE): 'uploader_id': '18702747', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', - 'creator': 'patroX', + 'channel': 'patroX', + 'creators': ['patroX'], 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'upload_date': '20190930', 'timestamp': 1569860870, @@ -478,7 +487,7 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', + 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'], 'track': 'Big Fun', }, }, { @@ -490,12 +499,13 @@ class TikTokIE(TikTokBaseIE): 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', 'uploader': 'barudakhb_', - 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'creators': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'], 'uploader_id': '6974687867511718913', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', 'track': 'Boka Dance', - 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'], 'timestamp': 1626121503, 'duration': 18, 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', @@ -514,7 +524,8 @@ class TikTokIE(TikTokBaseIE): 'title': 'Slap and Run!', 'description': 'Slap and Run!', 'uploader': 'user440922249', - 'creator': 'Slap And Run', + 'channel': 'Slap And Run', + 'creators': ['Slap And Run'], 'uploader_id': '7036055384943690754', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', @@ -538,7 +549,8 @@ class TikTokIE(TikTokBaseIE): 'title': 'TikTok video #7059698374567611694', 'description': '', 'uploader': 'pokemonlife22', - 'creator': 'Pokemon', + 'channel': 'Pokemon', + 'creators': ['Pokemon'], 'uploader_id': '6820838815978423302', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', @@ -547,7 +559,7 @@ class TikTokIE(TikTokBaseIE): 'duration': 6, 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'upload_date': '20220201', - 'artist': 'Pokemon', + 'artists': ['Pokemon'], 'view_count': int, 'like_count': int, 'repost_count': int, @@ -584,12 +596,13 @@ class TikTokIE(TikTokBaseIE): 'ext': 'mp3', 'title': 'TikTok video #7139980461132074283', 'description': '', - 'creator': 'Antaura', + 'channel': 'Antaura', + 'creators': ['Antaura'], 'uploader': '_le_cannibale_', 'uploader_id': '6604511138619654149', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', - 'artist': 'nathan !', + 'artists': ['nathan !'], 'track': 'grahamscott canon', 'upload_date': '20220905', 'timestamp': 1662406249, @@ -597,23 +610,24 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'thumbnail': r're:^https://.+\.webp', + 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)', }, }, { # only available via web - 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662', + 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662', # FIXME 'md5': '6aba7fad816e8709ff2c149679ace165', 'info_dict': { 'id': '7206382937372134662', 'ext': 'mp4', 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', - 'creator': 'MoxyPatch', + 'channel': 'MoxyPatch', + 'creators': ['MoxyPatch'], 'uploader': 'moxypatch', 'uploader_id': '7039142049363379205', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', - 'artist': 'your worst nightmare', + 'artists': ['your worst nightmare'], 'track': 'original sound', 'upload_date': '20230303', 'timestamp': 1677866781, @@ -628,7 +642,7 @@ class TikTokIE(TikTokBaseIE): 'expected_warnings': ['Unable to find video in feed'], }, { # 1080p format - 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME 'md5': '982512017a8a917124d5a08c8ae79621', 'info_dict': { 'id': '7107337212743830830', @@ -639,8 +653,9 @@ class TikTokIE(TikTokBaseIE): 'uploader_id': '86328792343818240', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', - 'creator': 'tate mcrae', - 'artist': 'tate mcrae', + 'channel': 'tate mcrae', + 'creators': ['tate mcrae'], + 'artists': ['tate mcrae'], 'track': 'original sound', 'upload_date': '20220609', 'timestamp': 1654805899, @@ -651,7 +666,7 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, 'thumbnail': r're:^https://.+\.webp', }, - 'params': {'format': 'bytevc1_1080p_808907-0'}, + 'skip': 'Unavailable via feed API, no formats available via web', }, { # Slideshow, audio-only m4a format 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594', @@ -665,8 +680,9 @@ class TikTokIE(TikTokBaseIE): 'uploader_id': '6582536342634676230', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', - 'creator': 'лампочка', - 'artist': 'Øneheart', + 'channel': 'лампочка', + 'creators': ['лампочка'], + 'artists': ['Øneheart'], 'album': 'watching the stars', 'track': 'watching the stars', 'upload_date': '20230708', @@ -675,7 +691,7 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, - 'thumbnail': r're:^https://.+\.webp', + 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)', }, }, { # Auto-captions available @@ -688,24 +704,35 @@ class TikTokIE(TikTokBaseIE): try: return self._extract_aweme_app(video_id) except ExtractorError as e: + e.expected = True self.report_warning(f'{e}; trying with webpage') url = self._create_url(user_id, video_id) webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) - next_data = self._search_nextjs_data(webpage, video_id, default='{}') - if next_data: - status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0 - video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict) + + if universal_data := self._get_universal_data(webpage, video_id): + self.write_debug('Found universal data for rehydration') + status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 + video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) + + elif sigi_data := self._get_sigi_state(webpage, video_id): + self.write_debug('Found sigi state data') + status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 + video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) + + elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'): + self.write_debug('Found next.js data') + status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 + video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + else: - sigi_data = self._get_sigi_state(webpage, video_id) - status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0 - video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict) + raise ExtractorError('Unable to extract webpage video data') - if status == 0: + if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id) elif status == 10216: raise ExtractorError('This video is private', expected=True) - raise ExtractorError('Video not available', video_id=video_id) + raise ExtractorError(f'Video not available, status code {status}', video_id=video_id) class TikTokUserIE(TikTokBaseIE): @@ -921,20 +948,23 @@ class DouyinIE(TikTokBaseIE): _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.douyin.com/video/6961737553342991651', - 'md5': 'a97db7e3e67eb57bf40735c022ffa228', + 'md5': '9ecce7bc5b302601018ecb2871c63a75', 'info_dict': { 'id': '6961737553342991651', 'ext': 'mp4', 'title': '#杨超越 小小水手带你去远航❤️', 'description': '#杨超越 小小水手带你去远航❤️', + 'uploader': '6897520xka', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', - 'creator': '杨超越', - 'duration': 19782, + 'channel': '杨超越', + 'creators': ['杨超越'], + 'duration': 19, 'timestamp': 1620905839, 'upload_date': '20210513', 'track': '@杨超越创作的原声', + 'artists': ['杨超越'], 'view_count': int, 'like_count': int, 'repost_count': int, @@ -943,20 +973,23 @@ class DouyinIE(TikTokBaseIE): }, }, { 'url': 'https://www.douyin.com/video/6982497745948921092', - 'md5': '34a87ebff3833357733da3fe17e37c0e', + 'md5': '15c5e660b7048af3707304e3cc02bbb5', 'info_dict': { 'id': '6982497745948921092', 'ext': 'mp4', 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', + 'uploader': '0731chaoyue', 'uploader_id': '408654318141572', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', - 'creator': '杨超越工作室', - 'duration': 42479, + 'channel': '杨超越工作室', + 'creators': ['杨超越工作室'], + 'duration': 42, 'timestamp': 1625739481, 'upload_date': '20210708', 'track': '@杨超越工作室创作的原声', + 'artists': ['杨超越工作室'], 'view_count': int, 'like_count': int, 'repost_count': int, @@ -965,20 +998,23 @@ class DouyinIE(TikTokBaseIE): }, }, { 'url': 'https://www.douyin.com/video/6953975910773099811', - 'md5': 'dde3302460f19db59c47060ff013b902', + 'md5': '0e6443758b8355db9a3c34864a4276be', 'info_dict': { 'id': '6953975910773099811', 'ext': 'mp4', 'title': '#一起看海 出现在你的夏日里', 'description': '#一起看海 出现在你的夏日里', + 'uploader': '6897520xka', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', - 'creator': '杨超越', - 'duration': 17343, + 'channel': '杨超越', + 'creators': ['杨超越'], + 'duration': 17, 'timestamp': 1619098692, 'upload_date': '20210422', 'track': '@杨超越创作的原声', + 'artists': ['杨超越'], 'view_count': int, 'like_count': int, 'repost_count': int, @@ -1004,20 +1040,23 @@ class DouyinIE(TikTokBaseIE): 'skip': 'No longer available', }, { 'url': 'https://www.douyin.com/video/6963263655114722595', - 'md5': 'cf9f11f0ec45d131445ec2f06766e122', + 'md5': '1440bcf59d8700f8e014da073a4dfea8', 'info_dict': { 'id': '6963263655114722595', 'ext': 'mp4', 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', + 'uploader': '6897520xka', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', - 'creator': '杨超越', - 'duration': 15115, + 'channel': '杨超越', + 'creators': ['杨超越'], + 'duration': 15, 'timestamp': 1621261163, 'upload_date': '20210517', 'track': '@杨超越创作的原声', + 'artists': ['杨超越'], 'view_count': int, 'like_count': int, 'repost_count': int, @@ -1025,34 +1064,23 @@ class DouyinIE(TikTokBaseIE): 'thumbnail': r're:https?://.+\.jpe?g', }, }] - _APP_VERSIONS = [('23.3.0', '230300')] - _APP_NAME = 'aweme' - _AID = 1128 - _API_HOSTNAME = 'aweme.snssdk.com' _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s' _WEBPAGE_HOST = 'https://www.douyin.com/' def _real_extract(self, url): video_id = self._match_id(url) - try: - return self._extract_aweme_app(video_id) - except ExtractorError as e: - e.expected = True - self.to_screen(f'{e}; trying with webpage') - - webpage = self._download_webpage(url, video_id) - render_data = self._search_json( - r'