yt-dlp/yt_dlp/extractor/podbayfm.py

from .common import InfoExtractor
from ..utils import (
    OnDemandPagedList,
    clean_html,
    int_or_none,
    jwt_decode_hs256,
    url_or_none,
)
from ..utils.traversal import traverse_obj


def result_from_props(props):
    """Build an info dict from a podbay.fm episode ``props`` mapping.

    Fields are pulled out of ``props`` with ``traverse_obj``; any field that
    is missing or fails its filter is simply left out of the result.  The
    ``ext`` and ``vcodec`` entries are hard-coded and always present.
    """
    field_paths = {
        'id': ('_id', {str}),
        'title': ('title', {str}),
        'url': ('mediaURL', {url_or_none}),
        'description': ('description', {clean_html}),
        # The `image` value is decoded with jwt_decode_hs256 and the `url`
        # key of the decoded result is used as the thumbnail.
        'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
        'timestamp': ('timestamp', {int_or_none}),
        'duration': ('duration', {int_or_none}),
    }
    extracted = traverse_obj(props, field_paths)
    # Constant fields go last so they keep the same position in the result.
    fixed_fields = {'ext': 'mp3', 'vcodec': 'none'}
    return {**extracted, **fixed_fields}


class PodbayFMIE(InfoExtractor):
    """Extractor for a single podbay.fm episode page."""

    # The numeric path segment after `/e/` is captured as the URL id.
    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
        'md5': '895ac8505de349515f5ee8a4a3195c93',
        'info_dict': {
            'id': '62306451f4a48e58d0c4d6a8',
            'title': 'Part One: Kissinger',
            'ext': 'mp3',
            'description': r're:^We begin our epic six part series on Henry Kissinger.+',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1647338400,
            'duration': 5001,
            'upload_date': '20220315',
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and return its info dict.

        The episode props are read from
        ``props -> pageProps -> episode`` of the data returned by
        ``_search_nextjs_data`` and handed to ``result_from_props``.
        """
        url_id = self._match_id(url)
        nextjs_data = self._search_nextjs_data(
            self._download_webpage(url, url_id), url_id)
        episode_props = nextjs_data['props']['pageProps']['episode']
        return result_from_props(episode_props)


class PodbayFMChannelIE(InfoExtractor):
    """Extractor for a podbay.fm podcast (channel) page, returned as a playlist."""

    # Matches only the bare podcast URL (optional trailing slash, query or
    # fragment); anything with a further path component is not matched here.
    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards',
        'info_dict': {
            'id': 'behind-the-bastards',
            'title': 'Behind the Bastards',
        },
        'playlist_mincount': 21,
    }]
    # Page size handed to OnDemandPagedList.
    # NOTE(review): presumably the number of episodes the API returns per page - confirm.
    _PAGE_SIZE = 10

    def _fetch_page(self, channel_id, pagenum):
        """Download one page of the podcast API and return its ``podcast`` object.

        ``pagenum`` is zero-based; the progress note shows it one-based.
        """
        # _download_json takes (url, video_id, note, ...): the channel slug is
        # the id and the progress message is the note, not the other way round.
        return self._download_json(
            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
            channel_id, note=f'Downloading channel JSON page {pagenum + 1}')['podcast']

    @staticmethod
    def _results_from_page(channel_id, page):
        """Turn the ``episodes`` of an API page into a list of entry info dicts.

        Each entry is attributed to ``PodbayFMIE`` via ``extractor`` and
        ``extractor_key`` and gets a ``webpage_url`` pointing at its episode page.
        """
        return [{
            **result_from_props(e),
            'extractor': PodbayFMIE.IE_NAME,
            'extractor_key': PodbayFMIE.ie_key(),
            # somehow they use timestamps as the episode identifier
            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
        } for e in page['episodes']]

    def _real_extract(self, url):
        """Return a lazily paged playlist of all episodes of the podcast."""
        channel_id = self._match_id(url)

        # The first page is fetched eagerly because the playlist title is read
        # from it; the pager below reuses it for page 0 instead of refetching.
        first_page = self._fetch_page(channel_id, 0)
        entries = OnDemandPagedList(
            lambda pagenum: self._results_from_page(
                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
            self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id, first_page.get('title'))
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`from .common import InfoExtractor`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`from ..utils import (`
			`OnDemandPagedList,`
			`clean_html,`
			`int_or_none,`
			`jwt_decode_hs256,`
			`url_or_none,`
			`)`
			`from ..utils.traversal import traverse_obj`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago

[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`def result_from_props(props):`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`return {`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`**traverse_obj(props, {`
			`'id': ('_id', {str}),`
			`'title': ('title', {str}),`
			`'url': ('mediaURL', {url_or_none}),`
			`'description': ('description', {clean_html}),`
			`'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),`
			`'timestamp': ('timestamp', {int_or_none}),`
			`'duration': ('duration', {int_or_none}),`
			`}),`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`'ext': 'mp3',`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`'vcodec': 'none',`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`}`


			`class PodbayFMIE(InfoExtractor):`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`_VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`_TESTS = [{`
			`'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`'md5': '895ac8505de349515f5ee8a4a3195c93',`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`'info_dict': {`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`'id': '62306451f4a48e58d0c4d6a8',`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`'title': 'Part One: Kissinger',`
			`'ext': 'mp3',`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`'description': r're:^We begin our epic six part series on Henry Kissinger.+',`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`'thumbnail': r're:^https?://.*\.jpg',`
			`'timestamp': 1647338400,`
			`'duration': 5001,`
			`'upload_date': '20220315',`
			`},`
			`}]`

			`def _real_extract(self, url):`
			`episode_id = self._match_id(url)`
			`webpage = self._download_webpage(url, episode_id)`
			`data = self._search_nextjs_data(webpage, episode_id)`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`return result_from_props(data['props']['pageProps']['episode'])`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago

			`class PodbayFMChannelIE(InfoExtractor):`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`_VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$\|[?#])'`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`_TESTS = [{`
			`'url': 'https://podbay.fm/p/behind-the-bastards',`
			`'info_dict': {`
			`'id': 'behind-the-bastards',`
			`'title': 'Behind the Bastards',`
			`},`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`'playlist_mincount': 21,`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago			`}]`
			`_PAGE_SIZE = 10`

			`def _fetch_page(self, channel_id, pagenum):`
			`return self._download_json(`
			`f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',`
[ie/podbayfm] Fix extraction (#10195) Authored by: bashonly, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 1 week ago			`f'Downloading channel JSON page {pagenum + 1}', channel_id)['podcast']`
[extractors/podbayfm] Add extractor (#4971) Authored by: schnusch 2 years ago
			`@staticmethod`
			`def _results_from_page(channel_id, page):`
			`return [{`
			`**result_from_props(e),`
			`'extractor': PodbayFMIE.IE_NAME,`
			`'extractor_key': PodbayFMIE.ie_key(),`
			`# somehow they use timestamps as the episode identifier`
			`'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',`
			`} for e in page['episodes']]`

			`def _real_extract(self, url):`
			`channel_id = self._match_id(url)`

			`first_page = self._fetch_page(channel_id, 0)`
			`entries = OnDemandPagedList(`
			`lambda pagenum: self._results_from_page(`
			`channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),`
			`self._PAGE_SIZE)`

			`return self.playlist_result(entries, channel_id, first_page.get('title'))`