from __future__ import unicode_literals

import itertools
import re

from .common import SearchInfoExtractor


class GoogleSearchIE(SearchInfoExtractor):
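    # Results are requested as '<_SEARCH_KEY><count>:<terms>', e.g.
    # 'gvsearch15:python language'; SearchInfoExtractor parses the search key
    # and dispatches the query to _search_results() below.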
    IE_DESC = 'Google Video search'
    _MAX_RESULTS = 1000
    IE_NAME = 'video.google:search'
    _SEARCH_KEY = 'gvsearch'
    _WORKING = False
    _TEST = {
        'url': 'gvsearch15:python language',
        'info_dict': {
            'id': 'python language',
            'title': 'python language',
        },
        'playlist_count': 15,
    }

    def _search_results(self, query):
        for pagenum in itertools.count():
            # Each result page holds up to 10 hits; 'tbm=vid' restricts the
            # search to Google Video results
            webpage = self._download_webpage(
                'http://www.google.com/search',
                'gvsearch:' + query,
                note='Downloading result page %s' % (pagenum + 1),
                query={
                    'tbm': 'vid',
                    'q': query,
                    'start': pagenum * 10,
                    'hl': 'en',
                })

            # Yield only hits that have a matching video thumbnail on the page
            for hit_idx, mobj in enumerate(re.finditer(
                    r'<h3 class="r"><a href="([^"]+)"', webpage)):
                if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
                    yield self.url_result(mobj.group(1))

            # Stop paginating once there is no "next page" control
            if not re.search(r'id="pnnext"', webpage):
                return