2023-09-25 22:52:29 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2024-03-12 01:06:06 +00:00
|
|
|
from typing import Union
|
|
|
|
from aiohttp import ClientResponse
|
|
|
|
from requests import Response as RequestsResponse
|
2024-01-14 06:45:41 +00:00
|
|
|
|
2024-01-26 06:54:13 +00:00
|
|
|
try:
|
2024-03-12 01:06:06 +00:00
|
|
|
from curl_cffi.requests import Session, Response
|
2024-02-21 23:16:58 +00:00
|
|
|
from .curl_cffi import StreamResponse, StreamSession
|
2024-01-26 06:54:13 +00:00
|
|
|
has_curl_cffi = True
|
|
|
|
except ImportError:
|
2024-03-12 01:06:06 +00:00
|
|
|
from typing import Type as Session, Type as Response
|
2024-02-21 23:16:58 +00:00
|
|
|
from .aiohttp import StreamResponse, StreamSession
|
2024-01-26 06:54:13 +00:00
|
|
|
has_curl_cffi = False
|
2024-01-14 06:45:41 +00:00
|
|
|
|
2024-03-12 01:06:06 +00:00
|
|
|
from ..webdriver import WebDriver, WebDriverSession
|
2024-03-12 17:45:22 +00:00
|
|
|
from ..webdriver import bypass_cloudflare, get_driver_cookies
|
2024-03-12 01:06:06 +00:00
|
|
|
from ..errors import MissingRequirementsError, RateLimitError, ResponseStatusError
|
2024-01-29 17:14:46 +00:00
|
|
|
from .defaults import DEFAULT_HEADERS
|
2024-01-26 06:54:13 +00:00
|
|
|
|
2024-02-23 10:33:38 +00:00
|
|
|
def get_args_from_browser(
    url: str,
    webdriver: WebDriver = None,
    proxy: str = None,
    timeout: int = 120,
    do_bypass_cloudflare: bool = True,
    virtual_display: bool = False
) -> dict:
    """
    Collect cookies and request headers from a WebDriver session.

    Args:
        url (str): Used as the ``referer`` header, as the target of the
            optional Cloudflare bypass, and as the URL prefix that selects
            which captured browser request the headers are copied from.
        webdriver (WebDriver, optional): The WebDriver instance to use.
        proxy (str, optional): Proxy server handed to the WebDriverSession.
        timeout (int, optional): Timeout in seconds handed to ``bypass_cloudflare``.
        do_bypass_cloudflare (bool, optional): When true, ``bypass_cloudflare``
            is called before headers and cookies are read.
        virtual_display (bool, optional): Forwarded to the WebDriverSession.

    Returns:
        dict: ``{'cookies': ..., 'headers': ...}``. The headers start from
        ``DEFAULT_HEADERS`` plus ``referer`` and are overlaid with values
        taken from the browser.
    """
    # HTTP field names are case-insensitive, so names are normalised to lower
    # case before matching; otherwise e.g. "User-Agent" would be skipped.
    browser_header_names = frozenset((
        "accept-encoding",
        "accept-language",
        "user-agent",
        "sec-ch-ua",
        "sec-ch-ua-platform",
        "sec-ch-ua-arch",
        "sec-ch-ua-full-version",
        "sec-ch-ua-platform-version",
        "sec-ch-ua-bitness",
    ))
    with WebDriverSession(webdriver, "", proxy=proxy, virtual_display=virtual_display) as driver:
        if do_bypass_cloudflare:
            bypass_cloudflare(driver, url, timeout)
        headers = {
            **DEFAULT_HEADERS,
            'referer': url,
        }
        if not hasattr(driver, "requests"):
            # No captured request log on this driver: ask the page for its user agent.
            headers["user-agent"] = driver.execute_script("return navigator.userAgent")
        else:
            for request in driver.requests:
                if request.url.startswith(url):
                    for key, value in request.headers.items():
                        name = key.lower()
                        if name in browser_header_names:
                            # Stored lower-cased to match the other keys set in this function.
                            headers[name] = value
                    # Only the first request matching the URL prefix is used.
                    break
        cookies = get_driver_cookies(driver)
    return {
        'cookies': cookies,
        'headers': headers,
    }
|
2024-01-29 17:14:46 +00:00
|
|
|
|
2024-01-27 01:00:44 +00:00
|
|
|
def get_session_from_browser(url: str, webdriver: WebDriver = None, proxy: str = None, timeout: int = 120) -> Session:
    """
    Build a Session from the cookies and headers gathered by ``get_args_from_browser``.

    Args:
        url (str): Forwarded to ``get_args_from_browser``.
        webdriver (WebDriver, optional): Forwarded to ``get_args_from_browser``.
        proxy (str, optional): Forwarded to ``get_args_from_browser`` and also
            set as both the "https" and "http" proxy of the Session.
        timeout (int, optional): Forwarded to ``get_args_from_browser`` and
            used as the Session timeout.

    Returns:
        Session: Created with ``impersonate="chrome"``.

    Raises:
        MissingRequirementsError: If ``has_curl_cffi`` is false.
    """
    if has_curl_cffi:
        browser_args = get_args_from_browser(url, webdriver, proxy, timeout)
        proxy_map = dict.fromkeys(("https", "http"), proxy)
        return Session(
            **browser_args,
            proxies=proxy_map,
            timeout=timeout,
            impersonate="chrome"
        )
    raise MissingRequirementsError('Install "curl_cffi" package')
|
|
|
|
|
2024-03-14 12:53:57 +00:00
|
|
|
def is_cloudflare(text: str):
    """
    Report whether ``text`` contains one of the HTML fragments this module
    treats as a Cloudflare page.

    Args:
        text (str): Response body to inspect.

    Returns:
        bool: True if either marker occurs anywhere in ``text``.
    """
    page_markers = (
        '<div id="cf-please-wait">',
        "<title>Just a moment...</title>",
    )
    return any(marker in text for marker in page_markers)
|
|
|
|
|
2024-03-12 17:45:22 +00:00
|
|
|
async def raise_for_status_async(response: Union[StreamResponse, ClientResponse], message: str = None):
    """
    Raise an error for an unsuccessful asynchronous response.

    Args:
        response: Object exposing ``status``, ``ok`` and an awaitable ``text()``.
        message (str, optional): Text used in the error instead of the
            response body. The body is only read when the response is not ok
            and no message was given.

    Raises:
        RateLimitError: If the status is 429 or 402.
        ResponseStatusError: If the status is 403 and ``is_cloudflare``
            matches the message, or for any other response that is not ok.
    """
    status = response.status
    if status in (429, 402):
        raise RateLimitError(f"Response {status}: Rate limit reached")

    failed = not response.ok
    if failed and message is None:
        message = await response.text()

    if status == 403 and is_cloudflare(message):
        raise ResponseStatusError(f"Response {status}: Cloudflare detected")
    if failed:
        raise ResponseStatusError(f"Response {status}: {message}")
|
2024-03-12 01:06:06 +00:00
|
|
|
|
2024-03-12 17:45:22 +00:00
|
|
|
def raise_for_status(response: Union[StreamResponse, ClientResponse, Response, RequestsResponse], message: str = None):
    """
    Raise an error for an unsuccessful response, synchronous or asynchronous.

    A response that has a ``status`` attribute is delegated to
    ``raise_for_status_async``; the coroutine is returned un-awaited, so the
    caller must await it. Any other response is checked here through
    ``status_code``, ``ok`` and ``text``.

    Args:
        response: The response object to check.
        message (str, optional): Text used in the generic error instead of
            the response body.

    Returns:
        The coroutine from ``raise_for_status_async`` for responses with a
        ``status`` attribute, otherwise None.

    Raises:
        RateLimitError: If the status code is 429 or 402.
        ResponseStatusError: If the status code is 403 and the body matches
            ``is_cloudflare``, or for any other response that is not ok.
    """
    if hasattr(response, "status"):
        return raise_for_status_async(response, message)

    code = response.status_code
    if code in (429, 402):
        raise RateLimitError(f"Response {code}: Rate limit reached")
    if code == 403 and is_cloudflare(response.text):
        raise ResponseStatusError(f"Response {code}: Cloudflare detected")
    if not response.ok:
        detail = response.text if message is None else message
        raise ResponseStatusError(f"Response {code}: {detail}")
|