fix web loader (#8538)

This commit is contained in:
Harrison Chase 2023-07-31 12:47:33 -07:00 committed by GitHub
parent 4780156955
commit 15de57b848
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -26,23 +26,15 @@ default_header_template = {
class AsyncHtmlLoader(BaseLoader): class AsyncHtmlLoader(BaseLoader):
"""Loads HTML asynchronously.""" """Loads HTML asynchronously."""
web_paths: List[str]
requests_per_second: int = 2
"""Max number of concurrent requests to make."""
requests_kwargs: Dict[str, Any] = {}
"""kwargs for requests"""
raise_for_status: bool = False
"""Raise an exception if http status code denotes an error."""
def __init__( def __init__(
self, self,
web_path: Union[str, List[str]], web_path: Union[str, List[str]],
header_template: Optional[dict] = None, header_template: Optional[dict] = None,
verify_ssl: Optional[bool] = True, verify_ssl: Optional[bool] = True,
proxies: Optional[dict] = None, proxies: Optional[dict] = None,
requests_per_second: int = 2,
requests_kwargs: Dict[str, Any] = {},
raise_for_status: bool = False,
): ):
"""Initialize with webpage path.""" """Initialize with webpage path."""
@@ -74,6 +66,10 @@ class AsyncHtmlLoader(BaseLoader):
if proxies: if proxies:
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.requests_per_second = requests_per_second
self.requests_kwargs = requests_kwargs
self.raise_for_status = raise_for_status
async def _fetch( async def _fetch(
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str: ) -> str: