diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 45bd7036..50cf549d 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -1,6 +1,7 @@ """Web base loader class.""" import asyncio import logging +import warnings from typing import Any, List, Optional, Union import aiohttp @@ -85,10 +86,26 @@ class WebBaseLoader(BaseLoader): raise ValueError("Multiple webpaths found.") return self.web_paths[0] - async def _fetch(self, url: str) -> str: + async def _fetch( + self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 + ) -> str: async with aiohttp.ClientSession() as session: - async with session.get(url, headers=self.session.headers) as response: - return await response.text() + for i in range(retries): + try: + async with session.get( + url, headers=self.session.headers + ) as response: + return await response.text() + except aiohttp.ClientConnectionError as e: + if i == retries - 1: + raise + else: + logger.warning( + f"Error fetching {url} with attempt " + f"{i + 1}/{retries}: {e}. Retrying..." + ) + await asyncio.sleep(cooldown * backoff**i) + raise ValueError("retry count exceeded") async def _fetch_with_rate_limit( self, url: str, semaphore: asyncio.Semaphore @@ -103,7 +120,15 @@ class WebBaseLoader(BaseLoader): for url in urls: task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore)) tasks.append(task) - return await asyncio.gather(*tasks) + try: + from tqdm.asyncio import tqdm_asyncio + + return await tqdm_asyncio.gather( + *tasks, desc="Fetching pages", ascii=True, mininterval=1 + ) + except ImportError: + warnings.warn("For better logging of progress, `pip install tqdm`") + return await asyncio.gather(*tasks) @staticmethod def _check_parser(parser: str) -> None: diff --git a/poetry.lock b/poetry.lock index 78933c81..8e328e23 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "absl-py" @@ -7391,7 +7391,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -9151,13 +9151,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["aleph-alpha-client", "anthropic", "atlassian-python-api", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] +llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "8476b824f9682ed85d44929519054cc5951a6fe47067e8fc7e43a364f6486477" +content-hash = "568b190c884e62df4e7bd897f402e3b6e61b24134af7f189f3d44b2ba5f00082" diff --git a/pyproject.toml b/pyproject.toml index 448ff14b..cdff1971 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ SQLAlchemy = "^1" requests = "^2" PyYAML = ">=5.4.1" numpy = "^1" +tqdm = {version = ">=4.48.0", optional = true} openapi-schema-pydantic = "^1.2" faiss-cpu = {version = "^1", optional = true} wikipedia = {version = "^1", optional = true}