Harrison/site map tqdm (#3184)
Co-authored-by: Tianyi Pan <60060750+tipani86@users.noreply.github.com>
Co-authored-by: Tianyi Pan <tianyi.pan@clobotics.com>
commit f19b3890c9 (parent e55db5841a)
langchain/document_loaders/web_base.py

@@ -1,6 +1,7 @@
 """Web base loader class."""
 import asyncio
 import logging
+import warnings
 from typing import Any, List, Optional, Union
 
 import aiohttp
@@ -85,10 +86,26 @@ class WebBaseLoader(BaseLoader):
             raise ValueError("Multiple webpaths found.")
         return self.web_paths[0]
 
-    async def _fetch(self, url: str) -> str:
+    async def _fetch(
+        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
+    ) -> str:
         async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=self.session.headers) as response:
-                return await response.text()
+            for i in range(retries):
+                try:
+                    async with session.get(
+                        url, headers=self.session.headers
+                    ) as response:
+                        return await response.text()
+                except aiohttp.ClientConnectionError as e:
+                    if i == retries - 1:
+                        raise
+                    else:
+                        logger.warning(
+                            f"Error fetching {url} with attempt "
+                            f"{i + 1}/{retries}: {e}. Retrying..."
+                        )
+                        await asyncio.sleep(cooldown * backoff**i)
+        raise ValueError("retry count exceeded")
 
     async def _fetch_with_rate_limit(
         self, url: str, semaphore: asyncio.Semaphore
@@ -103,7 +120,15 @@ class WebBaseLoader(BaseLoader):
         for url in urls:
             task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            tasks.append(task)
-        return await asyncio.gather(*tasks)
+        try:
+            from tqdm.asyncio import tqdm_asyncio
+
+            return await tqdm_asyncio.gather(
+                *tasks, desc="Fetching pages", ascii=True, mininterval=1
+            )
+        except ImportError:
+            warnings.warn("For better logging of progress, `pip install tqdm`")
+            return await asyncio.gather(*tasks)
 
     @staticmethod
     def _check_parser(parser: str) -> None:
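For context, the hunks above give _fetch a standard retry-with-exponential-backoff loop: after a failed attempt i (0-based) it sleeps cooldown * backoff**i seconds (2s, then 3s with the defaults) and re-raises the connection error on the final attempt. A minimal standalone sketch of the same pattern, independent of WebBaseLoader; the function name fetch_with_retries and the example URL are illustrative, not part of the commit:

import asyncio
import logging

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_with_retries(
    url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str:
    """Fetch a URL, retrying on connection errors with exponential backoff."""
    async with aiohttp.ClientSession() as session:
        for i in range(retries):
            try:
                async with session.get(url) as response:
                    return await response.text()
            except aiohttp.ClientConnectionError as e:
                if i == retries - 1:
                    raise
                # Sleep cooldown * backoff**i seconds before the next attempt:
                # 2s after the first failure, 3s after the second (defaults above).
                logger.warning("Attempt %d/%d for %s failed: %s", i + 1, retries, url, e)
                await asyncio.sleep(cooldown * backoff**i)
    raise ValueError("retry count exceeded")


if __name__ == "__main__":
    # Requires network access; prints the first 80 characters of the page.
    print(asyncio.run(fetch_with_retries("https://example.com"))[:80])
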
poetry.lock (generated)

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -7391,7 +7391,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -9151,13 +9151,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["aleph-alpha-client", "anthropic", "atlassian-python-api", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "8476b824f9682ed85d44929519054cc5951a6fe47067e8fc7e43a364f6486477"
+content-hash = "568b190c884e62df4e7bd897f402e3b6e61b24134af7f189f3d44b2ba5f00082"
pyproject.toml

@@ -17,6 +17,7 @@ SQLAlchemy = "^1"
 requests = "^2"
 PyYAML = ">=5.4.1"
 numpy = "^1"
+tqdm = {version = ">=4.48.0", optional = true}
 openapi-schema-pydantic = "^1.2"
 faiss-cpu = {version = "^1", optional = true}
 wikipedia = {version = "^1", optional = true}
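Because tqdm is declared optional above, the loader's fetch_all only imports it lazily and falls back to a plain asyncio.gather when it is missing. A minimal sketch of that optional-dependency pattern, assuming a list of already-created coroutines; the demo coroutine and gather_with_progress name are illustrative only:

import asyncio
import warnings


async def demo(i: int) -> int:
    # Stand-in for real work such as an HTTP fetch.
    await asyncio.sleep(0.1)
    return i


async def gather_with_progress(tasks):
    """Gather awaitables, showing a tqdm progress bar when tqdm is installed."""
    try:
        from tqdm.asyncio import tqdm_asyncio

        # Extra keyword arguments are passed through to the tqdm progress bar.
        return await tqdm_asyncio.gather(
            *tasks, desc="Fetching pages", ascii=True, mininterval=1
        )
    except ImportError:
        warnings.warn("For better logging of progress, `pip install tqdm`")
        return await asyncio.gather(*tasks)


if __name__ == "__main__":
    coros = [demo(i) for i in range(10)]
    print(asyncio.run(gather_with_progress(coros)))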