forked from Archives/langchain
Harrison/site map tqdm (#3184)
Co-authored-by: Tianyi Pan <60060750+tipani86@users.noreply.github.com> Co-authored-by: Tianyi Pan <tianyi.pan@clobotics.com>
This commit is contained in:
parent
e55db5841a
commit
f19b3890c9
@@ -1,6 +1,7 @@
|
|||||||
"""Web base loader class."""
|
"""Web base loader class."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import warnings
|
||||||
from typing import Any, List, Optional, Union
|
from typing import Any, List, Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
@@ -85,10 +86,26 @@ class WebBaseLoader(BaseLoader):
|
|||||||
raise ValueError("Multiple webpaths found.")
|
raise ValueError("Multiple webpaths found.")
|
||||||
return self.web_paths[0]
|
return self.web_paths[0]
|
||||||
|
|
||||||
async def _fetch(self, url: str) -> str:
|
async def _fetch(
|
||||||
|
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
|
||||||
|
) -> str:
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
async with session.get(url, headers=self.session.headers) as response:
|
for i in range(retries):
|
||||||
return await response.text()
|
try:
|
||||||
|
async with session.get(
|
||||||
|
url, headers=self.session.headers
|
||||||
|
) as response:
|
||||||
|
return await response.text()
|
||||||
|
except aiohttp.ClientConnectionError as e:
|
||||||
|
if i == retries - 1:
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"Error fetching {url} with attempt "
|
||||||
|
f"{i + 1}/{retries}: {e}. Retrying..."
|
||||||
|
)
|
||||||
|
await asyncio.sleep(cooldown * backoff**i)
|
||||||
|
raise ValueError("retry count exceeded")
|
||||||
|
|
||||||
async def _fetch_with_rate_limit(
|
async def _fetch_with_rate_limit(
|
||||||
self, url: str, semaphore: asyncio.Semaphore
|
self, url: str, semaphore: asyncio.Semaphore
|
||||||
@@ -103,7 +120,15 @@ class WebBaseLoader(BaseLoader):
|
|||||||
for url in urls:
|
for url in urls:
|
||||||
task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
|
task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
return await asyncio.gather(*tasks)
|
try:
|
||||||
|
from tqdm.asyncio import tqdm_asyncio
|
||||||
|
|
||||||
|
return await tqdm_asyncio.gather(
|
||||||
|
*tasks, desc="Fetching pages", ascii=True, mininterval=1
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
warnings.warn("For better logging of progress, `pip install tqdm`")
|
||||||
|
return await asyncio.gather(*tasks)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _check_parser(parser: str) -> None:
|
def _check_parser(parser: str) -> None:
|
||||||
|
10
poetry.lock
generated
10
poetry.lock
generated
@@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "absl-py"
|
name = "absl-py"
|
||||||
@@ -7391,7 +7391,7 @@ files = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
|
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
|
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
|
||||||
@@ -9151,13 +9151,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
|
|||||||
cffi = ["cffi (>=1.11)"]
|
cffi = ["cffi (>=1.11)"]
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
all = ["aleph-alpha-client", "anthropic", "atlassian-python-api", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache"]
|
||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
|
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
|
||||||
openai = ["openai"]
|
openai = ["openai"]
|
||||||
qdrant = ["qdrant-client"]
|
qdrant = ["qdrant-client"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "8476b824f9682ed85d44929519054cc5951a6fe47067e8fc7e43a364f6486477"
|
content-hash = "568b190c884e62df4e7bd897f402e3b6e61b24134af7f189f3d44b2ba5f00082"
|
||||||
|
@@ -17,6 +17,7 @@ SQLAlchemy = "^1"
|
|||||||
requests = "^2"
|
requests = "^2"
|
||||||
PyYAML = ">=5.4.1"
|
PyYAML = ">=5.4.1"
|
||||||
numpy = "^1"
|
numpy = "^1"
|
||||||
|
tqdm = {version = ">=4.48.0", optional = true}
|
||||||
openapi-schema-pydantic = "^1.2"
|
openapi-schema-pydantic = "^1.2"
|
||||||
faiss-cpu = {version = "^1", optional = true}
|
faiss-cpu = {version = "^1", optional = true}
|
||||||
wikipedia = {version = "^1", optional = true}
|
wikipedia = {version = "^1", optional = true}
|
||||||
|
Loading…
Reference in New Issue
Block a user