Harrison/site map tqdm (#3184)

Co-authored-by: Tianyi Pan <60060750+tipani86@users.noreply.github.com>
Co-authored-by: Tianyi Pan <tianyi.pan@clobotics.com>
This commit is contained in:
Harrison Chase 2023-04-19 20:48:47 -07:00 committed by GitHub
parent e55db5841a
commit f19b3890c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 9 deletions

View File

@ -1,6 +1,7 @@
"""Web base loader class.""" """Web base loader class."""
import asyncio import asyncio
import logging import logging
import warnings
from typing import Any, List, Optional, Union from typing import Any, List, Optional, Union
import aiohttp import aiohttp
@ -85,10 +86,26 @@ class WebBaseLoader(BaseLoader):
raise ValueError("Multiple webpaths found.") raise ValueError("Multiple webpaths found.")
return self.web_paths[0] return self.web_paths[0]
async def _fetch(
    self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str:
    """Fetch ``url`` and return the response body as text, retrying on failure.

    The request is sent with ``self.session.headers``. Only
    ``aiohttp.ClientConnectionError`` triggers a retry; any other exception
    propagates immediately.

    Args:
        url: Address to request.
        retries: Maximum number of attempts; must be at least 1.
        cooldown: Base delay in seconds before the second attempt.
        backoff: Multiplier applied to the delay after each failed attempt,
            so attempt ``i`` (0-based) waits ``cooldown * backoff**i`` seconds.

    Returns:
        The decoded response body.

    Raises:
        ValueError: If ``retries`` is less than 1.
        aiohttp.ClientConnectionError: If the final attempt also fails.
    """
    # Reject a non-positive attempt count before opening a session; otherwise
    # the loop below never runs and the caller gets a misleading
    # "retry count exceeded" error although nothing was ever requested.
    if retries < 1:
        raise ValueError(f"retries must be at least 1, got {retries}")

    async with aiohttp.ClientSession() as session:
        for attempt in range(retries):
            try:
                async with session.get(
                    url, headers=self.session.headers
                ) as response:
                    return await response.text()
            except aiohttp.ClientConnectionError as e:
                # Out of attempts: surface the original connection error.
                if attempt == retries - 1:
                    raise
                logger.warning(
                    "Error fetching %s with attempt %d/%d: %s. Retrying...",
                    url,
                    attempt + 1,
                    retries,
                    e,
                )
                # Wait longer after each consecutive failure.
                await asyncio.sleep(cooldown * backoff**attempt)
    # Not reached when retries >= 1 (the loop returns or re-raises); kept so
    # the function never falls through and implicitly returns None.
    raise ValueError("retry count exceeded")
async def _fetch_with_rate_limit( async def _fetch_with_rate_limit(
self, url: str, semaphore: asyncio.Semaphore self, url: str, semaphore: asyncio.Semaphore
@ -103,7 +120,15 @@ class WebBaseLoader(BaseLoader):
for url in urls: for url in urls:
task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore)) task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
tasks.append(task) tasks.append(task)
return await asyncio.gather(*tasks) try:
from tqdm.asyncio import tqdm_asyncio
return await tqdm_asyncio.gather(
*tasks, desc="Fetching pages", ascii=True, mininterval=1
)
except ImportError:
warnings.warn("For better logging of progress, `pip install tqdm`")
return await asyncio.gather(*tasks)
@staticmethod @staticmethod
def _check_parser(parser: str) -> None: def _check_parser(parser: str) -> None:

10
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. # This file is automatically @generated by Poetry and should not be changed by hand.
[[package]] [[package]]
name = "absl-py" name = "absl-py"
@ -7391,7 +7391,7 @@ files = [
] ]
[package.dependencies] [package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
[package.extras] [package.extras]
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@ -9151,13 +9151,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"] cffi = ["cffi (>=1.11)"]
[extras] [extras]
all = ["aleph-alpha-client", "anthropic", "atlassian-python-api", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache"]
cohere = ["cohere"] cohere = ["cohere"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
openai = ["openai"] openai = ["openai"]
qdrant = ["qdrant-client"] qdrant = ["qdrant-client"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "8476b824f9682ed85d44929519054cc5951a6fe47067e8fc7e43a364f6486477" content-hash = "568b190c884e62df4e7bd897f402e3b6e61b24134af7f189f3d44b2ba5f00082"

View File

@ -17,6 +17,7 @@ SQLAlchemy = "^1"
requests = "^2" requests = "^2"
PyYAML = ">=5.4.1" PyYAML = ">=5.4.1"
numpy = "^1" numpy = "^1"
tqdm = {version = ">=4.48.0", optional = true}
openapi-schema-pydantic = "^1.2" openapi-schema-pydantic = "^1.2"
faiss-cpu = {version = "^1", optional = true} faiss-cpu = {version = "^1", optional = true}
wikipedia = {version = "^1", optional = true} wikipedia = {version = "^1", optional = true}