Harrison/site map tqdm (#3184)

Co-authored-by: Tianyi Pan <60060750+tipani86@users.noreply.github.com>
Co-authored-by: Tianyi Pan <tianyi.pan@clobotics.com>
2023-04-19 20:48:47 -07:00 · 2023-04-19 20:48:47 -07:00 · f19b3890c9
commit f19b3890c9
parent e55db5841a
3 changed files with 35 additions and 9 deletions
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -1,6 +1,7 @@
 """Web base loader class."""
 import asyncio
 import logging
+import warnings
 from typing import Any, List, Optional, Union
 import aiohttp
@@ -85,10 +86,26 @@ class WebBaseLoader(BaseLoader):
            raise ValueError("Multiple webpaths found.")
        return self.web_paths[0]
-    async def _fetch(self, url: str) -> str:
+    async def _fetch(
+        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
+    ) -> str:
        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=self.session.headers) as response:
+            for i in range(retries):
-                return await response.text()
+                try:
+                    async with session.get(
+                        url, headers=self.session.headers
+                    ) as response:
+                        return await response.text()
+                except aiohttp.ClientConnectionError as e:
+                    if i == retries - 1:
+                        raise
+                    else:
+                        logger.warning(
+                            f"Error fetching {url} with attempt "
+                            f"{i + 1}/{retries}: {e}. Retrying..."
+                        )
+                        await asyncio.sleep(cooldown * backoff**i)
+        raise ValueError("retry count exceeded")
    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
@@ -103,7 +120,15 @@ class WebBaseLoader(BaseLoader):
        for url in urls:
            task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            tasks.append(task)
-        return await asyncio.gather(*tasks)
+        try:
+            from tqdm.asyncio import tqdm_asyncio
+            return await tqdm_asyncio.gather(
+                *tasks, desc="Fetching pages", ascii=True, mininterval=1
+            )
+        except ImportError:
+            warnings.warn("For better logging of progress, `pip install tqdm`")
+            return await asyncio.gather(*tasks)
    @staticmethod
    def _check_parser(parser: str) -> None:
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 [[package]]
 name = "absl-py"
@@ -7391,7 +7391,7 @@ files = [
 ]
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -9151,13 +9151,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 [extras]
-all = ["aleph-alpha-client", "anthropic", "atlassian-python-api", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "8476b824f9682ed85d44929519054cc5951a6fe47067e8fc7e43a364f6486477"
+content-hash = "568b190c884e62df4e7bd897f402e3b6e61b24134af7f189f3d44b2ba5f00082"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ SQLAlchemy = "^1"
 requests = "^2"
 PyYAML = ">=5.4.1"
 numpy = "^1"
+tqdm = {version = ">=4.48.0", optional = true}
 openapi-schema-pydantic = "^1.2"
 faiss-cpu = {version = "^1", optional = true}
 wikipedia = {version = "^1", optional = true}