When encountering an error during fetch, return "" in web_base.py (#8753)

When downloading a sitemap that contains a malformed URL (e.g.
"ttp://example.com/index.html", with the "h" omitted at the beginning of
the URL), this change ensures that the sitemap download does not crash
but only emits a warning. (This could arguably be made opt-in via e.g. a
`skip_faulty_urls: bool = True` parameter, but this was the most
straightforward fix.)
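
A minimal usage sketch of the resulting behavior (the sitemap URL here is hypothetical; `SitemapLoader` gains the same flag in this commit):

```python
from langchain.document_loaders.sitemap import SitemapLoader

# With continue_on_failure=True, a malformed URL inside the sitemap
# (e.g. "ttp://example.com/index.html") is logged as a warning and
# fetched as "" instead of aborting the whole load.
loader = SitemapLoader(
    web_path="https://example.com/sitemap.xml",  # hypothetical URL
    continue_on_failure=True,
)
documents = loader.load()
```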

@rlancemartin, @eyurtsev
---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/8900/head
Oege Dijk authored 1 year ago, committed by GitHub
parent bbd22b9b76
commit cff52638b2

@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
)
documents = loader.load()
"""
""" # noqa: E501
base_url: str
"""Base url of the blackboard course."""
@@ -47,6 +47,7 @@ class BlackboardLoader(WebBaseLoader):
load_all_recursively: bool = True,
basic_auth: Optional[Tuple[str, str]] = None,
cookies: Optional[dict] = None,
continue_on_failure: bool = False,
):
"""Initialize with blackboard course url.
@@ -58,6 +59,10 @@ class BlackboardLoader(WebBaseLoader):
load_all_recursively: If True, load all documents recursively.
basic_auth: Basic auth credentials.
cookies: Cookies.
continue_on_failure: Whether to continue loading the course contents if an
error occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
Raises:
ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ class BlackboardLoader(WebBaseLoader):
cookies.update({"BbRouter": bbrouter})
self.session.cookies.update(cookies)
self.load_all_recursively = load_all_recursively
self.continue_on_failure = continue_on_failure
self.check_bs4()
def check_bs4(self) -> None:

@@ -19,6 +19,7 @@ class GitbookLoader(WebBaseLoader):
load_all_paths: bool = False,
base_url: Optional[str] = None,
content_selector: str = "main",
continue_on_failure: bool = False,
):
"""Initialize with web page and whether to load all paths.
@@ -31,6 +32,10 @@ class GitbookLoader(WebBaseLoader):
appended to this base url. Defaults to `web_page`.
content_selector: The CSS selector for the content to load.
Defaults to "main".
continue_on_failure: Whether to continue loading the site if an error
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ class GitbookLoader(WebBaseLoader):
super().__init__(web_paths)
self.load_all_paths = load_all_paths
self.content_selector = content_selector
self.continue_on_failure = continue_on_failure
def load(self) -> List[Document]:
"""Fetch text from one single GitBook page."""

@@ -33,6 +33,7 @@ class SitemapLoader(WebBaseLoader):
blocknum: int = 0,
meta_function: Optional[Callable] = None,
is_local: bool = False,
continue_on_failure: bool = False,
):
"""Initialize with webpage path and optional filter URLs.
@@ -48,6 +49,10 @@ class SitemapLoader(WebBaseLoader):
remember when setting this method to also copy metadata["loc"]
to metadata["source"] if you are using this field
is_local: whether the sitemap is a local file. Default: False
continue_on_failure: Whether to continue loading the sitemap if an error
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
"""
if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ class SitemapLoader(WebBaseLoader):
self.blocksize = blocksize
self.blocknum = blocknum
self.is_local = is_local
self.continue_on_failure = continue_on_failure
def parse_sitemap(self, soup: Any) -> List[dict]:
"""Parse sitemap xml and load into a list of dicts.

@@ -62,6 +62,7 @@ class WebBaseLoader(BaseLoader):
header_template: Optional[dict] = None,
verify_ssl: Optional[bool] = True,
proxies: Optional[dict] = None,
continue_on_failure: bool = False,
):
"""Initialize with webpage path."""
@@ -96,6 +97,7 @@ class WebBaseLoader(BaseLoader):
self.session = requests.Session()
self.session.headers = dict(headers)
self.session.verify = verify_ssl
self.continue_on_failure = continue_on_failure
if proxies:
self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ class WebBaseLoader(BaseLoader):
self, url: str, semaphore: asyncio.Semaphore
) -> str:
async with semaphore:
return await self._fetch(url)
try:
return await self._fetch(url)
except Exception as e:
if self.continue_on_failure:
logger.warning(
f"Error fetching {url}, skipping due to"
f" continue_on_failure=True"
)
return ""
logger.exception(
f"Error fetching {url} and aborting, use continue_on_failure=True "
"to continue loading urls after encountering an error."
)
raise e
async def fetch_all(self, urls: List[str]) -> Any:
"""Fetch all urls concurrently with rate limiting."""
