IMPROVEMENT: `WebResearchRetriever` error handling for URLs with connection errors (#13401)

- **Description:** Added an `ignore_load_errors` flag to `AsyncHtmlLoader`, plus a
`_fetch_valid_connection_docs` helper that tests the connection for every URL and
skips any that raise a connection error. `WebResearchRetriever` now instantiates
the loader with `ignore_load_errors=True`, so a single unreachable URL no longer
aborts the whole load. A short sketch of the failure mode follows this list.
- **Issue:** [Previous PR](https://github.com/langchain-ai/langchain/pull/13353)
- **Dependencies:** None
- **Tag maintainer:** @efriis
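For context, here is the failure mode this PR addresses, sketched directly with
`requests` (the dead hostname is made up for illustration):

```python
import requests

try:
    requests.get("http://nonexistent.invalid", timeout=5)
except requests.ConnectionError as e:
    # Before this change, an error like this escaped AsyncHtmlLoader.load()
    # and took down the whole WebResearchRetriever indexing step.
    print(f"ConnectionError: {e}")
```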


```diff
@@ -51,6 +51,7 @@ class AsyncHtmlLoader(BaseLoader):
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
+        ignore_load_errors: bool = False,
     ):
         """Initialize with a webpage path."""
```
```diff
@@ -88,6 +89,17 @@ class AsyncHtmlLoader(BaseLoader):
         self.raise_for_status = raise_for_status
         self.autoset_encoding = autoset_encoding
         self.encoding = encoding
+        self.ignore_load_errors = ignore_load_errors
+
+    def _fetch_valid_connection_docs(self, url: str) -> Any:
+        if self.ignore_load_errors:
+            try:
+                return self.session.get(url, **self.requests_kwargs)
+            except Exception as e:
+                warnings.warn(str(e))
+                return None
+
+        return self.session.get(url, **self.requests_kwargs)

     @staticmethod
     def _check_parser(parser: str) -> None:
```
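A minimal standalone sketch of the new helper's behavior, assuming a
`requests.Session` like the loader's (the function name and the `timeout`
argument are illustrative, not part of the diff):

```python
import warnings
import requests

def fetch_or_none(session: requests.Session, url: str, ignore_load_errors: bool):
    """Mirror _fetch_valid_connection_docs: swallow request errors when asked."""
    if ignore_load_errors:
        try:
            return session.get(url, timeout=5)
        except Exception as e:
            warnings.warn(str(e))
            return None
    return session.get(url, timeout=5)

session = requests.Session()
print(fetch_or_none(session, "http://nonexistent.invalid", True))  # warns, None
```

Note the helper catches broad `Exception`, not just `requests.ConnectionError`,
so timeouts and other request failures are also swallowed when the flag is set.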
```diff
@@ -114,7 +126,10 @@ class AsyncHtmlLoader(BaseLoader):
         self._check_parser(parser)
-        html_doc = self.session.get(url, **self.requests_kwargs)
+        html_doc = self._fetch_valid_connection_docs(url)
+        if not getattr(html_doc, "ok", False):
+            return None
         if self.raise_for_status:
             html_doc.raise_for_status()
```
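The `getattr(html_doc, "ok", False)` guard matters because
`_fetch_valid_connection_docs` can now return `None`; `getattr` with a default
covers both cases without a separate `is None` check. A quick illustration:

```python
import requests

print(getattr(None, "ok", False))  # False: the fetch failed and returned None

resp = requests.models.Response()
resp.status_code = 200
print(getattr(resp, "ok", False))  # True: Response.ok is True for 2xx codes
```

Note this guard also skips non-2xx responses, not only failed connections.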
```diff
@@ -142,7 +157,10 @@ class AsyncHtmlLoader(BaseLoader):
                             text = ""
                         return text
                 except aiohttp.ClientConnectionError as e:
-                    if i == retries - 1:
+                    if i == retries - 1 and self.ignore_load_errors:
+                        logger.warning(f"Error fetching {url} after {retries} retries.")
+                        return ""
+                    elif i == retries - 1:
                         raise
                     else:
                         logger.warning(
```
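A sketch of the retry policy in this hunk, using the built-in `ConnectionError`
in place of `aiohttp.ClientConnectionError` so it runs standalone (the function
names are illustrative):

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

async def fetch_with_retries(fetch, url: str, retries: int,
                             ignore_load_errors: bool) -> str:
    for i in range(retries):
        try:
            return await fetch(url)
        except ConnectionError:
            if i == retries - 1 and ignore_load_errors:
                # Final attempt failed: log and hand back empty text.
                logger.warning(f"Error fetching {url} after {retries} retries.")
                return ""
            elif i == retries - 1:
                raise
            logger.warning(f"Retrying {url}, attempt {i + 1} failed.")
    return ""

async def always_down(url: str) -> str:
    raise ConnectionError(f"cannot reach {url}")

print(repr(asyncio.run(fetch_with_retries(always_down, "http://x.invalid", 3, True))))
# -> ''
```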
```diff
@@ -196,6 +214,8 @@ class AsyncHtmlLoader(BaseLoader):
         docs = []
         for i, text in enumerate(cast(List[str], results)):
             soup = self._scrape(self.web_paths[i])
+            if not soup:
+                continue
             metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
```
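This guard is what lets the loader degrade gracefully: `_scrape` now returns
`None` for a URL that failed to load, and the loop simply drops those entries
so no `Document` is built from them. A toy sketch of the same pattern (data is
made up):

```python
results = ["page one text", "page two text"]
soups = ["<soup for url 1>", None]  # second URL's scrape failed

docs = []
for text, soup in zip(results, soups):
    if not soup:
        continue  # failed URL contributes no document
    docs.append(text)

print(len(docs))  # 1
```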

```diff
@@ -198,7 +198,7 @@ class WebResearchRetriever(BaseRetriever):
             logger.info(f"New URLs to load: {new_urls}")
             # Load, split, and add new urls to vectorstore
             if new_urls:
-                loader = AsyncHtmlLoader(new_urls)
+                loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
                 html2text = Html2TextTransformer()
                 logger.info("Indexing new urls...")
                 docs = loader.load()
```
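A minimal sketch of the updated pipeline from this hunk, with one unreachable
URL mixed in (URLs are illustrative; import paths assume the langchain layout
at the time of this PR):

```python
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

new_urls = ["https://example.com", "http://nonexistent.invalid"]

# With ignore_load_errors=True the dead URL is warned about and skipped,
# instead of raising and aborting the whole indexing step.
loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
docs = loader.load()
docs = Html2TextTransformer().transform_documents(docs)
print(len(docs))  # only reachable URLs produce documents
```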
