From 0fb5f857f95001aeeab9accaba17def2e0c6a723 Mon Sep 17 00:00:00 2001 From: pedro-inf-custodio <113921389+pedro-inf-custodio@users.noreply.github.com> Date: Fri, 17 Nov 2023 22:02:26 +0000 Subject: [PATCH] IMPROVEMENT WebResearchRetriever error handling in urls with connection error (#13401) - **Description:** Added an `ignore_load_errors` option to `AsyncHtmlLoader`, together with a private helper method `_fetch_valid_connection_docs`. When the option is enabled, a URL whose request fails (for example with a connection error) is skipped with a warning instead of aborting the whole load. `WebResearchRetriever` now creates its loader with `ignore_load_errors=True`, so any URL in `new_urls` that cannot be fetched is dropped rather than raising. - **Issue:** [Previous PR](https://github.com/langchain-ai/langchain/pull/13353), - **Dependencies:** None, - **Tag maintainer:** @efriis Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. 
--- .../langchain/document_loaders/async_html.py | 24 +++++++++++++++++-- .../langchain/retrievers/web_research.py | 2 +- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/async_html.py b/libs/langchain/langchain/document_loaders/async_html.py index 877a9c41a9..d7ae3f2f90 100644 --- a/libs/langchain/langchain/document_loaders/async_html.py +++ b/libs/langchain/langchain/document_loaders/async_html.py @@ -51,6 +51,7 @@ class AsyncHtmlLoader(BaseLoader): requests_per_second: int = 2, requests_kwargs: Optional[Dict[str, Any]] = None, raise_for_status: bool = False, + ignore_load_errors: bool = False, ): """Initialize with a webpage path.""" @@ -88,6 +89,17 @@ class AsyncHtmlLoader(BaseLoader): self.raise_for_status = raise_for_status self.autoset_encoding = autoset_encoding self.encoding = encoding + self.ignore_load_errors = ignore_load_errors + + def _fetch_valid_connection_docs(self, url: str) -> Any: + if self.ignore_load_errors: + try: + return self.session.get(url, **self.requests_kwargs) + except Exception as e: + warnings.warn(str(e)) + return None + + return self.session.get(url, **self.requests_kwargs) @staticmethod def _check_parser(parser: str) -> None: @@ -114,7 +126,10 @@ class AsyncHtmlLoader(BaseLoader): self._check_parser(parser) - html_doc = self.session.get(url, **self.requests_kwargs) + html_doc = self._fetch_valid_connection_docs(url) + if not getattr(html_doc, "ok", False): + return None + if self.raise_for_status: html_doc.raise_for_status() @@ -142,7 +157,10 @@ class AsyncHtmlLoader(BaseLoader): text = "" return text except aiohttp.ClientConnectionError as e: - if i == retries - 1: + if i == retries - 1 and self.ignore_load_errors: + logger.warning(f"Error fetching {url} after {retries} retries.") + return "" + elif i == retries - 1: raise else: logger.warning( @@ -196,6 +214,8 @@ class AsyncHtmlLoader(BaseLoader): docs = [] for i, text in enumerate(cast(List[str], results)): soup = 
self._scrape(self.web_paths[i]) + if not soup: + continue metadata = _build_metadata(soup, self.web_paths[i]) docs.append(Document(page_content=text, metadata=metadata)) diff --git a/libs/langchain/langchain/retrievers/web_research.py b/libs/langchain/langchain/retrievers/web_research.py index ac8ecb6c75..73b822dd96 100644 --- a/libs/langchain/langchain/retrievers/web_research.py +++ b/libs/langchain/langchain/retrievers/web_research.py @@ -198,7 +198,7 @@ class WebResearchRetriever(BaseRetriever): logger.info(f"New URLs to load: {new_urls}") # Load, split, and add new urls to vectorstore if new_urls: - loader = AsyncHtmlLoader(new_urls) + loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True) html2text = Html2TextTransformer() logger.info("Indexing new urls...") docs = loader.load()