IMPROVEMENT: `WebResearchRetriever` error handling for URLs with connection errors (#13401)

- **Description:** Added an `ignore_load_errors` flag to `AsyncHtmlLoader`, plus a
`_fetch_valid_connection_docs` helper that tests the connection for every URL and
skips any that raise a connection error. `WebResearchRetriever` now instantiates
the loader with `ignore_load_errors=True`, so a single unreachable URL no longer
aborts the whole load. A short sketch of the failure mode follows this list.
- **Issue:** [Previous PR](https://github.com/langchain-ai/langchain/pull/13353)
- **Dependencies:** None
- **Tag maintainer:** @efriis
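For context, here is the failure mode this PR addresses, sketched directly with
`requests` (the dead hostname is made up for illustration):

```python
import requests

try:
    requests.get("http://nonexistent.invalid", timeout=5)
except requests.ConnectionError as e:
    # Before this change, an error like this escaped AsyncHtmlLoader.load()
    # and took down the whole WebResearchRetriever indexing step.
    print(f"ConnectionError: {e}")
```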


```diff
@@ -51,6 +51,7 @@ class AsyncHtmlLoader(BaseLoader):
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
+        ignore_load_errors: bool = False,
     ):
         """Initialize with a webpage path."""
```
```diff
@@ -88,6 +89,17 @@ class AsyncHtmlLoader(BaseLoader):
         self.raise_for_status = raise_for_status
         self.autoset_encoding = autoset_encoding
         self.encoding = encoding
+        self.ignore_load_errors = ignore_load_errors
+
+    def _fetch_valid_connection_docs(self, url: str) -> Any:
+        if self.ignore_load_errors:
+            try:
+                return self.session.get(url, **self.requests_kwargs)
+            except Exception as e:
+                warnings.warn(str(e))
+                return None
+
+        return self.session.get(url, **self.requests_kwargs)

     @staticmethod
     def _check_parser(parser: str) -> None:
```
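A minimal standalone sketch of the new helper's behavior, assuming a
`requests.Session` like the loader's (the function name and the `timeout`
argument are illustrative, not part of the diff):

```python
import warnings
import requests

def fetch_or_none(session: requests.Session, url: str, ignore_load_errors: bool):
    """Mirror _fetch_valid_connection_docs: swallow request errors when asked."""
    if ignore_load_errors:
        try:
            return session.get(url, timeout=5)
        except Exception as e:
            warnings.warn(str(e))
            return None
    return session.get(url, timeout=5)

session = requests.Session()
print(fetch_or_none(session, "http://nonexistent.invalid", True))  # warns, None
```

Note the helper catches broad `Exception`, not just `requests.ConnectionError`,
so timeouts and other request failures are also swallowed when the flag is set.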
```diff
@@ -114,7 +126,10 @@ class AsyncHtmlLoader(BaseLoader):
         self._check_parser(parser)
-        html_doc = self.session.get(url, **self.requests_kwargs)
+        html_doc = self._fetch_valid_connection_docs(url)
+        if not getattr(html_doc, "ok", False):
+            return None
         if self.raise_for_status:
             html_doc.raise_for_status()
```
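The `getattr(html_doc, "ok", False)` guard matters because
`_fetch_valid_connection_docs` can now return `None`; `getattr` with a default
covers both cases without a separate `is None` check. A quick illustration:

```python
import requests

print(getattr(None, "ok", False))  # False: the fetch failed and returned None

resp = requests.models.Response()
resp.status_code = 200
print(getattr(resp, "ok", False))  # True: Response.ok is True for 2xx codes
```

Note this guard also skips non-2xx responses, not only failed connections.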
```diff
@@ -142,7 +157,10 @@ class AsyncHtmlLoader(BaseLoader):
                             text = ""
                         return text
                 except aiohttp.ClientConnectionError as e:
-                    if i == retries - 1:
+                    if i == retries - 1 and self.ignore_load_errors:
+                        logger.warning(f"Error fetching {url} after {retries} retries.")
+                        return ""
+                    elif i == retries - 1:
                         raise
                     else:
                         logger.warning(
```
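A sketch of the retry policy in this hunk, using the built-in `ConnectionError`
in place of `aiohttp.ClientConnectionError` so it runs standalone (the function
names are illustrative):

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

async def fetch_with_retries(fetch, url: str, retries: int,
                             ignore_load_errors: bool) -> str:
    for i in range(retries):
        try:
            return await fetch(url)
        except ConnectionError:
            if i == retries - 1 and ignore_load_errors:
                # Final attempt failed: log and hand back empty text.
                logger.warning(f"Error fetching {url} after {retries} retries.")
                return ""
            elif i == retries - 1:
                raise
            logger.warning(f"Retrying {url}, attempt {i + 1} failed.")
    return ""

async def always_down(url: str) -> str:
    raise ConnectionError(f"cannot reach {url}")

print(repr(asyncio.run(fetch_with_retries(always_down, "http://x.invalid", 3, True))))
# -> ''
```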
```diff
@@ -196,6 +214,8 @@ class AsyncHtmlLoader(BaseLoader):
         docs = []
         for i, text in enumerate(cast(List[str], results)):
             soup = self._scrape(self.web_paths[i])
+            if not soup:
+                continue
             metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
```
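This guard is what lets the loader degrade gracefully: `_scrape` now returns
`None` for a URL that failed to load, and the loop simply drops those entries
so no `Document` is built from them. A toy sketch of the same pattern (data is
made up):

```python
results = ["page one text", "page two text"]
soups = ["<soup for url 1>", None]  # second URL's scrape failed

docs = []
for text, soup in zip(results, soups):
    if not soup:
        continue  # failed URL contributes no document
    docs.append(text)

print(len(docs))  # 1
```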

```diff
@@ -198,7 +198,7 @@ class WebResearchRetriever(BaseRetriever):
             logger.info(f"New URLs to load: {new_urls}")
             # Load, split, and add new urls to vectorstore
             if new_urls:
-                loader = AsyncHtmlLoader(new_urls)
+                loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
                 html2text = Html2TextTransformer()
                 logger.info("Indexing new urls...")
                 docs = loader.load()
```
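A minimal sketch of the updated pipeline from this hunk, with one unreachable
URL mixed in (URLs are illustrative; import paths assume the langchain layout
at the time of this PR):

```python
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

new_urls = ["https://example.com", "http://nonexistent.invalid"]

# With ignore_load_errors=True the dead URL is warned about and skipped,
# instead of raising and aborting the whole indexing step.
loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
docs = loader.load()
docs = Html2TextTransformer().transform_documents(docs)
print(len(docs))  # only reachable URLs produce documents
```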
