diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py
index 1b332849..95db7726 100644
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@@ -1,14 +1,23 @@
 """Loader that uses unstructured to load HTML files."""
+import logging
 from typing import List
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+logger = logging.getLogger(__name__)
+
 
 class UnstructuredURLLoader(BaseLoader):
     """Loader that uses unstructured to load HTML files."""
 
-    def __init__(self, urls: List[str]):
-        """Initialize with file path."""
+    def __init__(self, urls: List[str], continue_on_failure: bool = True):
+        """Initialize with the URLs to load.
+
+        Args:
+            urls: URLs of the HTML pages to load.
+            continue_on_failure: If True, log the error for a URL that fails
+                and move on to the next one; if False, re-raise it.
+        """
         try:
             import unstructured  # noqa:F401
@@ -18,6 +27,7 @@ class UnstructuredURLLoader(BaseLoader):
                 "`pip install unstructured`"
             )
         self.urls = urls
+        self.continue_on_failure = continue_on_failure
 
     def load(self) -> List[Document]:
         """Load file."""
@@ -25,7 +35,15 @@ class UnstructuredURLLoader(BaseLoader):
 
         docs: List[Document] = list()
         for url in self.urls:
-            elements = partition_html(url=url)
+            try:
+                elements = partition_html(url=url)
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.error(f"Error fetching or processing {url}, exception: {e}")
+                    # Skip this URL: `elements` is unbound (or stale from the
+                    # previous iteration), so no document can be built for it.
+                    continue
+                raise
             text = "\n\n".join([str(el) for el in elements])
             metadata = {"source": url}
             docs.append(Document(page_content=text, metadata=metadata))