UnstructuredURLLoader: allow url failures, keep processing (#1954)

By default, UnstructuredURLLoader now continues processing the remaining
`urls` if it encounters an error for a particular URL.

To make the entire loader fail on the first error, as was previously the
case, pass `continue_on_failure=False`.

E.g., with `continue_on_failure=False`, this fails splendidly, courtesy of the 2nd URL:

```
from langchain.document_loaders import UnstructuredURLLoader
urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://doesnotexistithinkprobablynotverynotlikely.io",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
]
loader = UnstructuredURLLoader(urls=urls, continue_on_failure=False)
data = loader.load()
```

Issue: https://github.com/hwchase17/langchain/issues/1939
This commit is contained in:
cragwolfe 2023-03-27 14:34:14 -07:00 committed by GitHub
parent 6598beacdb
commit 71e8eaff2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,14 +1,17 @@
"""Loader that uses unstructured to load HTML files.""" """Loader that uses unstructured to load HTML files."""
import logging
from typing import List from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__file__)
class UnstructuredURLLoader(BaseLoader): class UnstructuredURLLoader(BaseLoader):
"""Loader that uses unstructured to load HTML files.""" """Loader that uses unstructured to load HTML files."""
def __init__(self, urls: List[str]): def __init__(self, urls: List[str], continue_on_failure: bool = True):
"""Initialize with file path.""" """Initialize with file path."""
try: try:
import unstructured # noqa:F401 import unstructured # noqa:F401
@ -18,6 +21,7 @@ class UnstructuredURLLoader(BaseLoader):
"`pip install unstructured`" "`pip install unstructured`"
) )
self.urls = urls self.urls = urls
self.continue_on_failure = continue_on_failure
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load file.""" """Load file."""
@ -25,7 +29,13 @@ class UnstructuredURLLoader(BaseLoader):
docs: List[Document] = list() docs: List[Document] = list()
for url in self.urls: for url in self.urls:
try:
elements = partition_html(url=url) elements = partition_html(url=url)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching or processing {url}, exeption: {e}")
else:
raise e
text = "\n\n".join([str(el) for el in elements]) text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))