mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
UnstructuredURLLoader: allow url failures, keep processing (#1954)
By default, `UnstructuredURLLoader` now continues processing the remaining `urls` if it encounters an error for a particular URL. If failure of the entire loader is desired, as was previously the case, use `continue_on_failure=False`. E.g., this fails splendidly, courtesy of the 2nd url:

```
from langchain.document_loaders import UnstructuredURLLoader

urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://doesnotexistithinkprobablynotverynotlikely.io",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
]
loader = UnstructuredURLLoader(urls=urls, continue_on_failure=False)
data = loader.load()
```

Issue: https://github.com/hwchase17/langchain/issues/1939
This commit is contained in:
parent
6598beacdb
commit
71e8eaff2b
@ -1,14 +1,17 @@
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
class UnstructuredURLLoader(BaseLoader):
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
|
||||
def __init__(self, urls: List[str]):
|
||||
def __init__(self, urls: List[str], continue_on_failure: bool = True):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
@ -18,6 +21,7 @@ class UnstructuredURLLoader(BaseLoader):
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.urls = urls
|
||||
self.continue_on_failure = continue_on_failure
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
@ -25,7 +29,13 @@ class UnstructuredURLLoader(BaseLoader):
|
||||
|
||||
docs: List[Document] = list()
|
||||
for url in self.urls:
|
||||
try:
|
||||
elements = partition_html(url=url)
|
||||
except Exception as e:
|
||||
if self.continue_on_failure:
|
||||
logger.error(f"Error fetching or processing {url}, exeption: {e}")
|
||||
else:
|
||||
raise e
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
|
Loading…
Reference in New Issue
Block a user