mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
UnstructuredURLLoader: allow url failures, keep processing (#1954)
By default, `UnstructuredURLLoader` now continues processing the remaining `urls` if it encounters an error for a particular URL. If failure of the entire loader is desired, as was previously the case, pass `continue_on_failure=False`. E.g., this fails splendidly, courtesy of the 2nd URL:

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://doesnotexistithinkprobablynotverynotlikely.io",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
]
loader = UnstructuredURLLoader(urls=urls, continue_on_failure=False)
data = loader.load()
```

Issue: https://github.com/hwchase17/langchain/issues/1939
This commit is contained in:
parent
6598beacdb
commit
71e8eaff2b
@ -1,14 +1,17 @@
|
|||||||
"""Loader that uses unstructured to load HTML files."""
|
"""Loader that uses unstructured to load HTML files."""
|
||||||
|
import logging
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
logger = logging.getLogger(__file__)
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredURLLoader(BaseLoader):
|
class UnstructuredURLLoader(BaseLoader):
|
||||||
"""Loader that uses unstructured to load HTML files."""
|
"""Loader that uses unstructured to load HTML files."""
|
||||||
|
|
||||||
def __init__(self, urls: List[str]):
|
def __init__(self, urls: List[str], continue_on_failure: bool = True):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
import unstructured # noqa:F401
|
import unstructured # noqa:F401
|
||||||
@ -18,6 +21,7 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
"`pip install unstructured`"
|
"`pip install unstructured`"
|
||||||
)
|
)
|
||||||
self.urls = urls
|
self.urls = urls
|
||||||
|
self.continue_on_failure = continue_on_failure
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
@ -25,7 +29,13 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
|
|
||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
|
try:
|
||||||
elements = partition_html(url=url)
|
elements = partition_html(url=url)
|
||||||
|
except Exception as e:
|
||||||
|
if self.continue_on_failure:
|
||||||
|
logger.error(f"Error fetching or processing {url}, exeption: {e}")
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
text = "\n\n".join([str(el) for el in elements])
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
|
Loading…
Reference in New Issue
Block a user