You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_loaders/url.py

161 lines
5.9 KiB
Python

"""Loader that uses unstructured to load HTML files."""
import logging
from typing import Any, List
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class UnstructuredURLLoader(BaseLoader):
"""Load files from remote URLs using `Unstructured`.
Use the unstructured partition function to detect the MIME type
and route the file to the appropriate partitioner.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredURLLoader
loader = UnstructuredURLLoader(
urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
"""
def __init__(
self,
urls: List[str],
continue_on_failure: bool = True,
mode: str = "single",
show_progress_bar: bool = False,
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
from unstructured.__version__ import __version__ as __unstructured_version__
self.__version = __unstructured_version__
except ImportError:
raise ImportError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self._validate_mode(mode)
self.mode = mode
headers = unstructured_kwargs.pop("headers", {})
if len(headers.keys()) != 0:
warn_about_headers = False
if self.__is_non_html_available():
warn_about_headers = not self.__is_headers_available_for_non_html()
else:
warn_about_headers = not self.__is_headers_available_for_html()
if warn_about_headers:
logger.warning(
"You are using an old version of unstructured. "
"The headers parameter is ignored"
)
self.urls = urls
self.continue_on_failure = continue_on_failure
self.headers = headers
self.unstructured_kwargs = unstructured_kwargs
self.show_progress_bar = show_progress_bar
def _validate_mode(self, mode: str) -> None:
_valid_modes = {"single", "elements"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
def __is_headers_available_for_html(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
return unstructured_version >= (0, 5, 7)
def __is_headers_available_for_non_html(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
return unstructured_version >= (0, 5, 13)
def __is_non_html_available(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
return unstructured_version >= (0, 5, 12)
def load(self) -> List[Document]:
"""Load file."""
from unstructured.partition.auto import partition
from unstructured.partition.html import partition_html
docs: List[Document] = list()
if self.show_progress_bar:
try:
from tqdm import tqdm
except ImportError as e:
raise ImportError(
"Package tqdm must be installed if show_progress_bar=True. "
"Please install with 'pip install tqdm' or set "
"show_progress_bar=False."
) from e
urls = tqdm(self.urls)
else:
urls = self.urls
for url in urls:
try:
if self.__is_non_html_available():
if self.__is_headers_available_for_non_html():
elements = partition(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else:
elements = partition(url=url, **self.unstructured_kwargs)
else:
if self.__is_headers_available_for_html():
elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else:
elements = partition_html(url=url, **self.unstructured_kwargs)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching or processing {url}, exception: {e}")
continue
else:
raise e
if self.mode == "single":
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
elif self.mode == "elements":
for element in elements:
metadata = element.metadata.to_dict()
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
return docs