diff --git a/docs/modules/indexes/chain_examples/chat_vector_db.ipynb b/docs/modules/indexes/chain_examples/chat_vector_db.ipynb index 9f2158eb..2df1bc9e 100644 --- a/docs/modules/indexes/chain_examples/chat_vector_db.ipynb +++ b/docs/modules/indexes/chain_examples/chat_vector_db.ipynb @@ -225,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "id": "562769c6", "metadata": {}, "outputs": [], diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index c54838ec..2ca0f1a9 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -28,13 +28,17 @@ from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader -from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.document_loaders.unstructured import ( + UnstructuredFileIOLoader, + UnstructuredFileLoader, +) from langchain.document_loaders.url import UnstructuredURLLoader from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.youtube import YoutubeLoader __all__ = [ "UnstructuredFileLoader", + "UnstructuredFileIOLoader", "UnstructuredURLLoader", "DirectoryLoader", "NotionDirectoryLoader", diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 97137e08..614ff849 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -1,14 +1,15 @@ """Loader that uses unstructured to load files.""" -from typing import List +from abc import ABC, abstractmethod +from typing import IO, List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader -class UnstructuredFileLoader(BaseLoader): +class UnstructuredBaseLoader(BaseLoader, ABC): 
"""Loader that uses unstructured to load files.""" - def __init__(self, file_path: str, mode: str = "single"): + def __init__(self, mode: str = "single"): """Initialize with file path.""" try: import unstructured # noqa:F401 @@ -22,13 +23,15 @@ class UnstructuredFileLoader(BaseLoader): raise ValueError( f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" ) - self.file_path = file_path self.mode = mode + @abstractmethod def _get_elements(self) -> List: - from unstructured.partition.auto import partition + """Get elements.""" - return partition(filename=self.file_path) + @abstractmethod + def _get_metadata(self) -> dict: + """Get metadata.""" def load(self) -> List[Document]: """Load file.""" @@ -36,7 +39,7 @@ class UnstructuredFileLoader(BaseLoader): if self.mode == "elements": docs: List[Document] = list() for element in elements: - metadata = {"source": self.file_path} + metadata = self._get_metadata() # NOTE(MthwRobinson) - the attribute check is for backward compatibility # with unstructured<0.4.9. The metadata attributed was added in 0.4.9. 
if hasattr(element, "metadata"): @@ -45,9 +48,43 @@ class UnstructuredFileLoader(BaseLoader): metadata["category"] = element.category docs.append(Document(page_content=str(element), metadata=metadata)) elif self.mode == "single": - metadata = {"source": self.file_path} + metadata = self._get_metadata() text = "\n\n".join([str(el) for el in elements]) docs = [Document(page_content=text, metadata=metadata)] else: raise ValueError(f"mode of {self.mode} not supported.") return docs + + +class UnstructuredFileLoader(UnstructuredBaseLoader): + """Loader that uses unstructured to load files.""" + + def __init__(self, file_path: str, mode: str = "single"): + """Initialize with file path.""" + self.file_path = file_path + super().__init__(mode=mode) + + def _get_elements(self) -> List: + from unstructured.partition.auto import partition + + return partition(filename=self.file_path) + + def _get_metadata(self) -> dict: + return {"source": self.file_path} + + +class UnstructuredFileIOLoader(UnstructuredBaseLoader): + """Loader that uses unstructured to load file IO objects.""" + + def __init__(self, file: IO, mode: str = "single"): + """Initialize with file object.""" + self.file = file + super().__init__(mode=mode) + + def _get_elements(self) -> List: + from unstructured.partition.auto import partition + + return partition(file=self.file) + + def _get_metadata(self) -> dict: + return {}