Harrison/unstructured io (#1200)

This commit is contained in:
Harrison Chase 2023-02-20 22:54:49 -08:00 committed by GitHub
parent d90a287d8f
commit 5bdb8dd6fe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 10 deletions

View File

@ -225,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 5,
"id": "562769c6",
"metadata": {},
"outputs": [],

View File

@ -28,13 +28,17 @@ from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader
__all__ = [
"UnstructuredFileLoader",
"UnstructuredFileIOLoader",
"UnstructuredURLLoader",
"DirectoryLoader",
"NotionDirectoryLoader",

View File

@ -1,14 +1,15 @@
"""Loader that uses unstructured to load files."""
from typing import List
from abc import ABC, abstractmethod
from typing import IO, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class UnstructuredFileLoader(BaseLoader):
class UnstructuredBaseLoader(BaseLoader, ABC):
"""Loader that uses unstructured to load files."""
def __init__(self, file_path: str, mode: str = "single"):
def __init__(self, mode: str = "single"):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
@ -22,13 +23,15 @@ class UnstructuredFileLoader(BaseLoader):
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
self.file_path = file_path
self.mode = mode
@abstractmethod
def _get_elements(self) -> List:
from unstructured.partition.auto import partition
"""Get elements."""
return partition(filename=self.file_path)
@abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""
def load(self) -> List[Document]:
"""Load file."""
@ -36,7 +39,7 @@ class UnstructuredFileLoader(BaseLoader):
if self.mode == "elements":
docs: List[Document] = list()
for element in elements:
metadata = {"source": self.file_path}
metadata = self._get_metadata()
# NOTE(MthwRobinson) - the attribute check is for backward compatibility
# with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
if hasattr(element, "metadata"):
@ -45,9 +48,43 @@ class UnstructuredFileLoader(BaseLoader):
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "single":
metadata = {"source": self.file_path}
metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements])
docs = [Document(page_content=text, metadata=metadata)]
else:
raise ValueError(f"mode of {self.mode} not supported.")
return docs
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Unstructured-backed loader for a file identified by its path."""

    def __init__(self, file_path: str, mode: str = "single"):
        """Store the path of the file to load, then let the base class handle ``mode``."""
        # The path is recorded before delegating to the base initializer,
        # exactly as the element/metadata hooks below expect to find it.
        self.file_path = file_path
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` and return the resulting elements."""
        # Imported lazily so that ``unstructured`` is only needed when loading.
        from unstructured.partition.auto import partition

        elements = partition(filename=self.file_path)
        return elements

    def _get_metadata(self) -> dict:
        """Return a fresh metadata dict whose ``source`` is the file path."""
        metadata = {"source": self.file_path}
        return metadata
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Unstructured-backed loader for an already-open file object."""

    def __init__(self, file: IO, mode: str = "single"):
        """Store the file object to load, then let the base class handle ``mode``."""
        # The file object is recorded before delegating to the base initializer.
        self.file = file
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition the stored file object and return the resulting elements."""
        # Imported lazily so that ``unstructured`` is only needed when loading.
        from unstructured.partition.auto import partition

        elements = partition(file=self.file)
        return elements

    def _get_metadata(self) -> dict:
        """Return an empty metadata dict; this loader records no source."""
        metadata: dict = {}
        return metadata