forked from Archives/langchain
Harrison/unstructured structured (#1004)
parent
bbb06ca4cf
commit
0998577dfe
@ -1,29 +1,13 @@
|
|||||||
"""Loader that loads Microsoft Word files."""
|
"""Loader that loads Microsoft Word files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredDocxLoader(BaseLoader):
|
class UnstructuredDocxLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load Microsoft Word files."""
|
"""Loader that uses unstructured to load Microsoft Word files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def _get_elements(self) -> List:
|
||||||
"""Initialize with file path."""
|
|
||||||
try:
|
|
||||||
import unstructured # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ValueError(
|
|
||||||
"unstructured package not found, please install it with "
|
|
||||||
"`pip install unstructured`"
|
|
||||||
)
|
|
||||||
self.file_path = file_path
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Load file."""
|
|
||||||
from unstructured.partition.docx import partition_docx
|
from unstructured.partition.docx import partition_docx
|
||||||
|
|
||||||
elements = partition_docx(filename=self.file_path)
|
return partition_docx(filename=self.file_path)
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": self.file_path}
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
|
||||||
|
@ -1,29 +1,13 @@
|
|||||||
"""Loader that loads email files."""
|
"""Loader that loads email files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredEmailLoader(BaseLoader):
|
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load email files."""
|
"""Loader that uses unstructured to load email files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def _get_elements(self) -> List:
|
||||||
"""Initialize with file path."""
|
|
||||||
try:
|
|
||||||
import unstructured # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ValueError(
|
|
||||||
"unstructured package not found, please install it with "
|
|
||||||
"`pip install unstructured`"
|
|
||||||
)
|
|
||||||
self.file_path = file_path
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Load file."""
|
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
elements = partition_email(filename=self.file_path)
|
return partition_email(filename=self.file_path)
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": self.file_path}
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
|
||||||
|
@ -1,29 +1,13 @@
|
|||||||
"""Loader that loads PDF files."""
|
"""Loader that loads PDF files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredHTMLLoader(BaseLoader):
|
class UnstructuredHTMLLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load HTML files."""
|
"""Loader that uses unstructured to load HTML files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def _get_elements(self) -> List:
|
||||||
"""Initialize with file path."""
|
|
||||||
try:
|
|
||||||
import unstructured # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ValueError(
|
|
||||||
"unstructured package not found, please install it with "
|
|
||||||
"`pip install unstructured`"
|
|
||||||
)
|
|
||||||
self.file_path = file_path
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Load file."""
|
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
elements = partition_html(filename=self.file_path)
|
return partition_html(filename=self.file_path)
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": self.file_path}
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
|
||||||
|
@ -1,29 +1,13 @@
|
|||||||
"""Loader that loads powerpoint files."""
|
"""Loader that loads powerpoint files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredPowerPointLoader(BaseLoader):
|
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load powerpoint files."""
|
"""Loader that uses unstructured to load powerpoint files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def _get_elements(self) -> List:
|
||||||
"""Initialize with file path."""
|
|
||||||
try:
|
|
||||||
import unstructured # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ValueError(
|
|
||||||
"unstructured package not found, please install it with "
|
|
||||||
"`pip install unstructured`"
|
|
||||||
)
|
|
||||||
self.file_path = file_path
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Load file."""
|
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
|
|
||||||
elements = partition_pptx(filename=self.file_path)
|
return partition_pptx(filename=self.file_path)
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": self.file_path}
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
|
||||||
|
Loading…
Reference in New Issue