diff --git a/docs/modules/indexes/document_loaders/examples/word_document.ipynb b/docs/modules/indexes/document_loaders/examples/word_document.ipynb index b7a49450..38621f06 100644 --- a/docs/modules/indexes/document_loaders/examples/word_document.ipynb +++ b/docs/modules/indexes/document_loaders/examples/word_document.ipynb @@ -10,9 +10,78 @@ "This covers how to load Word documents into a document format that we can use downstream." ] }, + { + "cell_type": "markdown", + "id": "9438686b", + "metadata": {}, + "source": [ + "## Using Docx2txt\n", + "\n", + "Load .docx using `Docx2txt` into a document." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7b80ea89", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import Docx2txtLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "99a12031", + "metadata": {}, + "outputs": [], + "source": [ + "loader = Docx2txtLoader(\"example_data/fake.docx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b92f68b0", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d83dd755", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "id": "8d40727d", + "metadata": {}, + "source": [ + "## Using Unstructured" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "721c48aa", "metadata": {}, "outputs": [], @@ -129,7 +198,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py 
class Docx2txtLoader(BaseLoader, ABC):
    """Load a DOCX file with docx2txt as a single document.

    ``file_path`` is first treated as a local path (a leading ``~`` is
    expanded). If no such file exists and the value parses as a URL, the
    content is downloaded into a temporary file, which is then used for
    loading and closed again when the loader is finalized.
    """

    def __init__(self, file_path: str):
        """Initialize with a local file path or a URL.

        Args:
            file_path: Path to a local .docx file, or a URL to fetch one from.

        Raises:
            ValueError: If the download does not answer with status code 200,
                or if ``file_path`` is neither an existing file nor a URL.
        """
        # expanduser() only rewrites a leading "~"; any other path is returned
        # unchanged, so no explicit "~" check is needed.
        self.file_path = os.path.expanduser(file_path)

        if os.path.isfile(self.file_path):
            return

        if not self._is_valid_url(self.file_path):
            raise ValueError(
                f"File path {self.file_path} is not a valid file or url"
            )

        # Not a local file but a URL: download it to a temporary file.
        r = requests.get(self.file_path)
        if r.status_code != 200:
            raise ValueError(
                "Check the url of your file; returned status code "
                f"{r.status_code}"
            )

        self.web_path = self.file_path
        self.temp_file = tempfile.NamedTemporaryFile()
        self.temp_file.write(r.content)
        # docx2txt reopens the file by name in load(); flush so the bytes are
        # actually on disk instead of sitting in this handle's write buffer.
        self.temp_file.flush()
        self.file_path = self.temp_file.name

    def __del__(self) -> None:
        """Close the temporary download file, if one was created."""
        # temp_file only exists when the source was a URL and the download
        # succeeded; __init__ may also have raised before setting it.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def load(self) -> List[Document]:
        """Load given path as single page.

        Returns:
            A list holding one Document whose ``page_content`` is the text
            extracted by docx2txt and whose ``source`` metadata is the path
            that was read (the temporary file's path for downloaded files).
        """
        # Imported lazily so docx2txt is only required when this loader is used.
        import docx2txt

        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid, i.e. has both a scheme and a netloc."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)