Harrison/doc2txt (#3772)

Co-authored-by: rishni ratnam <rishniratnam@gmail.com>
1 year ago · c494ca3ad2
parent ce4fea983b
commit c494ca3ad2
3 changed files with 137 additions and 3 deletions
--- a/docs/modules/indexes/document_loaders/examples/word_document.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/word_document.ipynb
@ -10,9 +10,78 @@
    "This covers how to load Word documents into a document format that we can use downstream."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "9438686b",
+   "metadata": {},
+   "source": [
+    "## Using Docx2txt\n",
+    "\n",
+    "Load .docx using `Docx2txt` into a document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7b80ea89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import Docx2txtLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "99a12031",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = Docx2txtLoader(\"example_data/fake.docx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b92f68b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d83dd755",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d40727d",
+   "metadata": {},
+   "source": [
+    "## Using Unstructured"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
   "id": "721c48aa",
   "metadata": {},
   "outputs": [],
@ -129,7 +198,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -84,7 +84,10 @@ from langchain.document_loaders.url_playwright import PlaywrightURLLoader
 from langchain.document_loaders.url_selenium import SeleniumURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
-from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
+from langchain.document_loaders.word_document import (
+    Docx2txtLoader,
+    UnstructuredWordDocumentLoader,
+)
 from langchain.document_loaders.youtube import (
    GoogleApiClient,
    GoogleApiYoutubeLoader,
@ -174,4 +177,7 @@ __all__ = [
    "YoutubeLoader",
    "PyPDFDirectoryLoader",
    "MathpixPDFLoader",
+    "ChatGPTLoader",
+    "HuggingFaceDatasetLoader",
+    "Docx2txtLoader",
 ]
--- a/langchain/document_loaders/word_document.py
+++ b/langchain/document_loaders/word_document.py
@ -1,10 +1,69 @@
 """Loader that loads word documents."""
 import os
+import tempfile
+from abc import ABC
 from typing import List
+from urllib.parse import urlparse

+import requests
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader


+class Docx2txtLoader(BaseLoader, ABC):
+    """Load a DOCX file with docx2txt as a single document.
+
+    The path may be a local file or a URL. A URL is downloaded to a temporary
+    file that is used for loading; that file is closed (and, with tempfile's
+    default settings, deleted) when the loader is finalized.
+    """
+
+    def __init__(self, file_path: str):
+        """Initialize with a local file path or a URL.
+
+        Args:
+            file_path: Path to a local .docx file (a leading "~" is expanded)
+                or a URL from which the file is downloaded.
+
+        Raises:
+            ValueError: If the download does not return status code 200, or if
+                ``file_path`` is neither an existing file nor a valid URL.
+        """
+        # expanduser only rewrites a leading "~", so it is safe to call
+        # unconditionally; URLs and ordinary paths pass through unchanged.
+        self.file_path = os.path.expanduser(file_path)
+
+        # If the file is a web path, download it to a temporary file, and use that
+        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
+            r = requests.get(self.file_path)
+
+            if r.status_code != 200:
+                raise ValueError(
+                    "Check the url of your file; returned status code %s"
+                    % r.status_code
+                )
+
+            self.web_path = self.file_path
+            self.temp_file = tempfile.NamedTemporaryFile()
+            self.temp_file.write(r.content)
+            # load() reopens the file by name, so the buffered bytes must be
+            # pushed to disk first; otherwise a small download can still be
+            # sitting in the write buffer and the reader sees an empty file.
+            self.temp_file.flush()
+            self.file_path = self.temp_file.name
+        elif not os.path.isfile(self.file_path):
+            raise ValueError("File path %s is not a valid file or url" % self.file_path)
+
+    def __del__(self) -> None:
+        """Close the temporary download file, if one was created."""
+        # temp_file only exists when the web-path branch of __init__ ran.
+        if hasattr(self, "temp_file"):
+            self.temp_file.close()
+
+    def load(self) -> List[Document]:
+        """Load the file and return a list containing one Document.
+
+        The document's ``source`` metadata is ``self.file_path`` (the temporary
+        file name when the input was a URL).
+        """
+        # Imported lazily so docx2txt is only required when load() is called.
+        import docx2txt
+
+        return [
+            Document(
+                page_content=docx2txt.process(self.file_path),
+                metadata={"source": self.file_path},
+            )
+        ]
+
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        """Return True if ``url`` parses with both a scheme and a network location."""
+        parsed = urlparse(url)
+        return bool(parsed.netloc) and bool(parsed.scheme)
+
+
 class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load word documents."""