Harrison/doc2txt (#3772)

Co-authored-by: rishni ratnam <rishniratnam@gmail.com>
This commit is contained in:
Harrison Chase 2023-04-28 21:54:16 -07:00 committed by GitHub
parent ce4fea983b
commit c494ca3ad2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 137 additions and 3 deletions

View File

@ -10,9 +10,78 @@
"This covers how to load Word documents into a document format that we can use downstream." "This covers how to load Word documents into a document format that we can use downstream."
] ]
}, },
{
"cell_type": "markdown",
"id": "9438686b",
"metadata": {},
"source": [
"## Using Docx2txt\n",
"\n",
"Load .docx using `Docx2txt` into a document."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 3,
"id": "7b80ea89",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import Docx2txtLoader"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "99a12031",
"metadata": {},
"outputs": [],
"source": [
"loader = Docx2txtLoader(\"example_data/fake.docx\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b92f68b0",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d83dd755",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "markdown",
"id": "8d40727d",
"metadata": {},
"source": [
"## Using Unstructured"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "721c48aa", "id": "721c48aa",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -129,7 +198,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.13" "version": "3.9.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -84,7 +84,10 @@ from langchain.document_loaders.url_playwright import PlaywrightURLLoader
from langchain.document_loaders.url_selenium import SeleniumURLLoader from langchain.document_loaders.url_selenium import SeleniumURLLoader
from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader from langchain.document_loaders.word_document import (
Docx2txtLoader,
UnstructuredWordDocumentLoader,
)
from langchain.document_loaders.youtube import ( from langchain.document_loaders.youtube import (
GoogleApiClient, GoogleApiClient,
GoogleApiYoutubeLoader, GoogleApiYoutubeLoader,
@ -174,4 +177,7 @@ __all__ = [
"YoutubeLoader", "YoutubeLoader",
"PyPDFDirectoryLoader", "PyPDFDirectoryLoader",
"MathpixPDFLoader", "MathpixPDFLoader",
"ChatGPTLoader",
"HuggingFaceDatasetLoader",
"Docx2txtLoader",
] ]

View File

@ -1,10 +1,69 @@
"""Loader that loads word documents.""" """Loader that loads word documents."""
import os import os
import tempfile
from abc import ABC
from typing import List from typing import List
from urllib.parse import urlparse
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
class Docx2txtLoader(BaseLoader, ABC):
    """Load a DOCX file with docx2txt as a single document.

    The path is first checked as a local file. If no such file exists and the
    path parses as a URL, the file is downloaded into a temporary file, that
    temporary file is used for loading, and it is closed when the loader is
    destroyed.
    """

    def __init__(self, file_path: str):
        """Initialize with a local file path or a URL.

        Args:
            file_path: Path to a local .docx file (a leading ``~`` is
                expanded) or a URL to download the file from.

        Raises:
            ValueError: If the download does not return status code 200, or
                if the path is neither an existing file nor a URL.
        """
        self.file_path = file_path
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s"
                    % r.status_code
                )

            self.web_path = self.file_path
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # load() reopens the file by name, so the buffered bytes must be
            # pushed to disk first; otherwise the reader may see an empty or
            # truncated document.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        """Close the temporary download file, if one was created."""
        # temp_file only exists when the loader was built from a URL.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def load(self) -> List[Document]:
        """Load the file as a single document.

        Returns:
            A one-element list whose Document holds the text extracted by
            docx2txt, with ``metadata["source"]`` set to the path that was
            read (the temporary file path when loaded from a URL).
        """
        # Imported lazily so docx2txt is only needed when load() is called.
        import docx2txt

        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True if the url has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader): class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load word documents.""" """Loader that uses unstructured to load word documents."""