Harrison/doc2txt (#3772)

Co-authored-by: rishni ratnam <rishniratnam@gmail.com>
This commit is contained in:
Harrison Chase 2023-04-28 21:54:16 -07:00 committed by GitHub
parent ce4fea983b
commit c494ca3ad2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 137 additions and 3 deletions

View File

@ -10,9 +10,78 @@
"This covers how to load Word documents into a document format that we can use downstream."
]
},
{
"cell_type": "markdown",
"id": "9438686b",
"metadata": {},
"source": [
"## Using Docx2txt\n",
"\n",
"Load .docx using `Docx2txt` into a document."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "7b80ea89",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import Docx2txtLoader"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "99a12031",
"metadata": {},
"outputs": [],
"source": [
"loader = Docx2txtLoader(\"example_data/fake.docx\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b92f68b0",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d83dd755",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "markdown",
"id": "8d40727d",
"metadata": {},
"source": [
"## Using Unstructured"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "721c48aa",
"metadata": {},
"outputs": [],
@ -129,7 +198,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
"version": "3.9.1"
}
},
"nbformat": 4,

View File

@ -84,7 +84,10 @@ from langchain.document_loaders.url_playwright import PlaywrightURLLoader
from langchain.document_loaders.url_selenium import SeleniumURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
from langchain.document_loaders.word_document import (
Docx2txtLoader,
UnstructuredWordDocumentLoader,
)
from langchain.document_loaders.youtube import (
GoogleApiClient,
GoogleApiYoutubeLoader,
@ -174,4 +177,7 @@ __all__ = [
"YoutubeLoader",
"PyPDFDirectoryLoader",
"MathpixPDFLoader",
"ChatGPTLoader",
"HuggingFaceDatasetLoader",
"Docx2txtLoader",
]

View File

@ -1,10 +1,69 @@
"""Loader that loads word documents."""
import os
import tempfile
from abc import ABC
from typing import List
from urllib.parse import urlparse
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
class Docx2txtLoader(BaseLoader, ABC):
    """Load a DOCX file with ``docx2txt`` as a single document.

    The path is first treated as a local file. If no such file exists and the
    path parses as a URL, the content is downloaded into a temporary file,
    which is then used for loading. The temporary file is closed (and thereby
    deleted) when the loader is finalized.
    """

    def __init__(self, file_path: str):
        """Initialize with a local file path or a URL.

        Args:
            file_path: Local path (a leading ``~`` is expanded) or URL of the
                DOCX file to load.

        Raises:
            ValueError: If the download does not return status code 200, or
                if the path is neither an existing file nor a valid URL.
        """
        # expanduser() returns the path unchanged when there is no leading "~".
        self.file_path = os.path.expanduser(file_path)

        # An existing local file always takes precedence over URL handling.
        if os.path.isfile(self.file_path):
            return

        if not self._is_valid_url(self.file_path):
            raise ValueError(
                f"File path {self.file_path} is not a valid file or url"
            )

        # The path is a URL: download it to a temporary file and use that.
        r = requests.get(self.file_path)
        if r.status_code != 200:
            raise ValueError(
                f"Check the url of your file; returned status code {r.status_code}"
            )

        self.web_path = self.file_path
        self.temp_file = tempfile.NamedTemporaryFile()
        self.temp_file.write(r.content)
        # docx2txt reopens the file by name, so the buffered bytes must reach
        # the disk first; otherwise it may see a truncated (or empty) archive.
        self.temp_file.flush()
        self.file_path = self.temp_file.name

    def __del__(self) -> None:
        """Close the temporary download, if one was created."""
        # __init__ may have raised before temp_file was assigned.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def load(self) -> List[Document]:
        """Load the given path as a single page.

        Returns:
            A one-element list holding the extracted text. The ``source``
            metadata is the URL for downloaded files and the local path
            otherwise.
        """
        import docx2txt

        # Report the URL rather than the throwaway temp-file name, which is
        # meaningless once the loader has been finalized.
        source = getattr(self, "web_path", self.file_path)
        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": source},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True if *url* has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load word documents."""