mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/obsidian (#920)
This commit is contained in:
parent
1e56879d38
commit
637c0d6508
94
docs/modules/document_loaders/examples/email.ipynb
Normal file
94
docs/modules/document_loaders/examples/email.ipynb
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9fdbd55d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Email\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to load email (`.eml`) files."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "40cd9806",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredEmailLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "2d20b852",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredEmailLoader('example_data/fake-email.eml')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "579fa702",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "90c1d899",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4ef9a5f4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,9 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>My First Heading</h1>
|
||||||
|
<p>My first paragraph.</p>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,20 @@
|
|||||||
|
MIME-Version: 1.0
|
||||||
|
Date: Fri, 16 Dec 2022 17:04:16 -0500
|
||||||
|
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
|
||||||
|
Subject: Test Email
|
||||||
|
From: Matthew Robinson <mrobinson@unstructured.io>
|
||||||
|
To: Matthew Robinson <mrobinson@unstructured.io>
|
||||||
|
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||||
|
|
||||||
|
--00000000000095c9b205eff92630
|
||||||
|
Content-Type: text/plain; charset="UTF-8"
|
||||||
|
This is a test email to use for unit tests.
|
||||||
|
Important points:
|
||||||
|
- Roses are red
|
||||||
|
- Violets are blue
|
||||||
|
--00000000000095c9b205eff92630
|
||||||
|
Content-Type: text/html; charset="UTF-8"
|
||||||
|
|
||||||
|
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
|
||||||
|
|
||||||
|
--00000000000095c9b205eff92630--
|
Binary file not shown.
BIN
docs/modules/document_loaders/examples/example_data/fake.docx
Normal file
BIN
docs/modules/document_loaders/examples/example_data/fake.docx
Normal file
Binary file not shown.
Binary file not shown.
94
docs/modules/document_loaders/examples/microsoft_word.ipynb
Normal file
94
docs/modules/document_loaders/examples/microsoft_word.ipynb
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "34c90eed",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Microsoft Word\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to load text from Microsoft Word documents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "28ded768",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredDocxLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f1f26035",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredDocxLoader('example_data/fake.docx')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "2c87dde9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "0e4a884c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "61953c83",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
66
docs/modules/document_loaders/examples/obsidian.ipynb
Normal file
66
docs/modules/document_loaders/examples/obsidian.ipynb
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1dc7df1d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Obsidian\n",
|
||||||
|
"This notebook covers how to load documents from an Obsidian database.\n",
|
||||||
|
"\n",
|
||||||
|
"Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "007c5cbf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import ObsidianLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a1caec59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = ObsidianLoader(\"<path-to-obsidian>\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1c30ff7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -17,6 +17,14 @@ There are a lot of different document loaders that LangChain supports. Below are
|
|||||||
|
|
||||||
`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file.
|
`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file.
|
||||||
|
|
||||||
|
`Email <./examples/email.html>`_: A walkthrough of how to load data from an email (``.eml``) file.
|
||||||
|
|
||||||
|
`GoogleDrive <./examples/googledrive.html>`_: A walkthrough of how to load data from Google Drive.
|
||||||
|
|
||||||
|
`Microsoft Word <./examples/microsoft_word.html>`_: A walkthrough of how to load data from Microsoft Word files.
|
||||||
|
|
||||||
|
`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:glob:
|
:glob:
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
"""All different types of document loaders."""
|
"""All different types of document loaders."""
|
||||||
|
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
|
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||||
|
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||||
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||||
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
||||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||||
|
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
@ -18,4 +21,7 @@ __all__ = [
|
|||||||
"UnstructuredHTMLLoader",
|
"UnstructuredHTMLLoader",
|
||||||
"UnstructuredPowerPointLoader",
|
"UnstructuredPowerPointLoader",
|
||||||
"UnstructuredPDFLoader",
|
"UnstructuredPDFLoader",
|
||||||
|
"ObsidianLoader",
|
||||||
|
"UnstructuredDocxLoader",
|
||||||
|
"UnstructuredEmailLoader",
|
||||||
]
|
]
|
||||||
|
29
langchain/document_loaders/docx.py
Normal file
29
langchain/document_loaders/docx.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that loads Microsoft Word files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredDocxLoader(BaseLoader):
    """Document loader for Microsoft Word files, backed by ``unstructured``."""

    def __init__(self, file_path: str):
        """Remember the file path after checking ``unstructured`` is importable.

        Args:
            file_path: Path of the Word file that ``load`` will read.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to verify availability; the name itself is unused.
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file and return its text as a single document.

        Returns:
            A one-element list. The document's content is the string form of
            each partitioned element joined by blank lines, and its metadata
            stores the file path under ``"source"``.
        """
        from unstructured.partition.docx import partition_docx

        pieces = (str(part) for part in partition_docx(filename=self.file_path))
        return [
            Document(
                page_content="\n\n".join(pieces),
                metadata={"source": self.file_path},
            )
        ]
|
29
langchain/document_loaders/email.py
Normal file
29
langchain/document_loaders/email.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that loads email files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredEmailLoader(BaseLoader):
    """Document loader for email files, backed by ``unstructured``."""

    def __init__(self, file_path: str):
        """Remember the file path after checking ``unstructured`` is importable.

        Args:
            file_path: Path of the email file that ``load`` will read.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to verify availability; the name itself is unused.
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the email and return its text as a single document.

        Returns:
            A one-element list. The document's content is the string form of
            each partitioned element joined by blank lines, and its metadata
            stores the file path under ``"source"``.
        """
        from unstructured.partition.email import partition_email

        rendered = []
        for part in partition_email(filename=self.file_path):
            rendered.append(str(part))
        body = "\n\n".join(rendered)
        return [Document(page_content=body, metadata={"source": self.file_path})]
|
25
langchain/document_loaders/obsidian.py
Normal file
25
langchain/document_loaders/obsidian.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
"""Loader that loads Obsidian directory dump."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class ObsidianLoader(BaseLoader):
    """Loader that loads Obsidian files from disk.

    Every ``*.md`` file found recursively under the given directory is read
    and returned as one ``Document``.
    """

    def __init__(self, path: str, encoding: str = "UTF-8"):
        """Initialize with path.

        Args:
            path: Directory that is searched recursively for ``*.md`` files.
            encoding: Text encoding used to read each file. Defaults to UTF-8
                so the result does not depend on the platform's locale.
        """
        self.file_path = path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            One ``Document`` per Markdown file, holding the file's full text
            as ``page_content`` and its path under the ``"source"`` metadata
            key.
        """
        docs = []
        for p in Path(self.file_path).glob("**/*.md"):
            # Decode explicitly: a bare open() falls back to the locale
            # encoding, which mis-reads or rejects UTF-8 notes on some systems.
            with open(p, encoding=self.encoding) as f:
                text = f.read()
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads PowerPoint files."""
|
"""Loader that loads PDF files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPDFLoader(BaseLoader):
|
class UnstructuredPDFLoader(BaseLoader):
|
||||||
"""Loader that uses unstructured to load PowerPoint files."""
|
"""Loader that uses unstructured to load PDF files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads PDF files."""
|
"""Loader that loads powerpoint files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPowerPointLoader(BaseLoader):
|
class UnstructuredPowerPointLoader(BaseLoader):
|
||||||
"""Loader that uses unstructured to load PDF files."""
|
"""Loader that uses unstructured to load powerpoint files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
|
Loading…
Reference in New Issue
Block a user