forked from Archives/langchain
Harrison/unstructured support (#903)
parent
2a68be3e8d
commit
53d56d7650
@ -0,0 +1,29 @@
|
|||||||
|
Document Loaders
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Combining language models with your own text data is a powerful way to differentiate them.
|
||||||
|
The first step in doing this is to load the data into "documents" - a fancy way of saying some pieces of text.
|
||||||
|
This module is aimed at making this easy.
|
||||||
|
|
||||||
|
A primary driver of a lot of this is the `Unstructured <https://github.com/Unstructured-IO/unstructured>`_ python package.
|
||||||
|
This package is a great way to transform all types of files - text, powerpoint, images, html, pdf, etc - into text data.
|
||||||
|
|
||||||
|
For detailed instructions on how to get set up with Unstructured, see installation guidelines `here <https://github.com/Unstructured-IO/unstructured#coffee-getting-started>`_.
|
||||||
|
|
||||||
|
The following sections of documentation are provided:
|
||||||
|
|
||||||
|
- `Key Concepts <./document_loaders/key_concepts.html>`_: A conceptual guide going over the various concepts related to loading documents.
|
||||||
|
|
||||||
|
- `How-To Guides <./document_loaders/how_to_guides.html>`_: A collection of how-to guides. These highlight different types of loaders.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Document Loaders
|
||||||
|
:name: Document Loaders
|
||||||
|
:hidden:
|
||||||
|
|
||||||
|
./document_loaders/key_concepts.md
|
||||||
|
./document_loaders/how_to_guides.rst
|
@ -0,0 +1,101 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "79f24a6b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Directory Loader\n",
|
||||||
|
"This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredFileLoader](./unstructured_file.ipynb)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "019d8520",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import DirectoryLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0c76cdc5",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can use the `glob` parameter to control which files to load. Note that here it doesn't load the `.rst` file or the `.ipynb` files."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "891fe56f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DirectoryLoader('../', glob=\"**/*.md\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "addfe9cf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "b042086d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(docs)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "cbc8256b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
Binary file not shown.
After Width: | Height: | Size: 23 KiB |
Binary file not shown.
After Width: | Height: | Size: 590 KiB |
@ -0,0 +1,86 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1dc7df1d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Notion\n",
|
||||||
|
"This notebook covers how to load documents from a Notion database dump.\n",
|
||||||
|
"\n",
|
||||||
|
"In order to get this notion dump, follow these instructions:\n",
|
||||||
|
"\n",
|
||||||
|
"## 🧑 Instructions for ingesting your own dataset\n",
|
||||||
|
"\n",
|
||||||
|
"Export your dataset from Notion. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n",
|
||||||
|
"\n",
|
||||||
|
"<img src=\"export_notion.png\" alt=\"export\" width=\"200\"/>\n",
|
||||||
|
"\n",
|
||||||
|
"When exporting, make sure to select the `Markdown & CSV` format option.\n",
|
||||||
|
"\n",
|
||||||
|
"<img src=\"export_format.png\" alt=\"export-format\" width=\"200\"/>\n",
|
||||||
|
"\n",
|
||||||
|
"This will produce a `.zip` file in your Downloads folder. Move the `.zip` file into this repository.\n",
|
||||||
|
"\n",
|
||||||
|
"Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed).\n",
|
||||||
|
"\n",
|
||||||
|
"```shell\n",
|
||||||
|
"unzip Export-d3adfe0f-3131-4bf3-8987-a52017fc1bae.zip -d Notion_DB\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Run the following command to ingest the data."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "007c5cbf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import NotionDirectoryLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a1caec59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = NotionDirectoryLoader(\"Notion_DB\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1c30ff7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,78 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "17812129",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# ReadTheDocs Documentation\n",
|
||||||
|
"This notebook covers how to load content from html that was generated as part of a Read-The-Docs build.\n",
|
||||||
|
"\n",
|
||||||
|
"For an example of this in the wild, see [here](https://github.com/hwchase17/chat-langchain).\n",
|
||||||
|
"\n",
|
||||||
|
"This assumes that the html has already been scraped into a folder. This can be done by uncommenting and running the following command"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "84696e27",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#!wget -r -A.html -P rtdocs https://langchain.readthedocs.io/en/latest/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "92dd950b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import ReadTheDocsLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "494567c3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = ReadTheDocsLoader(\"rtdocs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e2e6d6f0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "20deed05",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Unstructured File Loader\n",
|
||||||
|
"This notebook covers how to use Unstructured to load files of many types. Unstructured currently supports loading of text files, powerpoints, html, pdfs, images, and more."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "79d3e549",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredFileLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "2593d1dc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "fe34e941",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "24e577e5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,19 @@
|
|||||||
|
How To Guides
|
||||||
|
====================================
|
||||||
|
|
||||||
|
There are a lot of different document loaders that LangChain supports. Below are how-to guides for working with them.
|
||||||
|
|
||||||
|
`File Loader <./examples/unstructured_file.html>`_: A walkthrough of how to use Unstructured to load files of arbitrary types (pdfs, txt, html, etc).
|
||||||
|
|
||||||
|
`Directory Loader <./examples/directory_loader.html>`_: A walkthrough of how to use Unstructured to load files from a given directory.
|
||||||
|
|
||||||
|
`Notion <./examples/notion.html>`_: A walkthrough of how to load data from an arbitrary Notion DB.
|
||||||
|
|
||||||
|
`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:glob:
|
||||||
|
:hidden:
|
||||||
|
|
||||||
|
examples/*
|
@ -0,0 +1,12 @@
|
|||||||
|
# Key Concepts
|
||||||
|
|
||||||
|
## Document
|
||||||
|
This class is a container for document information. It contains two parts:
|
||||||
|
- `page_content`: The content of the actual page itself.
|
||||||
|
- `metadata`: The metadata associated with the document. This can be things like the file path, the url, etc.
|
||||||
|
|
||||||
|
## Loader
|
||||||
|
This base class is a way to load documents. It exposes a `load` method that returns `Document` objects.
|
||||||
|
|
||||||
|
## [Unstructured](https://github.com/Unstructured-IO/unstructured)
|
||||||
|
Unstructured is a python package specifically focused on transformations from raw documents to text.
|
@ -0,0 +1,13 @@
|
|||||||
|
"""All different types of document loaders."""
|
||||||
|
|
||||||
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
|
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||||
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
# Names re-exported by the ``langchain.document_loaders`` package.
__all__ = [
    "UnstructuredFileLoader",
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "ReadTheDocsLoader",
]
|
@ -0,0 +1,26 @@
|
|||||||
|
"""Base loader class."""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
|
|
||||||
|
|
||||||
|
class BaseLoader(ABC):
    """Abstract interface implemented by every document loader."""

    @abstractmethod
    def load(self) -> List[Document]:
        """Load data into document objects."""

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load documents and split them into chunks.

        Args:
            text_splitter: Splitter applied to the loaded documents. When
                ``None``, a ``RecursiveCharacterTextSplitter()`` built with
                its default arguments is used instead.

        Returns:
            The result of calling ``split_documents`` on the output of
            ``self.load()``.
        """
        splitter = text_splitter
        if splitter is None:
            # Only an explicit ``None`` triggers the fallback splitter.
            splitter = RecursiveCharacterTextSplitter()
        return splitter.split_documents(self.load())
|
@ -0,0 +1,26 @@
|
|||||||
|
"""Loading logic for loading documents from a directory."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
|
class DirectoryLoader(BaseLoader):
    """Load every file under a directory that matches a glob pattern."""

    def __init__(self, path: str, glob: str = "**/*"):
        """Store the directory path and the glob pattern used to search it."""
        self.path = path
        self.glob = glob

    def load(self) -> List[Document]:
        """Load documents.

        Every path matched by ``self.glob`` under ``self.path`` that is a
        regular file is loaded through ``UnstructuredFileLoader``; other
        matches (e.g. directories) are skipped.

        Returns:
            The concatenation of the documents produced for each file.
        """
        collected: List[Document] = []
        for candidate in Path(self.path).glob(self.glob):
            if not candidate.is_file():
                continue
            collected.extend(UnstructuredFileLoader(str(candidate)).load())
        return collected
|
@ -0,0 +1,25 @@
|
|||||||
|
"""Loader that loads Notion directory dump."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class NotionDirectoryLoader(BaseLoader):
    """Loader that loads Notion directory dump."""

    def __init__(self, path: str, encoding: str = "utf-8"):
        """Initialize with path.

        Args:
            path: Directory containing the unzipped Notion export.
            encoding: Text encoding used to read each Markdown file. Defaults
                to UTF-8 so decoding does not depend on the platform's locale
                encoding.
        """
        self.file_path = path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            One ``Document`` per ``*.md`` file found recursively under
            ``self.file_path``; ``page_content`` is the file's full text and
            ``metadata["source"]`` is the file path as a string.
        """
        docs = []
        for p in Path(self.file_path).glob("**/*.md"):
            # An explicit encoding keeps non-ASCII pages readable on systems
            # whose default locale encoding is not UTF-8.
            with open(p, encoding=self.encoding) as f:
                text = f.read()
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
|
@ -0,0 +1,33 @@
|
|||||||
|
"""Loader that loads ReadTheDocs documentation directory dump."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class ReadTheDocsLoader(BaseLoader):
    """Loader that loads ReadTheDocs documentation directory dump."""

    def __init__(self, path: str):
        """Initialize path.

        Args:
            path: Directory holding the scraped documentation files.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Every non-directory path found recursively under ``self.file_path``
        is read and reduced to the text of its ``<main id="main-content">``
        element, with blank lines removed.

        Returns:
            One ``Document`` per file, with ``metadata["source"]`` set to the
            file path. Files without a ``main-content`` element produce a
            document with empty ``page_content`` instead of aborting the load.
        """
        from bs4 import BeautifulSoup

        def _clean_data(data: str) -> str:
            # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks
            # one from what is installed - consider pinning it explicitly.
            soup = BeautifulSoup(data)
            main_sections = soup.find_all("main", {"id": "main-content"})
            if not main_sections:
                # Pages lacking the main content container previously raised
                # IndexError here and failed the whole directory load.
                return ""
            text = main_sections[0].get_text()
            return "\n".join([t for t in text.split("\n") if t])

        docs = []
        for p in Path(self.file_path).rglob("*"):
            if p.is_dir():
                continue
            with open(p) as f:
                text = _clean_data(f.read())
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
|
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that uses unstructured to load files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredFileLoader(BaseLoader):
    """Load a single file by partitioning it with the ``unstructured`` package."""

    def __init__(self, file_path: str):
        """Check that ``unstructured`` is importable and remember the file path.

        Args:
            file_path: Path of the file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to fail early when the dependency is missing.
            import unstructured  # noqa:F401
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A single-element list whose ``Document`` holds the string form of
            every partitioned element joined by blank lines, with
            ``metadata["source"]`` set to the file path.
        """
        from unstructured.partition.auto import partition

        pieces = (str(element) for element in partition(filename=self.file_path))
        return [
            Document(
                page_content="\n\n".join(pieces),
                metadata={"source": self.file_path},
            )
        ]
|
Loading…
Reference in New Issue