From e51fad14887768c9e78c7898d62eb4eebfb69189 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 11 Feb 2023 08:29:28 -0800 Subject: [PATCH] Harrison/0083 (#996) Co-authored-by: Harrison Chase --- .../chat_vector_db.ipynb | 71 ++++++++++++++++--- langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/text.py | 20 ++++++ pyproject.toml | 2 +- 4 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 langchain/document_loaders/text.py diff --git a/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb b/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb index caaae7f0..8652b64b 100644 --- a/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb +++ b/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb @@ -21,28 +21,83 @@ "from langchain.vectorstores.faiss import FAISS\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.llms import OpenAI\n", - "from langchain.chains import ChatVectorDBChain" + "from langchain.chains import ChatVectorDBChain\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "markdown", + "id": "cdff94be", + "metadata": {}, + "source": [ + "Load in documents. 
You can replace this with a loader for whatever type of data you want" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "01c46e92", + "metadata": {}, + "outputs": [], + "source": [ + "loader = TextLoader('../../state_of_the_union.txt')\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "e9be4779", + "metadata": {}, + "source": [ + "If you had multiple loaders that you wanted to combine, you could do something like:" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, + "id": "433363a5", + "metadata": {}, + "outputs": [], + "source": [ + "# loaders = [....]\n", + "# docs = []\n", + "# for loader in loaders:\n", + "# docs.extend(loader.load())" + ] + }, + { + "cell_type": "markdown", + "id": "239475d2", + "metadata": {}, + "source": [ + "We now split the documents, create embeddings for them, and put them in a vectorstore. This allows us to do semantic search over them." + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "a8930cf7", "metadata": {}, "outputs": [], "source": [ - "with open('../../state_of_the_union.txt') as f:\n", - " state_of_the_union = f.read()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", - "texts = text_splitter.split_text(state_of_the_union)\n", + "documents = text_splitter.split_documents(documents)\n", "\n", "embeddings = OpenAIEmbeddings()\n", - "vectorstore = FAISS.from_texts(texts, embeddings)" + "vectorstore = FAISS.from_documents(documents, embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "3c96b118", + "metadata": {}, + "source": [ + "We now initialize the ChatVectorDBChain" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "7b4110f3", "metadata": {}, "outputs": [], @@ -157,7 +212,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py 
b/langchain/document_loaders/__init__.py
index 13bc5700..a8d67435 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -23,6 +23,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
+from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 from langchain.document_loaders.url import UnstructuredURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
@@ -44,6 +45,7 @@ __all__ = [
     "RoamLoader",
     "YoutubeLoader",
     "S3FileLoader",
+    "TextLoader",
     "S3DirectoryLoader",
     "GCSFileLoader",
     "GCSDirectoryLoader",
diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py
new file mode 100644
index 00000000..6962833a
--- /dev/null
+++ b/langchain/document_loaders/text.py
@@ -0,0 +1,30 @@
+"""Load text files."""
+from typing import List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class TextLoader(BaseLoader):
+    """Load a text file as a single document."""
+
+    def __init__(self, file_path: str, encoding: Optional[str] = None):
+        """Initialize with file path and an optional text encoding.
+
+        Args:
+            file_path: Path of the text file to read.
+            encoding: Encoding handed to ``open``; ``None`` keeps the
+                platform default, so existing callers are unaffected.
+        """
+        self.file_path = file_path
+        self.encoding = encoding
+
+    def load(self) -> List[Document]:
+        """Read the whole file and wrap it in a one-element list.
+
+        The document's ``source`` metadata is the path given at init.
+        """
+        with open(self.file_path, encoding=self.encoding) as f:
+            text = f.read()
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
diff --git a/pyproject.toml b/pyproject.toml
index f6f24b02..3e4eb1a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain"
-version = "0.0.82"
+version = "0.0.83"
 description = "Building applications with LLMs through composability"
 authors = []
 license = "MIT"