From 0d7aa1ee994c7813a957b206df1aabca901ce807 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 25 Dec 2022 09:53:07 -0500 Subject: [PATCH] Harrison/docs to index (#419) Add method for going directly from documents to VectorStores Update notebook to showcase this functionality --- .../embeddings.ipynb | 60 +++++++++++++++++++ langchain/vectorstores/base.py | 12 ++++ 2 files changed, 72 insertions(+) diff --git a/docs/examples/data_augmented_generation/embeddings.ipynb b/docs/examples/data_augmented_generation/embeddings.ipynb index bf7d571e00..e816a59b46 100644 --- a/docs/examples/data_augmented_generation/embeddings.ipynb +++ b/docs/examples/data_augmented_generation/embeddings.ipynb @@ -98,6 +98,66 @@ "print(docs[0].page_content)" ] }, + { + "cell_type": "markdown", + "id": "bbf5ec44", + "metadata": {}, + "source": [ + "## From Documents\n", + "We can also initialize a vectorstore from documents directly. This is useful when we use the method on the text splitter to get documents directly (handy when the original documents have associated metadata)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "df4a459c", + "metadata": {}, + "outputs": [], + "source": [ + "documents = text_splitter.create_documents([state_of_the_union], metadatas=[{\"source\": \"State of the Union\"}])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4b480245", + "metadata": {}, + "outputs": [], + "source": [ + "docsearch = FAISS.from_documents(documents, embeddings)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "86aa4cda", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n", + "\n", + "We cannot let this happen. 
\n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, { "cell_type": "markdown", "id": "eea6e627", diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py index 16952ec63a..cbb6a75b41 100644 --- a/langchain/vectorstores/base.py +++ b/langchain/vectorstores/base.py @@ -47,6 +47,18 @@ class VectorStore(ABC): """ raise NotImplementedError + @classmethod + def from_documents( + cls, + documents: List[Document], + embedding: Embeddings, + **kwargs: Any, + ) -> VectorStore: + """Build a VectorStore from documents' text and metadata via from_texts.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) + @classmethod @abstractmethod def from_texts(