From eb80d6e0e41e316c7e15b39b1e114190c866fe3c Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 22 Mar 2023 21:10:09 -0700 Subject: [PATCH] Harrison/from methods (#1912) Co-authored-by: shibuiwilliam --- langchain/vectorstores/faiss.py | 129 ++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 24 deletions(-) diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index d71280a7..9b139807 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -55,28 +55,18 @@ class FAISS(VectorStore): self.docstore = docstore self.index_to_docstore_id = index_to_docstore_id - def add_texts( + def __add( self, texts: Iterable[str], + embeddings: Iterable[List[float]], metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - - Returns: - List of ids from adding the texts into the vectorstore. - """ if not isinstance(self.docstore, AddableMixin): raise ValueError( "If trying to add texts, the underlying docstore should support " f"adding items, which {self.docstore} does not" ) - # Embed and create the documents. - embeddings = [self.embedding_function(text) for text in texts] documents = [] for i, text in enumerate(texts): metadata = metadatas[i] if metadatas else {} @@ -95,6 +85,57 @@ class FAISS(VectorStore): self.index_to_docstore_id.update(index_to_id) return [_id for _, _id, _ in full_info] + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + # Embed and create the documents. + embeddings = [self.embedding_function(text) for text in texts] + return self.__add(texts, embeddings, metadatas, **kwargs) + + def add_embeddings( + self, + text_embeddings: Iterable[Tuple[str, List[float]]], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + text_embeddings: Iterable pairs of string and embedding to + add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + # Embed and create the documents. + + texts = [te[0] for te in text_embeddings] + embeddings = [te[1] for te in text_embeddings] + return self.__add(texts, embeddings, metadatas, **kwargs) + def similarity_search_with_score_by_vector( self, embedding: List[float], k: int = 4 ) -> List[Tuple[Document, float]]: @@ -253,6 +294,28 @@ class FAISS(VectorStore): index_to_id = {index: _id for index, _id, _ in full_info} self.index_to_docstore_id.update(index_to_id) + @classmethod + def __from( + cls, + texts: List[str], + embeddings: List[List[float]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> FAISS: + faiss = dependable_faiss_import() + index = faiss.IndexFlatL2(len(embeddings[0])) + index.add(np.array(embeddings, dtype=np.float32)) + documents = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + documents.append(Document(page_content=text, metadata=metadata)) + index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))} + docstore = InMemoryDocstore( + {index_to_id[i]: doc for i, doc in enumerate(documents)} + ) + return cls(embedding.embed_query, index, docstore, index_to_id) + @classmethod def from_texts( cls, @@ -278,19 +341,37 @@ class FAISS(VectorStore): embeddings = OpenAIEmbeddings() faiss = FAISS.from_texts(texts, embeddings) """ - faiss = dependable_faiss_import() embeddings = embedding.embed_documents(texts) - index = faiss.IndexFlatL2(len(embeddings[0])) - index.add(np.array(embeddings, dtype=np.float32)) - documents = [] - for i, text in enumerate(texts): - metadata = metadatas[i] if metadatas else {} - documents.append(Document(page_content=text, metadata=metadata)) - index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))} - docstore = InMemoryDocstore( - {index_to_id[i]: doc for i, doc in enumerate(documents)} - ) - return cls(embedding.embed_query, index, docstore, index_to_id) + return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) + + @classmethod + def from_embeddings( + cls, + text_embeddings: List[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the FAISS database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import FAISS + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + faiss = FAISS.from_texts(texts, embeddings) + """ + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) def save_local(self, folder_path: str) -> None: """Save FAISS index, docstore, and index_to_docstore_id to disk.