From 01a57198b8eb4c0c136eb8af77f3fd16359e4112 Mon Sep 17 00:00:00 2001 From: Tim Asp <707699+timothyasp@users.noreply.github.com> Date: Fri, 10 Mar 2023 15:14:35 -0800 Subject: [PATCH] [bugfix] Fix persisted chromadb vectorstore (#1444) If a `persist_directory` param was set, chromadb would throw a warning that "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction" and would fail with an `Illegal instruction: 4` error. This is on an MBP M1 13.2.1, Python 3.9. I'm not entirely sure why that error happened, but when using `get_or_create_collection` instead of `list_collections` on our end, the error and warning go away and chroma works as expected. As an added bonus, this is cleaner and likely more efficient: `list_collections` builds a new `Collection` instance for each collection, and `Chroma` would then only use the `name` field to tell whether the collection existed. --- langchain/vectorstores/chroma.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 0699b089..d4a8d63d 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -78,22 +78,12 @@ class Chroma(VectorStore): self._client = chromadb.Client(self._client_settings) self._embedding_function = embedding_function self._persist_directory = persist_directory - - # Check if the collection exists, create it if not - if collection_name in [col.name for col in self._client.list_collections()]: - self._collection = self._client.get_collection(name=collection_name) - # TODO: Persist the user's embedding function - logger.warning( - f"Collection {collection_name} already exists," - " Do you have the right embedding function?" 
- ) - else: - self._collection = self._client.create_collection( - name=collection_name, - embedding_function=self._embedding_function.embed_documents - if self._embedding_function is not None - else None, - ) + self._collection = self._client.get_or_create_collection( + name=collection_name, + embedding_function=self._embedding_function.embed_documents + if self._embedding_function is not None + else None, + ) def add_texts( self, @@ -224,9 +214,9 @@ class Chroma(VectorStore): Otherwise, the data will be ephemeral in-memory. Args: + texts (List[str]): List of texts to add to the collection. collection_name (str): Name of the collection to create. persist_directory (Optional[str]): Directory to persist the collection. - documents (List[Document]): List of documents to add. embedding (Optional[Embeddings]): Embedding function. Defaults to None. metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. ids (Optional[List[str]]): List of document IDs. Defaults to None. @@ -263,6 +253,7 @@ class Chroma(VectorStore): Args: collection_name (str): Name of the collection to create. persist_directory (Optional[str]): Directory to persist the collection. + ids (Optional[List[str]]): List of document IDs. Defaults to None. documents (List[Document]): List of documents to add to the vectorstore. embedding (Optional[Embeddings]): Embedding function. Defaults to None. client_settings (Optional[chromadb.config.Settings]): Chroma client settings