From 01c5cd365b46b768de6bddbacc3afbf5aaab2294 Mon Sep 17 00:00:00 2001 From: Josh Phillips Date: Thu, 26 Oct 2023 15:19:17 -0600 Subject: [PATCH] Fix SupabaseVectorStore write operation timeout (#12318) **Description** This small change will make chunk_size a configurable parameter for loading documents into a Supabase database. **Issue** https://github.com/langchain-ai/langchain/issues/11422 **Dependencies** No changes **Twitter** @ j1philli **Reminder** If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --------- Co-authored-by: Greg Richardson --- .../integrations/vectorstores/supabase.ipynb | 4 ++-- .../langchain/vectorstores/supabase.py | 17 ++++++++++++----- .../integration_tests/examples/hello_world.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/docs/integrations/vectorstores/supabase.ipynb b/docs/docs/integrations/vectorstores/supabase.ipynb index 24008d9a71..033a5d381f 100644 --- a/docs/docs/integrations/vectorstores/supabase.ipynb +++ b/docs/docs/integrations/vectorstores/supabase.ipynb @@ -197,7 +197,7 @@ "id": "5abb9b93", "metadata": {}, "source": [ - "Insert the above documents into the database. Embeddings will automatically be generated for each document." + "Insert the above documents into the database. Embeddings will automatically be generated for each document. You can adjust the chunk_size based on the amount of documents you have. The default is 500 but lowering it may be necessary." 
] }, { @@ -208,7 +208,7 @@ "outputs": [], "source": [ "\n", - "vector_store = SupabaseVectorStore.from_documents(docs, embeddings, client=supabase, table_name=\"documents\", query_name=\"match_documents\")" + "vector_store = SupabaseVectorStore.from_documents(docs, embeddings, client=supabase, table_name=\"documents\", query_name=\"match_documents\", chunk_size=500)" ] }, { diff --git a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py index 2cb04385ae..f5dbdad9af 100644 --- a/libs/langchain/langchain/vectorstores/supabase.py +++ b/libs/langchain/langchain/vectorstores/supabase.py @@ -61,6 +61,7 @@ class SupabaseVectorStore(VectorStore): client=supabase_client, table_name="documents", query_name="match_documents", + chunk_size=500, ) To load from an existing table: @@ -88,6 +89,7 @@ class SupabaseVectorStore(VectorStore): client: supabase.client.Client, embedding: Embeddings, table_name: str, + chunk_size: int = 500, query_name: Union[str, None] = None, ) -> None: """Initialize with supabase client.""" @@ -103,6 +105,9 @@ class SupabaseVectorStore(VectorStore): self._embedding: Embeddings = embedding self.table_name = table_name or "documents" self.query_name = query_name or "match_documents" + self.chunk_size = chunk_size or 500 + # According to the SupabaseVectorStore JS implementation, the best chunk size + # is 500. Though for large datasets it can be too large so it is configurable. 
@property def embeddings(self) -> Embeddings: @@ -130,6 +135,7 @@ class SupabaseVectorStore(VectorStore): client: Optional[supabase.client.Client] = None, table_name: Optional[str] = "documents", query_name: Union[str, None] = "match_documents", + chunk_size: int = 500, ids: Optional[List[str]] = None, **kwargs: Any, ) -> "SupabaseVectorStore": @@ -144,13 +150,14 @@ class SupabaseVectorStore(VectorStore): embeddings = embedding.embed_documents(texts) ids = [str(uuid.uuid4()) for _ in texts] docs = cls._texts_to_documents(texts, metadatas) - cls._add_vectors(client, table_name, embeddings, docs, ids) + cls._add_vectors(client, table_name, embeddings, docs, ids, chunk_size) return cls( client=client, embedding=embedding, table_name=table_name, query_name=query_name, + chunk_size=chunk_size, ) def add_vectors( @@ -159,7 +166,9 @@ class SupabaseVectorStore(VectorStore): documents: List[Document], ids: List[str], ) -> List[str]: - return self._add_vectors(self._client, self.table_name, vectors, documents, ids) + return self._add_vectors( + self._client, self.table_name, vectors, documents, ids, self.chunk_size + ) def similarity_search( self, @@ -300,6 +309,7 @@ class SupabaseVectorStore(VectorStore): vectors: List[List[float]], documents: List[Document], ids: List[str], + chunk_size: int, ) -> List[str]: """Add vectors to Supabase table.""" @@ -313,9 +323,6 @@ class SupabaseVectorStore(VectorStore): for idx, embedding in enumerate(vectors) ] - # According to the SupabaseVectorStore JS implementation, the best chunk size - # is 500 - chunk_size = 500 id_list: List[str] = [] for i in range(0, len(rows), chunk_size): chunk = rows[i : i + chunk_size] diff --git a/libs/langchain/tests/integration_tests/examples/hello_world.py b/libs/langchain/tests/integration_tests/examples/hello_world.py index 53040a6c67..3f0294febb 100644 --- a/libs/langchain/tests/integration_tests/examples/hello_world.py +++ b/libs/langchain/tests/integration_tests/examples/hello_world.py @@ -3,7 +3,7 
@@ import sys -def main(): +def main() -> int: print("Hello World!") return 0