community[minor]: Improve CassandraVectorStore from_texts (#20284)

2024-11-18 09:25:54 +00:00 · 2024-04-17 23:12:28 +02:00 · 2024-04-17 23:12:28 +02:00 · 75733c5cc1
commit 75733c5cc1
parent 463160c3f6
1 changed files with 70 additions and 24 deletions
--- a/libs/community/langchain_community/vectorstores/cassandra.py
+++ b/libs/community/langchain_community/vectorstores/cassandra.py
@ -28,6 +28,8 @@ from langchain_community.vectorstores.utils import maximal_marginal_relevance

 CVST = TypeVar("CVST", bound="Cassandra")

+_NOT_SET = object()
+

 class Cassandra(VectorStore):
    """Wrapper around Apache Cassandra(R) for vector-store workloads.
@ -48,6 +50,13 @@ class Cassandra(VectorStore):
                keyspace = 'my_keyspace'  # the keyspace should exist already
                table_name = 'my_vector_store'
                vectorstore = Cassandra(embeddings, session, keyspace, table_name)
+
+    Args:
+        embedding: Embedding function to use.
+        session: Cassandra driver session.
+        keyspace: Cassandra key space.
+        table_name: Cassandra table.
+        ttl_seconds: Optional time-to-live for the added texts.
    """

    _embedding_dimension: Union[int, None]
@ -124,7 +133,7 @@ class Cassandra(VectorStore):
        self.clear()

    def clear(self) -> None:
-        """Empty the collection."""
+        """Empty the table."""
        self.table.clear()

    def delete_by_document_id(self, document_id: str) -> None:
@ -161,12 +170,11 @@ class Cassandra(VectorStore):
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
-            texts (Iterable[str]): Texts to add to the vectorstore.
-            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
-            ids (Optional[List[str]], optional): Optional list of IDs.
-            batch_size (int): Number of concurrent requests to send to the server.
-            ttl_seconds (Optional[int], optional): Optional time-to-live
-                for the added texts.
+            texts: Texts to add to the vectorstore.
+            metadatas: Optional list of metadatas.
+            ids: Optional list of IDs.
+            batch_size: Number of concurrent requests to send to the server.
+            ttl_seconds: Optional time-to-live for the added texts.

        Returns:
            List[str]: List of IDs of the added texts.
@ -337,8 +345,8 @@ class Cassandra(VectorStore):
            k: Number of Documents to return.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
+                of diversity among the results with 0 corresponding to maximum
+                diversity and 1 to minimum diversity.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -389,9 +397,9 @@ class Cassandra(VectorStore):
            k: Number of Documents to return.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
-                        Optional.
+                of diversity among the results with 0 corresponding to maximum
+                diversity and 1 to minimum diversity.
+                Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -410,53 +418,91 @@ class Cassandra(VectorStore):
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
+        *,
+        session: Session = _NOT_SET,
+        keyspace: str = "",
+        table_name: str = "",
+        ids: Optional[List[str]] = None,
        batch_size: int = 16,
+        ttl_seconds: Optional[int] = None,
        **kwargs: Any,
    ) -> CVST:
        """Create a Cassandra vectorstore from raw texts.

-        No support for specifying text IDs
+        Args:
+            texts: Texts to add to the vectorstore.
+            embedding: Embedding function to use.
+            metadatas: Optional list of metadatas associated with the texts.
+            session: Cassandra driver session (required).
+            keyspace: Cassandra key space (required).
+            table_name: Cassandra table (required).
+            ids: Optional list of IDs associated with the texts.
+            batch_size: Number of concurrent requests to send to the server.
+                Defaults to 16.
+            ttl_seconds: Optional time-to-live for the added texts.

        Returns:
            a Cassandra vectorstore.
        """
-        session: Session = kwargs["session"]
-        keyspace: str = kwargs["keyspace"]
-        table_name: str = kwargs["table_name"]
-        cassandraStore = cls(
+        if session is _NOT_SET:
+            raise ValueError("session parameter is required")
+        if not keyspace:
+            raise ValueError("keyspace parameter is required")
+        if not table_name:
+            raise ValueError("table_name parameter is required")
+        store = cls(
            embedding=embedding,
            session=session,
            keyspace=keyspace,
            table_name=table_name,
+            ttl_seconds=ttl_seconds,
        )
-        cassandraStore.add_texts(texts=texts, metadatas=metadatas)
-        return cassandraStore
+        store.add_texts(
+            texts=texts, metadatas=metadatas, ids=ids, batch_size=batch_size
+        )
+        return store

    @classmethod
    def from_documents(
        cls: Type[CVST],
        documents: List[Document],
        embedding: Embeddings,
+        *,
+        session: Session = _NOT_SET,
+        keyspace: str = "",
+        table_name: str = "",
+        ids: Optional[List[str]] = None,
        batch_size: int = 16,
+        ttl_seconds: Optional[int] = None,
        **kwargs: Any,
    ) -> CVST:
        """Create a Cassandra vectorstore from a document list.

-        No support for specifying text IDs
+        Args:
+            documents: Documents to add to the vectorstore.
+            embedding: Embedding function to use.
+            session: Cassandra driver session (required).
+            keyspace: Cassandra key space (required).
+            table_name: Cassandra table (required).
+            ids: Optional list of IDs associated with the documents.
+            batch_size: Number of concurrent requests to send to the server.
+                Defaults to 16.
+            ttl_seconds: Optional time-to-live for the added documents.

        Returns:
            a Cassandra vectorstore.
        """
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
-        session: Session = kwargs["session"]
-        keyspace: str = kwargs["keyspace"]
-        table_name: str = kwargs["table_name"]
        return cls.from_texts(
            texts=texts,
-            metadatas=metadatas,
            embedding=embedding,
+            metadatas=metadatas,
            session=session,
            keyspace=keyspace,
            table_name=table_name,
+            ids=ids,
+            batch_size=batch_size,
+            ttl_seconds=ttl_seconds,
+            **kwargs,
        )