community[minor]: Update pgvecto_rs to use its high level sdk (#15574)

- **Description:** Update pgvecto_rs to use its high level sdk, - **Issue:** fix #15173
6 months ago · ddf4e7c633
parent ce21392a21
commit ddf4e7c633
2 changed files with 102 additions and 80 deletions
--- a/docs/docs/integrations/vectorstores/pgvecto_rs.ipynb
+++ b/docs/docs/integrations/vectorstores/pgvecto_rs.ipynb
@ -6,7 +6,7 @@
   "source": [
    "# PGVecto.rs\n",
    "\n",
-    "This notebook shows how to use functionality related to the Postgres vector database ([pgvecto.rs](https://github.com/tensorchord/pgvecto.rs)). You need to install SQLAlchemy >= 2 manually."
+    "This notebook shows how to use functionality related to the Postgres vector database ([pgvecto.rs](https://github.com/tensorchord/pgvecto.rs))."
   ]
  },
  {
@ -15,10 +15,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "## Loading Environment Variables\n",
+    "%pip install \"pgvecto_rs[sdk]\""
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
@ -32,8 +29,8 @@
    "from langchain.docstore.document import Document\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain_community.document_loaders import TextLoader\n",
-    "from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs\n",
+    "from langchain_community.embeddings.fake import FakeEmbeddings\n",
-    "from langchain_openai import OpenAIEmbeddings"
+    "from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs"
   ]
  },
  {
@ -42,12 +39,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
-    "embeddings = OpenAIEmbeddings()"
+    "embeddings = FakeEmbeddings(size=3)"
   ]
  },
  {
@ -176,7 +173,42 @@
   "outputs": [],
   "source": [
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
-    "docs: List[Document] = db1.similarity_search(query, k=4)"
+    "docs: List[Document] = db1.similarity_search(query, k=4)\n",
    "for doc in docs:\n",
    "    print(doc.page_content)\n",
    "    print(\"======================\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Similarity Search with Filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pgvecto_rs.sdk.filters import meta_contains\n",
    "\n",
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "docs: List[Document] = db1.similarity_search(\n",
    "    query, k=4, filter=meta_contains({\"source\": \"../../modules/state_of_the_union.txt\"})\n",
    ")\n",
    "\n",
    "for doc in docs:\n",
    "    print(doc.page_content)\n",
    "    print(\"======================\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or:"
   ]
  },
  {
@ -185,6 +217,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "docs: List[Document] = db1.similarity_search(\n",
    "    query, k=4, filter={\"source\": \"../../modules/state_of_the_union.txt\"}\n",
    ")\n",
    "\n",
    "for doc in docs:\n",
    "    print(doc.page_content)\n",
    "    print(\"======================\")"
@ -207,7 +244,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.6"
  }
 },
 "nbformat": 4,
--- a/libs/community/langchain_community/vectorstores/pgvecto_rs.py
+++ b/libs/community/langchain_community/vectorstores/pgvecto_rs.py
@ -1,32 +1,17 @@
 from __future__ import annotations
 import uuid
-from typing import Any, Iterable, List, Literal, Optional, Tuple, Type
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union
 import numpy as np
 import sqlalchemy
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.vectorstores import VectorStore
 from sqlalchemy import insert, select
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 from sqlalchemy.orm.session import Session
 class _ORMBase(DeclarativeBase):
    __tablename__: str
    id: Mapped[uuid.UUID]
    text: Mapped[str]
    meta: Mapped[dict]
    embedding: Mapped[np.ndarray]
 class PGVecto_rs(VectorStore):
    """VectorStore backed by pgvecto_rs."""
-    _engine: sqlalchemy.engine.Engine
+    _store = None
    _table: Type[_ORMBase]
    _embedding: Embeddings
    def __init__(
@ -45,28 +30,22 @@ class PGVecto_rs(VectorStore):
            db_url: Database URL.
            collection_name: Name of the collection.
            new_table: Whether to create a new table or connect to an existing one.
-              Defaults to False.
+            If true, the table will be dropped if exists, then recreated.
            Defaults to False.
        """
        try:
-            from pgvecto_rs.sqlalchemy import Vector
+            from pgvecto_rs.sdk import PGVectoRs
        except ImportError as e:
            raise ImportError(
-                "Unable to import pgvector_rs, please install with "
+                "Unable to import pgvector_rs.sdk , please install with "
-                "`pip install pgvector_rs`."
+                '`pip install "pgvector_rs[sdk]"`.'
            ) from e
-
+        self._store = PGVectoRs(
-        class _Table(_ORMBase):
+            db_url=db_url,
-            __tablename__ = f"collection_{collection_name}"
+            collection_name=collection_name,
-            id: Mapped[uuid.UUID] = mapped_column(
+            dimension=dimension,
-                postgresql.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
+            recreate=new_table,
-            )
+        )
            text: Mapped[str] = mapped_column(sqlalchemy.String)
            meta: Mapped[dict] = mapped_column(postgresql.JSONB)
            embedding: Mapped[np.ndarray] = mapped_column(Vector(dimension))
        self._engine = sqlalchemy.create_engine(db_url)
        self._table = _Table
        self._table.__table__.create(self._engine, checkfirst=not new_table)  # type: ignore
        self._embedding = embedding
    # ================ Create interface =================
@ -90,7 +69,6 @@ class PGVecto_rs(VectorStore):
            dimension=dimension,
            db_url=db_url,
            collection_name=collection_name,
            new_table=True,
        )
        _self.add_texts(texts, metadatas, **kwargs)
        return _self
@ -148,19 +126,15 @@ class PGVecto_rs(VectorStore):
            List of ids of the added texts.
        """
        from pgvecto_rs.sdk import Record
        embeddings = self._embedding.embed_documents(list(texts))
-        with Session(self._engine) as _session:
+        records = [
-            results: List[str] = []
+            Record.from_text(text, embedding, meta)
-            for text, embedding, metadata in zip(
+            for text, embedding, meta in zip(texts, embeddings, metadatas or [])
-                texts, embeddings, metadatas or [dict()] * len(list(texts))
+        ]
-            ):
+        self._store.insert(records)
-                t = insert(self._table).values(
+        return [str(record.id) for record in records]
                    text=text, meta=metadata, embedding=embedding
                )
                id = _session.execute(t).inserted_primary_key[0]  # type: ignore
                results.append(str(id))
            _session.commit()
            return results
    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
        """Run more documents through the embeddings and add to the vectorstore.
@ -185,31 +159,41 @@ class PGVecto_rs(VectorStore):
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Union[None, Dict[str, Any], Any] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query vector, with its score."""
        with Session(self._engine) as _session:
            real_distance_func = (
                self._table.embedding.squared_euclidean_distance
                if distance_func == "sqrt_euclid"
                else self._table.embedding.negative_dot_product_distance
                if distance_func == "neg_dot_prod"
                else self._table.embedding.negative_cosine_distance
                if distance_func == "ned_cos"
                else None
            )
            if real_distance_func is None:
                raise ValueError("Invalid distance function")
-            t = (
+        from pgvecto_rs.sdk.filters import meta_contains
-                select(self._table, real_distance_func(query_vector).label("score"))
+
-                .order_by("score")
+        distance_func_map = {
-                .limit(k)  # type: ignore
+            "sqrt_euclid": "<->",
            "neg_dot_prod": "<#>",
            "ned_cos": "<=>",
        }
        if filter is None:
            real_filter = None
        elif isinstance(filter, dict):
            real_filter = meta_contains(filter)
        else:
            real_filter = filter
        results = self._store.search(
            query_vector,
            distance_func_map[distance_func],
            k,
            filter=real_filter,
        )
        return [
            (
                Document(
                    page_content=res[0].text,
                    metadata=res[0].meta,
                ),
                res[1],
            )
-            return [
+            for res in results
-                (Document(page_content=row[0].text, metadata=row[0].meta), row[1])
+        ]
                for row in _session.execute(t)
            ]
    def similarity_search_by_vector(
        self,
@ -218,11 +202,12 @@ class PGVecto_rs(VectorStore):
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        return [
            doc
-            for doc, score in self.similarity_search_with_score_by_vector(
+            for doc, _score in self.similarity_search_with_score_by_vector(
                embedding, k, distance_func, **kwargs
            )
        ]
@ -254,7 +239,7 @@ class PGVecto_rs(VectorStore):
        query_vector = self._embedding.embed_query(query)
        return [
            doc
-            for doc, score in self.similarity_search_with_score_by_vector(
+            for doc, _score in self.similarity_search_with_score_by_vector(
                query_vector, k, distance_func, **kwargs
            )
        ]