Add pg_hnsw vectorstore integration (#6893)

Hi @rlancemartin, @eyurtsev! - Description: Adding HNSW extension support for Postgres. Similar to pgvector vectorstore, with 3 differences 1. it uses HNSW extension for exact and ANN searches, 2. Vectors are of type array of real 3. Only supports L2 - Dependencies: [HNSW](https://github.com/knizhnik/hnsw) extension for Postgres - Example: ```python db = HNSWVectoreStore.from_documents( embedding=embeddings, documents=docs, collection_name=collection_name, connection_string=connection_string ) query = "What did the president say about Ketanji Brown Jackson" docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query) ``` The example notebook is in the PR too.
2024-11-06 03:20:49 +00:00 · 2023-07-05 17:10:10 +02:00 · 2023-07-05 17:10:10 +02:00 · 6fc24743b7
commit 6fc24743b7
parent 79fb90aafd
3 changed files with 850 additions and 0 deletions
--- a/docs/extras/modules/data_connection/vectorstores/integrations/pgembedding.ipynb
+++ b/docs/extras/modules/data_connection/vectorstores/integrations/pgembedding.ipynb
@ -0,0 +1,338 @@
 {
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "1292f057",
   "metadata": {},
   "source": [
    "# pg_hnsw\n",
    "\n",
    "> [pg_embedding](https://github.com/knizhnik/hnsw) is an open-source vector similarity search extension for `Postgres` that uses Hierarchical Navigable Small World (HNSW) graphs for approximate nearest neighbor search.\n",
    "\n",
    "It supports:\n",
    "- exact and approximate nearest neighbor search using HNSW\n",
    "- L2 distance\n",
    "\n",
    "This notebook shows how to use the Postgres vector database (`PGEmbedding`).\n",
    "\n",
    "> The PGEmbedding integration creates the pg_embedding extension for you, but you can also run the following Postgres query to add it manually:\n",
    "```sql\n",
    "CREATE EXTENSION embedding;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6214221",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pip install necessary package\n",
    "!pip install openai\n",
    "!pip install psycopg2-binary\n",
    "!pip install tiktoken"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b2e49694",
   "metadata": {},
   "source": [
    "Add the OpenAI API Key to the environment variables to use `OpenAIEmbeddings`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1dcc8d99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OpenAI API Key:········\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import getpass\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9719ea68",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loading Environment Variables\n",
    "from typing import List, Tuple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfd1f38d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import PGEmbedding\n",
    "from langchain.document_loaders import TextLoader\n",
    "from langchain.docstore.document import Document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8fab8cc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Database Url:········\n"
     ]
    }
   ],
   "source": [
    "os.environ[\"DATABASE_URL\"] = getpass.getpass(\"Database Url:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bef17115",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = TextLoader(\"state_of_the_union.txt\")\n",
    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
    "connection_string = os.environ.get(\"DATABASE_URL\")\n",
    "collection_name = \"state_of_the_union\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "743abfaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "db = PGEmbedding.from_documents(\n",
    "    embedding=embeddings,\n",
    "    documents=docs,\n",
    "    collection_name=collection_name,\n",
    "    connection_string=connection_string,\n",
    ")\n",
    "\n",
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41ce4c4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for doc, score in docs_with_score:\n",
    "    print(\"-\" * 80)\n",
    "    print(\"Score: \", score)\n",
    "    print(doc.page_content)\n",
    "    print(\"-\" * 80)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "7ef7b052",
   "metadata": {},
   "source": [
    "## Working with vectorstore in Postgres"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "939151f7",
   "metadata": {},
   "source": [
    "### Uploading a vectorstore in PG "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "595ac511",
   "metadata": {},
   "outputs": [],
   "source": [
    "db = PGEmbedding.from_documents(\n",
    "    embedding=embeddings,\n",
    "    documents=docs,\n",
    "    collection_name=collection_name,\n",
    "    connection_string=connection_string,\n",
    "    pre_delete_collection=False,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f9510e6b",
   "metadata": {},
   "source": [
    "### Create HNSW Index\n",
    "By default, the extension performs a sequential scan search, with 100% recall. You might consider creating an HNSW index for approximate nearest neighbor (ANN) search to speed up `similarity_search_with_score` execution time. To create the HNSW index on your vector column, use a `create_hnsw_index` function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d1981fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "db.create_hnsw_index(\n",
    "    max_elements=10000, dims=1536, m=8, ef_construction=16, ef_search=16\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "7adacf29",
   "metadata": {},
   "source": [
    "The function above is equivalent to running the below SQL query:\n",
    "```sql\n",
    "CREATE INDEX IF NOT EXISTS langchain_pg_embedding_idx ON langchain_pg_embedding USING hnsw (embedding) WITH (maxelements = 10000, dims = 1536, m = 8, efconstruction = 16, efsearch = 16);\n",
    "```\n",
    "The HNSW index options used in the statement above include:\n",
    "\n",
    "- maxelements: Defines the maximum number of elements indexed. This is a required parameter. The example shown above has a value of 10000. A real-world example might need a much larger value, such as 1000000. An \"element\" refers to a data point (a vector) in the dataset, which is represented as a node in the HNSW graph. Typically, you would set this option to a value able to accommodate the number of rows in your dataset.\n",
    "- dims: Defines the number of dimensions in your vector data. This is a required parameter. The example above uses 1536, which matches data generated using OpenAI's text-embedding-ada-002 model (it produces 1536-dimensional vectors).\n",
    "- m: Defines the maximum number of bi-directional links (also referred to as \"edges\") created for each node during graph construction.\n",
    "\n",
    "The following additional index options are supported:\n",
    "\n",
    "- efconstruction: Defines the number of nearest neighbors considered during index construction. The default value is 32.\n",
    "- efsearch: Defines the number of nearest neighbors considered during index search. The default value is 32.\n",
    "\n",
    "For information about how you can configure these options to influence the HNSW algorithm, refer to [Tuning the HNSW algorithm](https://neon-next-git-dprice-hnsw-extension-neondatabase.vercel.app/docs/extensions/hnsw#tuning-the-hnsw-algorithm)."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "528893fb",
   "metadata": {},
   "source": [
    "### Retrieving a vectorstore in PG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b6162b1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "store = PGEmbedding(\n",
    "    connection_string=connection_string,\n",
    "    embedding_function=embeddings,\n",
    "    collection_name=collection_name,\n",
    ")\n",
    "\n",
    "retriever = store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1a5fedb1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VectorStoreRetriever(vectorstore=<langchain.vectorstores.pghnsw.HNSWVectoreStore object at 0x121d3c8b0>, search_type='similarity', search_kwargs={})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0cefc938",
   "metadata": {},
   "outputs": [],
   "source": [
    "db1 = PGEmbedding.from_existing_index(\n",
    "    embedding=embeddings,\n",
    "    collection_name=collection_name,\n",
    "    pre_delete_collection=False,\n",
    "    connection_string=connection_string,\n",
    ")\n",
    "\n",
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "docs_with_score: List[Tuple[Document, float]] = db1.similarity_search_with_score(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85cde495",
   "metadata": {},
   "outputs": [],
   "source": [
    "for doc, score in docs_with_score:\n",
    "    print(\"-\" * 80)\n",
    "    print(\"Score: \", score)\n",
    "    print(doc.page_content)\n",
    "    print(\"-\" * 80)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/langchain/vectorstores/init.py
+++ b/langchain/vectorstores/init.py
@ -24,6 +24,7 @@ from langchain.vectorstores.milvus import Milvus
 from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
 from langchain.vectorstores.myscale import MyScale, MyScaleSettings
 from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
 from langchain.vectorstores.pgembedding import PGEmbedding
 from langchain.vectorstores.pinecone import Pinecone
 from langchain.vectorstores.qdrant import Qdrant
 from langchain.vectorstores.redis import Redis
@ -56,6 +57,7 @@ __all__ = [
    "DocArrayInMemorySearch",
    "ElasticVectorSearch",
    "FAISS",
    "PGEmbedding",
    "Hologres",
    "LanceDB",
    "MatchingEngine",
--- a/langchain/vectorstores/pgembedding.py
+++ b/langchain/vectorstores/pgembedding.py
@ -0,0 +1,510 @@
 """VectorStore wrapper around a Postgres database."""
 from __future__ import annotations
 import logging
 import uuid
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
 import sqlalchemy
 from sqlalchemy import func
 from sqlalchemy.dialects.postgresql import JSON, UUID
 from sqlalchemy.orm import Session, declarative_base, relationship
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.utils import get_from_dict_or_env
 from langchain.vectorstores.base import VectorStore
 # Declarative base class shared by the ORM models defined in this module.
 Base = declarative_base()  # type: Any
 # Default value of the `dims` option in `PGEmbedding.create_hnsw_index`
 # (presumably the OpenAI ada-002 embedding size — TODO confirm).
 ADA_TOKEN_COUNT = 1536
 # Collection name used when the caller does not supply one.
 _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
 class BaseModel(Base):
    """Abstract ORM base that gives every derived table a UUID primary key."""
    # Abstract: no table is created for this class itself.
    __abstract__ = True
    # Primary key column; `uuid.uuid4` is used as the column default.
    uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
 class CollectionStore(BaseModel):
    """ORM model for a named collection that groups stored embeddings."""
    __tablename__ = "langchain_pg_collection"
    name = sqlalchemy.Column(sqlalchemy.String)
    cmetadata = sqlalchemy.Column(JSON)
    # Embedding rows attached to this collection (other side: `collection`).
    embeddings = relationship(
        "EmbeddingStore", back_populates="collection", passive_deletes=True
    )
    @classmethod
    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
        """Return the first collection whose name equals `name`, or None."""
        by_name = session.query(cls).filter(cls.name == name)
        return by_name.first()  # type: ignore
    @classmethod
    def get_or_create(
        cls,
        session: Session,
        name: str,
        cmetadata: Optional[dict] = None,
    ) -> Tuple["CollectionStore", bool]:
        """
        Look up a collection by name, creating and committing it when absent.
        Returns (collection, created); `created` is True only for a new row.
        """
        existing = cls.get_by_name(session, name)
        if existing:
            return existing, False
        fresh = cls(name=name, cmetadata=cmetadata)
        session.add(fresh)
        session.commit()
        return fresh, True
 class EmbeddingStore(BaseModel):
    """ORM model for one stored text together with its embedding vector."""
    __tablename__ = "langchain_pg_embedding"
    # Foreign key to the owning collection; the constraint is declared with
    # ON DELETE CASCADE.
    collection_id = sqlalchemy.Column(
        UUID(as_uuid=True),
        sqlalchemy.ForeignKey(
            f"{CollectionStore.__tablename__}.uuid",
            ondelete="CASCADE",
        ),
    )
    # Owning collection (other side: `CollectionStore.embeddings`).
    collection = relationship(CollectionStore, back_populates="embeddings")
    # The vector itself, stored as a Postgres array of REAL values.
    embedding = sqlalchemy.Column(sqlalchemy.ARRAY(sqlalchemy.REAL))  # type: ignore
    # Text the embedding was stored with.
    document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    # Metadata stored as JSON; metadata filters in similarity search read it.
    cmetadata = sqlalchemy.Column(JSON, nullable=True)
    # custom_id: any user-defined id (distinct from the `uuid` primary key).
    custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
 class QueryResult:
    """Typing helper for one row returned by the similarity-search query."""
    # The matched ORM row.
    EmbeddingStore: EmbeddingStore
    # Value of the query's "distance" column for that row.
    distance: float
 class PGEmbedding(VectorStore):
    """
    VectorStore implementation using Postgres and the pg_embedding extension.
    pg_embedding uses a sequential scan by default, but you can create an HNSW
    index using the create_hnsw_index method.
    - `connection_string` is a postgres connection string.
    - `embedding_function` any embedding function implementing
        `langchain.embeddings.base.Embeddings` interface.
    - `collection_name` is the name of the collection to use. (default: langchain)
        - NOTE: This is not the name of the table, but the name of the collection.
            The tables will be created when initializing the store (if not exists)
            So, make sure the user has the right permissions to create tables.
    - Only Euclidean (L2) distance is supported; there is no
        `distance_strategy` parameter.
    - `pre_delete_collection` if True, will delete the collection if it exists.
        (default: False)
        - Useful for testing.
    """
    def __init__(
        self,
        connection_string: str,
        embedding_function: Embeddings,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        collection_metadata: Optional[dict] = None,
        pre_delete_collection: bool = False,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Record the configuration, then run the database set-up.

        Args:
            connection_string: Postgres connection string.
            embedding_function: Object used to embed texts and queries.
            collection_name: Name of the collection to use.
            collection_metadata: Metadata stored when the collection is created.
            pre_delete_collection: Delete an existing collection first.
            logger: Logger to use; defaults to this module's logger.
        """
        # Fall back to the module logger when none (or a falsy one) is given.
        self.logger = logger if logger else logging.getLogger(__name__)
        self.pre_delete_collection = pre_delete_collection
        self.collection_metadata = collection_metadata
        self.collection_name = collection_name
        self.embedding_function = embedding_function
        self.connection_string = connection_string
        # Connects and creates the extension, tables and collection.
        self.__post_init__()
    def __post_init__(self) -> None:
        """Open the connection, then run the set-up steps in order."""
        self._conn = self.connect()
        setup_steps = (
            self.create_hnsw_extension,
            self.create_tables_if_not_exists,
            self.create_collection,
        )
        for run_step in setup_steps:
            run_step()
    def connect(self) -> sqlalchemy.engine.Connection:
        """Create an engine from `connection_string` and return a connection."""
        return sqlalchemy.create_engine(self.connection_string).connect()
    def create_hnsw_extension(self) -> None:
        """Create the HNSW Postgres extension if it is not installed yet.

        This is best-effort: any failure is logged through `self.logger`
        and not re-raised.
        """
        try:
            with Session(self._conn) as session:
                # The extension is registered under the name `embedding`
                # (the manual set-up statement is `CREATE EXTENSION embedding;`),
                # not `pg_embedding`.
                statement = sqlalchemy.text(
                    "CREATE EXTENSION IF NOT EXISTS embedding"
                )
                session.execute(statement)
                session.commit()
        except Exception as e:
            self.logger.exception(e)
    def create_tables_if_not_exists(self) -> None:
        """Create the tables of all ORM models inside one transaction."""
        connection = self._conn
        with connection.begin():
            Base.metadata.create_all(connection)
    def drop_tables(self) -> None:
        """Drop the tables of all ORM models inside one transaction."""
        connection = self._conn
        with connection.begin():
            Base.metadata.drop_all(connection)
    def create_collection(self) -> None:
        """Ensure the configured collection exists.

        When `pre_delete_collection` is set, the collection is deleted first.
        """
        if self.pre_delete_collection:
            self.delete_collection()
        name = self.collection_name
        metadata = self.collection_metadata
        with Session(self._conn) as session:
            CollectionStore.get_or_create(session, name, cmetadata=metadata)
    def create_hnsw_index(
        self,
        max_elements: int = 10000,
        dims: int = ADA_TOKEN_COUNT,
        m: int = 8,
        ef_construction: int = 16,
        ef_search: int = 16,
    ) -> None:
        """Create the HNSW index on the embedding column if it does not exist.

        This is best-effort: the outcome is printed, and failures (including
        non-integer option values) are reported instead of raised.

        Args:
            max_elements: Value for the index's `maxelements` option.
            dims: Value for the index's `dims` option.
            m: Value for the index's `m` option.
            ef_construction: Value for the index's `efconstruction` option.
            ef_search: Value for the index's `efsearch` option.
        """
        try:
            # The options are interpolated straight into the DDL text, so each
            # one is coerced to int first: anything that is not a plain integer
            # fails here instead of reaching the database as SQL.
            options = (
                int(max_elements),
                int(dims),
                int(m),
                int(ef_construction),
                int(ef_search),
            )
            create_index_query = sqlalchemy.text(
                "CREATE INDEX IF NOT EXISTS langchain_pg_embedding_idx "
                "ON langchain_pg_embedding USING hnsw (embedding) "
                "WITH ("
                "maxelements = {}, "
                "dims = {}, "
                "m = {}, "
                "efconstruction = {}, "
                "efsearch = {}"
                ");".format(*options)
            )
            with Session(self._conn) as session:
                # Create the HNSW index
                session.execute(create_index_query)
                session.commit()
            print("HNSW extension and index created successfully.")
        except Exception as e:
            print(f"Failed to create HNSW extension or index: {e}")
    def delete_collection(self) -> None:
        """Delete the configured collection; log a warning if it is missing."""
        self.logger.debug("Trying to delete collection")
        with Session(self._conn) as session:
            target = self.get_collection(session)
            if target:
                session.delete(target)
                session.commit()
            else:
                self.logger.warning("Collection not found")
    def get_collection(self, session: Session) -> Optional["CollectionStore"]:
        """Return the collection named `self.collection_name`, or None."""
        wanted_name = self.collection_name
        return CollectionStore.get_by_name(session, wanted_name)
    @classmethod
    def _initialize_from_embeddings(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> PGEmbedding:
        """Build a store and insert precomputed embeddings into it.

        Missing `ids` are replaced by generated uuid1 strings and missing or
        empty `metadatas` by one empty dict per text. The connection string
        is resolved from `kwargs` via `get_connection_string`.
        """
        ids = [str(uuid.uuid1()) for _ in texts] if ids is None else ids
        metadatas = metadatas or [{} for _ in texts]
        store = cls(
            connection_string=cls.get_connection_string(kwargs),
            collection_name=collection_name,
            embedding_function=embedding,
            pre_delete_collection=pre_delete_collection,
        )
        store.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
        )
        return store
    def add_embeddings(
        self,
        texts: List[str],
        embeddings: List[List[float]],
        metadatas: List[dict],
        ids: List[str],
        **kwargs: Any,
    ) -> None:
        """Insert texts with precomputed embeddings into the collection.

        The four lists are combined with `zip`. Extra keyword arguments are
        accepted and ignored.

        Raises:
            ValueError: If the configured collection does not exist.
        """
        with Session(self._conn) as session:
            collection = self.get_collection(session)
            if not collection:
                raise ValueError("Collection not found")
            rows = zip(texts, metadatas, embeddings, ids)
            for content, meta, vector, custom_id in rows:
                record = EmbeddingStore(
                    embedding=vector,
                    document=content,
                    cmetadata=meta,
                    custom_id=custom_id,
                )
                collection.embeddings.append(record)
                session.add(record)
            session.commit()
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed `texts` and store them in the configured collection.

        Args:
            texts: Texts to embed and store; any iterable, including a
                one-shot generator.
            metadatas: One metadata dict per text; defaults to empty dicts.
            ids: One custom id per text; defaults to generated uuid1 strings.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The ids stored with the texts.

        Raises:
            ValueError: If the configured collection does not exist
                (raised by `add_embeddings`).
        """
        # Materialise once: `texts` is consumed several times below, and a
        # generator would be exhausted after the first pass.
        texts = list(texts)
        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]
        embeddings = self.embedding_function.embed_documents(texts)
        if not metadatas:
            metadatas = [{} for _ in texts]
        # Storage logic lives in one place.
        self.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids
        )
        return ids
    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed `query` and return the documents found for that vector.

        The search itself is done by `similarity_search_by_vector`; extra
        keyword arguments are not forwarded.
        """
        query_vector = self.embedding_function.embed_query(text=query)
        return self.similarity_search_by_vector(
            embedding=query_vector, k=k, filter=filter
        )
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        """Embed `query` and return (document, distance) pairs for it.

        The search itself is done by `similarity_search_with_score_by_vector`.
        """
        query_vector = self.embedding_function.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding=query_vector, k=k, filter=filter
        )
    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        """Return the `k` closest stored documents with their distances.

        Rows are restricted to the configured collection and, when `filter`
        is given, to rows whose metadata matches every entry: a plain value
        is compared as text, and a dict with an "in" key (any letter case)
        is treated as a membership test.

        Raises:
            ValueError: If the configured collection does not exist.
        """

        def distance_expr() -> Any:
            # `<->` applied to the stored vector and the query vector
            # (presumably the extension's L2 distance operator — TODO confirm).
            return func.abs(EmbeddingStore.embedding.op("<->")(embedding))

        with Session(self._conn) as session:
            collection = self.get_collection(session)
            # Disable sequential scans for this session; this runs before the
            # collection check below.
            session.execute(sqlalchemy.text("SET enable_seqscan = off"))
            if not collection:
                raise ValueError("Collection not found")

            condition = EmbeddingStore.collection_id == collection.uuid
            if filter is not None:
                metadata_conditions = []
                for field, expected in filter.items():
                    wants_membership = isinstance(expected, dict) and "in" in map(
                        str.lower, expected
                    )
                    field_text = EmbeddingStore.cmetadata[field].astext
                    if wants_membership:
                        lowered = {
                            name.lower(): item for name, item in expected.items()
                        }
                        metadata_conditions.append(field_text.in_(lowered["in"]))
                    else:
                        metadata_conditions.append(field_text == str(expected))
                condition = sqlalchemy.and_(condition, *metadata_conditions)

            rows: List[QueryResult] = (
                session.query(EmbeddingStore, distance_expr().label("distance"))
                .filter(condition)
                .order_by(distance_expr().asc())
                .limit(k)
                .all()
            )

        scored_docs = []
        for row in rows:
            score = row.distance if self.embedding_function is not None else None
            document = Document(
                page_content=row.EmbeddingStore.document,
                metadata=row.EmbeddingStore.cmetadata,
            )
            scored_docs.append((document, score))
        return scored_docs
    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents closest to `embedding`, without scores.

        Delegates to `similarity_search_with_score_by_vector` and drops the
        score of each pair; extra keyword arguments are not forwarded.
        """
        scored = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter
        )
        documents = []
        for document, _score in scored:
            documents.append(document)
        return documents
    @classmethod
    def from_texts(
        cls: Type[PGEmbedding],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> PGEmbedding:
        """Embed `texts` with `embedding` and return a store containing them.

        Remaining keyword arguments are forwarded to
        `_initialize_from_embeddings`, which reads the connection string
        from them.
        """
        vectors = embedding.embed_documents(list(texts))
        return cls._initialize_from_embeddings(
            texts,
            vectors,
            embedding,
            metadatas=metadatas,
            ids=ids,
            collection_name=collection_name,
            pre_delete_collection=pre_delete_collection,
            **kwargs,
        )
    @classmethod
    def from_embeddings(
        cls,
        text_embeddings: List[Tuple[str, List[float]]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> PGEmbedding:
        """Return a store populated from (text, vector) pairs.

        Element 0 of each pair is used as the text and element 1 as its
        embedding. Remaining keyword arguments are forwarded to
        `_initialize_from_embeddings`.
        """
        texts = []
        vectors = []
        for pair in text_embeddings:
            texts.append(pair[0])
            vectors.append(pair[1])
        return cls._initialize_from_embeddings(
            texts,
            vectors,
            embedding,
            metadatas=metadatas,
            ids=ids,
            collection_name=collection_name,
            pre_delete_collection=pre_delete_collection,
            **kwargs,
        )
    @classmethod
    def from_existing_index(
        cls: Type[PGEmbedding],
        embedding: Embeddings,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> PGEmbedding:
        """Return a store bound to `collection_name` without adding documents.

        The connection string is resolved from `kwargs` via
        `get_connection_string`.
        """
        return cls(
            connection_string=cls.get_connection_string(kwargs),
            collection_name=collection_name,
            embedding_function=embedding,
            pre_delete_collection=pre_delete_collection,
        )
    @classmethod
    def get_connection_string(cls, kwargs: Dict[str, Any]) -> str:
        """Resolve the Postgres connection string.

        The value is looked up through `get_from_dict_or_env` using the
        `connection_string` key of `kwargs` and the
        `POSTGRES_CONNECTION_STRING` environment variable.

        Raises:
            ValueError: If the resolved connection string is empty.
        """
        connection_string: str = get_from_dict_or_env(
            data=kwargs,
            key="connection_string",
            env_key="POSTGRES_CONNECTION_STRING",
        )
        if not connection_string:
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own separating space.
            raise ValueError(
                "Postgres connection string is required. "
                "Either pass it as a parameter "
                "or set the POSTGRES_CONNECTION_STRING environment variable."
            )
        return connection_string
    @classmethod
    def from_documents(
        cls: Type[PGEmbedding],
        documents: List[Document],
        embedding: Embeddings,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> PGEmbedding:
        """Return a store populated from `documents`.

        Each document's `page_content` and `metadata` are passed to
        `from_texts`. The resolved connection string is written back into
        `kwargs` before they are forwarded.
        """
        texts = []
        metadatas = []
        for document in documents:
            texts.append(document.page_content)
        for document in documents:
            metadatas.append(document.metadata)
        kwargs["connection_string"] = cls.get_connection_string(kwargs)
        return cls.from_texts(
            texts=texts,
            pre_delete_collection=pre_delete_collection,
            embedding=embedding,
            metadatas=metadatas,
            ids=ids,
            collection_name=collection_name,
            **kwargs,
        )