You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/vectorstores/timescalevector.py

883 lines
29 KiB
Python

"""VectorStore wrapper around a Postgres-TimescaleVector database."""
from __future__ import annotations
import enum
import logging
import uuid
from datetime import timedelta
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
Union,
)
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import DistanceStrategy
if TYPE_CHECKING:
from timescale_vector import Predicates
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE
ADA_TOKEN_COUNT = 1536
_LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain_store"
class TimescaleVector(VectorStore):
"""Timescale Postgres vector store
To use, you should have the ``timescale_vector`` python package installed.
Args:
service_url: Service url on timescale cloud.
embedding: Any embedding function implementing
`langchain.embeddings.base.Embeddings` interface.
collection_name: The name of the collection to use. (default: langchain_store)
This will become the table name used for the collection.
distance_strategy: The distance strategy to use. (default: COSINE)
pre_delete_collection: If True, will delete the collection if it exists.
(default: False). Useful for testing.
Example:
.. code-block:: python
from langchain_community.vectorstores import TimescaleVector
from langchain_community.embeddings.openai import OpenAIEmbeddings
SERVICE_URL = "postgres://tsdbadmin:<password>@<id>.tsdb.cloud.timescale.com:<port>/tsdb?sslmode=require"
COLLECTION_NAME = "state_of_the_union_test"
embeddings = OpenAIEmbeddings()
vectorestore = TimescaleVector.from_documents(
embedding=embeddings,
documents=docs,
collection_name=COLLECTION_NAME,
service_url=SERVICE_URL,
)
""" # noqa: E501
def __init__(
self,
service_url: str,
embedding: Embeddings,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
num_dimensions: int = ADA_TOKEN_COUNT,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
pre_delete_collection: bool = False,
logger: Optional[logging.Logger] = None,
relevance_score_fn: Optional[Callable[[float], float]] = None,
time_partition_interval: Optional[timedelta] = None,
**kwargs: Any,
) -> None:
try:
from timescale_vector import client
except ImportError:
raise ImportError(
"Could not import timescale_vector python package. "
"Please install it with `pip install timescale-vector`."
)
self.service_url = service_url
self.embedding = embedding
self.collection_name = collection_name
self.num_dimensions = num_dimensions
self._distance_strategy = distance_strategy
self.pre_delete_collection = pre_delete_collection
self.logger = logger or logging.getLogger(__name__)
self.override_relevance_score_fn = relevance_score_fn
self._time_partition_interval = time_partition_interval
self.sync_client = client.Sync(
self.service_url,
self.collection_name,
self.num_dimensions,
self._distance_strategy.value.lower(),
time_partition_interval=self._time_partition_interval,
**kwargs,
)
self.async_client = client.Async(
self.service_url,
self.collection_name,
self.num_dimensions,
self._distance_strategy.value.lower(),
time_partition_interval=self._time_partition_interval,
**kwargs,
)
self.__post_init__()
def __post_init__(
self,
) -> None:
"""
Initialize the store.
"""
self.sync_client.create_tables()
if self.pre_delete_collection:
self.sync_client.delete_all()
@property
def embeddings(self) -> Embeddings:
return self.embedding
def drop_tables(self) -> None:
self.sync_client.drop_table()
@classmethod
def __from(
cls,
texts: List[str],
embeddings: List[List[float]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
service_url: Optional[str] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
num_dimensions = len(embeddings[0])
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
if service_url is None:
service_url = cls.get_service_url(kwargs)
store = cls(
service_url=service_url,
num_dimensions=num_dimensions,
collection_name=collection_name,
embedding=embedding,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
store.add_embeddings(
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
)
return store
@classmethod
async def __afrom(
cls,
texts: List[str],
embeddings: List[List[float]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
service_url: Optional[str] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
num_dimensions = len(embeddings[0])
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
if service_url is None:
service_url = cls.get_service_url(kwargs)
store = cls(
service_url=service_url,
num_dimensions=num_dimensions,
collection_name=collection_name,
embedding=embedding,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
await store.aadd_embeddings(
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
)
return store
def add_embeddings(
self,
texts: Iterable[str],
embeddings: List[List[float]],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Add embeddings to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
embeddings: List of list of embedding vectors.
metadatas: List of metadatas associated with the texts.
kwargs: vectorstore specific parameters
"""
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
records = list(zip(ids, metadatas, texts, embeddings))
self.sync_client.upsert(records)
return ids
async def aadd_embeddings(
self,
texts: Iterable[str],
embeddings: List[List[float]],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Add embeddings to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
embeddings: List of list of embedding vectors.
metadatas: List of metadatas associated with the texts.
kwargs: vectorstore specific parameters
"""
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
records = list(zip(ids, metadatas, texts, embeddings))
await self.async_client.upsert(records)
return ids
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
kwargs: vectorstore specific parameters
Returns:
List of ids from adding the texts into the vectorstore.
"""
embeddings = self.embedding.embed_documents(list(texts))
return self.add_embeddings(
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
)
async def aadd_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
kwargs: vectorstore specific parameters
Returns:
List of ids from adding the texts into the vectorstore.
"""
embeddings = self.embedding.embed_documents(list(texts))
return await self.aadd_embeddings(
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
)
def _embed_query(self, query: str) -> Optional[List[float]]:
# an empty query should not be embedded
if query is None or query == "" or query.isspace():
return None
else:
return self.embedding.embed_query(query)
def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search with TimescaleVector with distance.
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query.
"""
embedding = self._embed_query(query)
return self.similarity_search_by_vector(
embedding=embedding,
k=k,
filter=filter,
predicates=predicates,
**kwargs,
)
async def asimilarity_search(
self,
query: str,
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search with TimescaleVector with distance.
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query.
"""
embedding = self._embed_query(query)
return await self.asimilarity_search_by_vector(
embedding=embedding,
k=k,
filter=filter,
predicates=predicates,
**kwargs,
)
def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query and score for each
"""
embedding = self._embed_query(query)
docs = self.similarity_search_with_score_by_vector(
embedding=embedding,
k=k,
filter=filter,
predicates=predicates,
**kwargs,
)
return docs
async def asimilarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query and score for each
"""
embedding = self._embed_query(query)
return await self.asimilarity_search_with_score_by_vector(
embedding=embedding,
k=k,
filter=filter,
predicates=predicates,
**kwargs,
)
def date_to_range_filter(self, **kwargs: Any) -> Any:
constructor_args = {
key: kwargs[key]
for key in [
"start_date",
"end_date",
"time_delta",
"start_inclusive",
"end_inclusive",
]
if key in kwargs
}
if not constructor_args or len(constructor_args) == 0:
return None
try:
from timescale_vector import client
except ImportError:
raise ImportError(
"Could not import timescale_vector python package. "
"Please install it with `pip install timescale-vector`."
)
return client.UUIDTimeRange(**constructor_args)
def similarity_search_with_score_by_vector(
self,
embedding: Optional[List[float]],
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
try:
from timescale_vector import client
except ImportError:
raise ImportError(
"Could not import timescale_vector python package. "
"Please install it with `pip install timescale-vector`."
)
results = self.sync_client.search(
embedding,
limit=k,
filter=filter,
predicates=predicates,
uuid_time_filter=self.date_to_range_filter(**kwargs),
)
docs = [
(
Document(
page_content=result[client.SEARCH_RESULT_CONTENTS_IDX],
metadata=result[client.SEARCH_RESULT_METADATA_IDX],
),
result[client.SEARCH_RESULT_DISTANCE_IDX],
)
for result in results
]
return docs
async def asimilarity_search_with_score_by_vector(
self,
embedding: Optional[List[float]],
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
try:
from timescale_vector import client
except ImportError:
raise ImportError(
"Could not import timescale_vector python package. "
"Please install it with `pip install timescale-vector`."
)
results = await self.async_client.search(
embedding,
limit=k,
filter=filter,
predicates=predicates,
uuid_time_filter=self.date_to_range_filter(**kwargs),
)
docs = [
(
Document(
page_content=result[client.SEARCH_RESULT_CONTENTS_IDX],
metadata=result[client.SEARCH_RESULT_METADATA_IDX],
),
result[client.SEARCH_RESULT_DISTANCE_IDX],
)
for result in results
]
return docs
def similarity_search_by_vector(
self,
embedding: Optional[List[float]],
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query vector.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, predicates=predicates, **kwargs
)
return [doc for doc, _ in docs_and_scores]
async def asimilarity_search_by_vector(
self,
embedding: Optional[List[float]],
k: int = 4,
filter: Optional[Union[dict, list]] = None,
predicates: Optional[Predicates] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query vector.
"""
docs_and_scores = await self.asimilarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, predicates=predicates, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@classmethod
def from_texts(
cls: Type[TimescaleVector],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
ids: Optional[List[str]] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
"""
Return VectorStore initialized from texts and embeddings.
Postgres connection string is required
"Either pass it as a parameter
or set the TIMESCALE_SERVICE_URL environment variable.
"""
embeddings = embedding.embed_documents(list(texts))
return cls.__from(
texts,
embeddings,
embedding,
metadatas=metadatas,
ids=ids,
collection_name=collection_name,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
@classmethod
async def afrom_texts(
cls: Type[TimescaleVector],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
ids: Optional[List[str]] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
"""
Return VectorStore initialized from texts and embeddings.
Postgres connection string is required
"Either pass it as a parameter
or set the TIMESCALE_SERVICE_URL environment variable.
"""
embeddings = embedding.embed_documents(list(texts))
return await cls.__afrom(
texts,
embeddings,
embedding,
metadatas=metadatas,
ids=ids,
collection_name=collection_name,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
@classmethod
def from_embeddings(
cls,
text_embeddings: List[Tuple[str, List[float]]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
ids: Optional[List[str]] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
"""Construct TimescaleVector wrapper from raw documents and pre-
generated embeddings.
Return VectorStore initialized from documents and embeddings.
Postgres connection string is required
"Either pass it as a parameter
or set the TIMESCALE_SERVICE_URL environment variable.
Example:
.. code-block:: python
from langchain_community.vectorstores import TimescaleVector
from langchain_community.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts)
text_embedding_pairs = list(zip(texts, text_embeddings))
tvs = TimescaleVector.from_embeddings(text_embedding_pairs, embeddings)
"""
texts = [t[0] for t in text_embeddings]
embeddings = [t[1] for t in text_embeddings]
return cls.__from(
texts,
embeddings,
embedding,
metadatas=metadatas,
ids=ids,
collection_name=collection_name,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
@classmethod
async def afrom_embeddings(
cls,
text_embeddings: List[Tuple[str, List[float]]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
ids: Optional[List[str]] = None,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
"""Construct TimescaleVector wrapper from raw documents and pre-
generated embeddings.
Return VectorStore initialized from documents and embeddings.
Postgres connection string is required
"Either pass it as a parameter
or set the TIMESCALE_SERVICE_URL environment variable.
Example:
.. code-block:: python
from langchain_community.vectorstores import TimescaleVector
from langchain_community.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts)
text_embedding_pairs = list(zip(texts, text_embeddings))
tvs = TimescaleVector.from_embeddings(text_embedding_pairs, embeddings)
"""
texts = [t[0] for t in text_embeddings]
embeddings = [t[1] for t in text_embeddings]
return await cls.__afrom(
texts,
embeddings,
embedding,
metadatas=metadatas,
ids=ids,
collection_name=collection_name,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
**kwargs,
)
@classmethod
def from_existing_index(
cls: Type[TimescaleVector],
embedding: Embeddings,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
pre_delete_collection: bool = False,
**kwargs: Any,
) -> TimescaleVector:
"""
Get instance of an existing TimescaleVector store.This method will
return the instance of the store without inserting any new
embeddings
"""
service_url = cls.get_service_url(kwargs)
store = cls(
service_url=service_url,
collection_name=collection_name,
embedding=embedding,
distance_strategy=distance_strategy,
pre_delete_collection=pre_delete_collection,
)
return store
@classmethod
def get_service_url(cls, kwargs: Dict[str, Any]) -> str:
service_url: str = get_from_dict_or_env(
data=kwargs,
key="service_url",
env_key="TIMESCALE_SERVICE_URL",
)
if not service_url:
raise ValueError(
"Postgres connection string is required"
"Either pass it as a parameter"
"or set the TIMESCALE_SERVICE_URL environment variable."
)
return service_url
@classmethod
def service_url_from_db_params(
cls,
host: str,
port: int,
database: str,
user: str,
password: str,
) -> str:
"""Return connection string from database parameters."""
return f"postgresql://{user}:{password}@{host}:{port}/{database}"
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""
if self.override_relevance_score_fn is not None:
return self.override_relevance_score_fn
# Default strategy is to rely on distance strategy provided
# in vectorstore constructor
if self._distance_strategy == DistanceStrategy.COSINE:
return self._cosine_relevance_score_fn
elif self._distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
return self._euclidean_relevance_score_fn
elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
return self._max_inner_product_relevance_score_fn
else:
raise ValueError(
"No supported normalization function"
f" for distance_strategy of {self._distance_strategy}."
"Consider providing relevance_score_fn to TimescaleVector constructor."
)
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
"""Delete by vector ID or other criteria.
Args:
ids: List of ids to delete.
**kwargs: Other keyword arguments that subclasses might use.
Returns:
Optional[bool]: True if deletion is successful,
False otherwise, None if not implemented.
"""
if ids is None:
raise ValueError("No ids provided to delete.")
self.sync_client.delete_by_ids(ids)
return True
# todo should this be part of delete|()?
def delete_by_metadata(
self, filter: Union[Dict[str, str], List[Dict[str, str]]], **kwargs: Any
) -> Optional[bool]:
"""Delete by vector ID or other criteria.
Args:
ids: List of ids to delete.
**kwargs: Other keyword arguments that subclasses might use.
Returns:
Optional[bool]: True if deletion is successful,
False otherwise, None if not implemented.
"""
self.sync_client.delete_by_metadata(filter)
return True
class IndexType(str, enum.Enum):
"""Enumerator for the supported Index types"""
TIMESCALE_VECTOR = "tsv"
PGVECTOR_IVFFLAT = "ivfflat"
PGVECTOR_HNSW = "hnsw"
DEFAULT_INDEX_TYPE = IndexType.TIMESCALE_VECTOR
def create_index(
self, index_type: Union[IndexType, str] = DEFAULT_INDEX_TYPE, **kwargs: Any
) -> None:
try:
from timescale_vector import client
except ImportError:
raise ImportError(
"Could not import timescale_vector python package. "
"Please install it with `pip install timescale-vector`."
)
index_type = (
index_type.value if isinstance(index_type, self.IndexType) else index_type
)
if index_type == self.IndexType.PGVECTOR_IVFFLAT.value:
self.sync_client.create_embedding_index(client.IvfflatIndex(**kwargs))
if index_type == self.IndexType.PGVECTOR_HNSW.value:
self.sync_client.create_embedding_index(client.HNSWIndex(**kwargs))
if index_type == self.IndexType.TIMESCALE_VECTOR.value:
self.sync_client.create_embedding_index(
client.TimescaleVectorIndex(**kwargs)
)
def drop_index(self) -> None:
self.sync_client.drop_embedding_index()