You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

229 lines
7.5 KiB

"""
Pathway Vector Store client.

The Pathway Vector Server is a pipeline written in the Pathway framework which indexes
all files in a given folder, embeds them, and builds a vector index. The pipeline reacts
to changes in source files, automatically updating appropriate index entries.

The PathwayVectorClient implements the LangChain VectorStore interface and queries the
PathwayVectorServer to retrieve up-to-date documents.

You can use the client with managed instances of Pathway Vector Store, or run your own
instance as described in the Pathway documentation.
"""
import json
import logging
from typing import Any, Callable, Iterable, List, Optional, Tuple
import requests
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
# Copied from the Pathway project's vector store client code
# to remove dependency on Pathway library.
class _VectorStoreClient:
    """Thin HTTP client for a Pathway :py:class:`VectorStoreServer`.

    Each public method sends one JSON request to an endpoint below
    ``self.url`` and returns the decoded JSON body.

    NOTE(review): this class was reconstructed from a damaged copy of the file
    (indentation, docstring quotes and several short lines were missing). The
    ``requests.post`` calls and their payload arguments are inferred from the
    surviving ``headers=`` lines and the module's ``json``/``requests``
    imports -- confirm against the upstream Pathway client.
    """

    # Upper bound, in seconds, on how long a single request may block; without
    # a timeout ``requests`` waits for the server indefinitely.
    # NOTE(review): the value is not visible in the damaged source -- confirm.
    _REQUEST_TIMEOUT_SECONDS = 10

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        url: Optional[str] = None,
    ):
        """
        A client you can use to query :py:class:`VectorStoreServer`.

        Please provide either the `url`, or `host` and `port`.

        Args:
            - host: host on which :py:class:`VectorStoreServer` listens
            - port: port on which :py:class:`VectorStoreServer` listens
            - url: url at which :py:class:`VectorStoreServer` listens

        Raises:
            ValueError: if `url` is combined with `host`/`port`, or if neither
                `url` nor `host` is given.
        """
        err = "Either (`host` and `port`) or `url` must be provided, but not both."
        if url is not None:
            if host or port:
                raise ValueError(err)
            self.url = url
        else:
            if host is None:
                raise ValueError(err)
            # A missing (or zero) port falls back to the default HTTP port.
            port = port or 80
            self.url = f"http://{host}:{port}"

    def query(
        self, query: str, k: int = 3, metadata_filter: Optional[str] = None
    ) -> List[dict]:
        """
        Perform a query to the vector store and fetch results.

        Args:
            - query: text to search for
            - k: number of documents to be returned
            - metadata_filter: optional string representing the metadata filtering query
                in the JMESPath format. The search will happen only for documents
                satisfying this filtering.

        Returns:
            The decoded JSON response items, sorted by ascending ``"dist"``.
        """
        data = {"query": query, "k": k}
        # The filter key is only sent when a filter was actually requested.
        if metadata_filter is not None:
            data["metadata_filter"] = metadata_filter
        url = self.url + "/v1/retrieve"
        response = requests.post(
            url,
            data=json.dumps(data),
            headers={"Content-Type": "application/json"},
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        responses = response.json()
        return sorted(responses, key=lambda x: x["dist"])

    # Make an alias
    __call__ = query

    def get_vectorstore_statistics(self) -> dict:
        """Fetch basic statistics about the vector store."""
        url = self.url + "/v1/statistics"
        response = requests.post(
            url,
            json={},
            headers={"Content-Type": "application/json"},
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        responses = response.json()
        return responses

    def get_input_files(
        self,
        metadata_filter: Optional[str] = None,
        filepath_globpattern: Optional[str] = None,
    ) -> list:
        """
        Fetch information on documents in the vector store.

        Args:
            metadata_filter: optional string representing the metadata filtering query
                in the JMESPath format. The search will happen only for documents
                satisfying this filtering.
            filepath_globpattern: optional glob pattern specifying which documents
                will be searched for this query.

        Returns:
            The decoded JSON response of the ``/v1/inputs`` endpoint.
        """
        url = self.url + "/v1/inputs"
        # Both keys are always sent; an unset filter is transmitted as null.
        response = requests.post(
            url,
            json={
                "metadata_filter": metadata_filter,
                "filepath_globpattern": filepath_globpattern,
            },
            headers={"Content-Type": "application/json"},
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        responses = response.json()
        return responses
class PathwayVectorClient(VectorStore):
    """
    VectorStore connecting to Pathway Vector Store.

    The store is read-only from the client side: documents are retrieved
    through the wrapped ``_VectorStoreClient``, while ``add_texts`` and
    ``from_texts`` raise ``NotImplementedError``.

    NOTE(review): this class was reconstructed from a damaged copy of the file
    (indentation, docstring quotes and several short lines such as ``self,``
    were missing) -- confirm against the upstream implementation.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        url: Optional[str] = None,
    ) -> None:
        """
        A client you can use to query Pathway Vector Store.

        Please provide either the `url`, or `host` and `port`.

        Args:
            - host: host on which Pathway Vector Store listens
            - port: port on which Pathway Vector Store listens
            - url: url at which Pathway Vector Store listens
        """
        self.client = _VectorStoreClient(host, port, url)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Pathway is not suitable for this method."""
        raise NotImplementedError(
            "Pathway vector store does not support adding or removing texts"
            " from client."
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "PathwayVectorClient":
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Pathway vector store does not support initializing from_texts."
        )

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return the documents most similar to `query`.

        Args:
            - query: Query text to search for.
            - k: Number of results to return. Defaults to 4.
            - kwargs: only ``metadata_filter`` is used; any other keyword is
                ignored after a warning is logged.

        Returns:
            List[Document]: one document per result returned by the client.
        """
        metadata_filter = kwargs.pop("metadata_filter", None)
        # Anything left after removing the one supported keyword is unknown.
        if kwargs:
            logging.warning(
                "Unknown kwargs passed to PathwayVectorClient.similarity_search: %s",
                kwargs,
            )
        rets = self.client(query=query, k=k, metadata_filter=metadata_filter)
        return [
            Document(page_content=ret["text"], metadata=ret["metadata"]) for ret in rets
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        metadata_filter: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Run similarity search with Pathway with distance.

        Args:
            - query (str): Query text to search for.
            - k (int): Number of results to return. Defaults to 4.
            - metadata_filter (Optional[str]): Filter by metadata.
                Filtering query should be in JMESPath format. Defaults to None.

        Returns:
            List[Tuple[Document, float]]: List of documents most similar to
            the query text and cosine distance in float for each.
            Lower score represents more similarity.
        """
        rets = self.client(query=query, k=k, metadata_filter=metadata_filter)
        return [
            (Document(page_content=ret["text"], metadata=ret["metadata"]), ret["dist"])
            for ret in rets
        ]

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """Return the function that converts a distance to a relevance score."""
        return self._cosine_relevance_score_fn

    def get_vectorstore_statistics(self) -> dict:
        """Fetch basic statistics about the Vector Store."""
        return self.client.get_vectorstore_statistics()

    def get_input_files(
        self,
        metadata_filter: Optional[str] = None,
        filepath_globpattern: Optional[str] = None,
    ) -> list:
        """List files indexed by the Vector Store."""
        return self.client.get_input_files(metadata_filter, filepath_globpattern)