mirror of https://github.com/hwchase17/langchain
Add neo4j vector support (#9770)
Neo4j recently added vector index integration. To support both ingestion and use in vector RAG applications, I wrapped it as a vector store, since the implementation is completely different from `GraphCypherQAChain`. Here, we are not generating any Cypher statements at query time; we are simply doing a vector similarity search using the new vector index, as if we were dealing with a vector database. --------- Co-authored-by: Bagatur <baskaryan@gmail.com> pull/9938/head
parent
49ebbe4bcd
commit
db13fba7ea
@ -0,0 +1,685 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
)
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.utils import get_from_env
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import DistanceStrategy
|
||||
|
||||
# Strategy used whenever a caller does not choose one explicitly.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE

# Translates a supported DistanceStrategy into the similarity-metric name
# that is sent to Neo4j when a new vector index is created.
distance_mapping: Dict[DistanceStrategy, str] = {
    DistanceStrategy.EUCLIDEAN_DISTANCE: "euclidean",
    DistanceStrategy.COSINE: "cosine",
}
|
||||
|
||||
|
||||
def check_if_not_null(props: List[str], values: List[Any]) -> None:
    """Validate that every value paired with a property name is truthy.

    Args:
        props: Parameter names, used only for the error message.
        values: Values to validate, paired positionally with ``props``.

    Raises:
        ValueError: For the first pair whose value is falsy.
    """
    for name, candidate in zip(props, values):
        if candidate:
            continue
        raise ValueError(f"Parameter `{name}` must not be None or empty string")
|
||||
|
||||
|
||||
def sort_by_index_name(
    lst: List[Dict[str, Any]], index_name: str
) -> List[Dict[str, Any]]:
    """Return a copy of ``lst`` with entries matching ``index_name`` first.

    The index rows this module reads from ``SHOW INDEXES`` expose the index
    name under the ``"name"`` key, so that key is compared. The legacy
    ``"index_name"`` key is still honoured for backward compatibility.

    Args:
        lst: Index descriptions, one dictionary per index.
        index_name: Name of the index that should be sorted to the front.

    Returns:
        A new list; the relative order of all other entries is preserved
        because ``sorted`` is stable. The input list is not modified.
    """

    def _is_other_index(row: Dict[str, Any]) -> bool:
        # False sorts before True, so matching rows come first.
        return (
            row.get("name") != index_name and row.get("index_name") != index_name
        )

    return sorted(lst, key=_is_other_index)
|
||||
|
||||
|
||||
class Neo4jVector(VectorStore):
    """`Neo4j` vector index.

    To use, you should have the ``neo4j`` python package installed.

    Args:
        url: Neo4j connection url
        username: Neo4j username.
        password: Neo4j password
        database: Optionally provide Neo4j database
                  Defaults to "neo4j"
        embedding: Any embedding function implementing
            `langchain.embeddings.base.Embeddings` interface.
        distance_strategy: The distance strategy to use. (default: COSINE)
        pre_delete_collection: If True, will delete existing data if it exists.
            (default: False). Useful for testing.

    Example:
        .. code-block:: python

            from langchain.vectorstores.neo4j_vector import Neo4jVector
            from langchain.embeddings.openai import OpenAIEmbeddings

            url="bolt://localhost:7687"
            username="neo4j"
            password="pleaseletmein"
            embeddings = OpenAIEmbeddings()
            vectorstore = Neo4jVector.from_documents(
                embedding=embeddings,
                documents=docs,
                url=url,
                username=username,
                password=password,
            )

    """

    def __init__(
        self,
        embedding: Embeddings,
        *,
        username: Optional[str] = None,
        password: Optional[str] = None,
        url: Optional[str] = None,
        database: str = "neo4j",
        index_name: str = "vector",
        node_label: str = "Chunk",
        embedding_node_property: str = "embedding",
        text_node_property: str = "text",
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        logger: Optional[logging.Logger] = None,
        pre_delete_collection: bool = False,
        retrieval_query: str = "",
        relevance_score_fn: Optional[Callable[[float], float]] = None,
    ) -> None:
        """Connect to Neo4j, validate the configuration and store settings.

        Raises:
            ImportError: If the ``neo4j`` package is not installed.
            ValueError: If the distance strategy is unsupported, the database
                cannot be reached or authenticated against, the server version
                is too old, or a required name is empty.
        """
        try:
            import neo4j
        except ImportError as e:
            raise ImportError(
                "Could not import neo4j python package. "
                "Please install it with `pip install neo4j`."
            ) from e

        # Allow only cosine and euclidean distance strategies
        if distance_strategy not in [
            DistanceStrategy.EUCLIDEAN_DISTANCE,
            DistanceStrategy.COSINE,
        ]:
            raise ValueError(
                "distance_strategy must be either 'EUCLIDEAN_DISTANCE' or 'COSINE'"
            )

        # Handle if the credentials are environment variables
        url = get_from_env("url", "NEO4J_URL", url)
        username = get_from_env("username", "NEO4J_USERNAME", username)
        password = get_from_env("password", "NEO4J_PASSWORD", password)
        database = get_from_env("database", "NEO4J_DATABASE", database)

        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
        self._database = database
        self.schema = ""
        # Verify connection; release the driver if it is unusable so that a
        # failed construction does not leak it.
        try:
            self._driver.verify_connectivity()
        except neo4j.exceptions.ServiceUnavailable as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the url is correct"
            ) from e
        except neo4j.exceptions.AuthError as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the username and password are correct"
            ) from e

        # Verify if the version support vector index
        self.verify_version()

        # Verify that required values are not null
        check_if_not_null(
            [
                "index_name",
                "node_label",
                "embedding_node_property",
                "text_node_property",
            ],
            [index_name, node_label, embedding_node_property, text_node_property],
        )

        self.embedding = embedding
        self._distance_strategy = distance_strategy
        self.index_name = index_name
        self.node_label = node_label
        self.embedding_node_property = embedding_node_property
        self.text_node_property = text_node_property
        self.logger = logger or logging.getLogger(__name__)
        self.override_relevance_score_fn = relevance_score_fn
        self.retrieval_query = retrieval_query
        # Calculate embedding dimension by embedding a throwaway query
        self.embedding_dimension = len(embedding.embed_query("foo"))

        # Delete existing data if flagged
        if pre_delete_collection:
            from neo4j.exceptions import DatabaseError

            self.query(
                f"MATCH (n:`{self.node_label}`) "
                "CALL { WITH n DETACH DELETE n } "
                "IN TRANSACTIONS OF 10000 ROWS;"
            )
            # Delete index; the name is backtick-quoted like the node label
            # above so that names which are not plain identifiers still parse.
            try:
                self.query(f"DROP INDEX `{self.index_name}`")
            except DatabaseError:  # Index didn't exist yet
                pass

    def query(
        self, query: str, *, params: Optional[dict] = None
    ) -> List[Dict[str, Any]]:
        """
        This method sends a Cypher query to the connected Neo4j database
        and returns the results as a list of dictionaries.

        Args:
            query (str): The Cypher query to execute.
            params (dict, optional): Dictionary of query parameters.
                ``None`` (the default) is treated as no parameters.

        Returns:
            List[Dict[str, Any]]: List of dictionaries containing the query results.

        Raises:
            ValueError: If Neo4j reports a Cypher syntax error.
        """
        from neo4j.exceptions import CypherSyntaxError

        params = params or {}
        with self._driver.session(database=self._database) as session:
            try:
                data = session.run(query, params)
                return [r.data() for r in data]
            except CypherSyntaxError as e:
                raise ValueError(f"Cypher Statement is not valid\n{e}") from e

    def verify_version(self) -> None:
        """
        Check if the connected Neo4j database version supports vector indexing.

        Queries the Neo4j database to retrieve its version and compares it
        against a target version (5.11.0) that is known to support vector
        indexing. Raises a ValueError if the connected Neo4j version is
        not supported.
        """
        version = self.query("CALL dbms.components()")[0]["versions"][0]
        # Keep only the numeric part before any "-suffix" (for example the
        # "-aura" marker the original code special-cased) and pad missing
        # components with zeros so "5.11" compares equal to "5.11.0".
        version_tuple = tuple(int(part) for part in version.split("-")[0].split("."))
        version_tuple += (0,) * (3 - len(version_tuple))

        target_version = (5, 11, 0)

        if version_tuple < target_version:
            raise ValueError(
                "Vector index is only supported in Neo4j version 5.11 or greater"
            )

    def retrieve_existing_index(self) -> Optional[int]:
        """
        Check if the vector index exists in the Neo4j database
        and returns its embedding dimension.

        This method queries the Neo4j database for vector indexes that match
        either the configured index name or the configured node label and
        embedding property. If one is found, ``index_name``, ``node_label``
        and ``embedding_node_property`` on this instance are overwritten with
        the values of the first candidate and its dimension is returned.
        If none is found, `None` is returned.

        Returns:
            int or None: The embedding dimension of the existing index if found.
        """

        index_information = self.query(
            "SHOW INDEXES YIELD name, type, labelsOrTypes, properties, options "
            "WHERE type = 'VECTOR' AND (name = $index_name "
            "OR (labelsOrTypes[0] = $node_label AND "
            "properties[0] = $embedding_node_property)) "
            "RETURN name, labelsOrTypes, properties, options ",
            params={
                "index_name": self.index_name,
                "node_label": self.node_label,
                "embedding_node_property": self.embedding_node_property,
            },
        )
        # sort by index_name
        index_information = sort_by_index_name(index_information, self.index_name)
        try:
            self.index_name = index_information[0]["name"]
            self.node_label = index_information[0]["labelsOrTypes"][0]
            self.embedding_node_property = index_information[0]["properties"][0]
            embedding_dimension = index_information[0]["options"]["indexConfig"][
                "vector.dimensions"
            ]

            return embedding_dimension
        except IndexError:
            # No candidate rows were returned.
            return None

    def create_new_index(self) -> None:
        """
        This method constructs a Cypher query and executes it
        to create a new vector index in Neo4j.
        """
        index_query = (
            "CALL db.index.vector.createNodeIndex("
            "$index_name,"
            "$node_label,"
            "$embedding_node_property,"
            "toInteger($embedding_dimension),"
            "$similarity_metric )"
        )

        parameters = {
            "index_name": self.index_name,
            "node_label": self.node_label,
            "embedding_node_property": self.embedding_node_property,
            "embedding_dimension": self.embedding_dimension,
            "similarity_metric": distance_mapping[self._distance_strategy],
        }
        self.query(index_query, params=parameters)

    @property
    def embeddings(self) -> Embeddings:
        """The embedding object this store was constructed with."""
        return self.embedding

    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        create_id_index: bool = True,
        **kwargs: Any,
    ) -> Neo4jVector:
        """Create a store, ensure the vector index exists and insert the data.

        Raises:
            ValueError: If a matching index exists with a dimension different
                from the one produced by ``embedding``.
        """
        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]

        if not metadatas:
            metadatas = [{} for _ in texts]

        store = cls(
            embedding=embedding,
            **kwargs,
        )

        # Check if the index already exists
        embedding_dimension = store.retrieve_existing_index()

        # If the index doesn't exist yet
        if not embedding_dimension:
            store.create_new_index()
        # If the index already exists, check if embedding dimensions match
        elif not store.embedding_dimension == embedding_dimension:
            raise ValueError(
                f"Index with name {store.index_name} already exists. "
                "The provided embedding function and vector index "
                "dimensions do not match.\n"
                f"Embedding function dimension: {store.embedding_dimension}\n"
                f"Vector index dimension: {embedding_dimension}"
            )

        # Create unique constraint for faster import
        if create_id_index:
            store.query(
                "CREATE CONSTRAINT IF NOT EXISTS "
                f"FOR (n:`{store.node_label}`) REQUIRE n.id IS UNIQUE;"
            )

        store.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
        )

        return store

    def add_embeddings(
        self,
        texts: Iterable[str],
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add embeddings to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            embeddings: List of list of embedding vectors.
            metadatas: List of metadatas associated with the texts.
            ids: Optional list of node ids; random UUIDs are generated
                when omitted.
            kwargs: vectorstore specific parameters

        Returns:
            The ids used for the inserted nodes.
        """
        # Materialize once: the texts are traversed several times below and a
        # one-shot iterator would otherwise be exhausted after the first pass.
        texts = list(texts)

        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]

        if not metadatas:
            metadatas = [{} for _ in texts]

        import_query = (
            "UNWIND $data AS row "
            "CALL { WITH row "
            f"MERGE (c:`{self.node_label}` {{id: row.id}}) "
            "WITH c, row "
            f"CALL db.create.setVectorProperty(c, "
            f"'{self.embedding_node_property}', row.embedding) "
            "YIELD node "
            f"SET c.`{self.text_node_property}` = row.text "
            "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS"
        )

        parameters = {
            "data": [
                {
                    "text": text,
                    "metadata": metadata,
                    "embedding": vector,
                    "id": node_id,
                }
                for text, metadata, vector, node_id in zip(
                    texts, metadatas, embeddings, ids
                )
            ]
        }

        self.query(import_query, params=parameters)

        return ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of node ids for the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Materialize once so the same items are embedded and stored even
        # when a generator is passed in.
        texts = list(texts)
        embeddings = self.embedding.embed_documents(texts)
        return self.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
        )

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Run similarity search with Neo4jVector.

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        embedding = self.embedding.embed_query(text=query)
        return self.similarity_search_by_vector(
            embedding=embedding,
            k=k,
        )

    def similarity_search_with_score(
        self, query: str, k: int = 4
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query and score for each
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(embedding=embedding, k=k)
        return docs

    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4
    ) -> List[Tuple[Document, float]]:
        """
        Perform a similarity search in the Neo4j database using a
        given vector and return the top k similar documents with their scores.

        This method uses a Cypher query to find the top k documents that
        are most similar to a given embedding. The similarity is measured
        using a vector index in the Neo4j database. The results are returned
        as a list of tuples, each containing a Document object and
        its similarity score.

        Args:
            embedding (List[float]): The embedding vector to compare against.
            k (int, optional): The number of top similar documents to retrieve.

        Returns:
            List[Tuple[Document, float]]: A list of tuples, each containing
                a Document object and its similarity score.
        """
        # The default projection returns the text plus every other node
        # property as metadata; text, embedding and id are nulled out here and
        # dropped again when the Document is built below.
        default_retrieval = (
            f"RETURN node.`{self.text_node_property}` AS text, score, "
            f"node {{.*, `{self.text_node_property}`: Null, "
            f"`{self.embedding_node_property}`: Null, id: Null }} AS metadata"
        )

        retrieval_query = (
            self.retrieval_query if self.retrieval_query else default_retrieval
        )

        read_query = (
            "CALL db.index.vector.queryNodes($index, $k, $embedding) "
            "YIELD node, score "
        ) + retrieval_query

        parameters = {"index": self.index_name, "k": k, "embedding": embedding}

        results = self.query(read_query, params=parameters)

        docs = [
            (
                Document(
                    page_content=result["text"],
                    metadata={
                        key: value
                        for key, value in result["metadata"].items()
                        if value is not None
                    },
                ),
                result["score"],
            )
            for result in results
        ]
        return docs

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k
        )
        return [doc for doc, _ in docs_and_scores]

    @classmethod
    def from_texts(
        cls: Type[Neo4jVector],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Neo4jVector:
        """
        Return Neo4jVector initialized from texts and embeddings.
        Neo4j credentials are required in the form of `url`, `username`,
        and `password` and optional `database` parameters.
        """
        embeddings = embedding.embed_documents(list(texts))

        return cls.__from(
            texts,
            embeddings,
            embedding,
            metadatas=metadatas,
            ids=ids,
            distance_strategy=distance_strategy,
            **kwargs,
        )

    @classmethod
    def from_embeddings(
        cls,
        text_embeddings: List[Tuple[str, List[float]]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> Neo4jVector:
        """Construct Neo4jVector wrapper from raw documents and pre-
        generated embeddings.

        Return Neo4jVector initialized from documents and embeddings.
        Neo4j credentials are required in the form of `url`, `username`,
        and `password` and optional `database` parameters.

        Example:
            .. code-block:: python

                from langchain.vectorstores.neo4j_vector import Neo4jVector
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                text_embeddings = embeddings.embed_documents(texts)
                text_embedding_pairs = list(zip(texts, text_embeddings))
                vectorstore = Neo4jVector.from_embeddings(
                    text_embedding_pairs, embeddings)
        """
        texts = [t[0] for t in text_embeddings]
        embeddings = [t[1] for t in text_embeddings]

        return cls.__from(
            texts,
            embeddings,
            embedding,
            metadatas=metadatas,
            ids=ids,
            distance_strategy=distance_strategy,
            pre_delete_collection=pre_delete_collection,
            **kwargs,
        )

    @classmethod
    def from_existing_index(
        cls: Type[Neo4jVector],
        embedding: Embeddings,
        index_name: str,
        **kwargs: Any,
    ) -> Neo4jVector:
        """
        Get instance of an existing Neo4j vector index. This method will
        return the instance of the store without inserting any new
        embeddings.
        Neo4j credentials are required in the form of `url`, `username`,
        and `password` and optional `database` parameters along with
        the `index_name` definition.

        Raises:
            ValueError: If no matching vector index is found, or its dimension
                differs from the one produced by ``embedding``.
        """

        store = cls(
            embedding=embedding,
            index_name=index_name,
            **kwargs,
        )

        embedding_dimension = store.retrieve_existing_index()

        if not embedding_dimension:
            raise ValueError(
                "The specified vector index name does not exist. "
                "Make sure to check if you spelled it correctly"
            )

        # Check if embedding function and vector index dimensions match
        if not store.embedding_dimension == embedding_dimension:
            raise ValueError(
                "The provided embedding function and vector index "
                "dimensions do not match.\n"
                f"Embedding function dimension: {store.embedding_dimension}\n"
                f"Vector index dimension: {embedding_dimension}"
            )

        return store

    @classmethod
    def from_documents(
        cls: Type[Neo4jVector],
        documents: List[Document],
        embedding: Embeddings,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Neo4jVector:
        """
        Return Neo4jVector initialized from documents and embeddings.
        Neo4j credentials are required in the form of `url`, `username`,
        and `password` and optional `database` parameters.
        """

        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]

        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            distance_strategy=distance_strategy,
            metadatas=metadatas,
            ids=ids,
            **kwargs,
        )

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """
        The 'correct' relevance function
        may differ depending on a few things, including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
        - embedding dimensionality
        - etc.

        A ``relevance_score_fn`` passed to the constructor takes precedence;
        otherwise the score is returned unchanged for both supported
        distance strategies.
        """
        if self.override_relevance_score_fn is not None:
            return self.override_relevance_score_fn

        # Default strategy is to rely on distance strategy provided
        # in vectorstore constructor
        if self._distance_strategy == DistanceStrategy.COSINE:
            return lambda x: x
        elif self._distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            return lambda x: x
        else:
            raise ValueError(
                "No supported normalization function"
                f" for distance_strategy of {self._distance_strategy}. "
                "Consider providing relevance_score_fn to Neo4jVector constructor."
            )
|
@ -0,0 +1,12 @@
|
||||
# Single Neo4j instance used by the Neo4jVector integration tests.
version: "3.8"
services:
  neo4j:
    # 5.11.0 is the minimum version accepted by Neo4jVector.verify_version.
    image: neo4j:5.11.0
    restart: on-failure:0
    hostname: neo4j-test
    container_name: neo4j-test
    ports:
      # Quoted so YAML always reads the mappings as strings.
      - "7474:7474"
      - "7687:7687"
    environment:
      # Credentials match the defaults used by the integration tests.
      - NEO4J_AUTH=neo4j/pleaseletmein
|
@ -0,0 +1,333 @@
|
||||
"""Test Neo4jVector functionality."""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores import Neo4jVector
|
||||
from langchain.vectorstores.utils import DistanceStrategy
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
# Connection settings: each value is read from the environment and falls back
# to the literal default given here when the variable is not set.
url = os.getenv("NEO4J_URL", "bolt://localhost:7687")
username = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "pleaseletmein")

# Length of every fake embedding vector produced in this module.
OS_TOKEN_COUNT = 1536

# Sample corpus shared by the tests and by the fake query embedder.
texts = ["foo", "bar", "baz"]

"""
cd tests/integration_tests/vectorstores/docker-compose
docker-compose -f neo4j.yml up
"""
|
||||
|
||||
|
||||
def drop_vector_indexes(store: "Neo4jVector") -> None:
    """Cleanup all vector indexes.

    Lists every index of type ``VECTOR`` through ``store.query`` and drops
    each one by name.

    Args:
        store: Object whose ``query`` method runs Cypher against the database.
    """
    all_indexes = store.query(
        """
            SHOW INDEXES YIELD name, type
            WHERE type = "VECTOR"
            RETURN name
        """
    )
    for index in all_indexes:
        # Backtick-quote the name so indexes whose names are not plain
        # identifiers (spaces, dashes, ...) can still be dropped.
        store.query(f"DROP INDEX `{index['name']}`")
|
||||
|
||||
|
||||
class FakeEmbeddingsWithOsDimension(FakeEmbeddings):
    """Deterministic fake embedder producing OS_TOKEN_COUNT-long vectors."""

    def embed_documents(self, embedding_texts: List[str]) -> List[List[float]]:
        """Embed each text as all ones, ending with its 1-based position."""
        ones = [float(1.0)] * (OS_TOKEN_COUNT - 1)
        return [
            ones + [float(position)]
            for position, _ in enumerate(embedding_texts, start=1)
        ]

    def embed_query(self, text: str) -> List[float]:
        """Embed ``text`` as all ones, ending with its 1-based index in ``texts``.

        ``texts.index`` raises ``ValueError`` when ``text`` is not one of the
        module-level sample texts.
        """
        last_component = float(texts.index(text) + 1)
        return [float(1.0)] * (OS_TOKEN_COUNT - 1) + [last_component]
|
||||
|
||||
|
||||
def test_neo4jvector() -> None:
    """Build a store from the sample texts and check the top hit for "foo"."""
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_euclidean() -> None:
    """Same as the basic search test, but with the euclidean strategy."""
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        **credentials,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_embeddings() -> None:
    """Build a store from precomputed (text, embedding) pairs and search it."""
    vectors = FakeEmbeddingsWithOsDimension().embed_documents(texts)
    pairs = list(zip(texts, vectors))
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_embeddings(
        text_embeddings=pairs,
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_catch_wrong_index_name() -> None:
    """An unknown index name still resolves when label and property match."""
    vectors = FakeEmbeddingsWithOsDimension().embed_documents(texts)
    pairs = list(zip(texts, vectors))
    credentials = {"url": url, "username": username, "password": password}
    # Populate the database under the default index name.
    Neo4jVector.from_embeddings(
        text_embeddings=pairs,
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )
    # Re-open it with an index name that was never created.
    reopened = Neo4jVector.from_existing_index(
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="test",
        **credentials,
    )
    hits = reopened.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected

    drop_vector_indexes(reopened)
|
||||
|
||||
|
||||
def test_neo4jvector_catch_wrong_node_label() -> None:
    """A wrong node label still resolves when the index name is correct."""
    vectors = FakeEmbeddingsWithOsDimension().embed_documents(texts)
    pairs = list(zip(texts, vectors))
    credentials = {"url": url, "username": username, "password": password}
    # Populate the database under the default index name and label.
    Neo4jVector.from_embeddings(
        text_embeddings=pairs,
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )
    # Re-open it with the right index name but a label that does not exist.
    reopened = Neo4jVector.from_existing_index(
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="vector",
        node_label="test",
        **credentials,
    )
    hits = reopened.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected

    drop_vector_indexes(reopened)
|
||||
|
||||
|
||||
def test_neo4jvector_with_metadatas() -> None:
    """Metadata stored with each text is returned on the matching document."""
    page_metadata = [{"page": str(number)} for number, _ in enumerate(texts)]
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        metadatas=page_metadata,
        pre_delete_collection=True,
        **credentials,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_with_metadatas_with_scores() -> None:
    """Scored search returns the document, its metadata and a score of 1.0."""
    page_metadata = [{"page": str(number)} for number, _ in enumerate(texts)]
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        metadatas=page_metadata,
        pre_delete_collection=True,
        **credentials,
    )
    scored_hits = store.similarity_search_with_score("foo", k=1)
    expected = [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
    assert scored_hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_relevance_score() -> None:
    """Relevance-scored search returns all three texts with expected scores."""
    page_metadata = [{"page": str(number)} for number, _ in enumerate(texts)]
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        metadatas=page_metadata,
        pre_delete_collection=True,
        **credentials,
    )

    expected = [
        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
        (Document(page_content="bar", metadata={"page": "1"}), 0.9998376369476318),
        (Document(page_content="baz", metadata={"page": "2"}), 0.9993523359298706),
    ]
    scored_hits = store.similarity_search_with_relevance_scores("foo", k=3)
    assert scored_hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_retriever_search_threshold() -> None:
    """A retriever with a 0.9999 score threshold returns only "foo"."""
    page_metadata = [{"page": str(number)} for number, _ in enumerate(texts)]
    credentials = {"url": url, "username": username, "password": password}
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        metadatas=page_metadata,
        pre_delete_collection=True,
        **credentials,
    )

    search_options = {"k": 3, "score_threshold": 0.9999}
    retriever = store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_options,
    )
    hits = retriever.get_relevant_documents("foo")
    expected = [
        Document(page_content="foo", metadata={"page": "0"}),
    ]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_custom_return_neo4jvector() -> None:
    """A custom retrieval_query controls the returned text and metadata."""
    credentials = {"url": url, "username": username, "password": password}
    custom_projection = "RETURN 'foo' AS text, score, {test: 'test'} AS metadata"
    store = Neo4jVector.from_texts(
        texts=["test"],
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        retrieval_query=custom_projection,
        **credentials,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"test": "test"})]
    assert hits == expected

    drop_vector_indexes(store)
|
||||
|
||||
|
||||
def test_neo4jvector_prefer_indexname() -> None:
    """With two vector indexes present, the one named by index_name is used."""
    credentials = {"url": url, "username": username, "password": password}
    # First index: default name, label and properties.
    Neo4jVector.from_texts(
        texts=["foo"],
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )

    # Second index: custom name, label and properties.
    Neo4jVector.from_texts(
        texts=["bar"],
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="foo",
        node_label="Test",
        embedding_node_property="vector",
        text_node_property="info",
        pre_delete_collection=True,
        **credentials,
    )

    reopened = Neo4jVector.from_existing_index(
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="foo",
        text_node_property="info",
        **credentials,
    )

    hits = reopened.similarity_search("bar", k=1)
    expected = [Document(page_content="bar", metadata={})]
    assert hits == expected
    drop_vector_indexes(reopened)
|
||||
|
||||
|
||||
def test_neo4jvector_prefer_indexname_insert() -> None:
    """Documents added through a re-opened named index land in that index."""
    credentials = {"url": url, "username": username, "password": password}
    # First index: default name, label and properties.
    Neo4jVector.from_texts(
        texts=["baz"],
        embedding=FakeEmbeddingsWithOsDimension(),
        pre_delete_collection=True,
        **credentials,
    )

    # Second index: custom name, label and properties.
    Neo4jVector.from_texts(
        texts=["foo"],
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="foo",
        node_label="Test",
        embedding_node_property="vector",
        text_node_property="info",
        pre_delete_collection=True,
        **credentials,
    )

    reopened = Neo4jVector.from_existing_index(
        embedding=FakeEmbeddingsWithOsDimension(),
        index_name="foo",
        text_node_property="info",
        **credentials,
    )

    new_document = Document(page_content="bar", metadata={})
    reopened.add_documents([new_document])

    hits = reopened.similarity_search("bar", k=2)
    expected = [
        Document(page_content="bar", metadata={}),
        Document(page_content="foo", metadata={}),
    ]
    assert hits == expected
    drop_vector_indexes(reopened)
|
Loading…
Reference in New Issue