From 8c08cf46194ba77d9ef47d6b2967e1581e230432 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Fri, 19 Apr 2024 20:22:42 +0200 Subject: [PATCH] community: Add support for relationship indexes in neo4j vector (#20657) Neo4j has added relationship vector indexes. We can't populate them, but we can use existing indexes for retrieval --- .../vectorstores/neo4j_vector.py | 186 ++++++++++++++---- .../vectorstores/test_neo4jvector.py | 92 ++++++++- 2 files changed, 237 insertions(+), 41 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index 30897a5cc9..3a9c57de42 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -68,31 +68,50 @@ class SearchType(str, enum.Enum): DEFAULT_SEARCH_TYPE = SearchType.VECTOR -def _get_search_index_query(search_type: SearchType) -> str: - type_to_query_map = { - SearchType.VECTOR: ( - "CALL db.index.vector.queryNodes($index, $k, $embedding) YIELD node, score " - ), - SearchType.HYBRID: ( - "CALL { " - "CALL db.index.vector.queryNodes($index, $k, $embedding) " - "YIELD node, score " - "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " - "UNWIND nodes AS n " - # We use 0 as min - "RETURN n.node AS node, (n.score / max) AS score UNION " - "CALL db.index.fulltext.queryNodes($keyword_index, $query, {limit: $k}) " - "YIELD node, score " - "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " - "UNWIND nodes AS n " - # We use 0 as min - "RETURN n.node AS node, (n.score / max) AS score " - "} " - # dedup - "WITH node, max(score) AS score ORDER BY score DESC LIMIT $k " - ), - } - return type_to_query_map[search_type] +class IndexType(str, enum.Enum): + """Enumerator of the index types.""" + + NODE = "NODE" + RELATIONSHIP = "RELATIONSHIP" + + +DEFAULT_INDEX_TYPE = IndexType.NODE + + +def _get_search_index_query( + search_type: SearchType, index_type: IndexType = DEFAULT_INDEX_TYPE +) -> str: + if index_type == IndexType.NODE: + type_to_query_map = { + SearchType.VECTOR: ( + "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "YIELD node, score " + ), + SearchType.HYBRID: ( + "CALL { " + "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "YIELD node, score " + "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " + "UNWIND nodes AS n " + # We use 0 as min + "RETURN n.node AS node, (n.score / max) AS score UNION " + "CALL db.index.fulltext.queryNodes($keyword_index, $query, " + "{limit: $k}) YIELD node, score " + "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " + "UNWIND nodes AS n " + # We use 0 as min + "RETURN n.node AS node, (n.score / max) AS score " + "} " + # dedup + "WITH node, max(score) AS score ORDER BY score DESC LIMIT $k " + ), + } + return type_to_query_map[search_type] + else: + return ( + "CALL db.index.vector.queryRelationships($index, $k, $embedding) " + "YIELD relationship, score " + ) def check_if_not_null(props: List[str], values: List[Any]) -> None: @@ -463,6 +482,7 @@ class Neo4jVector(VectorStore): pre_delete_collection: bool = False, retrieval_query: str = "", relevance_score_fn: Optional[Callable[[float], float]] = None, + index_type: IndexType = DEFAULT_INDEX_TYPE, ) -> None: try: import neo4j @@ -541,6 +561,7 @@ class Neo4jVector(VectorStore): self.override_relevance_score_fn = relevance_score_fn self.retrieval_query = retrieval_query self.search_type = search_type + self._index_type = index_type # Calculate embedding dimension self.embedding_dimension = len(embedding.embed_query("foo")) @@ -615,7 +636,7 @@ class Neo4jVector(VectorStore): # Flag for enterprise self._is_enterprise = True if db_data[0]["edition"] == "enterprise" else False - def retrieve_existing_index(self) -> Optional[int]: + def retrieve_existing_index(self) -> Tuple[Optional[int], Optional[str]]: """ Check if the vector index exists in the Neo4j database and returns its embedding dimension. @@ -630,11 +651,11 @@ class Neo4jVector(VectorStore): """ index_information = self.query( - "SHOW INDEXES YIELD name, type, labelsOrTypes, properties, options " - "WHERE type = 'VECTOR' AND (name = $index_name " + "SHOW INDEXES YIELD name, type, entityType, labelsOrTypes, " + "properties, options WHERE type = 'VECTOR' AND (name = $index_name " "OR (labelsOrTypes[0] = $node_label AND " "properties[0] = $embedding_node_property)) " - "RETURN name, labelsOrTypes, properties, options ", + "RETURN name, entityType, labelsOrTypes, properties, options ", params={ "index_name": self.index_name, "node_label": self.node_label, @@ -647,13 +668,14 @@ class Neo4jVector(VectorStore): self.index_name = index_information[0]["name"] self.node_label = index_information[0]["labelsOrTypes"][0] self.embedding_node_property = index_information[0]["properties"][0] + self._index_type = index_information[0]["entityType"] embedding_dimension = index_information[0]["options"]["indexConfig"][ "vector.dimensions" ] - return embedding_dimension + return embedding_dimension, index_information[0]["entityType"] except IndexError: - return None + return None, None def retrieve_existing_fts_index( self, text_node_properties: List[str] = [] @@ -754,7 +776,13 @@ class Neo4jVector(VectorStore): **kwargs, ) # Check if the vector index already exists - embedding_dimension = store.retrieve_existing_index() + embedding_dimension, index_type = store.retrieve_existing_index() + + # Raise error if relationship index type + if index_type == "RELATIONSHIP": + raise ValueError( + "Data ingestion is not supported with relationship vector index." + ) # If the vector index doesn't exist yet if not embedding_dimension: @@ -976,14 +1004,21 @@ class Neo4jVector(VectorStore): index_query = base_index_query + filter_snippets + base_cosine_query else: - index_query = _get_search_index_query(self.search_type) + index_query = _get_search_index_query(self.search_type, self._index_type) filter_params = {} - default_retrieval = ( - f"RETURN node.`{self.text_node_property}` AS text, score, " - f"node {{.*, `{self.text_node_property}`: Null, " - f"`{self.embedding_node_property}`: Null, id: Null }} AS metadata" - ) + if self._index_type == IndexType.RELATIONSHIP: + default_retrieval = ( + f"RETURN relationship.`{self.text_node_property}` AS text, score, " + f"relationship {{.*, `{self.text_node_property}`: Null, " + f"`{self.embedding_node_property}`: Null, id: Null }} AS metadata" + ) + else: + default_retrieval = ( + f"RETURN node.`{self.text_node_property}` AS text, score, " + f"node {{.*, `{self.text_node_property}`: Null, " + f"`{self.embedding_node_property}`: Null, id: Null }} AS metadata" + ) retrieval_query = ( self.retrieval_query if self.retrieval_query else default_retrieval @@ -1141,7 +1176,15 @@ class Neo4jVector(VectorStore): **kwargs, ) - embedding_dimension = store.retrieve_existing_index() + embedding_dimension, index_type = store.retrieve_existing_index() + + # Raise error if relationship index type + if index_type == "RELATIONSHIP": + raise ValueError( + "Relationship vector index is not supported with " + "`from_existing_index` method. Please use the " + "`from_existing_relationship_index` method." + ) if not embedding_dimension: raise ValueError( @@ -1174,6 +1217,61 @@ class Neo4jVector(VectorStore): return store + @classmethod + def from_existing_relationship_index( + cls: Type[Neo4jVector], + embedding: Embeddings, + index_name: str, + search_type: SearchType = DEFAULT_SEARCH_TYPE, + **kwargs: Any, + ) -> Neo4jVector: + """ + Get instance of an existing Neo4j relationship vector index. + This method will return the instance of the store without + inserting any new embeddings. + Neo4j credentials are required in the form of `url`, `username`, + and `password` and optional `database` parameters along with + the `index_name` definition. + """ + + if search_type == SearchType.HYBRID: + raise ValueError( + "Hybrid search is not supported in combination " + "with relationship vector index" + ) + + store = cls( + embedding=embedding, + index_name=index_name, + **kwargs, + ) + + embedding_dimension, index_type = store.retrieve_existing_index() + + if not embedding_dimension: + raise ValueError( + "The specified vector index name does not exist. " + "Make sure to check if you spelled it correctly" + ) + # Raise error if relationship index type + if index_type == "NODE": + raise ValueError( + "Node vector index is not supported with " + "`from_existing_relationship_index` method. Please use the " + "`from_existing_index` method." + ) + + # Check if embedding function and vector index dimensions match + if not store.embedding_dimension == embedding_dimension: + raise ValueError( + "The provided embedding function and vector index " + "dimensions do not match.\n" + f"Embedding function dimension: {store.embedding_dimension}\n" + f"Vector index dimension: {embedding_dimension}" + ) + + return store + @classmethod def from_documents( cls: Type[Neo4jVector], @@ -1266,7 +1364,15 @@ class Neo4jVector(VectorStore): ) # Check if the vector index already exists - embedding_dimension = store.retrieve_existing_index() + embedding_dimension, index_type = store.retrieve_existing_index() + + # Raise error if relationship index type + if index_type == "RELATIONSHIP": + raise ValueError( + "`from_existing_graph` method does not support " + " existing relationship vector index. " + "Please use `from_existing_relationship_index` method" + ) # If the vector index doesn't exist yet if not embedding_dimension: diff --git a/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py index de68c59631..a1261de81c 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py +++ b/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py @@ -43,7 +43,9 @@ def drop_vector_indexes(store: Neo4jVector) -> None: """ ) for index in all_indexes: - store.query(f"DROP INDEX {index['name']}") + store.query(f"DROP INDEX `{index['name']}`") + + store.query("MATCH (n) DETACH DELETE n;") class FakeEmbeddingsWithOsDimension(FakeEmbeddings): @@ -812,3 +814,91 @@ def test_metadata_filters_type1() -> None: assert output == expected_output drop_vector_indexes(docsearch) + + +def test_neo4jvector_relationship_index() -> None: + """Test end to end construction and search.""" + embeddings = FakeEmbeddingsWithOsDimension() + docsearch = Neo4jVector.from_texts( + texts=texts, + embedding=embeddings, + url=url, + username=username, + password=password, + pre_delete_collection=True, + ) + # Ingest data + docsearch.query( + ( + "CREATE ()-[:REL {text: 'foo', embedding: $e1}]->()" + ", ()-[:REL {text: 'far', embedding: $e2}]->()" + ), + params={ + "e1": embeddings.embed_query("foo"), + "e2": embeddings.embed_query("bar"), + }, + ) + # Create relationship index + docsearch.query( + """CREATE VECTOR INDEX `relationship` +FOR ()-[r:REL]-() ON (r.embedding) +OPTIONS {indexConfig: { + `vector.dimensions`: 1536, + `vector.similarity_function`: 'cosine' +}} +""" + ) + relationship_index = Neo4jVector.from_existing_relationship_index( + embeddings, index_name="relationship" + ) + + output = relationship_index.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + drop_vector_indexes(docsearch) + + +def test_neo4jvector_relationship_index_retrieval() -> None: + """Test end to end construction and search.""" + embeddings = FakeEmbeddingsWithOsDimension() + docsearch = Neo4jVector.from_texts( + texts=texts, + embedding=embeddings, + url=url, + username=username, + password=password, + pre_delete_collection=True, + ) + # Ingest data + docsearch.query( + ( + "CREATE ({node:'text'})-[:REL {text: 'foo', embedding: $e1}]->()" + ", ({node:'text'})-[:REL {text: 'far', embedding: $e2}]->()" + ), + params={ + "e1": embeddings.embed_query("foo"), + "e2": embeddings.embed_query("bar"), + }, + ) + # Create relationship index + docsearch.query( + """CREATE VECTOR INDEX `relationship` +FOR ()-[r:REL]-() ON (r.embedding) +OPTIONS {indexConfig: { + `vector.dimensions`: 1536, + `vector.similarity_function`: 'cosine' +}} +""" + ) + retrieval_query = ( + "RETURN relationship.text + '-' + startNode(relationship).node " + "AS text, score, {foo:'bar'} AS metadata" + ) + relationship_index = Neo4jVector.from_existing_relationship_index( + embeddings, index_name="relationship", retrieval_query=retrieval_query + ) + + output = relationship_index.similarity_search("foo", k=1) + assert output == [Document(page_content="foo-text", metadata={"foo": "bar"})] + + drop_vector_indexes(docsearch)