From ea2616ae23be97135696f7ace838ff38661426fc Mon Sep 17 00:00:00 2001
From: Tomaz Bratanic <bratanic.tomaz@gmail.com>
Date: Wed, 13 Dec 2023 18:09:50 +0100
Subject: [PATCH] Fix RRF and lucene escape characters for neo4j vector store
 (#14646)

* Remove Lucene special characters (fixes
https://github.com/langchain-ai/langchain/issues/14232)
* Fixes RRF normalization for hybrid search
---
 .../vectorstores/neo4j_vector.py              | 41 ++++++++++--
 .../vectorstores/test_neo4jvector.py          | 67 ++++++++++++++++++-
 .../unit_tests/vectorstores/test_neo4j.py     | 45 +++++++++++++
 3 files changed, 147 insertions(+), 6 deletions(-)
 create mode 100644 libs/community/tests/unit_tests/vectorstores/test_neo4j.py

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index 7ccc2e7d10..bb8f1b9b30 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -48,14 +48,19 @@ def _get_search_index_query(search_type: SearchType) -> str:
             "CALL { "
             "CALL db.index.vector.queryNodes($index, $k, $embedding) "
             "YIELD node, score "
-            "RETURN node, score UNION "
+            "WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
+            "UNWIND nodes AS n "
+            # We use 0 as min
+            "RETURN n.node AS node, (n.score / max) AS score UNION "
             "CALL db.index.fulltext.queryNodes($keyword_index, $query, {limit: $k}) "
             "YIELD node, score "
             "WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
             "UNWIND nodes AS n "
-            "RETURN n.node AS node, (n.score / max) AS score "  # We use 0 as min
+            # We use 0 as min
+            "RETURN n.node AS node, (n.score / max) AS score "
             "} "
-            "WITH node, max(score) AS score ORDER BY score DESC LIMIT $k "  # dedup
+            # dedup
+            "WITH node, max(score) AS score ORDER BY score DESC LIMIT $k "
         ),
     }
     return type_to_query_map[search_type]
@@ -75,6 +80,34 @@ def sort_by_index_name(
     return sorted(lst, key=lambda x: x.get("index_name") != index_name)
 
 
+def remove_lucene_chars(text: str) -> str:
+    """Remove Lucene special characters"""
+    special_chars = [
+        "+",
+        "-",
+        "&",
+        "|",
+        "!",
+        "(",
+        ")",
+        "{",
+        "}",
+        "[",
+        "]",
+        "^",
+        '"',
+        "~",
+        "*",
+        "?",
+        ":",
+        "\\",
+    ]
+    for char in special_chars:
+        if char in text:
+            text = text.replace(char, " ")
+    return text.strip()
+
+
 class Neo4jVector(VectorStore):
     """`Neo4j` vector index.
 
@@ -589,7 +622,7 @@ class Neo4jVector(VectorStore):
             "k": k,
             "embedding": embedding,
             "keyword_index": self.keyword_index_name,
-            "query": kwargs["query"],
+            "query": remove_lucene_chars(kwargs["query"]),
         }
 
         results = self.query(read_query, params=parameters)
diff --git a/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py
index f7ba418b47..cb0c79a3a0 100644
--- a/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_neo4jvector.py
@@ -4,7 +4,11 @@ from typing import List
 
 from langchain_core.documents import Document
 
-from langchain_community.vectorstores.neo4j_vector import Neo4jVector, SearchType
+from langchain_community.vectorstores.neo4j_vector import (
+    Neo4jVector,
+    SearchType,
+    _get_search_index_query,
+)
 from langchain_community.vectorstores.utils import DistanceStrategy
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
@@ -14,7 +18,7 @@ password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")
 
 OS_TOKEN_COUNT = 1536
 
-texts = ["foo", "bar", "baz"]
+texts = ["foo", "bar", "baz", "It is the end of the world. Take shelter!"]
 
 """
 cd tests/integration_tests/vectorstores/docker-compose
@@ -615,3 +619,62 @@ def test_neo4jvector_from_existing_graph_multiple_properties_hybrid() -> None:
     assert output == [Document(page_content="\nname: Foo\nname2: Fooz")]
 
     drop_vector_indexes(existing)
+
+
+def test_neo4jvector_special_character() -> None:
+    """Test removing lucene."""
+    text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts)
+    text_embedding_pairs = list(zip(texts, text_embeddings))
+    docsearch = Neo4jVector.from_embeddings(
+        text_embeddings=text_embedding_pairs,
+        embedding=FakeEmbeddingsWithOsDimension(),
+        url=url,
+        username=username,
+        password=password,
+        pre_delete_collection=True,
+        search_type=SearchType.HYBRID,
+    )
+    output = docsearch.similarity_search(
+        "It is the end of the world. Take shelter!", k=1
+    )
+    assert output == [
+        Document(page_content="It is the end of the world. Take shelter!", metadata={})
+    ]
+
+    drop_vector_indexes(docsearch)
+
+
+def test_hybrid_score_normalization() -> None:
+    """Test if we can get two 1.0 documents with RRF"""
+    text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts)
+    text_embedding_pairs = list(zip(["foo"], text_embeddings))
+    docsearch = Neo4jVector.from_embeddings(
+        text_embeddings=text_embedding_pairs,
+        embedding=FakeEmbeddingsWithOsDimension(),
+        url=url,
+        username=username,
+        password=password,
+        pre_delete_collection=True,
+        search_type=SearchType.HYBRID,
+    )
+    # Remove deduplication part of the query
+    rrf_query = (
+        _get_search_index_query(SearchType.HYBRID)
+        .rstrip("WITH node, max(score) AS score ORDER BY score DESC LIMIT $k")
+        .replace("UNION", "UNION ALL")
+        + "RETURN node.text AS text, score LIMIT 2"
+    )
+
+    output = docsearch.query(
+        rrf_query,
+        params={
+            "index": "vector",
+            "k": 1,
+            "embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"),
+            "query": "foo",
+            "keyword_index": "keyword",
+        },
+    )
+    # Both FT and Vector must return 1.0 score
+    assert output == [{"text": "foo", "score": 1.0}, {"text": "foo", "score": 1.0}]
+    drop_vector_indexes(docsearch)
diff --git a/libs/community/tests/unit_tests/vectorstores/test_neo4j.py b/libs/community/tests/unit_tests/vectorstores/test_neo4j.py
new file mode 100644
index 0000000000..280334283e
--- /dev/null
+++ b/libs/community/tests/unit_tests/vectorstores/test_neo4j.py
@@ -0,0 +1,45 @@
+"""Test Neo4j functionality."""
+
+from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
+
+
+def test_escaping_lucene() -> None:
+    """Test escaping lucene characters"""
+    assert remove_lucene_chars("Hello+World") == "Hello World"
+    assert remove_lucene_chars("Hello World\\") == "Hello World"
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter!")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter&&")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("Bill&&Melinda Gates Foundation")
+        == "Bill  Melinda Gates Foundation"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter(&&)")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter??")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter^")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter+")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter-")
+        == "It is the end of the world. Take shelter"
+    )
+    assert (
+        remove_lucene_chars("It is the end of the world. Take shelter~")
+        == "It is the end of the world. Take shelter"
+    )