community[patch]: Neo4j enhanced schema (#20983)

Scan the database for example values and provide them to an LLM for
better inference of Text2cypher
pull/20979/head^2
Tomaz Bratanic 2 months ago committed by GitHub
parent dc70c23a11
commit 67428c4052
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -21,7 +21,7 @@
"id": "dbc0ee68",
"metadata": {},
"source": [
"## Settin up\n",
"## Setting up\n",
"\n",
"You will need to have a running `Neo4j` instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container.\n",
"You can run a local docker container by running the executing the following script:\n",
@ -31,7 +31,7 @@
" --name neo4j \\\n",
" -p 7474:7474 -p 7687:7687 \\\n",
" -d \\\n",
" -e NEO4J_AUTH=neo4j/pleaseletmein \\\n",
" -e NEO4J_AUTH=neo4j/password \\\n",
" -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
" neo4j:latest\n",
"```\n",
@ -58,9 +58,7 @@
"metadata": {},
"outputs": [],
"source": [
"graph = Neo4jGraph(\n",
" url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"pleaseletmein\"\n",
")"
"graph = Neo4jGraph(url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"password\")"
]
},
{
@ -93,7 +91,7 @@
"source": [
"graph.query(\n",
" \"\"\"\n",
"MERGE (m:Movie {name:\"Top Gun\"})\n",
"MERGE (m:Movie {name:\"Top Gun\", runtime: 120})\n",
"WITH m\n",
"UNWIND [\"Tom Cruise\", \"Val Kilmer\", \"Anthony Edwards\", \"Meg Ryan\"] AS actor\n",
"MERGE (a:Actor {name:actor})\n",
@ -131,11 +129,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Node properties are the following:\n",
"Movie {name: STRING},Actor {name: STRING}\n",
"Relationship properties are the following:\n",
"Node properties:\n",
"Movie {runtime: INTEGER, name: STRING}\n",
"Actor {name: STRING}\n",
"Relationship properties:\n",
"\n",
"The relationships are the following:\n",
"The relationships:\n",
"(:Actor)-[:ACTED_IN]->(:Movie)\n"
]
}
@ -144,6 +143,48 @@
"print(graph.schema)"
]
},
{
"cell_type": "markdown",
"id": "3d88f516-2e60-4da4-b25f-dad5801fe133",
"metadata": {},
"source": [
"## Enhanced schema information\n",
"Choosing the enhanced schema version enables the system to automatically scan for example values within the databases and calculate some distribution metrics. For example, if a node property has less than 10 distinct values, we return all possible values in the schema. Otherwise, return only a single example value per node and relationship property."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c8233976-1ca7-4f8f-af20-e8fb3e081fdd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node properties:\n",
"- **Movie**\n",
" - `runtime: INTEGER` Min: 120, Max: 120\n",
" - `name: STRING` Available options: ['Top Gun']\n",
"- **Actor**\n",
" - `name: STRING` Available options: ['Tom Cruise', 'Val Kilmer', 'Anthony Edwards', 'Meg Ryan']\n",
"Relationship properties:\n",
"\n",
"The relationships:\n",
"(:Actor)-[:ACTED_IN]->(:Movie)\n"
]
}
],
"source": [
"enhanced_graph = Neo4jGraph(\n",
" url=\"bolt://localhost:7687\",\n",
" username=\"neo4j\",\n",
" password=\"password\",\n",
" enhanced_schema=True,\n",
")\n",
"print(enhanced_graph.schema)"
]
},
{
"cell_type": "markdown",
"id": "68a3c677",
@ -156,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "7476ce98",
"metadata": {},
"outputs": [],
@ -168,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "ef8ee27b",
"metadata": {},
"outputs": [
@ -180,10 +221,11 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@ -191,16 +233,17 @@
{
"data": {
"text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
"{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
"chain.invoke({\"query\": \"Who played in Top Gun?\"})"
]
},
{
@ -215,7 +258,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "df230946",
"metadata": {},
"outputs": [],
@ -227,7 +270,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "3f1600ee",
"metadata": {},
"outputs": [
@ -239,10 +282,11 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}]\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@ -250,16 +294,17 @@
{
"data": {
"text/plain": [
"'Tom Cruise and Val Kilmer played in Top Gun.'"
"{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan played in Top Gun.'}"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
"chain.invoke({\"query\": \"Who played in Top Gun?\"})"
]
},
{
@ -273,7 +318,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "e412f36b",
"metadata": {},
"outputs": [],
@ -285,7 +330,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "4f4699dc",
"metadata": {},
"outputs": [
@ -297,19 +342,20 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n",
"Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\\nRETURN a.name\"}, {'context': [{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]}]\n",
"Final answer: Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.\n"
"Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\\nWHERE m.name = 'Top Gun'\\nRETURN a.name\"}, {'context': [{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]}]\n",
"Final answer: Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.\n"
]
}
],
"source": [
"result = chain(\"Who played in Top Gun?\")\n",
"result = chain.invoke({\"query\": \"Who played in Top Gun?\"})\n",
"print(f\"Intermediate steps: {result['intermediate_steps']}\")\n",
"print(f\"Final answer: {result['result']}\")"
]
@ -325,7 +371,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "2d3acf10",
"metadata": {},
"outputs": [],
@ -337,7 +383,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "b0a9d143",
"metadata": {},
"outputs": [
@ -349,7 +395,8 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
@ -358,19 +405,20 @@
{
"data": {
"text/plain": [
"[{'a.name': 'Tom Cruise'},\n",
" {'a.name': 'Val Kilmer'},\n",
" {'a.name': 'Anthony Edwards'},\n",
" {'a.name': 'Meg Ryan'}]"
"{'query': 'Who played in Top Gun?',\n",
" 'result': [{'a.name': 'Anthony Edwards'},\n",
" {'a.name': 'Meg Ryan'},\n",
" {'a.name': 'Val Kilmer'},\n",
" {'a.name': 'Tom Cruise'}]}"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
"chain.invoke({\"query\": \"Who played in Top Gun?\"})"
]
},
{
@ -384,7 +432,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "59baeb88-adfa-4c26-8334-fcbff3a98efb",
"metadata": {},
"outputs": [],
@ -422,7 +470,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"id": "47c64027-cf42-493a-9c76-2d10ba753728",
"metadata": {},
"outputs": [
@ -434,7 +482,7 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (m:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-(:Actor)\n",
"\u001b[32;1m\u001b[1;3mMATCH (:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-()\n",
"RETURN count(*) AS numberOfActors\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'numberOfActors': 4}]\u001b[0m\n",
@ -445,16 +493,17 @@
{
"data": {
"text/plain": [
"'Four people played in Top Gun.'"
"{'query': 'How many people played in Top Gun?',\n",
" 'result': 'There were 4 actors who played in Top Gun.'}"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"How many people played in Top Gun?\")"
"chain.invoke({\"query\": \"How many people played in Top Gun?\"})"
]
},
{
@ -468,7 +517,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "6f9becc2-f579-45bf-9b50-2ce02bde92da",
"metadata": {},
"outputs": [],
@ -483,7 +532,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "ff18e3e3-3402-4683-aec4-a19898f23ca1",
"metadata": {},
"outputs": [
@ -495,10 +544,11 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@ -506,16 +556,17 @@
{
"data": {
"text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
"{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, and Tom Cruise played in Top Gun.'}"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
"chain.invoke({\"query\": \"Who played in Top Gun?\"})"
]
},
{
@ -530,7 +581,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"id": "a20fa21e-fb85-41c4-aac0-53fb25e34604",
"metadata": {},
"outputs": [],
@ -546,7 +597,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"id": "3ad7f6b8-543e-46e4-a3b2-40fa3e66e895",
"metadata": {},
"outputs": [
@ -579,7 +630,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"id": "53665d03-7afd-433c-bdd5-750127bfb152",
"metadata": {},
"outputs": [],
@ -594,7 +645,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"id": "19e1a591-9c10-4d7b-aa36-a5e1b778a97b",
"metadata": {},
"outputs": [
@ -606,10 +657,11 @@
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@ -617,16 +669,17 @@
{
"data": {
"text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
"{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}"
]
},
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
"chain.invoke({\"query\": \"Who played in Top Gun?\"})"
]
},
{
@ -654,7 +707,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.9.18"
}
},
"nbformat": 4,

@ -9,6 +9,11 @@ from langchain_community.graphs.graph_store import GraphStore
BASE_ENTITY_LABEL = "__Entity__"
EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"]
EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"]
EXHAUSTIVE_SEARCH_LIMIT = 10000
LIST_LIMIT = 128
# Threshold for returning all available prop values in graph schema
DISTINCT_VALUE_LIMIT = 10
NL = "\n"
node_properties_query = """
CALL apoc.meta.data()
@ -56,7 +61,6 @@ def value_sanitize(d: Any) -> Any:
results, can occupy significant context space and detract from
the LLM's performance by introducing unnecessary noise and cost.
"""
LIST_LIMIT = 128
if isinstance(d, dict):
new_dict = {}
for key, value in d.items():
@ -135,6 +139,223 @@ def _get_rel_import_query(baseEntityLabel: bool) -> str:
)
def _enhanced_schema_cypher(
label_or_type: str,
properties: List[Dict[str, Any]],
exhaustive: bool,
is_relationship: bool = False,
) -> str:
if is_relationship:
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
else:
match_clause = f"MATCH (n:{label_or_type})"
with_clauses = []
return_clauses = []
output_dict = {}
if exhaustive:
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(
(
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
f" distinct_count: size(`{prop_name}_values`)"
)
)
elif prop_type in ["INTEGER", "FLOAT", "DATE"]:
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
with_clauses.append(
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
)
return_clauses.append(
(
f"min: toString(`{prop_name}_min`), "
f"max: toString(`{prop_name}_max`), "
f"distinct_count: `{prop_name}_distinct`"
)
)
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`, "
f"max_size: `{prop_name}_size_max`"
)
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
else:
# Just sample 5 random nodes
match_clause += " WITH n LIMIT 5"
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type in ["INTEGER", "FLOAT", "DATE"]:
with_clauses.append(
f"collect(distinct toString(n.`{prop_name}`)) "
f"AS `{prop_name}_values`"
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`,max_size: `{prop_name}_size_max`"
)
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
with_clause = "WITH " + ",\n ".join(with_clauses)
return_clause = (
"RETURN {"
+ ", ".join(f"{k}: {v}" for k, v in output_dict.items())
+ "} AS output"
)
# Combine all parts of the Cypher query
cypher_query = "\n".join([match_clause, with_clause, return_clause])
return cypher_query
def _format_schema(schema: Dict, is_enhanced: bool) -> str:
formatted_node_props = []
formatted_rel_props = []
if is_enhanced:
# Enhanced formatting for nodes
for node_type, properties in schema["node_props"].items():
formatted_node_props.append(f"- **{node_type}**")
for prop in properties:
example = ""
if prop["type"] == "STRING":
if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT:
example = (
f'Example: "{prop["values"][0].replace(NL, " ")}"'
if prop["values"]
else ""
)
else: # If less than 10 possible values return all
example = (
(
"Available options: "
f'{[el.replace(NL, " ") for el in prop["values"]]}'
)
if prop["values"]
else ""
)
elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]:
if prop.get("min") is not None:
example = f'Min: {prop["min"]}, Max: {prop["max"]}'
else:
example = (
f'Example: "{prop["values"][0]}"' if prop["values"] else ""
)
elif prop["type"] == "LIST":
# Skip embeddings
if prop["min_size"] > LIST_LIMIT:
continue
example = (
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
)
formatted_node_props.append(
f" - `{prop['property']}: {prop['type']}` {example}"
)
# Enhanced formatting for relationships
for rel_type, properties in schema["rel_props"].items():
formatted_rel_props.append(f"- **{rel_type}**")
for prop in properties:
example = ""
if prop["type"] == "STRING":
if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT:
example = (
f'Example: "{prop["values"][0].replace(NL, " ")}"'
if prop["values"]
else ""
)
else: # If less than 10 possible values return all
example = (
(
"Available options: "
f'{[el.replace(NL, " ") for el in prop["values"]]}'
)
if prop["values"]
else ""
)
elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]:
if prop.get("min"): # If we have min/max
example = f'Min: {prop["min"]}, Max: {prop["max"]}'
else: # return a single value
example = (
f'Example: "{prop["values"][0]}"' if prop["values"] else ""
)
elif prop["type"] == "LIST":
# Skip embeddings
if prop["min_size"] > LIST_LIMIT:
continue
example = (
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
)
formatted_rel_props.append(
f" - `{prop['property']}: {prop['type']}` {example}"
)
else:
# Format node properties
for label, props in schema["node_props"].items():
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in props]
)
formatted_node_props.append(f"{label} {{{props_str}}}")
# Format relationship properties using structured_schema
for type, props in schema["rel_props"].items():
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in props]
)
formatted_rel_props.append(f"{type} {{{props_str}}}")
# Format relationships
formatted_rels = [
f"(:{el['start']})-[:{el['type']}]->(:{el['end']})"
for el in schema["relationships"]
]
return "\n".join(
[
"Node properties:",
"\n".join(formatted_node_props),
"Relationship properties:",
"\n".join(formatted_rel_props),
"The relationships:",
"\n".join(formatted_rels),
]
)
class Neo4jGraph(GraphStore):
"""Neo4j database wrapper for various graph operations.
@ -151,6 +372,8 @@ class Neo4jGraph(GraphStore):
embedding-like properties from database responses. Default is False.
refresh_schema (bool): A flag whether to refresh schema information
at initialization. Default is True.
enhanced_schema (bool): A flag whether to scan the database for
example values and use them in the graph schema. Default is False.
driver_config (Dict): Configuration passed to Neo4j Driver.
*Security note*: Make sure that the database connection uses credentials
@ -176,6 +399,7 @@ class Neo4jGraph(GraphStore):
refresh_schema: bool = True,
*,
driver_config: Optional[Dict] = None,
enhanced_schema: bool = False,
) -> None:
"""Create a new Neo4j graph wrapper instance."""
try:
@ -203,6 +427,7 @@ class Neo4jGraph(GraphStore):
self._database = database
self.timeout = timeout
self.sanitize = sanitize
self._enhanced_schema = enhanced_schema
self.schema: str = ""
self.structured_schema: Dict[str, Any] = {}
# Verify connection
@ -300,37 +525,48 @@ class Neo4jGraph(GraphStore):
"metadata": {"constraint": constraint, "index": index},
}
# Format node properties
formatted_node_props = []
for el in node_properties:
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
)
formatted_node_props.append(f"{el['labels']} {{{props_str}}}")
# Format relationship properties
formatted_rel_props = []
for el in rel_properties:
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
if self._enhanced_schema:
schema_counts = self.query(
"CALL apoc.meta.graphSample() YIELD nodes, relationships "
"RETURN nodes, [rel in relationships | {name:apoc.any.property"
"(rel, 'type'), count: apoc.any.property(rel, 'count')}]"
" AS relationships"
)
formatted_rel_props.append(f"{el['type']} {{{props_str}}}")
# Update node info
for node in schema_counts[0]["nodes"]:
# Skip bloom labels
if node["name"] in EXCLUDED_LABELS:
continue
node_props = self.structured_schema["node_props"][node["name"]]
enhanced_cypher = _enhanced_schema_cypher(
node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT
)
enhanced_info = self.query(enhanced_cypher)[0]["output"]
for prop in node_props:
if prop["property"] in enhanced_info:
prop.update(enhanced_info[prop["property"]])
# Update rel info
for rel in schema_counts[0]["relationships"]:
# Skip bloom labels
if rel["name"] in EXCLUDED_RELS:
continue
rel_props = self.structured_schema["rel_props"].get(rel["name"])
if not rel_props:
continue
enhanced_cypher = _enhanced_schema_cypher(
rel["name"],
rel_props,
rel["count"] < EXHAUSTIVE_SEARCH_LIMIT,
is_relationship=True,
)
enhanced_info = self.query(enhanced_cypher)[0]["output"]
for prop in rel_props:
if prop["property"] in enhanced_info:
prop.update(enhanced_info[prop["property"]])
# Format relationships
formatted_rels = [
f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships
]
schema = _format_schema(self.structured_schema, self._enhanced_schema)
self.schema = "\n".join(
[
"Node properties are the following:",
",".join(formatted_node_props),
"Relationship properties are the following:",
",".join(formatted_rel_props),
"The relationships are the following:",
",".join(formatted_rels),
]
)
self.schema = schema
def add_graph_documents(
self,

@ -291,3 +291,45 @@ def test_driver_config() -> None:
driver_config={"max_connection_pool_size": 1},
)
graph.query("RETURN 'foo'")
def test_enhanced_schema() -> None:
"""Test that neo4j works with driver config."""
url = os.environ.get("NEO4J_URI")
username = os.environ.get("NEO4J_USERNAME")
password = os.environ.get("NEO4J_PASSWORD")
assert url is not None
assert username is not None
assert password is not None
graph = Neo4jGraph(
url=url, username=username, password=password, enhanced_schema=True
)
graph.query("MATCH (n) DETACH DELETE n")
graph.add_graph_documents(test_data)
graph.refresh_schema()
expected_output = {
"node_props": {
"foo": [
{
"property": "id",
"type": "STRING",
"values": ["foo"],
"distinct_count": 1,
}
],
"bar": [
{
"property": "id",
"type": "STRING",
"values": ["bar"],
"distinct_count": 1,
}
],
},
"rel_props": {},
"relationships": [{"start": "foo", "type": "REL", "end": "bar"}],
}
# remove metadata portion of schema
del graph.structured_schema["metadata"]
assert graph.structured_schema == expected_output

Loading…
Cancel
Save