diff --git a/docs/docs/integrations/graphs/neo4j_cypher.ipynb b/docs/docs/integrations/graphs/neo4j_cypher.ipynb index 7b1b854ea0..315854c8a0 100644 --- a/docs/docs/integrations/graphs/neo4j_cypher.ipynb +++ b/docs/docs/integrations/graphs/neo4j_cypher.ipynb @@ -21,7 +21,7 @@ "id": "dbc0ee68", "metadata": {}, "source": [ - "## Settin up\n", + "## Setting up\n", "\n", "You will need to have a running `Neo4j` instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container.\n", "You can run a local docker container by running the executing the following script:\n", @@ -31,7 +31,7 @@ " --name neo4j \\\n", " -p 7474:7474 -p 7687:7687 \\\n", " -d \\\n", - " -e NEO4J_AUTH=neo4j/pleaseletmein \\\n", + " -e NEO4J_AUTH=neo4j/password \\\n", " -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n", " neo4j:latest\n", "```\n", @@ -58,9 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "graph = Neo4jGraph(\n", - " url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"pleaseletmein\"\n", - ")" + "graph = Neo4jGraph(url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"password\")" ] }, { @@ -93,7 +91,7 @@ "source": [ "graph.query(\n", " \"\"\"\n", - "MERGE (m:Movie {name:\"Top Gun\"})\n", + "MERGE (m:Movie {name:\"Top Gun\", runtime: 120})\n", "WITH m\n", "UNWIND [\"Tom Cruise\", \"Val Kilmer\", \"Anthony Edwards\", \"Meg Ryan\"] AS actor\n", "MERGE (a:Actor {name:actor})\n", @@ -131,11 +129,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Node properties are the following:\n", - "Movie {name: STRING},Actor {name: STRING}\n", - "Relationship properties are the following:\n", + "Node properties:\n", + "Movie {runtime: INTEGER, name: STRING}\n", + "Actor {name: STRING}\n", + "Relationship properties:\n", "\n", - "The relationships are the 
following:\n", + "The relationships:\n", "(:Actor)-[:ACTED_IN]->(:Movie)\n" ] } @@ -144,6 +143,48 @@ "print(graph.schema)" ] }, + { + "cell_type": "markdown", + "id": "3d88f516-2e60-4da4-b25f-dad5801fe133", + "metadata": {}, + "source": [ + "## Enhanced schema information\n", + "Choosing the enhanced schema version enables the system to automatically scan for example values within the database and calculate some distribution metrics. For example, if a node property has 10 or fewer distinct values, we return all possible values in the schema. Otherwise, we return only a single example value per node and relationship property." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c8233976-1ca7-4f8f-af20-e8fb3e081fdd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Node properties:\n", + "- **Movie**\n", + " - `runtime: INTEGER` Min: 120, Max: 120\n", + " - `name: STRING` Available options: ['Top Gun']\n", + "- **Actor**\n", + " - `name: STRING` Available options: ['Tom Cruise', 'Val Kilmer', 'Anthony Edwards', 'Meg Ryan']\n", + "Relationship properties:\n", + "\n", + "The relationships:\n", + "(:Actor)-[:ACTED_IN]->(:Movie)\n" + ] + } + ], + "source": [ + "enhanced_graph = Neo4jGraph(\n", + " url=\"bolt://localhost:7687\",\n", + " username=\"neo4j\",\n", + " password=\"password\",\n", + " enhanced_schema=True,\n", + ")\n", + "print(enhanced_graph.schema)" + ] + }, { "cell_type": "markdown", "id": "68a3c677", @@ -156,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "7476ce98", "metadata": {}, "outputs": [], @@ -168,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "ef8ee27b", "metadata": {}, "outputs": [ @@ -180,10 +221,11 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + 
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -191,16 +233,17 @@ { "data": { "text/plain": [ - "'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" + "{'query': 'Who played in Top Gun?',\n", + " 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "chain.run(\"Who played in Top Gun?\")" + "chain.invoke({\"query\": \"Who played in Top Gun?\"})" ] }, { @@ -215,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "df230946", "metadata": {}, "outputs": [], @@ -227,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "3f1600ee", "metadata": {}, "outputs": [ @@ -239,10 +282,11 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -250,16 +294,17 @@ { "data": { "text/plain": [ - "'Tom Cruise and Val Kilmer played in Top Gun.'" + "{'query': 'Who played in Top Gun?',\n", + " 
'result': 'Anthony Edwards, Meg Ryan played in Top Gun.'}" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "chain.run(\"Who played in Top Gun?\")" + "chain.invoke({\"query\": \"Who played in Top Gun?\"})" ] }, { @@ -273,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "e412f36b", "metadata": {}, "outputs": [], @@ -285,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "4f4699dc", "metadata": {}, "outputs": [ @@ -297,19 +342,20 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n", - "Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\\nRETURN a.name\"}, {'context': [{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]}]\n", - "Final answer: Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.\n" + "Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\\nWHERE m.name = 'Top Gun'\\nRETURN a.name\"}, {'context': [{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]}]\n", + "Final answer: Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.\n" ] } ], "source": [ - "result = chain(\"Who played in Top 
Gun?\")\n", + "result = chain.invoke({\"query\": \"Who played in Top Gun?\"})\n", "print(f\"Intermediate steps: {result['intermediate_steps']}\")\n", "print(f\"Final answer: {result['result']}\")" ] @@ -325,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "2d3acf10", "metadata": {}, "outputs": [], @@ -337,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "b0a9d143", "metadata": {}, "outputs": [ @@ -349,7 +395,8 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" @@ -358,19 +405,20 @@ { "data": { "text/plain": [ - "[{'a.name': 'Tom Cruise'},\n", - " {'a.name': 'Val Kilmer'},\n", - " {'a.name': 'Anthony Edwards'},\n", - " {'a.name': 'Meg Ryan'}]" + "{'query': 'Who played in Top Gun?',\n", + " 'result': [{'a.name': 'Anthony Edwards'},\n", + " {'a.name': 'Meg Ryan'},\n", + " {'a.name': 'Val Kilmer'},\n", + " {'a.name': 'Tom Cruise'}]}" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "chain.run(\"Who played in Top Gun?\")" + "chain.invoke({\"query\": \"Who played in Top Gun?\"})" ] }, { @@ -384,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "59baeb88-adfa-4c26-8334-fcbff3a98efb", "metadata": {}, "outputs": [], @@ -422,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "47c64027-cf42-493a-9c76-2d10ba753728", "metadata": {}, "outputs": [ @@ -434,7 +482,7 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (m:Movie {name:\"Top 
Gun\"})<-[:ACTED_IN]-(:Actor)\n", + "\u001b[32;1m\u001b[1;3mMATCH (:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-()\n", "RETURN count(*) AS numberOfActors\u001b[0m\n", "Full Context:\n", "\u001b[32;1m\u001b[1;3m[{'numberOfActors': 4}]\u001b[0m\n", @@ -445,16 +493,17 @@ { "data": { "text/plain": [ - "'Four people played in Top Gun.'" + "{'query': 'How many people played in Top Gun?',\n", + " 'result': 'There were 4 actors who played in Top Gun.'}" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "chain.run(\"How many people played in Top Gun?\")" + "chain.invoke({\"query\": \"How many people played in Top Gun?\"})" ] }, { @@ -468,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "6f9becc2-f579-45bf-9b50-2ce02bde92da", "metadata": {}, "outputs": [], @@ -483,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "ff18e3e3-3402-4683-aec4-a19898f23ca1", "metadata": {}, "outputs": [ @@ -495,10 +544,11 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -506,16 +556,17 @@ { "data": { "text/plain": [ - "'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" + "{'query': 'Who played in Top Gun?',\n", + " 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, and Tom Cruise played in Top Gun.'}" ] 
}, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "chain.run(\"Who played in Top Gun?\")" + "chain.invoke({\"query\": \"Who played in Top Gun?\"})" ] }, { @@ -530,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "a20fa21e-fb85-41c4-aac0-53fb25e34604", "metadata": {}, "outputs": [], @@ -546,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "3ad7f6b8-543e-46e4-a3b2-40fa3e66e895", "metadata": {}, "outputs": [ @@ -579,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "53665d03-7afd-433c-bdd5-750127bfb152", "metadata": {}, "outputs": [], @@ -594,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "19e1a591-9c10-4d7b-aa36-a5e1b778a97b", "metadata": {}, "outputs": [ @@ -606,10 +657,11 @@ "\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", + "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.name = 'Top Gun'\n", "RETURN a.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -617,16 +669,17 @@ { "data": { "text/plain": [ - "'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" + "{'query': 'Who played in Top Gun?',\n", + " 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - 
"chain.run(\"Who played in Top Gun?\")" + "chain.invoke({\"query\": \"Who played in Top Gun?\"})" ] }, { @@ -654,7 +707,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/graphs/neo4j_graph.py b/libs/community/langchain_community/graphs/neo4j_graph.py index cffe84144b..c1740f16e4 100644 --- a/libs/community/langchain_community/graphs/neo4j_graph.py +++ b/libs/community/langchain_community/graphs/neo4j_graph.py @@ -9,6 +9,11 @@ from langchain_community.graphs.graph_store import GraphStore BASE_ENTITY_LABEL = "__Entity__" EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"] EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"] +EXHAUSTIVE_SEARCH_LIMIT = 10000 +LIST_LIMIT = 128 +# Threshold for returning all available prop values in graph schema +DISTINCT_VALUE_LIMIT = 10 +NL = "\n" node_properties_query = """ CALL apoc.meta.data() @@ -56,7 +61,6 @@ def value_sanitize(d: Any) -> Any: results, can occupy significant context space and detract from the LLM's performance by introducing unnecessary noise and cost. 
""" - LIST_LIMIT = 128 if isinstance(d, dict): new_dict = {} for key, value in d.items(): @@ -135,6 +139,223 @@ def _get_rel_import_query(baseEntityLabel: bool) -> str: ) +def _enhanced_schema_cypher( + label_or_type: str, + properties: List[Dict[str, Any]], + exhaustive: bool, + is_relationship: bool = False, +) -> str: + if is_relationship: + match_clause = f"MATCH ()-[n:{label_or_type}]->()" + else: + match_clause = f"MATCH (n:{label_or_type})" + + with_clauses = [] + return_clauses = [] + output_dict = {} + if exhaustive: + for prop in properties: + prop_name = prop["property"] + prop_type = prop["type"] + if prop_type == "STRING": + with_clauses.append( + ( + f"collect(distinct substring(n.`{prop_name}`, 0, 50)) " + f"AS `{prop_name}_values`" + ) + ) + return_clauses.append( + ( + f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}]," + f" distinct_count: size(`{prop_name}_values`)" + ) + ) + elif prop_type in ["INTEGER", "FLOAT", "DATE"]: + with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`") + with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`") + with_clauses.append( + f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`" + ) + return_clauses.append( + ( + f"min: toString(`{prop_name}_min`), " + f"max: toString(`{prop_name}_max`), " + f"distinct_count: `{prop_name}_distinct`" + ) + ) + elif prop_type == "LIST": + with_clauses.append( + ( + f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " + f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" + ) + ) + return_clauses.append( + f"min_size: `{prop_name}_size_min`, " + f"max_size: `{prop_name}_size_max`" + ) + + output_dict[prop_name] = "{" + return_clauses.pop() + "}" + else: + # Just sample 5 random nodes + match_clause += " WITH n LIMIT 5" + for prop in properties: + prop_name = prop["property"] + prop_type = prop["type"] + if prop_type == "STRING": + with_clauses.append( + ( + f"collect(distinct substring(n.`{prop_name}`, 0, 50)) " + f"AS `{prop_name}_values`" 
+ ) + ) + return_clauses.append(f"values: `{prop_name}_values`") + elif prop_type in ["INTEGER", "FLOAT", "DATE"]: + with_clauses.append( + f"collect(distinct toString(n.`{prop_name}`)) " + f"AS `{prop_name}_values`" + ) + return_clauses.append(f"values: `{prop_name}_values`") + elif prop_type == "LIST": + with_clauses.append( + ( + f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " + f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" + ) + ) + return_clauses.append( + f"min_size: `{prop_name}_size_min`,max_size: `{prop_name}_size_max`" + ) + + output_dict[prop_name] = "{" + return_clauses.pop() + "}" + + with_clause = "WITH " + ",\n ".join(with_clauses) + return_clause = ( + "RETURN {" + + ", ".join(f"{k}: {v}" for k, v in output_dict.items()) + + "} AS output" + ) + + # Combine all parts of the Cypher query + cypher_query = "\n".join([match_clause, with_clause, return_clause]) + return cypher_query + + +def _format_schema(schema: Dict, is_enhanced: bool) -> str: + formatted_node_props = [] + formatted_rel_props = [] + if is_enhanced: + # Enhanced formatting for nodes + for node_type, properties in schema["node_props"].items(): + formatted_node_props.append(f"- **{node_type}**") + for prop in properties: + example = "" + if prop["type"] == "STRING": + if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT: + example = ( + f'Example: "{prop["values"][0].replace(NL, " ")}"' + if prop["values"] + else "" + ) + else: # If less than 10 possible values return all + example = ( + ( + "Available options: " + f'{[el.replace(NL, " ") for el in prop["values"]]}' + ) + if prop["values"] + else "" + ) + + elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]: + if prop.get("min") is not None: + example = f'Min: {prop["min"]}, Max: {prop["max"]}' + else: + example = ( + f'Example: "{prop["values"][0]}"' if prop["values"] else "" + ) + elif prop["type"] == "LIST": + # Skip embeddings + if prop["min_size"] > LIST_LIMIT: + continue + example = ( + f'Min Size: 
{prop["min_size"]}, Max Size: {prop["max_size"]}' + ) + formatted_node_props.append( + f" - `{prop['property']}: {prop['type']}` {example}" + ) + + # Enhanced formatting for relationships + for rel_type, properties in schema["rel_props"].items(): + formatted_rel_props.append(f"- **{rel_type}**") + for prop in properties: + example = "" + if prop["type"] == "STRING": + if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT: + example = ( + f'Example: "{prop["values"][0].replace(NL, " ")}"' + if prop["values"] + else "" + ) + else: # If less than 10 possible values return all + example = ( + ( + "Available options: " + f'{[el.replace(NL, " ") for el in prop["values"]]}' + ) + if prop["values"] + else "" + ) + elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]: + if prop.get("min"): # If we have min/max + example = f'Min: {prop["min"]}, Max: {prop["max"]}' + else: # return a single value + example = ( + f'Example: "{prop["values"][0]}"' if prop["values"] else "" + ) + elif prop["type"] == "LIST": + # Skip embeddings + if prop["min_size"] > LIST_LIMIT: + continue + example = ( + f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}' + ) + formatted_rel_props.append( + f" - `{prop['property']}: {prop['type']}` {example}" + ) + else: + # Format node properties + for label, props in schema["node_props"].items(): + props_str = ", ".join( + [f"{prop['property']}: {prop['type']}" for prop in props] + ) + formatted_node_props.append(f"{label} {{{props_str}}}") + + # Format relationship properties using structured_schema + for type, props in schema["rel_props"].items(): + props_str = ", ".join( + [f"{prop['property']}: {prop['type']}" for prop in props] + ) + formatted_rel_props.append(f"{type} {{{props_str}}}") + + # Format relationships + formatted_rels = [ + f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" + for el in schema["relationships"] + ] + + return "\n".join( + [ + "Node properties:", + "\n".join(formatted_node_props), + "Relationship properties:", + 
"\n".join(formatted_rel_props), + "The relationships:", + "\n".join(formatted_rels), + ] + ) + + class Neo4jGraph(GraphStore): """Neo4j database wrapper for various graph operations. @@ -151,6 +372,8 @@ class Neo4jGraph(GraphStore): embedding-like properties from database responses. Default is False. refresh_schema (bool): A flag whether to refresh schema information at initialization. Default is True. + enhanced_schema (bool): A flag whether to scan the database for + example values and use them in the graph schema. Default is False. driver_config (Dict): Configuration passed to Neo4j Driver. *Security note*: Make sure that the database connection uses credentials @@ -176,6 +399,7 @@ class Neo4jGraph(GraphStore): refresh_schema: bool = True, *, driver_config: Optional[Dict] = None, + enhanced_schema: bool = False, ) -> None: """Create a new Neo4j graph wrapper instance.""" try: @@ -203,6 +427,7 @@ class Neo4jGraph(GraphStore): self._database = database self.timeout = timeout self.sanitize = sanitize + self._enhanced_schema = enhanced_schema self.schema: str = "" self.structured_schema: Dict[str, Any] = {} # Verify connection @@ -300,37 +525,48 @@ class Neo4jGraph(GraphStore): "metadata": {"constraint": constraint, "index": index}, } - # Format node properties - formatted_node_props = [] - for el in node_properties: - props_str = ", ".join( - [f"{prop['property']}: {prop['type']}" for prop in el["properties"]] - ) - formatted_node_props.append(f"{el['labels']} {{{props_str}}}") - - # Format relationship properties - formatted_rel_props = [] - for el in rel_properties: - props_str = ", ".join( - [f"{prop['property']}: {prop['type']}" for prop in el["properties"]] + if self._enhanced_schema: + schema_counts = self.query( + "CALL apoc.meta.graphSample() YIELD nodes, relationships " + "RETURN nodes, [rel in relationships | {name:apoc.any.property" + "(rel, 'type'), count: apoc.any.property(rel, 'count')}]" + " AS relationships" ) - 
formatted_rel_props.append(f"{el['type']} {{{props_str}}}") + # Update node info + for node in schema_counts[0]["nodes"]: + # Skip bloom labels + if node["name"] in EXCLUDED_LABELS: + continue + node_props = self.structured_schema["node_props"][node["name"]] + enhanced_cypher = _enhanced_schema_cypher( + node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT + ) + enhanced_info = self.query(enhanced_cypher)[0]["output"] + for prop in node_props: + if prop["property"] in enhanced_info: + prop.update(enhanced_info[prop["property"]]) + # Update rel info + for rel in schema_counts[0]["relationships"]: + # Skip bloom labels + if rel["name"] in EXCLUDED_RELS: + continue + rel_props = self.structured_schema["rel_props"].get(rel["name"]) + if not rel_props: + continue + enhanced_cypher = _enhanced_schema_cypher( + rel["name"], + rel_props, + rel["count"] < EXHAUSTIVE_SEARCH_LIMIT, + is_relationship=True, + ) + enhanced_info = self.query(enhanced_cypher)[0]["output"] + for prop in rel_props: + if prop["property"] in enhanced_info: + prop.update(enhanced_info[prop["property"]]) - # Format relationships - formatted_rels = [ - f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships - ] + schema = _format_schema(self.structured_schema, self._enhanced_schema) - self.schema = "\n".join( - [ - "Node properties are the following:", - ",".join(formatted_node_props), - "Relationship properties are the following:", - ",".join(formatted_rel_props), - "The relationships are the following:", - ",".join(formatted_rels), - ] - ) + self.schema = schema def add_graph_documents( self, diff --git a/libs/community/tests/integration_tests/graphs/test_neo4j.py b/libs/community/tests/integration_tests/graphs/test_neo4j.py index c87b8514fe..8fe3349ee3 100644 --- a/libs/community/tests/integration_tests/graphs/test_neo4j.py +++ b/libs/community/tests/integration_tests/graphs/test_neo4j.py @@ -291,3 +291,45 @@ def test_driver_config() -> None: 
driver_config={"max_connection_pool_size": 1}, ) graph.query("RETURN 'foo'") + + +def test_enhanced_schema() -> None: + """Test that enhanced schema adds example values and distinct counts.""" + url = os.environ.get("NEO4J_URI") + username = os.environ.get("NEO4J_USERNAME") + password = os.environ.get("NEO4J_PASSWORD") + assert url is not None + assert username is not None + assert password is not None + + graph = Neo4jGraph( + url=url, username=username, password=password, enhanced_schema=True + ) + graph.query("MATCH (n) DETACH DELETE n") + graph.add_graph_documents(test_data) + graph.refresh_schema() + expected_output = { + "node_props": { + "foo": [ + { + "property": "id", + "type": "STRING", + "values": ["foo"], + "distinct_count": 1, + } + ], + "bar": [ + { + "property": "id", + "type": "STRING", + "values": ["bar"], + "distinct_count": 1, + } + ], + }, + "rel_props": {}, + "relationships": [{"start": "foo", "type": "REL", "end": "bar"}], + } + # remove metadata portion of schema + del graph.structured_schema["metadata"] + assert graph.structured_schema == expected_output