You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/experimental/langchain_experimental/graph_transformers/llm.py

319 lines
13 KiB
Python

import asyncio
from typing import Any, List, Optional, Sequence, Type, cast
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
# System message used by `default_prompt`. It is written as Markdown: numbered
# "##" sections containing "-" bullet lists, so every bullet must start on its
# own line for the structure to survive.
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # The sentence is closed and followed by a newline so that the next bullet
    # ("Node IDs") is not fused onto the end of this one.
    "like 'mathematician' or 'scientist'.\n"
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Trailing space keeps '"he"),' from running into "always".
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)
# Default chat prompt: the system instructions followed by a single human turn
# whose `{input}` placeholder receives the text to extract from.
default_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        (
            "human",
            "Tip: Make sure to answer in the correct format and do "
            "not include any explanations. "
            "Use the given format to extract information from the "
            "following input: {input}",
        ),
    ]
)
def optional_enum_field(
    enum_values: Optional[List[str]] = None,
    description: str = "",
    is_rel: bool = False,
    **field_kwargs: Any,
) -> Any:
    """Create a required field, constrained to an enum when values are given.

    Args:
        enum_values: Allowed values. When non-empty they are passed to ``Field``
            as ``enum`` and listed at the end of the description.
        description: Base description of the field.
        is_rel: Only consulted when ``enum_values`` is empty. Selects the
            relationship-type guidance instead of the node-label guidance.
        **field_kwargs: Extra keyword arguments forwarded to ``Field``.

    Returns:
        The object returned by ``Field(...)`` (a required field).
    """
    if enum_values:
        # Callers pass descriptions that already end with a period; drop it so
        # the joined text does not read "node.. Available options ...".
        base = description.rstrip().rstrip(".")
        options = f"Available options are {enum_values}"
        return Field(
            ...,
            enum=enum_values,
            description=f"{base}. {options}" if base else options,
            **field_kwargs,
        )

    node_info = (
        "Ensure you use basic or elementary types for node labels.\n"
        "For example, when you identify an entity representing a person, "
        "always label it as **'Person'**. Avoid using more specific terms "
        "like 'Mathematician' or 'Scientist'"
    )
    rel_info = (
        "Instead of using specific and momentary types such as "
        "'BECAME_PROFESSOR', use more general and timeless relationship types like "
        "'PROFESSOR'. However, do not sacrifice any accuracy for generality"
    )
    additional_info = rel_info if is_rel else node_info
    # Join with a space so the guidance is not glued to the last word of the
    # description ("node.Ensure ..."); an empty description contributes nothing.
    full_description = " ".join(
        part for part in (description.strip(), additional_info) if part
    )
    return Field(..., description=full_description, **field_kwargs)
# Base shape of the structured output requested from the LLM.
# `create_simple_model` returns a subclass that re-declares both fields with
# concrete item models; `LLMGraphTransformer` casts the chain result to this type.
class _Graph(BaseModel):
    # Extracted nodes; a falsy value is handled as "no nodes" by the transformer.
    nodes: Optional[List]
    # Extracted relationships; a falsy value is handled as "no relationships".
    relationships: Optional[List]
def create_simple_model(
    node_labels: Optional[List[str]] = None, rel_types: Optional[List[str]] = None
) -> Type[_Graph]:
    """
    Build a graph model class whose node and/or relationship types can be limited.

    The generated models carry no node or relationship properties.

    Args:
        node_labels: Optional node labels, forwarded to `optional_enum_field`
            for every node-type field (node type, source type, target type).
        rel_types: Optional relationship types, forwarded to
            `optional_enum_field` for the relationship `type` field.

    Returns:
        The `DynamicGraph` class, a `_Graph` subclass defined inside this call.
    """

    # The models are declared inside the function so that each call builds
    # its fields from its own `node_labels` / `rel_types`.
    class SimpleNode(BaseModel):
        """Represents a node in a graph with associated properties."""

        id: str = Field(description="Name or human-readable unique identifier.")
        type: str = optional_enum_field(
            node_labels, description="The type or label of the node."
        )

    class SimpleRelationship(BaseModel):
        """Represents a directed relationship between two nodes in a graph."""

        # Endpoints are described by id + type rather than by nested nodes.
        source_node_id: str = Field(
            description="Name or human-readable unique identifier of source node"
        )
        source_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the source node."
        )
        target_node_id: str = Field(
            description="Name or human-readable unique identifier of target node"
        )
        target_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the target node."
        )
        # is_rel=True selects the relationship guidance text when rel_types is empty.
        type: str = optional_enum_field(
            rel_types, description="The type of the relationship.", is_rel=True
        )

    class DynamicGraph(_Graph):
        """Represents a graph document consisting of nodes and relationships."""

        # Re-declare the base fields with the concrete item models defined above.
        nodes: Optional[List[SimpleNode]] = Field(description="List of nodes")
        relationships: Optional[List[SimpleRelationship]] = Field(
            description="List of relationships"
        )

    return DynamicGraph
def map_to_base_node(node: Any) -> Node:
    """Convert an extracted SimpleNode into a base ``Node``.

    The id is title-cased and the type is capitalized before the ``Node`` is
    built.
    """
    normalized_id = node.id.title()
    normalized_type = node.type.capitalize()
    return Node(id=normalized_id, type=normalized_type)
def map_to_base_relationship(rel: Any) -> Relationship:
    """Convert an extracted SimpleRelationship into a base ``Relationship``.

    Both endpoints become ``Node`` objects (title-cased id, capitalized type);
    the relationship type has spaces replaced by underscores and is upper-cased.
    """
    # Source endpoint.
    source_id = rel.source_node_id.title()
    source_type = rel.source_node_type.capitalize()
    start_node = Node(id=source_id, type=source_type)

    # Target endpoint.
    target_id = rel.target_node_id.title()
    target_type = rel.target_node_type.capitalize()
    end_node = Node(id=target_id, type=target_type)

    relationship_type = rel.type.replace(" ", "_").upper()
    return Relationship(source=start_node, target=end_node, type=relationship_type)
class LLMGraphTransformer:
    """Transform documents into graph-based documents using an LLM.

    It allows specifying constraints on the types of nodes and relationships to
    include in the output graph. The class does not support extracting node or
    relationship properties.

    Args:
        llm (BaseLanguageModel): An instance of a language model supporting
          structured output.
        allowed_nodes (List[str], optional): Specifies which node types are
          allowed in the graph. Defaults to an empty list, allowing all node
          types.
        allowed_relationships (List[str], optional): Specifies which relationship
          types are allowed in the graph. Defaults to an empty list, allowing all
          relationship types.
        prompt (Optional[ChatPromptTemplate], optional): The prompt to pass to
          the LLM with additional instructions.
        strict_mode (bool, optional): Determines whether the transformer should
          apply filtering to strictly adhere to `allowed_nodes` and
          `allowed_relationships`. Matching ignores letter case, because
          extracted node types are capitalized and relationship types are
          upper-cased before the filter runs. Defaults to True.

    Example:
        .. code-block:: python

            from langchain_experimental.graph_transformers import LLMGraphTransformer
            from langchain_core.documents import Document
            from langchain_openai import ChatOpenAI

            llm=ChatOpenAI(temperature=0)
            transformer = LLMGraphTransformer(
                llm=llm,
                allowed_nodes=["Person", "Organization"])

            doc = Document(page_content="Elon Musk is suing OpenAI")
            graph_documents = transformer.convert_to_graph_documents([doc])
    """

    def __init__(
        self,
        llm: BaseLanguageModel,
        allowed_nodes: List[str] = [],
        allowed_relationships: List[str] = [],
        prompt: ChatPromptTemplate = default_prompt,
        strict_mode: bool = True,
    ) -> None:
        """Validate the LLM and build the extraction chain.

        Raises:
            ValueError: If `llm` has no `with_structured_output` attribute.
        """
        if not hasattr(llm, "with_structured_output"):
            raise ValueError(
                "The specified LLM does not support the 'with_structured_output'. "
                "Please ensure you are using an LLM that supports this feature."
            )
        # Store copies: the signature's default lists are shared between calls,
        # so keeping a reference would let one instance's mutation leak into
        # every other instance created with the defaults.
        self.allowed_nodes = list(allowed_nodes or [])
        self.allowed_relationships = list(allowed_relationships or [])
        self.strict_mode = strict_mode

        # Define chain
        schema = create_simple_model(self.allowed_nodes, self.allowed_relationships)
        structured_llm = llm.with_structured_output(schema)
        self.chain = prompt | structured_llm

    def _build_graph_document(
        self, raw_schema: _Graph, document: Document
    ) -> GraphDocument:
        """Map raw structured output to base objects and apply strict filtering.

        Shared by the synchronous and asynchronous processing paths so both
        apply exactly the same rules.
        """
        nodes = [map_to_base_node(node) for node in raw_schema.nodes or []]
        relationships = [
            map_to_base_relationship(rel) for rel in raw_schema.relationships or []
        ]

        # Strict mode filtering
        if self.strict_mode:
            if self.allowed_nodes:
                # map_to_base_node() capitalizes types ("GPE" -> "Gpe"), so an
                # exact comparison would drop every node whose allowed label is
                # not already in that form. Compare case-insensitively instead.
                allowed_node_types = {label.lower() for label in self.allowed_nodes}
                nodes = [
                    node for node in nodes if node.type.lower() in allowed_node_types
                ]
                relationships = [
                    rel
                    for rel in relationships
                    if rel.source.type.lower() in allowed_node_types
                    and rel.target.type.lower() in allowed_node_types
                ]
            if self.allowed_relationships:
                # Extracted types were normalized with replace(" ", "_").upper();
                # normalize the allowed values the same way before comparing.
                allowed_rel_types = {
                    rel_type.replace(" ", "_").lower()
                    for rel_type in self.allowed_relationships
                }
                relationships = [
                    rel
                    for rel in relationships
                    if rel.type.lower() in allowed_rel_types
                ]
        return GraphDocument(nodes=nodes, relationships=relationships, source=document)

    def process_response(self, document: Document) -> GraphDocument:
        """
        Processes a single document, transforming it into a graph document using
        an LLM based on the model's schema and constraints.
        """
        text = document.page_content
        raw_schema = cast(_Graph, self.chain.invoke({"input": text}))
        return self._build_graph_document(raw_schema, document)

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            List[GraphDocument]: The transformed documents as graphs, in the
            same order as `documents`.
        """
        return [self.process_response(document) for document in documents]

    async def aprocess_response(self, document: Document) -> GraphDocument:
        """
        Asynchronously processes a single document, transforming it into a
        graph document.
        """
        text = document.page_content
        raw_schema = cast(_Graph, await self.chain.ainvoke({"input": text}))
        return self._build_graph_document(raw_schema, document)

    async def aconvert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """
        Asynchronously convert a sequence of documents into graph documents.

        One task is created per document and all of them are awaited together;
        results are returned in the same order as `documents`.
        """
        tasks = [
            asyncio.create_task(self.aprocess_response(document))
            for document in documents
        ]
        return list(await asyncio.gather(*tasks))