langchain/libs/experimental/langchain_experimental/graph_transformers/llm.py

import asyncio
from typing import Any, List, Optional, Sequence, Type, cast

from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrifing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    "like 'mathematician' or 'scientist'"
    "  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    'but is referred to by different names or pronouns (e.g., "Joe", "he"),'
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)

default_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        (
            "human",
            (
                "Tip: Make sure to answer in the correct format and do "
                "not include any explanations. "
                "Use the given format to extract information from the "
                "following input: {input}"
            ),
        ),
    ]
)


def optional_enum_field(
    enum_values: Optional[List[str]] = None,
    description: str = "",
    is_rel: bool = False,
    **field_kwargs: Any,
) -> Any:
    """Utility function to conditionally create a field with an enum constraint."""
    if enum_values:
        return Field(
            ...,
            enum=enum_values,
            description=f"{description}. Available options are {enum_values}",
            **field_kwargs,
        )
    else:
        node_info = (
            "Ensure you use basic or elementary types for node labels.\n"
            "For example, when you identify an entity representing a person, "
            "always label it as **'Person'**. Avoid using more specific terms "
            "like 'Mathematician' or 'Scientist'"
        )
        rel_info = (
            "Instead of using specific and momentary types such as "
            "'BECAME_PROFESSOR', use more general and timeless relationship types like "
            "'PROFESSOR'. However, do not sacrifice any accuracy for generality"
        )
        additional_info = rel_info if is_rel else node_info
        return Field(..., description=description + additional_info, **field_kwargs)


class _Graph(BaseModel):
    nodes: Optional[List]
    relationships: Optional[List]


def create_simple_model(
    node_labels: Optional[List[str]] = None, rel_types: Optional[List[str]] = None
) -> Type[_Graph]:
    """
    Simple model allows to limit node and/or relationship types.
    Doesn't have any node or relationship properties.
    """

    class SimpleNode(BaseModel):
        """Represents a node in a graph with associated properties."""

        id: str = Field(description="Name or human-readable unique identifier.")
        type: str = optional_enum_field(
            node_labels, description="The type or label of the node."
        )

    class SimpleRelationship(BaseModel):
        """Represents a directed relationship between two nodes in a graph."""

        source_node_id: str = Field(
            description="Name or human-readable unique identifier of source node"
        )
        source_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the source node."
        )
        target_node_id: str = Field(
            description="Name or human-readable unique identifier of target node"
        )
        target_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the target node."
        )
        type: str = optional_enum_field(
            rel_types, description="The type of the relationship.", is_rel=True
        )

    class DynamicGraph(_Graph):
        """Represents a graph document consisting of nodes and relationships."""

        nodes: Optional[List[SimpleNode]] = Field(description="List of nodes")
        relationships: Optional[List[SimpleRelationship]] = Field(
            description="List of relationships"
        )

    return DynamicGraph


def map_to_base_node(node: Any) -> Node:
    """Map the SimpleNode to the base Node."""
    return Node(id=node.id.title(), type=node.type.capitalize())


def map_to_base_relationship(rel: Any) -> Relationship:
    """Map the SimpleRelationship to the base Relationship."""
    source = Node(id=rel.source_node_id.title(), type=rel.source_node_type.capitalize())
    target = Node(id=rel.target_node_id.title(), type=rel.target_node_type.capitalize())
    return Relationship(
        source=source, target=target, type=rel.type.replace(" ", "_").upper()
    )


class LLMGraphTransformer:
    """Transform documents into graph-based documents using a LLM.

    It allows specifying constraints on the types of nodes and relationships to include
    in the output graph. The class doesn't support neither extract and node or
    relationship properties

    Args:
        llm (BaseLanguageModel): An instance of a language model supporting structured
          output.
        allowed_nodes (List[str], optional): Specifies which node types are
          allowed in the graph. Defaults to an empty list, allowing all node types.
        allowed_relationships (List[str], optional): Specifies which relationship types
          are allowed in the graph. Defaults to an empty list, allowing all relationship
          types.
        prompt (Optional[ChatPromptTemplate], optional): The prompt to pass to
          the LLM with additional instructions.
        strict_mode (bool, optional): Determines whether the transformer should apply
          filtering to strictly adhere to `allowed_nodes` and `allowed_relationships`.
          Defaults to True.

    Example:
        .. code-block:: python
            from langchain_experimental.graph_transformers import LLMGraphTransformer
            from langchain_core.documents import Document
            from langchain_openai import ChatOpenAI

            llm=ChatOpenAI(temperature=0)
            transformer = LLMGraphTransformer(
                llm=llm,
                allowed_nodes=["Person", "Organization"])

            doc = Document(page_content="Elon Musk is suing OpenAI")
            graph_documents = transformer.convert_to_graph_documents([doc])
    """

    def __init__(
        self,
        llm: BaseLanguageModel,
        allowed_nodes: List[str] = [],
        allowed_relationships: List[str] = [],
        prompt: ChatPromptTemplate = default_prompt,
        strict_mode: bool = True,
    ) -> None:
        if not hasattr(llm, "with_structured_output"):
            raise ValueError(
                "The specified LLM does not support the 'with_structured_output'. "
                "Please ensure you are using an LLM that supports this feature."
            )
        self.allowed_nodes = allowed_nodes
        self.allowed_relationships = allowed_relationships
        self.strict_mode = strict_mode

        # Define chain
        schema = create_simple_model(allowed_nodes, allowed_relationships)
        structured_llm = llm.with_structured_output(schema)
        self.chain = prompt | structured_llm

    def process_response(self, document: Document) -> GraphDocument:
        """
        Processes a single document, transforming it into a graph document using
        an LLM based on the model's schema and constraints.
        """
        text = document.page_content
        raw_schema = cast(_Graph, self.chain.invoke({"input": text}))
        nodes = (
            [map_to_base_node(node) for node in raw_schema.nodes]
            if raw_schema.nodes
            else []
        )
        relationships = (
            [map_to_base_relationship(rel) for rel in raw_schema.relationships]
            if raw_schema.relationships
            else []
        )

        # Strict mode filtering
        if self.strict_mode and (self.allowed_nodes or self.allowed_relationships):
            if self.allowed_nodes:
                nodes = [node for node in nodes if node.type in self.allowed_nodes]
                relationships = [
                    rel
                    for rel in relationships
                    if rel.source.type in self.allowed_nodes
                    and rel.target.type in self.allowed_nodes
                ]
            if self.allowed_relationships:
                relationships = [
                    rel
                    for rel in relationships
                    if rel.type in self.allowed_relationships
                ]

        return GraphDocument(nodes=nodes, relationships=relationships, source=document)

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.
            **kwargs: Additional keyword arguments.

        Returns:
            Sequence[GraphDocument]: The transformed documents as graphs.
        """
        return [self.process_response(document) for document in documents]

    async def aprocess_response(self, document: Document) -> GraphDocument:
        """
        Asynchronously processes a single document, transforming it into a
        graph document.
        """
        text = document.page_content
        raw_schema = cast(_Graph, await self.chain.ainvoke({"input": text}))

        nodes = (
            [map_to_base_node(node) for node in raw_schema.nodes]
            if raw_schema.nodes
            else []
        )
        relationships = (
            [map_to_base_relationship(rel) for rel in raw_schema.relationships]
            if raw_schema.relationships
            else []
        )

        if self.strict_mode and (self.allowed_nodes or self.allowed_relationships):
            if self.allowed_nodes:
                nodes = [node for node in nodes if node.type in self.allowed_nodes]
                relationships = [
                    rel
                    for rel in relationships
                    if rel.source.type in self.allowed_nodes
                    and rel.target.type in self.allowed_nodes
                ]
            if self.allowed_relationships:
                relationships = [
                    rel
                    for rel in relationships
                    if rel.type in self.allowed_relationships
                ]

        return GraphDocument(nodes=nodes, relationships=relationships, source=document)

    async def aconvert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """
        Asynchronously convert a sequence of documents into graph documents.
        """
        tasks = [
            asyncio.create_task(self.aprocess_response(document))
            for document in documents
        ]
        results = await asyncio.gather(*tasks)
        return results