From 0c553d2064e3e3bc3c56438ceadec8aab37dd7ad Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Sun, 12 Feb 2023 23:01:26 -0800
Subject: [PATCH] Harrison/kg (#1016)

Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com>
---
 .../combine_docs_examples/graph_qa.ipynb      | 238 ++++++++++++++++++
 docs/modules/memory/getting_started.ipynb     | 184 +++++++++++++-
 langchain/callbacks/stdout.py                 |   2 +-
 langchain/chains/__init__.py                  |   2 +
 langchain/chains/conversation/memory.py       | 104 ++++++++
 langchain/chains/conversation/prompt.py       |  55 ++++
 langchain/chains/graph_qa/__init__.py         |   1 +
 langchain/chains/graph_qa/base.py             |  78 ++++++
 langchain/chains/graph_qa/prompts.py          |  34 +++
 langchain/graphs/__init__.py                  |   4 +
 langchain/graphs/networkx_graph.py            |  96 +++++++
 langchain/indexes/__init__.py                 |   4 +
 langchain/indexes/graph.py                    |  30 +++
 langchain/indexes/prompts/__init__.py         |   1 +
 .../indexes/prompts/entity_extraction.py      |  40 +++
 .../indexes/prompts/entity_summarization.py   |  25 ++
 .../prompts/knowledge_triplet_extraction.py   |  37 +++
 poetry.lock                                   |  23 +-
 pyproject.toml                                |   4 +-
 19 files changed, 955 insertions(+), 7 deletions(-)
 create mode 100644 docs/modules/chains/combine_docs_examples/graph_qa.ipynb
 create mode 100644 langchain/chains/graph_qa/__init__.py
 create mode 100644 langchain/chains/graph_qa/base.py
 create mode 100644 langchain/chains/graph_qa/prompts.py
 create mode 100644 langchain/graphs/__init__.py
 create mode 100644 langchain/graphs/networkx_graph.py
 create mode 100644 langchain/indexes/__init__.py
 create mode 100644 langchain/indexes/graph.py
 create mode 100644 langchain/indexes/prompts/__init__.py
 create mode 100644 langchain/indexes/prompts/entity_extraction.py
 create mode 100644 langchain/indexes/prompts/entity_summarization.py
 create mode 100644 langchain/indexes/prompts/knowledge_triplet_extraction.py

diff --git a/docs/modules/chains/combine_docs_examples/graph_qa.ipynb b/docs/modules/chains/combine_docs_examples/graph_qa.ipynb
new file mode 100644
index 00000000..6e8248fa
--- /dev/null
+++ b/docs/modules/chains/combine_docs_examples/graph_qa.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a6850189",
+   "metadata": {},
+   "source": [
+    "# Graph QA\n",
+    "\n",
+    "This notebook goes over how to do question answering over a graph data structure."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e516e3e",
+   "metadata": {},
+   "source": [
+    "## Create the graph\n",
+    "\n",
+    "In this section, we construct an example graph. At the moment, this works best for small pieces of text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3849873d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.indexes import GraphIndexCreator\n",
+    "from langchain.llms import OpenAI\n",
+    "from langchain.document_loaders import TextLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "05d65c87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_creator = GraphIndexCreator(llm=OpenAI(temperature=0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0a45a5b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../../state_of_the_union.txt\") as f:\n",
+    "    all_text = f.read()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3fca3e1b",
+   "metadata": {},
+   "source": [
+    "We will use just a small snippet, because extracting the knowledge triplets is a bit intensive at the moment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "80522bd6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"\\n\".join(all_text.split(\"\\n\\n\")[105:108])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "da5aad5a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'It won’t look like much, but if you stop and look closely, you’ll see a “Field of dreams,” the ground on which America’s future will be built. \\nThis is where Intel, the American company that helped build Silicon Valley, is going to build its $20 billion semiconductor “mega site”. \\nUp to eight state-of-the-art factories in one place. 10,000 new good-paying jobs. '"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8dad7b59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph = index_creator.from_text(text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2118f363",
+   "metadata": {},
+   "source": [
+    "We can inspect the created graph."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "32878c13",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Intel', '$20 billion semiconductor \"mega site\"', 'is going to build'),\n",
+       " ('Intel', 'state-of-the-art factories', 'is building'),\n",
+       " ('Intel', '10,000 new good-paying jobs', 'is creating'),\n",
+       " ('Intel', 'Silicon Valley', 'is helping build'),\n",
+       " ('Field of dreams',\n",
+       "  \"America's future will be built\",\n",
+       "  'is the ground on which')]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.get_triples()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9737be1",
+   "metadata": {},
+   "source": [
+    "## Querying the graph\n",
+    "We can now use the graph QA chain to ask questions of the graph."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "76edc854",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import GraphQAChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8e7719b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = GraphQAChain.from_llm(OpenAI(temperature=0), graph=graph, verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f6511169",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\u001b[1m> Entering new GraphQAChain chain...\u001b[0m\n",
+      "Entities Extracted:\n",
+      "\u001b[32;1m\u001b[1;3m Intel\u001b[0m\n",
+      "Full Context:\n",
+      "\u001b[32;1m\u001b[1;3mIntel is going to build $20 billion semiconductor \"mega site\"\n",
+      "Intel is building state-of-the-art factories\n",
+      "Intel is creating 10,000 new good-paying jobs\n",
+      "Intel is helping build Silicon Valley\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Finished chain.\u001b[0m\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "' Intel is going to build a $20 billion semiconductor \"mega site\" with state-of-the-art factories, creating 10,000 new good-paying jobs and helping to build Silicon Valley.'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.run(\"what is Intel going to build?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f70b9ada",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/memory/getting_started.ipynb b/docs/modules/memory/getting_started.ipynb index 5818691d..d42b5e83 100644 --- a/docs/modules/memory/getting_started.ipynb +++ b/docs/modules/memory/getting_started.ipynb @@ -692,12 +692,192 @@ "conversation_with_summary.predict(input=\"Haha nope, although a lot of people confuse it for that\")" ] }, + { + "cell_type": "markdown", + "id": "44c9933a", + "metadata": {}, + "source": [ + "## Conversation Knowledge Graph Memory\n", + "\n", + "This type of memory uses a knowledge graph to recreate memory." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f71f40ba", "metadata": {}, "outputs": [], + "source": [ + "from langchain.chains.conversation.memory import ConversationKGMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b462baf1", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0)\n", + "from langchain.prompts.prompt import PromptTemplate\n", + "\n", + "template = \"\"\"The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. \n", + "If the AI does not know the answer to a question, it truthfully says it does not know. The AI ONLY uses information contained in the \"Relevant Information\" section and does not hallucinate.\n", + "\n", + "Relevant Information:\n", + "\n", + "{history}\n", + "\n", + "Conversation:\n", + "Human: {input}\n", + "AI:\"\"\"\n", + "prompt = PromptTemplate(\n", + " input_variables=[\"history\", \"input\"], template=template\n", + ")\n", + "conversation_with_kg = ConversationChain(\n", + " llm=llm, \n", + " verbose=True, \n", + " prompt=prompt,\n", + " memory=ConversationKGMemory(llm=llm)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "97efaf38", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. \n", + "If the AI does not know the answer to a question, it truthfully says it does not know. The AI ONLY uses information contained in the \"Relevant Information\" section and does not hallucinate.\n", + "\n", + "Relevant Information:\n", + "\n", + "\n", + "\n", + "Conversation:\n", + "Human: Hi, what's up?\n", + "AI:\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "\" Hi there! I'm doing great. I'm currently in the process of learning about the world around me. I'm learning about different cultures, languages, and customs. It's really fascinating! 
How about you?\"" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversation_with_kg.predict(input=\"Hi, what's up?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "55b5bcad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. \n", + "If the AI does not know the answer to a question, it truthfully says it does not know. The AI ONLY uses information contained in the \"Relevant Information\" section and does not hallucinate.\n", + "\n", + "Relevant Information:\n", + "\n", + "\n", + "\n", + "Conversation:\n", + "Human: My name is James and I'm helping Will. He's an engineer.\n", + "AI:\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "\" Hi James, it's nice to meet you. I'm an AI and I understand you're helping Will, the engineer. What kind of engineering does he do?\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversation_with_kg.predict(input=\"My name is James and I'm helping Will. He's an engineer.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9981e219", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. \n", + "If the AI does not know the answer to a question, it truthfully says it does not know. 
The AI ONLY uses information contained in the \"Relevant Information\" section and does not hallucinate.\n", + "\n", + "Relevant Information:\n", + "\n", + "On Will: Will is an engineer.\n", + "\n", + "Conversation:\n", + "Human: What do you know about Will?\n", + "AI:\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "' Will is an engineer.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversation_with_kg.predict(input=\"What do you know about Will?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c09a239", + "metadata": {}, + "outputs": [], "source": [] } ], @@ -717,7 +897,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/callbacks/stdout.py b/langchain/callbacks/stdout.py index 07c747c7..2b0b860d 100644 --- a/langchain/callbacks/stdout.py +++ b/langchain/callbacks/stdout.py @@ -9,7 +9,7 @@ from langchain.schema import AgentAction, AgentFinish, LLMResult class StdOutCallbackHandler(BaseCallbackHandler): """Callback Handler that prints to std out.""" - def __init__(self, color: str = "green") -> None: + def __init__(self, color: Optional[str] = None) -> None: """Initialize callback handler.""" self.color = color diff --git a/langchain/chains/__init__.py b/langchain/chains/__init__.py index f71659cc..5a88675d 100644 --- a/langchain/chains/__init__.py +++ b/langchain/chains/__init__.py @@ -3,6 +3,7 @@ from langchain.chains.api.base import APIChain from langchain.chains.chat_vector_db.base import ChatVectorDBChain from langchain.chains.combine_documents.base import AnalyzeDocumentChain from langchain.chains.conversation.base import ConversationChain +from langchain.chains.graph_qa.base import GraphQAChain from langchain.chains.hyde.base import HypotheticalDocumentEmbedder from langchain.chains.llm import LLMChain from langchain.chains.llm_bash.base import LLMBashChain @@ -46,4 +47,5 @@ __all__ = [ "AnalyzeDocumentChain", "HypotheticalDocumentEmbedder", "ChatVectorDBChain", + "GraphQAChain", ] diff --git a/langchain/chains/conversation/memory.py b/langchain/chains/conversation/memory.py index cf174436..185cc726 100644 --- a/langchain/chains/conversation/memory.py +++ b/langchain/chains/conversation/memory.py @@ -7,9 +7,15 @@ from langchain.chains.base import Memory from langchain.chains.conversation.prompt import ( ENTITY_EXTRACTION_PROMPT, ENTITY_SUMMARIZATION_PROMPT, + KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT, SUMMARY_PROMPT, ) from langchain.chains.llm import LLMChain +from langchain.graphs.networkx_graph import ( + NetworkxEntityGraph, + get_entities, + parse_triples, +) from langchain.llms.base import BaseLLM from langchain.prompts.base import BasePromptTemplate @@ -381,3 +387,101 @@ class ConversationSummaryBufferMemory(Memory, BaseModel): """Clear memory contents.""" self.buffer = [] self.moving_summary_buffer = "" + + +class ConversationKGMemory(Memory, BaseModel): + """Knowledge graph memory for storing conversation memory. + + Integrates with external knowledge graph to store and retrieve + information about knowledge triples in the conversation. 
+ """ + + k: int = 2 + buffer: List[str] = Field(default_factory=list) + kg: NetworkxEntityGraph = Field(default_factory=NetworkxEntityGraph) + knowledge_extraction_prompt: BasePromptTemplate = KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT + entity_extraction_prompt: BasePromptTemplate = ENTITY_EXTRACTION_PROMPT + llm: BaseLLM + """Number of previous utterances to include in the context.""" + human_prefix: str = "Human" + ai_prefix: str = "AI" + """Prefix to use for AI generated responses.""" + output_key: Optional[str] = None + input_key: Optional[str] = None + memory_key: str = "history" #: :meta private: + + def load_memory_variables(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """Return history buffer.""" + entities = self._get_current_entities(inputs) + summaries = {} + for entity in entities: + knowledge = self.kg.get_entity_knowledge(entity) + if knowledge: + summaries[entity] = ". ".join(knowledge) + "." + if summaries: + summary_strings = [ + f"On {entity}: {summary}" for entity, summary in summaries.items() + ] + context_str = "\n".join(summary_strings) + else: + context_str = "" + return {self.memory_key: context_str} + + @property + def memory_variables(self) -> List[str]: + """Will always return list of memory variables. + + :meta private: + """ + return [self.memory_key] + + def _get_prompt_input_key(self, inputs: Dict[str, Any]) -> str: + """Get the input key for the prompt.""" + if self.input_key is None: + return _get_prompt_input_key(inputs, self.memory_variables) + return self.input_key + + def _get_prompt_output_key(self, outputs: Dict[str, Any]) -> str: + """Get the output key for the prompt.""" + if self.output_key is None: + if len(outputs) != 1: + raise ValueError(f"One output key expected, got {outputs.keys()}") + return list(outputs.keys())[0] + return self.output_key + + def _get_current_entities(self, inputs: Dict[str, Any]) -> List[str]: + """Get the current entities in the conversation.""" + prompt_input_key = self._get_prompt_input_key(inputs) + chain = LLMChain(llm=self.llm, prompt=self.entity_extraction_prompt) + output = chain.predict( + history="\n".join(self.buffer[-self.k :]), + input=inputs[prompt_input_key], + ) + return get_entities(output) + + def _get_and_update_kg(self, inputs: Dict[str, Any]) -> None: + """Get and update knowledge graph from the conversation history.""" + chain = LLMChain(llm=self.llm, prompt=self.knowledge_extraction_prompt) + prompt_input_key = self._get_prompt_input_key(inputs) + output = chain.predict( + history="\n".join(self.buffer[-self.k :]), + input=inputs[prompt_input_key], + verbose=True, + ) + knowledge = parse_triples(output) + for triple in knowledge: + self.kg.add_triple(triple) + + def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None: + """Save context from this conversation to buffer.""" + self._get_and_update_kg(inputs) + prompt_input_key = self._get_prompt_input_key(inputs) + output_key = self._get_prompt_output_key(outputs) + human = f"{self.human_prefix}: {inputs[prompt_input_key]}" + ai = f"{self.ai_prefix}: {outputs[output_key]}" + new_lines = "\n".join([human.strip(), ai.strip()]) + self.buffer.append(new_lines) + + def clear(self) -> None: + """Clear memory contents.""" + return self.kg.clear() diff --git a/langchain/chains/conversation/prompt.py b/langchain/chains/conversation/prompt.py index 9ee8e64a..fc44f05c 100644 --- a/langchain/chains/conversation/prompt.py +++ b/langchain/chains/conversation/prompt.py @@ -118,3 +118,58 @@ ENTITY_SUMMARIZATION_PROMPT = PromptTemplate( 
input_variables=["entity", "summary", "history", "input"], template=_DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE, ) + + +KG_TRIPLE_DELIMITER = "<|>" +_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE = ( + "You are a networked intelligence helping a human track knowledge triples" + " about all relevant people, things, concepts, etc. and integrating" + " them with your knowledge stored within your weights" + " as well as that stored in a knowledge graph." + " Extract all of the knowledge triples from the last line of conversation." + " A knowledge triple is a clause that contains a subject, a predicate," + " and an object. The subject is the entity being described," + " the predicate is the property of the subject that is being" + " described, and the object is the value of the property.\n\n" + "EXAMPLE\n" + "Conversation history:\n" + "Person #1: Did you hear aliens landed in Area 51?\n" + "AI: No, I didn't hear that. What do you know about Area 51?\n" + "Person #1: It's a secret military base in Nevada.\n" + "AI: What do you know about Nevada?\n" + "Last line of conversation:\n" + "Person #1: It's a state in the US. It's also the number 1 producer of gold in the US.\n\n" + f"Output: (Nevada, is a, state){KG_TRIPLE_DELIMITER}(Nevada, is in, US)" + f"{KG_TRIPLE_DELIMITER}(Nevada, is the number 1 producer of, gold)\n" + "END OF EXAMPLE\n\n" + "EXAMPLE\n" + "Conversation history:\n" + "Person #1: Hello.\n" + "AI: Hi! How are you?\n" + "Person #1: I'm good. How are you?\n" + "AI: I'm good too.\n" + "Last line of conversation:\n" + "Person #1: I'm going to the store.\n\n" + "Output: NONE\n" + "END OF EXAMPLE\n\n" + "EXAMPLE\n" + "Conversation history:\n" + "Person #1: What do you know about Descartes?\n" + "AI: Descartes was a French philosopher, mathematician, and scientist who lived in the 17th century.\n" + "Person #1: The Descartes I'm referring to is a standup comedian and interior designer from Montreal.\n" + "AI: Oh yes, He is a comedian and an interior designer. He has been in the industry for 30 years. His favorite food is baked bean pie.\n" + "Person #1: Oh huh. 
I know Descartes likes to drive antique scooters and play the mandolin.\n" + "Last line of conversation:\n" + f"Output: (Descartes, likes to drive, antique scooters){KG_TRIPLE_DELIMITER}(Descartes, plays, mandolin)\n" + "END OF EXAMPLE\n\n" + "Conversation history (for reference only):\n" + "{history}" + "\nLast line of conversation (for extraction):\n" + "Human: {input}\n\n" + "Output:" +) + +KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT = PromptTemplate( + input_variables=["history", "input"], + template=_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE, +) diff --git a/langchain/chains/graph_qa/__init__.py b/langchain/chains/graph_qa/__init__.py new file mode 100644 index 00000000..f3bc55ef --- /dev/null +++ b/langchain/chains/graph_qa/__init__.py @@ -0,0 +1 @@ +"""Question answering over a knowledge graph.""" diff --git a/langchain/chains/graph_qa/base.py b/langchain/chains/graph_qa/base.py new file mode 100644 index 00000000..addf72f8 --- /dev/null +++ b/langchain/chains/graph_qa/base.py @@ -0,0 +1,78 @@ +"""Question answering over a graph.""" +from __future__ import annotations + +from typing import Any, Dict, List + +from pydantic import Field + +from langchain.chains.base import Chain +from langchain.chains.graph_qa.prompts import ENTITY_EXTRACTION_PROMPT, PROMPT +from langchain.chains.llm import LLMChain +from langchain.graphs.networkx_graph import NetworkxEntityGraph, get_entities +from langchain.llms.base import BaseLLM +from langchain.prompts.base import BasePromptTemplate + + +class GraphQAChain(Chain): + """Chain for question-answering against a graph.""" + + graph: NetworkxEntityGraph = Field(exclude=True) + entity_extraction_chain: LLMChain + qa_chain: LLMChain + input_key: str = "query" #: :meta private: + output_key: str = "result" #: :meta private: + + @property + def input_keys(self) -> List[str]: + """Return the input keys. + + :meta private: + """ + return [self.input_key] + + @property + def output_keys(self) -> List[str]: + """Return the output keys. 
+ + :meta private: + """ + _output_keys = [self.output_key] + return _output_keys + + @classmethod + def from_llm( + cls, + llm: BaseLLM, + qa_prompt: BasePromptTemplate = PROMPT, + entity_prompt: BasePromptTemplate = ENTITY_EXTRACTION_PROMPT, + **kwargs: Any, + ) -> GraphQAChain: + """Initialize from LLM.""" + qa_chain = LLMChain(llm=llm, prompt=qa_prompt) + entity_chain = LLMChain(llm=llm, prompt=entity_prompt) + + return cls(qa_chain=qa_chain, entity_extraction_chain=entity_chain, **kwargs) + + def _call(self, inputs: Dict[str, str]) -> Dict[str, Any]: + """Extract entities, look up info and answer question.""" + question = inputs[self.input_key] + + entity_string = self.entity_extraction_chain.run(question) + + self.callback_manager.on_text( + "Entities Extracted:", end="\n", verbose=self.verbose + ) + self.callback_manager.on_text( + entity_string, color="green", end="\n", verbose=self.verbose + ) + entities = get_entities(entity_string) + context = "" + for entity in entities: + triplets = self.graph.get_entity_knowledge(entity) + context += "\n".join(triplets) + self.callback_manager.on_text("Full Context:", end="\n", verbose=self.verbose) + self.callback_manager.on_text( + context, color="green", end="\n", verbose=self.verbose + ) + result = self.qa_chain({"question": question, "context": context}) + return {self.output_key: result[self.qa_chain.output_key]} diff --git a/langchain/chains/graph_qa/prompts.py b/langchain/chains/graph_qa/prompts.py new file mode 100644 index 00000000..6fdf5247 --- /dev/null +++ b/langchain/chains/graph_qa/prompts.py @@ -0,0 +1,34 @@ +# flake8: noqa +from langchain.prompts.prompt import PromptTemplate + +_DEFAULT_ENTITY_EXTRACTION_TEMPLATE = """Extract all entities from the following text. As a guideline, a proper noun is generally capitalized. You should definitely extract all names and places. + +Return the output as a single comma-separated list, or NONE if there is nothing of note to return. + +EXAMPLE +i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. +Output: Langchain +END OF EXAMPLE + +EXAMPLE +i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. I'm working with Sam. +Output: Langchain, Sam +END OF EXAMPLE + +Begin! + +{input} +Output:""" +ENTITY_EXTRACTION_PROMPT = PromptTemplate( + input_variables=["input"], template=_DEFAULT_ENTITY_EXTRACTION_TEMPLATE +) + +prompt_template = """Use the following knowledge triplets to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 
+ +{context} + +Question: {question} +Helpful Answer:""" +PROMPT = PromptTemplate( + template=prompt_template, input_variables=["context", "question"] +) diff --git a/langchain/graphs/__init__.py b/langchain/graphs/__init__.py new file mode 100644 index 00000000..68851c6d --- /dev/null +++ b/langchain/graphs/__init__.py @@ -0,0 +1,4 @@ +"""Graph implementations.""" +from langchain.graphs.networkx_graph import NetworkxEntityGraph + +__all__ = ["NetworkxEntityGraph"] diff --git a/langchain/graphs/networkx_graph.py b/langchain/graphs/networkx_graph.py new file mode 100644 index 00000000..f48fe0f1 --- /dev/null +++ b/langchain/graphs/networkx_graph.py @@ -0,0 +1,96 @@ +"""Networkx wrapper for graph operations.""" + +from typing import List, NamedTuple, Tuple + +KG_TRIPLE_DELIMITER = "<|>" + + +class KnowledgeTriple(NamedTuple): + """A triple in the graph.""" + + subject: str + predicate: str + object_: str + + @classmethod + def from_string(cls, triple_string: str) -> "KnowledgeTriple": + """Create a KnowledgeTriple from a string.""" + subject, predicate, object_ = triple_string.strip().split(", ") + subject = subject[1:] + object_ = object_[:-1] + return cls(subject, predicate, object_) + + +def parse_triples(knowledge_str: str) -> List[KnowledgeTriple]: + """Parse knowledge triples from the knowledge string.""" + knowledge_str = knowledge_str.strip() + if not knowledge_str or knowledge_str == "NONE": + return [] + triple_strs = knowledge_str.split(KG_TRIPLE_DELIMITER) + results = [] + for triple_str in triple_strs: + try: + kg_triple = KnowledgeTriple.from_string(triple_str) + except ValueError: + continue + results.append(kg_triple) + return results + + +def get_entities(entity_str: str) -> List[str]: + """Extract entities from entity string.""" + if entity_str.strip() == "NONE": + return [] + else: + return [w.strip() for w in entity_str.split(",")] + + +class NetworkxEntityGraph: + """Networkx wrapper for entity graph operations.""" + + def __init__(self) -> None: + """Create a new graph.""" + import networkx as nx + + self._graph = nx.DiGraph() + + def add_triple(self, knowledge_triple: KnowledgeTriple) -> None: + """Add a triple to the graph.""" + # Creates nodes if they don't exist + # Overwrites existing edges + if not self._graph.has_node(knowledge_triple.subject): + self._graph.add_node(knowledge_triple.subject) + if not self._graph.has_node(knowledge_triple.object_): + self._graph.add_node(knowledge_triple.object_) + self._graph.add_edge( + knowledge_triple.subject, + knowledge_triple.object_, + relation=knowledge_triple.predicate, + ) + + def delete_triple(self, knowledge_triple: KnowledgeTriple) -> None: + """Delete a triple from the graph.""" + if self._graph.has_edge(knowledge_triple.subject, knowledge_triple.object_): + self._graph.remove_edge(knowledge_triple.subject, knowledge_triple.object_) + + def get_triples(self) -> List[Tuple[str, str, str]]: + """Get all triples in the graph.""" + return [(u, v, d["relation"]) for u, v, d in self._graph.edges(data=True)] + + def get_entity_knowledge(self, entity: str, depth: int = 1) -> List[str]: + """Get information about an entity.""" + import networkx as nx + + # TODO: Have more information-specific retrieval methods + if not self._graph.has_node(entity): + return [] + + results = [] + for src, sink in nx.dfs_edges(self._graph, entity, depth_limit=depth): + relation = self._graph[src][sink]["relation"] + results.append(f"{src} {relation} {sink}") + return results + + def clear(self) -> None: + """Clear the graph.""" + 
self._graph.clear() diff --git a/langchain/indexes/__init__.py b/langchain/indexes/__init__.py new file mode 100644 index 00000000..c8a3547b --- /dev/null +++ b/langchain/indexes/__init__.py @@ -0,0 +1,4 @@ +"""All index utils.""" +from langchain.indexes.graph import GraphIndexCreator + +__all__ = ["GraphIndexCreator"] diff --git a/langchain/indexes/graph.py b/langchain/indexes/graph.py new file mode 100644 index 00000000..519e4ac4 --- /dev/null +++ b/langchain/indexes/graph.py @@ -0,0 +1,30 @@ +"""Graph Index Creator.""" +from typing import Optional, Type + +from pydantic import BaseModel + +from langchain.chains.llm import LLMChain +from langchain.graphs.networkx_graph import NetworkxEntityGraph, parse_triples +from langchain.indexes.prompts.knowledge_triplet_extraction import ( + KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT, +) +from langchain.llms.base import BaseLLM + + +class GraphIndexCreator(BaseModel): + """Functionality to create graph index.""" + + llm: Optional[BaseLLM] = None + graph_type: Type[NetworkxEntityGraph] = NetworkxEntityGraph + + def from_text(self, text: str) -> NetworkxEntityGraph: + """Create graph index from text.""" + if self.llm is None: + raise ValueError("llm should not be None") + graph = self.graph_type() + chain = LLMChain(llm=self.llm, prompt=KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT) + output = chain.predict(text=text) + knowledge = parse_triples(output) + for triple in knowledge: + graph.add_triple(triple) + return graph diff --git a/langchain/indexes/prompts/__init__.py b/langchain/indexes/prompts/__init__.py new file mode 100644 index 00000000..1a5833cd --- /dev/null +++ b/langchain/indexes/prompts/__init__.py @@ -0,0 +1 @@ +"""Relevant prompts for constructing indexes.""" diff --git a/langchain/indexes/prompts/entity_extraction.py b/langchain/indexes/prompts/entity_extraction.py new file mode 100644 index 00000000..47cc349c --- /dev/null +++ b/langchain/indexes/prompts/entity_extraction.py @@ -0,0 +1,40 @@ +# flake8: noqa +from langchain.prompts.prompt import PromptTemplate + +_DEFAULT_ENTITY_EXTRACTION_TEMPLATE = """You are an AI assistant reading the transcript of a conversation between an AI and a human. Extract all of the proper nouns from the last line of conversation. As a guideline, a proper noun is generally capitalized. You should definitely extract all names and places. + +The conversation history is provided just in case of a coreference (e.g. "What do you know about him" where "him" is defined in a previous line) -- ignore items mentioned there that are not in the last line. + +Return the output as a single comma-separated list, or NONE if there is nothing of note to return (e.g. the user is just issuing a greeting or having a simple conversation). + +EXAMPLE +Conversation history: +Person #1: how's it going today? +AI: "It's going great! How about you?" +Person #1: good! busy working on Langchain. lots to do. +AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" +Last line: +Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. +Output: Langchain +END OF EXAMPLE + +EXAMPLE +Conversation history: +Person #1: how's it going today? +AI: "It's going great! How about you?" +Person #1: good! busy working on Langchain. lots to do. +AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" 
+Last line: +Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. I'm working with Person #2. +Output: Langchain, Person #2 +END OF EXAMPLE + +Conversation history (for reference only): +{history} +Last line of conversation (for extraction): +Human: {input} + +Output:""" +ENTITY_EXTRACTION_PROMPT = PromptTemplate( + input_variables=["history", "input"], template=_DEFAULT_ENTITY_EXTRACTION_TEMPLATE +) diff --git a/langchain/indexes/prompts/entity_summarization.py b/langchain/indexes/prompts/entity_summarization.py new file mode 100644 index 00000000..41e97f5f --- /dev/null +++ b/langchain/indexes/prompts/entity_summarization.py @@ -0,0 +1,25 @@ +# flake8: noqa +from langchain.prompts.prompt import PromptTemplate + +_DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE = """You are an AI assistant helping a human keep track of facts about relevant people, places, and concepts in their life. Update the summary of the provided entity in the "Entity" section based on the last line of your conversation with the human. If you are writing the summary for the first time, return a single sentence. +The update should only include facts that are relayed in the last line of conversation about the provided entity, and should only contain facts about the provided entity. + +If there is no new information about the provided entity or the information is not worth noting (not an important or relevant fact to remember long-term), return the existing summary unchanged. + +Full conversation history (for context): +{history} + +Entity to summarize: +{entity} + +Existing summary of {entity}: +{summary} + +Last line of conversation: +Human: {input} +Updated summary:""" + +ENTITY_SUMMARIZATION_PROMPT = PromptTemplate( + input_variables=["entity", "summary", "history", "input"], + template=_DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE, +) diff --git a/langchain/indexes/prompts/knowledge_triplet_extraction.py b/langchain/indexes/prompts/knowledge_triplet_extraction.py new file mode 100644 index 00000000..0505965c --- /dev/null +++ b/langchain/indexes/prompts/knowledge_triplet_extraction.py @@ -0,0 +1,37 @@ +# flake8: noqa + +from langchain.graphs.networkx_graph import KG_TRIPLE_DELIMITER +from langchain.prompts.prompt import PromptTemplate + +_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE = ( + "You are a networked intelligence helping a human track knowledge triples" + " about all relevant people, things, concepts, etc. and integrating" + " them with your knowledge stored within your weights" + " as well as that stored in a knowledge graph." + " Extract all of the knowledge triples from the text." + " A knowledge triple is a clause that contains a subject, a predicate," + " and an object. The subject is the entity being described," + " the predicate is the property of the subject that is being" + " described, and the object is the value of the property.\n\n" + "EXAMPLE\n" + "It's a state in the US. It's also the number 1 producer of gold in the US.\n\n" + f"Output: (Nevada, is a, state){KG_TRIPLE_DELIMITER}(Nevada, is in, US)" + f"{KG_TRIPLE_DELIMITER}(Nevada, is the number 1 producer of, gold)\n" + "END OF EXAMPLE\n\n" + "EXAMPLE\n" + "I'm going to the store.\n\n" + "Output: NONE\n" + "END OF EXAMPLE\n\n" + "EXAMPLE\n" + "Oh huh. 
I know Descartes likes to drive antique scooters and play the mandolin.\n"
+    f"Output: (Descartes, likes to drive, antique scooters){KG_TRIPLE_DELIMITER}(Descartes, plays, mandolin)\n"
+    "END OF EXAMPLE\n\n"
+    "EXAMPLE\n"
+    "{text}\n"
+    "Output:"
+)
+
+KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT = PromptTemplate(
+    input_variables=["text"],
+    template=_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE,
+)
diff --git a/poetry.lock b/poetry.lock
index e9380de5..70b822d4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3279,6 +3279,25 @@ files = [
     {file = "nest_asyncio-1.5.6.tar.gz", hash = "sha256:d267cc1ff794403f7df692964d1d2a3fa9418ffea2a3f6859a439ff482fef290"},
 ]
 
+[[package]]
+name = "networkx"
+version = "2.8.8"
+description = "Python package for creating and manipulating graphs and networks"
+category = "main"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "networkx-2.8.8-py3-none-any.whl", hash = "sha256:e435dfa75b1d7195c7b8378c3859f0445cd88c6b0375c181ed66823a9ceb7524"},
+    {file = "networkx-2.8.8.tar.gz", hash = "sha256:230d388117af870fce5647a3c52401fcf753e94720e6ea6b4197a5355648885e"},
+]
+
+[package.extras]
+default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"]
+developer = ["mypy (>=0.982)", "pre-commit (>=2.20)"]
+doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-theme (>=0.11)", "sphinx (>=5.2)", "sphinx-gallery (>=0.11)", "texext (>=0.6.6)"]
+extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"]
+test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
+
 [[package]]
 name = "nlpcloud"
 version = "1.0.38"
@@ -7002,10 +7021,10 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker
 testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
 
 [extras]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"]
 llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "0e648dc58a87d0c37440cb193b79ddf99adc12f8c6d38be1cf7726210d7b33a1"
+content-hash = "81fa8d3c24ead7311cc06a97fcabfe9a707fb3fc5989caa1569c5ef364cdd508"
diff --git a/pyproject.toml b/pyproject.toml
index 142e387d..048dd16a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ google-search-results = {version = "^2", optional = true}
 sentence-transformers = {version = "^2", optional = true}
 aiohttp = "^3.8.3"
 pypdf = {version = "^3.4.0", optional = true}
-
+networkx = {version="^2.6.3", optional = true}
 
 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@@
-93,7 +93,7 @@ playwright = "^1.28.0" [tool.poetry.extras] llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] -all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf"] +all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"] [tool.isort] profile = "black"
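
For review, a minimal end-to-end sketch of the graph-QA surface this patch introduces. This snippet is not part of the commit; it assumes the package is installed with the networkx extra and that OPENAI_API_KEY is set in the environment (model output will vary):

from langchain.chains import GraphQAChain
from langchain.indexes import GraphIndexCreator
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)

# GraphIndexCreator.from_text makes one LLM call to extract
# (subject, predicate, object) triples and loads them into a
# NetworkxEntityGraph.
index_creator = GraphIndexCreator(llm=llm)
graph = index_creator.from_text(
    "Intel is going to build a $20 billion semiconductor mega site."
)

# GraphQAChain extracts entities from the question, gathers their
# triples as context, and answers from that context only.
chain = GraphQAChain.from_llm(llm, graph=graph, verbose=True)
print(chain.run("What is Intel going to build?"))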
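
Likewise, a sketch of plugging the new ConversationKGMemory into a ConversationChain (again assuming an OpenAI key; the getting_started notebook above pairs it with a custom prompt that labels the retrieved triples as "Relevant Information", which reads better than the default conversation prompt):

from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationKGMemory
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)
conversation = ConversationChain(llm=llm, memory=ConversationKGMemory(llm=llm))

conversation.predict(input="My name is James and I'm helping Will. He's an engineer.")
# save_context stores triples such as (Will, is, an engineer); on the next
# turn they come back through the {history} slot as "On Will: Will is an engineer."
print(conversation.predict(input="What do you know about Will?"))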
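
The graph layer itself needs no LLM, which makes the parsing contract easy to exercise offline. A sketch of parse_triples and NetworkxEntityGraph as added in langchain/graphs/networkx_graph.py (only networkx required):

from langchain.graphs.networkx_graph import (
    KG_TRIPLE_DELIMITER,
    KnowledgeTriple,
    NetworkxEntityGraph,
    parse_triples,
)

# A completion in the format the extraction prompts request.
llm_output = (
    "(Intel, is going to build, $20 billion semiconductor mega site)"
    f"{KG_TRIPLE_DELIMITER}(Intel, is creating, 10,000 new good-paying jobs)"
)
triples = parse_triples(llm_output)

graph = NetworkxEntityGraph()
for triple in triples:
    graph.add_triple(triple)

# get_triples returns (subject, object, predicate) tuples, matching the
# notebook output above.
print(graph.get_triples())
print(graph.get_entity_knowledge("Intel", depth=1))
# ['Intel is going to build $20 billion semiconductor mega site', ...]

# Caveat: KnowledgeTriple.from_string splits fields on ", ", so a field
# containing a comma followed by a space (e.g. "Cambridge, MA") raises
# ValueError and parse_triples silently drops that triple.
graph.delete_triple(
    KnowledgeTriple("Intel", "is creating", "10,000 new good-paying jobs")
)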
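
Finally, the few-shot extraction templates can be rendered without a model call, which is handy when reviewing or tuning the prompts:

from langchain.indexes.prompts.knowledge_triplet_extraction import (
    KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT,
)

# Prints the template exactly as the LLM will see it for a given input text.
print(KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT.format(text="Nevada is a state in the US.\n"))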