Harrison/hugginggpt (#8162)

Co-authored-by: Yongliang Shen <withsyl@163.com>
pull/8182/head
Harrison Chase committed 12 months ago (via GitHub)
parent f3908627ed
commit 3caccf304c

@@ -0,0 +1,135 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# HuggingGPT\n",
"Implementation of [HuggingGPT](https://github.com/microsoft/JARVIS). HuggingGPT is a system to connect LLMs (ChatGPT) with ML community (Hugging Face).\n",
"\n",
"+ 🔥 Paper: https://arxiv.org/abs/2303.17580\n",
"+ 🚀 Project: https://github.com/microsoft/JARVIS\n",
"+ 🤗 Space: https://huggingface.co/spaces/microsoft/HuggingGPT"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up tools\n",
"\n",
"We set up the tools available from [Transformers Agent](https://huggingface.co/docs/transformers/transformers_agents#tools). It includes a library of tools supported by Transformers and some customized tools such as image generator, video generator, text downloader and other tools."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import load_tool"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hf_tools = [\n",
" load_tool(tool_name)\n",
" for tool_name in [\n",
" \"document-question-answering\",\n",
" \"image-captioning\",\n",
" \"image-question-answering\",\n",
" \"image-segmentation\",\n",
" \"speech-to-text\",\n",
" \"summarization\",\n",
" \"text-classification\",\n",
" \"text-question-answering\",\n",
" \"translation\",\n",
" \"huggingface-tools/text-to-image\",\n",
" \"huggingface-tools/text-to-video\",\n",
" \"text-to-speech\",\n",
" \"huggingface-tools/text-download\",\n",
" \"huggingface-tools/image-transformation\",\n",
" ]\n",
"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup model and HuggingGPT\n",
"\n",
"We create an instance of HuggingGPT and use ChatGPT as the controller to rule the above tools."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.llms import OpenAI\n",
"from langchain_experimental.autonomous_agents import HuggingGPT\n",
"# %env OPENAI_API_BASE=http://localhost:8000/v1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm = OpenAI(model_name=\"gpt-3.5-turbo\")\n",
"agent = HuggingGPT(llm, hf_tools)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run an example\n",
"\n",
"Given a text, show a related image and video."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"agent.run(\"please show me a video and an image of 'a boy is running'\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -21,3 +21,6 @@ usage of LangChain's collection of tools.
## MetaPrompt ([Original Repo](https://github.com/ngoodman/metaprompt))
- [Meta-Prompt](/docs/use_cases/autonomous_agents/meta_prompt.html): a notebook implementing Meta-Prompt in LangChain primitives
## HuggingGPT ([Original Repo](https://github.com/microsoft/JARVIS))
- [HuggingGPT](/docs/use_cases/autonomous_agents/hugginggpt.html): a notebook implementing HuggingGPT in LangChain primitives

@@ -1,4 +1,5 @@
from langchain_experimental.autonomous_agents.autogpt.agent import AutoGPT
from langchain_experimental.autonomous_agents.baby_agi.baby_agi import BabyAGI
from langchain_experimental.autonomous_agents.hugginggpt.hugginggpt import HuggingGPT
__all__ = ["BabyAGI", "AutoGPT"]
__all__ = ["BabyAGI", "AutoGPT", "HuggingGPT"]

@@ -0,0 +1,32 @@
from typing import List
from langchain.base_language import BaseLanguageModel
from langchain.tools.base import BaseTool
from langchain_experimental.autonomous_agents.hugginggpt.repsonse_generator import (
load_response_generator,
)
from langchain_experimental.autonomous_agents.hugginggpt.task_executor import (
TaskExecutor,
)
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import (
load_chat_planner,
)


class HuggingGPT:
    def __init__(self, llm: BaseLanguageModel, tools: List[BaseTool]):
        self.llm = llm
        self.tools = tools
        self.chat_planner = load_chat_planner(llm)
        self.response_generator = load_response_generator(llm)
        self.task_executor: TaskExecutor

    def run(self, input: str) -> str:
        plan = self.chat_planner.plan(inputs={"input": input, "hf_tools": self.tools})
        self.task_executor = TaskExecutor(plan)
        self.task_executor.run()
        response = self.response_generator.generate(
            {"task_execution": self.task_executor}
        )
        return response
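For orientation, `HuggingGPT.run` is a thin three-stage pipeline: the chat planner turns the request into a `Plan`, the `TaskExecutor` invokes the selected tools, and the response generator summarizes the results. A minimal sketch of driving those stages by hand follows; it assumes `llm` and `hf_tools` are set up as in the notebook above and is illustrative rather than part of this diff.

# Hypothetical stage-by-stage driver, equivalent to HuggingGPT(llm, hf_tools).run(...)
from langchain_experimental.autonomous_agents.hugginggpt.repsonse_generator import (
    load_response_generator,
)
from langchain_experimental.autonomous_agents.hugginggpt.task_executor import TaskExecutor
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import load_chat_planner

planner = load_chat_planner(llm)
plan = planner.plan(
    inputs={"input": "please show me an image of 'a boy is running'", "hf_tools": hf_tools}
)
executor = TaskExecutor(plan)
executor.run()  # runs each tool, resolving <resource-N> placeholders between tasks
response_generator = load_response_generator(llm)
print(response_generator.generate({"task_execution": executor}))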

@@ -0,0 +1,41 @@
from typing import Any, List, Optional
from langchain import LLMChain, PromptTemplate
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks


class ResponseGenerationChain(LLMChain):
    """Chain to generate the final response from the executed tasks."""

    @classmethod
    def from_llm(cls, llm: BaseLanguageModel, verbose: bool = True) -> LLMChain:
        execution_template = (
            "The AI assistant has parsed the user input into several tasks "
            "and executed them. The results are as follows:\n"
            "{task_execution}"
            "\nPlease summarize the results and generate a response."
        )
        prompt = PromptTemplate(
            template=execution_template,
            input_variables=["task_execution"],
        )
        return cls(prompt=prompt, llm=llm, verbose=verbose)


class ResponseGenerator:
    def __init__(self, llm_chain: LLMChain, stop: Optional[List] = None):
        self.llm_chain = llm_chain
        self.stop = stop

    def generate(self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any) -> str:
        """Given the executed tasks, generate a response."""
        llm_response = self.llm_chain.run(**inputs, stop=self.stop, callbacks=callbacks)
        return llm_response


def load_response_generator(llm: BaseLanguageModel) -> ResponseGenerator:
    llm_chain = ResponseGenerationChain.from_llm(llm)
    return ResponseGenerator(
        llm_chain=llm_chain,
    )
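Note that `generate` receives the whole `TaskExecutor` object as `task_execution`; the prompt template renders it through the executor's `__str__`, so the LLM sees each task together with its status and result. A small sketch for inspecting that rendered prompt (the `llm` and `executor` objects are assumed to already exist; illustrative only):

# Illustrative: look at the exact prompt text the response generator sends to the LLM.
chain = ResponseGenerationChain.from_llm(llm, verbose=False)
print(chain.prompt.format(task_execution=str(executor)))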

@@ -0,0 +1,145 @@
import copy
import uuid
from typing import Dict, List
import numpy as np
from langchain.tools.base import BaseTool
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import Plan


class Task:
    def __init__(self, task: str, id: int, dep: List[int], args: Dict, tool: BaseTool):
        self.task = task
        self.id = id
        self.dep = dep
        self.args = args
        self.tool = tool
        self.status = "pending"
        self.message = ""
        self.result = ""

    def __str__(self) -> str:
        return f"{self.task}({self.args})"

    def save_product(self) -> None:
        import cv2

        if self.task == "video_generator":
            # ndarray to video
            product = np.array(self.product)
            nframe, height, width, _ = product.shape
            video_filename = uuid.uuid4().hex[:6] + ".mp4"
            fps = 30  # Frames per second
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
            video_out = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
            for frame in self.product:
                video_out.write(frame)
            video_out.release()
            self.result = video_filename
        elif self.task == "image_generator":
            # PIL.Image to image
            filename = uuid.uuid4().hex[:6] + ".png"
            self.product.save(filename)  # type: ignore
            self.result = filename

    def completed(self) -> bool:
        return self.status == "completed"

    def failed(self) -> bool:
        return self.status == "failed"

    def pending(self) -> bool:
        return self.status == "pending"

    def run(self) -> str:
        from diffusers.utils import load_image

        try:
            new_args = copy.deepcopy(self.args)
            for k, v in new_args.items():
                if k == "image":
                    new_args["image"] = load_image(v)
            if self.task in ["video_generator", "image_generator", "text_reader"]:
                self.product = self.tool(**new_args)
            else:
                self.result = self.tool(**new_args)
        except Exception as e:
            self.status = "failed"
            self.message = str(e)
            return self.message

        self.status = "completed"
        self.save_product()

        return self.result


class TaskExecutor:
    """Execute the tasks planned by the task planner."""

    def __init__(self, plan: Plan):
        self.plan = plan
        self.tasks = []
        self.id_task_map = {}
        self.status = "pending"
        for step in self.plan.steps:
            task = Task(step.task, step.id, step.dep, step.args, step.tool)
            self.tasks.append(task)
            self.id_task_map[step.id] = task

    def completed(self) -> bool:
        return all(task.completed() for task in self.tasks)

    def failed(self) -> bool:
        return any(task.failed() for task in self.tasks)

    def pending(self) -> bool:
        return any(task.pending() for task in self.tasks)

    def check_dependency(self, task: Task) -> bool:
        for dep_id in task.dep:
            if dep_id == -1:
                continue
            dep_task = self.id_task_map[dep_id]
            if dep_task.failed() or dep_task.pending():
                return False
        return True

    def update_args(self, task: Task) -> None:
        for dep_id in task.dep:
            if dep_id == -1:
                continue
            dep_task = self.id_task_map[dep_id]
            for k, v in task.args.items():
                if f"<resource-{dep_id}>" in v:
                    task.args[k] = v.replace(f"<resource-{dep_id}>", dep_task.result)

    def run(self) -> str:
        for task in self.tasks:
            print(f"running {task}")
            if task.pending() and self.check_dependency(task):
                self.update_args(task)
                task.run()
        if self.completed():
            self.status = "completed"
        elif self.failed():
            self.status = "failed"
        else:
            self.status = "pending"
        return self.status

    def __str__(self) -> str:
        result = ""
        for task in self.tasks:
            result += f"{task}\n"
            result += f"status: {task.status}\n"
            if task.failed():
                result += f"message: {task.message}\n"
            if task.completed():
                result += f"result: {task.result}\n"
        return result

    def __repr__(self) -> str:
        return self.__str__()

    def describe(self) -> str:
        return self.__str__()
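The `<resource-dep_id>` placeholders are how a downstream task consumes an upstream task's output: before a task runs, `update_args` substitutes each placeholder with the result of the task it depends on. A toy sketch of that hand-off follows, using a trivial stand-in tool instead of a Transformers Agent tool; everything below is illustrative and not part of this diff (note that `Task.run` imports `diffusers`, so that package must be installed even for this toy example).

# Toy illustration of dependency resolution between tasks (not part of this diff).
from langchain_experimental.autonomous_agents.hugginggpt.task_executor import TaskExecutor
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import Plan, Step


class EchoTool:
    """Stand-in tool; real plans use Transformers Agent tools."""

    name = "echo"

    def __call__(self, text: str) -> str:
        return text.upper()


steps = [
    Step("echo", 0, [-1], {"text": "a boy is running"}, EchoTool()),
    # Task 1 depends on task 0 and consumes its result via the <resource-0> placeholder.
    Step("echo", 1, [0], {"text": "caption: <resource-0>"}, EchoTool()),
]
executor = TaskExecutor(Plan(steps=steps))
print(executor.run())       # "completed" once both tasks succeed
print(executor.describe())  # per-task status and results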

@@ -0,0 +1,153 @@
import json
import re
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union
from langchain import LLMChain
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks
from langchain.prompts.chat import (
AIMessagePromptTemplate,
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.tools.base import BaseTool
from pydantic import BaseModel
DEMONSTRATIONS = [
{
"role": "user",
"content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it", # noqa: E501
},
{
"role": "assistant",
"content": '[{{"task": "video_generator", "id": 0, "dep": [-1], "args": {{"prompt": "a boy is running" }}}}, {{"task": "text_reader", "id": 1, "dep": [-1], "args": {{"text": "a boy is running" }}}}, {{"task": "image_generator", "id": 2, "dep": [-1], "args": {{"prompt": "a boy is running" }}}}]', # noqa: E501
},
{
"role": "user",
"content": "Give you some pictures e1.jpg, e2.png, e3.jpg, help me count the number of sheep?", # noqa: E501
},
{
"role": "assistant",
"content": '[ {{"task": "image_qa", "id": 0, "dep": [-1], "args": {{"image": "e1.jpg", "question": "How many sheep in the picture"}}}}, {{"task": "image_qa", "id": 1, "dep": [-1], "args": {{"image": "e2.jpg", "question": "How many sheep in the picture"}}}}, {{"task": "image_qa", "id": 2, "dep": [-1], "args": {{"image": "e3.jpg", "question": "How many sheep in the picture"}}}}]', # noqa: E501
},
]


class TaskPlaningChain(LLMChain):
    """Chain to plan tasks."""

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        demos: List[Dict] = DEMONSTRATIONS,
        verbose: bool = True,
    ) -> LLMChain:
        """Get the task planner chain."""
        system_template = """#1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{{"task": task, "id": task_id, "dep": dependency_task_id, "args": {{"input name": text may contain <resource-dep_id>}}}}]. The special tag "dep_id" refers to the one generated text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The task MUST be selected from the following tools (along with tool description, input name and output type): {tools}. There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].""" # noqa: E501
        human_template = """Now I input: {input}."""
        system_message_prompt = SystemMessagePromptTemplate.from_template(
            system_template
        )
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

        demo_messages: List[
            Union[HumanMessagePromptTemplate, AIMessagePromptTemplate]
        ] = []
        for demo in demos:
            if demo["role"] == "user":
                demo_messages.append(
                    HumanMessagePromptTemplate.from_template(demo["content"])
                )
            else:
                demo_messages.append(
                    AIMessagePromptTemplate.from_template(demo["content"])
                )

        prompt = ChatPromptTemplate.from_messages(
            [system_message_prompt, *demo_messages, human_message_prompt]
        )

        return cls(prompt=prompt, llm=llm, verbose=verbose)


class Step:
    def __init__(
        self, task: str, id: int, dep: List[int], args: Dict[str, str], tool: BaseTool
    ):
        self.task = task
        self.id = id
        self.dep = dep
        self.args = args
        self.tool = tool


class Plan:
    def __init__(self, steps: List[Step]):
        self.steps = steps

    def __str__(self) -> str:
        return str([str(step) for step in self.steps])

    def __repr__(self) -> str:
        return str(self)


class BasePlanner(BaseModel):
    @abstractmethod
    def plan(self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any) -> Plan:
        """Given input, decide what to do."""

    @abstractmethod
    async def aplan(
        self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any
    ) -> Plan:
        """Given input, decide what to do."""


class PlanningOutputParser(BaseModel):
    def parse(self, text: str, hf_tools: List[BaseTool]) -> Plan:
        steps = []
        for v in json.loads(re.findall(r"\[.*\]", text)[0]):
            choose_tool = None
            for tool in hf_tools:
                if tool.name == v["task"]:
                    choose_tool = tool
                    break
            if choose_tool:
                steps.append(Step(v["task"], v["id"], v["dep"], v["args"], choose_tool))
        return Plan(steps=steps)


class TaskPlanner(BasePlanner):
    llm_chain: LLMChain
    output_parser: PlanningOutputParser
    stop: Optional[List] = None

    def plan(self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any) -> Plan:
        """Given input, decide what to do."""
        inputs["tools"] = [
            f"{tool.name}: {tool.description}" for tool in inputs["hf_tools"]
        ]
        llm_response = self.llm_chain.run(**inputs, stop=self.stop, callbacks=callbacks)
        return self.output_parser.parse(llm_response, inputs["hf_tools"])

    async def aplan(
        self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any
    ) -> Plan:
        """Given input, asynchronously decide what to do."""
        inputs["tools"] = [
            f"{tool.name}: {tool.description}" for tool in inputs["hf_tools"]
        ]
        llm_response = await self.llm_chain.arun(
            **inputs, stop=self.stop, callbacks=callbacks
        )
        return self.output_parser.parse(llm_response, inputs["hf_tools"])


def load_chat_planner(llm: BaseLanguageModel) -> TaskPlanner:
    llm_chain = TaskPlaningChain.from_llm(llm)
    return TaskPlanner(llm_chain=llm_chain, output_parser=PlanningOutputParser())
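The planner relies on the LLM emitting a JSON list of task dicts in the format shown in `DEMONSTRATIONS`; `PlanningOutputParser` extracts the first `[...]` span, matches each `task` name against the available tools, and builds a `Plan`. A minimal sketch of that parsing step in isolation (the hand-written response and the fake tool below are illustrative):

# Illustrative: turn a hand-written planner response into a Plan without calling an LLM.
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import (
    PlanningOutputParser,
)


class FakeImageTool:
    """Stand-in with the attributes the parser looks at."""

    name = "image_generator"
    description = "generates an image from a text prompt"


llm_response = (
    '[{"task": "image_generator", "id": 0, "dep": [-1], '
    '"args": {"prompt": "a boy is running"}}]'
)
plan = PlanningOutputParser().parse(llm_response, [FakeImageTool()])
print([(s.task, s.id, s.dep, s.args) for s in plan.steps])
# [('image_generator', 0, [-1], {'prompt': 'a boy is running'})]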