Add SceneXplain Tool (#3752)

1 year ago · 6c2b16e465
parent 72c5c15f7f
commit 6c2b16e465
5 changed files with 214 additions and 0 deletions
--- a/docs/modules/agents/tools/examples/sceneXplain.ipynb
+++ b/docs/modules/agents/tools/examples/sceneXplain.ipynb
@ -0,0 +1,116 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SceneXplain\n",
+    "\n",
+    "\n",
+    "[SceneXplain](https://scenex.jina.ai/) is an image captioning service accessible through the SceneXplain Tool.\n",
+    "\n",
+    "To use this tool, you'll need to make an account and fetch your API Token [from the website](https://scenex.jina.ai/api). Then you can instantiate the tool."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from langchain.tools import SceneXplainTool\n",
+    "\n",
+    "\n",
+    "os.environ[\"SCENEX_API_KEY\"] = \"<YOUR_API_KEY>\"\n",
+    "tool = SceneXplainTool()\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage in an Agent\n",
+    "\n",
+    "The tool can be used in any LangChain agent as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
+      "\u001b[32;1m\u001b[1;3m\n",
+      "Thought: Do I need to use a tool? Yes\n",
+      "Action: Image Explainer\n",
+      "Action Input: https://storage.googleapis.com/causal-diffusion.appspot.com/imagePrompts%2F0rw369i5h9t%2Foriginal.png\u001b[0m\n",
+      "Observation: \u001b[36;1m\u001b[1;3mIn a charmingly whimsical scene, a young girl is seen braving the rain alongside her furry companion, the lovable Totoro. The two are depicted standing on a bustling street corner, where they are sheltered from the rain by a bright yellow umbrella. The girl, dressed in a cheerful yellow frock, holds onto the umbrella with both hands while gazing up at Totoro with an expression of wonder and delight.\n",
+      "\n",
+      "Totoro, meanwhile, stands tall and proud beside his young friend, holding his own umbrella aloft to protect them both from the downpour. His furry body is rendered in rich shades of grey and white, while his large ears and wide eyes lend him an endearing charm.\n",
+      "\n",
+      "In the background of the scene, a street sign can be seen jutting out from the pavement amidst a flurry of raindrops. A sign with Chinese characters adorns its surface, adding to the sense of cultural diversity and intrigue. Despite the dreary weather, there is an undeniable sense of joy and camaraderie in this heartwarming image.\u001b[0m\n",
+      "Thought:\u001b[32;1m\u001b[1;3m Do I need to use a tool? No\n",
+      "AI: This image appears to be a still from the 1988 Japanese animated fantasy film My Neighbor Totoro. The film follows two young girls, Satsuki and Mei, as they explore the countryside and befriend the magical forest spirits, including the titular character Totoro.\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Finished chain.\u001b[0m\n",
+      "This image appears to be a still from the 1988 Japanese animated fantasy film My Neighbor Totoro. The film follows two young girls, Satsuki and Mei, as they explore the countryside and befriend the magical forest spirits, including the titular character Totoro.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain.llms import OpenAI\n",
+    "from langchain.agents import initialize_agent\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "\n",
+    "llm = OpenAI(temperature=0)\n",
+    "memory = ConversationBufferMemory(memory_key=\"chat_history\")\n",
+    "tools = [\n",
+    "    tool\n",
+    "]\n",
+    "\n",
+    "agent = initialize_agent(\n",
+    "    tools, llm, memory=memory, agent=\"conversational-react-description\", verbose=True\n",
+    ")\n",
+    "output = agent.run(\n",
+    "    input=(\n",
+    "        \"What is in this image https://storage.googleapis.com/causal-diffusion.appspot.com/imagePrompts%2F0rw369i5h9t%2Foriginal.png. \"\n",
+    "        \"Is it movie or a game? If it is a movie, what is the name of the movie?\"\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "print(output)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/langchain/tools/init.py
+++ b/langchain/tools/init.py
@ -26,6 +26,7 @@ from langchain.tools.playwright import (
    NavigateTool,
 )
 from langchain.tools.plugin import AIPluginTool
+from langchain.tools.scenexplain.tool import SceneXplainTool
 from langchain.tools.shell.tool import ShellTool

 __all__ = [
@ -59,4 +60,6 @@ __all__ = [
    "ReadFileTool",
    "ShellTool",
    "WriteFileTool",
+    "BaseTool",
+    "SceneXplainTool",
 ]
--- a/langchain/tools/scenexplain/init.py
+++ b/langchain/tools/scenexplain/init.py
@ -0,0 +1 @@
+"""SceneXplain API toolkit."""
--- a/langchain/tools/scenexplain/tool.py
+++ b/langchain/tools/scenexplain/tool.py
@ -0,0 +1,26 @@
+"""Tool for the SceneXplain API."""
+
+from pydantic import Field
+
+from langchain.tools.base import BaseTool
+from langchain.utilities.scenexplain import SceneXplainAPIWrapper
+
+
+class SceneXplainTool(BaseTool):
+    """Agent tool that turns an image reference into a textual explanation.
+
+    Each query is forwarded unchanged to a ``SceneXplainAPIWrapper``; whatever
+    string the wrapper returns becomes the tool's output.
+    """
+
+    name = "Image Explainer"
+    description = (
+        "An Image Captioning Tool: Use this tool to generate a detailed caption "
+        "for an image. The input can be an image file of any format, and "
+        "the output will be a text description that covers every detail of the image."
+    )
+    # Client that performs the API call; built through ``default_factory`` when
+    # the caller does not pass one in.
+    api_wrapper: SceneXplainAPIWrapper = Field(default_factory=SceneXplainAPIWrapper)
+
+    def _run(self, query: str) -> str:
+        """Return the wrapper's description for ``query``."""
+        wrapper = self.api_wrapper
+        return wrapper.run(query)
+
+    async def _arun(self, query: str) -> str:
+        """Async entry point; always raises because async is unsupported."""
+        raise NotImplementedError("SceneXplainTool does not support async")
--- a/langchain/utilities/scenexplain.py
+++ b/langchain/utilities/scenexplain.py
@ -0,0 +1,68 @@
+"""Util that calls SceneXplain.
+
+In order to set this up, you need an API key for the SceneXplain API.
+You can obtain a key by following the steps below.
+- Sign up for a free account at https://scenex.jina.ai/.
+- Navigate to the API Access page (https://scenex.jina.ai/api) and create a new API key.
+"""
+from typing import Dict
+
+import requests
+from pydantic import BaseModel, root_validator
+
+from langchain.utils import get_from_dict_or_env
+
+
+class SceneXplainAPIWrapper(BaseModel):
+    """Wrapper for SceneXplain API.
+
+    In order to set this up, you need an API key for the SceneXplain API.
+    You can obtain a key by following the steps below.
+    - Sign up for a free account at https://scenex.jina.ai/.
+    - Navigate to the API Access page (https://scenex.jina.ai/api)
+      and create a new API key.
+
+    The key is resolved by ``validate_environment`` from the
+    ``scenex_api_key`` argument or the ``SCENEX_API_KEY`` environment variable.
+    """
+
+    scenex_api_key: str
+    scenex_api_url: str = (
+        "https://us-central1-causal-diffusion.cloudfunctions.net/describe"
+    )
+    # Seconds to wait for the HTTP response. ``requests`` waits indefinitely
+    # when no timeout is given, so a finite, overridable default keeps callers
+    # from hanging forever on an unresponsive endpoint.
+    scenex_api_timeout: float = 120.0
+
+    def _describe_image(self, image: str) -> str:
+        """POST ``image`` to the SceneXplain endpoint and return its caption.
+
+        Args:
+            image: Image reference, sent verbatim as the ``image`` field of the
+                request payload.
+
+        Returns:
+            The ``text`` of the first entry in the response's ``result`` list,
+            or an empty string when there is no such entry or no text.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+            requests.Timeout: If no response arrives within
+                ``scenex_api_timeout`` seconds.
+        """
+        headers = {
+            "x-api-key": f"token {self.scenex_api_key}",
+            "content-type": "application/json",
+        }
+        payload = {
+            "data": [
+                {
+                    "image": image,
+                    "algorithm": "Ember",
+                    "languages": ["en"],
+                }
+            ]
+        }
+        response = requests.post(
+            self.scenex_api_url,
+            headers=headers,
+            json=payload,
+            timeout=self.scenex_api_timeout,
+        )
+        response.raise_for_status()
+        result = response.json().get("result", [])
+        img = result[0] if result else {}
+
+        # A JSON null for "text" would otherwise leak None through a method
+        # annotated as returning str.
+        return img.get("text") or ""
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Fill in ``scenex_api_key`` before the model's fields are validated.
+
+        The key is looked up with ``get_from_dict_or_env`` using the
+        ``scenex_api_key`` entry of ``values`` and the ``SCENEX_API_KEY``
+        environment variable name, then written back into ``values``.
+        """
+        # NOTE(review): presumably get_from_dict_or_env raises when the key is
+        # found in neither place -- confirm against langchain.utils.
+        scenex_api_key = get_from_dict_or_env(
+            values, "scenex_api_key", "SCENEX_API_KEY"
+        )
+        values["scenex_api_key"] = scenex_api_key
+
+        return values
+
+    def run(self, image: str) -> str:
+        """Run SceneXplain image explainer.
+
+        Args:
+            image: Image reference passed through to the API.
+
+        Returns:
+            The description produced for ``image``, or the fixed message
+            ``"No description found."`` when the API yields no text.
+        """
+        description = self._describe_image(image)
+        if not description:
+            return "No description found."
+
+        return description