refactor: handle file

1 year ago · 71e9489ff0
parent 073b22927a
commit 71e9489ff0
13 changed files with 302 additions and 190 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,4 +4,6 @@ __pycache__/
 .env

 image/
+audio/
+video/
 dataframe/
--- a/16
+++ b/16
@ -0,0 +1,16 @@
+FROM nvidia/cuda:11.7.0-runtime-ubuntu20.04
+WORKDIR /app/
+
+RUN \
+  apt-get update && \
+  apt-get install -y python3 python3-pip
+RUN apt-get install uvicorn -y
+
+RUN pip install --upgrade pip
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY . .
+
+ENTRYPOINT ["sleep", "infinity"]
+# ENTRYPOINT ["python3", "-m", "uvicorn", "main:app", "--reload", "--host=0.0.0.0", "--port=8000"]
--- a/README.md
+++ b/README.md
@ -0,0 +1,38 @@
+# Usage
+
+### S3
+
+1. Create a bucket.
+2. Turn off the "Block all public access" setting for the bucket. ![image](assets/block_public_access.png)
+3. Add the following text to Bucket Policy.
+   ```json
+   {
+     "Version": "2012-10-17",
+     "Statement": [
+       {
+         "Sid": "AllowPublicRead",
+         "Effect": "Allow",
+         "Principal": {
+           "AWS": "*"
+         },
+         "Action": "s3:GetObject",
+         "Resource": "arn:aws:s3:::{your-bucket-name}/*"
+       }
+     ]
+   }
+   ```
+
+## Environment
+
+The following environment variable is required:
+
+```
+OPENAI_API_KEY
+```
+
+The following environment variables are needed by the corresponding tools:
+
+```
+serpapi: SERPAPI_API_KEY
+bing-search: BING_SEARCH_URL, BING_SUBSCRIPTION_KEY
+```
--- a/agent.py
+++ b/agent.py
@ -0,0 +1,79 @@
+from typing import Dict, List, Tuple
+
+from llm import ChatOpenAI
+from langchain.agents import load_tools
+from langchain.agents.agent import AgentExecutor
+from langchain.agents.tools import Tool
+from langchain.agents.initialize import initialize_agent
+from langchain.chains.conversation.memory import ConversationBufferMemory
+
+from utils import AWESOMEGPT_PREFIX, AWESOMEGPT_SUFFIX
+
+from tools.cpu import (
+    RequestsGet,
+    WineDB,
+    ExitConversation,
+)
+from tools.gpu import (
+    ImageEditing,
+    InstructPix2Pix,
+    Text2Image,
+    ImageCaptioning,
+    VisualQuestionAnswering,
+)
+from handler import Handler, FileType
+
+
+def get_agent() -> Tuple[AgentExecutor, Handler]:
+    """Build the conversational agent and the file handler that feeds it.
+
+    Every attribute whose name starts with ``inference`` on the model
+    instances below is registered as a tool, using the ``name`` and
+    ``description`` attributes attached to that callable.
+
+    Returns:
+        A tuple of the initialized agent executor and a ``Handler`` whose
+        image callback is ``ImageCaptioning.inference``.
+    """
+    print("Initializing AwesomeGPT")
+    llm = ChatOpenAI(temperature=0)
+    # NOTE(review): "python_repl" and "terminal" presumably let the model run
+    # arbitrary code/commands on this host - confirm this exposure is intended.
+    tools = list(
+        load_tools(
+            ["python_repl", "terminal", "serpapi", "wikipedia", "bing-search"],
+            llm=llm,
+        )
+    )
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+    models = {
+        "RequestsGet": RequestsGet(),
+        "WineDB": WineDB(),
+        # Shares the agent's memory so the tool can clear the conversation.
+        "ExitConversation": ExitConversation(memory),
+        "Text2Image": Text2Image("cuda"),
+        "ImageEditing": ImageEditing("cuda"),
+        "InstructPix2Pix": InstructPix2Pix("cuda"),
+        "VisualQuestionAnswering": VisualQuestionAnswering("cuda"),
+    }
+
+    for instance in models.values():
+        for attr in dir(instance):
+            if not attr.startswith("inference"):
+                continue
+            func = getattr(instance, attr)
+            tools.append(
+                Tool(name=func.name, description=func.description, func=func)
+            )
+
+    # Values are model instances (not strings); their ``inference`` method is
+    # what the handler calls with the local file path.
+    handle_models: Dict[FileType, ImageCaptioning] = {
+        FileType.IMAGE: ImageCaptioning("cuda"),
+    }
+
+    handler = Handler(
+        handle_func={
+            file_type: model.inference for file_type, model in handle_models.items()
+        }
+    )
+
+    agent = initialize_agent(
+        tools,
+        llm,
+        agent="chat-conversational-react-description",
+        verbose=True,
+        memory=memory,
+        agent_kwargs={
+            "system_message": AWESOMEGPT_PREFIX,
+            "human_message": AWESOMEGPT_SUFFIX,
+        },
+    )
+    return agent, handler
--- a/assets/block_public_access.png
+++ b/assets/block_public_access.png
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,16 @@
+version: "3"
+
+services:
+  awesomegpt:
+    build:
+      dockerfile: Dockerfile
+      context: .
+    env_file:
+      - .env
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["3"] # You can choose which GPU to use
+              capabilities: [gpu]
--- a/file.py
+++ b/file.py
@ -1,87 +0,0 @@
-import os
-import requests
-import uuid
-from typing import Callable
-from enum import Enum
-
-from PIL import Image
-
-import pandas as pd
-
-from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
-from tools import IMAGE_MODEL
-
-
-class FileType(Enum):
-    IMAGE = "image"
-    AUDIO = "audio"
-    VIDEO = "video"
-    DATAFRAME = "dataframe"
-    UNKNOWN = "unknown"
-
-
-def handle(file_name: str) -> Callable:
-    """
-    Parse file type from file name (ex. image, audio, video, dataframe, etc.)
-    """
-    file_name = file_name.split("?")[0]
-
-    if file_name.endswith(".png") or file_name.endswith(".jpg"):
-        return handle_image
-    elif file_name.endswith(".mp3") or file_name.endswith(".wav"):
-        return handle_audio
-    elif file_name.endswith(".mp4") or file_name.endswith(".avi"):
-        return handle_video
-    elif file_name.endswith(".csv"):
-        return handle_dataframe
-    else:
-        return handle_unknown
-
-
-def handle_image(i: int, file: str) -> str:
-    img_data = requests.get(file).content
-    filename = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
-    with open(filename, "wb") as f:
-        size = f.write(img_data)
-    print(f"Inputs: {file} ({size//1000}MB)  => {filename}")
-    img = Image.open(filename)
-    width, height = img.size
-    ratio = min(512 / width, 512 / height)
-    width_new, height_new = (round(width * ratio), round(height * ratio))
-    img = img.resize((width_new, height_new))
-    img = img.convert("RGB")
-    img.save(filename, "PNG")
-    print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
-    try:
-        description = IMAGE_MODEL.inference(filename)
-    except Exception as e:
-        return {"text": "image upload", "response": str(e), "additional": []}
-
-    return IMAGE_PROMPT.format(i=i, filename=filename, description=description)
-
-
-def handle_audio(i: int, file: str) -> str:
-    return ""
-
-
-def handle_video(i: int, file: str) -> str:
-    return ""
-
-
-def handle_dataframe(i: int, file: str) -> str:
-    content = requests.get(file).content
-    filename = os.path.join("dataframe/", str(uuid.uuid4())[0:8] + ".csv")
-    with open(filename, "wb") as f:
-        size = f.write(content)
-    print(f"Inputs: {file} ({size//1000}MB)  => {filename}")
-    df = pd.read_csv(filename)
-    try:
-        description = str(df.describe())
-    except Exception as e:
-        return {"text": "image upload", "response": str(e), "additional": []}
-
-    return DATAFRAME_PROMPT.format(i=i, filename=filename, description=description)
-
-
-def handle_unknown(i: int, file: str) -> str:
-    return ""
--- a/handler.py
+++ b/handler.py
@ -0,0 +1,89 @@
+import os
+import requests
+import uuid
+from typing import Callable, Dict
+from enum import Enum
+
+from PIL import Image
+
+import pandas as pd
+
+from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
+
+
+class FileType(Enum):
+    """File categories; used as keys of the mapping passed to ``Handler``."""
+
+    IMAGE = "image"
+    AUDIO = "audio"
+    VIDEO = "video"
+    DATAFRAME = "dataframe"
+    UNKNOWN = "unknown"
+
+
+class Handler:
+    """Download user-supplied files and turn them into prompt text.
+
+    ``handle_func`` maps a ``FileType`` to a callable that receives the local
+    path of a downloaded file and returns a description of it.
+    """
+
+    _IMAGE_EXTENSIONS = (".png", ".jpg")
+    _AUDIO_EXTENSIONS = (".mp3", ".wav")
+    _VIDEO_EXTENSIONS = (".mp4", ".avi")
+    _DATAFRAME_EXTENSIONS = (".csv",)
+    # Longest edge (in pixels) an image is scaled to before it is described.
+    _IMAGE_SIZE = 512
+
+    def __init__(self, handle_func: Dict[FileType, Callable]):
+        self.handle_func = handle_func
+
+    def handle(self, i: int, file_name: str) -> str:
+        """Dispatch ``file_name`` to the handler matching its extension.
+
+        Args:
+            i: Index of the file in the request; forwarded to the prompt.
+            file_name: Remote URL of the file. Everything after the first
+                ``?`` is ignored when the extension is inspected.
+
+        Returns:
+            The prompt text produced by the selected handler.
+        """
+        # Strip the query string and ignore case so "a.PNG?sig=..." is an image.
+        path = file_name.split("?")[0].lower()
+
+        if path.endswith(self._IMAGE_EXTENSIONS):
+            return self.handle_image(i, file_name)
+        if path.endswith(self._AUDIO_EXTENSIONS):
+            return self.handle_audio(i, file_name)
+        if path.endswith(self._VIDEO_EXTENSIONS):
+            return self.handle_video(i, file_name)
+        if path.endswith(self._DATAFRAME_EXTENSIONS):
+            return self.handle_dataframe(i, file_name)
+        return self.handle_unknown(i, file_name)
+
+    @staticmethod
+    def _download(remote_filename: str, directory: str, suffix: str) -> str:
+        """Fetch ``remote_filename`` into ``directory`` under a random name."""
+        response = requests.get(remote_filename)
+        # Fail loudly instead of saving an HTTP error page as if it were the file.
+        response.raise_for_status()
+        local_filename = os.path.join(directory, str(uuid.uuid4())[0:8] + suffix)
+        with open(local_filename, "wb") as f:
+            size = f.write(response.content)
+        print(f"Inputs: {remote_filename} ({size // 1000}KB)  => {local_filename}")
+        return local_filename
+
+    def handle_image(self, i: int, remote_filename: str) -> str:
+        """Download an image, rescale it, and return its prompt text.
+
+        The image is scaled so its tighter-fitting side becomes 512 pixels
+        (aspect ratio kept), converted to RGB and saved as PNG over the
+        download. If the image callback raises, ``"Error: <message>"`` is
+        returned instead of the prompt.
+
+        Raises:
+            requests.HTTPError: If the download returns an error status.
+        """
+        local_filename = self._download(remote_filename, "image", ".png")
+        # Close the source file before the same path is overwritten below.
+        with Image.open(local_filename) as original:
+            width, height = original.size
+            ratio = min(self._IMAGE_SIZE / width, self._IMAGE_SIZE / height)
+            # Clamp to 1 so extreme aspect ratios never round down to 0 pixels.
+            width_new = max(1, round(width * ratio))
+            height_new = max(1, round(height * ratio))
+            img = original.resize((width_new, height_new)).convert("RGB")
+        img.save(local_filename, "PNG")
+        print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
+        try:
+            description = self.handle_func[FileType.IMAGE](local_filename)
+        except Exception as e:
+            return "Error: " + str(e)
+
+        return IMAGE_PROMPT.format(
+            i=i, filename=local_filename, description=description
+        )
+
+    def handle_audio(self, i: int, remote_filename: str) -> str:
+        """Audio is not processed yet; contributes nothing to the prompt."""
+        return ""
+
+    def handle_video(self, i: int, remote_filename: str) -> str:
+        """Video is not processed yet; contributes nothing to the prompt."""
+        return ""
+
+    def handle_dataframe(self, i: int, remote_filename: str) -> str:
+        """Download a CSV and return prompt text with its ``describe()`` summary.
+
+        If the file cannot be parsed or summarized, ``"Error: <message>"`` is
+        returned instead of the prompt.
+
+        Raises:
+            requests.HTTPError: If the download returns an error status.
+        """
+        local_filename = self._download(remote_filename, "dataframe/", ".csv")
+        try:
+            # Parsing is the step most likely to fail, so it is guarded too.
+            description = str(pd.read_csv(local_filename).describe())
+        except Exception as e:
+            return "Error: " + str(e)
+
+        return DATAFRAME_PROMPT.format(
+            i=i, filename=local_filename, description=description
+        )
+
+    def handle_unknown(self, i: int, file: str) -> str:
+        """Unrecognized file types contribute nothing to the prompt."""
+        return ""
--- a/main.py
+++ b/main.py
@ -1,115 +1,73 @@
-from typing import List, TypedDict, Callable
+from typing import List, TypedDict
 import re

-from langchain.agents import load_tools
-from langchain.agents.initialize import initialize_agent
-from langchain.agents.tools import Tool
-
-
 from fastapi import FastAPI
 from pydantic import BaseModel
-from dotenv import load_dotenv
 from s3 import upload

-from llm import ChatOpenAI
-from file import handle
-from utils import (
-    AWESOMEGPT_PREFIX,
-    AWESOMEGPT_SUFFIX,
-    ERROR_PROMPT,
-)
-from tools import AWESOME_MODEL, memory
-
-load_dotenv()
+from utils import ERROR_PROMPT
+from agent import get_agent


 app = FastAPI()
-
-
-print("Initializing AwesomeGPT")
-llm = ChatOpenAI(temperature=0)
-tools = [
-    *load_tools(
-        ["python_repl", "serpapi", "wikipedia", "bing-search"],
-        llm=llm,
-    ),
-]
-
-for class_name, instance in AWESOME_MODEL.items():
-    for e in dir(instance):
-        if e.startswith("inference"):
-            func = getattr(instance, e)
-            tools.append(Tool(name=func.name, description=func.description, func=func))
-
-agent = initialize_agent(
-    tools,
-    llm,
-    agent="chat-conversational-react-description",
-    verbose=True,
-    memory=memory,
-    agent_kwargs={
-        "system_message": AWESOMEGPT_PREFIX,
-        "human_message": AWESOMEGPT_SUFFIX,
-    },
-)
+agent, handler = get_agent()


 class Request(BaseModel):
-    text: str
-    state: List[str]
-    files: List[str]
    key: str
+    query: str
+    files: List[str]


 class Response(TypedDict):
-    text: str
    response: str
-    additional: List[str]
+    files: List[str]


@app.get("/")
 async def index():
-    return {"message": "Hello World"}
+    return {"message": "Hello World. I'm AwesomeGPT."}


@app.post("/command")
 async def command(request: Request) -> Response:
-    text = request.text
-    state = request.state
+    query = request.query
    files = request.files
    key = request.key

    print("=============== Running =============")
-    print("Inputs:", text, state, files)
+    print("Inputs:", query, files)
    # TODO - add state to memory (use key)

    print("======>Previous memory:\n %s" % agent.memory)

-    promptedText = ""
+    promptedQuery = ""
+    import time

    for i, file in enumerate(files):
-        promptedText += handle(file)(i + 1, file)
+        promptedQuery += handler.handle(i + 1, file)

-    promptedText += text
+    promptedQuery += query

-    print("======>Prompted Text:\n %s" % promptedText)
+    print("======>Prompted Text:\n %s" % promptedQuery)

    try:
-        res = agent({"input": promptedText})
+        res = agent({"input": promptedQuery})
    except Exception as e:
        try:
            res = agent(
                {
-                    "input": ERROR_PROMPT.format(promptedText=promptedText, e=str(e)),
+                    "input": ERROR_PROMPT.format(promptedQuery=promptedQuery, e=str(e)),
                }
            )
        except Exception as e:
-            return {"text": promptedText, "response": str(e), "additional": []}
+            return {"response": str(e), "files": []}

    images = re.findall("(image/\S*png)", res["output"])
+    dataframes = re.findall("(dataframe/\S*csv)", res["output"])

    return {
-        "text": promptedText,
        "response": res["output"],
-        "additional": [upload(image) for image in images],
+        "files": [upload(image) for image in images]
+        + [upload(dataframe) for dataframe in dataframes],
    }
--- a/requirements.txt
+++ b/requirements.txt
@ -6,6 +6,10 @@ langchain
 fastapi
 boto3
 llama_index
-torch==1.13.1+cu117
+torch
 transformers
-diffusers
+diffusers
+python_dotenv
+google-search-results
+psycopg2-binary
+wikipedia
--- a/tools/cpu.py
+++ b/tools/cpu.py
@ -1,22 +1,12 @@
-from langchain.chains.conversation.memory import ConversationBufferMemory
-
 from utils import prompts
 from env import settings
-from vfm import (
-    ImageEditing,
-    InstructPix2Pix,
-    Text2Image,
-    ImageCaptioning,
-    VisualQuestionAnswering,
-)

 import requests

 from llama_index.readers.database import DatabaseReader
 from llama_index import GPTSimpleVectorIndex

-
-memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+from langchain.memory.chat_memory import BaseChatMemory


 class RequestsGet:
@ -79,6 +69,9 @@ class WineDB:


 class ExitConversation:
+    def __init__(self, memory: BaseChatMemory):
+        self.memory = memory
+
    @prompts(
        name="exit_conversation",
        description="A tool to exit the conversation. "
@ -87,19 +80,5 @@ class ExitConversation:
    )
    def inference(self, query: str) -> str:
        """Run the tool."""
-        memory.chat_memory.messages = []
+        self.memory.chat_memory.messages = []
        return ""
-
-
-IMAGE_MODEL = ImageCaptioning("cuda:3")
-
-
-AWESOME_MODEL = {
-    "RequestsGet": RequestsGet(),
-    "WineDB": WineDB(),
-    "ExitConversation": ExitConversation(),
-    "Text2Image": Text2Image("cuda:3"),
-    "ImageEditing": ImageEditing("cuda:3"),
-    "InstructPix2Pix": InstructPix2Pix("cuda:3"),
-    "VisualQuestionAnswering": VisualQuestionAnswering("cuda:3"),
-}
--- a/tools/gpu.py
+++ b/tools/gpu.py
--- a/utils.py
+++ b/utils.py
@ -8,29 +8,44 @@ from langchain.output_parsers.base import BaseOutputParser


 IMAGE_PROMPT = """
-{i}th image: provide a figure named {filename}. The description is: {description}.
+{i}th file: provide a figure named {filename}. The description is: {description}.
+
+Please understand and answer the image based on this information. The image understanding is complete, so don't try to understand the image again.
 """


-DATAFRAME_PROMPT = """
-{i}th dataframe: provide a dataframe named {filename}. The description is: {description}.
+AUDIO_PROMPT = """
+{i}th file: provide a audio named {filename}. The description is: {description}.
+
+Please understand and answer the audio based on this information. The audio understanding is complete, so don't try to understand the audio again.
 """

+VIDEO_PROMPT = """
+{i}th file: provide a video named {filename}. The description is: {description}.

-IMAGE_SUFFIX = """
-Please understand and answer the image based on this information. The image understanding is complete, so don't try to understand the image again.
+Please understand and answer the video based on this information. The video understanding is complete, so don't try to understand the video again.
 """

-AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
+DATAFRAME_PROMPT = """
+{i}th file: provide a dataframe named {filename}. The description is: {description}.
+
+You are able to use the dataframe to answer the question.
+You have to act like an data analyst who can do an effective analysis through dataframe.
+"""

-Awesome GPT is able to process and understand large amounts of text and images. As a language model, Awesome GPT can not directly read images, but it has a list of tools to finish different visual tasks. 
+AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. 
+Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
+Awesome GPT is able to process and understand large amounts of various types of files. As a language model, Awesome GPT can not directly read various types of files, but it has a list of tools to finish different visual tasks. 

 Each image will have a file name formed as "image/xxx.png"
+Each audio will have a file name formed as "audio/xxx.mp3"
+Each video will have a file name formed as "video/xxx.mp4"
 Each dataframe will have a file name formed as "dataframe/xxx.csv"

-Awesome GPT can invoke different tools to indirectly understand pictures. When talking about images, Awesome GPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Awesome GPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
-
-Human may provide new figures to Awesome GPT with a description. The description helps Awesome GPT to understand this image, but Awesome GPT should use tools to finish following tasks, rather than directly imagine from the description.
+Awesome GPT can invoke different tools to indirectly understand files. When talking about files, Awesome GPT is very strict to the file name and will never fabricate nonexistent files. 
+When using tools to generate new files, Awesome GPT is also known that the file may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real file. 
+Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the file content and file name. It will remember to provide the file name from the last tool observation, if a new file is generated.
+Human may provide new figures to Awesome GPT with a description. The description helps Awesome GPT to understand this file, but Awesome GPT should use tools to finish following tasks, rather than directly imagine from the description.

 Overall, Awesome GPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics."""

@ -38,7 +53,8 @@ AWESOMEGPT_SUFFIX = """TOOLS
 ------
 Awesome GPT can ask the user to use tools to look up information that may be helpful in answering the users original question. 
 You are very strict to the filename correctness and will never fake a file name if it does not exist.
-You will remember to provide the image file name loyally if it's provided in the last tool observation.
+You will remember to provide the file name loyally if it's provided in the last tool observation.
+
 The tools the human can use are:

 {{tools}}
@ -51,10 +67,12 @@ Here is the user's input (remember to respond with a markdown code snippet of a

 {{{{input}}}}"""

-ERROR_PROMPT = "An error has occurred for the following text: \n{promptedText} Please explain this error.\n {e}"
+ERROR_PROMPT = "An error has occurred for the following text: \n{promptedQuery} Please explain this error.\n {e}"


 os.makedirs("image", exist_ok=True)
+os.makedirs("audio", exist_ok=True)
+os.makedirs("video", exist_ok=True)
 os.makedirs("dataframe", exist_ok=True)